GeoLLaVA-8K / trainer_state.json
initiacms's picture
Upload folder using huggingface_hub
be5b614 verified
raw
history blame
443 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999803343166175,
"eval_steps": 500,
"global_step": 2542,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00039331366764995085,
"grad_norm": 7.214968204498291,
"learning_rate": 1.9607843137254902e-08,
"loss": 0.1717,
"step": 1
},
{
"epoch": 0.0007866273352999017,
"grad_norm": 8.45617961883545,
"learning_rate": 3.9215686274509804e-08,
"loss": 0.1394,
"step": 2
},
{
"epoch": 0.0011799410029498525,
"grad_norm": 9.644225120544434,
"learning_rate": 5.882352941176471e-08,
"loss": 0.1416,
"step": 3
},
{
"epoch": 0.0015732546705998034,
"grad_norm": 6.772904872894287,
"learning_rate": 7.843137254901961e-08,
"loss": 0.1696,
"step": 4
},
{
"epoch": 0.0019665683382497543,
"grad_norm": 11.89709758758545,
"learning_rate": 9.803921568627452e-08,
"loss": 0.2043,
"step": 5
},
{
"epoch": 0.002359882005899705,
"grad_norm": 30.768009185791016,
"learning_rate": 1.1764705882352942e-07,
"loss": 0.1557,
"step": 6
},
{
"epoch": 0.0027531956735496557,
"grad_norm": 7.8864569664001465,
"learning_rate": 1.3725490196078432e-07,
"loss": 0.1478,
"step": 7
},
{
"epoch": 0.003146509341199607,
"grad_norm": 10.4628267288208,
"learning_rate": 1.5686274509803921e-07,
"loss": 0.162,
"step": 8
},
{
"epoch": 0.0035398230088495575,
"grad_norm": 8.983762741088867,
"learning_rate": 1.7647058823529414e-07,
"loss": 0.1482,
"step": 9
},
{
"epoch": 0.003933136676499509,
"grad_norm": 9.961833953857422,
"learning_rate": 1.9607843137254904e-07,
"loss": 0.1851,
"step": 10
},
{
"epoch": 0.004326450344149459,
"grad_norm": 7.383552074432373,
"learning_rate": 2.1568627450980394e-07,
"loss": 0.1483,
"step": 11
},
{
"epoch": 0.00471976401179941,
"grad_norm": 10.243701934814453,
"learning_rate": 2.3529411764705883e-07,
"loss": 0.1457,
"step": 12
},
{
"epoch": 0.005113077679449361,
"grad_norm": 9.73193645477295,
"learning_rate": 2.5490196078431376e-07,
"loss": 0.1623,
"step": 13
},
{
"epoch": 0.005506391347099311,
"grad_norm": 6.044100284576416,
"learning_rate": 2.7450980392156863e-07,
"loss": 0.1346,
"step": 14
},
{
"epoch": 0.0058997050147492625,
"grad_norm": 28.241085052490234,
"learning_rate": 2.9411764705882356e-07,
"loss": 0.1583,
"step": 15
},
{
"epoch": 0.006293018682399214,
"grad_norm": 11.225924491882324,
"learning_rate": 3.1372549019607843e-07,
"loss": 0.1781,
"step": 16
},
{
"epoch": 0.006686332350049164,
"grad_norm": 9.774815559387207,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.1567,
"step": 17
},
{
"epoch": 0.007079646017699115,
"grad_norm": 10.569445610046387,
"learning_rate": 3.529411764705883e-07,
"loss": 0.1362,
"step": 18
},
{
"epoch": 0.007472959685349066,
"grad_norm": 6.202274322509766,
"learning_rate": 3.7254901960784315e-07,
"loss": 0.1493,
"step": 19
},
{
"epoch": 0.007866273352999017,
"grad_norm": 9.480630874633789,
"learning_rate": 3.921568627450981e-07,
"loss": 0.1528,
"step": 20
},
{
"epoch": 0.008259587020648967,
"grad_norm": 13.586874008178711,
"learning_rate": 4.1176470588235295e-07,
"loss": 0.1266,
"step": 21
},
{
"epoch": 0.008652900688298918,
"grad_norm": 11.455598831176758,
"learning_rate": 4.3137254901960787e-07,
"loss": 0.1225,
"step": 22
},
{
"epoch": 0.00904621435594887,
"grad_norm": 12.348589897155762,
"learning_rate": 4.509803921568628e-07,
"loss": 0.1863,
"step": 23
},
{
"epoch": 0.00943952802359882,
"grad_norm": 7.493137836456299,
"learning_rate": 4.7058823529411767e-07,
"loss": 0.1214,
"step": 24
},
{
"epoch": 0.00983284169124877,
"grad_norm": 11.203600883483887,
"learning_rate": 4.901960784313725e-07,
"loss": 0.1511,
"step": 25
},
{
"epoch": 0.010226155358898722,
"grad_norm": 10.017373085021973,
"learning_rate": 5.098039215686275e-07,
"loss": 0.1464,
"step": 26
},
{
"epoch": 0.010619469026548672,
"grad_norm": 7.930361270904541,
"learning_rate": 5.294117647058824e-07,
"loss": 0.1716,
"step": 27
},
{
"epoch": 0.011012782694198623,
"grad_norm": 6.609414577484131,
"learning_rate": 5.490196078431373e-07,
"loss": 0.1499,
"step": 28
},
{
"epoch": 0.011406096361848575,
"grad_norm": 9.198175430297852,
"learning_rate": 5.686274509803922e-07,
"loss": 0.1513,
"step": 29
},
{
"epoch": 0.011799410029498525,
"grad_norm": 7.527069091796875,
"learning_rate": 5.882352941176471e-07,
"loss": 0.1344,
"step": 30
},
{
"epoch": 0.012192723697148475,
"grad_norm": 25.97745704650879,
"learning_rate": 6.07843137254902e-07,
"loss": 0.1025,
"step": 31
},
{
"epoch": 0.012586037364798427,
"grad_norm": 6.214263916015625,
"learning_rate": 6.274509803921569e-07,
"loss": 0.1387,
"step": 32
},
{
"epoch": 0.012979351032448377,
"grad_norm": 7.101906776428223,
"learning_rate": 6.470588235294118e-07,
"loss": 0.1335,
"step": 33
},
{
"epoch": 0.013372664700098328,
"grad_norm": 7.696187496185303,
"learning_rate": 6.666666666666667e-07,
"loss": 0.1277,
"step": 34
},
{
"epoch": 0.01376597836774828,
"grad_norm": 9.324244499206543,
"learning_rate": 6.862745098039217e-07,
"loss": 0.1512,
"step": 35
},
{
"epoch": 0.01415929203539823,
"grad_norm": 3.9664223194122314,
"learning_rate": 7.058823529411766e-07,
"loss": 0.0816,
"step": 36
},
{
"epoch": 0.01455260570304818,
"grad_norm": 4.77344274520874,
"learning_rate": 7.254901960784315e-07,
"loss": 0.1036,
"step": 37
},
{
"epoch": 0.014945919370698132,
"grad_norm": 5.8425612449646,
"learning_rate": 7.450980392156863e-07,
"loss": 0.0857,
"step": 38
},
{
"epoch": 0.015339233038348082,
"grad_norm": 4.707705020904541,
"learning_rate": 7.647058823529413e-07,
"loss": 0.0905,
"step": 39
},
{
"epoch": 0.015732546705998034,
"grad_norm": 8.28884220123291,
"learning_rate": 7.843137254901962e-07,
"loss": 0.1273,
"step": 40
},
{
"epoch": 0.016125860373647983,
"grad_norm": 5.381669998168945,
"learning_rate": 8.039215686274511e-07,
"loss": 0.0938,
"step": 41
},
{
"epoch": 0.016519174041297935,
"grad_norm": 4.281416893005371,
"learning_rate": 8.235294117647059e-07,
"loss": 0.0935,
"step": 42
},
{
"epoch": 0.016912487708947887,
"grad_norm": 6.621143817901611,
"learning_rate": 8.431372549019609e-07,
"loss": 0.1002,
"step": 43
},
{
"epoch": 0.017305801376597835,
"grad_norm": 4.4914350509643555,
"learning_rate": 8.627450980392157e-07,
"loss": 0.097,
"step": 44
},
{
"epoch": 0.017699115044247787,
"grad_norm": 3.7035109996795654,
"learning_rate": 8.823529411764707e-07,
"loss": 0.0887,
"step": 45
},
{
"epoch": 0.01809242871189774,
"grad_norm": 4.306455612182617,
"learning_rate": 9.019607843137256e-07,
"loss": 0.1027,
"step": 46
},
{
"epoch": 0.018485742379547688,
"grad_norm": 5.768416881561279,
"learning_rate": 9.215686274509806e-07,
"loss": 0.1006,
"step": 47
},
{
"epoch": 0.01887905604719764,
"grad_norm": 19.471040725708008,
"learning_rate": 9.411764705882353e-07,
"loss": 0.1178,
"step": 48
},
{
"epoch": 0.019272369714847592,
"grad_norm": 6.249476432800293,
"learning_rate": 9.607843137254904e-07,
"loss": 0.1,
"step": 49
},
{
"epoch": 0.01966568338249754,
"grad_norm": 5.785927772521973,
"learning_rate": 9.80392156862745e-07,
"loss": 0.078,
"step": 50
},
{
"epoch": 0.020058997050147492,
"grad_norm": 6.312557220458984,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.1117,
"step": 51
},
{
"epoch": 0.020452310717797444,
"grad_norm": 3.5102477073669434,
"learning_rate": 1.019607843137255e-06,
"loss": 0.0913,
"step": 52
},
{
"epoch": 0.020845624385447393,
"grad_norm": 6.845943450927734,
"learning_rate": 1.03921568627451e-06,
"loss": 0.1353,
"step": 53
},
{
"epoch": 0.021238938053097345,
"grad_norm": 5.505466461181641,
"learning_rate": 1.0588235294117648e-06,
"loss": 0.0965,
"step": 54
},
{
"epoch": 0.021632251720747297,
"grad_norm": 4.362204551696777,
"learning_rate": 1.0784313725490197e-06,
"loss": 0.0844,
"step": 55
},
{
"epoch": 0.022025565388397245,
"grad_norm": 4.358127117156982,
"learning_rate": 1.0980392156862745e-06,
"loss": 0.1155,
"step": 56
},
{
"epoch": 0.022418879056047197,
"grad_norm": 7.55561637878418,
"learning_rate": 1.1176470588235296e-06,
"loss": 0.0742,
"step": 57
},
{
"epoch": 0.02281219272369715,
"grad_norm": 5.882073879241943,
"learning_rate": 1.1372549019607845e-06,
"loss": 0.1112,
"step": 58
},
{
"epoch": 0.023205506391347098,
"grad_norm": 2.456120491027832,
"learning_rate": 1.1568627450980394e-06,
"loss": 0.0605,
"step": 59
},
{
"epoch": 0.02359882005899705,
"grad_norm": 19.60419273376465,
"learning_rate": 1.1764705882352942e-06,
"loss": 0.1267,
"step": 60
},
{
"epoch": 0.023992133726647002,
"grad_norm": 3.074788808822632,
"learning_rate": 1.196078431372549e-06,
"loss": 0.0821,
"step": 61
},
{
"epoch": 0.02438544739429695,
"grad_norm": 3.561314344406128,
"learning_rate": 1.215686274509804e-06,
"loss": 0.0572,
"step": 62
},
{
"epoch": 0.024778761061946902,
"grad_norm": 13.668036460876465,
"learning_rate": 1.235294117647059e-06,
"loss": 0.1268,
"step": 63
},
{
"epoch": 0.025172074729596854,
"grad_norm": 3.8883397579193115,
"learning_rate": 1.2549019607843137e-06,
"loss": 0.0849,
"step": 64
},
{
"epoch": 0.025565388397246803,
"grad_norm": 4.154886245727539,
"learning_rate": 1.2745098039215686e-06,
"loss": 0.1071,
"step": 65
},
{
"epoch": 0.025958702064896755,
"grad_norm": 5.3974127769470215,
"learning_rate": 1.2941176470588237e-06,
"loss": 0.0749,
"step": 66
},
{
"epoch": 0.026352015732546707,
"grad_norm": 3.088780164718628,
"learning_rate": 1.3137254901960785e-06,
"loss": 0.0768,
"step": 67
},
{
"epoch": 0.026745329400196655,
"grad_norm": 3.2044262886047363,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0641,
"step": 68
},
{
"epoch": 0.027138643067846607,
"grad_norm": 5.424925327301025,
"learning_rate": 1.3529411764705883e-06,
"loss": 0.061,
"step": 69
},
{
"epoch": 0.02753195673549656,
"grad_norm": 4.061574935913086,
"learning_rate": 1.3725490196078434e-06,
"loss": 0.0851,
"step": 70
},
{
"epoch": 0.027925270403146508,
"grad_norm": 5.696750164031982,
"learning_rate": 1.3921568627450982e-06,
"loss": 0.1107,
"step": 71
},
{
"epoch": 0.02831858407079646,
"grad_norm": 4.410640716552734,
"learning_rate": 1.4117647058823531e-06,
"loss": 0.0714,
"step": 72
},
{
"epoch": 0.028711897738446412,
"grad_norm": 6.307974815368652,
"learning_rate": 1.4313725490196078e-06,
"loss": 0.0866,
"step": 73
},
{
"epoch": 0.02910521140609636,
"grad_norm": 2.53486967086792,
"learning_rate": 1.450980392156863e-06,
"loss": 0.0613,
"step": 74
},
{
"epoch": 0.029498525073746312,
"grad_norm": 6.9410881996154785,
"learning_rate": 1.4705882352941177e-06,
"loss": 0.086,
"step": 75
},
{
"epoch": 0.029891838741396264,
"grad_norm": 2.5871775150299072,
"learning_rate": 1.4901960784313726e-06,
"loss": 0.0507,
"step": 76
},
{
"epoch": 0.030285152409046213,
"grad_norm": 2.2673654556274414,
"learning_rate": 1.5098039215686275e-06,
"loss": 0.0676,
"step": 77
},
{
"epoch": 0.030678466076696165,
"grad_norm": 2.789076805114746,
"learning_rate": 1.5294117647058826e-06,
"loss": 0.0632,
"step": 78
},
{
"epoch": 0.031071779744346117,
"grad_norm": 6.127337455749512,
"learning_rate": 1.5490196078431374e-06,
"loss": 0.0498,
"step": 79
},
{
"epoch": 0.03146509341199607,
"grad_norm": 2.758253574371338,
"learning_rate": 1.5686274509803923e-06,
"loss": 0.0706,
"step": 80
},
{
"epoch": 0.03185840707964602,
"grad_norm": 6.687328815460205,
"learning_rate": 1.5882352941176472e-06,
"loss": 0.0961,
"step": 81
},
{
"epoch": 0.032251720747295966,
"grad_norm": 7.499604225158691,
"learning_rate": 1.6078431372549023e-06,
"loss": 0.0715,
"step": 82
},
{
"epoch": 0.03264503441494592,
"grad_norm": 6.008899211883545,
"learning_rate": 1.6274509803921571e-06,
"loss": 0.123,
"step": 83
},
{
"epoch": 0.03303834808259587,
"grad_norm": 4.841026306152344,
"learning_rate": 1.6470588235294118e-06,
"loss": 0.0647,
"step": 84
},
{
"epoch": 0.03343166175024582,
"grad_norm": 3.0710766315460205,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0372,
"step": 85
},
{
"epoch": 0.033824975417895774,
"grad_norm": 3.3783321380615234,
"learning_rate": 1.6862745098039217e-06,
"loss": 0.0843,
"step": 86
},
{
"epoch": 0.03421828908554572,
"grad_norm": 2.6547350883483887,
"learning_rate": 1.7058823529411766e-06,
"loss": 0.0589,
"step": 87
},
{
"epoch": 0.03461160275319567,
"grad_norm": 3.6741859912872314,
"learning_rate": 1.7254901960784315e-06,
"loss": 0.0308,
"step": 88
},
{
"epoch": 0.035004916420845626,
"grad_norm": 3.555490493774414,
"learning_rate": 1.7450980392156864e-06,
"loss": 0.0497,
"step": 89
},
{
"epoch": 0.035398230088495575,
"grad_norm": 3.1174697875976562,
"learning_rate": 1.7647058823529414e-06,
"loss": 0.063,
"step": 90
},
{
"epoch": 0.03579154375614552,
"grad_norm": 4.790848255157471,
"learning_rate": 1.7843137254901963e-06,
"loss": 0.0834,
"step": 91
},
{
"epoch": 0.03618485742379548,
"grad_norm": 3.2931265830993652,
"learning_rate": 1.8039215686274512e-06,
"loss": 0.0531,
"step": 92
},
{
"epoch": 0.03657817109144543,
"grad_norm": 13.777477264404297,
"learning_rate": 1.8235294117647058e-06,
"loss": 0.0786,
"step": 93
},
{
"epoch": 0.036971484759095376,
"grad_norm": 4.943524360656738,
"learning_rate": 1.8431372549019611e-06,
"loss": 0.0602,
"step": 94
},
{
"epoch": 0.03736479842674533,
"grad_norm": 6.189723014831543,
"learning_rate": 1.8627450980392158e-06,
"loss": 0.0697,
"step": 95
},
{
"epoch": 0.03775811209439528,
"grad_norm": 3.5542352199554443,
"learning_rate": 1.8823529411764707e-06,
"loss": 0.0863,
"step": 96
},
{
"epoch": 0.03815142576204523,
"grad_norm": 5.407109260559082,
"learning_rate": 1.9019607843137255e-06,
"loss": 0.088,
"step": 97
},
{
"epoch": 0.038544739429695184,
"grad_norm": 3.3334732055664062,
"learning_rate": 1.921568627450981e-06,
"loss": 0.0889,
"step": 98
},
{
"epoch": 0.03893805309734513,
"grad_norm": 2.48398756980896,
"learning_rate": 1.9411764705882353e-06,
"loss": 0.0483,
"step": 99
},
{
"epoch": 0.03933136676499508,
"grad_norm": 2.3380913734436035,
"learning_rate": 1.96078431372549e-06,
"loss": 0.0707,
"step": 100
},
{
"epoch": 0.039724680432645036,
"grad_norm": 4.355076789855957,
"learning_rate": 1.980392156862745e-06,
"loss": 0.0639,
"step": 101
},
{
"epoch": 0.040117994100294985,
"grad_norm": 4.081620693206787,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.06,
"step": 102
},
{
"epoch": 0.04051130776794493,
"grad_norm": 4.437114715576172,
"learning_rate": 2.019607843137255e-06,
"loss": 0.1017,
"step": 103
},
{
"epoch": 0.04090462143559489,
"grad_norm": 4.925793647766113,
"learning_rate": 2.03921568627451e-06,
"loss": 0.0934,
"step": 104
},
{
"epoch": 0.04129793510324484,
"grad_norm": 2.085400104522705,
"learning_rate": 2.058823529411765e-06,
"loss": 0.058,
"step": 105
},
{
"epoch": 0.041691248770894786,
"grad_norm": 2.8664395809173584,
"learning_rate": 2.07843137254902e-06,
"loss": 0.0709,
"step": 106
},
{
"epoch": 0.04208456243854474,
"grad_norm": 1.7521601915359497,
"learning_rate": 2.0980392156862747e-06,
"loss": 0.031,
"step": 107
},
{
"epoch": 0.04247787610619469,
"grad_norm": 3.7575159072875977,
"learning_rate": 2.1176470588235296e-06,
"loss": 0.0777,
"step": 108
},
{
"epoch": 0.04287118977384464,
"grad_norm": 4.240278720855713,
"learning_rate": 2.1372549019607844e-06,
"loss": 0.0965,
"step": 109
},
{
"epoch": 0.043264503441494594,
"grad_norm": 3.841932773590088,
"learning_rate": 2.1568627450980393e-06,
"loss": 0.0844,
"step": 110
},
{
"epoch": 0.04365781710914454,
"grad_norm": 4.4334397315979,
"learning_rate": 2.176470588235294e-06,
"loss": 0.0956,
"step": 111
},
{
"epoch": 0.04405113077679449,
"grad_norm": 4.255678653717041,
"learning_rate": 2.196078431372549e-06,
"loss": 0.0855,
"step": 112
},
{
"epoch": 0.044444444444444446,
"grad_norm": 2.3486170768737793,
"learning_rate": 2.215686274509804e-06,
"loss": 0.0417,
"step": 113
},
{
"epoch": 0.044837758112094395,
"grad_norm": 2.222768783569336,
"learning_rate": 2.2352941176470592e-06,
"loss": 0.0556,
"step": 114
},
{
"epoch": 0.04523107177974434,
"grad_norm": 2.750119686126709,
"learning_rate": 2.254901960784314e-06,
"loss": 0.0481,
"step": 115
},
{
"epoch": 0.0456243854473943,
"grad_norm": 4.375302314758301,
"learning_rate": 2.274509803921569e-06,
"loss": 0.098,
"step": 116
},
{
"epoch": 0.04601769911504425,
"grad_norm": 3.7654221057891846,
"learning_rate": 2.2941176470588234e-06,
"loss": 0.1025,
"step": 117
},
{
"epoch": 0.046411012782694196,
"grad_norm": 2.422442674636841,
"learning_rate": 2.3137254901960787e-06,
"loss": 0.0675,
"step": 118
},
{
"epoch": 0.04680432645034415,
"grad_norm": 3.3458054065704346,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.067,
"step": 119
},
{
"epoch": 0.0471976401179941,
"grad_norm": 2.7424211502075195,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.0774,
"step": 120
},
{
"epoch": 0.04759095378564405,
"grad_norm": 3.4825127124786377,
"learning_rate": 2.3725490196078433e-06,
"loss": 0.086,
"step": 121
},
{
"epoch": 0.047984267453294004,
"grad_norm": 55.36836242675781,
"learning_rate": 2.392156862745098e-06,
"loss": 0.0938,
"step": 122
},
{
"epoch": 0.04837758112094395,
"grad_norm": 2.256223201751709,
"learning_rate": 2.411764705882353e-06,
"loss": 0.0673,
"step": 123
},
{
"epoch": 0.0487708947885939,
"grad_norm": 3.8095710277557373,
"learning_rate": 2.431372549019608e-06,
"loss": 0.0728,
"step": 124
},
{
"epoch": 0.049164208456243856,
"grad_norm": 1.8562949895858765,
"learning_rate": 2.450980392156863e-06,
"loss": 0.0629,
"step": 125
},
{
"epoch": 0.049557522123893805,
"grad_norm": 4.999472618103027,
"learning_rate": 2.470588235294118e-06,
"loss": 0.059,
"step": 126
},
{
"epoch": 0.04995083579154375,
"grad_norm": 3.9088096618652344,
"learning_rate": 2.490196078431373e-06,
"loss": 0.0662,
"step": 127
},
{
"epoch": 0.05034414945919371,
"grad_norm": 4.975748062133789,
"learning_rate": 2.5098039215686274e-06,
"loss": 0.0688,
"step": 128
},
{
"epoch": 0.05073746312684366,
"grad_norm": 2.183948516845703,
"learning_rate": 2.5294117647058823e-06,
"loss": 0.0477,
"step": 129
},
{
"epoch": 0.051130776794493606,
"grad_norm": 4.890422821044922,
"learning_rate": 2.549019607843137e-06,
"loss": 0.0793,
"step": 130
},
{
"epoch": 0.05152409046214356,
"grad_norm": 4.04612398147583,
"learning_rate": 2.568627450980392e-06,
"loss": 0.0705,
"step": 131
},
{
"epoch": 0.05191740412979351,
"grad_norm": 2.8650074005126953,
"learning_rate": 2.5882352941176473e-06,
"loss": 0.0777,
"step": 132
},
{
"epoch": 0.05231071779744346,
"grad_norm": 3.9029088020324707,
"learning_rate": 2.6078431372549022e-06,
"loss": 0.0766,
"step": 133
},
{
"epoch": 0.052704031465093414,
"grad_norm": 2.4210422039031982,
"learning_rate": 2.627450980392157e-06,
"loss": 0.0663,
"step": 134
},
{
"epoch": 0.05309734513274336,
"grad_norm": 3.0176892280578613,
"learning_rate": 2.647058823529412e-06,
"loss": 0.0703,
"step": 135
},
{
"epoch": 0.05349065880039331,
"grad_norm": 13.886055946350098,
"learning_rate": 2.666666666666667e-06,
"loss": 0.064,
"step": 136
},
{
"epoch": 0.053883972468043266,
"grad_norm": 2.40460205078125,
"learning_rate": 2.6862745098039217e-06,
"loss": 0.0492,
"step": 137
},
{
"epoch": 0.054277286135693215,
"grad_norm": 3.829288959503174,
"learning_rate": 2.7058823529411766e-06,
"loss": 0.0564,
"step": 138
},
{
"epoch": 0.05467059980334316,
"grad_norm": 2.2005629539489746,
"learning_rate": 2.7254901960784314e-06,
"loss": 0.0483,
"step": 139
},
{
"epoch": 0.05506391347099312,
"grad_norm": 14.79651927947998,
"learning_rate": 2.7450980392156867e-06,
"loss": 0.0937,
"step": 140
},
{
"epoch": 0.05545722713864307,
"grad_norm": 1.6898876428604126,
"learning_rate": 2.7647058823529416e-06,
"loss": 0.0693,
"step": 141
},
{
"epoch": 0.055850540806293016,
"grad_norm": 3.5447332859039307,
"learning_rate": 2.7843137254901965e-06,
"loss": 0.1311,
"step": 142
},
{
"epoch": 0.05624385447394297,
"grad_norm": 2.291607618331909,
"learning_rate": 2.8039215686274514e-06,
"loss": 0.061,
"step": 143
},
{
"epoch": 0.05663716814159292,
"grad_norm": 4.079521656036377,
"learning_rate": 2.8235294117647062e-06,
"loss": 0.1169,
"step": 144
},
{
"epoch": 0.05703048180924287,
"grad_norm": 5.1168012619018555,
"learning_rate": 2.843137254901961e-06,
"loss": 0.0436,
"step": 145
},
{
"epoch": 0.057423795476892824,
"grad_norm": 4.056823253631592,
"learning_rate": 2.8627450980392155e-06,
"loss": 0.09,
"step": 146
},
{
"epoch": 0.05781710914454277,
"grad_norm": 2.1756484508514404,
"learning_rate": 2.8823529411764704e-06,
"loss": 0.0747,
"step": 147
},
{
"epoch": 0.05821042281219272,
"grad_norm": 2.8064467906951904,
"learning_rate": 2.901960784313726e-06,
"loss": 0.0261,
"step": 148
},
{
"epoch": 0.058603736479842676,
"grad_norm": 2.9834907054901123,
"learning_rate": 2.9215686274509806e-06,
"loss": 0.0735,
"step": 149
},
{
"epoch": 0.058997050147492625,
"grad_norm": 15.821993827819824,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.0835,
"step": 150
},
{
"epoch": 0.05939036381514257,
"grad_norm": 6.1172709465026855,
"learning_rate": 2.9607843137254903e-06,
"loss": 0.0621,
"step": 151
},
{
"epoch": 0.05978367748279253,
"grad_norm": 3.961477041244507,
"learning_rate": 2.980392156862745e-06,
"loss": 0.0777,
"step": 152
},
{
"epoch": 0.06017699115044248,
"grad_norm": 3.682879686355591,
"learning_rate": 3e-06,
"loss": 0.0836,
"step": 153
},
{
"epoch": 0.060570304818092426,
"grad_norm": 1.2253718376159668,
"learning_rate": 3.019607843137255e-06,
"loss": 0.0255,
"step": 154
},
{
"epoch": 0.06096361848574238,
"grad_norm": 2.107466220855713,
"learning_rate": 3.03921568627451e-06,
"loss": 0.0698,
"step": 155
},
{
"epoch": 0.06135693215339233,
"grad_norm": 2.720797061920166,
"learning_rate": 3.058823529411765e-06,
"loss": 0.0683,
"step": 156
},
{
"epoch": 0.06175024582104228,
"grad_norm": 2.0135252475738525,
"learning_rate": 3.07843137254902e-06,
"loss": 0.0594,
"step": 157
},
{
"epoch": 0.062143559488692234,
"grad_norm": 2.011382579803467,
"learning_rate": 3.098039215686275e-06,
"loss": 0.0643,
"step": 158
},
{
"epoch": 0.06253687315634218,
"grad_norm": 3.047201156616211,
"learning_rate": 3.1176470588235297e-06,
"loss": 0.0564,
"step": 159
},
{
"epoch": 0.06293018682399214,
"grad_norm": 2.3302555084228516,
"learning_rate": 3.1372549019607846e-06,
"loss": 0.0404,
"step": 160
},
{
"epoch": 0.06332350049164208,
"grad_norm": 2.7288010120391846,
"learning_rate": 3.1568627450980395e-06,
"loss": 0.1009,
"step": 161
},
{
"epoch": 0.06371681415929203,
"grad_norm": 2.852647304534912,
"learning_rate": 3.1764705882352943e-06,
"loss": 0.0508,
"step": 162
},
{
"epoch": 0.06411012782694199,
"grad_norm": 2.101698637008667,
"learning_rate": 3.1960784313725492e-06,
"loss": 0.0814,
"step": 163
},
{
"epoch": 0.06450344149459193,
"grad_norm": 2.864086151123047,
"learning_rate": 3.2156862745098045e-06,
"loss": 0.0543,
"step": 164
},
{
"epoch": 0.06489675516224189,
"grad_norm": 2.587751865386963,
"learning_rate": 3.2352941176470594e-06,
"loss": 0.0753,
"step": 165
},
{
"epoch": 0.06529006882989184,
"grad_norm": 1.5767340660095215,
"learning_rate": 3.2549019607843143e-06,
"loss": 0.0399,
"step": 166
},
{
"epoch": 0.06568338249754178,
"grad_norm": 3.7279415130615234,
"learning_rate": 3.2745098039215687e-06,
"loss": 0.0804,
"step": 167
},
{
"epoch": 0.06607669616519174,
"grad_norm": 2.9727795124053955,
"learning_rate": 3.2941176470588236e-06,
"loss": 0.0548,
"step": 168
},
{
"epoch": 0.0664700098328417,
"grad_norm": 2.0582468509674072,
"learning_rate": 3.3137254901960785e-06,
"loss": 0.0656,
"step": 169
},
{
"epoch": 0.06686332350049164,
"grad_norm": 7.246119499206543,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0499,
"step": 170
},
{
"epoch": 0.06725663716814159,
"grad_norm": 70.4866714477539,
"learning_rate": 3.352941176470588e-06,
"loss": 0.0764,
"step": 171
},
{
"epoch": 0.06764995083579155,
"grad_norm": 1.8262776136398315,
"learning_rate": 3.3725490196078435e-06,
"loss": 0.0497,
"step": 172
},
{
"epoch": 0.06804326450344149,
"grad_norm": 2.6392412185668945,
"learning_rate": 3.3921568627450984e-06,
"loss": 0.072,
"step": 173
},
{
"epoch": 0.06843657817109144,
"grad_norm": 1.2957279682159424,
"learning_rate": 3.4117647058823532e-06,
"loss": 0.0749,
"step": 174
},
{
"epoch": 0.0688298918387414,
"grad_norm": 1.5801424980163574,
"learning_rate": 3.431372549019608e-06,
"loss": 0.0504,
"step": 175
},
{
"epoch": 0.06922320550639134,
"grad_norm": 1.6194735765457153,
"learning_rate": 3.450980392156863e-06,
"loss": 0.0396,
"step": 176
},
{
"epoch": 0.0696165191740413,
"grad_norm": 3.31343674659729,
"learning_rate": 3.470588235294118e-06,
"loss": 0.0624,
"step": 177
},
{
"epoch": 0.07000983284169125,
"grad_norm": 2.1785762310028076,
"learning_rate": 3.4901960784313727e-06,
"loss": 0.0548,
"step": 178
},
{
"epoch": 0.0704031465093412,
"grad_norm": 1.3683737516403198,
"learning_rate": 3.5098039215686276e-06,
"loss": 0.0274,
"step": 179
},
{
"epoch": 0.07079646017699115,
"grad_norm": 3.2981035709381104,
"learning_rate": 3.529411764705883e-06,
"loss": 0.0816,
"step": 180
},
{
"epoch": 0.0711897738446411,
"grad_norm": 2.3660190105438232,
"learning_rate": 3.5490196078431378e-06,
"loss": 0.0445,
"step": 181
},
{
"epoch": 0.07158308751229105,
"grad_norm": 3.4103376865386963,
"learning_rate": 3.5686274509803926e-06,
"loss": 0.0959,
"step": 182
},
{
"epoch": 0.071976401179941,
"grad_norm": 2.7939486503601074,
"learning_rate": 3.5882352941176475e-06,
"loss": 0.096,
"step": 183
},
{
"epoch": 0.07236971484759096,
"grad_norm": 2.009209632873535,
"learning_rate": 3.6078431372549024e-06,
"loss": 0.0548,
"step": 184
},
{
"epoch": 0.0727630285152409,
"grad_norm": 1.9003010988235474,
"learning_rate": 3.6274509803921573e-06,
"loss": 0.058,
"step": 185
},
{
"epoch": 0.07315634218289085,
"grad_norm": 2.788331985473633,
"learning_rate": 3.6470588235294117e-06,
"loss": 0.0828,
"step": 186
},
{
"epoch": 0.07354965585054081,
"grad_norm": 2.2508130073547363,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.089,
"step": 187
},
{
"epoch": 0.07394296951819075,
"grad_norm": 14.532478332519531,
"learning_rate": 3.6862745098039223e-06,
"loss": 0.0878,
"step": 188
},
{
"epoch": 0.0743362831858407,
"grad_norm": 1.3768811225891113,
"learning_rate": 3.7058823529411767e-06,
"loss": 0.0534,
"step": 189
},
{
"epoch": 0.07472959685349066,
"grad_norm": 2.9948389530181885,
"learning_rate": 3.7254901960784316e-06,
"loss": 0.0704,
"step": 190
},
{
"epoch": 0.0751229105211406,
"grad_norm": 1.4626399278640747,
"learning_rate": 3.7450980392156865e-06,
"loss": 0.0306,
"step": 191
},
{
"epoch": 0.07551622418879056,
"grad_norm": 3.062840700149536,
"learning_rate": 3.7647058823529414e-06,
"loss": 0.0802,
"step": 192
},
{
"epoch": 0.07590953785644051,
"grad_norm": 5.729097843170166,
"learning_rate": 3.7843137254901962e-06,
"loss": 0.1013,
"step": 193
},
{
"epoch": 0.07630285152409046,
"grad_norm": 1.8716782331466675,
"learning_rate": 3.803921568627451e-06,
"loss": 0.0664,
"step": 194
},
{
"epoch": 0.07669616519174041,
"grad_norm": 2.058469533920288,
"learning_rate": 3.8235294117647055e-06,
"loss": 0.0683,
"step": 195
},
{
"epoch": 0.07708947885939037,
"grad_norm": 12.551715850830078,
"learning_rate": 3.843137254901962e-06,
"loss": 0.09,
"step": 196
},
{
"epoch": 0.07748279252704031,
"grad_norm": 2.2984426021575928,
"learning_rate": 3.862745098039216e-06,
"loss": 0.0672,
"step": 197
},
{
"epoch": 0.07787610619469026,
"grad_norm": 4.480764865875244,
"learning_rate": 3.882352941176471e-06,
"loss": 0.051,
"step": 198
},
{
"epoch": 0.07826941986234022,
"grad_norm": 1.4032012224197388,
"learning_rate": 3.901960784313726e-06,
"loss": 0.0289,
"step": 199
},
{
"epoch": 0.07866273352999016,
"grad_norm": 3.133589029312134,
"learning_rate": 3.92156862745098e-06,
"loss": 0.0807,
"step": 200
},
{
"epoch": 0.07905604719764012,
"grad_norm": 4.1782307624816895,
"learning_rate": 3.941176470588236e-06,
"loss": 0.0683,
"step": 201
},
{
"epoch": 0.07944936086529007,
"grad_norm": 11.163358688354492,
"learning_rate": 3.96078431372549e-06,
"loss": 0.0421,
"step": 202
},
{
"epoch": 0.07984267453294001,
"grad_norm": 1.3736735582351685,
"learning_rate": 3.980392156862745e-06,
"loss": 0.0339,
"step": 203
},
{
"epoch": 0.08023598820058997,
"grad_norm": 6.474332332611084,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0606,
"step": 204
},
{
"epoch": 0.08062930186823992,
"grad_norm": 2.8827829360961914,
"learning_rate": 4.019607843137255e-06,
"loss": 0.1104,
"step": 205
},
{
"epoch": 0.08102261553588987,
"grad_norm": 1.8476606607437134,
"learning_rate": 4.03921568627451e-06,
"loss": 0.0479,
"step": 206
},
{
"epoch": 0.08141592920353982,
"grad_norm": 3.2202746868133545,
"learning_rate": 4.058823529411765e-06,
"loss": 0.088,
"step": 207
},
{
"epoch": 0.08180924287118978,
"grad_norm": 3.4121432304382324,
"learning_rate": 4.07843137254902e-06,
"loss": 0.1051,
"step": 208
},
{
"epoch": 0.08220255653883972,
"grad_norm": 2.4771883487701416,
"learning_rate": 4.098039215686275e-06,
"loss": 0.0477,
"step": 209
},
{
"epoch": 0.08259587020648967,
"grad_norm": 2.9881558418273926,
"learning_rate": 4.11764705882353e-06,
"loss": 0.0472,
"step": 210
},
{
"epoch": 0.08298918387413963,
"grad_norm": 2.8722712993621826,
"learning_rate": 4.137254901960784e-06,
"loss": 0.0856,
"step": 211
},
{
"epoch": 0.08338249754178957,
"grad_norm": 1.9073129892349243,
"learning_rate": 4.15686274509804e-06,
"loss": 0.0542,
"step": 212
},
{
"epoch": 0.08377581120943953,
"grad_norm": 3.5067648887634277,
"learning_rate": 4.176470588235295e-06,
"loss": 0.0567,
"step": 213
},
{
"epoch": 0.08416912487708948,
"grad_norm": 2.5827410221099854,
"learning_rate": 4.196078431372549e-06,
"loss": 0.1062,
"step": 214
},
{
"epoch": 0.08456243854473942,
"grad_norm": 1.8257296085357666,
"learning_rate": 4.215686274509805e-06,
"loss": 0.0821,
"step": 215
},
{
"epoch": 0.08495575221238938,
"grad_norm": 3.9571404457092285,
"learning_rate": 4.235294117647059e-06,
"loss": 0.1143,
"step": 216
},
{
"epoch": 0.08534906588003933,
"grad_norm": 2.6589484214782715,
"learning_rate": 4.254901960784314e-06,
"loss": 0.0814,
"step": 217
},
{
"epoch": 0.08574237954768928,
"grad_norm": 0.915239155292511,
"learning_rate": 4.274509803921569e-06,
"loss": 0.0355,
"step": 218
},
{
"epoch": 0.08613569321533923,
"grad_norm": 2.9066381454467773,
"learning_rate": 4.294117647058823e-06,
"loss": 0.0783,
"step": 219
},
{
"epoch": 0.08652900688298919,
"grad_norm": 1.581722378730774,
"learning_rate": 4.313725490196079e-06,
"loss": 0.0589,
"step": 220
},
{
"epoch": 0.08692232055063913,
"grad_norm": 2.2173354625701904,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0791,
"step": 221
},
{
"epoch": 0.08731563421828908,
"grad_norm": 1.784740686416626,
"learning_rate": 4.352941176470588e-06,
"loss": 0.0616,
"step": 222
},
{
"epoch": 0.08770894788593904,
"grad_norm": 1.9993363618850708,
"learning_rate": 4.372549019607844e-06,
"loss": 0.0864,
"step": 223
},
{
"epoch": 0.08810226155358898,
"grad_norm": 4.089532375335693,
"learning_rate": 4.392156862745098e-06,
"loss": 0.0982,
"step": 224
},
{
"epoch": 0.08849557522123894,
"grad_norm": 2.5914440155029297,
"learning_rate": 4.411764705882353e-06,
"loss": 0.0702,
"step": 225
},
{
"epoch": 0.08888888888888889,
"grad_norm": 2.555253028869629,
"learning_rate": 4.431372549019608e-06,
"loss": 0.0831,
"step": 226
},
{
"epoch": 0.08928220255653883,
"grad_norm": 2.2960548400878906,
"learning_rate": 4.450980392156863e-06,
"loss": 0.0641,
"step": 227
},
{
"epoch": 0.08967551622418879,
"grad_norm": 1.402106761932373,
"learning_rate": 4.4705882352941184e-06,
"loss": 0.0594,
"step": 228
},
{
"epoch": 0.09006882989183874,
"grad_norm": 3.1225955486297607,
"learning_rate": 4.490196078431373e-06,
"loss": 0.1042,
"step": 229
},
{
"epoch": 0.09046214355948869,
"grad_norm": 1.7568937540054321,
"learning_rate": 4.509803921568628e-06,
"loss": 0.0689,
"step": 230
},
{
"epoch": 0.09085545722713864,
"grad_norm": 2.8846213817596436,
"learning_rate": 4.529411764705883e-06,
"loss": 0.0955,
"step": 231
},
{
"epoch": 0.0912487708947886,
"grad_norm": 4.436802387237549,
"learning_rate": 4.549019607843138e-06,
"loss": 0.1668,
"step": 232
},
{
"epoch": 0.09164208456243854,
"grad_norm": 2.784074068069458,
"learning_rate": 4.568627450980392e-06,
"loss": 0.083,
"step": 233
},
{
"epoch": 0.0920353982300885,
"grad_norm": 2.276759147644043,
"learning_rate": 4.588235294117647e-06,
"loss": 0.0725,
"step": 234
},
{
"epoch": 0.09242871189773845,
"grad_norm": 2.5278875827789307,
"learning_rate": 4.607843137254902e-06,
"loss": 0.0744,
"step": 235
},
{
"epoch": 0.09282202556538839,
"grad_norm": 1.711602807044983,
"learning_rate": 4.627450980392157e-06,
"loss": 0.0749,
"step": 236
},
{
"epoch": 0.09321533923303835,
"grad_norm": 1.4517807960510254,
"learning_rate": 4.647058823529412e-06,
"loss": 0.0587,
"step": 237
},
{
"epoch": 0.0936086529006883,
"grad_norm": 1.090840220451355,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0719,
"step": 238
},
{
"epoch": 0.09400196656833824,
"grad_norm": 1.8589414358139038,
"learning_rate": 4.686274509803922e-06,
"loss": 0.0563,
"step": 239
},
{
"epoch": 0.0943952802359882,
"grad_norm": 2.264702081680298,
"learning_rate": 4.705882352941177e-06,
"loss": 0.0648,
"step": 240
},
{
"epoch": 0.09478859390363815,
"grad_norm": 1.4464210271835327,
"learning_rate": 4.725490196078431e-06,
"loss": 0.0238,
"step": 241
},
{
"epoch": 0.0951819075712881,
"grad_norm": 1.9937217235565186,
"learning_rate": 4.745098039215687e-06,
"loss": 0.0493,
"step": 242
},
{
"epoch": 0.09557522123893805,
"grad_norm": 2.2047340869903564,
"learning_rate": 4.764705882352941e-06,
"loss": 0.091,
"step": 243
},
{
"epoch": 0.09596853490658801,
"grad_norm": 4.057810306549072,
"learning_rate": 4.784313725490196e-06,
"loss": 0.0938,
"step": 244
},
{
"epoch": 0.09636184857423795,
"grad_norm": 1.6187644004821777,
"learning_rate": 4.803921568627452e-06,
"loss": 0.0673,
"step": 245
},
{
"epoch": 0.0967551622418879,
"grad_norm": 2.7249605655670166,
"learning_rate": 4.823529411764706e-06,
"loss": 0.0848,
"step": 246
},
{
"epoch": 0.09714847590953786,
"grad_norm": 1.7594577074050903,
"learning_rate": 4.8431372549019614e-06,
"loss": 0.0594,
"step": 247
},
{
"epoch": 0.0975417895771878,
"grad_norm": 2.6266980171203613,
"learning_rate": 4.862745098039216e-06,
"loss": 0.0866,
"step": 248
},
{
"epoch": 0.09793510324483776,
"grad_norm": 3.3526737689971924,
"learning_rate": 4.882352941176471e-06,
"loss": 0.1115,
"step": 249
},
{
"epoch": 0.09832841691248771,
"grad_norm": 2.7514872550964355,
"learning_rate": 4.901960784313726e-06,
"loss": 0.0694,
"step": 250
},
{
"epoch": 0.09872173058013765,
"grad_norm": 2.44143009185791,
"learning_rate": 4.921568627450981e-06,
"loss": 0.0715,
"step": 251
},
{
"epoch": 0.09911504424778761,
"grad_norm": 2.214268207550049,
"learning_rate": 4.941176470588236e-06,
"loss": 0.0576,
"step": 252
},
{
"epoch": 0.09950835791543756,
"grad_norm": 1.7012481689453125,
"learning_rate": 4.960784313725491e-06,
"loss": 0.0754,
"step": 253
},
{
"epoch": 0.0999016715830875,
"grad_norm": 1.8335487842559814,
"learning_rate": 4.980392156862746e-06,
"loss": 0.0617,
"step": 254
},
{
"epoch": 0.10029498525073746,
"grad_norm": 2.3848774433135986,
"learning_rate": 5e-06,
"loss": 0.1011,
"step": 255
},
{
"epoch": 0.10068829891838742,
"grad_norm": 2.1847634315490723,
"learning_rate": 4.999997641274725e-06,
"loss": 0.0793,
"step": 256
},
{
"epoch": 0.10108161258603736,
"grad_norm": 1.5467146635055542,
"learning_rate": 4.999990565103349e-06,
"loss": 0.0685,
"step": 257
},
{
"epoch": 0.10147492625368731,
"grad_norm": 1.5211800336837769,
"learning_rate": 4.999978771499224e-06,
"loss": 0.0453,
"step": 258
},
{
"epoch": 0.10186823992133727,
"grad_norm": 1.944356918334961,
"learning_rate": 4.999962260484607e-06,
"loss": 0.0726,
"step": 259
},
{
"epoch": 0.10226155358898721,
"grad_norm": 2.206536054611206,
"learning_rate": 4.999941032090652e-06,
"loss": 0.0963,
"step": 260
},
{
"epoch": 0.10265486725663717,
"grad_norm": 0.9998722076416016,
"learning_rate": 4.999915086357417e-06,
"loss": 0.0425,
"step": 261
},
{
"epoch": 0.10304818092428712,
"grad_norm": 2.102257013320923,
"learning_rate": 4.99988442333386e-06,
"loss": 0.0857,
"step": 262
},
{
"epoch": 0.10344149459193706,
"grad_norm": 2.055304765701294,
"learning_rate": 4.999849043077843e-06,
"loss": 0.058,
"step": 263
},
{
"epoch": 0.10383480825958702,
"grad_norm": 2.11883544921875,
"learning_rate": 4.999808945656128e-06,
"loss": 0.1135,
"step": 264
},
{
"epoch": 0.10422812192723697,
"grad_norm": 1.4651076793670654,
"learning_rate": 4.999764131144377e-06,
"loss": 0.0609,
"step": 265
},
{
"epoch": 0.10462143559488692,
"grad_norm": 1.3278563022613525,
"learning_rate": 4.999714599627155e-06,
"loss": 0.0506,
"step": 266
},
{
"epoch": 0.10501474926253687,
"grad_norm": 3.376959800720215,
"learning_rate": 4.999660351197926e-06,
"loss": 0.0505,
"step": 267
},
{
"epoch": 0.10540806293018683,
"grad_norm": 14.901459693908691,
"learning_rate": 4.999601385959056e-06,
"loss": 0.0717,
"step": 268
},
{
"epoch": 0.10580137659783677,
"grad_norm": 1.7644176483154297,
"learning_rate": 4.999537704021812e-06,
"loss": 0.1109,
"step": 269
},
{
"epoch": 0.10619469026548672,
"grad_norm": 1.3101154565811157,
"learning_rate": 4.99946930550636e-06,
"loss": 0.0433,
"step": 270
},
{
"epoch": 0.10658800393313668,
"grad_norm": 3.403160572052002,
"learning_rate": 4.999396190541766e-06,
"loss": 0.1082,
"step": 271
},
{
"epoch": 0.10698131760078662,
"grad_norm": 2.1354033946990967,
"learning_rate": 4.999318359265998e-06,
"loss": 0.0698,
"step": 272
},
{
"epoch": 0.10737463126843658,
"grad_norm": 1.1540406942367554,
"learning_rate": 4.999235811825921e-06,
"loss": 0.0857,
"step": 273
},
{
"epoch": 0.10776794493608653,
"grad_norm": 1.4908989667892456,
"learning_rate": 4.9991485483773e-06,
"loss": 0.0627,
"step": 274
},
{
"epoch": 0.10816125860373647,
"grad_norm": 1.5307058095932007,
"learning_rate": 4.999056569084801e-06,
"loss": 0.0555,
"step": 275
},
{
"epoch": 0.10855457227138643,
"grad_norm": 2.4000704288482666,
"learning_rate": 4.998959874121986e-06,
"loss": 0.068,
"step": 276
},
{
"epoch": 0.10894788593903638,
"grad_norm": 1.2169445753097534,
"learning_rate": 4.998858463671316e-06,
"loss": 0.0716,
"step": 277
},
{
"epoch": 0.10934119960668633,
"grad_norm": 1.496738076210022,
"learning_rate": 4.998752337924152e-06,
"loss": 0.063,
"step": 278
},
{
"epoch": 0.10973451327433628,
"grad_norm": 1.3070656061172485,
"learning_rate": 4.998641497080749e-06,
"loss": 0.0444,
"step": 279
},
{
"epoch": 0.11012782694198624,
"grad_norm": 3.1283788681030273,
"learning_rate": 4.998525941350264e-06,
"loss": 0.1097,
"step": 280
},
{
"epoch": 0.11052114060963618,
"grad_norm": 2.3517940044403076,
"learning_rate": 4.998405670950747e-06,
"loss": 0.0778,
"step": 281
},
{
"epoch": 0.11091445427728613,
"grad_norm": 1.4366756677627563,
"learning_rate": 4.998280686109146e-06,
"loss": 0.0645,
"step": 282
},
{
"epoch": 0.11130776794493609,
"grad_norm": 1.5536798238754272,
"learning_rate": 4.998150987061304e-06,
"loss": 0.0483,
"step": 283
},
{
"epoch": 0.11170108161258603,
"grad_norm": 2.191906690597534,
"learning_rate": 4.9980165740519625e-06,
"loss": 0.061,
"step": 284
},
{
"epoch": 0.11209439528023599,
"grad_norm": 2.2331135272979736,
"learning_rate": 4.997877447334754e-06,
"loss": 0.073,
"step": 285
},
{
"epoch": 0.11248770894788594,
"grad_norm": 2.7030222415924072,
"learning_rate": 4.99773360717221e-06,
"loss": 0.0924,
"step": 286
},
{
"epoch": 0.11288102261553588,
"grad_norm": 1.2399053573608398,
"learning_rate": 4.997585053835754e-06,
"loss": 0.0603,
"step": 287
},
{
"epoch": 0.11327433628318584,
"grad_norm": 1.5186935663223267,
"learning_rate": 4.997431787605701e-06,
"loss": 0.0733,
"step": 288
},
{
"epoch": 0.1136676499508358,
"grad_norm": 5.53955078125,
"learning_rate": 4.997273808771263e-06,
"loss": 0.0735,
"step": 289
},
{
"epoch": 0.11406096361848574,
"grad_norm": 1.861646294593811,
"learning_rate": 4.997111117630543e-06,
"loss": 0.0365,
"step": 290
},
{
"epoch": 0.11445427728613569,
"grad_norm": 1.5158923864364624,
"learning_rate": 4.996943714490535e-06,
"loss": 0.0598,
"step": 291
},
{
"epoch": 0.11484759095378565,
"grad_norm": 3.7808361053466797,
"learning_rate": 4.996771599667126e-06,
"loss": 0.09,
"step": 292
},
{
"epoch": 0.11524090462143559,
"grad_norm": 1.3470269441604614,
"learning_rate": 4.996594773485093e-06,
"loss": 0.0304,
"step": 293
},
{
"epoch": 0.11563421828908554,
"grad_norm": 2.0843825340270996,
"learning_rate": 4.996413236278104e-06,
"loss": 0.0556,
"step": 294
},
{
"epoch": 0.1160275319567355,
"grad_norm": 1.6657154560089111,
"learning_rate": 4.996226988388716e-06,
"loss": 0.0628,
"step": 295
},
{
"epoch": 0.11642084562438544,
"grad_norm": 1.9300707578659058,
"learning_rate": 4.9960360301683755e-06,
"loss": 0.0701,
"step": 296
},
{
"epoch": 0.1168141592920354,
"grad_norm": 1.6507627964019775,
"learning_rate": 4.995840361977416e-06,
"loss": 0.0783,
"step": 297
},
{
"epoch": 0.11720747295968535,
"grad_norm": 1.9679419994354248,
"learning_rate": 4.995639984185059e-06,
"loss": 0.0714,
"step": 298
},
{
"epoch": 0.1176007866273353,
"grad_norm": 1.7199714183807373,
"learning_rate": 4.9954348971694146e-06,
"loss": 0.046,
"step": 299
},
{
"epoch": 0.11799410029498525,
"grad_norm": 1.3099826574325562,
"learning_rate": 4.995225101317478e-06,
"loss": 0.0542,
"step": 300
},
{
"epoch": 0.1183874139626352,
"grad_norm": 1.4102526903152466,
"learning_rate": 4.99501059702513e-06,
"loss": 0.07,
"step": 301
},
{
"epoch": 0.11878072763028515,
"grad_norm": 2.6054928302764893,
"learning_rate": 4.9947913846971345e-06,
"loss": 0.0753,
"step": 302
},
{
"epoch": 0.1191740412979351,
"grad_norm": 2.4399526119232178,
"learning_rate": 4.994567464747141e-06,
"loss": 0.1051,
"step": 303
},
{
"epoch": 0.11956735496558506,
"grad_norm": 3.065548896789551,
"learning_rate": 4.994338837597683e-06,
"loss": 0.0955,
"step": 304
},
{
"epoch": 0.119960668633235,
"grad_norm": 1.3317792415618896,
"learning_rate": 4.994105503680176e-06,
"loss": 0.0595,
"step": 305
},
{
"epoch": 0.12035398230088495,
"grad_norm": 1.5237491130828857,
"learning_rate": 4.993867463434916e-06,
"loss": 0.0909,
"step": 306
},
{
"epoch": 0.12074729596853491,
"grad_norm": 0.8940740823745728,
"learning_rate": 4.9936247173110785e-06,
"loss": 0.0628,
"step": 307
},
{
"epoch": 0.12114060963618485,
"grad_norm": 2.6642251014709473,
"learning_rate": 4.993377265766723e-06,
"loss": 0.0679,
"step": 308
},
{
"epoch": 0.1215339233038348,
"grad_norm": 2.868943452835083,
"learning_rate": 4.993125109268784e-06,
"loss": 0.047,
"step": 309
},
{
"epoch": 0.12192723697148476,
"grad_norm": 1.1550475358963013,
"learning_rate": 4.992868248293077e-06,
"loss": 0.0771,
"step": 310
},
{
"epoch": 0.1223205506391347,
"grad_norm": 1.7380859851837158,
"learning_rate": 4.9926066833242926e-06,
"loss": 0.0573,
"step": 311
},
{
"epoch": 0.12271386430678466,
"grad_norm": 1.8886913061141968,
"learning_rate": 4.9923404148559995e-06,
"loss": 0.1034,
"step": 312
},
{
"epoch": 0.12310717797443461,
"grad_norm": 1.5682885646820068,
"learning_rate": 4.992069443390641e-06,
"loss": 0.0595,
"step": 313
},
{
"epoch": 0.12350049164208456,
"grad_norm": 2.2674522399902344,
"learning_rate": 4.991793769439534e-06,
"loss": 0.0855,
"step": 314
},
{
"epoch": 0.12389380530973451,
"grad_norm": 1.3800448179244995,
"learning_rate": 4.991513393522871e-06,
"loss": 0.0537,
"step": 315
},
{
"epoch": 0.12428711897738447,
"grad_norm": 1.9727108478546143,
"learning_rate": 4.991228316169715e-06,
"loss": 0.0698,
"step": 316
},
{
"epoch": 0.12468043264503441,
"grad_norm": 1.1997886896133423,
"learning_rate": 4.990938537918001e-06,
"loss": 0.0513,
"step": 317
},
{
"epoch": 0.12507374631268436,
"grad_norm": 1.0357115268707275,
"learning_rate": 4.990644059314536e-06,
"loss": 0.0537,
"step": 318
},
{
"epoch": 0.1254670599803343,
"grad_norm": 2.9861936569213867,
"learning_rate": 4.990344880914994e-06,
"loss": 0.0836,
"step": 319
},
{
"epoch": 0.12586037364798427,
"grad_norm": 1.0183316469192505,
"learning_rate": 4.990041003283921e-06,
"loss": 0.0595,
"step": 320
},
{
"epoch": 0.12625368731563422,
"grad_norm": 3.085170269012451,
"learning_rate": 4.989732426994725e-06,
"loss": 0.1097,
"step": 321
},
{
"epoch": 0.12664700098328416,
"grad_norm": 1.6864210367202759,
"learning_rate": 4.989419152629685e-06,
"loss": 0.0546,
"step": 322
},
{
"epoch": 0.12704031465093413,
"grad_norm": 1.678736686706543,
"learning_rate": 4.9891011807799435e-06,
"loss": 0.0436,
"step": 323
},
{
"epoch": 0.12743362831858407,
"grad_norm": 1.6153947114944458,
"learning_rate": 4.988778512045507e-06,
"loss": 0.0885,
"step": 324
},
{
"epoch": 0.127826941986234,
"grad_norm": 2.239644765853882,
"learning_rate": 4.9884511470352456e-06,
"loss": 0.0841,
"step": 325
},
{
"epoch": 0.12822025565388398,
"grad_norm": 2.258629560470581,
"learning_rate": 4.9881190863668895e-06,
"loss": 0.0547,
"step": 326
},
{
"epoch": 0.12861356932153392,
"grad_norm": 1.519643783569336,
"learning_rate": 4.98778233066703e-06,
"loss": 0.076,
"step": 327
},
{
"epoch": 0.12900688298918386,
"grad_norm": 2.382768154144287,
"learning_rate": 4.987440880571121e-06,
"loss": 0.0754,
"step": 328
},
{
"epoch": 0.12940019665683383,
"grad_norm": 1.1717922687530518,
"learning_rate": 4.98709473672347e-06,
"loss": 0.0431,
"step": 329
},
{
"epoch": 0.12979351032448377,
"grad_norm": 2.597674608230591,
"learning_rate": 4.986743899777244e-06,
"loss": 0.0831,
"step": 330
},
{
"epoch": 0.13018682399213372,
"grad_norm": 2.2018444538116455,
"learning_rate": 4.986388370394466e-06,
"loss": 0.0967,
"step": 331
},
{
"epoch": 0.13058013765978368,
"grad_norm": 2.4188756942749023,
"learning_rate": 4.986028149246013e-06,
"loss": 0.0706,
"step": 332
},
{
"epoch": 0.13097345132743363,
"grad_norm": 1.3178000450134277,
"learning_rate": 4.985663237011614e-06,
"loss": 0.0814,
"step": 333
},
{
"epoch": 0.13136676499508357,
"grad_norm": 1.007521390914917,
"learning_rate": 4.985293634379852e-06,
"loss": 0.0518,
"step": 334
},
{
"epoch": 0.13176007866273354,
"grad_norm": 2.3999087810516357,
"learning_rate": 4.984919342048159e-06,
"loss": 0.0526,
"step": 335
},
{
"epoch": 0.13215339233038348,
"grad_norm": 2.07135272026062,
"learning_rate": 4.984540360722819e-06,
"loss": 0.0493,
"step": 336
},
{
"epoch": 0.13254670599803342,
"grad_norm": 1.2785420417785645,
"learning_rate": 4.98415669111896e-06,
"loss": 0.0671,
"step": 337
},
{
"epoch": 0.1329400196656834,
"grad_norm": 1.264936089515686,
"learning_rate": 4.9837683339605615e-06,
"loss": 0.0619,
"step": 338
},
{
"epoch": 0.13333333333333333,
"grad_norm": 2.3385870456695557,
"learning_rate": 4.983375289980443e-06,
"loss": 0.1164,
"step": 339
},
{
"epoch": 0.13372664700098327,
"grad_norm": 2.5312047004699707,
"learning_rate": 4.982977559920273e-06,
"loss": 0.1017,
"step": 340
},
{
"epoch": 0.13411996066863324,
"grad_norm": 1.6104050874710083,
"learning_rate": 4.982575144530559e-06,
"loss": 0.0647,
"step": 341
},
{
"epoch": 0.13451327433628318,
"grad_norm": 1.557822346687317,
"learning_rate": 4.982168044570652e-06,
"loss": 0.0546,
"step": 342
},
{
"epoch": 0.13490658800393313,
"grad_norm": 1.430794596672058,
"learning_rate": 4.981756260808741e-06,
"loss": 0.0553,
"step": 343
},
{
"epoch": 0.1352999016715831,
"grad_norm": 1.718525767326355,
"learning_rate": 4.981339794021853e-06,
"loss": 0.0633,
"step": 344
},
{
"epoch": 0.13569321533923304,
"grad_norm": 0.9465076327323914,
"learning_rate": 4.9809186449958536e-06,
"loss": 0.0468,
"step": 345
},
{
"epoch": 0.13608652900688298,
"grad_norm": 1.7588387727737427,
"learning_rate": 4.980492814525442e-06,
"loss": 0.0687,
"step": 346
},
{
"epoch": 0.13647984267453295,
"grad_norm": 1.392269492149353,
"learning_rate": 4.980062303414152e-06,
"loss": 0.0363,
"step": 347
},
{
"epoch": 0.1368731563421829,
"grad_norm": 2.146742582321167,
"learning_rate": 4.97962711247435e-06,
"loss": 0.0604,
"step": 348
},
{
"epoch": 0.13726647000983283,
"grad_norm": 2.926267385482788,
"learning_rate": 4.979187242527233e-06,
"loss": 0.086,
"step": 349
},
{
"epoch": 0.1376597836774828,
"grad_norm": 1.9409819841384888,
"learning_rate": 4.978742694402825e-06,
"loss": 0.0588,
"step": 350
},
{
"epoch": 0.13805309734513274,
"grad_norm": 1.8433561325073242,
"learning_rate": 4.978293468939982e-06,
"loss": 0.0676,
"step": 351
},
{
"epoch": 0.13844641101278268,
"grad_norm": 2.0934383869171143,
"learning_rate": 4.977839566986382e-06,
"loss": 0.0713,
"step": 352
},
{
"epoch": 0.13883972468043265,
"grad_norm": 1.8030976057052612,
"learning_rate": 4.977380989398529e-06,
"loss": 0.1169,
"step": 353
},
{
"epoch": 0.1392330383480826,
"grad_norm": 2.014277935028076,
"learning_rate": 4.976917737041751e-06,
"loss": 0.0376,
"step": 354
},
{
"epoch": 0.13962635201573254,
"grad_norm": 1.3366997241973877,
"learning_rate": 4.976449810790196e-06,
"loss": 0.0644,
"step": 355
},
{
"epoch": 0.1400196656833825,
"grad_norm": 1.63720703125,
"learning_rate": 4.97597721152683e-06,
"loss": 0.067,
"step": 356
},
{
"epoch": 0.14041297935103245,
"grad_norm": 2.317793846130371,
"learning_rate": 4.975499940143439e-06,
"loss": 0.0732,
"step": 357
},
{
"epoch": 0.1408062930186824,
"grad_norm": 1.352824330329895,
"learning_rate": 4.975017997540625e-06,
"loss": 0.0721,
"step": 358
},
{
"epoch": 0.14119960668633236,
"grad_norm": 1.2860400676727295,
"learning_rate": 4.974531384627805e-06,
"loss": 0.0604,
"step": 359
},
{
"epoch": 0.1415929203539823,
"grad_norm": 2.315216064453125,
"learning_rate": 4.974040102323207e-06,
"loss": 0.0492,
"step": 360
},
{
"epoch": 0.14198623402163224,
"grad_norm": 1.771453857421875,
"learning_rate": 4.973544151553869e-06,
"loss": 0.0554,
"step": 361
},
{
"epoch": 0.1423795476892822,
"grad_norm": 0.9052230715751648,
"learning_rate": 4.973043533255645e-06,
"loss": 0.0524,
"step": 362
},
{
"epoch": 0.14277286135693215,
"grad_norm": 2.327606439590454,
"learning_rate": 4.972538248373188e-06,
"loss": 0.0583,
"step": 363
},
{
"epoch": 0.1431661750245821,
"grad_norm": 2.986643075942993,
"learning_rate": 4.9720282978599625e-06,
"loss": 0.0726,
"step": 364
},
{
"epoch": 0.14355948869223206,
"grad_norm": 1.1824491024017334,
"learning_rate": 4.971513682678234e-06,
"loss": 0.0749,
"step": 365
},
{
"epoch": 0.143952802359882,
"grad_norm": 3.0968868732452393,
"learning_rate": 4.970994403799072e-06,
"loss": 0.0547,
"step": 366
},
{
"epoch": 0.14434611602753195,
"grad_norm": 1.2194032669067383,
"learning_rate": 4.970470462202343e-06,
"loss": 0.0651,
"step": 367
},
{
"epoch": 0.14473942969518191,
"grad_norm": 1.3438714742660522,
"learning_rate": 4.969941858876719e-06,
"loss": 0.0416,
"step": 368
},
{
"epoch": 0.14513274336283186,
"grad_norm": 1.4193546772003174,
"learning_rate": 4.96940859481966e-06,
"loss": 0.06,
"step": 369
},
{
"epoch": 0.1455260570304818,
"grad_norm": 1.2842000722885132,
"learning_rate": 4.968870671037427e-06,
"loss": 0.0598,
"step": 370
},
{
"epoch": 0.14591937069813177,
"grad_norm": 2.3905892372131348,
"learning_rate": 4.96832808854507e-06,
"loss": 0.0652,
"step": 371
},
{
"epoch": 0.1463126843657817,
"grad_norm": 1.5380994081497192,
"learning_rate": 4.967780848366432e-06,
"loss": 0.1034,
"step": 372
},
{
"epoch": 0.14670599803343165,
"grad_norm": 1.3698018789291382,
"learning_rate": 4.967228951534144e-06,
"loss": 0.0695,
"step": 373
},
{
"epoch": 0.14709931170108162,
"grad_norm": 1.6553199291229248,
"learning_rate": 4.966672399089626e-06,
"loss": 0.0358,
"step": 374
},
{
"epoch": 0.14749262536873156,
"grad_norm": 1.966484546661377,
"learning_rate": 4.966111192083081e-06,
"loss": 0.0396,
"step": 375
},
{
"epoch": 0.1478859390363815,
"grad_norm": 1.1057041883468628,
"learning_rate": 4.965545331573493e-06,
"loss": 0.0294,
"step": 376
},
{
"epoch": 0.14827925270403147,
"grad_norm": 1.3603320121765137,
"learning_rate": 4.964974818628633e-06,
"loss": 0.0431,
"step": 377
},
{
"epoch": 0.1486725663716814,
"grad_norm": 3.8050637245178223,
"learning_rate": 4.964399654325045e-06,
"loss": 0.063,
"step": 378
},
{
"epoch": 0.14906588003933136,
"grad_norm": 1.361873984336853,
"learning_rate": 4.963819839748055e-06,
"loss": 0.0258,
"step": 379
},
{
"epoch": 0.14945919370698132,
"grad_norm": 1.0739333629608154,
"learning_rate": 4.96323537599176e-06,
"loss": 0.0553,
"step": 380
},
{
"epoch": 0.14985250737463127,
"grad_norm": 1.5606439113616943,
"learning_rate": 4.962646264159031e-06,
"loss": 0.0341,
"step": 381
},
{
"epoch": 0.1502458210422812,
"grad_norm": 1.526953101158142,
"learning_rate": 4.962052505361512e-06,
"loss": 0.0693,
"step": 382
},
{
"epoch": 0.15063913470993118,
"grad_norm": 3.761380195617676,
"learning_rate": 4.9614541007196136e-06,
"loss": 0.0685,
"step": 383
},
{
"epoch": 0.15103244837758112,
"grad_norm": 2.7432498931884766,
"learning_rate": 4.960851051362514e-06,
"loss": 0.0501,
"step": 384
},
{
"epoch": 0.15142576204523106,
"grad_norm": 2.669240951538086,
"learning_rate": 4.960243358428154e-06,
"loss": 0.1198,
"step": 385
},
{
"epoch": 0.15181907571288103,
"grad_norm": 1.5905970335006714,
"learning_rate": 4.959631023063238e-06,
"loss": 0.0803,
"step": 386
},
{
"epoch": 0.15221238938053097,
"grad_norm": 1.1858878135681152,
"learning_rate": 4.959014046423233e-06,
"loss": 0.0654,
"step": 387
},
{
"epoch": 0.1526057030481809,
"grad_norm": 1.7795485258102417,
"learning_rate": 4.9583924296723606e-06,
"loss": 0.0598,
"step": 388
},
{
"epoch": 0.15299901671583088,
"grad_norm": 1.2830811738967896,
"learning_rate": 4.957766173983598e-06,
"loss": 0.0437,
"step": 389
},
{
"epoch": 0.15339233038348082,
"grad_norm": 0.8960599303245544,
"learning_rate": 4.9571352805386795e-06,
"loss": 0.0455,
"step": 390
},
{
"epoch": 0.15378564405113077,
"grad_norm": 2.005126714706421,
"learning_rate": 4.956499750528086e-06,
"loss": 0.0755,
"step": 391
},
{
"epoch": 0.15417895771878073,
"grad_norm": 1.5545151233673096,
"learning_rate": 4.955859585151054e-06,
"loss": 0.0449,
"step": 392
},
{
"epoch": 0.15457227138643068,
"grad_norm": 1.0876412391662598,
"learning_rate": 4.955214785615558e-06,
"loss": 0.0718,
"step": 393
},
{
"epoch": 0.15496558505408062,
"grad_norm": 1.9705466032028198,
"learning_rate": 4.9545653531383255e-06,
"loss": 0.0612,
"step": 394
},
{
"epoch": 0.1553588987217306,
"grad_norm": 1.3790346384048462,
"learning_rate": 4.953911288944821e-06,
"loss": 0.0371,
"step": 395
},
{
"epoch": 0.15575221238938053,
"grad_norm": 1.0736052989959717,
"learning_rate": 4.953252594269252e-06,
"loss": 0.056,
"step": 396
},
{
"epoch": 0.15614552605703047,
"grad_norm": 1.919756531715393,
"learning_rate": 4.9525892703545604e-06,
"loss": 0.0737,
"step": 397
},
{
"epoch": 0.15653883972468044,
"grad_norm": 1.333601713180542,
"learning_rate": 4.951921318452428e-06,
"loss": 0.0628,
"step": 398
},
{
"epoch": 0.15693215339233038,
"grad_norm": 1.5093313455581665,
"learning_rate": 4.951248739823264e-06,
"loss": 0.0677,
"step": 399
},
{
"epoch": 0.15732546705998032,
"grad_norm": 1.5697554349899292,
"learning_rate": 4.950571535736214e-06,
"loss": 0.0672,
"step": 400
},
{
"epoch": 0.1577187807276303,
"grad_norm": 1.4692028760910034,
"learning_rate": 4.949889707469145e-06,
"loss": 0.0472,
"step": 401
},
{
"epoch": 0.15811209439528023,
"grad_norm": 0.9199762940406799,
"learning_rate": 4.949203256308658e-06,
"loss": 0.0661,
"step": 402
},
{
"epoch": 0.15850540806293018,
"grad_norm": 1.4585742950439453,
"learning_rate": 4.948512183550068e-06,
"loss": 0.0776,
"step": 403
},
{
"epoch": 0.15889872173058014,
"grad_norm": 1.2560405731201172,
"learning_rate": 4.947816490497419e-06,
"loss": 0.0932,
"step": 404
},
{
"epoch": 0.1592920353982301,
"grad_norm": 1.6395833492279053,
"learning_rate": 4.947116178463469e-06,
"loss": 0.0399,
"step": 405
},
{
"epoch": 0.15968534906588003,
"grad_norm": 0.8655360341072083,
"learning_rate": 4.946411248769693e-06,
"loss": 0.0421,
"step": 406
},
{
"epoch": 0.16007866273353,
"grad_norm": 0.9741353392601013,
"learning_rate": 4.945701702746279e-06,
"loss": 0.0469,
"step": 407
},
{
"epoch": 0.16047197640117994,
"grad_norm": 0.9401141405105591,
"learning_rate": 4.944987541732126e-06,
"loss": 0.0668,
"step": 408
},
{
"epoch": 0.16086529006882988,
"grad_norm": 0.8718335032463074,
"learning_rate": 4.944268767074842e-06,
"loss": 0.0597,
"step": 409
},
{
"epoch": 0.16125860373647985,
"grad_norm": 1.3456203937530518,
"learning_rate": 4.943545380130742e-06,
"loss": 0.0755,
"step": 410
},
{
"epoch": 0.1616519174041298,
"grad_norm": 1.1579302549362183,
"learning_rate": 4.942817382264842e-06,
"loss": 0.0583,
"step": 411
},
{
"epoch": 0.16204523107177973,
"grad_norm": 1.664872169494629,
"learning_rate": 4.942084774850858e-06,
"loss": 0.0777,
"step": 412
},
{
"epoch": 0.1624385447394297,
"grad_norm": 2.256772518157959,
"learning_rate": 4.941347559271208e-06,
"loss": 0.0734,
"step": 413
},
{
"epoch": 0.16283185840707964,
"grad_norm": 1.235349416732788,
"learning_rate": 4.9406057369170015e-06,
"loss": 0.051,
"step": 414
},
{
"epoch": 0.16322517207472959,
"grad_norm": 1.6716983318328857,
"learning_rate": 4.939859309188044e-06,
"loss": 0.0728,
"step": 415
},
{
"epoch": 0.16361848574237955,
"grad_norm": 1.3591656684875488,
"learning_rate": 4.939108277492829e-06,
"loss": 0.0725,
"step": 416
},
{
"epoch": 0.1640117994100295,
"grad_norm": 0.6709238886833191,
"learning_rate": 4.9383526432485375e-06,
"loss": 0.0452,
"step": 417
},
{
"epoch": 0.16440511307767944,
"grad_norm": 1.2356040477752686,
"learning_rate": 4.937592407881039e-06,
"loss": 0.0682,
"step": 418
},
{
"epoch": 0.1647984267453294,
"grad_norm": 1.0750470161437988,
"learning_rate": 4.93682757282488e-06,
"loss": 0.0383,
"step": 419
},
{
"epoch": 0.16519174041297935,
"grad_norm": 1.5483283996582031,
"learning_rate": 4.936058139523291e-06,
"loss": 0.0645,
"step": 420
},
{
"epoch": 0.1655850540806293,
"grad_norm": 2.0328383445739746,
"learning_rate": 4.935284109428177e-06,
"loss": 0.0623,
"step": 421
},
{
"epoch": 0.16597836774827926,
"grad_norm": 1.5979444980621338,
"learning_rate": 4.934505484000116e-06,
"loss": 0.0751,
"step": 422
},
{
"epoch": 0.1663716814159292,
"grad_norm": 1.1430745124816895,
"learning_rate": 4.93372226470836e-06,
"loss": 0.0542,
"step": 423
},
{
"epoch": 0.16676499508357914,
"grad_norm": 2.062899112701416,
"learning_rate": 4.932934453030829e-06,
"loss": 0.0873,
"step": 424
},
{
"epoch": 0.1671583087512291,
"grad_norm": 3.2697086334228516,
"learning_rate": 4.932142050454107e-06,
"loss": 0.0733,
"step": 425
},
{
"epoch": 0.16755162241887905,
"grad_norm": 1.2826026678085327,
"learning_rate": 4.931345058473443e-06,
"loss": 0.0497,
"step": 426
},
{
"epoch": 0.167944936086529,
"grad_norm": 2.3819937705993652,
"learning_rate": 4.930543478592743e-06,
"loss": 0.0789,
"step": 427
},
{
"epoch": 0.16833824975417896,
"grad_norm": 2.840121030807495,
"learning_rate": 4.929737312324574e-06,
"loss": 0.054,
"step": 428
},
{
"epoch": 0.1687315634218289,
"grad_norm": 0.6918103098869324,
"learning_rate": 4.928926561190155e-06,
"loss": 0.0448,
"step": 429
},
{
"epoch": 0.16912487708947885,
"grad_norm": 0.8336203694343567,
"learning_rate": 4.928111226719359e-06,
"loss": 0.0629,
"step": 430
},
{
"epoch": 0.16951819075712882,
"grad_norm": 1.9415661096572876,
"learning_rate": 4.927291310450705e-06,
"loss": 0.0731,
"step": 431
},
{
"epoch": 0.16991150442477876,
"grad_norm": 1.3499138355255127,
"learning_rate": 4.926466813931358e-06,
"loss": 0.0562,
"step": 432
},
{
"epoch": 0.1703048180924287,
"grad_norm": 1.0689488649368286,
"learning_rate": 4.925637738717127e-06,
"loss": 0.0706,
"step": 433
},
{
"epoch": 0.17069813176007867,
"grad_norm": 2.7924535274505615,
"learning_rate": 4.924804086372462e-06,
"loss": 0.0671,
"step": 434
},
{
"epoch": 0.1710914454277286,
"grad_norm": 0.8586186170578003,
"learning_rate": 4.9239658584704466e-06,
"loss": 0.049,
"step": 435
},
{
"epoch": 0.17148475909537855,
"grad_norm": 1.8235011100769043,
"learning_rate": 4.923123056592801e-06,
"loss": 0.0715,
"step": 436
},
{
"epoch": 0.17187807276302852,
"grad_norm": 1.1591852903366089,
"learning_rate": 4.922275682329876e-06,
"loss": 0.0799,
"step": 437
},
{
"epoch": 0.17227138643067846,
"grad_norm": 1.2786961793899536,
"learning_rate": 4.921423737280649e-06,
"loss": 0.0561,
"step": 438
},
{
"epoch": 0.1726647000983284,
"grad_norm": 1.602005958557129,
"learning_rate": 4.9205672230527254e-06,
"loss": 0.0517,
"step": 439
},
{
"epoch": 0.17305801376597837,
"grad_norm": 1.3069565296173096,
"learning_rate": 4.919706141262329e-06,
"loss": 0.063,
"step": 440
},
{
"epoch": 0.17345132743362832,
"grad_norm": 1.4721592664718628,
"learning_rate": 4.918840493534305e-06,
"loss": 0.0789,
"step": 441
},
{
"epoch": 0.17384464110127826,
"grad_norm": 2.0551934242248535,
"learning_rate": 4.917970281502112e-06,
"loss": 0.0711,
"step": 442
},
{
"epoch": 0.17423795476892823,
"grad_norm": 1.175560474395752,
"learning_rate": 4.917095506807824e-06,
"loss": 0.0646,
"step": 443
},
{
"epoch": 0.17463126843657817,
"grad_norm": 1.3429381847381592,
"learning_rate": 4.916216171102124e-06,
"loss": 0.0609,
"step": 444
},
{
"epoch": 0.1750245821042281,
"grad_norm": 1.306825041770935,
"learning_rate": 4.9153322760443015e-06,
"loss": 0.0529,
"step": 445
},
{
"epoch": 0.17541789577187808,
"grad_norm": 1.4618321657180786,
"learning_rate": 4.914443823302246e-06,
"loss": 0.0509,
"step": 446
},
{
"epoch": 0.17581120943952802,
"grad_norm": 1.054541826248169,
"learning_rate": 4.913550814552454e-06,
"loss": 0.0613,
"step": 447
},
{
"epoch": 0.17620452310717796,
"grad_norm": 0.9349273443222046,
"learning_rate": 4.912653251480013e-06,
"loss": 0.0531,
"step": 448
},
{
"epoch": 0.17659783677482793,
"grad_norm": 1.302675724029541,
"learning_rate": 4.9117511357786075e-06,
"loss": 0.0661,
"step": 449
},
{
"epoch": 0.17699115044247787,
"grad_norm": 2.327521562576294,
"learning_rate": 4.910844469150512e-06,
"loss": 0.08,
"step": 450
},
{
"epoch": 0.17738446411012782,
"grad_norm": 1.7499988079071045,
"learning_rate": 4.909933253306588e-06,
"loss": 0.0368,
"step": 451
},
{
"epoch": 0.17777777777777778,
"grad_norm": 1.1263257265090942,
"learning_rate": 4.909017489966283e-06,
"loss": 0.0322,
"step": 452
},
{
"epoch": 0.17817109144542773,
"grad_norm": 2.8002772331237793,
"learning_rate": 4.9080971808576226e-06,
"loss": 0.0597,
"step": 453
},
{
"epoch": 0.17856440511307767,
"grad_norm": 2.0555684566497803,
"learning_rate": 4.907172327717214e-06,
"loss": 0.0754,
"step": 454
},
{
"epoch": 0.17895771878072764,
"grad_norm": 2.3041601181030273,
"learning_rate": 4.906242932290234e-06,
"loss": 0.0838,
"step": 455
},
{
"epoch": 0.17935103244837758,
"grad_norm": 2.3882484436035156,
"learning_rate": 4.905308996330437e-06,
"loss": 0.063,
"step": 456
},
{
"epoch": 0.17974434611602752,
"grad_norm": 1.4339286088943481,
"learning_rate": 4.904370521600138e-06,
"loss": 0.0723,
"step": 457
},
{
"epoch": 0.1801376597836775,
"grad_norm": 1.387052059173584,
"learning_rate": 4.903427509870222e-06,
"loss": 0.0708,
"step": 458
},
{
"epoch": 0.18053097345132743,
"grad_norm": 0.8694115877151489,
"learning_rate": 4.902479962920134e-06,
"loss": 0.0519,
"step": 459
},
{
"epoch": 0.18092428711897737,
"grad_norm": 1.0308964252471924,
"learning_rate": 4.901527882537876e-06,
"loss": 0.054,
"step": 460
},
{
"epoch": 0.18131760078662734,
"grad_norm": 2.4914846420288086,
"learning_rate": 4.900571270520004e-06,
"loss": 0.115,
"step": 461
},
{
"epoch": 0.18171091445427728,
"grad_norm": 2.637059450149536,
"learning_rate": 4.899610128671626e-06,
"loss": 0.0851,
"step": 462
},
{
"epoch": 0.18210422812192723,
"grad_norm": 1.9722718000411987,
"learning_rate": 4.898644458806398e-06,
"loss": 0.0637,
"step": 463
},
{
"epoch": 0.1824975417895772,
"grad_norm": 0.9795344471931458,
"learning_rate": 4.897674262746522e-06,
"loss": 0.0622,
"step": 464
},
{
"epoch": 0.18289085545722714,
"grad_norm": 1.2904670238494873,
"learning_rate": 4.896699542322736e-06,
"loss": 0.0384,
"step": 465
},
{
"epoch": 0.18328416912487708,
"grad_norm": 1.4417036771774292,
"learning_rate": 4.895720299374319e-06,
"loss": 0.1118,
"step": 466
},
{
"epoch": 0.18367748279252705,
"grad_norm": 1.6243058443069458,
"learning_rate": 4.894736535749083e-06,
"loss": 0.0756,
"step": 467
},
{
"epoch": 0.184070796460177,
"grad_norm": 1.0999799966812134,
"learning_rate": 4.89374825330337e-06,
"loss": 0.0525,
"step": 468
},
{
"epoch": 0.18446411012782693,
"grad_norm": 1.9067320823669434,
"learning_rate": 4.892755453902051e-06,
"loss": 0.066,
"step": 469
},
{
"epoch": 0.1848574237954769,
"grad_norm": 1.1623554229736328,
"learning_rate": 4.8917581394185175e-06,
"loss": 0.0547,
"step": 470
},
{
"epoch": 0.18525073746312684,
"grad_norm": 1.2230125665664673,
"learning_rate": 4.890756311734683e-06,
"loss": 0.0753,
"step": 471
},
{
"epoch": 0.18564405113077678,
"grad_norm": 1.376905083656311,
"learning_rate": 4.8897499727409755e-06,
"loss": 0.0637,
"step": 472
},
{
"epoch": 0.18603736479842675,
"grad_norm": 2.381087064743042,
"learning_rate": 4.888739124336338e-06,
"loss": 0.0818,
"step": 473
},
{
"epoch": 0.1864306784660767,
"grad_norm": 1.5327961444854736,
"learning_rate": 4.8877237684282205e-06,
"loss": 0.0689,
"step": 474
},
{
"epoch": 0.18682399213372664,
"grad_norm": 1.7480573654174805,
"learning_rate": 4.8867039069325804e-06,
"loss": 0.0713,
"step": 475
},
{
"epoch": 0.1872173058013766,
"grad_norm": 1.2657626867294312,
"learning_rate": 4.8856795417738754e-06,
"loss": 0.0742,
"step": 476
},
{
"epoch": 0.18761061946902655,
"grad_norm": 1.0295419692993164,
"learning_rate": 4.884650674885062e-06,
"loss": 0.0448,
"step": 477
},
{
"epoch": 0.1880039331366765,
"grad_norm": 1.9904601573944092,
"learning_rate": 4.883617308207592e-06,
"loss": 0.0801,
"step": 478
},
{
"epoch": 0.18839724680432646,
"grad_norm": 1.4027286767959595,
"learning_rate": 4.88257944369141e-06,
"loss": 0.0502,
"step": 479
},
{
"epoch": 0.1887905604719764,
"grad_norm": 2.087235689163208,
"learning_rate": 4.8815370832949425e-06,
"loss": 0.1021,
"step": 480
},
{
"epoch": 0.18918387413962634,
"grad_norm": 0.8643338680267334,
"learning_rate": 4.880490228985104e-06,
"loss": 0.0732,
"step": 481
},
{
"epoch": 0.1895771878072763,
"grad_norm": 1.4668515920639038,
"learning_rate": 4.8794388827372884e-06,
"loss": 0.0548,
"step": 482
},
{
"epoch": 0.18997050147492625,
"grad_norm": 1.8225198984146118,
"learning_rate": 4.878383046535366e-06,
"loss": 0.0882,
"step": 483
},
{
"epoch": 0.1903638151425762,
"grad_norm": 1.6394109725952148,
"learning_rate": 4.877322722371677e-06,
"loss": 0.1029,
"step": 484
},
{
"epoch": 0.19075712881022616,
"grad_norm": 0.9612401723861694,
"learning_rate": 4.876257912247033e-06,
"loss": 0.0442,
"step": 485
},
{
"epoch": 0.1911504424778761,
"grad_norm": 2.0715410709381104,
"learning_rate": 4.8751886181707105e-06,
"loss": 0.0793,
"step": 486
},
{
"epoch": 0.19154375614552605,
"grad_norm": 1.14213228225708,
"learning_rate": 4.874114842160445e-06,
"loss": 0.0782,
"step": 487
},
{
"epoch": 0.19193706981317601,
"grad_norm": 1.7314140796661377,
"learning_rate": 4.873036586242431e-06,
"loss": 0.0478,
"step": 488
},
{
"epoch": 0.19233038348082596,
"grad_norm": 0.6948450803756714,
"learning_rate": 4.871953852451316e-06,
"loss": 0.0546,
"step": 489
},
{
"epoch": 0.1927236971484759,
"grad_norm": 1.9421541690826416,
"learning_rate": 4.8708666428301975e-06,
"loss": 0.0793,
"step": 490
},
{
"epoch": 0.19311701081612587,
"grad_norm": 0.5670569539070129,
"learning_rate": 4.869774959430619e-06,
"loss": 0.0506,
"step": 491
},
{
"epoch": 0.1935103244837758,
"grad_norm": 1.437902808189392,
"learning_rate": 4.868678804312565e-06,
"loss": 0.0545,
"step": 492
},
{
"epoch": 0.19390363815142575,
"grad_norm": 1.8984867334365845,
"learning_rate": 4.867578179544457e-06,
"loss": 0.0658,
"step": 493
},
{
"epoch": 0.19429695181907572,
"grad_norm": 2.0684666633605957,
"learning_rate": 4.866473087203154e-06,
"loss": 0.0565,
"step": 494
},
{
"epoch": 0.19469026548672566,
"grad_norm": 1.5473408699035645,
"learning_rate": 4.865363529373944e-06,
"loss": 0.0481,
"step": 495
},
{
"epoch": 0.1950835791543756,
"grad_norm": 1.678281545639038,
"learning_rate": 4.864249508150539e-06,
"loss": 0.056,
"step": 496
},
{
"epoch": 0.19547689282202557,
"grad_norm": 1.3713724613189697,
"learning_rate": 4.863131025635076e-06,
"loss": 0.0474,
"step": 497
},
{
"epoch": 0.1958702064896755,
"grad_norm": 2.0483641624450684,
"learning_rate": 4.862008083938109e-06,
"loss": 0.0712,
"step": 498
},
{
"epoch": 0.19626352015732546,
"grad_norm": 1.701915979385376,
"learning_rate": 4.8608806851786075e-06,
"loss": 0.0642,
"step": 499
},
{
"epoch": 0.19665683382497542,
"grad_norm": 1.4159979820251465,
"learning_rate": 4.859748831483949e-06,
"loss": 0.0706,
"step": 500
},
{
"epoch": 0.19705014749262537,
"grad_norm": 0.9921556711196899,
"learning_rate": 4.858612524989921e-06,
"loss": 0.0311,
"step": 501
},
{
"epoch": 0.1974434611602753,
"grad_norm": 0.6453993320465088,
"learning_rate": 4.857471767840709e-06,
"loss": 0.0304,
"step": 502
},
{
"epoch": 0.19783677482792528,
"grad_norm": 2.1691184043884277,
"learning_rate": 4.856326562188902e-06,
"loss": 0.0573,
"step": 503
},
{
"epoch": 0.19823008849557522,
"grad_norm": 1.424170732498169,
"learning_rate": 4.855176910195479e-06,
"loss": 0.0371,
"step": 504
},
{
"epoch": 0.19862340216322516,
"grad_norm": 2.0996835231781006,
"learning_rate": 4.854022814029809e-06,
"loss": 0.06,
"step": 505
},
{
"epoch": 0.19901671583087513,
"grad_norm": 2.2325479984283447,
"learning_rate": 4.852864275869652e-06,
"loss": 0.0686,
"step": 506
},
{
"epoch": 0.19941002949852507,
"grad_norm": 1.8133199214935303,
"learning_rate": 4.851701297901144e-06,
"loss": 0.0811,
"step": 507
},
{
"epoch": 0.199803343166175,
"grad_norm": 1.4886740446090698,
"learning_rate": 4.850533882318803e-06,
"loss": 0.0516,
"step": 508
},
{
"epoch": 0.20019665683382498,
"grad_norm": 1.685327172279358,
"learning_rate": 4.849362031325518e-06,
"loss": 0.0427,
"step": 509
},
{
"epoch": 0.20058997050147492,
"grad_norm": 2.726207733154297,
"learning_rate": 4.8481857471325485e-06,
"loss": 0.0686,
"step": 510
},
{
"epoch": 0.20098328416912487,
"grad_norm": 1.1494991779327393,
"learning_rate": 4.847005031959521e-06,
"loss": 0.0642,
"step": 511
},
{
"epoch": 0.20137659783677483,
"grad_norm": 2.118980884552002,
"learning_rate": 4.84581988803442e-06,
"loss": 0.0504,
"step": 512
},
{
"epoch": 0.20176991150442478,
"grad_norm": 1.4535127878189087,
"learning_rate": 4.84463031759359e-06,
"loss": 0.0482,
"step": 513
},
{
"epoch": 0.20216322517207472,
"grad_norm": 0.8411951065063477,
"learning_rate": 4.843436322881725e-06,
"loss": 0.0491,
"step": 514
},
{
"epoch": 0.2025565388397247,
"grad_norm": 0.9351110458374023,
"learning_rate": 4.8422379061518705e-06,
"loss": 0.0278,
"step": 515
},
{
"epoch": 0.20294985250737463,
"grad_norm": 1.2653199434280396,
"learning_rate": 4.841035069665416e-06,
"loss": 0.0494,
"step": 516
},
{
"epoch": 0.20334316617502457,
"grad_norm": 2.1194064617156982,
"learning_rate": 4.83982781569209e-06,
"loss": 0.0985,
"step": 517
},
{
"epoch": 0.20373647984267454,
"grad_norm": 0.9621169567108154,
"learning_rate": 4.838616146509956e-06,
"loss": 0.0681,
"step": 518
},
{
"epoch": 0.20412979351032448,
"grad_norm": 2.935671091079712,
"learning_rate": 4.83740006440541e-06,
"loss": 0.1056,
"step": 519
},
{
"epoch": 0.20452310717797442,
"grad_norm": 1.5503019094467163,
"learning_rate": 4.8361795716731744e-06,
"loss": 0.0736,
"step": 520
},
{
"epoch": 0.2049164208456244,
"grad_norm": 1.5426656007766724,
"learning_rate": 4.8349546706162965e-06,
"loss": 0.0768,
"step": 521
},
{
"epoch": 0.20530973451327433,
"grad_norm": 1.788036823272705,
"learning_rate": 4.833725363546139e-06,
"loss": 0.0785,
"step": 522
},
{
"epoch": 0.20570304818092428,
"grad_norm": 1.3642781972885132,
"learning_rate": 4.8324916527823795e-06,
"loss": 0.0582,
"step": 523
},
{
"epoch": 0.20609636184857424,
"grad_norm": 2.6498544216156006,
"learning_rate": 4.831253540653007e-06,
"loss": 0.068,
"step": 524
},
{
"epoch": 0.20648967551622419,
"grad_norm": 1.3358078002929688,
"learning_rate": 4.8300110294943145e-06,
"loss": 0.0689,
"step": 525
},
{
"epoch": 0.20688298918387413,
"grad_norm": 2.4475595951080322,
"learning_rate": 4.828764121650896e-06,
"loss": 0.0685,
"step": 526
},
{
"epoch": 0.2072763028515241,
"grad_norm": 1.8231087923049927,
"learning_rate": 4.827512819475641e-06,
"loss": 0.061,
"step": 527
},
{
"epoch": 0.20766961651917404,
"grad_norm": 1.6098417043685913,
"learning_rate": 4.826257125329733e-06,
"loss": 0.0775,
"step": 528
},
{
"epoch": 0.20806293018682398,
"grad_norm": 1.2955044507980347,
"learning_rate": 4.824997041582641e-06,
"loss": 0.0828,
"step": 529
},
{
"epoch": 0.20845624385447395,
"grad_norm": 1.600419282913208,
"learning_rate": 4.82373257061212e-06,
"loss": 0.0868,
"step": 530
},
{
"epoch": 0.2088495575221239,
"grad_norm": 1.2169928550720215,
"learning_rate": 4.8224637148042e-06,
"loss": 0.0543,
"step": 531
},
{
"epoch": 0.20924287118977383,
"grad_norm": 1.6863512992858887,
"learning_rate": 4.821190476553186e-06,
"loss": 0.0703,
"step": 532
},
{
"epoch": 0.2096361848574238,
"grad_norm": 1.9771099090576172,
"learning_rate": 4.819912858261656e-06,
"loss": 0.0799,
"step": 533
},
{
"epoch": 0.21002949852507374,
"grad_norm": 1.276354432106018,
"learning_rate": 4.818630862340449e-06,
"loss": 0.0661,
"step": 534
},
{
"epoch": 0.21042281219272368,
"grad_norm": 1.1068519353866577,
"learning_rate": 4.817344491208665e-06,
"loss": 0.0496,
"step": 535
},
{
"epoch": 0.21081612586037365,
"grad_norm": 1.1699997186660767,
"learning_rate": 4.816053747293663e-06,
"loss": 0.0395,
"step": 536
},
{
"epoch": 0.2112094395280236,
"grad_norm": 1.290640115737915,
"learning_rate": 4.814758633031049e-06,
"loss": 0.0526,
"step": 537
},
{
"epoch": 0.21160275319567354,
"grad_norm": 1.8085367679595947,
"learning_rate": 4.813459150864681e-06,
"loss": 0.0593,
"step": 538
},
{
"epoch": 0.2119960668633235,
"grad_norm": 1.6277810335159302,
"learning_rate": 4.812155303246653e-06,
"loss": 0.0645,
"step": 539
},
{
"epoch": 0.21238938053097345,
"grad_norm": 0.9544056057929993,
"learning_rate": 4.810847092637301e-06,
"loss": 0.063,
"step": 540
},
{
"epoch": 0.2127826941986234,
"grad_norm": 1.349601149559021,
"learning_rate": 4.809534521505192e-06,
"loss": 0.0877,
"step": 541
},
{
"epoch": 0.21317600786627336,
"grad_norm": 1.6013360023498535,
"learning_rate": 4.8082175923271235e-06,
"loss": 0.0637,
"step": 542
},
{
"epoch": 0.2135693215339233,
"grad_norm": 1.130764365196228,
"learning_rate": 4.806896307588113e-06,
"loss": 0.086,
"step": 543
},
{
"epoch": 0.21396263520157324,
"grad_norm": 1.40028715133667,
"learning_rate": 4.805570669781399e-06,
"loss": 0.0876,
"step": 544
},
{
"epoch": 0.2143559488692232,
"grad_norm": 1.7551463842391968,
"learning_rate": 4.804240681408434e-06,
"loss": 0.0593,
"step": 545
},
{
"epoch": 0.21474926253687315,
"grad_norm": 1.648735523223877,
"learning_rate": 4.802906344978881e-06,
"loss": 0.0772,
"step": 546
},
{
"epoch": 0.2151425762045231,
"grad_norm": 0.8385063409805298,
"learning_rate": 4.801567663010605e-06,
"loss": 0.0706,
"step": 547
},
{
"epoch": 0.21553588987217306,
"grad_norm": 1.8120150566101074,
"learning_rate": 4.800224638029672e-06,
"loss": 0.0696,
"step": 548
},
{
"epoch": 0.215929203539823,
"grad_norm": 0.5346795916557312,
"learning_rate": 4.798877272570343e-06,
"loss": 0.0494,
"step": 549
},
{
"epoch": 0.21632251720747295,
"grad_norm": 1.4182865619659424,
"learning_rate": 4.797525569175073e-06,
"loss": 0.0711,
"step": 550
},
{
"epoch": 0.21671583087512292,
"grad_norm": 0.9838932752609253,
"learning_rate": 4.796169530394498e-06,
"loss": 0.0843,
"step": 551
},
{
"epoch": 0.21710914454277286,
"grad_norm": 1.5188270807266235,
"learning_rate": 4.7948091587874355e-06,
"loss": 0.0663,
"step": 552
},
{
"epoch": 0.2175024582104228,
"grad_norm": 1.796202540397644,
"learning_rate": 4.793444456920881e-06,
"loss": 0.0655,
"step": 553
},
{
"epoch": 0.21789577187807277,
"grad_norm": 1.4925826787948608,
"learning_rate": 4.7920754273699985e-06,
"loss": 0.0607,
"step": 554
},
{
"epoch": 0.2182890855457227,
"grad_norm": 1.2840732336044312,
"learning_rate": 4.790702072718121e-06,
"loss": 0.0634,
"step": 555
},
{
"epoch": 0.21868239921337265,
"grad_norm": 1.0566197633743286,
"learning_rate": 4.789324395556741e-06,
"loss": 0.0475,
"step": 556
},
{
"epoch": 0.21907571288102262,
"grad_norm": 1.2299338579177856,
"learning_rate": 4.7879423984855085e-06,
"loss": 0.054,
"step": 557
},
{
"epoch": 0.21946902654867256,
"grad_norm": 1.7808493375778198,
"learning_rate": 4.786556084112224e-06,
"loss": 0.0905,
"step": 558
},
{
"epoch": 0.2198623402163225,
"grad_norm": 1.054694652557373,
"learning_rate": 4.785165455052836e-06,
"loss": 0.0561,
"step": 559
},
{
"epoch": 0.22025565388397247,
"grad_norm": 2.180976629257202,
"learning_rate": 4.783770513931433e-06,
"loss": 0.0705,
"step": 560
},
{
"epoch": 0.22064896755162242,
"grad_norm": 0.9467242956161499,
"learning_rate": 4.782371263380242e-06,
"loss": 0.0471,
"step": 561
},
{
"epoch": 0.22104228121927236,
"grad_norm": 1.0072274208068848,
"learning_rate": 4.780967706039622e-06,
"loss": 0.0642,
"step": 562
},
{
"epoch": 0.22143559488692233,
"grad_norm": 0.9987531304359436,
"learning_rate": 4.779559844558056e-06,
"loss": 0.0556,
"step": 563
},
{
"epoch": 0.22182890855457227,
"grad_norm": 1.5135668516159058,
"learning_rate": 4.778147681592152e-06,
"loss": 0.051,
"step": 564
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.6369942426681519,
"learning_rate": 4.776731219806634e-06,
"loss": 0.1089,
"step": 565
},
{
"epoch": 0.22261553588987218,
"grad_norm": 1.8307068347930908,
"learning_rate": 4.775310461874337e-06,
"loss": 0.0555,
"step": 566
},
{
"epoch": 0.22300884955752212,
"grad_norm": 1.2417643070220947,
"learning_rate": 4.773885410476202e-06,
"loss": 0.0356,
"step": 567
},
{
"epoch": 0.22340216322517206,
"grad_norm": 0.8904944658279419,
"learning_rate": 4.7724560683012735e-06,
"loss": 0.0649,
"step": 568
},
{
"epoch": 0.22379547689282203,
"grad_norm": 1.3853691816329956,
"learning_rate": 4.771022438046693e-06,
"loss": 0.0429,
"step": 569
},
{
"epoch": 0.22418879056047197,
"grad_norm": 1.6937843561172485,
"learning_rate": 4.769584522417691e-06,
"loss": 0.0831,
"step": 570
},
{
"epoch": 0.22458210422812191,
"grad_norm": 1.6160171031951904,
"learning_rate": 4.768142324127586e-06,
"loss": 0.0754,
"step": 571
},
{
"epoch": 0.22497541789577188,
"grad_norm": 1.2548290491104126,
"learning_rate": 4.766695845897778e-06,
"loss": 0.073,
"step": 572
},
{
"epoch": 0.22536873156342183,
"grad_norm": 2.645967483520508,
"learning_rate": 4.765245090457744e-06,
"loss": 0.1022,
"step": 573
},
{
"epoch": 0.22576204523107177,
"grad_norm": 1.2090085744857788,
"learning_rate": 4.763790060545028e-06,
"loss": 0.0449,
"step": 574
},
{
"epoch": 0.22615535889872174,
"grad_norm": 1.5384302139282227,
"learning_rate": 4.762330758905246e-06,
"loss": 0.0523,
"step": 575
},
{
"epoch": 0.22654867256637168,
"grad_norm": 1.3840306997299194,
"learning_rate": 4.760867188292068e-06,
"loss": 0.0409,
"step": 576
},
{
"epoch": 0.22694198623402162,
"grad_norm": 0.8169382214546204,
"learning_rate": 4.7593993514672255e-06,
"loss": 0.0526,
"step": 577
},
{
"epoch": 0.2273352999016716,
"grad_norm": 0.6939831972122192,
"learning_rate": 4.757927251200497e-06,
"loss": 0.0497,
"step": 578
},
{
"epoch": 0.22772861356932153,
"grad_norm": 2.4073455333709717,
"learning_rate": 4.756450890269705e-06,
"loss": 0.0703,
"step": 579
},
{
"epoch": 0.22812192723697147,
"grad_norm": 1.4490169286727905,
"learning_rate": 4.754970271460714e-06,
"loss": 0.0429,
"step": 580
},
{
"epoch": 0.22851524090462144,
"grad_norm": 0.8039276599884033,
"learning_rate": 4.753485397567424e-06,
"loss": 0.0525,
"step": 581
},
{
"epoch": 0.22890855457227138,
"grad_norm": 0.9220805764198303,
"learning_rate": 4.751996271391761e-06,
"loss": 0.056,
"step": 582
},
{
"epoch": 0.22930186823992132,
"grad_norm": 2.1960690021514893,
"learning_rate": 4.750502895743677e-06,
"loss": 0.0636,
"step": 583
},
{
"epoch": 0.2296951819075713,
"grad_norm": 1.5164406299591064,
"learning_rate": 4.749005273441143e-06,
"loss": 0.0557,
"step": 584
},
{
"epoch": 0.23008849557522124,
"grad_norm": 1.8541299104690552,
"learning_rate": 4.747503407310142e-06,
"loss": 0.0679,
"step": 585
},
{
"epoch": 0.23048180924287118,
"grad_norm": 5.52957010269165,
"learning_rate": 4.745997300184666e-06,
"loss": 0.0805,
"step": 586
},
{
"epoch": 0.23087512291052115,
"grad_norm": 1.318687915802002,
"learning_rate": 4.744486954906709e-06,
"loss": 0.0499,
"step": 587
},
{
"epoch": 0.2312684365781711,
"grad_norm": 1.1736847162246704,
"learning_rate": 4.742972374326262e-06,
"loss": 0.0371,
"step": 588
},
{
"epoch": 0.23166175024582103,
"grad_norm": 1.7209968566894531,
"learning_rate": 4.74145356130131e-06,
"loss": 0.0553,
"step": 589
},
{
"epoch": 0.232055063913471,
"grad_norm": 1.392303228378296,
"learning_rate": 4.739930518697823e-06,
"loss": 0.0468,
"step": 590
},
{
"epoch": 0.23244837758112094,
"grad_norm": 1.6198259592056274,
"learning_rate": 4.738403249389752e-06,
"loss": 0.0671,
"step": 591
},
{
"epoch": 0.23284169124877088,
"grad_norm": 1.394888997077942,
"learning_rate": 4.736871756259023e-06,
"loss": 0.0851,
"step": 592
},
{
"epoch": 0.23323500491642085,
"grad_norm": 1.2976491451263428,
"learning_rate": 4.7353360421955345e-06,
"loss": 0.0614,
"step": 593
},
{
"epoch": 0.2336283185840708,
"grad_norm": 1.2485517263412476,
"learning_rate": 4.733796110097148e-06,
"loss": 0.0429,
"step": 594
},
{
"epoch": 0.23402163225172073,
"grad_norm": 2.0384671688079834,
"learning_rate": 4.732251962869685e-06,
"loss": 0.0549,
"step": 595
},
{
"epoch": 0.2344149459193707,
"grad_norm": 2.514827251434326,
"learning_rate": 4.730703603426921e-06,
"loss": 0.0934,
"step": 596
},
{
"epoch": 0.23480825958702065,
"grad_norm": 1.5746873617172241,
"learning_rate": 4.729151034690579e-06,
"loss": 0.0797,
"step": 597
},
{
"epoch": 0.2352015732546706,
"grad_norm": 1.458757996559143,
"learning_rate": 4.727594259590326e-06,
"loss": 0.07,
"step": 598
},
{
"epoch": 0.23559488692232056,
"grad_norm": 1.9289155006408691,
"learning_rate": 4.726033281063766e-06,
"loss": 0.0447,
"step": 599
},
{
"epoch": 0.2359882005899705,
"grad_norm": 2.641873359680176,
"learning_rate": 4.724468102056434e-06,
"loss": 0.1165,
"step": 600
},
{
"epoch": 0.23638151425762044,
"grad_norm": 0.6296206116676331,
"learning_rate": 4.722898725521793e-06,
"loss": 0.0597,
"step": 601
},
{
"epoch": 0.2367748279252704,
"grad_norm": 1.7393361330032349,
"learning_rate": 4.721325154421224e-06,
"loss": 0.0508,
"step": 602
},
{
"epoch": 0.23716814159292035,
"grad_norm": 1.639045000076294,
"learning_rate": 4.7197473917240255e-06,
"loss": 0.0433,
"step": 603
},
{
"epoch": 0.2375614552605703,
"grad_norm": 1.4411070346832275,
"learning_rate": 4.718165440407404e-06,
"loss": 0.0626,
"step": 604
},
{
"epoch": 0.23795476892822026,
"grad_norm": 1.7141265869140625,
"learning_rate": 4.716579303456471e-06,
"loss": 0.0641,
"step": 605
},
{
"epoch": 0.2383480825958702,
"grad_norm": 1.1153072118759155,
"learning_rate": 4.714988983864235e-06,
"loss": 0.0524,
"step": 606
},
{
"epoch": 0.23874139626352014,
"grad_norm": 0.6169893741607666,
"learning_rate": 4.713394484631598e-06,
"loss": 0.0485,
"step": 607
},
{
"epoch": 0.23913470993117011,
"grad_norm": 2.24593186378479,
"learning_rate": 4.711795808767348e-06,
"loss": 0.0767,
"step": 608
},
{
"epoch": 0.23952802359882006,
"grad_norm": 0.8726077675819397,
"learning_rate": 4.7101929592881545e-06,
"loss": 0.0506,
"step": 609
},
{
"epoch": 0.23992133726647,
"grad_norm": 1.0482176542282104,
"learning_rate": 4.708585939218564e-06,
"loss": 0.0374,
"step": 610
},
{
"epoch": 0.24031465093411997,
"grad_norm": 1.031867265701294,
"learning_rate": 4.7069747515909905e-06,
"loss": 0.0513,
"step": 611
},
{
"epoch": 0.2407079646017699,
"grad_norm": 1.548361897468567,
"learning_rate": 4.7053593994457135e-06,
"loss": 0.0524,
"step": 612
},
{
"epoch": 0.24110127826941985,
"grad_norm": 2.367420196533203,
"learning_rate": 4.70373988583087e-06,
"loss": 0.0915,
"step": 613
},
{
"epoch": 0.24149459193706982,
"grad_norm": 1.440256953239441,
"learning_rate": 4.7021162138024524e-06,
"loss": 0.0829,
"step": 614
},
{
"epoch": 0.24188790560471976,
"grad_norm": 1.6830074787139893,
"learning_rate": 4.700488386424294e-06,
"loss": 0.0706,
"step": 615
},
{
"epoch": 0.2422812192723697,
"grad_norm": 2.811821699142456,
"learning_rate": 4.698856406768076e-06,
"loss": 0.0531,
"step": 616
},
{
"epoch": 0.24267453294001967,
"grad_norm": 2.031094551086426,
"learning_rate": 4.697220277913311e-06,
"loss": 0.0751,
"step": 617
},
{
"epoch": 0.2430678466076696,
"grad_norm": 1.9269078969955444,
"learning_rate": 4.695580002947341e-06,
"loss": 0.0624,
"step": 618
},
{
"epoch": 0.24346116027531955,
"grad_norm": 1.3828526735305786,
"learning_rate": 4.6939355849653325e-06,
"loss": 0.0776,
"step": 619
},
{
"epoch": 0.24385447394296952,
"grad_norm": 1.0781844854354858,
"learning_rate": 4.69228702707027e-06,
"loss": 0.0477,
"step": 620
},
{
"epoch": 0.24424778761061947,
"grad_norm": 1.0195046663284302,
"learning_rate": 4.69063433237295e-06,
"loss": 0.06,
"step": 621
},
{
"epoch": 0.2446411012782694,
"grad_norm": 0.6686704158782959,
"learning_rate": 4.688977503991975e-06,
"loss": 0.0713,
"step": 622
},
{
"epoch": 0.24503441494591938,
"grad_norm": 1.7740367650985718,
"learning_rate": 4.687316545053746e-06,
"loss": 0.092,
"step": 623
},
{
"epoch": 0.24542772861356932,
"grad_norm": 1.1935254335403442,
"learning_rate": 4.68565145869246e-06,
"loss": 0.0697,
"step": 624
},
{
"epoch": 0.24582104228121926,
"grad_norm": 0.7092412710189819,
"learning_rate": 4.683982248050103e-06,
"loss": 0.0647,
"step": 625
},
{
"epoch": 0.24621435594886923,
"grad_norm": 2.2962708473205566,
"learning_rate": 4.6823089162764425e-06,
"loss": 0.07,
"step": 626
},
{
"epoch": 0.24660766961651917,
"grad_norm": 1.1462363004684448,
"learning_rate": 4.6806314665290205e-06,
"loss": 0.0519,
"step": 627
},
{
"epoch": 0.2470009832841691,
"grad_norm": 2.2198500633239746,
"learning_rate": 4.678949901973154e-06,
"loss": 0.0411,
"step": 628
},
{
"epoch": 0.24739429695181908,
"grad_norm": 0.703561007976532,
"learning_rate": 4.677264225781921e-06,
"loss": 0.0505,
"step": 629
},
{
"epoch": 0.24778761061946902,
"grad_norm": 1.4070128202438354,
"learning_rate": 4.6755744411361585e-06,
"loss": 0.0659,
"step": 630
},
{
"epoch": 0.24818092428711896,
"grad_norm": 0.9832798838615417,
"learning_rate": 4.6738805512244575e-06,
"loss": 0.0917,
"step": 631
},
{
"epoch": 0.24857423795476893,
"grad_norm": 0.9056950807571411,
"learning_rate": 4.672182559243155e-06,
"loss": 0.0484,
"step": 632
},
{
"epoch": 0.24896755162241888,
"grad_norm": 2.0713984966278076,
"learning_rate": 4.670480468396327e-06,
"loss": 0.0729,
"step": 633
},
{
"epoch": 0.24936086529006882,
"grad_norm": 0.9963469505310059,
"learning_rate": 4.668774281895786e-06,
"loss": 0.0507,
"step": 634
},
{
"epoch": 0.2497541789577188,
"grad_norm": 0.9695498943328857,
"learning_rate": 4.667064002961073e-06,
"loss": 0.0538,
"step": 635
},
{
"epoch": 0.25014749262536873,
"grad_norm": 1.3090274333953857,
"learning_rate": 4.66534963481945e-06,
"loss": 0.0931,
"step": 636
},
{
"epoch": 0.25054080629301867,
"grad_norm": 1.2280491590499878,
"learning_rate": 4.663631180705894e-06,
"loss": 0.0488,
"step": 637
},
{
"epoch": 0.2509341199606686,
"grad_norm": 1.050603985786438,
"learning_rate": 4.661908643863096e-06,
"loss": 0.0723,
"step": 638
},
{
"epoch": 0.2513274336283186,
"grad_norm": 1.2820688486099243,
"learning_rate": 4.66018202754145e-06,
"loss": 0.0854,
"step": 639
},
{
"epoch": 0.25172074729596855,
"grad_norm": 0.9909592866897583,
"learning_rate": 4.658451334999043e-06,
"loss": 0.0613,
"step": 640
},
{
"epoch": 0.2521140609636185,
"grad_norm": 0.7117825746536255,
"learning_rate": 4.656716569501661e-06,
"loss": 0.0249,
"step": 641
},
{
"epoch": 0.25250737463126843,
"grad_norm": 1.803819179534912,
"learning_rate": 4.654977734322772e-06,
"loss": 0.0744,
"step": 642
},
{
"epoch": 0.2529006882989184,
"grad_norm": 1.2123903036117554,
"learning_rate": 4.653234832743521e-06,
"loss": 0.0893,
"step": 643
},
{
"epoch": 0.2532940019665683,
"grad_norm": 1.3053680658340454,
"learning_rate": 4.651487868052731e-06,
"loss": 0.0794,
"step": 644
},
{
"epoch": 0.2536873156342183,
"grad_norm": 1.5112253427505493,
"learning_rate": 4.64973684354689e-06,
"loss": 0.1139,
"step": 645
},
{
"epoch": 0.25408062930186825,
"grad_norm": 0.4444582164287567,
"learning_rate": 4.647981762530145e-06,
"loss": 0.031,
"step": 646
},
{
"epoch": 0.2544739429695182,
"grad_norm": 0.863317608833313,
"learning_rate": 4.6462226283143e-06,
"loss": 0.0336,
"step": 647
},
{
"epoch": 0.25486725663716814,
"grad_norm": 2.007761001586914,
"learning_rate": 4.644459444218807e-06,
"loss": 0.0531,
"step": 648
},
{
"epoch": 0.2552605703048181,
"grad_norm": 2.1189866065979004,
"learning_rate": 4.642692213570759e-06,
"loss": 0.0906,
"step": 649
},
{
"epoch": 0.255653883972468,
"grad_norm": 0.7463569045066833,
"learning_rate": 4.640920939704885e-06,
"loss": 0.0449,
"step": 650
},
{
"epoch": 0.256047197640118,
"grad_norm": 2.031602144241333,
"learning_rate": 4.639145625963544e-06,
"loss": 0.0673,
"step": 651
},
{
"epoch": 0.25644051130776796,
"grad_norm": 2.0455472469329834,
"learning_rate": 4.637366275696718e-06,
"loss": 0.0495,
"step": 652
},
{
"epoch": 0.2568338249754179,
"grad_norm": 1.2602909803390503,
"learning_rate": 4.635582892262006e-06,
"loss": 0.0442,
"step": 653
},
{
"epoch": 0.25722713864306784,
"grad_norm": 1.3121466636657715,
"learning_rate": 4.633795479024616e-06,
"loss": 0.0404,
"step": 654
},
{
"epoch": 0.2576204523107178,
"grad_norm": 1.028448224067688,
"learning_rate": 4.632004039357364e-06,
"loss": 0.0497,
"step": 655
},
{
"epoch": 0.2580137659783677,
"grad_norm": 0.9586936235427856,
"learning_rate": 4.630208576640659e-06,
"loss": 0.0499,
"step": 656
},
{
"epoch": 0.2584070796460177,
"grad_norm": 1.3646454811096191,
"learning_rate": 4.628409094262504e-06,
"loss": 0.0383,
"step": 657
},
{
"epoch": 0.25880039331366766,
"grad_norm": 1.6489843130111694,
"learning_rate": 4.6266055956184865e-06,
"loss": 0.0458,
"step": 658
},
{
"epoch": 0.2591937069813176,
"grad_norm": 1.8696314096450806,
"learning_rate": 4.624798084111773e-06,
"loss": 0.0783,
"step": 659
},
{
"epoch": 0.25958702064896755,
"grad_norm": 1.5261452198028564,
"learning_rate": 4.622986563153104e-06,
"loss": 0.0465,
"step": 660
},
{
"epoch": 0.2599803343166175,
"grad_norm": 1.8203606605529785,
"learning_rate": 4.621171036160781e-06,
"loss": 0.0767,
"step": 661
},
{
"epoch": 0.26037364798426743,
"grad_norm": 1.3250322341918945,
"learning_rate": 4.6193515065606675e-06,
"loss": 0.0607,
"step": 662
},
{
"epoch": 0.26076696165191743,
"grad_norm": 1.298017978668213,
"learning_rate": 4.617527977786182e-06,
"loss": 0.0619,
"step": 663
},
{
"epoch": 0.26116027531956737,
"grad_norm": 1.0446304082870483,
"learning_rate": 4.615700453278285e-06,
"loss": 0.0268,
"step": 664
},
{
"epoch": 0.2615535889872173,
"grad_norm": 1.0812922716140747,
"learning_rate": 4.61386893648548e-06,
"loss": 0.0519,
"step": 665
},
{
"epoch": 0.26194690265486725,
"grad_norm": 1.8242236375808716,
"learning_rate": 4.612033430863804e-06,
"loss": 0.0565,
"step": 666
},
{
"epoch": 0.2623402163225172,
"grad_norm": 1.567988634109497,
"learning_rate": 4.610193939876818e-06,
"loss": 0.0476,
"step": 667
},
{
"epoch": 0.26273352999016714,
"grad_norm": 3.7344436645507812,
"learning_rate": 4.608350466995606e-06,
"loss": 0.0519,
"step": 668
},
{
"epoch": 0.26312684365781713,
"grad_norm": 3.131584882736206,
"learning_rate": 4.606503015698765e-06,
"loss": 0.0696,
"step": 669
},
{
"epoch": 0.2635201573254671,
"grad_norm": 1.2186100482940674,
"learning_rate": 4.6046515894723985e-06,
"loss": 0.0596,
"step": 670
},
{
"epoch": 0.263913470993117,
"grad_norm": 0.8804354667663574,
"learning_rate": 4.602796191810113e-06,
"loss": 0.0465,
"step": 671
},
{
"epoch": 0.26430678466076696,
"grad_norm": 1.961540937423706,
"learning_rate": 4.600936826213004e-06,
"loss": 0.0756,
"step": 672
},
{
"epoch": 0.2647000983284169,
"grad_norm": 0.739213764667511,
"learning_rate": 4.59907349618966e-06,
"loss": 0.0475,
"step": 673
},
{
"epoch": 0.26509341199606684,
"grad_norm": 0.8394540548324585,
"learning_rate": 4.597206205256147e-06,
"loss": 0.0538,
"step": 674
},
{
"epoch": 0.26548672566371684,
"grad_norm": 1.5452135801315308,
"learning_rate": 4.595334956936007e-06,
"loss": 0.0664,
"step": 675
},
{
"epoch": 0.2658800393313668,
"grad_norm": 1.613324522972107,
"learning_rate": 4.593459754760248e-06,
"loss": 0.0673,
"step": 676
},
{
"epoch": 0.2662733529990167,
"grad_norm": 1.4427350759506226,
"learning_rate": 4.591580602267338e-06,
"loss": 0.0509,
"step": 677
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.7156988382339478,
"learning_rate": 4.589697503003203e-06,
"loss": 0.0601,
"step": 678
},
{
"epoch": 0.2670599803343166,
"grad_norm": 1.4072953462600708,
"learning_rate": 4.587810460521213e-06,
"loss": 0.0678,
"step": 679
},
{
"epoch": 0.26745329400196655,
"grad_norm": 0.7101967930793762,
"learning_rate": 4.585919478382178e-06,
"loss": 0.0522,
"step": 680
},
{
"epoch": 0.26784660766961654,
"grad_norm": 0.5038359761238098,
"learning_rate": 4.584024560154348e-06,
"loss": 0.0408,
"step": 681
},
{
"epoch": 0.2682399213372665,
"grad_norm": 1.1651291847229004,
"learning_rate": 4.582125709413392e-06,
"loss": 0.0719,
"step": 682
},
{
"epoch": 0.2686332350049164,
"grad_norm": 1.0390863418579102,
"learning_rate": 4.580222929742407e-06,
"loss": 0.0402,
"step": 683
},
{
"epoch": 0.26902654867256637,
"grad_norm": 1.8808722496032715,
"learning_rate": 4.5783162247318986e-06,
"loss": 0.0612,
"step": 684
},
{
"epoch": 0.2694198623402163,
"grad_norm": 1.4362890720367432,
"learning_rate": 4.576405597979782e-06,
"loss": 0.0367,
"step": 685
},
{
"epoch": 0.26981317600786625,
"grad_norm": 0.9547756910324097,
"learning_rate": 4.5744910530913725e-06,
"loss": 0.0799,
"step": 686
},
{
"epoch": 0.27020648967551625,
"grad_norm": 1.8914170265197754,
"learning_rate": 4.572572593679379e-06,
"loss": 0.048,
"step": 687
},
{
"epoch": 0.2705998033431662,
"grad_norm": 1.460436224937439,
"learning_rate": 4.5706502233638935e-06,
"loss": 0.0633,
"step": 688
},
{
"epoch": 0.27099311701081613,
"grad_norm": 1.7330501079559326,
"learning_rate": 4.568723945772394e-06,
"loss": 0.0332,
"step": 689
},
{
"epoch": 0.2713864306784661,
"grad_norm": 1.1326316595077515,
"learning_rate": 4.5667937645397276e-06,
"loss": 0.0555,
"step": 690
},
{
"epoch": 0.271779744346116,
"grad_norm": 0.8753216862678528,
"learning_rate": 4.564859683308107e-06,
"loss": 0.0416,
"step": 691
},
{
"epoch": 0.27217305801376596,
"grad_norm": 0.8659785389900208,
"learning_rate": 4.562921705727106e-06,
"loss": 0.0551,
"step": 692
},
{
"epoch": 0.27256637168141595,
"grad_norm": 0.502169668674469,
"learning_rate": 4.5609798354536495e-06,
"loss": 0.0284,
"step": 693
},
{
"epoch": 0.2729596853490659,
"grad_norm": 2.1083321571350098,
"learning_rate": 4.559034076152009e-06,
"loss": 0.0779,
"step": 694
},
{
"epoch": 0.27335299901671584,
"grad_norm": 1.5410869121551514,
"learning_rate": 4.557084431493793e-06,
"loss": 0.0788,
"step": 695
},
{
"epoch": 0.2737463126843658,
"grad_norm": 1.707189679145813,
"learning_rate": 4.555130905157943e-06,
"loss": 0.0921,
"step": 696
},
{
"epoch": 0.2741396263520157,
"grad_norm": 1.2371059656143188,
"learning_rate": 4.553173500830724e-06,
"loss": 0.0562,
"step": 697
},
{
"epoch": 0.27453294001966566,
"grad_norm": 1.6234147548675537,
"learning_rate": 4.55121222220572e-06,
"loss": 0.0471,
"step": 698
},
{
"epoch": 0.27492625368731566,
"grad_norm": 1.2629426717758179,
"learning_rate": 4.549247072983825e-06,
"loss": 0.0795,
"step": 699
},
{
"epoch": 0.2753195673549656,
"grad_norm": 1.7955608367919922,
"learning_rate": 4.5472780568732356e-06,
"loss": 0.0468,
"step": 700
},
{
"epoch": 0.27571288102261554,
"grad_norm": 7.252640724182129,
"learning_rate": 4.545305177589448e-06,
"loss": 0.0699,
"step": 701
},
{
"epoch": 0.2761061946902655,
"grad_norm": 1.8121711015701294,
"learning_rate": 4.5433284388552435e-06,
"loss": 0.0718,
"step": 702
},
{
"epoch": 0.2764995083579154,
"grad_norm": 0.901907742023468,
"learning_rate": 4.541347844400692e-06,
"loss": 0.0255,
"step": 703
},
{
"epoch": 0.27689282202556537,
"grad_norm": 0.7126281261444092,
"learning_rate": 4.539363397963134e-06,
"loss": 0.0509,
"step": 704
},
{
"epoch": 0.27728613569321536,
"grad_norm": 2.012707233428955,
"learning_rate": 4.537375103287183e-06,
"loss": 0.0904,
"step": 705
},
{
"epoch": 0.2776794493608653,
"grad_norm": 1.7197178602218628,
"learning_rate": 4.53538296412471e-06,
"loss": 0.0617,
"step": 706
},
{
"epoch": 0.27807276302851525,
"grad_norm": 2.5714545249938965,
"learning_rate": 4.533386984234841e-06,
"loss": 0.0825,
"step": 707
},
{
"epoch": 0.2784660766961652,
"grad_norm": 1.3491824865341187,
"learning_rate": 4.5313871673839525e-06,
"loss": 0.0545,
"step": 708
},
{
"epoch": 0.27885939036381513,
"grad_norm": 1.0081161260604858,
"learning_rate": 4.52938351734566e-06,
"loss": 0.046,
"step": 709
},
{
"epoch": 0.27925270403146507,
"grad_norm": 1.3097039461135864,
"learning_rate": 4.52737603790081e-06,
"loss": 0.0678,
"step": 710
},
{
"epoch": 0.27964601769911507,
"grad_norm": 1.264832615852356,
"learning_rate": 4.525364732837476e-06,
"loss": 0.0408,
"step": 711
},
{
"epoch": 0.280039331366765,
"grad_norm": 1.6724627017974854,
"learning_rate": 4.523349605950953e-06,
"loss": 0.0583,
"step": 712
},
{
"epoch": 0.28043264503441495,
"grad_norm": 1.2600414752960205,
"learning_rate": 4.521330661043744e-06,
"loss": 0.0762,
"step": 713
},
{
"epoch": 0.2808259587020649,
"grad_norm": 0.8454362750053406,
"learning_rate": 4.519307901925558e-06,
"loss": 0.0433,
"step": 714
},
{
"epoch": 0.28121927236971483,
"grad_norm": 2.131969451904297,
"learning_rate": 4.517281332413302e-06,
"loss": 0.0738,
"step": 715
},
{
"epoch": 0.2816125860373648,
"grad_norm": 2.226288080215454,
"learning_rate": 4.515250956331072e-06,
"loss": 0.0892,
"step": 716
},
{
"epoch": 0.2820058997050148,
"grad_norm": 1.6737391948699951,
"learning_rate": 4.513216777510149e-06,
"loss": 0.0556,
"step": 717
},
{
"epoch": 0.2823992133726647,
"grad_norm": 1.5575467348098755,
"learning_rate": 4.511178799788987e-06,
"loss": 0.0561,
"step": 718
},
{
"epoch": 0.28279252704031466,
"grad_norm": 1.7405011653900146,
"learning_rate": 4.50913702701321e-06,
"loss": 0.0653,
"step": 719
},
{
"epoch": 0.2831858407079646,
"grad_norm": 1.097738265991211,
"learning_rate": 4.507091463035601e-06,
"loss": 0.0772,
"step": 720
},
{
"epoch": 0.28357915437561454,
"grad_norm": 0.8409376740455627,
"learning_rate": 4.505042111716103e-06,
"loss": 0.0645,
"step": 721
},
{
"epoch": 0.2839724680432645,
"grad_norm": 1.1851140260696411,
"learning_rate": 4.502988976921797e-06,
"loss": 0.0462,
"step": 722
},
{
"epoch": 0.2843657817109145,
"grad_norm": 1.7740516662597656,
"learning_rate": 4.50093206252691e-06,
"loss": 0.0717,
"step": 723
},
{
"epoch": 0.2847590953785644,
"grad_norm": 2.491065263748169,
"learning_rate": 4.498871372412798e-06,
"loss": 0.0575,
"step": 724
},
{
"epoch": 0.28515240904621436,
"grad_norm": 1.446291446685791,
"learning_rate": 4.496806910467944e-06,
"loss": 0.0566,
"step": 725
},
{
"epoch": 0.2855457227138643,
"grad_norm": 1.2584576606750488,
"learning_rate": 4.494738680587946e-06,
"loss": 0.053,
"step": 726
},
{
"epoch": 0.28593903638151424,
"grad_norm": 1.188159704208374,
"learning_rate": 4.492666686675511e-06,
"loss": 0.0627,
"step": 727
},
{
"epoch": 0.2863323500491642,
"grad_norm": 1.2687791585922241,
"learning_rate": 4.490590932640453e-06,
"loss": 0.0676,
"step": 728
},
{
"epoch": 0.2867256637168142,
"grad_norm": 1.7722615003585815,
"learning_rate": 4.488511422399677e-06,
"loss": 0.0548,
"step": 729
},
{
"epoch": 0.2871189773844641,
"grad_norm": 3.2244741916656494,
"learning_rate": 4.48642815987718e-06,
"loss": 0.0763,
"step": 730
},
{
"epoch": 0.28751229105211407,
"grad_norm": 1.1106655597686768,
"learning_rate": 4.484341149004035e-06,
"loss": 0.0862,
"step": 731
},
{
"epoch": 0.287905604719764,
"grad_norm": 0.6258023381233215,
"learning_rate": 4.482250393718392e-06,
"loss": 0.0526,
"step": 732
},
{
"epoch": 0.28829891838741395,
"grad_norm": 0.7904531955718994,
"learning_rate": 4.480155897965463e-06,
"loss": 0.0367,
"step": 733
},
{
"epoch": 0.2886922320550639,
"grad_norm": 1.5454163551330566,
"learning_rate": 4.47805766569752e-06,
"loss": 0.0747,
"step": 734
},
{
"epoch": 0.2890855457227139,
"grad_norm": 2.1076667308807373,
"learning_rate": 4.475955700873888e-06,
"loss": 0.0939,
"step": 735
},
{
"epoch": 0.28947885939036383,
"grad_norm": 1.407893419265747,
"learning_rate": 4.473850007460932e-06,
"loss": 0.0524,
"step": 736
},
{
"epoch": 0.28987217305801377,
"grad_norm": 1.957629680633545,
"learning_rate": 4.471740589432053e-06,
"loss": 0.0541,
"step": 737
},
{
"epoch": 0.2902654867256637,
"grad_norm": 1.0253725051879883,
"learning_rate": 4.469627450767682e-06,
"loss": 0.0478,
"step": 738
},
{
"epoch": 0.29065880039331365,
"grad_norm": 1.5762360095977783,
"learning_rate": 4.46751059545527e-06,
"loss": 0.0936,
"step": 739
},
{
"epoch": 0.2910521140609636,
"grad_norm": 1.2460707426071167,
"learning_rate": 4.465390027489279e-06,
"loss": 0.0596,
"step": 740
},
{
"epoch": 0.2914454277286136,
"grad_norm": 1.042962670326233,
"learning_rate": 4.463265750871182e-06,
"loss": 0.0615,
"step": 741
},
{
"epoch": 0.29183874139626353,
"grad_norm": 1.554513692855835,
"learning_rate": 4.461137769609445e-06,
"loss": 0.0562,
"step": 742
},
{
"epoch": 0.2922320550639135,
"grad_norm": 1.5099841356277466,
"learning_rate": 4.459006087719527e-06,
"loss": 0.0462,
"step": 743
},
{
"epoch": 0.2926253687315634,
"grad_norm": 0.8272073864936829,
"learning_rate": 4.45687070922387e-06,
"loss": 0.0311,
"step": 744
},
{
"epoch": 0.29301868239921336,
"grad_norm": 1.1962639093399048,
"learning_rate": 4.4547316381518905e-06,
"loss": 0.054,
"step": 745
},
{
"epoch": 0.2934119960668633,
"grad_norm": 0.7265387773513794,
"learning_rate": 4.4525888785399725e-06,
"loss": 0.0322,
"step": 746
},
{
"epoch": 0.2938053097345133,
"grad_norm": 2.045783042907715,
"learning_rate": 4.450442434431463e-06,
"loss": 0.0668,
"step": 747
},
{
"epoch": 0.29419862340216324,
"grad_norm": 1.417593240737915,
"learning_rate": 4.448292309876657e-06,
"loss": 0.0499,
"step": 748
},
{
"epoch": 0.2945919370698132,
"grad_norm": 1.4235261678695679,
"learning_rate": 4.4461385089328e-06,
"loss": 0.0904,
"step": 749
},
{
"epoch": 0.2949852507374631,
"grad_norm": 1.050933837890625,
"learning_rate": 4.44398103566407e-06,
"loss": 0.05,
"step": 750
},
{
"epoch": 0.29537856440511306,
"grad_norm": 1.3113094568252563,
"learning_rate": 4.4418198941415756e-06,
"loss": 0.0717,
"step": 751
},
{
"epoch": 0.295771878072763,
"grad_norm": 1.1153532266616821,
"learning_rate": 4.4396550884433495e-06,
"loss": 0.0613,
"step": 752
},
{
"epoch": 0.296165191740413,
"grad_norm": 1.6574000120162964,
"learning_rate": 4.437486622654337e-06,
"loss": 0.08,
"step": 753
},
{
"epoch": 0.29655850540806294,
"grad_norm": 1.037023901939392,
"learning_rate": 4.43531450086639e-06,
"loss": 0.059,
"step": 754
},
{
"epoch": 0.2969518190757129,
"grad_norm": 1.3382397890090942,
"learning_rate": 4.433138727178259e-06,
"loss": 0.0504,
"step": 755
},
{
"epoch": 0.2973451327433628,
"grad_norm": 2.023531198501587,
"learning_rate": 4.4309593056955865e-06,
"loss": 0.0682,
"step": 756
},
{
"epoch": 0.29773844641101277,
"grad_norm": 1.3962974548339844,
"learning_rate": 4.4287762405308974e-06,
"loss": 0.0678,
"step": 757
},
{
"epoch": 0.2981317600786627,
"grad_norm": 0.6099796295166016,
"learning_rate": 4.426589535803593e-06,
"loss": 0.0496,
"step": 758
},
{
"epoch": 0.2985250737463127,
"grad_norm": 1.6071325540542603,
"learning_rate": 4.424399195639941e-06,
"loss": 0.0519,
"step": 759
},
{
"epoch": 0.29891838741396265,
"grad_norm": 1.116490125656128,
"learning_rate": 4.422205224173071e-06,
"loss": 0.0651,
"step": 760
},
{
"epoch": 0.2993117010816126,
"grad_norm": 1.163526177406311,
"learning_rate": 4.420007625542963e-06,
"loss": 0.042,
"step": 761
},
{
"epoch": 0.29970501474926253,
"grad_norm": 0.6789044737815857,
"learning_rate": 4.417806403896442e-06,
"loss": 0.0652,
"step": 762
},
{
"epoch": 0.3000983284169125,
"grad_norm": 1.6137206554412842,
"learning_rate": 4.41560156338717e-06,
"loss": 0.073,
"step": 763
},
{
"epoch": 0.3004916420845624,
"grad_norm": 1.9308634996414185,
"learning_rate": 4.413393108175637e-06,
"loss": 0.0805,
"step": 764
},
{
"epoch": 0.3008849557522124,
"grad_norm": 1.6792504787445068,
"learning_rate": 4.411181042429156e-06,
"loss": 0.0471,
"step": 765
},
{
"epoch": 0.30127826941986235,
"grad_norm": 1.1271363496780396,
"learning_rate": 4.40896537032185e-06,
"loss": 0.0378,
"step": 766
},
{
"epoch": 0.3016715830875123,
"grad_norm": 1.0671911239624023,
"learning_rate": 4.406746096034647e-06,
"loss": 0.0548,
"step": 767
},
{
"epoch": 0.30206489675516224,
"grad_norm": 1.2227768898010254,
"learning_rate": 4.4045232237552756e-06,
"loss": 0.0701,
"step": 768
},
{
"epoch": 0.3024582104228122,
"grad_norm": 1.471924901008606,
"learning_rate": 4.4022967576782525e-06,
"loss": 0.0568,
"step": 769
},
{
"epoch": 0.3028515240904621,
"grad_norm": 1.6219385862350464,
"learning_rate": 4.400066702004874e-06,
"loss": 0.05,
"step": 770
},
{
"epoch": 0.3032448377581121,
"grad_norm": 1.4471542835235596,
"learning_rate": 4.39783306094321e-06,
"loss": 0.0685,
"step": 771
},
{
"epoch": 0.30363815142576206,
"grad_norm": 1.525600552558899,
"learning_rate": 4.395595838708099e-06,
"loss": 0.0513,
"step": 772
},
{
"epoch": 0.304031465093412,
"grad_norm": 1.3881157636642456,
"learning_rate": 4.393355039521134e-06,
"loss": 0.0812,
"step": 773
},
{
"epoch": 0.30442477876106194,
"grad_norm": 1.1738461256027222,
"learning_rate": 4.391110667610658e-06,
"loss": 0.0595,
"step": 774
},
{
"epoch": 0.3048180924287119,
"grad_norm": 1.1576417684555054,
"learning_rate": 4.388862727211759e-06,
"loss": 0.0541,
"step": 775
},
{
"epoch": 0.3052114060963618,
"grad_norm": 1.283400058746338,
"learning_rate": 4.386611222566254e-06,
"loss": 0.0505,
"step": 776
},
{
"epoch": 0.3056047197640118,
"grad_norm": 1.4386646747589111,
"learning_rate": 4.384356157922688e-06,
"loss": 0.0706,
"step": 777
},
{
"epoch": 0.30599803343166176,
"grad_norm": 2.0160024166107178,
"learning_rate": 4.382097537536322e-06,
"loss": 0.0596,
"step": 778
},
{
"epoch": 0.3063913470993117,
"grad_norm": 1.3747514486312866,
"learning_rate": 4.379835365669132e-06,
"loss": 0.0561,
"step": 779
},
{
"epoch": 0.30678466076696165,
"grad_norm": 1.5668084621429443,
"learning_rate": 4.377569646589789e-06,
"loss": 0.0522,
"step": 780
},
{
"epoch": 0.3071779744346116,
"grad_norm": 1.6369160413742065,
"learning_rate": 4.375300384573659e-06,
"loss": 0.05,
"step": 781
},
{
"epoch": 0.30757128810226153,
"grad_norm": 1.2633172273635864,
"learning_rate": 4.373027583902796e-06,
"loss": 0.0447,
"step": 782
},
{
"epoch": 0.30796460176991153,
"grad_norm": 1.3119875192642212,
"learning_rate": 4.370751248865929e-06,
"loss": 0.062,
"step": 783
},
{
"epoch": 0.30835791543756147,
"grad_norm": 2.1404073238372803,
"learning_rate": 4.368471383758459e-06,
"loss": 0.0446,
"step": 784
},
{
"epoch": 0.3087512291052114,
"grad_norm": 0.7563901543617249,
"learning_rate": 4.366187992882444e-06,
"loss": 0.0429,
"step": 785
},
{
"epoch": 0.30914454277286135,
"grad_norm": 0.7048685550689697,
"learning_rate": 4.3639010805466e-06,
"loss": 0.0299,
"step": 786
},
{
"epoch": 0.3095378564405113,
"grad_norm": 0.7395270466804504,
"learning_rate": 4.361610651066283e-06,
"loss": 0.0334,
"step": 787
},
{
"epoch": 0.30993117010816124,
"grad_norm": 1.2910830974578857,
"learning_rate": 4.35931670876349e-06,
"loss": 0.0666,
"step": 788
},
{
"epoch": 0.31032448377581123,
"grad_norm": 3.32393217086792,
"learning_rate": 4.357019257966844e-06,
"loss": 0.0773,
"step": 789
},
{
"epoch": 0.3107177974434612,
"grad_norm": 1.2098692655563354,
"learning_rate": 4.354718303011588e-06,
"loss": 0.0524,
"step": 790
},
{
"epoch": 0.3111111111111111,
"grad_norm": 1.650527834892273,
"learning_rate": 4.352413848239579e-06,
"loss": 0.0518,
"step": 791
},
{
"epoch": 0.31150442477876106,
"grad_norm": 0.8377374410629272,
"learning_rate": 4.35010589799928e-06,
"loss": 0.0482,
"step": 792
},
{
"epoch": 0.311897738446411,
"grad_norm": 1.225882649421692,
"learning_rate": 4.347794456645744e-06,
"loss": 0.0405,
"step": 793
},
{
"epoch": 0.31229105211406094,
"grad_norm": 2.0014147758483887,
"learning_rate": 4.345479528540618e-06,
"loss": 0.053,
"step": 794
},
{
"epoch": 0.31268436578171094,
"grad_norm": 1.2061558961868286,
"learning_rate": 4.343161118052123e-06,
"loss": 0.045,
"step": 795
},
{
"epoch": 0.3130776794493609,
"grad_norm": 0.8555061221122742,
"learning_rate": 4.340839229555056e-06,
"loss": 0.0673,
"step": 796
},
{
"epoch": 0.3134709931170108,
"grad_norm": 1.4630858898162842,
"learning_rate": 4.338513867430773e-06,
"loss": 0.0414,
"step": 797
},
{
"epoch": 0.31386430678466076,
"grad_norm": 1.101480484008789,
"learning_rate": 4.336185036067187e-06,
"loss": 0.0383,
"step": 798
},
{
"epoch": 0.3142576204523107,
"grad_norm": 0.6861633658409119,
"learning_rate": 4.3338527398587575e-06,
"loss": 0.0393,
"step": 799
},
{
"epoch": 0.31465093411996065,
"grad_norm": 1.0716795921325684,
"learning_rate": 4.33151698320648e-06,
"loss": 0.0407,
"step": 800
},
{
"epoch": 0.31504424778761064,
"grad_norm": 1.0103176832199097,
"learning_rate": 4.329177770517881e-06,
"loss": 0.0467,
"step": 801
},
{
"epoch": 0.3154375614552606,
"grad_norm": 1.1415047645568848,
"learning_rate": 4.32683510620701e-06,
"loss": 0.0518,
"step": 802
},
{
"epoch": 0.3158308751229105,
"grad_norm": 1.0959949493408203,
"learning_rate": 4.324488994694427e-06,
"loss": 0.0447,
"step": 803
},
{
"epoch": 0.31622418879056047,
"grad_norm": 3.7971184253692627,
"learning_rate": 4.322139440407198e-06,
"loss": 0.1218,
"step": 804
},
{
"epoch": 0.3166175024582104,
"grad_norm": 1.0682744979858398,
"learning_rate": 4.319786447778887e-06,
"loss": 0.0271,
"step": 805
},
{
"epoch": 0.31701081612586035,
"grad_norm": 0.7397903800010681,
"learning_rate": 4.317430021249543e-06,
"loss": 0.0313,
"step": 806
},
{
"epoch": 0.31740412979351035,
"grad_norm": 1.9803013801574707,
"learning_rate": 4.315070165265695e-06,
"loss": 0.0832,
"step": 807
},
{
"epoch": 0.3177974434611603,
"grad_norm": 0.9591525793075562,
"learning_rate": 4.312706884280349e-06,
"loss": 0.0611,
"step": 808
},
{
"epoch": 0.31819075712881023,
"grad_norm": 0.7980911731719971,
"learning_rate": 4.310340182752965e-06,
"loss": 0.0163,
"step": 809
},
{
"epoch": 0.3185840707964602,
"grad_norm": 0.8986029028892517,
"learning_rate": 4.307970065149464e-06,
"loss": 0.0382,
"step": 810
},
{
"epoch": 0.3189773844641101,
"grad_norm": 0.9218258857727051,
"learning_rate": 4.305596535942211e-06,
"loss": 0.0362,
"step": 811
},
{
"epoch": 0.31937069813176006,
"grad_norm": 1.9387575387954712,
"learning_rate": 4.303219599610009e-06,
"loss": 0.045,
"step": 812
},
{
"epoch": 0.31976401179941005,
"grad_norm": 2.1032979488372803,
"learning_rate": 4.300839260638089e-06,
"loss": 0.0583,
"step": 813
},
{
"epoch": 0.32015732546706,
"grad_norm": 0.8777870535850525,
"learning_rate": 4.298455523518102e-06,
"loss": 0.0611,
"step": 814
},
{
"epoch": 0.32055063913470994,
"grad_norm": 1.7572643756866455,
"learning_rate": 4.296068392748116e-06,
"loss": 0.053,
"step": 815
},
{
"epoch": 0.3209439528023599,
"grad_norm": 1.3729215860366821,
"learning_rate": 4.293677872832599e-06,
"loss": 0.1014,
"step": 816
},
{
"epoch": 0.3213372664700098,
"grad_norm": 2.968247175216675,
"learning_rate": 4.291283968282413e-06,
"loss": 0.0422,
"step": 817
},
{
"epoch": 0.32173058013765976,
"grad_norm": 1.2367733716964722,
"learning_rate": 4.288886683614809e-06,
"loss": 0.0598,
"step": 818
},
{
"epoch": 0.32212389380530976,
"grad_norm": 2.149622678756714,
"learning_rate": 4.286486023353417e-06,
"loss": 0.0834,
"step": 819
},
{
"epoch": 0.3225172074729597,
"grad_norm": 2.1104652881622314,
"learning_rate": 4.284081992028235e-06,
"loss": 0.0764,
"step": 820
},
{
"epoch": 0.32291052114060964,
"grad_norm": 1.5311528444290161,
"learning_rate": 4.281674594175621e-06,
"loss": 0.0586,
"step": 821
},
{
"epoch": 0.3233038348082596,
"grad_norm": 1.432000756263733,
"learning_rate": 4.2792638343382894e-06,
"loss": 0.0787,
"step": 822
},
{
"epoch": 0.3236971484759095,
"grad_norm": 1.2007765769958496,
"learning_rate": 4.276849717065295e-06,
"loss": 0.0462,
"step": 823
},
{
"epoch": 0.32409046214355947,
"grad_norm": 1.0811890363693237,
"learning_rate": 4.2744322469120296e-06,
"loss": 0.0624,
"step": 824
},
{
"epoch": 0.32448377581120946,
"grad_norm": 1.440487265586853,
"learning_rate": 4.272011428440212e-06,
"loss": 0.0557,
"step": 825
},
{
"epoch": 0.3248770894788594,
"grad_norm": 2.677267551422119,
"learning_rate": 4.269587266217878e-06,
"loss": 0.0804,
"step": 826
},
{
"epoch": 0.32527040314650935,
"grad_norm": 1.07245671749115,
"learning_rate": 4.2671597648193745e-06,
"loss": 0.0542,
"step": 827
},
{
"epoch": 0.3256637168141593,
"grad_norm": 1.0649880170822144,
"learning_rate": 4.264728928825347e-06,
"loss": 0.0573,
"step": 828
},
{
"epoch": 0.32605703048180923,
"grad_norm": 1.880872130393982,
"learning_rate": 4.262294762822738e-06,
"loss": 0.0892,
"step": 829
},
{
"epoch": 0.32645034414945917,
"grad_norm": 1.7007864713668823,
"learning_rate": 4.259857271404767e-06,
"loss": 0.097,
"step": 830
},
{
"epoch": 0.32684365781710917,
"grad_norm": 0.9796857237815857,
"learning_rate": 4.257416459170935e-06,
"loss": 0.0372,
"step": 831
},
{
"epoch": 0.3272369714847591,
"grad_norm": 1.3802924156188965,
"learning_rate": 4.254972330727004e-06,
"loss": 0.0388,
"step": 832
},
{
"epoch": 0.32763028515240905,
"grad_norm": 1.8189585208892822,
"learning_rate": 4.252524890685e-06,
"loss": 0.0504,
"step": 833
},
{
"epoch": 0.328023598820059,
"grad_norm": 1.2440087795257568,
"learning_rate": 4.250074143663189e-06,
"loss": 0.055,
"step": 834
},
{
"epoch": 0.32841691248770893,
"grad_norm": 1.26856529712677,
"learning_rate": 4.247620094286085e-06,
"loss": 0.0528,
"step": 835
},
{
"epoch": 0.3288102261553589,
"grad_norm": 1.8983615636825562,
"learning_rate": 4.2451627471844305e-06,
"loss": 0.0527,
"step": 836
},
{
"epoch": 0.3292035398230089,
"grad_norm": 0.9810947179794312,
"learning_rate": 4.24270210699519e-06,
"loss": 0.04,
"step": 837
},
{
"epoch": 0.3295968534906588,
"grad_norm": 1.2199605703353882,
"learning_rate": 4.240238178361543e-06,
"loss": 0.0443,
"step": 838
},
{
"epoch": 0.32999016715830876,
"grad_norm": 0.5256842374801636,
"learning_rate": 4.237770965932875e-06,
"loss": 0.0267,
"step": 839
},
{
"epoch": 0.3303834808259587,
"grad_norm": 1.456432819366455,
"learning_rate": 4.235300474364766e-06,
"loss": 0.0623,
"step": 840
},
{
"epoch": 0.33077679449360864,
"grad_norm": 1.4406569004058838,
"learning_rate": 4.232826708318985e-06,
"loss": 0.0453,
"step": 841
},
{
"epoch": 0.3311701081612586,
"grad_norm": 1.9302328824996948,
"learning_rate": 4.230349672463481e-06,
"loss": 0.0655,
"step": 842
},
{
"epoch": 0.3315634218289086,
"grad_norm": 0.7055051922798157,
"learning_rate": 4.22786937147237e-06,
"loss": 0.0405,
"step": 843
},
{
"epoch": 0.3319567354965585,
"grad_norm": 2.823591947555542,
"learning_rate": 4.2253858100259304e-06,
"loss": 0.1111,
"step": 844
},
{
"epoch": 0.33235004916420846,
"grad_norm": 1.458694577217102,
"learning_rate": 4.222898992810596e-06,
"loss": 0.0688,
"step": 845
},
{
"epoch": 0.3327433628318584,
"grad_norm": 1.3440479040145874,
"learning_rate": 4.220408924518939e-06,
"loss": 0.0654,
"step": 846
},
{
"epoch": 0.33313667649950834,
"grad_norm": 1.2197304964065552,
"learning_rate": 4.217915609849671e-06,
"loss": 0.0269,
"step": 847
},
{
"epoch": 0.3335299901671583,
"grad_norm": 1.0218877792358398,
"learning_rate": 4.215419053507626e-06,
"loss": 0.0525,
"step": 848
},
{
"epoch": 0.3339233038348083,
"grad_norm": 1.4025174379348755,
"learning_rate": 4.212919260203757e-06,
"loss": 0.0947,
"step": 849
},
{
"epoch": 0.3343166175024582,
"grad_norm": 0.7898326516151428,
"learning_rate": 4.210416234655125e-06,
"loss": 0.0337,
"step": 850
},
{
"epoch": 0.33470993117010817,
"grad_norm": 1.196540355682373,
"learning_rate": 4.207909981584889e-06,
"loss": 0.0578,
"step": 851
},
{
"epoch": 0.3351032448377581,
"grad_norm": 0.926796555519104,
"learning_rate": 4.2054005057223e-06,
"loss": 0.0672,
"step": 852
},
{
"epoch": 0.33549655850540805,
"grad_norm": 1.2736568450927734,
"learning_rate": 4.202887811802687e-06,
"loss": 0.0484,
"step": 853
},
{
"epoch": 0.335889872173058,
"grad_norm": 1.2440752983093262,
"learning_rate": 4.200371904567457e-06,
"loss": 0.0478,
"step": 854
},
{
"epoch": 0.336283185840708,
"grad_norm": 1.4759784936904907,
"learning_rate": 4.197852788764075e-06,
"loss": 0.0458,
"step": 855
},
{
"epoch": 0.33667649950835793,
"grad_norm": 0.7424830794334412,
"learning_rate": 4.195330469146063e-06,
"loss": 0.0327,
"step": 856
},
{
"epoch": 0.33706981317600787,
"grad_norm": 1.2250968217849731,
"learning_rate": 4.1928049504729886e-06,
"loss": 0.0637,
"step": 857
},
{
"epoch": 0.3374631268436578,
"grad_norm": 1.2263579368591309,
"learning_rate": 4.1902762375104555e-06,
"loss": 0.0733,
"step": 858
},
{
"epoch": 0.33785644051130775,
"grad_norm": 0.5867930054664612,
"learning_rate": 4.187744335030095e-06,
"loss": 0.055,
"step": 859
},
{
"epoch": 0.3382497541789577,
"grad_norm": 2.040759563446045,
"learning_rate": 4.185209247809557e-06,
"loss": 0.0664,
"step": 860
},
{
"epoch": 0.3386430678466077,
"grad_norm": 2.09037709236145,
"learning_rate": 4.182670980632501e-06,
"loss": 0.0728,
"step": 861
},
{
"epoch": 0.33903638151425763,
"grad_norm": 3.822634220123291,
"learning_rate": 4.180129538288587e-06,
"loss": 0.0912,
"step": 862
},
{
"epoch": 0.3394296951819076,
"grad_norm": 1.7590773105621338,
"learning_rate": 4.177584925573466e-06,
"loss": 0.0623,
"step": 863
},
{
"epoch": 0.3398230088495575,
"grad_norm": 1.2151440382003784,
"learning_rate": 4.175037147288772e-06,
"loss": 0.044,
"step": 864
},
{
"epoch": 0.34021632251720746,
"grad_norm": 0.765602171421051,
"learning_rate": 4.172486208242113e-06,
"loss": 0.0811,
"step": 865
},
{
"epoch": 0.3406096361848574,
"grad_norm": 0.9690750241279602,
"learning_rate": 4.169932113247059e-06,
"loss": 0.0587,
"step": 866
},
{
"epoch": 0.3410029498525074,
"grad_norm": 0.6641612648963928,
"learning_rate": 4.167374867123138e-06,
"loss": 0.0336,
"step": 867
},
{
"epoch": 0.34139626352015734,
"grad_norm": 0.9194386601448059,
"learning_rate": 4.164814474695823e-06,
"loss": 0.0566,
"step": 868
},
{
"epoch": 0.3417895771878073,
"grad_norm": 2.2128334045410156,
"learning_rate": 4.162250940796523e-06,
"loss": 0.074,
"step": 869
},
{
"epoch": 0.3421828908554572,
"grad_norm": 1.8464068174362183,
"learning_rate": 4.159684270262576e-06,
"loss": 0.0736,
"step": 870
},
{
"epoch": 0.34257620452310716,
"grad_norm": 0.9694234728813171,
"learning_rate": 4.157114467937239e-06,
"loss": 0.0413,
"step": 871
},
{
"epoch": 0.3429695181907571,
"grad_norm": 1.4554444551467896,
"learning_rate": 4.154541538669677e-06,
"loss": 0.0468,
"step": 872
},
{
"epoch": 0.3433628318584071,
"grad_norm": 1.3524583578109741,
"learning_rate": 4.151965487314959e-06,
"loss": 0.049,
"step": 873
},
{
"epoch": 0.34375614552605704,
"grad_norm": 1.6620694398880005,
"learning_rate": 4.1493863187340415e-06,
"loss": 0.0686,
"step": 874
},
{
"epoch": 0.344149459193707,
"grad_norm": 0.8126603364944458,
"learning_rate": 4.146804037793763e-06,
"loss": 0.0335,
"step": 875
},
{
"epoch": 0.3445427728613569,
"grad_norm": 1.852401852607727,
"learning_rate": 4.144218649366839e-06,
"loss": 0.0488,
"step": 876
},
{
"epoch": 0.34493608652900687,
"grad_norm": 1.165703296661377,
"learning_rate": 4.141630158331845e-06,
"loss": 0.0464,
"step": 877
},
{
"epoch": 0.3453294001966568,
"grad_norm": 2.391685962677002,
"learning_rate": 4.139038569573213e-06,
"loss": 0.0829,
"step": 878
},
{
"epoch": 0.3457227138643068,
"grad_norm": 1.832273006439209,
"learning_rate": 4.1364438879812194e-06,
"loss": 0.0406,
"step": 879
},
{
"epoch": 0.34611602753195675,
"grad_norm": 1.1527806520462036,
"learning_rate": 4.1338461184519776e-06,
"loss": 0.0682,
"step": 880
},
{
"epoch": 0.3465093411996067,
"grad_norm": 1.8680974245071411,
"learning_rate": 4.131245265887426e-06,
"loss": 0.0847,
"step": 881
},
{
"epoch": 0.34690265486725663,
"grad_norm": 1.7685651779174805,
"learning_rate": 4.1286413351953235e-06,
"loss": 0.0461,
"step": 882
},
{
"epoch": 0.3472959685349066,
"grad_norm": 2.0602667331695557,
"learning_rate": 4.126034331289235e-06,
"loss": 0.0992,
"step": 883
},
{
"epoch": 0.3476892822025565,
"grad_norm": 1.4323168992996216,
"learning_rate": 4.123424259088525e-06,
"loss": 0.0992,
"step": 884
},
{
"epoch": 0.3480825958702065,
"grad_norm": 0.9091783165931702,
"learning_rate": 4.120811123518349e-06,
"loss": 0.0519,
"step": 885
},
{
"epoch": 0.34847590953785645,
"grad_norm": 1.3111385107040405,
"learning_rate": 4.1181949295096415e-06,
"loss": 0.0811,
"step": 886
},
{
"epoch": 0.3488692232055064,
"grad_norm": 2.218848705291748,
"learning_rate": 4.11557568199911e-06,
"loss": 0.0743,
"step": 887
},
{
"epoch": 0.34926253687315634,
"grad_norm": 0.9991410970687866,
"learning_rate": 4.112953385929221e-06,
"loss": 0.0488,
"step": 888
},
{
"epoch": 0.3496558505408063,
"grad_norm": 1.4411261081695557,
"learning_rate": 4.110328046248196e-06,
"loss": 0.0704,
"step": 889
},
{
"epoch": 0.3500491642084562,
"grad_norm": 1.3707761764526367,
"learning_rate": 4.107699667909999e-06,
"loss": 0.0514,
"step": 890
},
{
"epoch": 0.3504424778761062,
"grad_norm": 1.438081979751587,
"learning_rate": 4.105068255874328e-06,
"loss": 0.0622,
"step": 891
},
{
"epoch": 0.35083579154375616,
"grad_norm": 1.0999984741210938,
"learning_rate": 4.102433815106606e-06,
"loss": 0.0423,
"step": 892
},
{
"epoch": 0.3512291052114061,
"grad_norm": 1.6553218364715576,
"learning_rate": 4.09979635057797e-06,
"loss": 0.0621,
"step": 893
},
{
"epoch": 0.35162241887905604,
"grad_norm": 2.6534736156463623,
"learning_rate": 4.097155867265264e-06,
"loss": 0.0956,
"step": 894
},
{
"epoch": 0.352015732546706,
"grad_norm": 1.2164000272750854,
"learning_rate": 4.094512370151027e-06,
"loss": 0.064,
"step": 895
},
{
"epoch": 0.3524090462143559,
"grad_norm": 1.4759900569915771,
"learning_rate": 4.091865864223487e-06,
"loss": 0.0496,
"step": 896
},
{
"epoch": 0.3528023598820059,
"grad_norm": 1.3511669635772705,
"learning_rate": 4.089216354476545e-06,
"loss": 0.0662,
"step": 897
},
{
"epoch": 0.35319567354965586,
"grad_norm": 1.4343103170394897,
"learning_rate": 4.086563845909779e-06,
"loss": 0.0543,
"step": 898
},
{
"epoch": 0.3535889872173058,
"grad_norm": 0.5085878968238831,
"learning_rate": 4.083908343528415e-06,
"loss": 0.0457,
"step": 899
},
{
"epoch": 0.35398230088495575,
"grad_norm": 0.9629530906677246,
"learning_rate": 4.081249852343336e-06,
"loss": 0.0422,
"step": 900
},
{
"epoch": 0.3543756145526057,
"grad_norm": 1.697277307510376,
"learning_rate": 4.078588377371062e-06,
"loss": 0.0583,
"step": 901
},
{
"epoch": 0.35476892822025563,
"grad_norm": 1.2820713520050049,
"learning_rate": 4.075923923633745e-06,
"loss": 0.0621,
"step": 902
},
{
"epoch": 0.3551622418879056,
"grad_norm": 0.9127804636955261,
"learning_rate": 4.073256496159153e-06,
"loss": 0.0616,
"step": 903
},
{
"epoch": 0.35555555555555557,
"grad_norm": 1.4303189516067505,
"learning_rate": 4.070586099980672e-06,
"loss": 0.0556,
"step": 904
},
{
"epoch": 0.3559488692232055,
"grad_norm": 0.8110685348510742,
"learning_rate": 4.067912740137285e-06,
"loss": 0.0665,
"step": 905
},
{
"epoch": 0.35634218289085545,
"grad_norm": 1.490004062652588,
"learning_rate": 4.06523642167357e-06,
"loss": 0.0771,
"step": 906
},
{
"epoch": 0.3567354965585054,
"grad_norm": 1.763295292854309,
"learning_rate": 4.062557149639688e-06,
"loss": 0.0824,
"step": 907
},
{
"epoch": 0.35712881022615534,
"grad_norm": 2.5675792694091797,
"learning_rate": 4.059874929091369e-06,
"loss": 0.0886,
"step": 908
},
{
"epoch": 0.35752212389380533,
"grad_norm": 1.442456841468811,
"learning_rate": 4.057189765089914e-06,
"loss": 0.0507,
"step": 909
},
{
"epoch": 0.3579154375614553,
"grad_norm": 1.2593395709991455,
"learning_rate": 4.054501662702172e-06,
"loss": 0.0555,
"step": 910
},
{
"epoch": 0.3583087512291052,
"grad_norm": 1.1391284465789795,
"learning_rate": 4.05181062700054e-06,
"loss": 0.058,
"step": 911
},
{
"epoch": 0.35870206489675516,
"grad_norm": 0.7833881378173828,
"learning_rate": 4.049116663062949e-06,
"loss": 0.0588,
"step": 912
},
{
"epoch": 0.3590953785644051,
"grad_norm": 1.7920033931732178,
"learning_rate": 4.046419775972855e-06,
"loss": 0.1015,
"step": 913
},
{
"epoch": 0.35948869223205504,
"grad_norm": 1.4693628549575806,
"learning_rate": 4.043719970819231e-06,
"loss": 0.0734,
"step": 914
},
{
"epoch": 0.35988200589970504,
"grad_norm": 0.9692854285240173,
"learning_rate": 4.041017252696556e-06,
"loss": 0.0537,
"step": 915
},
{
"epoch": 0.360275319567355,
"grad_norm": 0.9593791961669922,
"learning_rate": 4.038311626704806e-06,
"loss": 0.0599,
"step": 916
},
{
"epoch": 0.3606686332350049,
"grad_norm": 1.1619371175765991,
"learning_rate": 4.035603097949444e-06,
"loss": 0.0597,
"step": 917
},
{
"epoch": 0.36106194690265486,
"grad_norm": 1.3384184837341309,
"learning_rate": 4.032891671541409e-06,
"loss": 0.0513,
"step": 918
},
{
"epoch": 0.3614552605703048,
"grad_norm": 0.7744063138961792,
"learning_rate": 4.030177352597109e-06,
"loss": 0.0428,
"step": 919
},
{
"epoch": 0.36184857423795475,
"grad_norm": 1.1778054237365723,
"learning_rate": 4.027460146238411e-06,
"loss": 0.0733,
"step": 920
},
{
"epoch": 0.36224188790560474,
"grad_norm": 1.161788821220398,
"learning_rate": 4.02474005759263e-06,
"loss": 0.0735,
"step": 921
},
{
"epoch": 0.3626352015732547,
"grad_norm": 2.0623209476470947,
"learning_rate": 4.022017091792518e-06,
"loss": 0.065,
"step": 922
},
{
"epoch": 0.3630285152409046,
"grad_norm": 1.3139375448226929,
"learning_rate": 4.01929125397626e-06,
"loss": 0.0582,
"step": 923
},
{
"epoch": 0.36342182890855457,
"grad_norm": 2.0761849880218506,
"learning_rate": 4.016562549287455e-06,
"loss": 0.0557,
"step": 924
},
{
"epoch": 0.3638151425762045,
"grad_norm": 1.474522352218628,
"learning_rate": 4.013830982875117e-06,
"loss": 0.0665,
"step": 925
},
{
"epoch": 0.36420845624385445,
"grad_norm": 1.7274634838104248,
"learning_rate": 4.0110965598936565e-06,
"loss": 0.0735,
"step": 926
},
{
"epoch": 0.36460176991150445,
"grad_norm": 0.7064616084098816,
"learning_rate": 4.008359285502877e-06,
"loss": 0.0449,
"step": 927
},
{
"epoch": 0.3649950835791544,
"grad_norm": 0.8762916922569275,
"learning_rate": 4.005619164867959e-06,
"loss": 0.0582,
"step": 928
},
{
"epoch": 0.36538839724680433,
"grad_norm": 1.2766094207763672,
"learning_rate": 4.002876203159458e-06,
"loss": 0.0467,
"step": 929
},
{
"epoch": 0.36578171091445427,
"grad_norm": 1.4357662200927734,
"learning_rate": 4.000130405553287e-06,
"loss": 0.0676,
"step": 930
},
{
"epoch": 0.3661750245821042,
"grad_norm": 1.755672574043274,
"learning_rate": 3.997381777230714e-06,
"loss": 0.0647,
"step": 931
},
{
"epoch": 0.36656833824975416,
"grad_norm": 0.9483436942100525,
"learning_rate": 3.994630323378344e-06,
"loss": 0.0601,
"step": 932
},
{
"epoch": 0.36696165191740415,
"grad_norm": 1.6659551858901978,
"learning_rate": 3.991876049188116e-06,
"loss": 0.0738,
"step": 933
},
{
"epoch": 0.3673549655850541,
"grad_norm": 1.5737981796264648,
"learning_rate": 3.989118959857293e-06,
"loss": 0.0483,
"step": 934
},
{
"epoch": 0.36774827925270404,
"grad_norm": 1.5014865398406982,
"learning_rate": 3.986359060588446e-06,
"loss": 0.0458,
"step": 935
},
{
"epoch": 0.368141592920354,
"grad_norm": 1.5164520740509033,
"learning_rate": 3.983596356589452e-06,
"loss": 0.0617,
"step": 936
},
{
"epoch": 0.3685349065880039,
"grad_norm": 2.2842421531677246,
"learning_rate": 3.980830853073476e-06,
"loss": 0.0816,
"step": 937
},
{
"epoch": 0.36892822025565386,
"grad_norm": 1.5114701986312866,
"learning_rate": 3.978062555258972e-06,
"loss": 0.0355,
"step": 938
},
{
"epoch": 0.36932153392330386,
"grad_norm": 1.2816709280014038,
"learning_rate": 3.975291468369661e-06,
"loss": 0.0556,
"step": 939
},
{
"epoch": 0.3697148475909538,
"grad_norm": 2.0237350463867188,
"learning_rate": 3.97251759763453e-06,
"loss": 0.0622,
"step": 940
},
{
"epoch": 0.37010816125860374,
"grad_norm": 1.3120791912078857,
"learning_rate": 3.969740948287817e-06,
"loss": 0.0414,
"step": 941
},
{
"epoch": 0.3705014749262537,
"grad_norm": 1.3838061094284058,
"learning_rate": 3.966961525569005e-06,
"loss": 0.0653,
"step": 942
},
{
"epoch": 0.3708947885939036,
"grad_norm": 0.6813984513282776,
"learning_rate": 3.964179334722811e-06,
"loss": 0.0345,
"step": 943
},
{
"epoch": 0.37128810226155357,
"grad_norm": 0.8976694345474243,
"learning_rate": 3.961394380999173e-06,
"loss": 0.0314,
"step": 944
},
{
"epoch": 0.37168141592920356,
"grad_norm": 0.9033572673797607,
"learning_rate": 3.958606669653243e-06,
"loss": 0.0542,
"step": 945
},
{
"epoch": 0.3720747295968535,
"grad_norm": 0.901779055595398,
"learning_rate": 3.955816205945378e-06,
"loss": 0.0359,
"step": 946
},
{
"epoch": 0.37246804326450345,
"grad_norm": 2.198181390762329,
"learning_rate": 3.953022995141128e-06,
"loss": 0.0473,
"step": 947
},
{
"epoch": 0.3728613569321534,
"grad_norm": 1.4871481657028198,
"learning_rate": 3.950227042511226e-06,
"loss": 0.0888,
"step": 948
},
{
"epoch": 0.37325467059980333,
"grad_norm": 1.3157522678375244,
"learning_rate": 3.947428353331579e-06,
"loss": 0.041,
"step": 949
},
{
"epoch": 0.37364798426745327,
"grad_norm": 1.431186318397522,
"learning_rate": 3.94462693288326e-06,
"loss": 0.0799,
"step": 950
},
{
"epoch": 0.37404129793510327,
"grad_norm": 1.389054775238037,
"learning_rate": 3.941822786452491e-06,
"loss": 0.0457,
"step": 951
},
{
"epoch": 0.3744346116027532,
"grad_norm": 1.6102625131607056,
"learning_rate": 3.939015919330643e-06,
"loss": 0.0926,
"step": 952
},
{
"epoch": 0.37482792527040315,
"grad_norm": 0.8472495675086975,
"learning_rate": 3.936206336814219e-06,
"loss": 0.0408,
"step": 953
},
{
"epoch": 0.3752212389380531,
"grad_norm": 0.8631911873817444,
"learning_rate": 3.933394044204843e-06,
"loss": 0.0405,
"step": 954
},
{
"epoch": 0.37561455260570303,
"grad_norm": 5.559257507324219,
"learning_rate": 3.930579046809259e-06,
"loss": 0.048,
"step": 955
},
{
"epoch": 0.376007866273353,
"grad_norm": 1.6139276027679443,
"learning_rate": 3.92776134993931e-06,
"loss": 0.0596,
"step": 956
},
{
"epoch": 0.376401179941003,
"grad_norm": 1.7035290002822876,
"learning_rate": 3.924940958911933e-06,
"loss": 0.061,
"step": 957
},
{
"epoch": 0.3767944936086529,
"grad_norm": 0.8409842848777771,
"learning_rate": 3.922117879049152e-06,
"loss": 0.0416,
"step": 958
},
{
"epoch": 0.37718780727630286,
"grad_norm": 1.9367414712905884,
"learning_rate": 3.91929211567806e-06,
"loss": 0.0617,
"step": 959
},
{
"epoch": 0.3775811209439528,
"grad_norm": 1.0128939151763916,
"learning_rate": 3.916463674130821e-06,
"loss": 0.0477,
"step": 960
},
{
"epoch": 0.37797443461160274,
"grad_norm": 1.9125791788101196,
"learning_rate": 3.913632559744645e-06,
"loss": 0.0571,
"step": 961
},
{
"epoch": 0.3783677482792527,
"grad_norm": 1.4633182287216187,
"learning_rate": 3.910798777861788e-06,
"loss": 0.0511,
"step": 962
},
{
"epoch": 0.3787610619469027,
"grad_norm": 0.9891822934150696,
"learning_rate": 3.9079623338295436e-06,
"loss": 0.0485,
"step": 963
},
{
"epoch": 0.3791543756145526,
"grad_norm": 1.2277315855026245,
"learning_rate": 3.9051232330002245e-06,
"loss": 0.0449,
"step": 964
},
{
"epoch": 0.37954768928220256,
"grad_norm": 0.49736377596855164,
"learning_rate": 3.902281480731156e-06,
"loss": 0.0213,
"step": 965
},
{
"epoch": 0.3799410029498525,
"grad_norm": 0.982218861579895,
"learning_rate": 3.899437082384671e-06,
"loss": 0.0581,
"step": 966
},
{
"epoch": 0.38033431661750244,
"grad_norm": 0.8971213102340698,
"learning_rate": 3.89659004332809e-06,
"loss": 0.0458,
"step": 967
},
{
"epoch": 0.3807276302851524,
"grad_norm": 0.4127979874610901,
"learning_rate": 3.893740368933722e-06,
"loss": 0.0313,
"step": 968
},
{
"epoch": 0.3811209439528024,
"grad_norm": 2.5857155323028564,
"learning_rate": 3.8908880645788464e-06,
"loss": 0.0711,
"step": 969
},
{
"epoch": 0.3815142576204523,
"grad_norm": 1.2110406160354614,
"learning_rate": 3.888033135645702e-06,
"loss": 0.0508,
"step": 970
},
{
"epoch": 0.38190757128810227,
"grad_norm": 1.58492112159729,
"learning_rate": 3.885175587521486e-06,
"loss": 0.0662,
"step": 971
},
{
"epoch": 0.3823008849557522,
"grad_norm": 0.8792701363563538,
"learning_rate": 3.882315425598334e-06,
"loss": 0.0767,
"step": 972
},
{
"epoch": 0.38269419862340215,
"grad_norm": 1.797515869140625,
"learning_rate": 3.879452655273316e-06,
"loss": 0.0585,
"step": 973
},
{
"epoch": 0.3830875122910521,
"grad_norm": 1.6386829614639282,
"learning_rate": 3.876587281948422e-06,
"loss": 0.08,
"step": 974
},
{
"epoch": 0.3834808259587021,
"grad_norm": 1.1229251623153687,
"learning_rate": 3.873719311030556e-06,
"loss": 0.0585,
"step": 975
},
{
"epoch": 0.38387413962635203,
"grad_norm": 1.2260591983795166,
"learning_rate": 3.8708487479315204e-06,
"loss": 0.0647,
"step": 976
},
{
"epoch": 0.38426745329400197,
"grad_norm": 1.565321683883667,
"learning_rate": 3.867975598068012e-06,
"loss": 0.067,
"step": 977
},
{
"epoch": 0.3846607669616519,
"grad_norm": 1.4004123210906982,
"learning_rate": 3.8650998668616085e-06,
"loss": 0.0765,
"step": 978
},
{
"epoch": 0.38505408062930185,
"grad_norm": 1.5652803182601929,
"learning_rate": 3.862221559738757e-06,
"loss": 0.0672,
"step": 979
},
{
"epoch": 0.3854473942969518,
"grad_norm": 4.284322738647461,
"learning_rate": 3.859340682130766e-06,
"loss": 0.0692,
"step": 980
},
{
"epoch": 0.3858407079646018,
"grad_norm": 1.21330988407135,
"learning_rate": 3.856457239473795e-06,
"loss": 0.0828,
"step": 981
},
{
"epoch": 0.38623402163225173,
"grad_norm": 2.4526336193084717,
"learning_rate": 3.853571237208843e-06,
"loss": 0.0694,
"step": 982
},
{
"epoch": 0.3866273352999017,
"grad_norm": 1.0117402076721191,
"learning_rate": 3.8506826807817395e-06,
"loss": 0.0362,
"step": 983
},
{
"epoch": 0.3870206489675516,
"grad_norm": 1.1363615989685059,
"learning_rate": 3.847791575643134e-06,
"loss": 0.0543,
"step": 984
},
{
"epoch": 0.38741396263520156,
"grad_norm": 1.1766973733901978,
"learning_rate": 3.844897927248483e-06,
"loss": 0.0488,
"step": 985
},
{
"epoch": 0.3878072763028515,
"grad_norm": 0.8534460067749023,
"learning_rate": 3.842001741058045e-06,
"loss": 0.0603,
"step": 986
},
{
"epoch": 0.3882005899705015,
"grad_norm": 1.5655368566513062,
"learning_rate": 3.839103022536865e-06,
"loss": 0.0713,
"step": 987
},
{
"epoch": 0.38859390363815144,
"grad_norm": 0.6574957966804504,
"learning_rate": 3.836201777154769e-06,
"loss": 0.0583,
"step": 988
},
{
"epoch": 0.3889872173058014,
"grad_norm": 0.8077657222747803,
"learning_rate": 3.833298010386347e-06,
"loss": 0.05,
"step": 989
},
{
"epoch": 0.3893805309734513,
"grad_norm": 1.513853669166565,
"learning_rate": 3.830391727710954e-06,
"loss": 0.0502,
"step": 990
},
{
"epoch": 0.38977384464110126,
"grad_norm": 2.019428253173828,
"learning_rate": 3.827482934612684e-06,
"loss": 0.0557,
"step": 991
},
{
"epoch": 0.3901671583087512,
"grad_norm": 1.0257922410964966,
"learning_rate": 3.824571636580372e-06,
"loss": 0.0625,
"step": 992
},
{
"epoch": 0.3905604719764012,
"grad_norm": 0.5803849697113037,
"learning_rate": 3.821657839107583e-06,
"loss": 0.0442,
"step": 993
},
{
"epoch": 0.39095378564405114,
"grad_norm": 0.8499471545219421,
"learning_rate": 3.818741547692593e-06,
"loss": 0.0342,
"step": 994
},
{
"epoch": 0.3913470993117011,
"grad_norm": 0.4951908588409424,
"learning_rate": 3.815822767838386e-06,
"loss": 0.0343,
"step": 995
},
{
"epoch": 0.391740412979351,
"grad_norm": 1.5221655368804932,
"learning_rate": 3.812901505052642e-06,
"loss": 0.0465,
"step": 996
},
{
"epoch": 0.39213372664700097,
"grad_norm": 1.7891956567764282,
"learning_rate": 3.8099777648477264e-06,
"loss": 0.0821,
"step": 997
},
{
"epoch": 0.3925270403146509,
"grad_norm": 0.8419029116630554,
"learning_rate": 3.8070515527406803e-06,
"loss": 0.0546,
"step": 998
},
{
"epoch": 0.3929203539823009,
"grad_norm": 0.9236086010932922,
"learning_rate": 3.8041228742532064e-06,
"loss": 0.0423,
"step": 999
},
{
"epoch": 0.39331366764995085,
"grad_norm": 1.0892646312713623,
"learning_rate": 3.8011917349116633e-06,
"loss": 0.0531,
"step": 1000
},
{
"epoch": 0.3937069813176008,
"grad_norm": 1.6544411182403564,
"learning_rate": 3.7982581402470536e-06,
"loss": 0.0404,
"step": 1001
},
{
"epoch": 0.39410029498525073,
"grad_norm": 1.8338655233383179,
"learning_rate": 3.795322095795012e-06,
"loss": 0.0535,
"step": 1002
},
{
"epoch": 0.3944936086529007,
"grad_norm": 1.4561970233917236,
"learning_rate": 3.7923836070957963e-06,
"loss": 0.0506,
"step": 1003
},
{
"epoch": 0.3948869223205506,
"grad_norm": 1.1206718683242798,
"learning_rate": 3.7894426796942773e-06,
"loss": 0.07,
"step": 1004
},
{
"epoch": 0.3952802359882006,
"grad_norm": 1.5864077806472778,
"learning_rate": 3.786499319139926e-06,
"loss": 0.0511,
"step": 1005
},
{
"epoch": 0.39567354965585055,
"grad_norm": 1.6479477882385254,
"learning_rate": 3.7835535309868055e-06,
"loss": 0.1065,
"step": 1006
},
{
"epoch": 0.3960668633235005,
"grad_norm": 1.173240303993225,
"learning_rate": 3.78060532079356e-06,
"loss": 0.0366,
"step": 1007
},
{
"epoch": 0.39646017699115044,
"grad_norm": 1.512009859085083,
"learning_rate": 3.777654694123404e-06,
"loss": 0.0333,
"step": 1008
},
{
"epoch": 0.3968534906588004,
"grad_norm": 0.7629926800727844,
"learning_rate": 3.7747016565441112e-06,
"loss": 0.0293,
"step": 1009
},
{
"epoch": 0.3972468043264503,
"grad_norm": 1.325535774230957,
"learning_rate": 3.771746213628006e-06,
"loss": 0.0494,
"step": 1010
},
{
"epoch": 0.3976401179941003,
"grad_norm": 0.9456796050071716,
"learning_rate": 3.7687883709519496e-06,
"loss": 0.0347,
"step": 1011
},
{
"epoch": 0.39803343166175026,
"grad_norm": 1.6305729150772095,
"learning_rate": 3.7658281340973336e-06,
"loss": 0.0782,
"step": 1012
},
{
"epoch": 0.3984267453294002,
"grad_norm": 2.3638815879821777,
"learning_rate": 3.7628655086500654e-06,
"loss": 0.0746,
"step": 1013
},
{
"epoch": 0.39882005899705014,
"grad_norm": 1.1770771741867065,
"learning_rate": 3.7599005002005616e-06,
"loss": 0.0436,
"step": 1014
},
{
"epoch": 0.3992133726647001,
"grad_norm": 1.2992199659347534,
"learning_rate": 3.7569331143437336e-06,
"loss": 0.0565,
"step": 1015
},
{
"epoch": 0.39960668633235,
"grad_norm": 1.2094827890396118,
"learning_rate": 3.7539633566789812e-06,
"loss": 0.0536,
"step": 1016
},
{
"epoch": 0.4,
"grad_norm": 1.641381859779358,
"learning_rate": 3.750991232810177e-06,
"loss": 0.0373,
"step": 1017
},
{
"epoch": 0.40039331366764996,
"grad_norm": 0.7891103029251099,
"learning_rate": 3.7480167483456603e-06,
"loss": 0.0632,
"step": 1018
},
{
"epoch": 0.4007866273352999,
"grad_norm": 0.7216825485229492,
"learning_rate": 3.7450399088982247e-06,
"loss": 0.0513,
"step": 1019
},
{
"epoch": 0.40117994100294985,
"grad_norm": 0.7158090472221375,
"learning_rate": 3.742060720085107e-06,
"loss": 0.0456,
"step": 1020
},
{
"epoch": 0.4015732546705998,
"grad_norm": 0.58232182264328,
"learning_rate": 3.739079187527978e-06,
"loss": 0.027,
"step": 1021
},
{
"epoch": 0.40196656833824973,
"grad_norm": 1.546899437904358,
"learning_rate": 3.73609531685293e-06,
"loss": 0.1034,
"step": 1022
},
{
"epoch": 0.4023598820058997,
"grad_norm": 1.1753488779067993,
"learning_rate": 3.733109113690469e-06,
"loss": 0.0609,
"step": 1023
},
{
"epoch": 0.40275319567354967,
"grad_norm": 1.5217546224594116,
"learning_rate": 3.7301205836755006e-06,
"loss": 0.0853,
"step": 1024
},
{
"epoch": 0.4031465093411996,
"grad_norm": 0.9366397857666016,
"learning_rate": 3.727129732447322e-06,
"loss": 0.0511,
"step": 1025
},
{
"epoch": 0.40353982300884955,
"grad_norm": 0.8296689391136169,
"learning_rate": 3.7241365656496103e-06,
"loss": 0.0336,
"step": 1026
},
{
"epoch": 0.4039331366764995,
"grad_norm": 0.8638429641723633,
"learning_rate": 3.7211410889304117e-06,
"loss": 0.0675,
"step": 1027
},
{
"epoch": 0.40432645034414944,
"grad_norm": 0.6674923896789551,
"learning_rate": 3.7181433079421316e-06,
"loss": 0.0299,
"step": 1028
},
{
"epoch": 0.40471976401179943,
"grad_norm": 1.5683988332748413,
"learning_rate": 3.7151432283415244e-06,
"loss": 0.0814,
"step": 1029
},
{
"epoch": 0.4051130776794494,
"grad_norm": 0.6941884756088257,
"learning_rate": 3.712140855789679e-06,
"loss": 0.0428,
"step": 1030
},
{
"epoch": 0.4055063913470993,
"grad_norm": 0.8299364447593689,
"learning_rate": 3.709136195952015e-06,
"loss": 0.0534,
"step": 1031
},
{
"epoch": 0.40589970501474926,
"grad_norm": 1.065128207206726,
"learning_rate": 3.706129254498266e-06,
"loss": 0.0527,
"step": 1032
},
{
"epoch": 0.4062930186823992,
"grad_norm": 1.3388938903808594,
"learning_rate": 3.703120037102469e-06,
"loss": 0.0619,
"step": 1033
},
{
"epoch": 0.40668633235004914,
"grad_norm": 1.6854989528656006,
"learning_rate": 3.7001085494429596e-06,
"loss": 0.0605,
"step": 1034
},
{
"epoch": 0.40707964601769914,
"grad_norm": 1.7878034114837646,
"learning_rate": 3.697094797202355e-06,
"loss": 0.0644,
"step": 1035
},
{
"epoch": 0.4074729596853491,
"grad_norm": 0.7512350082397461,
"learning_rate": 3.694078786067546e-06,
"loss": 0.0561,
"step": 1036
},
{
"epoch": 0.407866273352999,
"grad_norm": 0.5946680307388306,
"learning_rate": 3.691060521729686e-06,
"loss": 0.032,
"step": 1037
},
{
"epoch": 0.40825958702064896,
"grad_norm": 0.7464413642883301,
"learning_rate": 3.6880400098841794e-06,
"loss": 0.0581,
"step": 1038
},
{
"epoch": 0.4086529006882989,
"grad_norm": 1.3339935541152954,
"learning_rate": 3.6850172562306735e-06,
"loss": 0.065,
"step": 1039
},
{
"epoch": 0.40904621435594885,
"grad_norm": 1.2734817266464233,
"learning_rate": 3.681992266473044e-06,
"loss": 0.0302,
"step": 1040
},
{
"epoch": 0.40943952802359884,
"grad_norm": 1.6477503776550293,
"learning_rate": 3.6789650463193864e-06,
"loss": 0.0454,
"step": 1041
},
{
"epoch": 0.4098328416912488,
"grad_norm": 1.9478659629821777,
"learning_rate": 3.675935601482006e-06,
"loss": 0.0906,
"step": 1042
},
{
"epoch": 0.4102261553588987,
"grad_norm": 1.2177263498306274,
"learning_rate": 3.6729039376774055e-06,
"loss": 0.0708,
"step": 1043
},
{
"epoch": 0.41061946902654867,
"grad_norm": 1.3361903429031372,
"learning_rate": 3.6698700606262733e-06,
"loss": 0.0542,
"step": 1044
},
{
"epoch": 0.4110127826941986,
"grad_norm": 0.7786129117012024,
"learning_rate": 3.6668339760534768e-06,
"loss": 0.0666,
"step": 1045
},
{
"epoch": 0.41140609636184855,
"grad_norm": 0.4651035964488983,
"learning_rate": 3.6637956896880465e-06,
"loss": 0.0442,
"step": 1046
},
{
"epoch": 0.41179941002949855,
"grad_norm": 0.28553763031959534,
"learning_rate": 3.6607552072631685e-06,
"loss": 0.0266,
"step": 1047
},
{
"epoch": 0.4121927236971485,
"grad_norm": 1.054947018623352,
"learning_rate": 3.6577125345161748e-06,
"loss": 0.0533,
"step": 1048
},
{
"epoch": 0.41258603736479843,
"grad_norm": 0.6713748574256897,
"learning_rate": 3.6546676771885257e-06,
"loss": 0.0347,
"step": 1049
},
{
"epoch": 0.41297935103244837,
"grad_norm": 1.4435083866119385,
"learning_rate": 3.6516206410258092e-06,
"loss": 0.0384,
"step": 1050
},
{
"epoch": 0.4133726647000983,
"grad_norm": 1.4494538307189941,
"learning_rate": 3.6485714317777223e-06,
"loss": 0.068,
"step": 1051
},
{
"epoch": 0.41376597836774826,
"grad_norm": 1.666913390159607,
"learning_rate": 3.6455200551980605e-06,
"loss": 0.0685,
"step": 1052
},
{
"epoch": 0.41415929203539825,
"grad_norm": 2.99609375,
"learning_rate": 3.642466517044713e-06,
"loss": 0.1213,
"step": 1053
},
{
"epoch": 0.4145526057030482,
"grad_norm": 1.6199326515197754,
"learning_rate": 3.6394108230796455e-06,
"loss": 0.0557,
"step": 1054
},
{
"epoch": 0.41494591937069814,
"grad_norm": 0.6611631512641907,
"learning_rate": 3.636352979068891e-06,
"loss": 0.0333,
"step": 1055
},
{
"epoch": 0.4153392330383481,
"grad_norm": 0.8349502086639404,
"learning_rate": 3.6332929907825426e-06,
"loss": 0.0285,
"step": 1056
},
{
"epoch": 0.415732546705998,
"grad_norm": 1.6354492902755737,
"learning_rate": 3.630230863994736e-06,
"loss": 0.0808,
"step": 1057
},
{
"epoch": 0.41612586037364796,
"grad_norm": 0.8214701414108276,
"learning_rate": 3.6271666044836433e-06,
"loss": 0.0355,
"step": 1058
},
{
"epoch": 0.41651917404129796,
"grad_norm": 1.321581244468689,
"learning_rate": 3.624100218031464e-06,
"loss": 0.0444,
"step": 1059
},
{
"epoch": 0.4169124877089479,
"grad_norm": 0.7428562641143799,
"learning_rate": 3.621031710424407e-06,
"loss": 0.0259,
"step": 1060
},
{
"epoch": 0.41730580137659784,
"grad_norm": 0.7929845452308655,
"learning_rate": 3.6179610874526856e-06,
"loss": 0.0345,
"step": 1061
},
{
"epoch": 0.4176991150442478,
"grad_norm": 0.6758319139480591,
"learning_rate": 3.614888354910505e-06,
"loss": 0.037,
"step": 1062
},
{
"epoch": 0.4180924287118977,
"grad_norm": 1.5147916078567505,
"learning_rate": 3.6118135185960507e-06,
"loss": 0.0855,
"step": 1063
},
{
"epoch": 0.41848574237954766,
"grad_norm": 1.0528610944747925,
"learning_rate": 3.6087365843114773e-06,
"loss": 0.0324,
"step": 1064
},
{
"epoch": 0.41887905604719766,
"grad_norm": 1.3274002075195312,
"learning_rate": 3.6056575578629006e-06,
"loss": 0.0475,
"step": 1065
},
{
"epoch": 0.4192723697148476,
"grad_norm": 0.5520153641700745,
"learning_rate": 3.6025764450603808e-06,
"loss": 0.022,
"step": 1066
},
{
"epoch": 0.41966568338249755,
"grad_norm": 1.81023371219635,
"learning_rate": 3.5994932517179182e-06,
"loss": 0.043,
"step": 1067
},
{
"epoch": 0.4200589970501475,
"grad_norm": 1.3602193593978882,
"learning_rate": 3.596407983653436e-06,
"loss": 0.073,
"step": 1068
},
{
"epoch": 0.42045231071779743,
"grad_norm": 1.921582579612732,
"learning_rate": 3.5933206466887755e-06,
"loss": 0.0759,
"step": 1069
},
{
"epoch": 0.42084562438544737,
"grad_norm": 0.8578033447265625,
"learning_rate": 3.59023124664968e-06,
"loss": 0.0249,
"step": 1070
},
{
"epoch": 0.42123893805309737,
"grad_norm": 1.7219325304031372,
"learning_rate": 3.5871397893657867e-06,
"loss": 0.0596,
"step": 1071
},
{
"epoch": 0.4216322517207473,
"grad_norm": 0.9463638663291931,
"learning_rate": 3.5840462806706126e-06,
"loss": 0.0454,
"step": 1072
},
{
"epoch": 0.42202556538839725,
"grad_norm": 1.9718307256698608,
"learning_rate": 3.5809507264015502e-06,
"loss": 0.0623,
"step": 1073
},
{
"epoch": 0.4224188790560472,
"grad_norm": 2.0382165908813477,
"learning_rate": 3.5778531323998465e-06,
"loss": 0.0497,
"step": 1074
},
{
"epoch": 0.42281219272369713,
"grad_norm": 1.496324062347412,
"learning_rate": 3.574753504510602e-06,
"loss": 0.0826,
"step": 1075
},
{
"epoch": 0.4232055063913471,
"grad_norm": 0.49463126063346863,
"learning_rate": 3.571651848582753e-06,
"loss": 0.0415,
"step": 1076
},
{
"epoch": 0.42359882005899707,
"grad_norm": 1.1558905839920044,
"learning_rate": 3.5685481704690617e-06,
"loss": 0.0473,
"step": 1077
},
{
"epoch": 0.423992133726647,
"grad_norm": 3.914982795715332,
"learning_rate": 3.5654424760261082e-06,
"loss": 0.0853,
"step": 1078
},
{
"epoch": 0.42438544739429696,
"grad_norm": 1.7288295030593872,
"learning_rate": 3.5623347711142764e-06,
"loss": 0.0817,
"step": 1079
},
{
"epoch": 0.4247787610619469,
"grad_norm": 1.0033987760543823,
"learning_rate": 3.5592250615977434e-06,
"loss": 0.0552,
"step": 1080
},
{
"epoch": 0.42517207472959684,
"grad_norm": 1.461305856704712,
"learning_rate": 3.5561133533444703e-06,
"loss": 0.0659,
"step": 1081
},
{
"epoch": 0.4255653883972468,
"grad_norm": 0.7007796168327332,
"learning_rate": 3.552999652226189e-06,
"loss": 0.0332,
"step": 1082
},
{
"epoch": 0.4259587020648968,
"grad_norm": 0.7041943073272705,
"learning_rate": 3.549883964118392e-06,
"loss": 0.0205,
"step": 1083
},
{
"epoch": 0.4263520157325467,
"grad_norm": 1.5797779560089111,
"learning_rate": 3.54676629490032e-06,
"loss": 0.0564,
"step": 1084
},
{
"epoch": 0.42674532940019666,
"grad_norm": 1.4408408403396606,
"learning_rate": 3.543646650454955e-06,
"loss": 0.0347,
"step": 1085
},
{
"epoch": 0.4271386430678466,
"grad_norm": 0.709080159664154,
"learning_rate": 3.5405250366690023e-06,
"loss": 0.0259,
"step": 1086
},
{
"epoch": 0.42753195673549654,
"grad_norm": 1.4579590559005737,
"learning_rate": 3.5374014594328877e-06,
"loss": 0.0712,
"step": 1087
},
{
"epoch": 0.4279252704031465,
"grad_norm": 0.9378184676170349,
"learning_rate": 3.5342759246407378e-06,
"loss": 0.0583,
"step": 1088
},
{
"epoch": 0.4283185840707965,
"grad_norm": 0.9149574041366577,
"learning_rate": 3.5311484381903754e-06,
"loss": 0.0594,
"step": 1089
},
{
"epoch": 0.4287118977384464,
"grad_norm": 1.2301528453826904,
"learning_rate": 3.528019005983306e-06,
"loss": 0.0603,
"step": 1090
},
{
"epoch": 0.42910521140609637,
"grad_norm": 1.222373127937317,
"learning_rate": 3.5248876339247053e-06,
"loss": 0.0331,
"step": 1091
},
{
"epoch": 0.4294985250737463,
"grad_norm": 1.5141066312789917,
"learning_rate": 3.521754327923412e-06,
"loss": 0.0662,
"step": 1092
},
{
"epoch": 0.42989183874139625,
"grad_norm": 1.581040620803833,
"learning_rate": 3.5186190938919106e-06,
"loss": 0.0634,
"step": 1093
},
{
"epoch": 0.4302851524090462,
"grad_norm": 1.1250847578048706,
"learning_rate": 3.515481937746327e-06,
"loss": 0.0428,
"step": 1094
},
{
"epoch": 0.4306784660766962,
"grad_norm": 1.6886603832244873,
"learning_rate": 3.5123428654064134e-06,
"loss": 0.043,
"step": 1095
},
{
"epoch": 0.43107177974434613,
"grad_norm": 2.050182819366455,
"learning_rate": 3.509201882795536e-06,
"loss": 0.1201,
"step": 1096
},
{
"epoch": 0.43146509341199607,
"grad_norm": 1.2001996040344238,
"learning_rate": 3.5060589958406677e-06,
"loss": 0.0453,
"step": 1097
},
{
"epoch": 0.431858407079646,
"grad_norm": 1.0683172941207886,
"learning_rate": 3.5029142104723725e-06,
"loss": 0.0331,
"step": 1098
},
{
"epoch": 0.43225172074729595,
"grad_norm": 2.0737650394439697,
"learning_rate": 3.4997675326247993e-06,
"loss": 0.0526,
"step": 1099
},
{
"epoch": 0.4326450344149459,
"grad_norm": 0.8983532190322876,
"learning_rate": 3.4966189682356677e-06,
"loss": 0.0532,
"step": 1100
},
{
"epoch": 0.4330383480825959,
"grad_norm": 1.8358802795410156,
"learning_rate": 3.493468523246255e-06,
"loss": 0.0598,
"step": 1101
},
{
"epoch": 0.43343166175024583,
"grad_norm": 2.076266050338745,
"learning_rate": 3.4903162036013894e-06,
"loss": 0.0836,
"step": 1102
},
{
"epoch": 0.4338249754178958,
"grad_norm": 2.4419870376586914,
"learning_rate": 3.487162015249436e-06,
"loss": 0.0758,
"step": 1103
},
{
"epoch": 0.4342182890855457,
"grad_norm": 1.3942052125930786,
"learning_rate": 3.484005964142285e-06,
"loss": 0.0803,
"step": 1104
},
{
"epoch": 0.43461160275319566,
"grad_norm": 1.3950960636138916,
"learning_rate": 3.4808480562353426e-06,
"loss": 0.0675,
"step": 1105
},
{
"epoch": 0.4350049164208456,
"grad_norm": 1.5000733137130737,
"learning_rate": 3.477688297487519e-06,
"loss": 0.0448,
"step": 1106
},
{
"epoch": 0.4353982300884956,
"grad_norm": 1.5005849599838257,
"learning_rate": 3.474526693861216e-06,
"loss": 0.0729,
"step": 1107
},
{
"epoch": 0.43579154375614554,
"grad_norm": 0.6299577951431274,
"learning_rate": 3.4713632513223178e-06,
"loss": 0.039,
"step": 1108
},
{
"epoch": 0.4361848574237955,
"grad_norm": 0.8964212536811829,
"learning_rate": 3.4681979758401767e-06,
"loss": 0.0521,
"step": 1109
},
{
"epoch": 0.4365781710914454,
"grad_norm": 1.3757152557373047,
"learning_rate": 3.465030873387606e-06,
"loss": 0.0598,
"step": 1110
},
{
"epoch": 0.43697148475909536,
"grad_norm": 0.48663070797920227,
"learning_rate": 3.461861949940865e-06,
"loss": 0.0442,
"step": 1111
},
{
"epoch": 0.4373647984267453,
"grad_norm": 0.8878856897354126,
"learning_rate": 3.458691211479649e-06,
"loss": 0.023,
"step": 1112
},
{
"epoch": 0.4377581120943953,
"grad_norm": 1.1162179708480835,
"learning_rate": 3.4555186639870795e-06,
"loss": 0.0493,
"step": 1113
},
{
"epoch": 0.43815142576204524,
"grad_norm": 1.1180258989334106,
"learning_rate": 3.4523443134496916e-06,
"loss": 0.0577,
"step": 1114
},
{
"epoch": 0.4385447394296952,
"grad_norm": 0.6240465641021729,
"learning_rate": 3.4491681658574205e-06,
"loss": 0.0295,
"step": 1115
},
{
"epoch": 0.4389380530973451,
"grad_norm": 2.439685106277466,
"learning_rate": 3.445990227203594e-06,
"loss": 0.0676,
"step": 1116
},
{
"epoch": 0.43933136676499507,
"grad_norm": 1.1544771194458008,
"learning_rate": 3.442810503484921e-06,
"loss": 0.0487,
"step": 1117
},
{
"epoch": 0.439724680432645,
"grad_norm": 1.794083833694458,
"learning_rate": 3.4396290007014752e-06,
"loss": 0.043,
"step": 1118
},
{
"epoch": 0.440117994100295,
"grad_norm": 0.8073402643203735,
"learning_rate": 3.4364457248566913e-06,
"loss": 0.0404,
"step": 1119
},
{
"epoch": 0.44051130776794495,
"grad_norm": 0.4391036331653595,
"learning_rate": 3.433260681957346e-06,
"loss": 0.0394,
"step": 1120
},
{
"epoch": 0.4409046214355949,
"grad_norm": 1.0611299276351929,
"learning_rate": 3.430073878013554e-06,
"loss": 0.0263,
"step": 1121
},
{
"epoch": 0.44129793510324483,
"grad_norm": 0.48767581582069397,
"learning_rate": 3.4268853190387496e-06,
"loss": 0.0341,
"step": 1122
},
{
"epoch": 0.4416912487708948,
"grad_norm": 0.6423639059066772,
"learning_rate": 3.423695011049683e-06,
"loss": 0.0234,
"step": 1123
},
{
"epoch": 0.4420845624385447,
"grad_norm": 1.0390664339065552,
"learning_rate": 3.4205029600663996e-06,
"loss": 0.0593,
"step": 1124
},
{
"epoch": 0.4424778761061947,
"grad_norm": 1.2516858577728271,
"learning_rate": 3.4173091721122375e-06,
"loss": 0.0375,
"step": 1125
},
{
"epoch": 0.44287118977384465,
"grad_norm": 1.670310139656067,
"learning_rate": 3.414113653213812e-06,
"loss": 0.0504,
"step": 1126
},
{
"epoch": 0.4432645034414946,
"grad_norm": 2.317314624786377,
"learning_rate": 3.410916409401004e-06,
"loss": 0.0911,
"step": 1127
},
{
"epoch": 0.44365781710914454,
"grad_norm": 1.418398141860962,
"learning_rate": 3.407717446706948e-06,
"loss": 0.0439,
"step": 1128
},
{
"epoch": 0.4440511307767945,
"grad_norm": 1.1104565858840942,
"learning_rate": 3.4045167711680244e-06,
"loss": 0.0485,
"step": 1129
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.8792333602905273,
"learning_rate": 3.4013143888238455e-06,
"loss": 0.064,
"step": 1130
},
{
"epoch": 0.4448377581120944,
"grad_norm": 1.7921650409698486,
"learning_rate": 3.398110305717241e-06,
"loss": 0.0495,
"step": 1131
},
{
"epoch": 0.44523107177974436,
"grad_norm": 1.4747095108032227,
"learning_rate": 3.3949045278942545e-06,
"loss": 0.0743,
"step": 1132
},
{
"epoch": 0.4456243854473943,
"grad_norm": 0.6847875118255615,
"learning_rate": 3.3916970614041244e-06,
"loss": 0.0224,
"step": 1133
},
{
"epoch": 0.44601769911504424,
"grad_norm": 0.7522935271263123,
"learning_rate": 3.3884879122992762e-06,
"loss": 0.0334,
"step": 1134
},
{
"epoch": 0.4464110127826942,
"grad_norm": 1.5176104307174683,
"learning_rate": 3.3852770866353125e-06,
"loss": 0.0729,
"step": 1135
},
{
"epoch": 0.4468043264503441,
"grad_norm": 1.188468337059021,
"learning_rate": 3.382064590470996e-06,
"loss": 0.0315,
"step": 1136
},
{
"epoch": 0.4471976401179941,
"grad_norm": 0.5583229660987854,
"learning_rate": 3.378850429868244e-06,
"loss": 0.0292,
"step": 1137
},
{
"epoch": 0.44759095378564406,
"grad_norm": 0.7804880738258362,
"learning_rate": 3.3756346108921145e-06,
"loss": 0.0378,
"step": 1138
},
{
"epoch": 0.447984267453294,
"grad_norm": 1.090079426765442,
"learning_rate": 3.372417139610793e-06,
"loss": 0.0549,
"step": 1139
},
{
"epoch": 0.44837758112094395,
"grad_norm": 1.363856554031372,
"learning_rate": 3.369198022095585e-06,
"loss": 0.0859,
"step": 1140
},
{
"epoch": 0.4487708947885939,
"grad_norm": 1.162818431854248,
"learning_rate": 3.3659772644209023e-06,
"loss": 0.0292,
"step": 1141
},
{
"epoch": 0.44916420845624383,
"grad_norm": 0.8213643431663513,
"learning_rate": 3.36275487266425e-06,
"loss": 0.0435,
"step": 1142
},
{
"epoch": 0.4495575221238938,
"grad_norm": 0.8050291538238525,
"learning_rate": 3.3595308529062176e-06,
"loss": 0.0279,
"step": 1143
},
{
"epoch": 0.44995083579154377,
"grad_norm": 1.1065354347229004,
"learning_rate": 3.3563052112304674e-06,
"loss": 0.0425,
"step": 1144
},
{
"epoch": 0.4503441494591937,
"grad_norm": 0.9072518348693848,
"learning_rate": 3.3530779537237194e-06,
"loss": 0.0315,
"step": 1145
},
{
"epoch": 0.45073746312684365,
"grad_norm": 0.8572150468826294,
"learning_rate": 3.349849086475747e-06,
"loss": 0.0306,
"step": 1146
},
{
"epoch": 0.4511307767944936,
"grad_norm": 1.552173137664795,
"learning_rate": 3.346618615579359e-06,
"loss": 0.0671,
"step": 1147
},
{
"epoch": 0.45152409046214353,
"grad_norm": 0.9978398084640503,
"learning_rate": 3.3433865471303876e-06,
"loss": 0.0667,
"step": 1148
},
{
"epoch": 0.45191740412979353,
"grad_norm": 2.7961080074310303,
"learning_rate": 3.3401528872276847e-06,
"loss": 0.0696,
"step": 1149
},
{
"epoch": 0.4523107177974435,
"grad_norm": 1.520912528038025,
"learning_rate": 3.3369176419731004e-06,
"loss": 0.0722,
"step": 1150
},
{
"epoch": 0.4527040314650934,
"grad_norm": 0.8389769196510315,
"learning_rate": 3.33368081747148e-06,
"loss": 0.0444,
"step": 1151
},
{
"epoch": 0.45309734513274336,
"grad_norm": 2.075424909591675,
"learning_rate": 3.3304424198306464e-06,
"loss": 0.0826,
"step": 1152
},
{
"epoch": 0.4534906588003933,
"grad_norm": 0.7416201829910278,
"learning_rate": 3.3272024551613926e-06,
"loss": 0.0283,
"step": 1153
},
{
"epoch": 0.45388397246804324,
"grad_norm": 1.0457786321640015,
"learning_rate": 3.3239609295774667e-06,
"loss": 0.0418,
"step": 1154
},
{
"epoch": 0.45427728613569324,
"grad_norm": 0.9312077760696411,
"learning_rate": 3.3207178491955656e-06,
"loss": 0.0341,
"step": 1155
},
{
"epoch": 0.4546705998033432,
"grad_norm": 0.9886119365692139,
"learning_rate": 3.3174732201353155e-06,
"loss": 0.0623,
"step": 1156
},
{
"epoch": 0.4550639134709931,
"grad_norm": 1.2970693111419678,
"learning_rate": 3.3142270485192683e-06,
"loss": 0.087,
"step": 1157
},
{
"epoch": 0.45545722713864306,
"grad_norm": 1.273305892944336,
"learning_rate": 3.3109793404728855e-06,
"loss": 0.0654,
"step": 1158
},
{
"epoch": 0.455850540806293,
"grad_norm": 0.8121715188026428,
"learning_rate": 3.3077301021245285e-06,
"loss": 0.0257,
"step": 1159
},
{
"epoch": 0.45624385447394294,
"grad_norm": 1.6593793630599976,
"learning_rate": 3.3044793396054447e-06,
"loss": 0.0679,
"step": 1160
},
{
"epoch": 0.45663716814159294,
"grad_norm": 1.2623846530914307,
"learning_rate": 3.3012270590497596e-06,
"loss": 0.071,
"step": 1161
},
{
"epoch": 0.4570304818092429,
"grad_norm": 0.9096400737762451,
"learning_rate": 3.2979732665944615e-06,
"loss": 0.067,
"step": 1162
},
{
"epoch": 0.4574237954768928,
"grad_norm": 0.9472593069076538,
"learning_rate": 3.2947179683793928e-06,
"loss": 0.0395,
"step": 1163
},
{
"epoch": 0.45781710914454277,
"grad_norm": 0.9576103091239929,
"learning_rate": 3.291461170547237e-06,
"loss": 0.049,
"step": 1164
},
{
"epoch": 0.4582104228121927,
"grad_norm": 0.9918181300163269,
"learning_rate": 3.2882028792435072e-06,
"loss": 0.0318,
"step": 1165
},
{
"epoch": 0.45860373647984265,
"grad_norm": 1.843493938446045,
"learning_rate": 3.2849431006165343e-06,
"loss": 0.0634,
"step": 1166
},
{
"epoch": 0.45899705014749265,
"grad_norm": 0.8672575950622559,
"learning_rate": 3.2816818408174567e-06,
"loss": 0.0826,
"step": 1167
},
{
"epoch": 0.4593903638151426,
"grad_norm": 1.5660734176635742,
"learning_rate": 3.278419106000206e-06,
"loss": 0.0695,
"step": 1168
},
{
"epoch": 0.45978367748279253,
"grad_norm": 1.3234399557113647,
"learning_rate": 3.2751549023214995e-06,
"loss": 0.0381,
"step": 1169
},
{
"epoch": 0.46017699115044247,
"grad_norm": 1.7596269845962524,
"learning_rate": 3.2718892359408245e-06,
"loss": 0.0438,
"step": 1170
},
{
"epoch": 0.4605703048180924,
"grad_norm": 0.6878931522369385,
"learning_rate": 3.2686221130204287e-06,
"loss": 0.0347,
"step": 1171
},
{
"epoch": 0.46096361848574235,
"grad_norm": 1.0857138633728027,
"learning_rate": 3.265353539725309e-06,
"loss": 0.0609,
"step": 1172
},
{
"epoch": 0.46135693215339235,
"grad_norm": 0.777098536491394,
"learning_rate": 3.2620835222231972e-06,
"loss": 0.0597,
"step": 1173
},
{
"epoch": 0.4617502458210423,
"grad_norm": 4.028940677642822,
"learning_rate": 3.2588120666845534e-06,
"loss": 0.0702,
"step": 1174
},
{
"epoch": 0.46214355948869223,
"grad_norm": 1.3609766960144043,
"learning_rate": 3.255539179282548e-06,
"loss": 0.0478,
"step": 1175
},
{
"epoch": 0.4625368731563422,
"grad_norm": 1.3808916807174683,
"learning_rate": 3.2522648661930558e-06,
"loss": 0.0787,
"step": 1176
},
{
"epoch": 0.4629301868239921,
"grad_norm": 1.464201807975769,
"learning_rate": 3.2489891335946413e-06,
"loss": 0.0565,
"step": 1177
},
{
"epoch": 0.46332350049164206,
"grad_norm": 1.4196548461914062,
"learning_rate": 3.245711987668545e-06,
"loss": 0.0747,
"step": 1178
},
{
"epoch": 0.46371681415929206,
"grad_norm": 1.5526188611984253,
"learning_rate": 3.2424334345986787e-06,
"loss": 0.0384,
"step": 1179
},
{
"epoch": 0.464110127826942,
"grad_norm": 1.4707880020141602,
"learning_rate": 3.239153480571605e-06,
"loss": 0.0669,
"step": 1180
},
{
"epoch": 0.46450344149459194,
"grad_norm": 1.5997252464294434,
"learning_rate": 3.2358721317765344e-06,
"loss": 0.063,
"step": 1181
},
{
"epoch": 0.4648967551622419,
"grad_norm": 0.7773184180259705,
"learning_rate": 3.2325893944053066e-06,
"loss": 0.0515,
"step": 1182
},
{
"epoch": 0.4652900688298918,
"grad_norm": 1.1635929346084595,
"learning_rate": 3.2293052746523814e-06,
"loss": 0.0494,
"step": 1183
},
{
"epoch": 0.46568338249754176,
"grad_norm": 0.9854192137718201,
"learning_rate": 3.2260197787148277e-06,
"loss": 0.0559,
"step": 1184
},
{
"epoch": 0.46607669616519176,
"grad_norm": 1.9313583374023438,
"learning_rate": 3.222732912792313e-06,
"loss": 0.0447,
"step": 1185
},
{
"epoch": 0.4664700098328417,
"grad_norm": 2.149656295776367,
"learning_rate": 3.2194446830870865e-06,
"loss": 0.0772,
"step": 1186
},
{
"epoch": 0.46686332350049164,
"grad_norm": 1.784822940826416,
"learning_rate": 3.2161550958039732e-06,
"loss": 0.0746,
"step": 1187
},
{
"epoch": 0.4672566371681416,
"grad_norm": 1.5821526050567627,
"learning_rate": 3.2128641571503594e-06,
"loss": 0.0613,
"step": 1188
},
{
"epoch": 0.46764995083579153,
"grad_norm": 1.6123450994491577,
"learning_rate": 3.2095718733361803e-06,
"loss": 0.0419,
"step": 1189
},
{
"epoch": 0.46804326450344147,
"grad_norm": 1.5458816289901733,
"learning_rate": 3.2062782505739125e-06,
"loss": 0.0854,
"step": 1190
},
{
"epoch": 0.46843657817109147,
"grad_norm": 1.5308221578598022,
"learning_rate": 3.202983295078555e-06,
"loss": 0.063,
"step": 1191
},
{
"epoch": 0.4688298918387414,
"grad_norm": 1.166703224182129,
"learning_rate": 3.199687013067624e-06,
"loss": 0.0759,
"step": 1192
},
{
"epoch": 0.46922320550639135,
"grad_norm": 1.2040659189224243,
"learning_rate": 3.1963894107611395e-06,
"loss": 0.0648,
"step": 1193
},
{
"epoch": 0.4696165191740413,
"grad_norm": 0.8159343004226685,
"learning_rate": 3.1930904943816104e-06,
"loss": 0.0252,
"step": 1194
},
{
"epoch": 0.47000983284169123,
"grad_norm": 0.5714221596717834,
"learning_rate": 3.189790270154028e-06,
"loss": 0.0402,
"step": 1195
},
{
"epoch": 0.4704031465093412,
"grad_norm": 1.1028029918670654,
"learning_rate": 3.186488744305849e-06,
"loss": 0.0358,
"step": 1196
},
{
"epoch": 0.47079646017699117,
"grad_norm": 1.1706167459487915,
"learning_rate": 3.183185923066988e-06,
"loss": 0.0405,
"step": 1197
},
{
"epoch": 0.4711897738446411,
"grad_norm": 2.2323551177978516,
"learning_rate": 3.179881812669804e-06,
"loss": 0.0626,
"step": 1198
},
{
"epoch": 0.47158308751229105,
"grad_norm": 1.4933780431747437,
"learning_rate": 3.1765764193490863e-06,
"loss": 0.0421,
"step": 1199
},
{
"epoch": 0.471976401179941,
"grad_norm": 1.759582281112671,
"learning_rate": 3.173269749342047e-06,
"loss": 0.0386,
"step": 1200
},
{
"epoch": 0.47236971484759094,
"grad_norm": 0.9716536998748779,
"learning_rate": 3.1699618088883094e-06,
"loss": 0.0469,
"step": 1201
},
{
"epoch": 0.4727630285152409,
"grad_norm": 1.4588727951049805,
"learning_rate": 3.1666526042298883e-06,
"loss": 0.062,
"step": 1202
},
{
"epoch": 0.4731563421828909,
"grad_norm": 0.7807295918464661,
"learning_rate": 3.16334214161119e-06,
"loss": 0.0516,
"step": 1203
},
{
"epoch": 0.4735496558505408,
"grad_norm": 0.9360034465789795,
"learning_rate": 3.1600304272789904e-06,
"loss": 0.0413,
"step": 1204
},
{
"epoch": 0.47394296951819076,
"grad_norm": 3.0252861976623535,
"learning_rate": 3.1567174674824303e-06,
"loss": 0.0517,
"step": 1205
},
{
"epoch": 0.4743362831858407,
"grad_norm": 1.2127926349639893,
"learning_rate": 3.1534032684729978e-06,
"loss": 0.0634,
"step": 1206
},
{
"epoch": 0.47472959685349064,
"grad_norm": 1.008239984512329,
"learning_rate": 3.1500878365045217e-06,
"loss": 0.035,
"step": 1207
},
{
"epoch": 0.4751229105211406,
"grad_norm": 0.8630732893943787,
"learning_rate": 3.1467711778331573e-06,
"loss": 0.0432,
"step": 1208
},
{
"epoch": 0.4755162241887906,
"grad_norm": 0.5713632702827454,
"learning_rate": 3.143453298717373e-06,
"loss": 0.0293,
"step": 1209
},
{
"epoch": 0.4759095378564405,
"grad_norm": 1.3503292798995972,
"learning_rate": 3.14013420541794e-06,
"loss": 0.0488,
"step": 1210
},
{
"epoch": 0.47630285152409046,
"grad_norm": 0.6340729594230652,
"learning_rate": 3.1368139041979235e-06,
"loss": 0.0352,
"step": 1211
},
{
"epoch": 0.4766961651917404,
"grad_norm": 2.0643789768218994,
"learning_rate": 3.133492401322666e-06,
"loss": 0.0602,
"step": 1212
},
{
"epoch": 0.47708947885939035,
"grad_norm": 1.456824779510498,
"learning_rate": 3.1301697030597772e-06,
"loss": 0.0576,
"step": 1213
},
{
"epoch": 0.4774827925270403,
"grad_norm": 1.6788169145584106,
"learning_rate": 3.126845815679123e-06,
"loss": 0.0473,
"step": 1214
},
{
"epoch": 0.4778761061946903,
"grad_norm": 0.9894094467163086,
"learning_rate": 3.1235207454528137e-06,
"loss": 0.0486,
"step": 1215
},
{
"epoch": 0.47826941986234023,
"grad_norm": 0.6644244194030762,
"learning_rate": 3.12019449865519e-06,
"loss": 0.0348,
"step": 1216
},
{
"epoch": 0.47866273352999017,
"grad_norm": 1.8796205520629883,
"learning_rate": 3.116867081562815e-06,
"loss": 0.0711,
"step": 1217
},
{
"epoch": 0.4790560471976401,
"grad_norm": 0.71921706199646,
"learning_rate": 3.1135385004544584e-06,
"loss": 0.0439,
"step": 1218
},
{
"epoch": 0.47944936086529005,
"grad_norm": 1.4723786115646362,
"learning_rate": 3.1102087616110866e-06,
"loss": 0.0948,
"step": 1219
},
{
"epoch": 0.47984267453294,
"grad_norm": 1.0385109186172485,
"learning_rate": 3.1068778713158515e-06,
"loss": 0.0481,
"step": 1220
},
{
"epoch": 0.48023598820059,
"grad_norm": 1.8688119649887085,
"learning_rate": 3.1035458358540764e-06,
"loss": 0.0962,
"step": 1221
},
{
"epoch": 0.48062930186823993,
"grad_norm": 0.988058865070343,
"learning_rate": 3.100212661513247e-06,
"loss": 0.0862,
"step": 1222
},
{
"epoch": 0.4810226155358899,
"grad_norm": 0.7118948698043823,
"learning_rate": 3.096878354582998e-06,
"loss": 0.0492,
"step": 1223
},
{
"epoch": 0.4814159292035398,
"grad_norm": 1.1759183406829834,
"learning_rate": 3.093542921355099e-06,
"loss": 0.0278,
"step": 1224
},
{
"epoch": 0.48180924287118976,
"grad_norm": 0.8185058832168579,
"learning_rate": 3.0902063681234473e-06,
"loss": 0.0618,
"step": 1225
},
{
"epoch": 0.4822025565388397,
"grad_norm": 1.0773781538009644,
"learning_rate": 3.086868701184054e-06,
"loss": 0.0393,
"step": 1226
},
{
"epoch": 0.4825958702064897,
"grad_norm": 1.4859130382537842,
"learning_rate": 3.083529926835028e-06,
"loss": 0.0425,
"step": 1227
},
{
"epoch": 0.48298918387413964,
"grad_norm": 0.8524113297462463,
"learning_rate": 3.0801900513765732e-06,
"loss": 0.0667,
"step": 1228
},
{
"epoch": 0.4833824975417896,
"grad_norm": 1.2344658374786377,
"learning_rate": 3.076849081110967e-06,
"loss": 0.0469,
"step": 1229
},
{
"epoch": 0.4837758112094395,
"grad_norm": 1.4112597703933716,
"learning_rate": 3.073507022342554e-06,
"loss": 0.0439,
"step": 1230
},
{
"epoch": 0.48416912487708946,
"grad_norm": 1.0202746391296387,
"learning_rate": 3.070163881377734e-06,
"loss": 0.0953,
"step": 1231
},
{
"epoch": 0.4845624385447394,
"grad_norm": 1.2902711629867554,
"learning_rate": 3.066819664524947e-06,
"loss": 0.0378,
"step": 1232
},
{
"epoch": 0.4849557522123894,
"grad_norm": 0.8746582269668579,
"learning_rate": 3.063474378094665e-06,
"loss": 0.0404,
"step": 1233
},
{
"epoch": 0.48534906588003934,
"grad_norm": 1.8847814798355103,
"learning_rate": 3.060128028399376e-06,
"loss": 0.0779,
"step": 1234
},
{
"epoch": 0.4857423795476893,
"grad_norm": 1.2793282270431519,
"learning_rate": 3.056780621753577e-06,
"loss": 0.0433,
"step": 1235
},
{
"epoch": 0.4861356932153392,
"grad_norm": 1.4302126169204712,
"learning_rate": 3.0534321644737574e-06,
"loss": 0.0565,
"step": 1236
},
{
"epoch": 0.48652900688298917,
"grad_norm": 0.8506616353988647,
"learning_rate": 3.0500826628783903e-06,
"loss": 0.0448,
"step": 1237
},
{
"epoch": 0.4869223205506391,
"grad_norm": 1.7796978950500488,
"learning_rate": 3.046732123287918e-06,
"loss": 0.0449,
"step": 1238
},
{
"epoch": 0.4873156342182891,
"grad_norm": 1.4967756271362305,
"learning_rate": 3.043380552024744e-06,
"loss": 0.0409,
"step": 1239
},
{
"epoch": 0.48770894788593905,
"grad_norm": 1.2920217514038086,
"learning_rate": 3.0400279554132157e-06,
"loss": 0.0465,
"step": 1240
},
{
"epoch": 0.488102261553589,
"grad_norm": 1.9115070104599,
"learning_rate": 3.0366743397796166e-06,
"loss": 0.0591,
"step": 1241
},
{
"epoch": 0.48849557522123893,
"grad_norm": 0.988409161567688,
"learning_rate": 3.033319711452154e-06,
"loss": 0.042,
"step": 1242
},
{
"epoch": 0.4888888888888889,
"grad_norm": 2.1158268451690674,
"learning_rate": 3.0299640767609447e-06,
"loss": 0.0792,
"step": 1243
},
{
"epoch": 0.4892822025565388,
"grad_norm": 1.1518357992172241,
"learning_rate": 3.0266074420380043e-06,
"loss": 0.0554,
"step": 1244
},
{
"epoch": 0.4896755162241888,
"grad_norm": 1.3400568962097168,
"learning_rate": 3.023249813617238e-06,
"loss": 0.0545,
"step": 1245
},
{
"epoch": 0.49006882989183875,
"grad_norm": 0.8380603790283203,
"learning_rate": 3.0198911978344213e-06,
"loss": 0.0377,
"step": 1246
},
{
"epoch": 0.4904621435594887,
"grad_norm": 1.3251253366470337,
"learning_rate": 3.0165316010271982e-06,
"loss": 0.0419,
"step": 1247
},
{
"epoch": 0.49085545722713864,
"grad_norm": 0.7429760098457336,
"learning_rate": 3.0131710295350615e-06,
"loss": 0.0487,
"step": 1248
},
{
"epoch": 0.4912487708947886,
"grad_norm": 1.619492530822754,
"learning_rate": 3.0098094896993413e-06,
"loss": 0.0364,
"step": 1249
},
{
"epoch": 0.4916420845624385,
"grad_norm": 1.8555465936660767,
"learning_rate": 3.0064469878631986e-06,
"loss": 0.0327,
"step": 1250
},
{
"epoch": 0.4920353982300885,
"grad_norm": 2.1514008045196533,
"learning_rate": 3.003083530371606e-06,
"loss": 0.0961,
"step": 1251
},
{
"epoch": 0.49242871189773846,
"grad_norm": 1.1894843578338623,
"learning_rate": 2.9997191235713435e-06,
"loss": 0.0773,
"step": 1252
},
{
"epoch": 0.4928220255653884,
"grad_norm": 1.375878095626831,
"learning_rate": 2.9963537738109783e-06,
"loss": 0.0635,
"step": 1253
},
{
"epoch": 0.49321533923303834,
"grad_norm": 0.9740056395530701,
"learning_rate": 2.9929874874408595e-06,
"loss": 0.0581,
"step": 1254
},
{
"epoch": 0.4936086529006883,
"grad_norm": 1.21156907081604,
"learning_rate": 2.9896202708131027e-06,
"loss": 0.0524,
"step": 1255
},
{
"epoch": 0.4940019665683382,
"grad_norm": 4.271803855895996,
"learning_rate": 2.98625213028158e-06,
"loss": 0.0437,
"step": 1256
},
{
"epoch": 0.4943952802359882,
"grad_norm": 1.0697994232177734,
"learning_rate": 2.9828830722019046e-06,
"loss": 0.0693,
"step": 1257
},
{
"epoch": 0.49478859390363816,
"grad_norm": 1.0657457113265991,
"learning_rate": 2.979513102931424e-06,
"loss": 0.0788,
"step": 1258
},
{
"epoch": 0.4951819075712881,
"grad_norm": 1.6833268404006958,
"learning_rate": 2.9761422288292017e-06,
"loss": 0.0755,
"step": 1259
},
{
"epoch": 0.49557522123893805,
"grad_norm": 0.7139087915420532,
"learning_rate": 2.9727704562560124e-06,
"loss": 0.0416,
"step": 1260
},
{
"epoch": 0.495968534906588,
"grad_norm": 1.025672435760498,
"learning_rate": 2.9693977915743227e-06,
"loss": 0.057,
"step": 1261
},
{
"epoch": 0.49636184857423793,
"grad_norm": 1.6005637645721436,
"learning_rate": 2.9660242411482848e-06,
"loss": 0.0694,
"step": 1262
},
{
"epoch": 0.4967551622418879,
"grad_norm": 1.2426131963729858,
"learning_rate": 2.9626498113437215e-06,
"loss": 0.0443,
"step": 1263
},
{
"epoch": 0.49714847590953787,
"grad_norm": 1.0461783409118652,
"learning_rate": 2.9592745085281154e-06,
"loss": 0.0449,
"step": 1264
},
{
"epoch": 0.4975417895771878,
"grad_norm": 1.1440929174423218,
"learning_rate": 2.955898339070596e-06,
"loss": 0.0429,
"step": 1265
},
{
"epoch": 0.49793510324483775,
"grad_norm": 1.5936861038208008,
"learning_rate": 2.9525213093419275e-06,
"loss": 0.0517,
"step": 1266
},
{
"epoch": 0.4983284169124877,
"grad_norm": 0.9140682220458984,
"learning_rate": 2.9491434257144995e-06,
"loss": 0.0699,
"step": 1267
},
{
"epoch": 0.49872173058013763,
"grad_norm": 0.6656792759895325,
"learning_rate": 2.9457646945623107e-06,
"loss": 0.023,
"step": 1268
},
{
"epoch": 0.49911504424778763,
"grad_norm": 1.1062997579574585,
"learning_rate": 2.9423851222609607e-06,
"loss": 0.0801,
"step": 1269
},
{
"epoch": 0.4995083579154376,
"grad_norm": 0.9155628085136414,
"learning_rate": 2.939004715187635e-06,
"loss": 0.0704,
"step": 1270
},
{
"epoch": 0.4999016715830875,
"grad_norm": 0.8905113339424133,
"learning_rate": 2.935623479721095e-06,
"loss": 0.0442,
"step": 1271
},
{
"epoch": 0.5002949852507375,
"grad_norm": 0.8276392817497253,
"learning_rate": 2.932241422241665e-06,
"loss": 0.0535,
"step": 1272
},
{
"epoch": 0.5006882989183874,
"grad_norm": 0.5640360713005066,
"learning_rate": 2.9288585491312206e-06,
"loss": 0.0411,
"step": 1273
},
{
"epoch": 0.5010816125860373,
"grad_norm": 1.5979022979736328,
"learning_rate": 2.925474866773176e-06,
"loss": 0.0703,
"step": 1274
},
{
"epoch": 0.5014749262536873,
"grad_norm": 1.1477428674697876,
"learning_rate": 2.922090381552475e-06,
"loss": 0.0488,
"step": 1275
},
{
"epoch": 0.5018682399213372,
"grad_norm": 1.544410228729248,
"learning_rate": 2.9187050998555715e-06,
"loss": 0.0689,
"step": 1276
},
{
"epoch": 0.5022615535889872,
"grad_norm": 1.16623055934906,
"learning_rate": 2.915319028070427e-06,
"loss": 0.0681,
"step": 1277
},
{
"epoch": 0.5026548672566372,
"grad_norm": 0.2639702558517456,
"learning_rate": 2.9119321725864914e-06,
"loss": 0.0321,
"step": 1278
},
{
"epoch": 0.5030481809242872,
"grad_norm": 0.9400918483734131,
"learning_rate": 2.908544539794693e-06,
"loss": 0.0726,
"step": 1279
},
{
"epoch": 0.5034414945919371,
"grad_norm": 2.083108425140381,
"learning_rate": 2.9051561360874297e-06,
"loss": 0.0567,
"step": 1280
},
{
"epoch": 0.503834808259587,
"grad_norm": 0.9149637818336487,
"learning_rate": 2.901766967858551e-06,
"loss": 0.0626,
"step": 1281
},
{
"epoch": 0.504228121927237,
"grad_norm": 0.6115841269493103,
"learning_rate": 2.8983770415033507e-06,
"loss": 0.0386,
"step": 1282
},
{
"epoch": 0.5046214355948869,
"grad_norm": 1.530674695968628,
"learning_rate": 2.8949863634185533e-06,
"loss": 0.0743,
"step": 1283
},
{
"epoch": 0.5050147492625369,
"grad_norm": 0.9860877990722656,
"learning_rate": 2.8915949400022995e-06,
"loss": 0.0397,
"step": 1284
},
{
"epoch": 0.5054080629301868,
"grad_norm": 1.6740636825561523,
"learning_rate": 2.8882027776541406e-06,
"loss": 0.0997,
"step": 1285
},
{
"epoch": 0.5058013765978367,
"grad_norm": 1.1494807004928589,
"learning_rate": 2.8848098827750186e-06,
"loss": 0.0639,
"step": 1286
},
{
"epoch": 0.5061946902654867,
"grad_norm": 1.5039880275726318,
"learning_rate": 2.8814162617672586e-06,
"loss": 0.0615,
"step": 1287
},
{
"epoch": 0.5065880039331366,
"grad_norm": 1.2192140817642212,
"learning_rate": 2.8780219210345573e-06,
"loss": 0.0543,
"step": 1288
},
{
"epoch": 0.5069813176007866,
"grad_norm": 1.1865425109863281,
"learning_rate": 2.8746268669819676e-06,
"loss": 0.069,
"step": 1289
},
{
"epoch": 0.5073746312684366,
"grad_norm": 1.6422653198242188,
"learning_rate": 2.8712311060158904e-06,
"loss": 0.0407,
"step": 1290
},
{
"epoch": 0.5077679449360866,
"grad_norm": 1.0872414112091064,
"learning_rate": 2.8678346445440588e-06,
"loss": 0.0485,
"step": 1291
},
{
"epoch": 0.5081612586037365,
"grad_norm": 1.3887152671813965,
"learning_rate": 2.8644374889755284e-06,
"loss": 0.0594,
"step": 1292
},
{
"epoch": 0.5085545722713865,
"grad_norm": 0.9311152100563049,
"learning_rate": 2.861039645720664e-06,
"loss": 0.0558,
"step": 1293
},
{
"epoch": 0.5089478859390364,
"grad_norm": 0.5611655116081238,
"learning_rate": 2.85764112119113e-06,
"loss": 0.0326,
"step": 1294
},
{
"epoch": 0.5093411996066863,
"grad_norm": 0.6655589938163757,
"learning_rate": 2.854241921799874e-06,
"loss": 0.0608,
"step": 1295
},
{
"epoch": 0.5097345132743363,
"grad_norm": 0.9743668437004089,
"learning_rate": 2.850842053961119e-06,
"loss": 0.0674,
"step": 1296
},
{
"epoch": 0.5101278269419862,
"grad_norm": 0.3803253471851349,
"learning_rate": 2.847441524090347e-06,
"loss": 0.0318,
"step": 1297
},
{
"epoch": 0.5105211406096362,
"grad_norm": 0.9651347398757935,
"learning_rate": 2.844040338604291e-06,
"loss": 0.0467,
"step": 1298
},
{
"epoch": 0.5109144542772861,
"grad_norm": 1.3503124713897705,
"learning_rate": 2.8406385039209217e-06,
"loss": 0.0353,
"step": 1299
},
{
"epoch": 0.511307767944936,
"grad_norm": 1.3085218667984009,
"learning_rate": 2.837236026459432e-06,
"loss": 0.0677,
"step": 1300
},
{
"epoch": 0.511701081612586,
"grad_norm": 0.759332537651062,
"learning_rate": 2.833832912640232e-06,
"loss": 0.0399,
"step": 1301
},
{
"epoch": 0.512094395280236,
"grad_norm": 1.254012107849121,
"learning_rate": 2.8304291688849283e-06,
"loss": 0.0469,
"step": 1302
},
{
"epoch": 0.512487708947886,
"grad_norm": 1.6213202476501465,
"learning_rate": 2.827024801616319e-06,
"loss": 0.077,
"step": 1303
},
{
"epoch": 0.5128810226155359,
"grad_norm": 0.751507580280304,
"learning_rate": 2.8236198172583765e-06,
"loss": 0.0499,
"step": 1304
},
{
"epoch": 0.5132743362831859,
"grad_norm": 0.6438438296318054,
"learning_rate": 2.820214222236241e-06,
"loss": 0.0638,
"step": 1305
},
{
"epoch": 0.5136676499508358,
"grad_norm": 0.8826209902763367,
"learning_rate": 2.816808022976201e-06,
"loss": 0.0422,
"step": 1306
},
{
"epoch": 0.5140609636184857,
"grad_norm": 0.4389915466308594,
"learning_rate": 2.813401225905688e-06,
"loss": 0.0192,
"step": 1307
},
{
"epoch": 0.5144542772861357,
"grad_norm": 0.7698509693145752,
"learning_rate": 2.8099938374532615e-06,
"loss": 0.043,
"step": 1308
},
{
"epoch": 0.5148475909537856,
"grad_norm": 1.0304797887802124,
"learning_rate": 2.806585864048594e-06,
"loss": 0.0648,
"step": 1309
},
{
"epoch": 0.5152409046214356,
"grad_norm": 0.9679722189903259,
"learning_rate": 2.8031773121224665e-06,
"loss": 0.0528,
"step": 1310
},
{
"epoch": 0.5156342182890855,
"grad_norm": 0.8979973793029785,
"learning_rate": 2.799768188106747e-06,
"loss": 0.0493,
"step": 1311
},
{
"epoch": 0.5160275319567355,
"grad_norm": 1.266461730003357,
"learning_rate": 2.7963584984343856e-06,
"loss": 0.0489,
"step": 1312
},
{
"epoch": 0.5164208456243854,
"grad_norm": 1.1776021718978882,
"learning_rate": 2.7929482495393995e-06,
"loss": 0.0453,
"step": 1313
},
{
"epoch": 0.5168141592920354,
"grad_norm": 0.89280104637146,
"learning_rate": 2.7895374478568608e-06,
"loss": 0.0506,
"step": 1314
},
{
"epoch": 0.5172074729596854,
"grad_norm": 1.046673059463501,
"learning_rate": 2.786126099822885e-06,
"loss": 0.0812,
"step": 1315
},
{
"epoch": 0.5176007866273353,
"grad_norm": 1.451196312904358,
"learning_rate": 2.7827142118746187e-06,
"loss": 0.0388,
"step": 1316
},
{
"epoch": 0.5179941002949853,
"grad_norm": 0.9998504519462585,
"learning_rate": 2.779301790450226e-06,
"loss": 0.0505,
"step": 1317
},
{
"epoch": 0.5183874139626352,
"grad_norm": 1.0535742044448853,
"learning_rate": 2.7758888419888797e-06,
"loss": 0.0377,
"step": 1318
},
{
"epoch": 0.5187807276302852,
"grad_norm": 0.9973492622375488,
"learning_rate": 2.7724753729307454e-06,
"loss": 0.0512,
"step": 1319
},
{
"epoch": 0.5191740412979351,
"grad_norm": 1.3732929229736328,
"learning_rate": 2.769061389716971e-06,
"loss": 0.0992,
"step": 1320
},
{
"epoch": 0.519567354965585,
"grad_norm": 1.1079411506652832,
"learning_rate": 2.765646898789677e-06,
"loss": 0.0438,
"step": 1321
},
{
"epoch": 0.519960668633235,
"grad_norm": 1.0692771673202515,
"learning_rate": 2.762231906591939e-06,
"loss": 0.0482,
"step": 1322
},
{
"epoch": 0.5203539823008849,
"grad_norm": 0.773914098739624,
"learning_rate": 2.75881641956778e-06,
"loss": 0.0307,
"step": 1323
},
{
"epoch": 0.5207472959685349,
"grad_norm": 0.8193982243537903,
"learning_rate": 2.7554004441621562e-06,
"loss": 0.0357,
"step": 1324
},
{
"epoch": 0.5211406096361848,
"grad_norm": 1.0655934810638428,
"learning_rate": 2.7519839868209462e-06,
"loss": 0.0564,
"step": 1325
},
{
"epoch": 0.5215339233038349,
"grad_norm": 0.668292760848999,
"learning_rate": 2.748567053990937e-06,
"loss": 0.0394,
"step": 1326
},
{
"epoch": 0.5219272369714848,
"grad_norm": 1.5048760175704956,
"learning_rate": 2.7451496521198144e-06,
"loss": 0.0756,
"step": 1327
},
{
"epoch": 0.5223205506391347,
"grad_norm": 1.869588017463684,
"learning_rate": 2.741731787656146e-06,
"loss": 0.08,
"step": 1328
},
{
"epoch": 0.5227138643067847,
"grad_norm": 1.6091140508651733,
"learning_rate": 2.7383134670493765e-06,
"loss": 0.0618,
"step": 1329
},
{
"epoch": 0.5231071779744346,
"grad_norm": 0.5614988207817078,
"learning_rate": 2.734894696749808e-06,
"loss": 0.022,
"step": 1330
},
{
"epoch": 0.5235004916420846,
"grad_norm": 1.5846737623214722,
"learning_rate": 2.7314754832085926e-06,
"loss": 0.0617,
"step": 1331
},
{
"epoch": 0.5238938053097345,
"grad_norm": 1.0142868757247925,
"learning_rate": 2.728055832877719e-06,
"loss": 0.1201,
"step": 1332
},
{
"epoch": 0.5242871189773844,
"grad_norm": 0.9764862060546875,
"learning_rate": 2.7246357522099996e-06,
"loss": 0.0576,
"step": 1333
},
{
"epoch": 0.5246804326450344,
"grad_norm": 0.7208642363548279,
"learning_rate": 2.721215247659059e-06,
"loss": 0.0165,
"step": 1334
},
{
"epoch": 0.5250737463126843,
"grad_norm": 1.2766616344451904,
"learning_rate": 2.7177943256793214e-06,
"loss": 0.0589,
"step": 1335
},
{
"epoch": 0.5254670599803343,
"grad_norm": 1.7238527536392212,
"learning_rate": 2.7143729927259992e-06,
"loss": 0.0415,
"step": 1336
},
{
"epoch": 0.5258603736479842,
"grad_norm": 0.9424237608909607,
"learning_rate": 2.7109512552550804e-06,
"loss": 0.088,
"step": 1337
},
{
"epoch": 0.5262536873156343,
"grad_norm": 0.8586751818656921,
"learning_rate": 2.707529119723315e-06,
"loss": 0.0621,
"step": 1338
},
{
"epoch": 0.5266470009832842,
"grad_norm": 0.6910445690155029,
"learning_rate": 2.7041065925882054e-06,
"loss": 0.0473,
"step": 1339
},
{
"epoch": 0.5270403146509341,
"grad_norm": 0.6774911880493164,
"learning_rate": 2.7006836803079934e-06,
"loss": 0.0401,
"step": 1340
},
{
"epoch": 0.5274336283185841,
"grad_norm": 1.1810059547424316,
"learning_rate": 2.697260389341645e-06,
"loss": 0.0464,
"step": 1341
},
{
"epoch": 0.527826941986234,
"grad_norm": 0.6813443303108215,
"learning_rate": 2.693836726148844e-06,
"loss": 0.0502,
"step": 1342
},
{
"epoch": 0.528220255653884,
"grad_norm": 1.6458402872085571,
"learning_rate": 2.6904126971899754e-06,
"loss": 0.0644,
"step": 1343
},
{
"epoch": 0.5286135693215339,
"grad_norm": 1.4540367126464844,
"learning_rate": 2.686988308926112e-06,
"loss": 0.0564,
"step": 1344
},
{
"epoch": 0.5290068829891839,
"grad_norm": 0.6865090131759644,
"learning_rate": 2.68356356781901e-06,
"loss": 0.0448,
"step": 1345
},
{
"epoch": 0.5294001966568338,
"grad_norm": 1.91966712474823,
"learning_rate": 2.6801384803310855e-06,
"loss": 0.0431,
"step": 1346
},
{
"epoch": 0.5297935103244837,
"grad_norm": 0.6628435254096985,
"learning_rate": 2.676713052925411e-06,
"loss": 0.0513,
"step": 1347
},
{
"epoch": 0.5301868239921337,
"grad_norm": 1.0600309371948242,
"learning_rate": 2.6732872920657018e-06,
"loss": 0.0321,
"step": 1348
},
{
"epoch": 0.5305801376597836,
"grad_norm": 0.5295042991638184,
"learning_rate": 2.6698612042162995e-06,
"loss": 0.0299,
"step": 1349
},
{
"epoch": 0.5309734513274337,
"grad_norm": 1.229316234588623,
"learning_rate": 2.6664347958421647e-06,
"loss": 0.0475,
"step": 1350
},
{
"epoch": 0.5313667649950836,
"grad_norm": 0.8785441517829895,
"learning_rate": 2.6630080734088625e-06,
"loss": 0.0424,
"step": 1351
},
{
"epoch": 0.5317600786627336,
"grad_norm": 1.3285952806472778,
"learning_rate": 2.6595810433825496e-06,
"loss": 0.0359,
"step": 1352
},
{
"epoch": 0.5321533923303835,
"grad_norm": 0.8368435502052307,
"learning_rate": 2.6561537122299647e-06,
"loss": 0.0503,
"step": 1353
},
{
"epoch": 0.5325467059980334,
"grad_norm": 0.790544331073761,
"learning_rate": 2.6527260864184135e-06,
"loss": 0.0321,
"step": 1354
},
{
"epoch": 0.5329400196656834,
"grad_norm": 1.5722286701202393,
"learning_rate": 2.6492981724157576e-06,
"loss": 0.0765,
"step": 1355
},
{
"epoch": 0.5333333333333333,
"grad_norm": 1.0913268327713013,
"learning_rate": 2.6458699766904033e-06,
"loss": 0.0526,
"step": 1356
},
{
"epoch": 0.5337266470009833,
"grad_norm": 1.2754257917404175,
"learning_rate": 2.6424415057112883e-06,
"loss": 0.0585,
"step": 1357
},
{
"epoch": 0.5341199606686332,
"grad_norm": 2.0785610675811768,
"learning_rate": 2.6390127659478698e-06,
"loss": 0.0995,
"step": 1358
},
{
"epoch": 0.5345132743362832,
"grad_norm": 1.3484556674957275,
"learning_rate": 2.6355837638701115e-06,
"loss": 0.0462,
"step": 1359
},
{
"epoch": 0.5349065880039331,
"grad_norm": 0.7563539147377014,
"learning_rate": 2.632154505948472e-06,
"loss": 0.0614,
"step": 1360
},
{
"epoch": 0.535299901671583,
"grad_norm": 0.7201266288757324,
"learning_rate": 2.6287249986538944e-06,
"loss": 0.0449,
"step": 1361
},
{
"epoch": 0.5356932153392331,
"grad_norm": 1.439516544342041,
"learning_rate": 2.62529524845779e-06,
"loss": 0.0694,
"step": 1362
},
{
"epoch": 0.536086529006883,
"grad_norm": 0.6716679334640503,
"learning_rate": 2.6218652618320306e-06,
"loss": 0.0302,
"step": 1363
},
{
"epoch": 0.536479842674533,
"grad_norm": 1.9574276208877563,
"learning_rate": 2.6184350452489317e-06,
"loss": 0.0708,
"step": 1364
},
{
"epoch": 0.5368731563421829,
"grad_norm": 1.3900701999664307,
"learning_rate": 2.615004605181246e-06,
"loss": 0.0833,
"step": 1365
},
{
"epoch": 0.5372664700098329,
"grad_norm": 0.9019057154655457,
"learning_rate": 2.611573948102144e-06,
"loss": 0.0625,
"step": 1366
},
{
"epoch": 0.5376597836774828,
"grad_norm": 2.0217947959899902,
"learning_rate": 2.6081430804852093e-06,
"loss": 0.0837,
"step": 1367
},
{
"epoch": 0.5380530973451327,
"grad_norm": 1.5341334342956543,
"learning_rate": 2.604712008804421e-06,
"loss": 0.0734,
"step": 1368
},
{
"epoch": 0.5384464110127827,
"grad_norm": 1.3491941690444946,
"learning_rate": 2.601280739534143e-06,
"loss": 0.0631,
"step": 1369
},
{
"epoch": 0.5388397246804326,
"grad_norm": 1.264906406402588,
"learning_rate": 2.5978492791491126e-06,
"loss": 0.0361,
"step": 1370
},
{
"epoch": 0.5392330383480826,
"grad_norm": 1.567254900932312,
"learning_rate": 2.594417634124428e-06,
"loss": 0.0802,
"step": 1371
},
{
"epoch": 0.5396263520157325,
"grad_norm": 0.912144124507904,
"learning_rate": 2.590985810935535e-06,
"loss": 0.0321,
"step": 1372
},
{
"epoch": 0.5400196656833824,
"grad_norm": 0.7098456025123596,
"learning_rate": 2.5875538160582176e-06,
"loss": 0.0625,
"step": 1373
},
{
"epoch": 0.5404129793510325,
"grad_norm": 1.4193458557128906,
"learning_rate": 2.58412165596858e-06,
"loss": 0.0518,
"step": 1374
},
{
"epoch": 0.5408062930186824,
"grad_norm": 1.3003660440444946,
"learning_rate": 2.5806893371430413e-06,
"loss": 0.0625,
"step": 1375
},
{
"epoch": 0.5411996066863324,
"grad_norm": 1.4275062084197998,
"learning_rate": 2.57725686605832e-06,
"loss": 0.0628,
"step": 1376
},
{
"epoch": 0.5415929203539823,
"grad_norm": 1.3604398965835571,
"learning_rate": 2.5738242491914206e-06,
"loss": 0.0733,
"step": 1377
},
{
"epoch": 0.5419862340216323,
"grad_norm": 2.859689235687256,
"learning_rate": 2.5703914930196227e-06,
"loss": 0.0547,
"step": 1378
},
{
"epoch": 0.5423795476892822,
"grad_norm": 0.770262598991394,
"learning_rate": 2.5669586040204697e-06,
"loss": 0.0644,
"step": 1379
},
{
"epoch": 0.5427728613569321,
"grad_norm": 0.7974931001663208,
"learning_rate": 2.5635255886717553e-06,
"loss": 0.0687,
"step": 1380
},
{
"epoch": 0.5431661750245821,
"grad_norm": 0.9779230356216431,
"learning_rate": 2.560092453451512e-06,
"loss": 0.0586,
"step": 1381
},
{
"epoch": 0.543559488692232,
"grad_norm": 2.3653101921081543,
"learning_rate": 2.5566592048379975e-06,
"loss": 0.0697,
"step": 1382
},
{
"epoch": 0.543952802359882,
"grad_norm": 1.6566016674041748,
"learning_rate": 2.553225849309684e-06,
"loss": 0.104,
"step": 1383
},
{
"epoch": 0.5443461160275319,
"grad_norm": 1.516684889793396,
"learning_rate": 2.5497923933452464e-06,
"loss": 0.0423,
"step": 1384
},
{
"epoch": 0.5447394296951819,
"grad_norm": 1.3681788444519043,
"learning_rate": 2.5463588434235463e-06,
"loss": 0.052,
"step": 1385
},
{
"epoch": 0.5451327433628319,
"grad_norm": 0.49628522992134094,
"learning_rate": 2.542925206023626e-06,
"loss": 0.0255,
"step": 1386
},
{
"epoch": 0.5455260570304818,
"grad_norm": 0.9334824681282043,
"learning_rate": 2.5394914876246916e-06,
"loss": 0.0517,
"step": 1387
},
{
"epoch": 0.5459193706981318,
"grad_norm": 1.3869428634643555,
"learning_rate": 2.5360576947061004e-06,
"loss": 0.051,
"step": 1388
},
{
"epoch": 0.5463126843657817,
"grad_norm": 0.7261596918106079,
"learning_rate": 2.5326238337473537e-06,
"loss": 0.0349,
"step": 1389
},
{
"epoch": 0.5467059980334317,
"grad_norm": 1.0270626544952393,
"learning_rate": 2.5291899112280765e-06,
"loss": 0.0574,
"step": 1390
},
{
"epoch": 0.5470993117010816,
"grad_norm": 0.9097653031349182,
"learning_rate": 2.5257559336280145e-06,
"loss": 0.0434,
"step": 1391
},
{
"epoch": 0.5474926253687316,
"grad_norm": 1.5684995651245117,
"learning_rate": 2.522321907427016e-06,
"loss": 0.0394,
"step": 1392
},
{
"epoch": 0.5478859390363815,
"grad_norm": 0.5134732723236084,
"learning_rate": 2.5188878391050187e-06,
"loss": 0.0642,
"step": 1393
},
{
"epoch": 0.5482792527040314,
"grad_norm": 1.6495331525802612,
"learning_rate": 2.515453735142043e-06,
"loss": 0.0335,
"step": 1394
},
{
"epoch": 0.5486725663716814,
"grad_norm": 0.949030876159668,
"learning_rate": 2.5120196020181752e-06,
"loss": 0.069,
"step": 1395
},
{
"epoch": 0.5490658800393313,
"grad_norm": 0.5853769183158875,
"learning_rate": 2.5085854462135556e-06,
"loss": 0.035,
"step": 1396
},
{
"epoch": 0.5494591937069813,
"grad_norm": 1.0677484273910522,
"learning_rate": 2.505151274208369e-06,
"loss": 0.0511,
"step": 1397
},
{
"epoch": 0.5498525073746313,
"grad_norm": 1.5644643306732178,
"learning_rate": 2.50171709248283e-06,
"loss": 0.0814,
"step": 1398
},
{
"epoch": 0.5502458210422813,
"grad_norm": 0.736179769039154,
"learning_rate": 2.4982829075171714e-06,
"loss": 0.0452,
"step": 1399
},
{
"epoch": 0.5506391347099312,
"grad_norm": 0.8911694288253784,
"learning_rate": 2.494848725791632e-06,
"loss": 0.0564,
"step": 1400
},
{
"epoch": 0.5510324483775811,
"grad_norm": 1.9409581422805786,
"learning_rate": 2.4914145537864453e-06,
"loss": 0.0724,
"step": 1401
},
{
"epoch": 0.5514257620452311,
"grad_norm": 1.1989744901657104,
"learning_rate": 2.4879803979818256e-06,
"loss": 0.0496,
"step": 1402
},
{
"epoch": 0.551819075712881,
"grad_norm": 1.8545705080032349,
"learning_rate": 2.4845462648579573e-06,
"loss": 0.0527,
"step": 1403
},
{
"epoch": 0.552212389380531,
"grad_norm": 1.8136131763458252,
"learning_rate": 2.481112160894982e-06,
"loss": 0.0601,
"step": 1404
},
{
"epoch": 0.5526057030481809,
"grad_norm": 1.070971131324768,
"learning_rate": 2.4776780925729853e-06,
"loss": 0.0612,
"step": 1405
},
{
"epoch": 0.5529990167158308,
"grad_norm": 1.127616047859192,
"learning_rate": 2.474244066371986e-06,
"loss": 0.0503,
"step": 1406
},
{
"epoch": 0.5533923303834808,
"grad_norm": 1.5506644248962402,
"learning_rate": 2.4708100887719243e-06,
"loss": 0.0638,
"step": 1407
},
{
"epoch": 0.5537856440511307,
"grad_norm": 1.5224863290786743,
"learning_rate": 2.4673761662526475e-06,
"loss": 0.0521,
"step": 1408
},
{
"epoch": 0.5541789577187807,
"grad_norm": 1.2066714763641357,
"learning_rate": 2.4639423052938995e-06,
"loss": 0.0533,
"step": 1409
},
{
"epoch": 0.5545722713864307,
"grad_norm": 1.389074683189392,
"learning_rate": 2.4605085123753097e-06,
"loss": 0.0809,
"step": 1410
},
{
"epoch": 0.5549655850540807,
"grad_norm": 0.6731852293014526,
"learning_rate": 2.4570747939763745e-06,
"loss": 0.0249,
"step": 1411
},
{
"epoch": 0.5553588987217306,
"grad_norm": 1.2953534126281738,
"learning_rate": 2.453641156576454e-06,
"loss": 0.0473,
"step": 1412
},
{
"epoch": 0.5557522123893806,
"grad_norm": 0.9251944422721863,
"learning_rate": 2.4502076066547545e-06,
"loss": 0.0765,
"step": 1413
},
{
"epoch": 0.5561455260570305,
"grad_norm": 1.831679344177246,
"learning_rate": 2.4467741506903162e-06,
"loss": 0.0798,
"step": 1414
},
{
"epoch": 0.5565388397246804,
"grad_norm": 1.2218101024627686,
"learning_rate": 2.443340795162003e-06,
"loss": 0.0393,
"step": 1415
},
{
"epoch": 0.5569321533923304,
"grad_norm": 1.164400577545166,
"learning_rate": 2.4399075465484883e-06,
"loss": 0.0681,
"step": 1416
},
{
"epoch": 0.5573254670599803,
"grad_norm": 1.0514402389526367,
"learning_rate": 2.4364744113282455e-06,
"loss": 0.0593,
"step": 1417
},
{
"epoch": 0.5577187807276303,
"grad_norm": 1.9647271633148193,
"learning_rate": 2.433041395979531e-06,
"loss": 0.0785,
"step": 1418
},
{
"epoch": 0.5581120943952802,
"grad_norm": 0.7550022006034851,
"learning_rate": 2.429608506980378e-06,
"loss": 0.0443,
"step": 1419
},
{
"epoch": 0.5585054080629301,
"grad_norm": 1.2886439561843872,
"learning_rate": 2.4261757508085803e-06,
"loss": 0.0625,
"step": 1420
},
{
"epoch": 0.5588987217305801,
"grad_norm": 0.6531363129615784,
"learning_rate": 2.422743133941681e-06,
"loss": 0.0437,
"step": 1421
},
{
"epoch": 0.5592920353982301,
"grad_norm": 1.3166404962539673,
"learning_rate": 2.419310662856959e-06,
"loss": 0.0363,
"step": 1422
},
{
"epoch": 0.5596853490658801,
"grad_norm": 0.9738766551017761,
"learning_rate": 2.415878344031421e-06,
"loss": 0.0499,
"step": 1423
},
{
"epoch": 0.56007866273353,
"grad_norm": 1.1199309825897217,
"learning_rate": 2.4124461839417832e-06,
"loss": 0.0638,
"step": 1424
},
{
"epoch": 0.56047197640118,
"grad_norm": 0.7884669303894043,
"learning_rate": 2.4090141890644654e-06,
"loss": 0.0219,
"step": 1425
},
{
"epoch": 0.5608652900688299,
"grad_norm": 1.508720874786377,
"learning_rate": 2.405582365875573e-06,
"loss": 0.0722,
"step": 1426
},
{
"epoch": 0.5612586037364798,
"grad_norm": 0.9353559017181396,
"learning_rate": 2.4021507208508882e-06,
"loss": 0.0654,
"step": 1427
},
{
"epoch": 0.5616519174041298,
"grad_norm": 1.9918673038482666,
"learning_rate": 2.398719260465858e-06,
"loss": 0.0741,
"step": 1428
},
{
"epoch": 0.5620452310717797,
"grad_norm": 0.9243260622024536,
"learning_rate": 2.3952879911955794e-06,
"loss": 0.0369,
"step": 1429
},
{
"epoch": 0.5624385447394297,
"grad_norm": 1.3456679582595825,
"learning_rate": 2.391856919514791e-06,
"loss": 0.0811,
"step": 1430
},
{
"epoch": 0.5628318584070796,
"grad_norm": 1.5919969081878662,
"learning_rate": 2.3884260518978562e-06,
"loss": 0.0402,
"step": 1431
},
{
"epoch": 0.5632251720747296,
"grad_norm": 0.5894349813461304,
"learning_rate": 2.3849953948187552e-06,
"loss": 0.0396,
"step": 1432
},
{
"epoch": 0.5636184857423795,
"grad_norm": 1.708106517791748,
"learning_rate": 2.3815649547510687e-06,
"loss": 0.0575,
"step": 1433
},
{
"epoch": 0.5640117994100295,
"grad_norm": 1.6241428852081299,
"learning_rate": 2.37813473816797e-06,
"loss": 0.047,
"step": 1434
},
{
"epoch": 0.5644051130776795,
"grad_norm": 1.1760050058364868,
"learning_rate": 2.3747047515422102e-06,
"loss": 0.049,
"step": 1435
},
{
"epoch": 0.5647984267453294,
"grad_norm": 0.6579201221466064,
"learning_rate": 2.371275001346106e-06,
"loss": 0.0569,
"step": 1436
},
{
"epoch": 0.5651917404129794,
"grad_norm": 0.5577812194824219,
"learning_rate": 2.367845494051529e-06,
"loss": 0.0338,
"step": 1437
},
{
"epoch": 0.5655850540806293,
"grad_norm": 0.9575706124305725,
"learning_rate": 2.3644162361298897e-06,
"loss": 0.0622,
"step": 1438
},
{
"epoch": 0.5659783677482793,
"grad_norm": 0.6951814889907837,
"learning_rate": 2.360987234052131e-06,
"loss": 0.0329,
"step": 1439
},
{
"epoch": 0.5663716814159292,
"grad_norm": 1.079609990119934,
"learning_rate": 2.357558494288712e-06,
"loss": 0.0672,
"step": 1440
},
{
"epoch": 0.5667649950835791,
"grad_norm": 1.0509586334228516,
"learning_rate": 2.354130023309597e-06,
"loss": 0.0755,
"step": 1441
},
{
"epoch": 0.5671583087512291,
"grad_norm": 0.9782833456993103,
"learning_rate": 2.350701827584243e-06,
"loss": 0.0319,
"step": 1442
},
{
"epoch": 0.567551622418879,
"grad_norm": 1.019370675086975,
"learning_rate": 2.3472739135815877e-06,
"loss": 0.0696,
"step": 1443
},
{
"epoch": 0.567944936086529,
"grad_norm": 1.419137716293335,
"learning_rate": 2.343846287770036e-06,
"loss": 0.0797,
"step": 1444
},
{
"epoch": 0.5683382497541789,
"grad_norm": 1.8223907947540283,
"learning_rate": 2.340418956617451e-06,
"loss": 0.0462,
"step": 1445
},
{
"epoch": 0.568731563421829,
"grad_norm": 1.1286693811416626,
"learning_rate": 2.336991926591138e-06,
"loss": 0.0735,
"step": 1446
},
{
"epoch": 0.5691248770894789,
"grad_norm": 1.7998546361923218,
"learning_rate": 2.3335652041578352e-06,
"loss": 0.0964,
"step": 1447
},
{
"epoch": 0.5695181907571288,
"grad_norm": 1.0016109943389893,
"learning_rate": 2.3301387957837017e-06,
"loss": 0.0631,
"step": 1448
},
{
"epoch": 0.5699115044247788,
"grad_norm": 1.876328706741333,
"learning_rate": 2.326712707934299e-06,
"loss": 0.0683,
"step": 1449
},
{
"epoch": 0.5703048180924287,
"grad_norm": 1.8099371194839478,
"learning_rate": 2.3232869470745893e-06,
"loss": 0.058,
"step": 1450
},
{
"epoch": 0.5706981317600787,
"grad_norm": 0.8637019395828247,
"learning_rate": 2.3198615196689153e-06,
"loss": 0.0655,
"step": 1451
},
{
"epoch": 0.5710914454277286,
"grad_norm": 2.1426312923431396,
"learning_rate": 2.3164364321809906e-06,
"loss": 0.0572,
"step": 1452
},
{
"epoch": 0.5714847590953785,
"grad_norm": 1.6157870292663574,
"learning_rate": 2.3130116910738874e-06,
"loss": 0.0321,
"step": 1453
},
{
"epoch": 0.5718780727630285,
"grad_norm": 0.8953425288200378,
"learning_rate": 2.309587302810026e-06,
"loss": 0.0292,
"step": 1454
},
{
"epoch": 0.5722713864306784,
"grad_norm": 0.8132373094558716,
"learning_rate": 2.306163273851157e-06,
"loss": 0.0517,
"step": 1455
},
{
"epoch": 0.5726647000983284,
"grad_norm": 0.8843181729316711,
"learning_rate": 2.302739610658356e-06,
"loss": 0.0389,
"step": 1456
},
{
"epoch": 0.5730580137659783,
"grad_norm": 1.1060006618499756,
"learning_rate": 2.2993163196920075e-06,
"loss": 0.08,
"step": 1457
},
{
"epoch": 0.5734513274336284,
"grad_norm": 1.1257623434066772,
"learning_rate": 2.295893407411795e-06,
"loss": 0.053,
"step": 1458
},
{
"epoch": 0.5738446411012783,
"grad_norm": 1.0160799026489258,
"learning_rate": 2.2924708802766857e-06,
"loss": 0.0439,
"step": 1459
},
{
"epoch": 0.5742379547689282,
"grad_norm": 1.231930136680603,
"learning_rate": 2.2890487447449204e-06,
"loss": 0.0569,
"step": 1460
},
{
"epoch": 0.5746312684365782,
"grad_norm": 0.8130099177360535,
"learning_rate": 2.285627007274001e-06,
"loss": 0.0361,
"step": 1461
},
{
"epoch": 0.5750245821042281,
"grad_norm": 0.6949229836463928,
"learning_rate": 2.282205674320679e-06,
"loss": 0.0598,
"step": 1462
},
{
"epoch": 0.5754178957718781,
"grad_norm": 1.0386853218078613,
"learning_rate": 2.2787847523409416e-06,
"loss": 0.0601,
"step": 1463
},
{
"epoch": 0.575811209439528,
"grad_norm": 0.48775455355644226,
"learning_rate": 2.2753642477900012e-06,
"loss": 0.0483,
"step": 1464
},
{
"epoch": 0.576204523107178,
"grad_norm": 1.220493197441101,
"learning_rate": 2.2719441671222815e-06,
"loss": 0.0398,
"step": 1465
},
{
"epoch": 0.5765978367748279,
"grad_norm": 0.747078537940979,
"learning_rate": 2.268524516791408e-06,
"loss": 0.0313,
"step": 1466
},
{
"epoch": 0.5769911504424778,
"grad_norm": 0.7773571014404297,
"learning_rate": 2.2651053032501928e-06,
"loss": 0.0395,
"step": 1467
},
{
"epoch": 0.5773844641101278,
"grad_norm": 0.4083022177219391,
"learning_rate": 2.261686532950624e-06,
"loss": 0.0255,
"step": 1468
},
{
"epoch": 0.5777777777777777,
"grad_norm": 1.0136034488677979,
"learning_rate": 2.2582682123438547e-06,
"loss": 0.0499,
"step": 1469
},
{
"epoch": 0.5781710914454278,
"grad_norm": 1.2290290594100952,
"learning_rate": 2.254850347880187e-06,
"loss": 0.0649,
"step": 1470
},
{
"epoch": 0.5785644051130777,
"grad_norm": 1.4913883209228516,
"learning_rate": 2.2514329460090633e-06,
"loss": 0.0595,
"step": 1471
},
{
"epoch": 0.5789577187807277,
"grad_norm": 1.210160732269287,
"learning_rate": 2.248016013179054e-06,
"loss": 0.0433,
"step": 1472
},
{
"epoch": 0.5793510324483776,
"grad_norm": 0.757161557674408,
"learning_rate": 2.244599555837844e-06,
"loss": 0.035,
"step": 1473
},
{
"epoch": 0.5797443461160275,
"grad_norm": 1.0250403881072998,
"learning_rate": 2.2411835804322206e-06,
"loss": 0.0375,
"step": 1474
},
{
"epoch": 0.5801376597836775,
"grad_norm": 1.1955897808074951,
"learning_rate": 2.2377680934080625e-06,
"loss": 0.0449,
"step": 1475
},
{
"epoch": 0.5805309734513274,
"grad_norm": 1.7066453695297241,
"learning_rate": 2.2343531012103244e-06,
"loss": 0.0722,
"step": 1476
},
{
"epoch": 0.5809242871189774,
"grad_norm": 0.6709203720092773,
"learning_rate": 2.2309386102830295e-06,
"loss": 0.0354,
"step": 1477
},
{
"epoch": 0.5813176007866273,
"grad_norm": 0.9403322339057922,
"learning_rate": 2.227524627069256e-06,
"loss": 0.039,
"step": 1478
},
{
"epoch": 0.5817109144542773,
"grad_norm": 1.1907342672348022,
"learning_rate": 2.2241111580111207e-06,
"loss": 0.0894,
"step": 1479
},
{
"epoch": 0.5821042281219272,
"grad_norm": 0.9678034782409668,
"learning_rate": 2.220698209549774e-06,
"loss": 0.0492,
"step": 1480
},
{
"epoch": 0.5824975417895771,
"grad_norm": 0.5867919325828552,
"learning_rate": 2.2172857881253825e-06,
"loss": 0.0329,
"step": 1481
},
{
"epoch": 0.5828908554572272,
"grad_norm": 0.9085230827331543,
"learning_rate": 2.2138739001771157e-06,
"loss": 0.0501,
"step": 1482
},
{
"epoch": 0.5832841691248771,
"grad_norm": 1.015177845954895,
"learning_rate": 2.2104625521431396e-06,
"loss": 0.0297,
"step": 1483
},
{
"epoch": 0.5836774827925271,
"grad_norm": 0.48682698607444763,
"learning_rate": 2.207051750460601e-06,
"loss": 0.0329,
"step": 1484
},
{
"epoch": 0.584070796460177,
"grad_norm": 1.861662745475769,
"learning_rate": 2.2036415015656148e-06,
"loss": 0.0619,
"step": 1485
},
{
"epoch": 0.584464110127827,
"grad_norm": 0.9373002648353577,
"learning_rate": 2.2002318118932543e-06,
"loss": 0.0563,
"step": 1486
},
{
"epoch": 0.5848574237954769,
"grad_norm": 0.4820902943611145,
"learning_rate": 2.1968226878775347e-06,
"loss": 0.0206,
"step": 1487
},
{
"epoch": 0.5852507374631268,
"grad_norm": 0.6255022287368774,
"learning_rate": 2.1934141359514062e-06,
"loss": 0.0319,
"step": 1488
},
{
"epoch": 0.5856440511307768,
"grad_norm": 0.8468760848045349,
"learning_rate": 2.1900061625467393e-06,
"loss": 0.0574,
"step": 1489
},
{
"epoch": 0.5860373647984267,
"grad_norm": 0.519826352596283,
"learning_rate": 2.1865987740943116e-06,
"loss": 0.0595,
"step": 1490
},
{
"epoch": 0.5864306784660767,
"grad_norm": 1.6838140487670898,
"learning_rate": 2.183191977023799e-06,
"loss": 0.0549,
"step": 1491
},
{
"epoch": 0.5868239921337266,
"grad_norm": 1.3588017225265503,
"learning_rate": 2.17978577776376e-06,
"loss": 0.058,
"step": 1492
},
{
"epoch": 0.5872173058013765,
"grad_norm": 0.9913402199745178,
"learning_rate": 2.176380182741624e-06,
"loss": 0.021,
"step": 1493
},
{
"epoch": 0.5876106194690266,
"grad_norm": 1.7032448053359985,
"learning_rate": 2.172975198383682e-06,
"loss": 0.0565,
"step": 1494
},
{
"epoch": 0.5880039331366765,
"grad_norm": 0.9853689670562744,
"learning_rate": 2.169570831115072e-06,
"loss": 0.0532,
"step": 1495
},
{
"epoch": 0.5883972468043265,
"grad_norm": 1.061571717262268,
"learning_rate": 2.1661670873597686e-06,
"loss": 0.042,
"step": 1496
},
{
"epoch": 0.5887905604719764,
"grad_norm": 1.0780665874481201,
"learning_rate": 2.1627639735405683e-06,
"loss": 0.0412,
"step": 1497
},
{
"epoch": 0.5891838741396264,
"grad_norm": 1.1072509288787842,
"learning_rate": 2.1593614960790795e-06,
"loss": 0.0369,
"step": 1498
},
{
"epoch": 0.5895771878072763,
"grad_norm": 0.9231078028678894,
"learning_rate": 2.15595966139571e-06,
"loss": 0.0388,
"step": 1499
},
{
"epoch": 0.5899705014749262,
"grad_norm": 0.8702555894851685,
"learning_rate": 2.152558475909654e-06,
"loss": 0.0719,
"step": 1500
},
{
"epoch": 0.5903638151425762,
"grad_norm": 0.910358726978302,
"learning_rate": 2.149157946038882e-06,
"loss": 0.0468,
"step": 1501
},
{
"epoch": 0.5907571288102261,
"grad_norm": 1.3807059526443481,
"learning_rate": 2.145758078200126e-06,
"loss": 0.0729,
"step": 1502
},
{
"epoch": 0.5911504424778761,
"grad_norm": 0.9765854477882385,
"learning_rate": 2.1423588788088704e-06,
"loss": 0.0407,
"step": 1503
},
{
"epoch": 0.591543756145526,
"grad_norm": 1.021924376487732,
"learning_rate": 2.1389603542793364e-06,
"loss": 0.0342,
"step": 1504
},
{
"epoch": 0.591937069813176,
"grad_norm": 1.098352313041687,
"learning_rate": 2.1355625110244725e-06,
"loss": 0.0668,
"step": 1505
},
{
"epoch": 0.592330383480826,
"grad_norm": 1.5986775159835815,
"learning_rate": 2.1321653554559425e-06,
"loss": 0.0673,
"step": 1506
},
{
"epoch": 0.592723697148476,
"grad_norm": 1.2270184755325317,
"learning_rate": 2.1287688939841104e-06,
"loss": 0.0405,
"step": 1507
},
{
"epoch": 0.5931170108161259,
"grad_norm": 0.6227984428405762,
"learning_rate": 2.125373133018033e-06,
"loss": 0.0362,
"step": 1508
},
{
"epoch": 0.5935103244837758,
"grad_norm": 1.1838734149932861,
"learning_rate": 2.1219780789654436e-06,
"loss": 0.0705,
"step": 1509
},
{
"epoch": 0.5939036381514258,
"grad_norm": 1.5811330080032349,
"learning_rate": 2.1185837382327422e-06,
"loss": 0.0811,
"step": 1510
},
{
"epoch": 0.5942969518190757,
"grad_norm": 1.6723252534866333,
"learning_rate": 2.1151901172249823e-06,
"loss": 0.0711,
"step": 1511
},
{
"epoch": 0.5946902654867257,
"grad_norm": 1.1075739860534668,
"learning_rate": 2.1117972223458598e-06,
"loss": 0.0365,
"step": 1512
},
{
"epoch": 0.5950835791543756,
"grad_norm": 1.0250906944274902,
"learning_rate": 2.108405059997701e-06,
"loss": 0.0534,
"step": 1513
},
{
"epoch": 0.5954768928220255,
"grad_norm": 1.4097585678100586,
"learning_rate": 2.1050136365814484e-06,
"loss": 0.0633,
"step": 1514
},
{
"epoch": 0.5958702064896755,
"grad_norm": 1.0003234148025513,
"learning_rate": 2.10162295849665e-06,
"loss": 0.0331,
"step": 1515
},
{
"epoch": 0.5962635201573254,
"grad_norm": 1.203927755355835,
"learning_rate": 2.0982330321414495e-06,
"loss": 0.0397,
"step": 1516
},
{
"epoch": 0.5966568338249754,
"grad_norm": 1.1078671216964722,
"learning_rate": 2.094843863912571e-06,
"loss": 0.061,
"step": 1517
},
{
"epoch": 0.5970501474926254,
"grad_norm": 0.9437456130981445,
"learning_rate": 2.0914554602053072e-06,
"loss": 0.0549,
"step": 1518
},
{
"epoch": 0.5974434611602754,
"grad_norm": 0.34665971994400024,
"learning_rate": 2.0880678274135103e-06,
"loss": 0.0374,
"step": 1519
},
{
"epoch": 0.5978367748279253,
"grad_norm": 1.6303670406341553,
"learning_rate": 2.084680971929574e-06,
"loss": 0.0729,
"step": 1520
},
{
"epoch": 0.5982300884955752,
"grad_norm": 1.1011961698532104,
"learning_rate": 2.0812949001444293e-06,
"loss": 0.0399,
"step": 1521
},
{
"epoch": 0.5986234021632252,
"grad_norm": 0.8066303730010986,
"learning_rate": 2.077909618447526e-06,
"loss": 0.05,
"step": 1522
},
{
"epoch": 0.5990167158308751,
"grad_norm": 1.4448401927947998,
"learning_rate": 2.0745251332268238e-06,
"loss": 0.0616,
"step": 1523
},
{
"epoch": 0.5994100294985251,
"grad_norm": 0.49370574951171875,
"learning_rate": 2.07114145086878e-06,
"loss": 0.0496,
"step": 1524
},
{
"epoch": 0.599803343166175,
"grad_norm": 1.0275585651397705,
"learning_rate": 2.0677585777583366e-06,
"loss": 0.038,
"step": 1525
},
{
"epoch": 0.600196656833825,
"grad_norm": 1.1347780227661133,
"learning_rate": 2.0643765202789064e-06,
"loss": 0.0324,
"step": 1526
},
{
"epoch": 0.6005899705014749,
"grad_norm": 1.2602198123931885,
"learning_rate": 2.060995284812366e-06,
"loss": 0.0699,
"step": 1527
},
{
"epoch": 0.6009832841691248,
"grad_norm": 1.4369268417358398,
"learning_rate": 2.0576148777390397e-06,
"loss": 0.0664,
"step": 1528
},
{
"epoch": 0.6013765978367748,
"grad_norm": 1.8620692491531372,
"learning_rate": 2.0542353054376893e-06,
"loss": 0.0566,
"step": 1529
},
{
"epoch": 0.6017699115044248,
"grad_norm": 1.026005506515503,
"learning_rate": 2.0508565742855017e-06,
"loss": 0.023,
"step": 1530
},
{
"epoch": 0.6021632251720748,
"grad_norm": 0.8947687149047852,
"learning_rate": 2.0474786906580733e-06,
"loss": 0.0573,
"step": 1531
},
{
"epoch": 0.6025565388397247,
"grad_norm": 1.1179437637329102,
"learning_rate": 2.044101660929405e-06,
"loss": 0.0551,
"step": 1532
},
{
"epoch": 0.6029498525073747,
"grad_norm": 0.6822925806045532,
"learning_rate": 2.040725491471885e-06,
"loss": 0.0393,
"step": 1533
},
{
"epoch": 0.6033431661750246,
"grad_norm": 1.8381119966506958,
"learning_rate": 2.037350188656279e-06,
"loss": 0.0502,
"step": 1534
},
{
"epoch": 0.6037364798426745,
"grad_norm": 1.5118048191070557,
"learning_rate": 2.0339757588517165e-06,
"loss": 0.0403,
"step": 1535
},
{
"epoch": 0.6041297935103245,
"grad_norm": 1.0197237730026245,
"learning_rate": 2.0306022084256786e-06,
"loss": 0.0651,
"step": 1536
},
{
"epoch": 0.6045231071779744,
"grad_norm": 2.17777943611145,
"learning_rate": 2.027229543743989e-06,
"loss": 0.069,
"step": 1537
},
{
"epoch": 0.6049164208456244,
"grad_norm": 1.1577013731002808,
"learning_rate": 2.0238577711707987e-06,
"loss": 0.0615,
"step": 1538
},
{
"epoch": 0.6053097345132743,
"grad_norm": 1.1709601879119873,
"learning_rate": 2.0204868970685764e-06,
"loss": 0.0548,
"step": 1539
},
{
"epoch": 0.6057030481809242,
"grad_norm": 0.8054937124252319,
"learning_rate": 2.0171169277980954e-06,
"loss": 0.0479,
"step": 1540
},
{
"epoch": 0.6060963618485742,
"grad_norm": 0.9096735715866089,
"learning_rate": 2.0137478697184205e-06,
"loss": 0.0655,
"step": 1541
},
{
"epoch": 0.6064896755162242,
"grad_norm": 0.9453304409980774,
"learning_rate": 2.0103797291868977e-06,
"loss": 0.0812,
"step": 1542
},
{
"epoch": 0.6068829891838742,
"grad_norm": 0.8558923602104187,
"learning_rate": 2.0070125125591414e-06,
"loss": 0.0468,
"step": 1543
},
{
"epoch": 0.6072763028515241,
"grad_norm": 1.2030149698257446,
"learning_rate": 2.0036462261890225e-06,
"loss": 0.0542,
"step": 1544
},
{
"epoch": 0.6076696165191741,
"grad_norm": 0.9261341691017151,
"learning_rate": 2.0002808764286573e-06,
"loss": 0.0706,
"step": 1545
},
{
"epoch": 0.608062930186824,
"grad_norm": 0.7496268153190613,
"learning_rate": 1.9969164696283945e-06,
"loss": 0.0298,
"step": 1546
},
{
"epoch": 0.6084562438544739,
"grad_norm": 1.2815377712249756,
"learning_rate": 1.9935530121368023e-06,
"loss": 0.0555,
"step": 1547
},
{
"epoch": 0.6088495575221239,
"grad_norm": 0.964885413646698,
"learning_rate": 1.990190510300659e-06,
"loss": 0.0211,
"step": 1548
},
{
"epoch": 0.6092428711897738,
"grad_norm": 0.8117434978485107,
"learning_rate": 1.986828970464939e-06,
"loss": 0.0417,
"step": 1549
},
{
"epoch": 0.6096361848574238,
"grad_norm": 0.4136671721935272,
"learning_rate": 1.983468398972802e-06,
"loss": 0.0177,
"step": 1550
},
{
"epoch": 0.6100294985250737,
"grad_norm": 0.8469100594520569,
"learning_rate": 1.980108802165579e-06,
"loss": 0.0375,
"step": 1551
},
{
"epoch": 0.6104228121927237,
"grad_norm": 0.8030047416687012,
"learning_rate": 1.976750186382764e-06,
"loss": 0.0237,
"step": 1552
},
{
"epoch": 0.6108161258603736,
"grad_norm": 1.6747819185256958,
"learning_rate": 1.9733925579619965e-06,
"loss": 0.072,
"step": 1553
},
{
"epoch": 0.6112094395280236,
"grad_norm": 0.8288264870643616,
"learning_rate": 1.970035923239056e-06,
"loss": 0.0347,
"step": 1554
},
{
"epoch": 0.6116027531956736,
"grad_norm": 0.8544471859931946,
"learning_rate": 1.9666802885478463e-06,
"loss": 0.0445,
"step": 1555
},
{
"epoch": 0.6119960668633235,
"grad_norm": 0.8386610150337219,
"learning_rate": 1.963325660220384e-06,
"loss": 0.0609,
"step": 1556
},
{
"epoch": 0.6123893805309735,
"grad_norm": 1.3670865297317505,
"learning_rate": 1.9599720445867856e-06,
"loss": 0.0601,
"step": 1557
},
{
"epoch": 0.6127826941986234,
"grad_norm": 1.0806509256362915,
"learning_rate": 1.956619447975257e-06,
"loss": 0.058,
"step": 1558
},
{
"epoch": 0.6131760078662734,
"grad_norm": 0.9588520526885986,
"learning_rate": 1.9532678767120827e-06,
"loss": 0.0422,
"step": 1559
},
{
"epoch": 0.6135693215339233,
"grad_norm": 1.370969295501709,
"learning_rate": 1.9499173371216105e-06,
"loss": 0.0646,
"step": 1560
},
{
"epoch": 0.6139626352015732,
"grad_norm": 1.074244499206543,
"learning_rate": 1.946567835526243e-06,
"loss": 0.0613,
"step": 1561
},
{
"epoch": 0.6143559488692232,
"grad_norm": 0.8812416195869446,
"learning_rate": 1.943219378246423e-06,
"loss": 0.0626,
"step": 1562
},
{
"epoch": 0.6147492625368731,
"grad_norm": 1.3703498840332031,
"learning_rate": 1.9398719716006246e-06,
"loss": 0.0673,
"step": 1563
},
{
"epoch": 0.6151425762045231,
"grad_norm": 1.3188180923461914,
"learning_rate": 1.936525621905336e-06,
"loss": 0.0711,
"step": 1564
},
{
"epoch": 0.615535889872173,
"grad_norm": 0.5656819939613342,
"learning_rate": 1.9331803354750537e-06,
"loss": 0.0496,
"step": 1565
},
{
"epoch": 0.6159292035398231,
"grad_norm": 1.2018178701400757,
"learning_rate": 1.9298361186222665e-06,
"loss": 0.052,
"step": 1566
},
{
"epoch": 0.616322517207473,
"grad_norm": 1.197943091392517,
"learning_rate": 1.926492977657446e-06,
"loss": 0.0667,
"step": 1567
},
{
"epoch": 0.6167158308751229,
"grad_norm": 0.6885368227958679,
"learning_rate": 1.9231509188890345e-06,
"loss": 0.0374,
"step": 1568
},
{
"epoch": 0.6171091445427729,
"grad_norm": 0.8017690181732178,
"learning_rate": 1.919809948623428e-06,
"loss": 0.053,
"step": 1569
},
{
"epoch": 0.6175024582104228,
"grad_norm": 1.5223562717437744,
"learning_rate": 1.9164700731649723e-06,
"loss": 0.0605,
"step": 1570
},
{
"epoch": 0.6178957718780728,
"grad_norm": 1.8122631311416626,
"learning_rate": 1.913131298815947e-06,
"loss": 0.0719,
"step": 1571
},
{
"epoch": 0.6182890855457227,
"grad_norm": 1.5113699436187744,
"learning_rate": 1.9097936318765527e-06,
"loss": 0.0547,
"step": 1572
},
{
"epoch": 0.6186823992133726,
"grad_norm": 0.7732280492782593,
"learning_rate": 1.906457078644901e-06,
"loss": 0.0456,
"step": 1573
},
{
"epoch": 0.6190757128810226,
"grad_norm": 1.347740650177002,
"learning_rate": 1.903121645417003e-06,
"loss": 0.0469,
"step": 1574
},
{
"epoch": 0.6194690265486725,
"grad_norm": 0.6614682674407959,
"learning_rate": 1.8997873384867534e-06,
"loss": 0.0266,
"step": 1575
},
{
"epoch": 0.6198623402163225,
"grad_norm": 1.1419849395751953,
"learning_rate": 1.8964541641459242e-06,
"loss": 0.0465,
"step": 1576
},
{
"epoch": 0.6202556538839724,
"grad_norm": 0.9635249972343445,
"learning_rate": 1.893122128684149e-06,
"loss": 0.0482,
"step": 1577
},
{
"epoch": 0.6206489675516225,
"grad_norm": 0.9544531106948853,
"learning_rate": 1.8897912383889138e-06,
"loss": 0.0689,
"step": 1578
},
{
"epoch": 0.6210422812192724,
"grad_norm": 0.7220961451530457,
"learning_rate": 1.886461499545543e-06,
"loss": 0.0521,
"step": 1579
},
{
"epoch": 0.6214355948869223,
"grad_norm": 2.5634989738464355,
"learning_rate": 1.883132918437186e-06,
"loss": 0.0702,
"step": 1580
},
{
"epoch": 0.6218289085545723,
"grad_norm": 1.1183925867080688,
"learning_rate": 1.8798055013448105e-06,
"loss": 0.0623,
"step": 1581
},
{
"epoch": 0.6222222222222222,
"grad_norm": 0.7888696193695068,
"learning_rate": 1.8764792545471872e-06,
"loss": 0.0452,
"step": 1582
},
{
"epoch": 0.6226155358898722,
"grad_norm": 0.4925548732280731,
"learning_rate": 1.8731541843208772e-06,
"loss": 0.0481,
"step": 1583
},
{
"epoch": 0.6230088495575221,
"grad_norm": 1.184525489807129,
"learning_rate": 1.869830296940223e-06,
"loss": 0.0947,
"step": 1584
},
{
"epoch": 0.6234021632251721,
"grad_norm": 1.0969839096069336,
"learning_rate": 1.8665075986773346e-06,
"loss": 0.0786,
"step": 1585
},
{
"epoch": 0.623795476892822,
"grad_norm": 1.2557084560394287,
"learning_rate": 1.863186095802077e-06,
"loss": 0.048,
"step": 1586
},
{
"epoch": 0.6241887905604719,
"grad_norm": 0.9532119631767273,
"learning_rate": 1.8598657945820605e-06,
"loss": 0.0356,
"step": 1587
},
{
"epoch": 0.6245821042281219,
"grad_norm": 0.6121819019317627,
"learning_rate": 1.8565467012826282e-06,
"loss": 0.0395,
"step": 1588
},
{
"epoch": 0.6249754178957718,
"grad_norm": 0.9521839022636414,
"learning_rate": 1.853228822166843e-06,
"loss": 0.0417,
"step": 1589
},
{
"epoch": 0.6253687315634219,
"grad_norm": 1.3007653951644897,
"learning_rate": 1.849912163495479e-06,
"loss": 0.0376,
"step": 1590
},
{
"epoch": 0.6257620452310718,
"grad_norm": 1.0467530488967896,
"learning_rate": 1.8465967315270029e-06,
"loss": 0.0531,
"step": 1591
},
{
"epoch": 0.6261553588987218,
"grad_norm": 0.8435487747192383,
"learning_rate": 1.8432825325175707e-06,
"loss": 0.0333,
"step": 1592
},
{
"epoch": 0.6265486725663717,
"grad_norm": 1.2616933584213257,
"learning_rate": 1.8399695727210098e-06,
"loss": 0.0556,
"step": 1593
},
{
"epoch": 0.6269419862340216,
"grad_norm": 1.1721434593200684,
"learning_rate": 1.836657858388811e-06,
"loss": 0.0658,
"step": 1594
},
{
"epoch": 0.6273352999016716,
"grad_norm": 0.6084288954734802,
"learning_rate": 1.8333473957701126e-06,
"loss": 0.0385,
"step": 1595
},
{
"epoch": 0.6277286135693215,
"grad_norm": 1.4398316144943237,
"learning_rate": 1.830038191111692e-06,
"loss": 0.0606,
"step": 1596
},
{
"epoch": 0.6281219272369715,
"grad_norm": 1.9486684799194336,
"learning_rate": 1.8267302506579532e-06,
"loss": 0.0853,
"step": 1597
},
{
"epoch": 0.6285152409046214,
"grad_norm": 0.7250006794929504,
"learning_rate": 1.8234235806509145e-06,
"loss": 0.0295,
"step": 1598
},
{
"epoch": 0.6289085545722713,
"grad_norm": 1.2927533388137817,
"learning_rate": 1.8201181873301967e-06,
"loss": 0.046,
"step": 1599
},
{
"epoch": 0.6293018682399213,
"grad_norm": 1.2859911918640137,
"learning_rate": 1.816814076933012e-06,
"loss": 0.0579,
"step": 1600
},
{
"epoch": 0.6296951819075712,
"grad_norm": 1.900543451309204,
"learning_rate": 1.813511255694152e-06,
"loss": 0.0567,
"step": 1601
},
{
"epoch": 0.6300884955752213,
"grad_norm": 2.090280532836914,
"learning_rate": 1.8102097298459732e-06,
"loss": 0.0865,
"step": 1602
},
{
"epoch": 0.6304818092428712,
"grad_norm": 1.3595722913742065,
"learning_rate": 1.80690950561839e-06,
"loss": 0.0561,
"step": 1603
},
{
"epoch": 0.6308751229105212,
"grad_norm": 1.022291660308838,
"learning_rate": 1.8036105892388611e-06,
"loss": 0.0382,
"step": 1604
},
{
"epoch": 0.6312684365781711,
"grad_norm": 0.8052154779434204,
"learning_rate": 1.800312986932376e-06,
"loss": 0.0529,
"step": 1605
},
{
"epoch": 0.631661750245821,
"grad_norm": 4.667014122009277,
"learning_rate": 1.7970167049214466e-06,
"loss": 0.0492,
"step": 1606
},
{
"epoch": 0.632055063913471,
"grad_norm": 1.5009123086929321,
"learning_rate": 1.7937217494260888e-06,
"loss": 0.0779,
"step": 1607
},
{
"epoch": 0.6324483775811209,
"grad_norm": 1.570505976676941,
"learning_rate": 1.7904281266638201e-06,
"loss": 0.0577,
"step": 1608
},
{
"epoch": 0.6328416912487709,
"grad_norm": 1.3305639028549194,
"learning_rate": 1.7871358428496416e-06,
"loss": 0.0979,
"step": 1609
},
{
"epoch": 0.6332350049164208,
"grad_norm": 0.6136133074760437,
"learning_rate": 1.7838449041960276e-06,
"loss": 0.0424,
"step": 1610
},
{
"epoch": 0.6336283185840708,
"grad_norm": 0.7882452607154846,
"learning_rate": 1.7805553169129142e-06,
"loss": 0.0656,
"step": 1611
},
{
"epoch": 0.6340216322517207,
"grad_norm": 2.1648337841033936,
"learning_rate": 1.7772670872076883e-06,
"loss": 0.0622,
"step": 1612
},
{
"epoch": 0.6344149459193706,
"grad_norm": 0.5130072832107544,
"learning_rate": 1.773980221285173e-06,
"loss": 0.0394,
"step": 1613
},
{
"epoch": 0.6348082595870207,
"grad_norm": 1.0151782035827637,
"learning_rate": 1.7706947253476194e-06,
"loss": 0.0424,
"step": 1614
},
{
"epoch": 0.6352015732546706,
"grad_norm": 0.8527183532714844,
"learning_rate": 1.767410605594694e-06,
"loss": 0.0394,
"step": 1615
},
{
"epoch": 0.6355948869223206,
"grad_norm": 1.3671120405197144,
"learning_rate": 1.7641278682234658e-06,
"loss": 0.0625,
"step": 1616
},
{
"epoch": 0.6359882005899705,
"grad_norm": 0.8969728350639343,
"learning_rate": 1.7608465194283958e-06,
"loss": 0.0295,
"step": 1617
},
{
"epoch": 0.6363815142576205,
"grad_norm": 0.7407302260398865,
"learning_rate": 1.757566565401323e-06,
"loss": 0.055,
"step": 1618
},
{
"epoch": 0.6367748279252704,
"grad_norm": 1.153152346611023,
"learning_rate": 1.7542880123314559e-06,
"loss": 0.0945,
"step": 1619
},
{
"epoch": 0.6371681415929203,
"grad_norm": 1.259879231452942,
"learning_rate": 1.75101086640536e-06,
"loss": 0.0537,
"step": 1620
},
{
"epoch": 0.6375614552605703,
"grad_norm": 0.6502655744552612,
"learning_rate": 1.7477351338069442e-06,
"loss": 0.0443,
"step": 1621
},
{
"epoch": 0.6379547689282202,
"grad_norm": 0.9160225987434387,
"learning_rate": 1.7444608207174519e-06,
"loss": 0.0494,
"step": 1622
},
{
"epoch": 0.6383480825958702,
"grad_norm": 1.6503887176513672,
"learning_rate": 1.741187933315448e-06,
"loss": 0.0415,
"step": 1623
},
{
"epoch": 0.6387413962635201,
"grad_norm": 1.2449769973754883,
"learning_rate": 1.7379164777768038e-06,
"loss": 0.0607,
"step": 1624
},
{
"epoch": 0.63913470993117,
"grad_norm": 0.799196720123291,
"learning_rate": 1.734646460274692e-06,
"loss": 0.0404,
"step": 1625
},
{
"epoch": 0.6395280235988201,
"grad_norm": 1.6735135316848755,
"learning_rate": 1.7313778869795717e-06,
"loss": 0.0626,
"step": 1626
},
{
"epoch": 0.63992133726647,
"grad_norm": 1.090598702430725,
"learning_rate": 1.728110764059176e-06,
"loss": 0.0649,
"step": 1627
},
{
"epoch": 0.64031465093412,
"grad_norm": 0.6586104035377502,
"learning_rate": 1.7248450976785011e-06,
"loss": 0.0501,
"step": 1628
},
{
"epoch": 0.6407079646017699,
"grad_norm": 1.8684154748916626,
"learning_rate": 1.7215808939997945e-06,
"loss": 0.0653,
"step": 1629
},
{
"epoch": 0.6411012782694199,
"grad_norm": 1.1549500226974487,
"learning_rate": 1.7183181591825437e-06,
"loss": 0.0332,
"step": 1630
},
{
"epoch": 0.6414945919370698,
"grad_norm": 1.295351505279541,
"learning_rate": 1.7150568993834666e-06,
"loss": 0.0535,
"step": 1631
},
{
"epoch": 0.6418879056047198,
"grad_norm": 0.8795567750930786,
"learning_rate": 1.7117971207564934e-06,
"loss": 0.0866,
"step": 1632
},
{
"epoch": 0.6422812192723697,
"grad_norm": 0.6757074594497681,
"learning_rate": 1.7085388294527632e-06,
"loss": 0.0385,
"step": 1633
},
{
"epoch": 0.6426745329400196,
"grad_norm": 0.9733456373214722,
"learning_rate": 1.705282031620608e-06,
"loss": 0.0923,
"step": 1634
},
{
"epoch": 0.6430678466076696,
"grad_norm": 1.0591400861740112,
"learning_rate": 1.7020267334055393e-06,
"loss": 0.0492,
"step": 1635
},
{
"epoch": 0.6434611602753195,
"grad_norm": 0.8595137596130371,
"learning_rate": 1.6987729409502412e-06,
"loss": 0.0411,
"step": 1636
},
{
"epoch": 0.6438544739429695,
"grad_norm": 1.831631064414978,
"learning_rate": 1.6955206603945557e-06,
"loss": 0.0733,
"step": 1637
},
{
"epoch": 0.6442477876106195,
"grad_norm": 0.5861109495162964,
"learning_rate": 1.6922698978754726e-06,
"loss": 0.045,
"step": 1638
},
{
"epoch": 0.6446411012782695,
"grad_norm": 1.3072712421417236,
"learning_rate": 1.6890206595271153e-06,
"loss": 0.0713,
"step": 1639
},
{
"epoch": 0.6450344149459194,
"grad_norm": 0.8035500049591064,
"learning_rate": 1.6857729514807325e-06,
"loss": 0.0379,
"step": 1640
},
{
"epoch": 0.6454277286135693,
"grad_norm": 0.7814714312553406,
"learning_rate": 1.6825267798646851e-06,
"loss": 0.041,
"step": 1641
},
{
"epoch": 0.6458210422812193,
"grad_norm": 1.3243709802627563,
"learning_rate": 1.6792821508044352e-06,
"loss": 0.0633,
"step": 1642
},
{
"epoch": 0.6462143559488692,
"grad_norm": 0.8479057550430298,
"learning_rate": 1.6760390704225333e-06,
"loss": 0.0561,
"step": 1643
},
{
"epoch": 0.6466076696165192,
"grad_norm": 1.0051478147506714,
"learning_rate": 1.672797544838608e-06,
"loss": 0.0372,
"step": 1644
},
{
"epoch": 0.6470009832841691,
"grad_norm": 0.962547779083252,
"learning_rate": 1.6695575801693549e-06,
"loss": 0.0398,
"step": 1645
},
{
"epoch": 0.647394296951819,
"grad_norm": 1.314014196395874,
"learning_rate": 1.6663191825285214e-06,
"loss": 0.0492,
"step": 1646
},
{
"epoch": 0.647787610619469,
"grad_norm": 0.6934694647789001,
"learning_rate": 1.6630823580269005e-06,
"loss": 0.0367,
"step": 1647
},
{
"epoch": 0.6481809242871189,
"grad_norm": 1.1256476640701294,
"learning_rate": 1.6598471127723162e-06,
"loss": 0.0476,
"step": 1648
},
{
"epoch": 0.6485742379547689,
"grad_norm": 1.5946294069290161,
"learning_rate": 1.6566134528696126e-06,
"loss": 0.0484,
"step": 1649
},
{
"epoch": 0.6489675516224189,
"grad_norm": 1.1677006483078003,
"learning_rate": 1.6533813844206426e-06,
"loss": 0.0443,
"step": 1650
},
{
"epoch": 0.6493608652900689,
"grad_norm": 0.9727287292480469,
"learning_rate": 1.6501509135242533e-06,
"loss": 0.036,
"step": 1651
},
{
"epoch": 0.6497541789577188,
"grad_norm": 1.6365562677383423,
"learning_rate": 1.6469220462762807e-06,
"loss": 0.0794,
"step": 1652
},
{
"epoch": 0.6501474926253688,
"grad_norm": 0.9197725057601929,
"learning_rate": 1.6436947887695336e-06,
"loss": 0.0314,
"step": 1653
},
{
"epoch": 0.6505408062930187,
"grad_norm": 0.9444229006767273,
"learning_rate": 1.6404691470937829e-06,
"loss": 0.017,
"step": 1654
},
{
"epoch": 0.6509341199606686,
"grad_norm": 1.0287470817565918,
"learning_rate": 1.6372451273357504e-06,
"loss": 0.0674,
"step": 1655
},
{
"epoch": 0.6513274336283186,
"grad_norm": 0.9683353900909424,
"learning_rate": 1.6340227355790988e-06,
"loss": 0.0727,
"step": 1656
},
{
"epoch": 0.6517207472959685,
"grad_norm": 0.9869152903556824,
"learning_rate": 1.6308019779044154e-06,
"loss": 0.0526,
"step": 1657
},
{
"epoch": 0.6521140609636185,
"grad_norm": 2.224297046661377,
"learning_rate": 1.6275828603892078e-06,
"loss": 0.0635,
"step": 1658
},
{
"epoch": 0.6525073746312684,
"grad_norm": 0.8496151566505432,
"learning_rate": 1.6243653891078864e-06,
"loss": 0.0581,
"step": 1659
},
{
"epoch": 0.6529006882989183,
"grad_norm": 1.2158007621765137,
"learning_rate": 1.6211495701317565e-06,
"loss": 0.0728,
"step": 1660
},
{
"epoch": 0.6532940019665683,
"grad_norm": 0.48335015773773193,
"learning_rate": 1.6179354095290051e-06,
"loss": 0.0405,
"step": 1661
},
{
"epoch": 0.6536873156342183,
"grad_norm": 0.679865300655365,
"learning_rate": 1.6147229133646885e-06,
"loss": 0.0497,
"step": 1662
},
{
"epoch": 0.6540806293018683,
"grad_norm": 2.487617254257202,
"learning_rate": 1.611512087700724e-06,
"loss": 0.1029,
"step": 1663
},
{
"epoch": 0.6544739429695182,
"grad_norm": 1.0901083946228027,
"learning_rate": 1.6083029385958762e-06,
"loss": 0.0706,
"step": 1664
},
{
"epoch": 0.6548672566371682,
"grad_norm": 1.4582974910736084,
"learning_rate": 1.6050954721057461e-06,
"loss": 0.0651,
"step": 1665
},
{
"epoch": 0.6552605703048181,
"grad_norm": 1.1469032764434814,
"learning_rate": 1.6018896942827595e-06,
"loss": 0.0533,
"step": 1666
},
{
"epoch": 0.655653883972468,
"grad_norm": 1.5001522302627563,
"learning_rate": 1.5986856111761562e-06,
"loss": 0.0688,
"step": 1667
},
{
"epoch": 0.656047197640118,
"grad_norm": 0.7778475880622864,
"learning_rate": 1.595483228831976e-06,
"loss": 0.0457,
"step": 1668
},
{
"epoch": 0.6564405113077679,
"grad_norm": 0.910394549369812,
"learning_rate": 1.5922825532930526e-06,
"loss": 0.0295,
"step": 1669
},
{
"epoch": 0.6568338249754179,
"grad_norm": 1.1938371658325195,
"learning_rate": 1.5890835905989969e-06,
"loss": 0.0533,
"step": 1670
},
{
"epoch": 0.6572271386430678,
"grad_norm": 0.9362410306930542,
"learning_rate": 1.5858863467861882e-06,
"loss": 0.054,
"step": 1671
},
{
"epoch": 0.6576204523107178,
"grad_norm": 0.5481738448143005,
"learning_rate": 1.582690827887763e-06,
"loss": 0.037,
"step": 1672
},
{
"epoch": 0.6580137659783677,
"grad_norm": 0.8186729550361633,
"learning_rate": 1.5794970399336012e-06,
"loss": 0.0355,
"step": 1673
},
{
"epoch": 0.6584070796460177,
"grad_norm": 0.885360598564148,
"learning_rate": 1.576304988950318e-06,
"loss": 0.0478,
"step": 1674
},
{
"epoch": 0.6588003933136677,
"grad_norm": 1.0103771686553955,
"learning_rate": 1.5731146809612508e-06,
"loss": 0.0562,
"step": 1675
},
{
"epoch": 0.6591937069813176,
"grad_norm": 0.9461012482643127,
"learning_rate": 1.569926121986447e-06,
"loss": 0.0301,
"step": 1676
},
{
"epoch": 0.6595870206489676,
"grad_norm": 1.5684260129928589,
"learning_rate": 1.566739318042655e-06,
"loss": 0.0339,
"step": 1677
},
{
"epoch": 0.6599803343166175,
"grad_norm": 0.7456137537956238,
"learning_rate": 1.56355427514331e-06,
"loss": 0.0592,
"step": 1678
},
{
"epoch": 0.6603736479842675,
"grad_norm": 1.6279810667037964,
"learning_rate": 1.5603709992985256e-06,
"loss": 0.0452,
"step": 1679
},
{
"epoch": 0.6607669616519174,
"grad_norm": 1.3496975898742676,
"learning_rate": 1.5571894965150796e-06,
"loss": 0.058,
"step": 1680
},
{
"epoch": 0.6611602753195673,
"grad_norm": 1.0409663915634155,
"learning_rate": 1.554009772796406e-06,
"loss": 0.0635,
"step": 1681
},
{
"epoch": 0.6615535889872173,
"grad_norm": 0.6893079876899719,
"learning_rate": 1.55083183414258e-06,
"loss": 0.042,
"step": 1682
},
{
"epoch": 0.6619469026548672,
"grad_norm": 1.3735069036483765,
"learning_rate": 1.5476556865503095e-06,
"loss": 0.0418,
"step": 1683
},
{
"epoch": 0.6623402163225172,
"grad_norm": 0.9965916275978088,
"learning_rate": 1.5444813360129207e-06,
"loss": 0.0436,
"step": 1684
},
{
"epoch": 0.6627335299901671,
"grad_norm": 0.41811513900756836,
"learning_rate": 1.5413087885203515e-06,
"loss": 0.032,
"step": 1685
},
{
"epoch": 0.6631268436578172,
"grad_norm": 1.2320137023925781,
"learning_rate": 1.538138050059136e-06,
"loss": 0.0588,
"step": 1686
},
{
"epoch": 0.6635201573254671,
"grad_norm": 1.2540123462677002,
"learning_rate": 1.5349691266123946e-06,
"loss": 0.0527,
"step": 1687
},
{
"epoch": 0.663913470993117,
"grad_norm": 0.8406708240509033,
"learning_rate": 1.5318020241598248e-06,
"loss": 0.0479,
"step": 1688
},
{
"epoch": 0.664306784660767,
"grad_norm": 1.1033174991607666,
"learning_rate": 1.5286367486776835e-06,
"loss": 0.0566,
"step": 1689
},
{
"epoch": 0.6647000983284169,
"grad_norm": 1.4875179529190063,
"learning_rate": 1.5254733061387846e-06,
"loss": 0.0566,
"step": 1690
},
{
"epoch": 0.6650934119960669,
"grad_norm": 1.0827391147613525,
"learning_rate": 1.5223117025124817e-06,
"loss": 0.0333,
"step": 1691
},
{
"epoch": 0.6654867256637168,
"grad_norm": 1.2373061180114746,
"learning_rate": 1.5191519437646576e-06,
"loss": 0.048,
"step": 1692
},
{
"epoch": 0.6658800393313667,
"grad_norm": 0.9508680701255798,
"learning_rate": 1.5159940358577151e-06,
"loss": 0.0499,
"step": 1693
},
{
"epoch": 0.6662733529990167,
"grad_norm": 0.4500909447669983,
"learning_rate": 1.512837984750565e-06,
"loss": 0.0207,
"step": 1694
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.83719003200531,
"learning_rate": 1.5096837963986112e-06,
"loss": 0.0541,
"step": 1695
},
{
"epoch": 0.6670599803343166,
"grad_norm": 1.0231764316558838,
"learning_rate": 1.5065314767537453e-06,
"loss": 0.0255,
"step": 1696
},
{
"epoch": 0.6674532940019666,
"grad_norm": 0.8618975877761841,
"learning_rate": 1.5033810317643327e-06,
"loss": 0.0398,
"step": 1697
},
{
"epoch": 0.6678466076696166,
"grad_norm": 0.40866029262542725,
"learning_rate": 1.5002324673752006e-06,
"loss": 0.031,
"step": 1698
},
{
"epoch": 0.6682399213372665,
"grad_norm": 0.7475729584693909,
"learning_rate": 1.4970857895276285e-06,
"loss": 0.0534,
"step": 1699
},
{
"epoch": 0.6686332350049164,
"grad_norm": 1.0545064210891724,
"learning_rate": 1.4939410041593338e-06,
"loss": 0.0451,
"step": 1700
},
{
"epoch": 0.6690265486725664,
"grad_norm": 1.023006796836853,
"learning_rate": 1.4907981172044647e-06,
"loss": 0.0594,
"step": 1701
},
{
"epoch": 0.6694198623402163,
"grad_norm": 0.9975923299789429,
"learning_rate": 1.487657134593587e-06,
"loss": 0.0634,
"step": 1702
},
{
"epoch": 0.6698131760078663,
"grad_norm": 1.2105883359909058,
"learning_rate": 1.4845180622536728e-06,
"loss": 0.0482,
"step": 1703
},
{
"epoch": 0.6702064896755162,
"grad_norm": 1.007332682609558,
"learning_rate": 1.4813809061080893e-06,
"loss": 0.0706,
"step": 1704
},
{
"epoch": 0.6705998033431662,
"grad_norm": 0.7119497060775757,
"learning_rate": 1.4782456720765895e-06,
"loss": 0.0409,
"step": 1705
},
{
"epoch": 0.6709931170108161,
"grad_norm": 1.0542527437210083,
"learning_rate": 1.4751123660752955e-06,
"loss": 0.0388,
"step": 1706
},
{
"epoch": 0.671386430678466,
"grad_norm": 2.3204405307769775,
"learning_rate": 1.4719809940166952e-06,
"loss": 0.0724,
"step": 1707
},
{
"epoch": 0.671779744346116,
"grad_norm": 0.5740649700164795,
"learning_rate": 1.4688515618096252e-06,
"loss": 0.0319,
"step": 1708
},
{
"epoch": 0.672173058013766,
"grad_norm": 0.9803503155708313,
"learning_rate": 1.4657240753592627e-06,
"loss": 0.0504,
"step": 1709
},
{
"epoch": 0.672566371681416,
"grad_norm": 0.8115725517272949,
"learning_rate": 1.462598540567113e-06,
"loss": 0.0605,
"step": 1710
},
{
"epoch": 0.6729596853490659,
"grad_norm": 1.3304479122161865,
"learning_rate": 1.4594749633309981e-06,
"loss": 0.0758,
"step": 1711
},
{
"epoch": 0.6733529990167159,
"grad_norm": 1.208067774772644,
"learning_rate": 1.456353349545046e-06,
"loss": 0.0706,
"step": 1712
},
{
"epoch": 0.6737463126843658,
"grad_norm": 1.1107121706008911,
"learning_rate": 1.4532337050996804e-06,
"loss": 0.0468,
"step": 1713
},
{
"epoch": 0.6741396263520157,
"grad_norm": 1.192116618156433,
"learning_rate": 1.4501160358816085e-06,
"loss": 0.0657,
"step": 1714
},
{
"epoch": 0.6745329400196657,
"grad_norm": 1.0967481136322021,
"learning_rate": 1.4470003477738111e-06,
"loss": 0.0499,
"step": 1715
},
{
"epoch": 0.6749262536873156,
"grad_norm": 1.3263583183288574,
"learning_rate": 1.4438866466555308e-06,
"loss": 0.0449,
"step": 1716
},
{
"epoch": 0.6753195673549656,
"grad_norm": 1.5055456161499023,
"learning_rate": 1.4407749384022576e-06,
"loss": 0.0489,
"step": 1717
},
{
"epoch": 0.6757128810226155,
"grad_norm": 1.5726017951965332,
"learning_rate": 1.4376652288857249e-06,
"loss": 0.0626,
"step": 1718
},
{
"epoch": 0.6761061946902654,
"grad_norm": 1.6234389543533325,
"learning_rate": 1.4345575239738928e-06,
"loss": 0.0606,
"step": 1719
},
{
"epoch": 0.6764995083579154,
"grad_norm": 1.7149680852890015,
"learning_rate": 1.431451829530939e-06,
"loss": 0.0527,
"step": 1720
},
{
"epoch": 0.6768928220255654,
"grad_norm": 0.8043215870857239,
"learning_rate": 1.4283481514172487e-06,
"loss": 0.0454,
"step": 1721
},
{
"epoch": 0.6772861356932154,
"grad_norm": 1.3794721364974976,
"learning_rate": 1.425246495489399e-06,
"loss": 0.0522,
"step": 1722
},
{
"epoch": 0.6776794493608653,
"grad_norm": 0.7596322298049927,
"learning_rate": 1.4221468676001544e-06,
"loss": 0.0507,
"step": 1723
},
{
"epoch": 0.6780727630285153,
"grad_norm": 0.9277907013893127,
"learning_rate": 1.419049273598451e-06,
"loss": 0.0406,
"step": 1724
},
{
"epoch": 0.6784660766961652,
"grad_norm": 1.7175707817077637,
"learning_rate": 1.4159537193293876e-06,
"loss": 0.0477,
"step": 1725
},
{
"epoch": 0.6788593903638152,
"grad_norm": 0.5326056480407715,
"learning_rate": 1.4128602106342154e-06,
"loss": 0.0248,
"step": 1726
},
{
"epoch": 0.6792527040314651,
"grad_norm": 1.259993314743042,
"learning_rate": 1.4097687533503213e-06,
"loss": 0.05,
"step": 1727
},
{
"epoch": 0.679646017699115,
"grad_norm": 0.9844882488250732,
"learning_rate": 1.4066793533112255e-06,
"loss": 0.0407,
"step": 1728
},
{
"epoch": 0.680039331366765,
"grad_norm": 1.6221920251846313,
"learning_rate": 1.4035920163465648e-06,
"loss": 0.0589,
"step": 1729
},
{
"epoch": 0.6804326450344149,
"grad_norm": 2.0537407398223877,
"learning_rate": 1.400506748282083e-06,
"loss": 0.0622,
"step": 1730
},
{
"epoch": 0.6808259587020649,
"grad_norm": 1.1460561752319336,
"learning_rate": 1.3974235549396198e-06,
"loss": 0.0448,
"step": 1731
},
{
"epoch": 0.6812192723697148,
"grad_norm": 1.2280306816101074,
"learning_rate": 1.3943424421370998e-06,
"loss": 0.0621,
"step": 1732
},
{
"epoch": 0.6816125860373649,
"grad_norm": 1.9272797107696533,
"learning_rate": 1.3912634156885235e-06,
"loss": 0.0559,
"step": 1733
},
{
"epoch": 0.6820058997050148,
"grad_norm": 0.8985779285430908,
"learning_rate": 1.3881864814039503e-06,
"loss": 0.0568,
"step": 1734
},
{
"epoch": 0.6823992133726647,
"grad_norm": 0.5459672808647156,
"learning_rate": 1.3851116450894959e-06,
"loss": 0.03,
"step": 1735
},
{
"epoch": 0.6827925270403147,
"grad_norm": 0.8683139085769653,
"learning_rate": 1.382038912547315e-06,
"loss": 0.0513,
"step": 1736
},
{
"epoch": 0.6831858407079646,
"grad_norm": 0.7696962952613831,
"learning_rate": 1.3789682895755935e-06,
"loss": 0.0448,
"step": 1737
},
{
"epoch": 0.6835791543756146,
"grad_norm": 1.2431952953338623,
"learning_rate": 1.3758997819685366e-06,
"loss": 0.0493,
"step": 1738
},
{
"epoch": 0.6839724680432645,
"grad_norm": 0.9553192853927612,
"learning_rate": 1.3728333955163565e-06,
"loss": 0.0321,
"step": 1739
},
{
"epoch": 0.6843657817109144,
"grad_norm": 1.2432819604873657,
"learning_rate": 1.3697691360052646e-06,
"loss": 0.0744,
"step": 1740
},
{
"epoch": 0.6847590953785644,
"grad_norm": 0.6021830439567566,
"learning_rate": 1.3667070092174587e-06,
"loss": 0.0471,
"step": 1741
},
{
"epoch": 0.6851524090462143,
"grad_norm": 1.0340098142623901,
"learning_rate": 1.3636470209311093e-06,
"loss": 0.0645,
"step": 1742
},
{
"epoch": 0.6855457227138643,
"grad_norm": 1.2661107778549194,
"learning_rate": 1.360589176920355e-06,
"loss": 0.0314,
"step": 1743
},
{
"epoch": 0.6859390363815142,
"grad_norm": 1.7685880661010742,
"learning_rate": 1.357533482955287e-06,
"loss": 0.0635,
"step": 1744
},
{
"epoch": 0.6863323500491643,
"grad_norm": 1.249866008758545,
"learning_rate": 1.354479944801939e-06,
"loss": 0.0257,
"step": 1745
},
{
"epoch": 0.6867256637168142,
"grad_norm": 0.8888324499130249,
"learning_rate": 1.3514285682222777e-06,
"loss": 0.0501,
"step": 1746
},
{
"epoch": 0.6871189773844641,
"grad_norm": 0.9306212067604065,
"learning_rate": 1.3483793589741901e-06,
"loss": 0.0535,
"step": 1747
},
{
"epoch": 0.6875122910521141,
"grad_norm": 1.239108920097351,
"learning_rate": 1.3453323228114745e-06,
"loss": 0.0645,
"step": 1748
},
{
"epoch": 0.687905604719764,
"grad_norm": 1.971179723739624,
"learning_rate": 1.3422874654838263e-06,
"loss": 0.0617,
"step": 1749
},
{
"epoch": 0.688298918387414,
"grad_norm": 0.8780958652496338,
"learning_rate": 1.3392447927368315e-06,
"loss": 0.0303,
"step": 1750
},
{
"epoch": 0.6886922320550639,
"grad_norm": 0.5229460000991821,
"learning_rate": 1.3362043103119537e-06,
"loss": 0.0408,
"step": 1751
},
{
"epoch": 0.6890855457227139,
"grad_norm": 1.0178303718566895,
"learning_rate": 1.3331660239465232e-06,
"loss": 0.0692,
"step": 1752
},
{
"epoch": 0.6894788593903638,
"grad_norm": 1.1098684072494507,
"learning_rate": 1.3301299393737262e-06,
"loss": 0.0553,
"step": 1753
},
{
"epoch": 0.6898721730580137,
"grad_norm": 0.9905382990837097,
"learning_rate": 1.3270960623225953e-06,
"loss": 0.0551,
"step": 1754
},
{
"epoch": 0.6902654867256637,
"grad_norm": 1.15705406665802,
"learning_rate": 1.324064398517994e-06,
"loss": 0.0606,
"step": 1755
},
{
"epoch": 0.6906588003933136,
"grad_norm": 0.7547001838684082,
"learning_rate": 1.3210349536806138e-06,
"loss": 0.0375,
"step": 1756
},
{
"epoch": 0.6910521140609637,
"grad_norm": 0.9143390655517578,
"learning_rate": 1.3180077335269565e-06,
"loss": 0.0557,
"step": 1757
},
{
"epoch": 0.6914454277286136,
"grad_norm": 1.5813028812408447,
"learning_rate": 1.3149827437693267e-06,
"loss": 0.0734,
"step": 1758
},
{
"epoch": 0.6918387413962636,
"grad_norm": 1.3135156631469727,
"learning_rate": 1.3119599901158214e-06,
"loss": 0.0454,
"step": 1759
},
{
"epoch": 0.6922320550639135,
"grad_norm": 1.3713979721069336,
"learning_rate": 1.3089394782703152e-06,
"loss": 0.0459,
"step": 1760
},
{
"epoch": 0.6926253687315634,
"grad_norm": 1.0648804903030396,
"learning_rate": 1.3059212139324548e-06,
"loss": 0.0562,
"step": 1761
},
{
"epoch": 0.6930186823992134,
"grad_norm": 0.8367137312889099,
"learning_rate": 1.3029052027976457e-06,
"loss": 0.0269,
"step": 1762
},
{
"epoch": 0.6934119960668633,
"grad_norm": 1.1222723722457886,
"learning_rate": 1.299891450557041e-06,
"loss": 0.0458,
"step": 1763
},
{
"epoch": 0.6938053097345133,
"grad_norm": 1.087550163269043,
"learning_rate": 1.2968799628975311e-06,
"loss": 0.0357,
"step": 1764
},
{
"epoch": 0.6941986234021632,
"grad_norm": 0.8797011375427246,
"learning_rate": 1.2938707455017358e-06,
"loss": 0.0459,
"step": 1765
},
{
"epoch": 0.6945919370698131,
"grad_norm": 1.4389101266860962,
"learning_rate": 1.2908638040479855e-06,
"loss": 0.0715,
"step": 1766
},
{
"epoch": 0.6949852507374631,
"grad_norm": 0.826977014541626,
"learning_rate": 1.2878591442103215e-06,
"loss": 0.0498,
"step": 1767
},
{
"epoch": 0.695378564405113,
"grad_norm": 1.2073124647140503,
"learning_rate": 1.2848567716584764e-06,
"loss": 0.0401,
"step": 1768
},
{
"epoch": 0.6957718780727631,
"grad_norm": 1.2512377500534058,
"learning_rate": 1.2818566920578684e-06,
"loss": 0.0545,
"step": 1769
},
{
"epoch": 0.696165191740413,
"grad_norm": 1.003304123878479,
"learning_rate": 1.2788589110695896e-06,
"loss": 0.0657,
"step": 1770
},
{
"epoch": 0.696558505408063,
"grad_norm": 1.6829479932785034,
"learning_rate": 1.275863434350391e-06,
"loss": 0.0488,
"step": 1771
},
{
"epoch": 0.6969518190757129,
"grad_norm": 1.0957913398742676,
"learning_rate": 1.2728702675526788e-06,
"loss": 0.0695,
"step": 1772
},
{
"epoch": 0.6973451327433628,
"grad_norm": 1.2029186487197876,
"learning_rate": 1.2698794163244998e-06,
"loss": 0.0574,
"step": 1773
},
{
"epoch": 0.6977384464110128,
"grad_norm": 0.8925944566726685,
"learning_rate": 1.2668908863095311e-06,
"loss": 0.0424,
"step": 1774
},
{
"epoch": 0.6981317600786627,
"grad_norm": 0.8353788256645203,
"learning_rate": 1.2639046831470697e-06,
"loss": 0.038,
"step": 1775
},
{
"epoch": 0.6985250737463127,
"grad_norm": 2.284682273864746,
"learning_rate": 1.2609208124720228e-06,
"loss": 0.0687,
"step": 1776
},
{
"epoch": 0.6989183874139626,
"grad_norm": 0.9992805123329163,
"learning_rate": 1.2579392799148938e-06,
"loss": 0.0401,
"step": 1777
},
{
"epoch": 0.6993117010816126,
"grad_norm": 1.329393744468689,
"learning_rate": 1.2549600911017761e-06,
"loss": 0.0768,
"step": 1778
},
{
"epoch": 0.6997050147492625,
"grad_norm": 1.184579849243164,
"learning_rate": 1.25198325165434e-06,
"loss": 0.0467,
"step": 1779
},
{
"epoch": 0.7000983284169124,
"grad_norm": 0.6934780478477478,
"learning_rate": 1.2490087671898234e-06,
"loss": 0.0454,
"step": 1780
},
{
"epoch": 0.7004916420845625,
"grad_norm": 0.5612182021141052,
"learning_rate": 1.24603664332102e-06,
"loss": 0.0397,
"step": 1781
},
{
"epoch": 0.7008849557522124,
"grad_norm": 1.493826985359192,
"learning_rate": 1.243066885656267e-06,
"loss": 0.0815,
"step": 1782
},
{
"epoch": 0.7012782694198624,
"grad_norm": 0.7363511323928833,
"learning_rate": 1.240099499799439e-06,
"loss": 0.0496,
"step": 1783
},
{
"epoch": 0.7016715830875123,
"grad_norm": 1.6472634077072144,
"learning_rate": 1.237134491349935e-06,
"loss": 0.0741,
"step": 1784
},
{
"epoch": 0.7020648967551623,
"grad_norm": 1.3183567523956299,
"learning_rate": 1.234171865902667e-06,
"loss": 0.043,
"step": 1785
},
{
"epoch": 0.7024582104228122,
"grad_norm": 1.0543493032455444,
"learning_rate": 1.2312116290480506e-06,
"loss": 0.0401,
"step": 1786
},
{
"epoch": 0.7028515240904621,
"grad_norm": 0.8686029314994812,
"learning_rate": 1.228253786371995e-06,
"loss": 0.0335,
"step": 1787
},
{
"epoch": 0.7032448377581121,
"grad_norm": 1.9254342317581177,
"learning_rate": 1.2252983434558894e-06,
"loss": 0.0361,
"step": 1788
},
{
"epoch": 0.703638151425762,
"grad_norm": 0.8810344338417053,
"learning_rate": 1.2223453058765966e-06,
"loss": 0.0442,
"step": 1789
},
{
"epoch": 0.704031465093412,
"grad_norm": 1.138178825378418,
"learning_rate": 1.2193946792064403e-06,
"loss": 0.0768,
"step": 1790
},
{
"epoch": 0.7044247787610619,
"grad_norm": 0.7755922675132751,
"learning_rate": 1.2164464690131947e-06,
"loss": 0.0303,
"step": 1791
},
{
"epoch": 0.7048180924287119,
"grad_norm": 1.5868074893951416,
"learning_rate": 1.2135006808600752e-06,
"loss": 0.052,
"step": 1792
},
{
"epoch": 0.7052114060963619,
"grad_norm": 0.9672881364822388,
"learning_rate": 1.2105573203057233e-06,
"loss": 0.0432,
"step": 1793
},
{
"epoch": 0.7056047197640118,
"grad_norm": 0.9986976981163025,
"learning_rate": 1.207616392904204e-06,
"loss": 0.0464,
"step": 1794
},
{
"epoch": 0.7059980334316618,
"grad_norm": 0.646554708480835,
"learning_rate": 1.2046779042049883e-06,
"loss": 0.0268,
"step": 1795
},
{
"epoch": 0.7063913470993117,
"grad_norm": 0.6818554997444153,
"learning_rate": 1.2017418597529464e-06,
"loss": 0.0521,
"step": 1796
},
{
"epoch": 0.7067846607669617,
"grad_norm": 0.5991765260696411,
"learning_rate": 1.1988082650883376e-06,
"loss": 0.0538,
"step": 1797
},
{
"epoch": 0.7071779744346116,
"grad_norm": 1.1525814533233643,
"learning_rate": 1.1958771257467946e-06,
"loss": 0.0451,
"step": 1798
},
{
"epoch": 0.7075712881022616,
"grad_norm": 0.8486371040344238,
"learning_rate": 1.1929484472593205e-06,
"loss": 0.0514,
"step": 1799
},
{
"epoch": 0.7079646017699115,
"grad_norm": 1.393419623374939,
"learning_rate": 1.190022235152274e-06,
"loss": 0.0609,
"step": 1800
},
{
"epoch": 0.7083579154375614,
"grad_norm": 0.7574542760848999,
"learning_rate": 1.1870984949473586e-06,
"loss": 0.0604,
"step": 1801
},
{
"epoch": 0.7087512291052114,
"grad_norm": 1.0601574182510376,
"learning_rate": 1.184177232161615e-06,
"loss": 0.0459,
"step": 1802
},
{
"epoch": 0.7091445427728613,
"grad_norm": 0.7535306811332703,
"learning_rate": 1.1812584523074089e-06,
"loss": 0.0351,
"step": 1803
},
{
"epoch": 0.7095378564405113,
"grad_norm": 1.3023512363433838,
"learning_rate": 1.1783421608924183e-06,
"loss": 0.0598,
"step": 1804
},
{
"epoch": 0.7099311701081613,
"grad_norm": 1.1070560216903687,
"learning_rate": 1.1754283634196285e-06,
"loss": 0.0471,
"step": 1805
},
{
"epoch": 0.7103244837758113,
"grad_norm": 0.9613627791404724,
"learning_rate": 1.1725170653873174e-06,
"loss": 0.0486,
"step": 1806
},
{
"epoch": 0.7107177974434612,
"grad_norm": 0.7932494282722473,
"learning_rate": 1.1696082722890474e-06,
"loss": 0.0774,
"step": 1807
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.684893786907196,
"learning_rate": 1.1667019896136539e-06,
"loss": 0.0454,
"step": 1808
},
{
"epoch": 0.7115044247787611,
"grad_norm": 1.3207006454467773,
"learning_rate": 1.1637982228452329e-06,
"loss": 0.0473,
"step": 1809
},
{
"epoch": 0.711897738446411,
"grad_norm": 1.3429388999938965,
"learning_rate": 1.1608969774631366e-06,
"loss": 0.0412,
"step": 1810
},
{
"epoch": 0.712291052114061,
"grad_norm": 1.4132349491119385,
"learning_rate": 1.1579982589419568e-06,
"loss": 0.0549,
"step": 1811
},
{
"epoch": 0.7126843657817109,
"grad_norm": 0.7561691999435425,
"learning_rate": 1.155102072751518e-06,
"loss": 0.0337,
"step": 1812
},
{
"epoch": 0.7130776794493608,
"grad_norm": 0.7749929428100586,
"learning_rate": 1.152208424356867e-06,
"loss": 0.034,
"step": 1813
},
{
"epoch": 0.7134709931170108,
"grad_norm": 1.1324396133422852,
"learning_rate": 1.1493173192182613e-06,
"loss": 0.032,
"step": 1814
},
{
"epoch": 0.7138643067846607,
"grad_norm": 0.7702449560165405,
"learning_rate": 1.1464287627911577e-06,
"loss": 0.0451,
"step": 1815
},
{
"epoch": 0.7142576204523107,
"grad_norm": 0.7402438521385193,
"learning_rate": 1.1435427605262057e-06,
"loss": 0.0489,
"step": 1816
},
{
"epoch": 0.7146509341199607,
"grad_norm": 1.3986225128173828,
"learning_rate": 1.1406593178692346e-06,
"loss": 0.0463,
"step": 1817
},
{
"epoch": 0.7150442477876107,
"grad_norm": 0.7235271334648132,
"learning_rate": 1.1377784402612439e-06,
"loss": 0.0519,
"step": 1818
},
{
"epoch": 0.7154375614552606,
"grad_norm": 0.8625795841217041,
"learning_rate": 1.1349001331383921e-06,
"loss": 0.0375,
"step": 1819
},
{
"epoch": 0.7158308751229105,
"grad_norm": 1.5163322687149048,
"learning_rate": 1.132024401931988e-06,
"loss": 0.0557,
"step": 1820
},
{
"epoch": 0.7162241887905605,
"grad_norm": 0.6675801277160645,
"learning_rate": 1.12915125206848e-06,
"loss": 0.0261,
"step": 1821
},
{
"epoch": 0.7166175024582104,
"grad_norm": 0.9029967188835144,
"learning_rate": 1.1262806889694455e-06,
"loss": 0.037,
"step": 1822
},
{
"epoch": 0.7170108161258604,
"grad_norm": 0.716080367565155,
"learning_rate": 1.1234127180515787e-06,
"loss": 0.0559,
"step": 1823
},
{
"epoch": 0.7174041297935103,
"grad_norm": 0.9414195418357849,
"learning_rate": 1.1205473447266843e-06,
"loss": 0.0466,
"step": 1824
},
{
"epoch": 0.7177974434611603,
"grad_norm": 0.9414455890655518,
"learning_rate": 1.117684574401666e-06,
"loss": 0.0408,
"step": 1825
},
{
"epoch": 0.7181907571288102,
"grad_norm": 0.6914128065109253,
"learning_rate": 1.1148244124785143e-06,
"loss": 0.0286,
"step": 1826
},
{
"epoch": 0.7185840707964601,
"grad_norm": 1.238477349281311,
"learning_rate": 1.111966864354298e-06,
"loss": 0.0606,
"step": 1827
},
{
"epoch": 0.7189773844641101,
"grad_norm": 1.5670506954193115,
"learning_rate": 1.1091119354211544e-06,
"loss": 0.045,
"step": 1828
},
{
"epoch": 0.7193706981317601,
"grad_norm": 1.5129029750823975,
"learning_rate": 1.1062596310662775e-06,
"loss": 0.0352,
"step": 1829
},
{
"epoch": 0.7197640117994101,
"grad_norm": 1.0257515907287598,
"learning_rate": 1.1034099566719104e-06,
"loss": 0.0267,
"step": 1830
},
{
"epoch": 0.72015732546706,
"grad_norm": 0.8426341414451599,
"learning_rate": 1.1005629176153302e-06,
"loss": 0.0331,
"step": 1831
},
{
"epoch": 0.72055063913471,
"grad_norm": 1.1478296518325806,
"learning_rate": 1.097718519268844e-06,
"loss": 0.0601,
"step": 1832
},
{
"epoch": 0.7209439528023599,
"grad_norm": 1.6983435153961182,
"learning_rate": 1.0948767669997762e-06,
"loss": 0.0671,
"step": 1833
},
{
"epoch": 0.7213372664700098,
"grad_norm": 0.992310643196106,
"learning_rate": 1.092037666170456e-06,
"loss": 0.0554,
"step": 1834
},
{
"epoch": 0.7217305801376598,
"grad_norm": 1.258967399597168,
"learning_rate": 1.0892012221382115e-06,
"loss": 0.0423,
"step": 1835
},
{
"epoch": 0.7221238938053097,
"grad_norm": 0.8152772188186646,
"learning_rate": 1.0863674402553564e-06,
"loss": 0.0638,
"step": 1836
},
{
"epoch": 0.7225172074729597,
"grad_norm": 0.8680564165115356,
"learning_rate": 1.08353632586918e-06,
"loss": 0.0322,
"step": 1837
},
{
"epoch": 0.7229105211406096,
"grad_norm": 0.4944194257259369,
"learning_rate": 1.0807078843219395e-06,
"loss": 0.0684,
"step": 1838
},
{
"epoch": 0.7233038348082595,
"grad_norm": 1.0787291526794434,
"learning_rate": 1.077882120950849e-06,
"loss": 0.0355,
"step": 1839
},
{
"epoch": 0.7236971484759095,
"grad_norm": 0.4451111853122711,
"learning_rate": 1.0750590410880671e-06,
"loss": 0.0291,
"step": 1840
},
{
"epoch": 0.7240904621435595,
"grad_norm": 0.48384201526641846,
"learning_rate": 1.072238650060691e-06,
"loss": 0.0344,
"step": 1841
},
{
"epoch": 0.7244837758112095,
"grad_norm": 1.1826977729797363,
"learning_rate": 1.0694209531907412e-06,
"loss": 0.0302,
"step": 1842
},
{
"epoch": 0.7248770894788594,
"grad_norm": 0.5904631614685059,
"learning_rate": 1.0666059557951566e-06,
"loss": 0.0268,
"step": 1843
},
{
"epoch": 0.7252704031465094,
"grad_norm": 0.7693639993667603,
"learning_rate": 1.0637936631857815e-06,
"loss": 0.0329,
"step": 1844
},
{
"epoch": 0.7256637168141593,
"grad_norm": 1.1267420053482056,
"learning_rate": 1.0609840806693567e-06,
"loss": 0.0584,
"step": 1845
},
{
"epoch": 0.7260570304818093,
"grad_norm": 0.8826761841773987,
"learning_rate": 1.0581772135475089e-06,
"loss": 0.0371,
"step": 1846
},
{
"epoch": 0.7264503441494592,
"grad_norm": 0.9510964751243591,
"learning_rate": 1.0553730671167412e-06,
"loss": 0.0366,
"step": 1847
},
{
"epoch": 0.7268436578171091,
"grad_norm": 1.4061312675476074,
"learning_rate": 1.052571646668421e-06,
"loss": 0.0548,
"step": 1848
},
{
"epoch": 0.7272369714847591,
"grad_norm": 1.7235345840454102,
"learning_rate": 1.0497729574887744e-06,
"loss": 0.0729,
"step": 1849
},
{
"epoch": 0.727630285152409,
"grad_norm": 1.10977041721344,
"learning_rate": 1.0469770048588723e-06,
"loss": 0.042,
"step": 1850
},
{
"epoch": 0.728023598820059,
"grad_norm": 1.054607629776001,
"learning_rate": 1.0441837940546217e-06,
"loss": 0.0286,
"step": 1851
},
{
"epoch": 0.7284169124877089,
"grad_norm": 1.315953016281128,
"learning_rate": 1.0413933303467578e-06,
"loss": 0.0415,
"step": 1852
},
{
"epoch": 0.728810226155359,
"grad_norm": 1.4497429132461548,
"learning_rate": 1.038605619000828e-06,
"loss": 0.0566,
"step": 1853
},
{
"epoch": 0.7292035398230089,
"grad_norm": 1.1214773654937744,
"learning_rate": 1.0358206652771896e-06,
"loss": 0.0388,
"step": 1854
},
{
"epoch": 0.7295968534906588,
"grad_norm": 0.8499764204025269,
"learning_rate": 1.033038474430995e-06,
"loss": 0.022,
"step": 1855
},
{
"epoch": 0.7299901671583088,
"grad_norm": 0.993175745010376,
"learning_rate": 1.0302590517121835e-06,
"loss": 0.0351,
"step": 1856
},
{
"epoch": 0.7303834808259587,
"grad_norm": 1.3063788414001465,
"learning_rate": 1.0274824023654717e-06,
"loss": 0.049,
"step": 1857
},
{
"epoch": 0.7307767944936087,
"grad_norm": 0.6438285112380981,
"learning_rate": 1.0247085316303401e-06,
"loss": 0.0322,
"step": 1858
},
{
"epoch": 0.7311701081612586,
"grad_norm": 1.801291823387146,
"learning_rate": 1.0219374447410289e-06,
"loss": 0.0724,
"step": 1859
},
{
"epoch": 0.7315634218289085,
"grad_norm": 1.5461159944534302,
"learning_rate": 1.019169146926524e-06,
"loss": 0.0466,
"step": 1860
},
{
"epoch": 0.7319567354965585,
"grad_norm": 1.0814778804779053,
"learning_rate": 1.016403643410549e-06,
"loss": 0.0532,
"step": 1861
},
{
"epoch": 0.7323500491642084,
"grad_norm": 1.1939774751663208,
"learning_rate": 1.013640939411554e-06,
"loss": 0.0349,
"step": 1862
},
{
"epoch": 0.7327433628318584,
"grad_norm": 2.0183346271514893,
"learning_rate": 1.010881040142708e-06,
"loss": 0.0802,
"step": 1863
},
{
"epoch": 0.7331366764995083,
"grad_norm": 1.4486076831817627,
"learning_rate": 1.0081239508118842e-06,
"loss": 0.0381,
"step": 1864
},
{
"epoch": 0.7335299901671584,
"grad_norm": 0.7198472023010254,
"learning_rate": 1.0053696766216566e-06,
"loss": 0.0332,
"step": 1865
},
{
"epoch": 0.7339233038348083,
"grad_norm": 1.0703610181808472,
"learning_rate": 1.0026182227692865e-06,
"loss": 0.0321,
"step": 1866
},
{
"epoch": 0.7343166175024582,
"grad_norm": 0.9748527407646179,
"learning_rate": 9.998695944467127e-07,
"loss": 0.0312,
"step": 1867
},
{
"epoch": 0.7347099311701082,
"grad_norm": 0.6599907279014587,
"learning_rate": 9.97123796840543e-07,
"loss": 0.05,
"step": 1868
},
{
"epoch": 0.7351032448377581,
"grad_norm": 1.033435583114624,
"learning_rate": 9.943808351320418e-07,
"loss": 0.0482,
"step": 1869
},
{
"epoch": 0.7354965585054081,
"grad_norm": 1.139096975326538,
"learning_rate": 9.916407144971245e-07,
"loss": 0.046,
"step": 1870
},
{
"epoch": 0.735889872173058,
"grad_norm": 1.5064547061920166,
"learning_rate": 9.889034401063443e-07,
"loss": 0.0629,
"step": 1871
},
{
"epoch": 0.736283185840708,
"grad_norm": 0.7273301482200623,
"learning_rate": 9.861690171248841e-07,
"loss": 0.0314,
"step": 1872
},
{
"epoch": 0.7366764995083579,
"grad_norm": 0.579467236995697,
"learning_rate": 9.834374507125458e-07,
"loss": 0.0527,
"step": 1873
},
{
"epoch": 0.7370698131760078,
"grad_norm": 0.8448885679244995,
"learning_rate": 9.807087460237419e-07,
"loss": 0.0326,
"step": 1874
},
{
"epoch": 0.7374631268436578,
"grad_norm": 1.0001413822174072,
"learning_rate": 9.779829082074827e-07,
"loss": 0.0657,
"step": 1875
},
{
"epoch": 0.7378564405113077,
"grad_norm": 1.2145143747329712,
"learning_rate": 9.752599424073707e-07,
"loss": 0.0339,
"step": 1876
},
{
"epoch": 0.7382497541789578,
"grad_norm": 1.0525156259536743,
"learning_rate": 9.725398537615894e-07,
"loss": 0.0459,
"step": 1877
},
{
"epoch": 0.7386430678466077,
"grad_norm": 1.2982537746429443,
"learning_rate": 9.698226474028913e-07,
"loss": 0.0744,
"step": 1878
},
{
"epoch": 0.7390363815142577,
"grad_norm": 0.8789856433868408,
"learning_rate": 9.671083284585925e-07,
"loss": 0.0442,
"step": 1879
},
{
"epoch": 0.7394296951819076,
"grad_norm": 2.672044515609741,
"learning_rate": 9.643969020505573e-07,
"loss": 0.0769,
"step": 1880
},
{
"epoch": 0.7398230088495575,
"grad_norm": 1.0391490459442139,
"learning_rate": 9.616883732951945e-07,
"loss": 0.0721,
"step": 1881
},
{
"epoch": 0.7402163225172075,
"grad_norm": 1.1753817796707153,
"learning_rate": 9.589827473034443e-07,
"loss": 0.0463,
"step": 1882
},
{
"epoch": 0.7406096361848574,
"grad_norm": 1.260125994682312,
"learning_rate": 9.562800291807695e-07,
"loss": 0.0637,
"step": 1883
},
{
"epoch": 0.7410029498525074,
"grad_norm": 0.9175117015838623,
"learning_rate": 9.535802240271455e-07,
"loss": 0.037,
"step": 1884
},
{
"epoch": 0.7413962635201573,
"grad_norm": 0.9132412075996399,
"learning_rate": 9.508833369370524e-07,
"loss": 0.056,
"step": 1885
},
{
"epoch": 0.7417895771878072,
"grad_norm": 1.965725302696228,
"learning_rate": 9.481893729994609e-07,
"loss": 0.0545,
"step": 1886
},
{
"epoch": 0.7421828908554572,
"grad_norm": 2.073374032974243,
"learning_rate": 9.454983372978288e-07,
"loss": 0.0754,
"step": 1887
},
{
"epoch": 0.7425762045231071,
"grad_norm": 1.0531790256500244,
"learning_rate": 9.428102349100868e-07,
"loss": 0.0459,
"step": 1888
},
{
"epoch": 0.7429695181907572,
"grad_norm": 1.7750204801559448,
"learning_rate": 9.40125070908631e-07,
"loss": 0.061,
"step": 1889
},
{
"epoch": 0.7433628318584071,
"grad_norm": 0.6801098585128784,
"learning_rate": 9.374428503603139e-07,
"loss": 0.0597,
"step": 1890
},
{
"epoch": 0.7437561455260571,
"grad_norm": 0.6724294424057007,
"learning_rate": 9.347635783264309e-07,
"loss": 0.0302,
"step": 1891
},
{
"epoch": 0.744149459193707,
"grad_norm": 0.7799742817878723,
"learning_rate": 9.32087259862716e-07,
"loss": 0.0679,
"step": 1892
},
{
"epoch": 0.744542772861357,
"grad_norm": 1.623399257659912,
"learning_rate": 9.294139000193292e-07,
"loss": 0.0553,
"step": 1893
},
{
"epoch": 0.7449360865290069,
"grad_norm": 0.8977343440055847,
"learning_rate": 9.267435038408479e-07,
"loss": 0.0284,
"step": 1894
},
{
"epoch": 0.7453294001966568,
"grad_norm": 0.7733441591262817,
"learning_rate": 9.240760763662562e-07,
"loss": 0.0339,
"step": 1895
},
{
"epoch": 0.7457227138643068,
"grad_norm": 1.5382790565490723,
"learning_rate": 9.214116226289388e-07,
"loss": 0.0746,
"step": 1896
},
{
"epoch": 0.7461160275319567,
"grad_norm": 1.144547700881958,
"learning_rate": 9.187501476566648e-07,
"loss": 0.0351,
"step": 1897
},
{
"epoch": 0.7465093411996067,
"grad_norm": 0.7251105904579163,
"learning_rate": 9.16091656471586e-07,
"loss": 0.0634,
"step": 1898
},
{
"epoch": 0.7469026548672566,
"grad_norm": 0.999096155166626,
"learning_rate": 9.134361540902225e-07,
"loss": 0.0421,
"step": 1899
},
{
"epoch": 0.7472959685349065,
"grad_norm": 0.830605685710907,
"learning_rate": 9.10783645523455e-07,
"loss": 0.0426,
"step": 1900
},
{
"epoch": 0.7476892822025566,
"grad_norm": 1.5645976066589355,
"learning_rate": 9.081341357765145e-07,
"loss": 0.0416,
"step": 1901
},
{
"epoch": 0.7480825958702065,
"grad_norm": 0.8770972490310669,
"learning_rate": 9.054876298489742e-07,
"loss": 0.0561,
"step": 1902
},
{
"epoch": 0.7484759095378565,
"grad_norm": 1.5209007263183594,
"learning_rate": 9.02844132734737e-07,
"loss": 0.0419,
"step": 1903
},
{
"epoch": 0.7488692232055064,
"grad_norm": 3.409085512161255,
"learning_rate": 9.002036494220306e-07,
"loss": 0.0752,
"step": 1904
},
{
"epoch": 0.7492625368731564,
"grad_norm": 1.448819875717163,
"learning_rate": 8.975661848933945e-07,
"loss": 0.0523,
"step": 1905
},
{
"epoch": 0.7496558505408063,
"grad_norm": 0.998282790184021,
"learning_rate": 8.949317441256724e-07,
"loss": 0.0733,
"step": 1906
},
{
"epoch": 0.7500491642084562,
"grad_norm": 1.4408761262893677,
"learning_rate": 8.923003320900014e-07,
"loss": 0.0577,
"step": 1907
},
{
"epoch": 0.7504424778761062,
"grad_norm": 0.9130271077156067,
"learning_rate": 8.896719537518048e-07,
"loss": 0.0317,
"step": 1908
},
{
"epoch": 0.7508357915437561,
"grad_norm": 1.9195144176483154,
"learning_rate": 8.870466140707795e-07,
"loss": 0.0666,
"step": 1909
},
{
"epoch": 0.7512291052114061,
"grad_norm": 1.457318902015686,
"learning_rate": 8.844243180008913e-07,
"loss": 0.0762,
"step": 1910
},
{
"epoch": 0.751622418879056,
"grad_norm": 1.4528069496154785,
"learning_rate": 8.818050704903589e-07,
"loss": 0.0423,
"step": 1911
},
{
"epoch": 0.752015732546706,
"grad_norm": 0.849536120891571,
"learning_rate": 8.791888764816514e-07,
"loss": 0.0289,
"step": 1912
},
{
"epoch": 0.752409046214356,
"grad_norm": 1.4856075048446655,
"learning_rate": 8.765757409114753e-07,
"loss": 0.0665,
"step": 1913
},
{
"epoch": 0.752802359882006,
"grad_norm": 0.8997237086296082,
"learning_rate": 8.739656687107656e-07,
"loss": 0.0619,
"step": 1914
},
{
"epoch": 0.7531956735496559,
"grad_norm": 0.8566966652870178,
"learning_rate": 8.713586648046768e-07,
"loss": 0.0476,
"step": 1915
},
{
"epoch": 0.7535889872173058,
"grad_norm": 0.9483917355537415,
"learning_rate": 8.68754734112574e-07,
"loss": 0.0486,
"step": 1916
},
{
"epoch": 0.7539823008849558,
"grad_norm": 1.0472768545150757,
"learning_rate": 8.661538815480228e-07,
"loss": 0.0422,
"step": 1917
},
{
"epoch": 0.7543756145526057,
"grad_norm": 1.4821901321411133,
"learning_rate": 8.635561120187813e-07,
"loss": 0.0408,
"step": 1918
},
{
"epoch": 0.7547689282202557,
"grad_norm": 0.7954731583595276,
"learning_rate": 8.609614304267877e-07,
"loss": 0.059,
"step": 1919
},
{
"epoch": 0.7551622418879056,
"grad_norm": 0.9966669082641602,
"learning_rate": 8.583698416681555e-07,
"loss": 0.0303,
"step": 1920
},
{
"epoch": 0.7555555555555555,
"grad_norm": 0.39692261815071106,
"learning_rate": 8.557813506331616e-07,
"loss": 0.0324,
"step": 1921
},
{
"epoch": 0.7559488692232055,
"grad_norm": 1.7129300832748413,
"learning_rate": 8.531959622062372e-07,
"loss": 0.0397,
"step": 1922
},
{
"epoch": 0.7563421828908554,
"grad_norm": 1.0999704599380493,
"learning_rate": 8.506136812659601e-07,
"loss": 0.0455,
"step": 1923
},
{
"epoch": 0.7567354965585054,
"grad_norm": 1.2547434568405151,
"learning_rate": 8.480345126850414e-07,
"loss": 0.0658,
"step": 1924
},
{
"epoch": 0.7571288102261554,
"grad_norm": 1.1041603088378906,
"learning_rate": 8.454584613303227e-07,
"loss": 0.0339,
"step": 1925
},
{
"epoch": 0.7575221238938054,
"grad_norm": 0.8621834516525269,
"learning_rate": 8.428855320627613e-07,
"loss": 0.0294,
"step": 1926
},
{
"epoch": 0.7579154375614553,
"grad_norm": 0.7350767254829407,
"learning_rate": 8.403157297374239e-07,
"loss": 0.023,
"step": 1927
},
{
"epoch": 0.7583087512291052,
"grad_norm": 0.9072149991989136,
"learning_rate": 8.377490592034779e-07,
"loss": 0.0704,
"step": 1928
},
{
"epoch": 0.7587020648967552,
"grad_norm": 0.715020477771759,
"learning_rate": 8.35185525304178e-07,
"loss": 0.0321,
"step": 1929
},
{
"epoch": 0.7590953785644051,
"grad_norm": 0.7303974032402039,
"learning_rate": 8.326251328768626e-07,
"loss": 0.0207,
"step": 1930
},
{
"epoch": 0.7594886922320551,
"grad_norm": 1.534783124923706,
"learning_rate": 8.300678867529415e-07,
"loss": 0.0715,
"step": 1931
},
{
"epoch": 0.759882005899705,
"grad_norm": 0.6678977012634277,
"learning_rate": 8.275137917578879e-07,
"loss": 0.0454,
"step": 1932
},
{
"epoch": 0.760275319567355,
"grad_norm": 0.7839411497116089,
"learning_rate": 8.249628527112282e-07,
"loss": 0.053,
"step": 1933
},
{
"epoch": 0.7606686332350049,
"grad_norm": 0.6599370241165161,
"learning_rate": 8.224150744265352e-07,
"loss": 0.0312,
"step": 1934
},
{
"epoch": 0.7610619469026548,
"grad_norm": 0.8593689799308777,
"learning_rate": 8.198704617114143e-07,
"loss": 0.0219,
"step": 1935
},
{
"epoch": 0.7614552605703048,
"grad_norm": 1.0792686939239502,
"learning_rate": 8.173290193674996e-07,
"loss": 0.0688,
"step": 1936
},
{
"epoch": 0.7618485742379548,
"grad_norm": 1.1030522584915161,
"learning_rate": 8.147907521904433e-07,
"loss": 0.0598,
"step": 1937
},
{
"epoch": 0.7622418879056048,
"grad_norm": 1.4342604875564575,
"learning_rate": 8.122556649699051e-07,
"loss": 0.072,
"step": 1938
},
{
"epoch": 0.7626352015732547,
"grad_norm": 1.555779218673706,
"learning_rate": 8.097237624895452e-07,
"loss": 0.0875,
"step": 1939
},
{
"epoch": 0.7630285152409046,
"grad_norm": 1.7069602012634277,
"learning_rate": 8.07195049527012e-07,
"loss": 0.0625,
"step": 1940
},
{
"epoch": 0.7634218289085546,
"grad_norm": 1.4105464220046997,
"learning_rate": 8.046695308539376e-07,
"loss": 0.0302,
"step": 1941
},
{
"epoch": 0.7638151425762045,
"grad_norm": 0.9220629930496216,
"learning_rate": 8.021472112359255e-07,
"loss": 0.0788,
"step": 1942
},
{
"epoch": 0.7642084562438545,
"grad_norm": 1.7221704721450806,
"learning_rate": 7.996280954325433e-07,
"loss": 0.0701,
"step": 1943
},
{
"epoch": 0.7646017699115044,
"grad_norm": 1.240715503692627,
"learning_rate": 7.971121881973126e-07,
"loss": 0.0605,
"step": 1944
},
{
"epoch": 0.7649950835791544,
"grad_norm": 1.054165005683899,
"learning_rate": 7.945994942777016e-07,
"loss": 0.0278,
"step": 1945
},
{
"epoch": 0.7653883972468043,
"grad_norm": 0.3918832242488861,
"learning_rate": 7.92090018415112e-07,
"loss": 0.0433,
"step": 1946
},
{
"epoch": 0.7657817109144542,
"grad_norm": 1.2010436058044434,
"learning_rate": 7.895837653448759e-07,
"loss": 0.0645,
"step": 1947
},
{
"epoch": 0.7661750245821042,
"grad_norm": 0.6880310773849487,
"learning_rate": 7.870807397962438e-07,
"loss": 0.0466,
"step": 1948
},
{
"epoch": 0.7665683382497542,
"grad_norm": 0.8154659867286682,
"learning_rate": 7.845809464923748e-07,
"loss": 0.0478,
"step": 1949
},
{
"epoch": 0.7669616519174042,
"grad_norm": 0.7172273397445679,
"learning_rate": 7.820843901503308e-07,
"loss": 0.0352,
"step": 1950
},
{
"epoch": 0.7673549655850541,
"grad_norm": 1.7781319618225098,
"learning_rate": 7.79591075481062e-07,
"loss": 0.0732,
"step": 1951
},
{
"epoch": 0.7677482792527041,
"grad_norm": 0.6639533638954163,
"learning_rate": 7.771010071894052e-07,
"loss": 0.0179,
"step": 1952
},
{
"epoch": 0.768141592920354,
"grad_norm": 0.8761031627655029,
"learning_rate": 7.7461418997407e-07,
"loss": 0.0281,
"step": 1953
},
{
"epoch": 0.7685349065880039,
"grad_norm": 0.7496312856674194,
"learning_rate": 7.721306285276309e-07,
"loss": 0.053,
"step": 1954
},
{
"epoch": 0.7689282202556539,
"grad_norm": 0.46650174260139465,
"learning_rate": 7.696503275365194e-07,
"loss": 0.0513,
"step": 1955
},
{
"epoch": 0.7693215339233038,
"grad_norm": 1.1080721616744995,
"learning_rate": 7.671732916810154e-07,
"loss": 0.0507,
"step": 1956
},
{
"epoch": 0.7697148475909538,
"grad_norm": 0.6540339589118958,
"learning_rate": 7.646995256352346e-07,
"loss": 0.028,
"step": 1957
},
{
"epoch": 0.7701081612586037,
"grad_norm": 1.099401593208313,
"learning_rate": 7.622290340671256e-07,
"loss": 0.0623,
"step": 1958
},
{
"epoch": 0.7705014749262536,
"grad_norm": 0.9163020253181458,
"learning_rate": 7.597618216384576e-07,
"loss": 0.0251,
"step": 1959
},
{
"epoch": 0.7708947885939036,
"grad_norm": 1.32003915309906,
"learning_rate": 7.572978930048108e-07,
"loss": 0.0467,
"step": 1960
},
{
"epoch": 0.7712881022615536,
"grad_norm": 1.0354825258255005,
"learning_rate": 7.54837252815571e-07,
"loss": 0.0491,
"step": 1961
},
{
"epoch": 0.7716814159292036,
"grad_norm": 1.0285413265228271,
"learning_rate": 7.523799057139158e-07,
"loss": 0.0598,
"step": 1962
},
{
"epoch": 0.7720747295968535,
"grad_norm": 1.7109252214431763,
"learning_rate": 7.49925856336812e-07,
"loss": 0.058,
"step": 1963
},
{
"epoch": 0.7724680432645035,
"grad_norm": 1.3561407327651978,
"learning_rate": 7.474751093150015e-07,
"loss": 0.0351,
"step": 1964
},
{
"epoch": 0.7728613569321534,
"grad_norm": 0.4150741696357727,
"learning_rate": 7.450276692729957e-07,
"loss": 0.0181,
"step": 1965
},
{
"epoch": 0.7732546705998034,
"grad_norm": 1.0091959238052368,
"learning_rate": 7.425835408290655e-07,
"loss": 0.0403,
"step": 1966
},
{
"epoch": 0.7736479842674533,
"grad_norm": 2.851815938949585,
"learning_rate": 7.40142728595234e-07,
"loss": 0.0491,
"step": 1967
},
{
"epoch": 0.7740412979351032,
"grad_norm": 1.306333303451538,
"learning_rate": 7.377052371772637e-07,
"loss": 0.058,
"step": 1968
},
{
"epoch": 0.7744346116027532,
"grad_norm": 0.8560998439788818,
"learning_rate": 7.352710711746536e-07,
"loss": 0.0284,
"step": 1969
},
{
"epoch": 0.7748279252704031,
"grad_norm": 1.8746119737625122,
"learning_rate": 7.328402351806269e-07,
"loss": 0.0654,
"step": 1970
},
{
"epoch": 0.7752212389380531,
"grad_norm": 1.0875734090805054,
"learning_rate": 7.304127337821229e-07,
"loss": 0.0402,
"step": 1971
},
{
"epoch": 0.775614552605703,
"grad_norm": 0.8440957069396973,
"learning_rate": 7.279885715597896e-07,
"loss": 0.0367,
"step": 1972
},
{
"epoch": 0.776007866273353,
"grad_norm": 1.528245210647583,
"learning_rate": 7.255677530879713e-07,
"loss": 0.0336,
"step": 1973
},
{
"epoch": 0.776401179941003,
"grad_norm": 1.6772621870040894,
"learning_rate": 7.231502829347056e-07,
"loss": 0.0388,
"step": 1974
},
{
"epoch": 0.7767944936086529,
"grad_norm": 0.85129314661026,
"learning_rate": 7.207361656617112e-07,
"loss": 0.0521,
"step": 1975
},
{
"epoch": 0.7771878072763029,
"grad_norm": 1.1908273696899414,
"learning_rate": 7.183254058243791e-07,
"loss": 0.0419,
"step": 1976
},
{
"epoch": 0.7775811209439528,
"grad_norm": 1.2314374446868896,
"learning_rate": 7.159180079717656e-07,
"loss": 0.044,
"step": 1977
},
{
"epoch": 0.7779744346116028,
"grad_norm": 1.7192610502243042,
"learning_rate": 7.135139766465838e-07,
"loss": 0.0663,
"step": 1978
},
{
"epoch": 0.7783677482792527,
"grad_norm": 1.5432205200195312,
"learning_rate": 7.111133163851916e-07,
"loss": 0.0267,
"step": 1979
},
{
"epoch": 0.7787610619469026,
"grad_norm": 0.759152352809906,
"learning_rate": 7.087160317175881e-07,
"loss": 0.0299,
"step": 1980
},
{
"epoch": 0.7791543756145526,
"grad_norm": 0.9122269749641418,
"learning_rate": 7.06322127167402e-07,
"loss": 0.0301,
"step": 1981
},
{
"epoch": 0.7795476892822025,
"grad_norm": 0.7516564130783081,
"learning_rate": 7.03931607251884e-07,
"loss": 0.0627,
"step": 1982
},
{
"epoch": 0.7799410029498525,
"grad_norm": 1.2953605651855469,
"learning_rate": 7.015444764818988e-07,
"loss": 0.0571,
"step": 1983
},
{
"epoch": 0.7803343166175024,
"grad_norm": 0.8770161271095276,
"learning_rate": 6.991607393619129e-07,
"loss": 0.0322,
"step": 1984
},
{
"epoch": 0.7807276302851525,
"grad_norm": 0.8347287774085999,
"learning_rate": 6.967804003899925e-07,
"loss": 0.0497,
"step": 1985
},
{
"epoch": 0.7811209439528024,
"grad_norm": 0.5185628533363342,
"learning_rate": 6.944034640577896e-07,
"loss": 0.0292,
"step": 1986
},
{
"epoch": 0.7815142576204523,
"grad_norm": 0.9084299802780151,
"learning_rate": 6.920299348505365e-07,
"loss": 0.0343,
"step": 1987
},
{
"epoch": 0.7819075712881023,
"grad_norm": 1.2148305177688599,
"learning_rate": 6.896598172470356e-07,
"loss": 0.07,
"step": 1988
},
{
"epoch": 0.7823008849557522,
"grad_norm": 1.0693104267120361,
"learning_rate": 6.872931157196519e-07,
"loss": 0.0509,
"step": 1989
},
{
"epoch": 0.7826941986234022,
"grad_norm": 0.5483916997909546,
"learning_rate": 6.849298347343044e-07,
"loss": 0.04,
"step": 1990
},
{
"epoch": 0.7830875122910521,
"grad_norm": 0.9246038794517517,
"learning_rate": 6.825699787504586e-07,
"loss": 0.0602,
"step": 1991
},
{
"epoch": 0.783480825958702,
"grad_norm": 0.7501392960548401,
"learning_rate": 6.802135522211142e-07,
"loss": 0.0331,
"step": 1992
},
{
"epoch": 0.783874139626352,
"grad_norm": 0.8467764854431152,
"learning_rate": 6.778605595928025e-07,
"loss": 0.0325,
"step": 1993
},
{
"epoch": 0.7842674532940019,
"grad_norm": 0.5727487206459045,
"learning_rate": 6.755110053055738e-07,
"loss": 0.0264,
"step": 1994
},
{
"epoch": 0.7846607669616519,
"grad_norm": 1.1488757133483887,
"learning_rate": 6.731648937929911e-07,
"loss": 0.0548,
"step": 1995
},
{
"epoch": 0.7850540806293018,
"grad_norm": 0.7147387862205505,
"learning_rate": 6.708222294821196e-07,
"loss": 0.0548,
"step": 1996
},
{
"epoch": 0.7854473942969519,
"grad_norm": 1.0995930433273315,
"learning_rate": 6.684830167935207e-07,
"loss": 0.0476,
"step": 1997
},
{
"epoch": 0.7858407079646018,
"grad_norm": 1.1355059146881104,
"learning_rate": 6.66147260141243e-07,
"loss": 0.0501,
"step": 1998
},
{
"epoch": 0.7862340216322518,
"grad_norm": 0.7553796768188477,
"learning_rate": 6.638149639328134e-07,
"loss": 0.0686,
"step": 1999
},
{
"epoch": 0.7866273352999017,
"grad_norm": 0.8902336359024048,
"learning_rate": 6.614861325692277e-07,
"loss": 0.0349,
"step": 2000
},
{
"epoch": 0.7870206489675516,
"grad_norm": 1.090766429901123,
"learning_rate": 6.591607704449446e-07,
"loss": 0.0527,
"step": 2001
},
{
"epoch": 0.7874139626352016,
"grad_norm": 1.142582654953003,
"learning_rate": 6.568388819478769e-07,
"loss": 0.0537,
"step": 2002
},
{
"epoch": 0.7878072763028515,
"grad_norm": 1.449288010597229,
"learning_rate": 6.545204714593825e-07,
"loss": 0.0587,
"step": 2003
},
{
"epoch": 0.7882005899705015,
"grad_norm": 1.7187999486923218,
"learning_rate": 6.522055433542557e-07,
"loss": 0.0624,
"step": 2004
},
{
"epoch": 0.7885939036381514,
"grad_norm": 1.5539288520812988,
"learning_rate": 6.49894102000721e-07,
"loss": 0.0553,
"step": 2005
},
{
"epoch": 0.7889872173058013,
"grad_norm": 1.4520833492279053,
"learning_rate": 6.47586151760421e-07,
"loss": 0.0297,
"step": 2006
},
{
"epoch": 0.7893805309734513,
"grad_norm": 1.2936962842941284,
"learning_rate": 6.452816969884127e-07,
"loss": 0.0335,
"step": 2007
},
{
"epoch": 0.7897738446411012,
"grad_norm": 1.2932931184768677,
"learning_rate": 6.429807420331568e-07,
"loss": 0.0622,
"step": 2008
},
{
"epoch": 0.7901671583087513,
"grad_norm": 0.9521369934082031,
"learning_rate": 6.406832912365101e-07,
"loss": 0.0669,
"step": 2009
},
{
"epoch": 0.7905604719764012,
"grad_norm": 0.9570633172988892,
"learning_rate": 6.383893489337172e-07,
"loss": 0.054,
"step": 2010
},
{
"epoch": 0.7909537856440512,
"grad_norm": 0.7929260730743408,
"learning_rate": 6.360989194534004e-07,
"loss": 0.028,
"step": 2011
},
{
"epoch": 0.7913470993117011,
"grad_norm": 1.2527369260787964,
"learning_rate": 6.338120071175558e-07,
"loss": 0.0631,
"step": 2012
},
{
"epoch": 0.791740412979351,
"grad_norm": 0.9790352582931519,
"learning_rate": 6.315286162415412e-07,
"loss": 0.0485,
"step": 2013
},
{
"epoch": 0.792133726647001,
"grad_norm": 1.417540431022644,
"learning_rate": 6.292487511340709e-07,
"loss": 0.0575,
"step": 2014
},
{
"epoch": 0.7925270403146509,
"grad_norm": 1.3456201553344727,
"learning_rate": 6.269724160972043e-07,
"loss": 0.0709,
"step": 2015
},
{
"epoch": 0.7929203539823009,
"grad_norm": 1.3013477325439453,
"learning_rate": 6.246996154263421e-07,
"loss": 0.0571,
"step": 2016
},
{
"epoch": 0.7933136676499508,
"grad_norm": 1.0679081678390503,
"learning_rate": 6.224303534102125e-07,
"loss": 0.0395,
"step": 2017
},
{
"epoch": 0.7937069813176008,
"grad_norm": 1.3359334468841553,
"learning_rate": 6.201646343308685e-07,
"loss": 0.0439,
"step": 2018
},
{
"epoch": 0.7941002949852507,
"grad_norm": 1.4549192190170288,
"learning_rate": 6.179024624636772e-07,
"loss": 0.057,
"step": 2019
},
{
"epoch": 0.7944936086529006,
"grad_norm": 0.8267070055007935,
"learning_rate": 6.156438420773125e-07,
"loss": 0.0207,
"step": 2020
},
{
"epoch": 0.7948869223205507,
"grad_norm": 1.1873496770858765,
"learning_rate": 6.133887774337471e-07,
"loss": 0.0449,
"step": 2021
},
{
"epoch": 0.7952802359882006,
"grad_norm": 1.971118450164795,
"learning_rate": 6.111372727882417e-07,
"loss": 0.0444,
"step": 2022
},
{
"epoch": 0.7956735496558506,
"grad_norm": 0.5039023160934448,
"learning_rate": 6.088893323893419e-07,
"loss": 0.0165,
"step": 2023
},
{
"epoch": 0.7960668633235005,
"grad_norm": 1.2124491930007935,
"learning_rate": 6.066449604788666e-07,
"loss": 0.0384,
"step": 2024
},
{
"epoch": 0.7964601769911505,
"grad_norm": 1.4836233854293823,
"learning_rate": 6.044041612919016e-07,
"loss": 0.0711,
"step": 2025
},
{
"epoch": 0.7968534906588004,
"grad_norm": 1.4890559911727905,
"learning_rate": 6.021669390567902e-07,
"loss": 0.048,
"step": 2026
},
{
"epoch": 0.7972468043264503,
"grad_norm": 0.5430221557617188,
"learning_rate": 5.999332979951272e-07,
"loss": 0.049,
"step": 2027
},
{
"epoch": 0.7976401179941003,
"grad_norm": 0.9645549654960632,
"learning_rate": 5.977032423217482e-07,
"loss": 0.0201,
"step": 2028
},
{
"epoch": 0.7980334316617502,
"grad_norm": 1.7599254846572876,
"learning_rate": 5.954767762447244e-07,
"loss": 0.0524,
"step": 2029
},
{
"epoch": 0.7984267453294002,
"grad_norm": 0.6832358241081238,
"learning_rate": 5.932539039653535e-07,
"loss": 0.0451,
"step": 2030
},
{
"epoch": 0.7988200589970501,
"grad_norm": 0.5469837188720703,
"learning_rate": 5.910346296781511e-07,
"loss": 0.0342,
"step": 2031
},
{
"epoch": 0.7992133726647,
"grad_norm": 1.466138482093811,
"learning_rate": 5.888189575708453e-07,
"loss": 0.0619,
"step": 2032
},
{
"epoch": 0.7996066863323501,
"grad_norm": 1.1846930980682373,
"learning_rate": 5.866068918243634e-07,
"loss": 0.0527,
"step": 2033
},
{
"epoch": 0.8,
"grad_norm": 0.8236525058746338,
"learning_rate": 5.843984366128308e-07,
"loss": 0.0427,
"step": 2034
},
{
"epoch": 0.80039331366765,
"grad_norm": 0.8086917996406555,
"learning_rate": 5.821935961035589e-07,
"loss": 0.0743,
"step": 2035
},
{
"epoch": 0.8007866273352999,
"grad_norm": 1.3642960786819458,
"learning_rate": 5.799923744570376e-07,
"loss": 0.0609,
"step": 2036
},
{
"epoch": 0.8011799410029499,
"grad_norm": 1.4578794240951538,
"learning_rate": 5.777947758269295e-07,
"loss": 0.0828,
"step": 2037
},
{
"epoch": 0.8015732546705998,
"grad_norm": 0.5745184421539307,
"learning_rate": 5.756008043600594e-07,
"loss": 0.0444,
"step": 2038
},
{
"epoch": 0.8019665683382498,
"grad_norm": 2.3881709575653076,
"learning_rate": 5.734104641964075e-07,
"loss": 0.074,
"step": 2039
},
{
"epoch": 0.8023598820058997,
"grad_norm": 1.0504474639892578,
"learning_rate": 5.712237594691028e-07,
"loss": 0.0573,
"step": 2040
},
{
"epoch": 0.8027531956735496,
"grad_norm": 1.7040578126907349,
"learning_rate": 5.690406943044138e-07,
"loss": 0.0472,
"step": 2041
},
{
"epoch": 0.8031465093411996,
"grad_norm": 0.9709568619728088,
"learning_rate": 5.668612728217412e-07,
"loss": 0.0305,
"step": 2042
},
{
"epoch": 0.8035398230088495,
"grad_norm": 2.0475189685821533,
"learning_rate": 5.646854991336112e-07,
"loss": 0.0661,
"step": 2043
},
{
"epoch": 0.8039331366764995,
"grad_norm": 1.4109443426132202,
"learning_rate": 5.625133773456639e-07,
"loss": 0.0698,
"step": 2044
},
{
"epoch": 0.8043264503441495,
"grad_norm": 0.8161342740058899,
"learning_rate": 5.603449115566511e-07,
"loss": 0.0417,
"step": 2045
},
{
"epoch": 0.8047197640117995,
"grad_norm": 1.1740028858184814,
"learning_rate": 5.581801058584252e-07,
"loss": 0.0444,
"step": 2046
},
{
"epoch": 0.8051130776794494,
"grad_norm": 2.580334424972534,
"learning_rate": 5.560189643359312e-07,
"loss": 0.0988,
"step": 2047
},
{
"epoch": 0.8055063913470993,
"grad_norm": 0.8429194092750549,
"learning_rate": 5.538614910672005e-07,
"loss": 0.0312,
"step": 2048
},
{
"epoch": 0.8058997050147493,
"grad_norm": 0.8115060925483704,
"learning_rate": 5.517076901233434e-07,
"loss": 0.0561,
"step": 2049
},
{
"epoch": 0.8062930186823992,
"grad_norm": 0.5982792377471924,
"learning_rate": 5.495575655685382e-07,
"loss": 0.0369,
"step": 2050
},
{
"epoch": 0.8066863323500492,
"grad_norm": 1.5597193241119385,
"learning_rate": 5.474111214600278e-07,
"loss": 0.0701,
"step": 2051
},
{
"epoch": 0.8070796460176991,
"grad_norm": 1.3873978853225708,
"learning_rate": 5.452683618481103e-07,
"loss": 0.0372,
"step": 2052
},
{
"epoch": 0.807472959685349,
"grad_norm": 0.9317770004272461,
"learning_rate": 5.431292907761305e-07,
"loss": 0.0433,
"step": 2053
},
{
"epoch": 0.807866273352999,
"grad_norm": 1.736678957939148,
"learning_rate": 5.409939122804736e-07,
"loss": 0.0562,
"step": 2054
},
{
"epoch": 0.8082595870206489,
"grad_norm": 1.1516214609146118,
"learning_rate": 5.388622303905558e-07,
"loss": 0.0438,
"step": 2055
},
{
"epoch": 0.8086529006882989,
"grad_norm": 0.855049192905426,
"learning_rate": 5.367342491288186e-07,
"loss": 0.0389,
"step": 2056
},
{
"epoch": 0.8090462143559489,
"grad_norm": 0.8584917187690735,
"learning_rate": 5.346099725107213e-07,
"loss": 0.0686,
"step": 2057
},
{
"epoch": 0.8094395280235989,
"grad_norm": 1.1630586385726929,
"learning_rate": 5.324894045447312e-07,
"loss": 0.0361,
"step": 2058
},
{
"epoch": 0.8098328416912488,
"grad_norm": 1.2655314207077026,
"learning_rate": 5.303725492323194e-07,
"loss": 0.0284,
"step": 2059
},
{
"epoch": 0.8102261553588987,
"grad_norm": 1.1947369575500488,
"learning_rate": 5.282594105679481e-07,
"loss": 0.0562,
"step": 2060
},
{
"epoch": 0.8106194690265487,
"grad_norm": 0.7869384288787842,
"learning_rate": 5.261499925390692e-07,
"loss": 0.0407,
"step": 2061
},
{
"epoch": 0.8110127826941986,
"grad_norm": 1.6076072454452515,
"learning_rate": 5.240442991261127e-07,
"loss": 0.0384,
"step": 2062
},
{
"epoch": 0.8114060963618486,
"grad_norm": 2.237993001937866,
"learning_rate": 5.219423343024804e-07,
"loss": 0.0539,
"step": 2063
},
{
"epoch": 0.8117994100294985,
"grad_norm": 0.8259546756744385,
"learning_rate": 5.198441020345382e-07,
"loss": 0.0436,
"step": 2064
},
{
"epoch": 0.8121927236971485,
"grad_norm": 1.2509441375732422,
"learning_rate": 5.177496062816101e-07,
"loss": 0.0462,
"step": 2065
},
{
"epoch": 0.8125860373647984,
"grad_norm": 1.06137216091156,
"learning_rate": 5.156588509959659e-07,
"loss": 0.0339,
"step": 2066
},
{
"epoch": 0.8129793510324483,
"grad_norm": 0.7373847365379333,
"learning_rate": 5.13571840122821e-07,
"loss": 0.0301,
"step": 2067
},
{
"epoch": 0.8133726647000983,
"grad_norm": 1.1653954982757568,
"learning_rate": 5.114885776003234e-07,
"loss": 0.0427,
"step": 2068
},
{
"epoch": 0.8137659783677483,
"grad_norm": 1.518700122833252,
"learning_rate": 5.094090673595478e-07,
"loss": 0.0568,
"step": 2069
},
{
"epoch": 0.8141592920353983,
"grad_norm": 0.9491556286811829,
"learning_rate": 5.073333133244896e-07,
"loss": 0.0296,
"step": 2070
},
{
"epoch": 0.8145526057030482,
"grad_norm": 1.12187922000885,
"learning_rate": 5.052613194120554e-07,
"loss": 0.0625,
"step": 2071
},
{
"epoch": 0.8149459193706982,
"grad_norm": 0.9381184577941895,
"learning_rate": 5.031930895320569e-07,
"loss": 0.0318,
"step": 2072
},
{
"epoch": 0.8153392330383481,
"grad_norm": 0.8680362701416016,
"learning_rate": 5.011286275872021e-07,
"loss": 0.0631,
"step": 2073
},
{
"epoch": 0.815732546705998,
"grad_norm": 1.5543493032455444,
"learning_rate": 4.990679374730905e-07,
"loss": 0.0754,
"step": 2074
},
{
"epoch": 0.816125860373648,
"grad_norm": 1.3975200653076172,
"learning_rate": 4.970110230782035e-07,
"loss": 0.072,
"step": 2075
},
{
"epoch": 0.8165191740412979,
"grad_norm": 0.8037746548652649,
"learning_rate": 4.949578882838982e-07,
"loss": 0.0385,
"step": 2076
},
{
"epoch": 0.8169124877089479,
"grad_norm": 0.7833993434906006,
"learning_rate": 4.929085369643988e-07,
"loss": 0.0418,
"step": 2077
},
{
"epoch": 0.8173058013765978,
"grad_norm": 0.8177001476287842,
"learning_rate": 4.908629729867908e-07,
"loss": 0.0485,
"step": 2078
},
{
"epoch": 0.8176991150442477,
"grad_norm": 0.7933450937271118,
"learning_rate": 4.88821200211014e-07,
"loss": 0.0466,
"step": 2079
},
{
"epoch": 0.8180924287118977,
"grad_norm": 0.5968790054321289,
"learning_rate": 4.867832224898517e-07,
"loss": 0.0253,
"step": 2080
},
{
"epoch": 0.8184857423795477,
"grad_norm": 1.4022417068481445,
"learning_rate": 4.847490436689281e-07,
"loss": 0.0431,
"step": 2081
},
{
"epoch": 0.8188790560471977,
"grad_norm": 2.319401264190674,
"learning_rate": 4.827186675866985e-07,
"loss": 0.0493,
"step": 2082
},
{
"epoch": 0.8192723697148476,
"grad_norm": 1.0119627714157104,
"learning_rate": 4.806920980744426e-07,
"loss": 0.0606,
"step": 2083
},
{
"epoch": 0.8196656833824976,
"grad_norm": 1.2110787630081177,
"learning_rate": 4.786693389562566e-07,
"loss": 0.0582,
"step": 2084
},
{
"epoch": 0.8200589970501475,
"grad_norm": 0.7724167704582214,
"learning_rate": 4.7665039404904747e-07,
"loss": 0.0457,
"step": 2085
},
{
"epoch": 0.8204523107177975,
"grad_norm": 1.5843499898910522,
"learning_rate": 4.746352671625237e-07,
"loss": 0.0482,
"step": 2086
},
{
"epoch": 0.8208456243854474,
"grad_norm": 1.3220843076705933,
"learning_rate": 4.72623962099191e-07,
"loss": 0.0505,
"step": 2087
},
{
"epoch": 0.8212389380530973,
"grad_norm": 1.6696242094039917,
"learning_rate": 4.7061648265434053e-07,
"loss": 0.0587,
"step": 2088
},
{
"epoch": 0.8216322517207473,
"grad_norm": 1.341960072517395,
"learning_rate": 4.6861283261604745e-07,
"loss": 0.0781,
"step": 2089
},
{
"epoch": 0.8220255653883972,
"grad_norm": 1.6525554656982422,
"learning_rate": 4.666130157651594e-07,
"loss": 0.052,
"step": 2090
},
{
"epoch": 0.8224188790560472,
"grad_norm": 1.0084091424942017,
"learning_rate": 4.6461703587529106e-07,
"loss": 0.0354,
"step": 2091
},
{
"epoch": 0.8228121927236971,
"grad_norm": 0.8987352848052979,
"learning_rate": 4.62624896712818e-07,
"loss": 0.0351,
"step": 2092
},
{
"epoch": 0.8232055063913472,
"grad_norm": 1.0085314512252808,
"learning_rate": 4.6063660203686635e-07,
"loss": 0.0459,
"step": 2093
},
{
"epoch": 0.8235988200589971,
"grad_norm": 1.4987783432006836,
"learning_rate": 4.586521555993087e-07,
"loss": 0.0771,
"step": 2094
},
{
"epoch": 0.823992133726647,
"grad_norm": 1.5976486206054688,
"learning_rate": 4.5667156114475695e-07,
"loss": 0.0766,
"step": 2095
},
{
"epoch": 0.824385447394297,
"grad_norm": 0.9721060395240784,
"learning_rate": 4.5469482241055324e-07,
"loss": 0.0514,
"step": 2096
},
{
"epoch": 0.8247787610619469,
"grad_norm": 0.835397481918335,
"learning_rate": 4.527219431267646e-07,
"loss": 0.0352,
"step": 2097
},
{
"epoch": 0.8251720747295969,
"grad_norm": 1.1280697584152222,
"learning_rate": 4.507529270161759e-07,
"loss": 0.0712,
"step": 2098
},
{
"epoch": 0.8255653883972468,
"grad_norm": 1.8154939413070679,
"learning_rate": 4.4878777779428034e-07,
"loss": 0.0918,
"step": 2099
},
{
"epoch": 0.8259587020648967,
"grad_norm": 1.067765474319458,
"learning_rate": 4.4682649916927614e-07,
"loss": 0.0357,
"step": 2100
},
{
"epoch": 0.8263520157325467,
"grad_norm": 1.0095484256744385,
"learning_rate": 4.4486909484205725e-07,
"loss": 0.0315,
"step": 2101
},
{
"epoch": 0.8267453294001966,
"grad_norm": 1.7903807163238525,
"learning_rate": 4.429155685062073e-07,
"loss": 0.0598,
"step": 2102
},
{
"epoch": 0.8271386430678466,
"grad_norm": 1.5948070287704468,
"learning_rate": 4.409659238479919e-07,
"loss": 0.0408,
"step": 2103
},
{
"epoch": 0.8275319567354965,
"grad_norm": 0.805156946182251,
"learning_rate": 4.39020164546351e-07,
"loss": 0.0448,
"step": 2104
},
{
"epoch": 0.8279252704031466,
"grad_norm": 0.4440039098262787,
"learning_rate": 4.370782942728946e-07,
"loss": 0.0279,
"step": 2105
},
{
"epoch": 0.8283185840707965,
"grad_norm": 0.9887676239013672,
"learning_rate": 4.3514031669189325e-07,
"loss": 0.0706,
"step": 2106
},
{
"epoch": 0.8287118977384464,
"grad_norm": 1.1825933456420898,
"learning_rate": 4.3320623546027283e-07,
"loss": 0.0608,
"step": 2107
},
{
"epoch": 0.8291052114060964,
"grad_norm": 1.8713337182998657,
"learning_rate": 4.312760542276059e-07,
"loss": 0.049,
"step": 2108
},
{
"epoch": 0.8294985250737463,
"grad_norm": 0.9182631969451904,
"learning_rate": 4.293497766361068e-07,
"loss": 0.0436,
"step": 2109
},
{
"epoch": 0.8298918387413963,
"grad_norm": 1.1083096265792847,
"learning_rate": 4.2742740632062243e-07,
"loss": 0.0483,
"step": 2110
},
{
"epoch": 0.8302851524090462,
"grad_norm": 2.0837628841400146,
"learning_rate": 4.255089469086279e-07,
"loss": 0.0663,
"step": 2111
},
{
"epoch": 0.8306784660766962,
"grad_norm": 1.2065215110778809,
"learning_rate": 4.235944020202182e-07,
"loss": 0.0673,
"step": 2112
},
{
"epoch": 0.8310717797443461,
"grad_norm": 1.3495663404464722,
"learning_rate": 4.216837752681019e-07,
"loss": 0.0589,
"step": 2113
},
{
"epoch": 0.831465093411996,
"grad_norm": 0.8407555818557739,
"learning_rate": 4.19777070257594e-07,
"loss": 0.0309,
"step": 2114
},
{
"epoch": 0.831858407079646,
"grad_norm": 0.9763451814651489,
"learning_rate": 4.1787429058660845e-07,
"loss": 0.0231,
"step": 2115
},
{
"epoch": 0.8322517207472959,
"grad_norm": 1.1487807035446167,
"learning_rate": 4.159754398456531e-07,
"loss": 0.0582,
"step": 2116
},
{
"epoch": 0.832645034414946,
"grad_norm": 0.9778567552566528,
"learning_rate": 4.14080521617822e-07,
"loss": 0.0349,
"step": 2117
},
{
"epoch": 0.8330383480825959,
"grad_norm": 1.1251294612884521,
"learning_rate": 4.121895394787881e-07,
"loss": 0.0608,
"step": 2118
},
{
"epoch": 0.8334316617502459,
"grad_norm": 0.8375036716461182,
"learning_rate": 4.103024969967981e-07,
"loss": 0.0406,
"step": 2119
},
{
"epoch": 0.8338249754178958,
"grad_norm": 1.1409391164779663,
"learning_rate": 4.084193977326625e-07,
"loss": 0.0545,
"step": 2120
},
{
"epoch": 0.8342182890855457,
"grad_norm": 1.0144537687301636,
"learning_rate": 4.0654024523975323e-07,
"loss": 0.076,
"step": 2121
},
{
"epoch": 0.8346116027531957,
"grad_norm": 1.7752301692962646,
"learning_rate": 4.0466504306399366e-07,
"loss": 0.0647,
"step": 2122
},
{
"epoch": 0.8350049164208456,
"grad_norm": 1.1848422288894653,
"learning_rate": 4.027937947438532e-07,
"loss": 0.0642,
"step": 2123
},
{
"epoch": 0.8353982300884956,
"grad_norm": 0.8530738353729248,
"learning_rate": 4.009265038103402e-07,
"loss": 0.0407,
"step": 2124
},
{
"epoch": 0.8357915437561455,
"grad_norm": 0.9213998317718506,
"learning_rate": 3.9906317378699684e-07,
"loss": 0.0306,
"step": 2125
},
{
"epoch": 0.8361848574237954,
"grad_norm": 0.8134070038795471,
"learning_rate": 3.972038081898885e-07,
"loss": 0.0378,
"step": 2126
},
{
"epoch": 0.8365781710914454,
"grad_norm": 1.0904289484024048,
"learning_rate": 3.9534841052760174e-07,
"loss": 0.032,
"step": 2127
},
{
"epoch": 0.8369714847590953,
"grad_norm": 2.0691423416137695,
"learning_rate": 3.9349698430123566e-07,
"loss": 0.0737,
"step": 2128
},
{
"epoch": 0.8373647984267454,
"grad_norm": 1.1641324758529663,
"learning_rate": 3.9164953300439456e-07,
"loss": 0.0546,
"step": 2129
},
{
"epoch": 0.8377581120943953,
"grad_norm": 0.9116164445877075,
"learning_rate": 3.898060601231832e-07,
"loss": 0.0533,
"step": 2130
},
{
"epoch": 0.8381514257620453,
"grad_norm": 1.0761325359344482,
"learning_rate": 3.879665691361975e-07,
"loss": 0.0465,
"step": 2131
},
{
"epoch": 0.8385447394296952,
"grad_norm": 1.2517597675323486,
"learning_rate": 3.861310635145207e-07,
"loss": 0.0509,
"step": 2132
},
{
"epoch": 0.8389380530973451,
"grad_norm": 0.7470773458480835,
"learning_rate": 3.8429954672171613e-07,
"loss": 0.0452,
"step": 2133
},
{
"epoch": 0.8393313667649951,
"grad_norm": 1.572190284729004,
"learning_rate": 3.824720222138192e-07,
"loss": 0.0388,
"step": 2134
},
{
"epoch": 0.839724680432645,
"grad_norm": 1.1324615478515625,
"learning_rate": 3.806484934393331e-07,
"loss": 0.0696,
"step": 2135
},
{
"epoch": 0.840117994100295,
"grad_norm": 1.03518807888031,
"learning_rate": 3.788289638392206e-07,
"loss": 0.0333,
"step": 2136
},
{
"epoch": 0.8405113077679449,
"grad_norm": 1.2855054140090942,
"learning_rate": 3.7701343684689725e-07,
"loss": 0.0573,
"step": 2137
},
{
"epoch": 0.8409046214355949,
"grad_norm": 1.5672320127487183,
"learning_rate": 3.7520191588822695e-07,
"loss": 0.0618,
"step": 2138
},
{
"epoch": 0.8412979351032448,
"grad_norm": 1.3046908378601074,
"learning_rate": 3.7339440438151383e-07,
"loss": 0.0633,
"step": 2139
},
{
"epoch": 0.8416912487708947,
"grad_norm": 0.9728895425796509,
"learning_rate": 3.7159090573749693e-07,
"loss": 0.0287,
"step": 2140
},
{
"epoch": 0.8420845624385448,
"grad_norm": 1.4470866918563843,
"learning_rate": 3.6979142335934246e-07,
"loss": 0.0439,
"step": 2141
},
{
"epoch": 0.8424778761061947,
"grad_norm": 0.802937924861908,
"learning_rate": 3.67995960642637e-07,
"loss": 0.0316,
"step": 2142
},
{
"epoch": 0.8428711897738447,
"grad_norm": 0.8089593052864075,
"learning_rate": 3.6620452097538424e-07,
"loss": 0.0506,
"step": 2143
},
{
"epoch": 0.8432645034414946,
"grad_norm": 0.9571702480316162,
"learning_rate": 3.644171077379949e-07,
"loss": 0.0273,
"step": 2144
},
{
"epoch": 0.8436578171091446,
"grad_norm": 1.022767186164856,
"learning_rate": 3.6263372430328266e-07,
"loss": 0.0497,
"step": 2145
},
{
"epoch": 0.8440511307767945,
"grad_norm": 1.133183479309082,
"learning_rate": 3.6085437403645645e-07,
"loss": 0.0375,
"step": 2146
},
{
"epoch": 0.8444444444444444,
"grad_norm": 1.603365421295166,
"learning_rate": 3.5907906029511606e-07,
"loss": 0.0535,
"step": 2147
},
{
"epoch": 0.8448377581120944,
"grad_norm": 1.052833080291748,
"learning_rate": 3.573077864292421e-07,
"loss": 0.0419,
"step": 2148
},
{
"epoch": 0.8452310717797443,
"grad_norm": 0.8957949280738831,
"learning_rate": 3.555405557811936e-07,
"loss": 0.054,
"step": 2149
},
{
"epoch": 0.8456243854473943,
"grad_norm": 1.3401049375534058,
"learning_rate": 3.537773716857004e-07,
"loss": 0.0558,
"step": 2150
},
{
"epoch": 0.8460176991150442,
"grad_norm": 1.3811299800872803,
"learning_rate": 3.5201823746985554e-07,
"loss": 0.0436,
"step": 2151
},
{
"epoch": 0.8464110127826941,
"grad_norm": 1.3221920728683472,
"learning_rate": 3.5026315645311114e-07,
"loss": 0.0679,
"step": 2152
},
{
"epoch": 0.8468043264503442,
"grad_norm": 0.608182966709137,
"learning_rate": 3.485121319472695e-07,
"loss": 0.0624,
"step": 2153
},
{
"epoch": 0.8471976401179941,
"grad_norm": 0.8964172601699829,
"learning_rate": 3.4676516725647953e-07,
"loss": 0.0394,
"step": 2154
},
{
"epoch": 0.8475909537856441,
"grad_norm": 0.7584964632987976,
"learning_rate": 3.450222656772292e-07,
"loss": 0.0484,
"step": 2155
},
{
"epoch": 0.847984267453294,
"grad_norm": 0.3789440095424652,
"learning_rate": 3.43283430498339e-07,
"loss": 0.0277,
"step": 2156
},
{
"epoch": 0.848377581120944,
"grad_norm": 0.7871941924095154,
"learning_rate": 3.4154866500095695e-07,
"loss": 0.0493,
"step": 2157
},
{
"epoch": 0.8487708947885939,
"grad_norm": 1.302708625793457,
"learning_rate": 3.3981797245855096e-07,
"loss": 0.0799,
"step": 2158
},
{
"epoch": 0.8491642084562439,
"grad_norm": 0.7635212540626526,
"learning_rate": 3.380913561369037e-07,
"loss": 0.0427,
"step": 2159
},
{
"epoch": 0.8495575221238938,
"grad_norm": 0.8605564832687378,
"learning_rate": 3.363688192941067e-07,
"loss": 0.0462,
"step": 2160
},
{
"epoch": 0.8499508357915437,
"grad_norm": 0.9630613923072815,
"learning_rate": 3.346503651805513e-07,
"loss": 0.0637,
"step": 2161
},
{
"epoch": 0.8503441494591937,
"grad_norm": 1.0170080661773682,
"learning_rate": 3.329359970389279e-07,
"loss": 0.061,
"step": 2162
},
{
"epoch": 0.8507374631268436,
"grad_norm": 0.8377442359924316,
"learning_rate": 3.312257181042142e-07,
"loss": 0.0449,
"step": 2163
},
{
"epoch": 0.8511307767944936,
"grad_norm": 0.9564546346664429,
"learning_rate": 3.2951953160367365e-07,
"loss": 0.0496,
"step": 2164
},
{
"epoch": 0.8515240904621436,
"grad_norm": 0.5969823002815247,
"learning_rate": 3.2781744075684576e-07,
"loss": 0.0404,
"step": 2165
},
{
"epoch": 0.8519174041297936,
"grad_norm": 1.0183027982711792,
"learning_rate": 3.261194487755426e-07,
"loss": 0.0563,
"step": 2166
},
{
"epoch": 0.8523107177974435,
"grad_norm": 1.3610613346099854,
"learning_rate": 3.2442555886384145e-07,
"loss": 0.0791,
"step": 2167
},
{
"epoch": 0.8527040314650934,
"grad_norm": 0.7566685080528259,
"learning_rate": 3.2273577421807976e-07,
"loss": 0.0415,
"step": 2168
},
{
"epoch": 0.8530973451327434,
"grad_norm": 1.1211597919464111,
"learning_rate": 3.2105009802684636e-07,
"loss": 0.0874,
"step": 2169
},
{
"epoch": 0.8534906588003933,
"grad_norm": 1.6669408082962036,
"learning_rate": 3.1936853347097923e-07,
"loss": 0.0521,
"step": 2170
},
{
"epoch": 0.8538839724680433,
"grad_norm": 0.9726613163948059,
"learning_rate": 3.1769108372355804e-07,
"loss": 0.0457,
"step": 2171
},
{
"epoch": 0.8542772861356932,
"grad_norm": 1.5157469511032104,
"learning_rate": 3.1601775194989693e-07,
"loss": 0.0574,
"step": 2172
},
{
"epoch": 0.8546705998033431,
"grad_norm": 2.319978713989258,
"learning_rate": 3.143485413075398e-07,
"loss": 0.0604,
"step": 2173
},
{
"epoch": 0.8550639134709931,
"grad_norm": 1.160510778427124,
"learning_rate": 3.1268345494625486e-07,
"loss": 0.0454,
"step": 2174
},
{
"epoch": 0.855457227138643,
"grad_norm": 1.0284311771392822,
"learning_rate": 3.1102249600802573e-07,
"loss": 0.0375,
"step": 2175
},
{
"epoch": 0.855850540806293,
"grad_norm": 0.7068095207214355,
"learning_rate": 3.093656676270501e-07,
"loss": 0.0409,
"step": 2176
},
{
"epoch": 0.856243854473943,
"grad_norm": 0.8698954582214355,
"learning_rate": 3.0771297292972986e-07,
"loss": 0.0547,
"step": 2177
},
{
"epoch": 0.856637168141593,
"grad_norm": 0.7371048331260681,
"learning_rate": 3.0606441503466753e-07,
"loss": 0.0661,
"step": 2178
},
{
"epoch": 0.8570304818092429,
"grad_norm": 0.6116827726364136,
"learning_rate": 3.044199970526593e-07,
"loss": 0.0199,
"step": 2179
},
{
"epoch": 0.8574237954768928,
"grad_norm": 0.9910300374031067,
"learning_rate": 3.027797220866896e-07,
"loss": 0.0454,
"step": 2180
},
{
"epoch": 0.8578171091445428,
"grad_norm": 0.9253597855567932,
"learning_rate": 3.01143593231924e-07,
"loss": 0.0465,
"step": 2181
},
{
"epoch": 0.8582104228121927,
"grad_norm": 0.6476548314094543,
"learning_rate": 2.995116135757059e-07,
"loss": 0.0385,
"step": 2182
},
{
"epoch": 0.8586037364798427,
"grad_norm": 0.8749169707298279,
"learning_rate": 2.978837861975484e-07,
"loss": 0.0474,
"step": 2183
},
{
"epoch": 0.8589970501474926,
"grad_norm": 1.4006898403167725,
"learning_rate": 2.962601141691296e-07,
"loss": 0.0511,
"step": 2184
},
{
"epoch": 0.8593903638151426,
"grad_norm": 0.8508985638618469,
"learning_rate": 2.9464060055428703e-07,
"loss": 0.0549,
"step": 2185
},
{
"epoch": 0.8597836774827925,
"grad_norm": 1.1002285480499268,
"learning_rate": 2.930252484090101e-07,
"loss": 0.0283,
"step": 2186
},
{
"epoch": 0.8601769911504424,
"grad_norm": 0.8702027201652527,
"learning_rate": 2.9141406078143644e-07,
"loss": 0.0605,
"step": 2187
},
{
"epoch": 0.8605703048180924,
"grad_norm": 0.79606693983078,
"learning_rate": 2.8980704071184557e-07,
"loss": 0.0598,
"step": 2188
},
{
"epoch": 0.8609636184857424,
"grad_norm": 1.1964335441589355,
"learning_rate": 2.882041912326525e-07,
"loss": 0.046,
"step": 2189
},
{
"epoch": 0.8613569321533924,
"grad_norm": 1.1686105728149414,
"learning_rate": 2.8660551536840277e-07,
"loss": 0.0329,
"step": 2190
},
{
"epoch": 0.8617502458210423,
"grad_norm": 0.858632504940033,
"learning_rate": 2.8501101613576526e-07,
"loss": 0.0661,
"step": 2191
},
{
"epoch": 0.8621435594886923,
"grad_norm": 0.984893262386322,
"learning_rate": 2.834206965435293e-07,
"loss": 0.0351,
"step": 2192
},
{
"epoch": 0.8625368731563422,
"grad_norm": 1.3127596378326416,
"learning_rate": 2.818345595925959e-07,
"loss": 0.0387,
"step": 2193
},
{
"epoch": 0.8629301868239921,
"grad_norm": 1.4564718008041382,
"learning_rate": 2.8025260827597463e-07,
"loss": 0.0424,
"step": 2194
},
{
"epoch": 0.8633235004916421,
"grad_norm": 0.5872806310653687,
"learning_rate": 2.7867484557877607e-07,
"loss": 0.0414,
"step": 2195
},
{
"epoch": 0.863716814159292,
"grad_norm": 1.0555849075317383,
"learning_rate": 2.7710127447820783e-07,
"loss": 0.0519,
"step": 2196
},
{
"epoch": 0.864110127826942,
"grad_norm": 1.0422883033752441,
"learning_rate": 2.7553189794356615e-07,
"loss": 0.0562,
"step": 2197
},
{
"epoch": 0.8645034414945919,
"grad_norm": 1.2551977634429932,
"learning_rate": 2.739667189362347e-07,
"loss": 0.0344,
"step": 2198
},
{
"epoch": 0.8648967551622418,
"grad_norm": 1.0713584423065186,
"learning_rate": 2.724057404096744e-07,
"loss": 0.0385,
"step": 2199
},
{
"epoch": 0.8652900688298918,
"grad_norm": 0.6667132377624512,
"learning_rate": 2.708489653094218e-07,
"loss": 0.0525,
"step": 2200
},
{
"epoch": 0.8656833824975418,
"grad_norm": 0.9178755283355713,
"learning_rate": 2.692963965730805e-07,
"loss": 0.0722,
"step": 2201
},
{
"epoch": 0.8660766961651918,
"grad_norm": 1.2695622444152832,
"learning_rate": 2.677480371303162e-07,
"loss": 0.0759,
"step": 2202
},
{
"epoch": 0.8664700098328417,
"grad_norm": 1.1370331048965454,
"learning_rate": 2.662038899028532e-07,
"loss": 0.0396,
"step": 2203
},
{
"epoch": 0.8668633235004917,
"grad_norm": 0.6956948041915894,
"learning_rate": 2.6466395780446657e-07,
"loss": 0.062,
"step": 2204
},
{
"epoch": 0.8672566371681416,
"grad_norm": 0.5956060886383057,
"learning_rate": 2.6312824374097794e-07,
"loss": 0.049,
"step": 2205
},
{
"epoch": 0.8676499508357916,
"grad_norm": 3.8347904682159424,
"learning_rate": 2.6159675061024905e-07,
"loss": 0.0654,
"step": 2206
},
{
"epoch": 0.8680432645034415,
"grad_norm": 1.0327752828598022,
"learning_rate": 2.6006948130217815e-07,
"loss": 0.024,
"step": 2207
},
{
"epoch": 0.8684365781710914,
"grad_norm": 1.1763917207717896,
"learning_rate": 2.585464386986908e-07,
"loss": 0.0487,
"step": 2208
},
{
"epoch": 0.8688298918387414,
"grad_norm": 1.6335638761520386,
"learning_rate": 2.570276256737386e-07,
"loss": 0.0451,
"step": 2209
},
{
"epoch": 0.8692232055063913,
"grad_norm": 1.1163750886917114,
"learning_rate": 2.555130450932922e-07,
"loss": 0.072,
"step": 2210
},
{
"epoch": 0.8696165191740413,
"grad_norm": 1.2412861585617065,
"learning_rate": 2.54002699815335e-07,
"loss": 0.0541,
"step": 2211
},
{
"epoch": 0.8700098328416912,
"grad_norm": 0.9547197222709656,
"learning_rate": 2.52496592689859e-07,
"loss": 0.04,
"step": 2212
},
{
"epoch": 0.8704031465093413,
"grad_norm": 1.4851540327072144,
"learning_rate": 2.5099472655885777e-07,
"loss": 0.0602,
"step": 2213
},
{
"epoch": 0.8707964601769912,
"grad_norm": 0.9040324687957764,
"learning_rate": 2.4949710425632353e-07,
"loss": 0.0395,
"step": 2214
},
{
"epoch": 0.8711897738446411,
"grad_norm": 1.1058231592178345,
"learning_rate": 2.4800372860823956e-07,
"loss": 0.0472,
"step": 2215
},
{
"epoch": 0.8715830875122911,
"grad_norm": 0.814282238483429,
"learning_rate": 2.465146024325765e-07,
"loss": 0.0541,
"step": 2216
},
{
"epoch": 0.871976401179941,
"grad_norm": 0.9722008109092712,
"learning_rate": 2.4502972853928606e-07,
"loss": 0.0581,
"step": 2217
},
{
"epoch": 0.872369714847591,
"grad_norm": 0.9943141341209412,
"learning_rate": 2.435491097302961e-07,
"loss": 0.0435,
"step": 2218
},
{
"epoch": 0.8727630285152409,
"grad_norm": 1.2543455362319946,
"learning_rate": 2.420727487995045e-07,
"loss": 0.0613,
"step": 2219
},
{
"epoch": 0.8731563421828908,
"grad_norm": 0.8473043441772461,
"learning_rate": 2.40600648532775e-07,
"loss": 0.0391,
"step": 2220
},
{
"epoch": 0.8735496558505408,
"grad_norm": 1.0976766347885132,
"learning_rate": 2.3913281170793196e-07,
"loss": 0.0341,
"step": 2221
},
{
"epoch": 0.8739429695181907,
"grad_norm": 0.765153169631958,
"learning_rate": 2.376692410947548e-07,
"loss": 0.0335,
"step": 2222
},
{
"epoch": 0.8743362831858407,
"grad_norm": 1.2966009378433228,
"learning_rate": 2.3620993945497217e-07,
"loss": 0.0571,
"step": 2223
},
{
"epoch": 0.8747295968534906,
"grad_norm": 1.0903987884521484,
"learning_rate": 2.347549095422569e-07,
"loss": 0.0602,
"step": 2224
},
{
"epoch": 0.8751229105211407,
"grad_norm": 0.9129044413566589,
"learning_rate": 2.3330415410222212e-07,
"loss": 0.0508,
"step": 2225
},
{
"epoch": 0.8755162241887906,
"grad_norm": 1.3771973848342896,
"learning_rate": 2.3185767587241447e-07,
"loss": 0.0282,
"step": 2226
},
{
"epoch": 0.8759095378564405,
"grad_norm": 1.1595170497894287,
"learning_rate": 2.3041547758230977e-07,
"loss": 0.0768,
"step": 2227
},
{
"epoch": 0.8763028515240905,
"grad_norm": 0.7576168775558472,
"learning_rate": 2.2897756195330773e-07,
"loss": 0.0296,
"step": 2228
},
{
"epoch": 0.8766961651917404,
"grad_norm": 1.2020797729492188,
"learning_rate": 2.2754393169872685e-07,
"loss": 0.0392,
"step": 2229
},
{
"epoch": 0.8770894788593904,
"grad_norm": 1.2221319675445557,
"learning_rate": 2.2611458952379872e-07,
"loss": 0.0319,
"step": 2230
},
{
"epoch": 0.8774827925270403,
"grad_norm": 1.1023682355880737,
"learning_rate": 2.246895381256639e-07,
"loss": 0.0523,
"step": 2231
},
{
"epoch": 0.8778761061946903,
"grad_norm": 1.0071845054626465,
"learning_rate": 2.232687801933664e-07,
"loss": 0.034,
"step": 2232
},
{
"epoch": 0.8782694198623402,
"grad_norm": 0.8645428419113159,
"learning_rate": 2.2185231840784778e-07,
"loss": 0.0628,
"step": 2233
},
{
"epoch": 0.8786627335299901,
"grad_norm": 0.6460661292076111,
"learning_rate": 2.204401554419444e-07,
"loss": 0.045,
"step": 2234
},
{
"epoch": 0.8790560471976401,
"grad_norm": 1.7761812210083008,
"learning_rate": 2.1903229396037896e-07,
"loss": 0.0739,
"step": 2235
},
{
"epoch": 0.87944936086529,
"grad_norm": 1.3595634698867798,
"learning_rate": 2.1762873661975825e-07,
"loss": 0.041,
"step": 2236
},
{
"epoch": 0.8798426745329401,
"grad_norm": 0.8807711601257324,
"learning_rate": 2.1622948606856765e-07,
"loss": 0.0623,
"step": 2237
},
{
"epoch": 0.88023598820059,
"grad_norm": 1.0638388395309448,
"learning_rate": 2.1483454494716504e-07,
"loss": 0.0337,
"step": 2238
},
{
"epoch": 0.88062930186824,
"grad_norm": 0.9859362244606018,
"learning_rate": 2.1344391588777658e-07,
"loss": 0.0389,
"step": 2239
},
{
"epoch": 0.8810226155358899,
"grad_norm": 1.0022567510604858,
"learning_rate": 2.1205760151449206e-07,
"loss": 0.0358,
"step": 2240
},
{
"epoch": 0.8814159292035398,
"grad_norm": 0.8748469948768616,
"learning_rate": 2.106756044432598e-07,
"loss": 0.0367,
"step": 2241
},
{
"epoch": 0.8818092428711898,
"grad_norm": 1.0613561868667603,
"learning_rate": 2.0929792728187986e-07,
"loss": 0.0608,
"step": 2242
},
{
"epoch": 0.8822025565388397,
"grad_norm": 1.8184490203857422,
"learning_rate": 2.079245726300022e-07,
"loss": 0.0586,
"step": 2243
},
{
"epoch": 0.8825958702064897,
"grad_norm": 1.0881813764572144,
"learning_rate": 2.0655554307911997e-07,
"loss": 0.0603,
"step": 2244
},
{
"epoch": 0.8829891838741396,
"grad_norm": 1.0074139833450317,
"learning_rate": 2.05190841212565e-07,
"loss": 0.0666,
"step": 2245
},
{
"epoch": 0.8833824975417895,
"grad_norm": 1.1435564756393433,
"learning_rate": 2.038304696055024e-07,
"loss": 0.0312,
"step": 2246
},
{
"epoch": 0.8837758112094395,
"grad_norm": 0.6284701228141785,
"learning_rate": 2.0247443082492686e-07,
"loss": 0.0235,
"step": 2247
},
{
"epoch": 0.8841691248770894,
"grad_norm": 1.6139885187149048,
"learning_rate": 2.0112272742965678e-07,
"loss": 0.0262,
"step": 2248
},
{
"epoch": 0.8845624385447395,
"grad_norm": 0.8762457966804504,
"learning_rate": 1.997753619703291e-07,
"loss": 0.0431,
"step": 2249
},
{
"epoch": 0.8849557522123894,
"grad_norm": 1.287406086921692,
"learning_rate": 1.9843233698939617e-07,
"loss": 0.0457,
"step": 2250
},
{
"epoch": 0.8853490658800394,
"grad_norm": 1.3118491172790527,
"learning_rate": 1.9709365502111944e-07,
"loss": 0.0487,
"step": 2251
},
{
"epoch": 0.8857423795476893,
"grad_norm": 0.8101546764373779,
"learning_rate": 1.957593185915657e-07,
"loss": 0.0458,
"step": 2252
},
{
"epoch": 0.8861356932153392,
"grad_norm": 1.5364015102386475,
"learning_rate": 1.9442933021860095e-07,
"loss": 0.0407,
"step": 2253
},
{
"epoch": 0.8865290068829892,
"grad_norm": 0.9168291091918945,
"learning_rate": 1.9310369241188732e-07,
"loss": 0.0474,
"step": 2254
},
{
"epoch": 0.8869223205506391,
"grad_norm": 1.0423481464385986,
"learning_rate": 1.9178240767287666e-07,
"loss": 0.035,
"step": 2255
},
{
"epoch": 0.8873156342182891,
"grad_norm": 0.995087742805481,
"learning_rate": 1.904654784948079e-07,
"loss": 0.0596,
"step": 2256
},
{
"epoch": 0.887708947885939,
"grad_norm": 1.1472982168197632,
"learning_rate": 1.8915290736269965e-07,
"loss": 0.069,
"step": 2257
},
{
"epoch": 0.888102261553589,
"grad_norm": 0.7572572231292725,
"learning_rate": 1.878446967533476e-07,
"loss": 0.061,
"step": 2258
},
{
"epoch": 0.8884955752212389,
"grad_norm": 0.5118011832237244,
"learning_rate": 1.865408491353199e-07,
"loss": 0.0313,
"step": 2259
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.8399426937103271,
"learning_rate": 1.8524136696895068e-07,
"loss": 0.0444,
"step": 2260
},
{
"epoch": 0.8892822025565389,
"grad_norm": 0.8290569186210632,
"learning_rate": 1.8394625270633793e-07,
"loss": 0.0384,
"step": 2261
},
{
"epoch": 0.8896755162241888,
"grad_norm": 1.0309621095657349,
"learning_rate": 1.8265550879133538e-07,
"loss": 0.0522,
"step": 2262
},
{
"epoch": 0.8900688298918388,
"grad_norm": 2.102466583251953,
"learning_rate": 1.8136913765955195e-07,
"loss": 0.0684,
"step": 2263
},
{
"epoch": 0.8904621435594887,
"grad_norm": 0.9560519456863403,
"learning_rate": 1.8008714173834456e-07,
"loss": 0.0411,
"step": 2264
},
{
"epoch": 0.8908554572271387,
"grad_norm": 0.7714261412620544,
"learning_rate": 1.7880952344681402e-07,
"loss": 0.0393,
"step": 2265
},
{
"epoch": 0.8912487708947886,
"grad_norm": 2.210777521133423,
"learning_rate": 1.7753628519580097e-07,
"loss": 0.0531,
"step": 2266
},
{
"epoch": 0.8916420845624385,
"grad_norm": 1.3124444484710693,
"learning_rate": 1.7626742938788105e-07,
"loss": 0.0808,
"step": 2267
},
{
"epoch": 0.8920353982300885,
"grad_norm": 0.8876106142997742,
"learning_rate": 1.7500295841735905e-07,
"loss": 0.0299,
"step": 2268
},
{
"epoch": 0.8924287118977384,
"grad_norm": 0.9470813870429993,
"learning_rate": 1.7374287467026767e-07,
"loss": 0.0289,
"step": 2269
},
{
"epoch": 0.8928220255653884,
"grad_norm": 1.1278401613235474,
"learning_rate": 1.7248718052435942e-07,
"loss": 0.0557,
"step": 2270
},
{
"epoch": 0.8932153392330383,
"grad_norm": 1.0883233547210693,
"learning_rate": 1.712358783491047e-07,
"loss": 0.0493,
"step": 2271
},
{
"epoch": 0.8936086529006882,
"grad_norm": 1.8595354557037354,
"learning_rate": 1.6998897050568618e-07,
"loss": 0.0583,
"step": 2272
},
{
"epoch": 0.8940019665683383,
"grad_norm": 1.1858155727386475,
"learning_rate": 1.6874645934699342e-07,
"loss": 0.0406,
"step": 2273
},
{
"epoch": 0.8943952802359882,
"grad_norm": 0.8429166674613953,
"learning_rate": 1.6750834721762117e-07,
"loss": 0.0575,
"step": 2274
},
{
"epoch": 0.8947885939036382,
"grad_norm": 1.4577648639678955,
"learning_rate": 1.6627463645386199e-07,
"loss": 0.0412,
"step": 2275
},
{
"epoch": 0.8951819075712881,
"grad_norm": 0.6947933435440063,
"learning_rate": 1.6504532938370427e-07,
"loss": 0.0465,
"step": 2276
},
{
"epoch": 0.8955752212389381,
"grad_norm": 0.8350834846496582,
"learning_rate": 1.6382042832682577e-07,
"loss": 0.0438,
"step": 2277
},
{
"epoch": 0.895968534906588,
"grad_norm": 1.2530003786087036,
"learning_rate": 1.6259993559459091e-07,
"loss": 0.0415,
"step": 2278
},
{
"epoch": 0.896361848574238,
"grad_norm": 1.0597574710845947,
"learning_rate": 1.613838534900447e-07,
"loss": 0.0399,
"step": 2279
},
{
"epoch": 0.8967551622418879,
"grad_norm": 0.8264654278755188,
"learning_rate": 1.601721843079107e-07,
"loss": 0.0348,
"step": 2280
},
{
"epoch": 0.8971484759095378,
"grad_norm": 0.8567057251930237,
"learning_rate": 1.5896493033458416e-07,
"loss": 0.029,
"step": 2281
},
{
"epoch": 0.8975417895771878,
"grad_norm": 1.390363335609436,
"learning_rate": 1.5776209384812946e-07,
"loss": 0.0815,
"step": 2282
},
{
"epoch": 0.8979351032448377,
"grad_norm": 0.9575844407081604,
"learning_rate": 1.5656367711827602e-07,
"loss": 0.0526,
"step": 2283
},
{
"epoch": 0.8983284169124877,
"grad_norm": 0.7833372950553894,
"learning_rate": 1.553696824064116e-07,
"loss": 0.0329,
"step": 2284
},
{
"epoch": 0.8987217305801377,
"grad_norm": 0.8829760551452637,
"learning_rate": 1.5418011196558085e-07,
"loss": 0.0395,
"step": 2285
},
{
"epoch": 0.8991150442477877,
"grad_norm": 1.0580815076828003,
"learning_rate": 1.529949680404799e-07,
"loss": 0.0648,
"step": 2286
},
{
"epoch": 0.8995083579154376,
"grad_norm": 1.051527738571167,
"learning_rate": 1.5181425286745155e-07,
"loss": 0.0618,
"step": 2287
},
{
"epoch": 0.8999016715830875,
"grad_norm": 1.5211282968521118,
"learning_rate": 1.5063796867448243e-07,
"loss": 0.047,
"step": 2288
},
{
"epoch": 0.9002949852507375,
"grad_norm": 0.3931565582752228,
"learning_rate": 1.4946611768119763e-07,
"loss": 0.0371,
"step": 2289
},
{
"epoch": 0.9006882989183874,
"grad_norm": 0.40819835662841797,
"learning_rate": 1.4829870209885605e-07,
"loss": 0.0399,
"step": 2290
},
{
"epoch": 0.9010816125860374,
"grad_norm": 1.5606259107589722,
"learning_rate": 1.471357241303481e-07,
"loss": 0.0537,
"step": 2291
},
{
"epoch": 0.9014749262536873,
"grad_norm": 0.4650862514972687,
"learning_rate": 1.4597718597019055e-07,
"loss": 0.0169,
"step": 2292
},
{
"epoch": 0.9018682399213372,
"grad_norm": 0.8470922112464905,
"learning_rate": 1.4482308980452164e-07,
"loss": 0.0308,
"step": 2293
},
{
"epoch": 0.9022615535889872,
"grad_norm": 1.1515922546386719,
"learning_rate": 1.436734378110985e-07,
"loss": 0.0459,
"step": 2294
},
{
"epoch": 0.9026548672566371,
"grad_norm": 1.0158207416534424,
"learning_rate": 1.425282321592908e-07,
"loss": 0.0667,
"step": 2295
},
{
"epoch": 0.9030481809242871,
"grad_norm": 0.6387980580329895,
"learning_rate": 1.4138747501007966e-07,
"loss": 0.0419,
"step": 2296
},
{
"epoch": 0.9034414945919371,
"grad_norm": 1.8949992656707764,
"learning_rate": 1.4025116851605125e-07,
"loss": 0.0556,
"step": 2297
},
{
"epoch": 0.9038348082595871,
"grad_norm": 0.8390710949897766,
"learning_rate": 1.3911931482139317e-07,
"loss": 0.0322,
"step": 2298
},
{
"epoch": 0.904228121927237,
"grad_norm": 0.6234549880027771,
"learning_rate": 1.379919160618909e-07,
"loss": 0.0334,
"step": 2299
},
{
"epoch": 0.904621435594887,
"grad_norm": 1.1114718914031982,
"learning_rate": 1.368689743649243e-07,
"loss": 0.0536,
"step": 2300
},
{
"epoch": 0.9050147492625369,
"grad_norm": 0.7461351752281189,
"learning_rate": 1.3575049184946122e-07,
"loss": 0.0371,
"step": 2301
},
{
"epoch": 0.9054080629301868,
"grad_norm": 0.9355785250663757,
"learning_rate": 1.346364706260564e-07,
"loss": 0.0296,
"step": 2302
},
{
"epoch": 0.9058013765978368,
"grad_norm": 0.5872256755828857,
"learning_rate": 1.3352691279684582e-07,
"loss": 0.0281,
"step": 2303
},
{
"epoch": 0.9061946902654867,
"grad_norm": 1.7544050216674805,
"learning_rate": 1.324218204555433e-07,
"loss": 0.056,
"step": 2304
},
{
"epoch": 0.9065880039331367,
"grad_norm": 0.6219866871833801,
"learning_rate": 1.3132119568743662e-07,
"loss": 0.0288,
"step": 2305
},
{
"epoch": 0.9069813176007866,
"grad_norm": 1.4340651035308838,
"learning_rate": 1.3022504056938196e-07,
"loss": 0.0504,
"step": 2306
},
{
"epoch": 0.9073746312684365,
"grad_norm": 0.5100427269935608,
"learning_rate": 1.2913335716980307e-07,
"loss": 0.0473,
"step": 2307
},
{
"epoch": 0.9077679449360865,
"grad_norm": 0.650513768196106,
"learning_rate": 1.2804614754868466e-07,
"loss": 0.0537,
"step": 2308
},
{
"epoch": 0.9081612586037365,
"grad_norm": 1.4720587730407715,
"learning_rate": 1.2696341375756982e-07,
"loss": 0.043,
"step": 2309
},
{
"epoch": 0.9085545722713865,
"grad_norm": 1.7473880052566528,
"learning_rate": 1.2588515783955564e-07,
"loss": 0.0551,
"step": 2310
},
{
"epoch": 0.9089478859390364,
"grad_norm": 0.7824367880821228,
"learning_rate": 1.2481138182929065e-07,
"loss": 0.0299,
"step": 2311
},
{
"epoch": 0.9093411996066864,
"grad_norm": 1.2818101644515991,
"learning_rate": 1.2374208775296742e-07,
"loss": 0.0664,
"step": 2312
},
{
"epoch": 0.9097345132743363,
"grad_norm": 1.6559642553329468,
"learning_rate": 1.2267727762832388e-07,
"loss": 0.0667,
"step": 2313
},
{
"epoch": 0.9101278269419862,
"grad_norm": 0.8255678415298462,
"learning_rate": 1.2161695346463498e-07,
"loss": 0.042,
"step": 2314
},
{
"epoch": 0.9105211406096362,
"grad_norm": 0.7617945075035095,
"learning_rate": 1.2056111726271192e-07,
"loss": 0.0464,
"step": 2315
},
{
"epoch": 0.9109144542772861,
"grad_norm": 1.3965145349502563,
"learning_rate": 1.195097710148968e-07,
"loss": 0.039,
"step": 2316
},
{
"epoch": 0.9113077679449361,
"grad_norm": 1.3296297788619995,
"learning_rate": 1.1846291670505855e-07,
"loss": 0.0552,
"step": 2317
},
{
"epoch": 0.911701081612586,
"grad_norm": 0.7849988341331482,
"learning_rate": 1.1742055630859117e-07,
"loss": 0.0338,
"step": 2318
},
{
"epoch": 0.912094395280236,
"grad_norm": 2.0398993492126465,
"learning_rate": 1.1638269179240796e-07,
"loss": 0.0542,
"step": 2319
},
{
"epoch": 0.9124877089478859,
"grad_norm": 0.7769688367843628,
"learning_rate": 1.1534932511493846e-07,
"loss": 0.0343,
"step": 2320
},
{
"epoch": 0.9128810226155359,
"grad_norm": 0.6311588287353516,
"learning_rate": 1.1432045822612564e-07,
"loss": 0.0483,
"step": 2321
},
{
"epoch": 0.9132743362831859,
"grad_norm": 0.9618848562240601,
"learning_rate": 1.132960930674204e-07,
"loss": 0.0498,
"step": 2322
},
{
"epoch": 0.9136676499508358,
"grad_norm": 0.8956164121627808,
"learning_rate": 1.1227623157177986e-07,
"loss": 0.0316,
"step": 2323
},
{
"epoch": 0.9140609636184858,
"grad_norm": 1.1387652158737183,
"learning_rate": 1.1126087566366266e-07,
"loss": 0.0669,
"step": 2324
},
{
"epoch": 0.9144542772861357,
"grad_norm": 0.7763038277626038,
"learning_rate": 1.1025002725902484e-07,
"loss": 0.0512,
"step": 2325
},
{
"epoch": 0.9148475909537856,
"grad_norm": 1.52693510055542,
"learning_rate": 1.0924368826531751e-07,
"loss": 0.0745,
"step": 2326
},
{
"epoch": 0.9152409046214356,
"grad_norm": 1.1928157806396484,
"learning_rate": 1.0824186058148278e-07,
"loss": 0.047,
"step": 2327
},
{
"epoch": 0.9156342182890855,
"grad_norm": 0.6993405818939209,
"learning_rate": 1.0724454609794931e-07,
"loss": 0.0258,
"step": 2328
},
{
"epoch": 0.9160275319567355,
"grad_norm": 0.8654144406318665,
"learning_rate": 1.0625174669663036e-07,
"loss": 0.0493,
"step": 2329
},
{
"epoch": 0.9164208456243854,
"grad_norm": 1.6443697214126587,
"learning_rate": 1.0526346425091815e-07,
"loss": 0.0641,
"step": 2330
},
{
"epoch": 0.9168141592920354,
"grad_norm": 2.2090344429016113,
"learning_rate": 1.042797006256821e-07,
"loss": 0.0916,
"step": 2331
},
{
"epoch": 0.9172074729596853,
"grad_norm": 1.2032400369644165,
"learning_rate": 1.0330045767726504e-07,
"loss": 0.043,
"step": 2332
},
{
"epoch": 0.9176007866273354,
"grad_norm": 1.0382981300354004,
"learning_rate": 1.023257372534786e-07,
"loss": 0.0478,
"step": 2333
},
{
"epoch": 0.9179941002949853,
"grad_norm": 1.3554562330245972,
"learning_rate": 1.0135554119360153e-07,
"loss": 0.076,
"step": 2334
},
{
"epoch": 0.9183874139626352,
"grad_norm": 0.7670255899429321,
"learning_rate": 1.0038987132837435e-07,
"loss": 0.0666,
"step": 2335
},
{
"epoch": 0.9187807276302852,
"grad_norm": 1.3432739973068237,
"learning_rate": 9.942872947999672e-08,
"loss": 0.0472,
"step": 2336
},
{
"epoch": 0.9191740412979351,
"grad_norm": 0.7896971702575684,
"learning_rate": 9.847211746212504e-08,
"loss": 0.0636,
"step": 2337
},
{
"epoch": 0.9195673549655851,
"grad_norm": 0.7464331388473511,
"learning_rate": 9.752003707986652e-08,
"loss": 0.036,
"step": 2338
},
{
"epoch": 0.919960668633235,
"grad_norm": 1.4482289552688599,
"learning_rate": 9.657249012977821e-08,
"loss": 0.047,
"step": 2339
},
{
"epoch": 0.9203539823008849,
"grad_norm": 0.7451487183570862,
"learning_rate": 9.562947839986264e-08,
"loss": 0.0516,
"step": 2340
},
{
"epoch": 0.9207472959685349,
"grad_norm": 1.0219905376434326,
"learning_rate": 9.469100366956391e-08,
"loss": 0.0515,
"step": 2341
},
{
"epoch": 0.9211406096361848,
"grad_norm": 0.776695966720581,
"learning_rate": 9.375706770976573e-08,
"loss": 0.0289,
"step": 2342
},
{
"epoch": 0.9215339233038348,
"grad_norm": 0.9781972169876099,
"learning_rate": 9.282767228278672e-08,
"loss": 0.0767,
"step": 2343
},
{
"epoch": 0.9219272369714847,
"grad_norm": 1.0278164148330688,
"learning_rate": 9.190281914237736e-08,
"loss": 0.0333,
"step": 2344
},
{
"epoch": 0.9223205506391348,
"grad_norm": 1.5040227174758911,
"learning_rate": 9.09825100337175e-08,
"loss": 0.0788,
"step": 2345
},
{
"epoch": 0.9227138643067847,
"grad_norm": 1.5312731266021729,
"learning_rate": 9.006674669341214e-08,
"loss": 0.0744,
"step": 2346
},
{
"epoch": 0.9231071779744346,
"grad_norm": 1.6249146461486816,
"learning_rate": 8.915553084948847e-08,
"loss": 0.0442,
"step": 2347
},
{
"epoch": 0.9235004916420846,
"grad_norm": 1.0247668027877808,
"learning_rate": 8.824886422139273e-08,
"loss": 0.0621,
"step": 2348
},
{
"epoch": 0.9238938053097345,
"grad_norm": 1.506390929222107,
"learning_rate": 8.734674851998748e-08,
"loss": 0.0755,
"step": 2349
},
{
"epoch": 0.9242871189773845,
"grad_norm": 0.8823897838592529,
"learning_rate": 8.64491854475466e-08,
"loss": 0.0637,
"step": 2350
},
{
"epoch": 0.9246804326450344,
"grad_norm": 0.7110940217971802,
"learning_rate": 8.55561766977539e-08,
"loss": 0.0326,
"step": 2351
},
{
"epoch": 0.9250737463126844,
"grad_norm": 0.5734057426452637,
"learning_rate": 8.46677239556995e-08,
"loss": 0.0305,
"step": 2352
},
{
"epoch": 0.9254670599803343,
"grad_norm": 0.8686132431030273,
"learning_rate": 8.378382889787596e-08,
"loss": 0.0405,
"step": 2353
},
{
"epoch": 0.9258603736479842,
"grad_norm": 1.6284774541854858,
"learning_rate": 8.290449319217603e-08,
"loss": 0.0583,
"step": 2354
},
{
"epoch": 0.9262536873156342,
"grad_norm": 1.2678624391555786,
"learning_rate": 8.202971849788854e-08,
"loss": 0.0474,
"step": 2355
},
{
"epoch": 0.9266470009832841,
"grad_norm": 1.2101284265518188,
"learning_rate": 8.115950646569587e-08,
"loss": 0.0391,
"step": 2356
},
{
"epoch": 0.9270403146509342,
"grad_norm": 0.6382131576538086,
"learning_rate": 8.029385873767115e-08,
"loss": 0.0512,
"step": 2357
},
{
"epoch": 0.9274336283185841,
"grad_norm": 1.0339092016220093,
"learning_rate": 7.943277694727469e-08,
"loss": 0.0528,
"step": 2358
},
{
"epoch": 0.927826941986234,
"grad_norm": 0.7545960545539856,
"learning_rate": 7.857626271935037e-08,
"loss": 0.0418,
"step": 2359
},
{
"epoch": 0.928220255653884,
"grad_norm": 0.9588167071342468,
"learning_rate": 7.772431767012423e-08,
"loss": 0.0552,
"step": 2360
},
{
"epoch": 0.9286135693215339,
"grad_norm": 0.7952490448951721,
"learning_rate": 7.68769434071992e-08,
"loss": 0.0431,
"step": 2361
},
{
"epoch": 0.9290068829891839,
"grad_norm": 1.0601327419281006,
"learning_rate": 7.603414152955374e-08,
"loss": 0.0262,
"step": 2362
},
{
"epoch": 0.9294001966568338,
"grad_norm": 0.8356077075004578,
"learning_rate": 7.519591362753848e-08,
"loss": 0.0309,
"step": 2363
},
{
"epoch": 0.9297935103244838,
"grad_norm": 1.068089246749878,
"learning_rate": 7.436226128287288e-08,
"loss": 0.0374,
"step": 2364
},
{
"epoch": 0.9301868239921337,
"grad_norm": 1.1383631229400635,
"learning_rate": 7.35331860686428e-08,
"loss": 0.0515,
"step": 2365
},
{
"epoch": 0.9305801376597836,
"grad_norm": 0.9927535653114319,
"learning_rate": 7.270868954929595e-08,
"loss": 0.056,
"step": 2366
},
{
"epoch": 0.9309734513274336,
"grad_norm": 0.6153873801231384,
"learning_rate": 7.188877328064142e-08,
"loss": 0.0437,
"step": 2367
},
{
"epoch": 0.9313667649950835,
"grad_norm": 0.8163816928863525,
"learning_rate": 7.107343880984496e-08,
"loss": 0.0541,
"step": 2368
},
{
"epoch": 0.9317600786627336,
"grad_norm": 1.144721269607544,
"learning_rate": 7.026268767542671e-08,
"loss": 0.055,
"step": 2369
},
{
"epoch": 0.9321533923303835,
"grad_norm": 0.9538362622261047,
"learning_rate": 6.94565214072579e-08,
"loss": 0.0845,
"step": 2370
},
{
"epoch": 0.9325467059980335,
"grad_norm": 1.0417604446411133,
"learning_rate": 6.86549415265586e-08,
"loss": 0.054,
"step": 2371
},
{
"epoch": 0.9329400196656834,
"grad_norm": 0.8085368275642395,
"learning_rate": 6.785794954589365e-08,
"loss": 0.0338,
"step": 2372
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.6007797718048096,
"learning_rate": 6.706554696917139e-08,
"loss": 0.0314,
"step": 2373
},
{
"epoch": 0.9337266470009833,
"grad_norm": 0.8648099303245544,
"learning_rate": 6.627773529163994e-08,
"loss": 0.0302,
"step": 2374
},
{
"epoch": 0.9341199606686332,
"grad_norm": 0.5465229749679565,
"learning_rate": 6.549451599988432e-08,
"loss": 0.0359,
"step": 2375
},
{
"epoch": 0.9345132743362832,
"grad_norm": 0.6655777096748352,
"learning_rate": 6.471589057182398e-08,
"loss": 0.0435,
"step": 2376
},
{
"epoch": 0.9349065880039331,
"grad_norm": 1.1010547876358032,
"learning_rate": 6.394186047670947e-08,
"loss": 0.0377,
"step": 2377
},
{
"epoch": 0.9352999016715831,
"grad_norm": 0.7519053816795349,
"learning_rate": 6.317242717511995e-08,
"loss": 0.033,
"step": 2378
},
{
"epoch": 0.935693215339233,
"grad_norm": 0.8617828488349915,
"learning_rate": 6.240759211896153e-08,
"loss": 0.0434,
"step": 2379
},
{
"epoch": 0.9360865290068829,
"grad_norm": 1.5556560754776,
"learning_rate": 6.16473567514625e-08,
"loss": 0.0893,
"step": 2380
},
{
"epoch": 0.936479842674533,
"grad_norm": 1.6594090461730957,
"learning_rate": 6.089172250717201e-08,
"loss": 0.0667,
"step": 2381
},
{
"epoch": 0.9368731563421829,
"grad_norm": 0.7117483019828796,
"learning_rate": 6.014069081195673e-08,
"loss": 0.0256,
"step": 2382
},
{
"epoch": 0.9372664700098329,
"grad_norm": 0.8783112168312073,
"learning_rate": 5.9394263082998836e-08,
"loss": 0.0439,
"step": 2383
},
{
"epoch": 0.9376597836774828,
"grad_norm": 0.73135906457901,
"learning_rate": 5.8652440728792504e-08,
"loss": 0.0514,
"step": 2384
},
{
"epoch": 0.9380530973451328,
"grad_norm": 0.5708735585212708,
"learning_rate": 5.791522514914216e-08,
"loss": 0.0332,
"step": 2385
},
{
"epoch": 0.9384464110127827,
"grad_norm": 1.1698683500289917,
"learning_rate": 5.718261773515865e-08,
"loss": 0.026,
"step": 2386
},
{
"epoch": 0.9388397246804326,
"grad_norm": 0.8288942575454712,
"learning_rate": 5.64546198692581e-08,
"loss": 0.0401,
"step": 2387
},
{
"epoch": 0.9392330383480826,
"grad_norm": 1.1005017757415771,
"learning_rate": 5.573123292515775e-08,
"loss": 0.0625,
"step": 2388
},
{
"epoch": 0.9396263520157325,
"grad_norm": 1.4169667959213257,
"learning_rate": 5.50124582678746e-08,
"loss": 0.0561,
"step": 2389
},
{
"epoch": 0.9400196656833825,
"grad_norm": 1.8534727096557617,
"learning_rate": 5.429829725372204e-08,
"loss": 0.0563,
"step": 2390
},
{
"epoch": 0.9404129793510324,
"grad_norm": 0.49012327194213867,
"learning_rate": 5.3588751230307935e-08,
"loss": 0.0371,
"step": 2391
},
{
"epoch": 0.9408062930186823,
"grad_norm": 1.5290131568908691,
"learning_rate": 5.2883821536531545e-08,
"loss": 0.0471,
"step": 2392
},
{
"epoch": 0.9411996066863324,
"grad_norm": 0.37540706992149353,
"learning_rate": 5.218350950258133e-08,
"loss": 0.0224,
"step": 2393
},
{
"epoch": 0.9415929203539823,
"grad_norm": 1.6441450119018555,
"learning_rate": 5.1487816449932174e-08,
"loss": 0.0545,
"step": 2394
},
{
"epoch": 0.9419862340216323,
"grad_norm": 0.8181889057159424,
"learning_rate": 5.079674369134313e-08,
"loss": 0.0528,
"step": 2395
},
{
"epoch": 0.9423795476892822,
"grad_norm": 1.6283776760101318,
"learning_rate": 5.0110292530854696e-08,
"loss": 0.0528,
"step": 2396
},
{
"epoch": 0.9427728613569322,
"grad_norm": 4.418090343475342,
"learning_rate": 4.942846426378683e-08,
"loss": 0.052,
"step": 2397
},
{
"epoch": 0.9431661750245821,
"grad_norm": 0.9668748378753662,
"learning_rate": 4.875126017673593e-08,
"loss": 0.0441,
"step": 2398
},
{
"epoch": 0.943559488692232,
"grad_norm": 1.2723820209503174,
"learning_rate": 4.807868154757284e-08,
"loss": 0.0504,
"step": 2399
},
{
"epoch": 0.943952802359882,
"grad_norm": 1.2000619173049927,
"learning_rate": 4.741072964543958e-08,
"loss": 0.0669,
"step": 2400
},
{
"epoch": 0.9443461160275319,
"grad_norm": 1.4198737144470215,
"learning_rate": 4.6747405730748765e-08,
"loss": 0.0768,
"step": 2401
},
{
"epoch": 0.9447394296951819,
"grad_norm": 0.5707858800888062,
"learning_rate": 4.6088711055179426e-08,
"loss": 0.0363,
"step": 2402
},
{
"epoch": 0.9451327433628318,
"grad_norm": 0.9884591698646545,
"learning_rate": 4.543464686167537e-08,
"loss": 0.0617,
"step": 2403
},
{
"epoch": 0.9455260570304818,
"grad_norm": 1.1140447854995728,
"learning_rate": 4.478521438444267e-08,
"loss": 0.0307,
"step": 2404
},
{
"epoch": 0.9459193706981318,
"grad_norm": 1.7241696119308472,
"learning_rate": 4.414041484894743e-08,
"loss": 0.0468,
"step": 2405
},
{
"epoch": 0.9463126843657818,
"grad_norm": 1.4963939189910889,
"learning_rate": 4.3500249471913616e-08,
"loss": 0.0424,
"step": 2406
},
{
"epoch": 0.9467059980334317,
"grad_norm": 1.4940134286880493,
"learning_rate": 4.2864719461321036e-08,
"loss": 0.062,
"step": 2407
},
{
"epoch": 0.9470993117010816,
"grad_norm": 1.2279117107391357,
"learning_rate": 4.223382601640208e-08,
"loss": 0.0557,
"step": 2408
},
{
"epoch": 0.9474926253687316,
"grad_norm": 0.5514369606971741,
"learning_rate": 4.160757032764001e-08,
"loss": 0.0211,
"step": 2409
},
{
"epoch": 0.9478859390363815,
"grad_norm": 1.1696200370788574,
"learning_rate": 4.098595357676732e-08,
"loss": 0.0525,
"step": 2410
},
{
"epoch": 0.9482792527040315,
"grad_norm": 1.4047200679779053,
"learning_rate": 4.036897693676184e-08,
"loss": 0.0582,
"step": 2411
},
{
"epoch": 0.9486725663716814,
"grad_norm": 0.9069812893867493,
"learning_rate": 3.9756641571847e-08,
"loss": 0.0451,
"step": 2412
},
{
"epoch": 0.9490658800393313,
"grad_norm": 0.7696250677108765,
"learning_rate": 3.914894863748714e-08,
"loss": 0.0596,
"step": 2413
},
{
"epoch": 0.9494591937069813,
"grad_norm": 1.0009849071502686,
"learning_rate": 3.854589928038666e-08,
"loss": 0.0531,
"step": 2414
},
{
"epoch": 0.9498525073746312,
"grad_norm": 0.6316270232200623,
"learning_rate": 3.794749463848835e-08,
"loss": 0.0261,
"step": 2415
},
{
"epoch": 0.9502458210422812,
"grad_norm": 1.1284974813461304,
"learning_rate": 3.735373584096924e-08,
"loss": 0.0485,
"step": 2416
},
{
"epoch": 0.9506391347099312,
"grad_norm": 0.744842529296875,
"learning_rate": 3.676462400824088e-08,
"loss": 0.0437,
"step": 2417
},
{
"epoch": 0.9510324483775812,
"grad_norm": 1.1578047275543213,
"learning_rate": 3.618016025194598e-08,
"loss": 0.0458,
"step": 2418
},
{
"epoch": 0.9514257620452311,
"grad_norm": 1.029968023300171,
"learning_rate": 3.560034567495513e-08,
"loss": 0.063,
"step": 2419
},
{
"epoch": 0.951819075712881,
"grad_norm": 0.8940306305885315,
"learning_rate": 3.5025181371367844e-08,
"loss": 0.0583,
"step": 2420
},
{
"epoch": 0.952212389380531,
"grad_norm": 1.1246992349624634,
"learning_rate": 3.4454668426507076e-08,
"loss": 0.0446,
"step": 2421
},
{
"epoch": 0.9526057030481809,
"grad_norm": 1.069629192352295,
"learning_rate": 3.388880791692001e-08,
"loss": 0.0422,
"step": 2422
},
{
"epoch": 0.9529990167158309,
"grad_norm": 1.080478549003601,
"learning_rate": 3.33276009103739e-08,
"loss": 0.0547,
"step": 2423
},
{
"epoch": 0.9533923303834808,
"grad_norm": 1.105726718902588,
"learning_rate": 3.2771048465855546e-08,
"loss": 0.0478,
"step": 2424
},
{
"epoch": 0.9537856440511308,
"grad_norm": 0.9557194709777832,
"learning_rate": 3.221915163356848e-08,
"loss": 0.0454,
"step": 2425
},
{
"epoch": 0.9541789577187807,
"grad_norm": 0.7306869626045227,
"learning_rate": 3.167191145493076e-08,
"loss": 0.0306,
"step": 2426
},
{
"epoch": 0.9545722713864306,
"grad_norm": 0.9311756491661072,
"learning_rate": 3.1129328962573865e-08,
"loss": 0.0378,
"step": 2427
},
{
"epoch": 0.9549655850540806,
"grad_norm": 1.6339657306671143,
"learning_rate": 3.05914051803402e-08,
"loss": 0.053,
"step": 2428
},
{
"epoch": 0.9553588987217306,
"grad_norm": 1.5211260318756104,
"learning_rate": 3.005814112328143e-08,
"loss": 0.0408,
"step": 2429
},
{
"epoch": 0.9557522123893806,
"grad_norm": 1.1606007814407349,
"learning_rate": 2.9529537797656215e-08,
"loss": 0.0531,
"step": 2430
},
{
"epoch": 0.9561455260570305,
"grad_norm": 0.5916828513145447,
"learning_rate": 2.900559620092891e-08,
"loss": 0.0625,
"step": 2431
},
{
"epoch": 0.9565388397246805,
"grad_norm": 0.49938130378723145,
"learning_rate": 2.8486317321766432e-08,
"loss": 0.0395,
"step": 2432
},
{
"epoch": 0.9569321533923304,
"grad_norm": 1.587057113647461,
"learning_rate": 2.797170214003775e-08,
"loss": 0.1053,
"step": 2433
},
{
"epoch": 0.9573254670599803,
"grad_norm": 1.176936149597168,
"learning_rate": 2.7461751626811916e-08,
"loss": 0.0462,
"step": 2434
},
{
"epoch": 0.9577187807276303,
"grad_norm": 0.5434470176696777,
"learning_rate": 2.6956466744355315e-08,
"loss": 0.0268,
"step": 2435
},
{
"epoch": 0.9581120943952802,
"grad_norm": 0.6117231845855713,
"learning_rate": 2.6455848446130526e-08,
"loss": 0.0572,
"step": 2436
},
{
"epoch": 0.9585054080629302,
"grad_norm": 1.2302024364471436,
"learning_rate": 2.5959897676794134e-08,
"loss": 0.0613,
"step": 2437
},
{
"epoch": 0.9588987217305801,
"grad_norm": 1.686108946800232,
"learning_rate": 2.546861537219586e-08,
"loss": 0.0726,
"step": 2438
},
{
"epoch": 0.95929203539823,
"grad_norm": 0.9010059833526611,
"learning_rate": 2.4982002459375265e-08,
"loss": 0.0356,
"step": 2439
},
{
"epoch": 0.95968534906588,
"grad_norm": 0.7760159373283386,
"learning_rate": 2.450005985656173e-08,
"loss": 0.0376,
"step": 2440
},
{
"epoch": 0.96007866273353,
"grad_norm": 0.788345456123352,
"learning_rate": 2.4022788473170853e-08,
"loss": 0.0657,
"step": 2441
},
{
"epoch": 0.96047197640118,
"grad_norm": 0.8711709976196289,
"learning_rate": 2.355018920980501e-08,
"loss": 0.0444,
"step": 2442
},
{
"epoch": 0.9608652900688299,
"grad_norm": 0.6124730110168457,
"learning_rate": 2.308226295824917e-08,
"loss": 0.0542,
"step": 2443
},
{
"epoch": 0.9612586037364799,
"grad_norm": 1.0837171077728271,
"learning_rate": 2.2619010601470925e-08,
"loss": 0.0577,
"step": 2444
},
{
"epoch": 0.9616519174041298,
"grad_norm": 1.9453260898590088,
"learning_rate": 2.2160433013618533e-08,
"loss": 0.058,
"step": 2445
},
{
"epoch": 0.9620452310717797,
"grad_norm": 0.8556208610534668,
"learning_rate": 2.170653106001841e-08,
"loss": 0.0281,
"step": 2446
},
{
"epoch": 0.9624385447394297,
"grad_norm": 0.9196289777755737,
"learning_rate": 2.1257305597175428e-08,
"loss": 0.0414,
"step": 2447
},
{
"epoch": 0.9628318584070796,
"grad_norm": 1.5880217552185059,
"learning_rate": 2.0812757472768175e-08,
"loss": 0.0496,
"step": 2448
},
{
"epoch": 0.9632251720747296,
"grad_norm": 1.4076353311538696,
"learning_rate": 2.037288752565064e-08,
"loss": 0.049,
"step": 2449
},
{
"epoch": 0.9636184857423795,
"grad_norm": 0.8668321967124939,
"learning_rate": 1.99376965858486e-08,
"loss": 0.0606,
"step": 2450
},
{
"epoch": 0.9640117994100295,
"grad_norm": 0.7461321353912354,
"learning_rate": 1.9507185474558765e-08,
"loss": 0.0343,
"step": 2451
},
{
"epoch": 0.9644051130776794,
"grad_norm": 0.6470179557800293,
"learning_rate": 1.908135500414743e-08,
"loss": 0.0334,
"step": 2452
},
{
"epoch": 0.9647984267453295,
"grad_norm": 1.0918750762939453,
"learning_rate": 1.866020597814766e-08,
"loss": 0.0451,
"step": 2453
},
{
"epoch": 0.9651917404129794,
"grad_norm": 0.6877756118774414,
"learning_rate": 1.8243739191259603e-08,
"loss": 0.0397,
"step": 2454
},
{
"epoch": 0.9655850540806293,
"grad_norm": 0.9845160245895386,
"learning_rate": 1.7831955429348235e-08,
"loss": 0.0227,
"step": 2455
},
{
"epoch": 0.9659783677482793,
"grad_norm": 1.178027629852295,
"learning_rate": 1.7424855469440617e-08,
"loss": 0.0941,
"step": 2456
},
{
"epoch": 0.9663716814159292,
"grad_norm": 1.0678149461746216,
"learning_rate": 1.7022440079726976e-08,
"loss": 0.0519,
"step": 2457
},
{
"epoch": 0.9667649950835792,
"grad_norm": 0.7598469257354736,
"learning_rate": 1.6624710019556844e-08,
"loss": 0.0303,
"step": 2458
},
{
"epoch": 0.9671583087512291,
"grad_norm": 1.8913023471832275,
"learning_rate": 1.623166603943932e-08,
"loss": 0.0573,
"step": 2459
},
{
"epoch": 0.967551622418879,
"grad_norm": 0.8094140887260437,
"learning_rate": 1.584330888104002e-08,
"loss": 0.0454,
"step": 2460
},
{
"epoch": 0.967944936086529,
"grad_norm": 1.0645431280136108,
"learning_rate": 1.5459639277181637e-08,
"loss": 0.0482,
"step": 2461
},
{
"epoch": 0.9683382497541789,
"grad_norm": 1.1675747632980347,
"learning_rate": 1.508065795184116e-08,
"loss": 0.0587,
"step": 2462
},
{
"epoch": 0.9687315634218289,
"grad_norm": 1.6579506397247314,
"learning_rate": 1.4706365620149043e-08,
"loss": 0.0389,
"step": 2463
},
{
"epoch": 0.9691248770894788,
"grad_norm": 1.4258586168289185,
"learning_rate": 1.433676298838671e-08,
"loss": 0.0571,
"step": 2464
},
{
"epoch": 0.9695181907571289,
"grad_norm": 1.555445671081543,
"learning_rate": 1.3971850753987936e-08,
"loss": 0.0561,
"step": 2465
},
{
"epoch": 0.9699115044247788,
"grad_norm": 1.851238489151001,
"learning_rate": 1.3611629605534139e-08,
"loss": 0.0614,
"step": 2466
},
{
"epoch": 0.9703048180924287,
"grad_norm": 1.4167311191558838,
"learning_rate": 1.325610022275603e-08,
"loss": 0.0541,
"step": 2467
},
{
"epoch": 0.9706981317600787,
"grad_norm": 1.103963017463684,
"learning_rate": 1.29052632765303e-08,
"loss": 0.0515,
"step": 2468
},
{
"epoch": 0.9710914454277286,
"grad_norm": 0.8383644819259644,
"learning_rate": 1.2559119428879607e-08,
"loss": 0.0439,
"step": 2469
},
{
"epoch": 0.9714847590953786,
"grad_norm": 1.5626074075698853,
"learning_rate": 1.2217669332970084e-08,
"loss": 0.0358,
"step": 2470
},
{
"epoch": 0.9718780727630285,
"grad_norm": 0.965404748916626,
"learning_rate": 1.1880913633111335e-08,
"loss": 0.0588,
"step": 2471
},
{
"epoch": 0.9722713864306785,
"grad_norm": 1.2146902084350586,
"learning_rate": 1.1548852964755053e-08,
"loss": 0.0473,
"step": 2472
},
{
"epoch": 0.9726647000983284,
"grad_norm": 1.4855893850326538,
"learning_rate": 1.122148795449307e-08,
"loss": 0.0543,
"step": 2473
},
{
"epoch": 0.9730580137659783,
"grad_norm": 1.1908034086227417,
"learning_rate": 1.0898819220056811e-08,
"loss": 0.0486,
"step": 2474
},
{
"epoch": 0.9734513274336283,
"grad_norm": 1.0501704216003418,
"learning_rate": 1.058084737031534e-08,
"loss": 0.0475,
"step": 2475
},
{
"epoch": 0.9738446411012782,
"grad_norm": 0.6650611162185669,
"learning_rate": 1.0267573005275645e-08,
"loss": 0.0297,
"step": 2476
},
{
"epoch": 0.9742379547689283,
"grad_norm": 0.6201514601707458,
"learning_rate": 9.95899671607986e-09,
"loss": 0.047,
"step": 2477
},
{
"epoch": 0.9746312684365782,
"grad_norm": 1.1360257863998413,
"learning_rate": 9.655119085005827e-09,
"loss": 0.0363,
"step": 2478
},
{
"epoch": 0.9750245821042282,
"grad_norm": 0.8666075468063354,
"learning_rate": 9.355940685464305e-09,
"loss": 0.0458,
"step": 2479
},
{
"epoch": 0.9754178957718781,
"grad_norm": 1.1366305351257324,
"learning_rate": 9.061462081999262e-09,
"loss": 0.0471,
"step": 2480
},
{
"epoch": 0.975811209439528,
"grad_norm": 0.6694433689117432,
"learning_rate": 8.771683830285649e-09,
"loss": 0.0387,
"step": 2481
},
{
"epoch": 0.976204523107178,
"grad_norm": 2.0710513591766357,
"learning_rate": 8.486606477129677e-09,
"loss": 0.075,
"step": 2482
},
{
"epoch": 0.9765978367748279,
"grad_norm": 0.9630718231201172,
"learning_rate": 8.206230560466322e-09,
"loss": 0.0431,
"step": 2483
},
{
"epoch": 0.9769911504424779,
"grad_norm": 0.9957706332206726,
"learning_rate": 7.930556609359596e-09,
"loss": 0.0398,
"step": 2484
},
{
"epoch": 0.9773844641101278,
"grad_norm": 0.8392490148544312,
"learning_rate": 7.659585144000892e-09,
"loss": 0.1203,
"step": 2485
},
{
"epoch": 0.9777777777777777,
"grad_norm": 0.763048529624939,
"learning_rate": 7.393316675707584e-09,
"loss": 0.048,
"step": 2486
},
{
"epoch": 0.9781710914454277,
"grad_norm": 0.591249942779541,
"learning_rate": 7.131751706923595e-09,
"loss": 0.0276,
"step": 2487
},
{
"epoch": 0.9785644051130776,
"grad_norm": 0.7118191719055176,
"learning_rate": 6.8748907312163325e-09,
"loss": 0.0459,
"step": 2488
},
{
"epoch": 0.9789577187807277,
"grad_norm": 1.2333048582077026,
"learning_rate": 6.622734233277528e-09,
"loss": 0.0547,
"step": 2489
},
{
"epoch": 0.9793510324483776,
"grad_norm": 1.8401693105697632,
"learning_rate": 6.375282688921569e-09,
"loss": 0.0499,
"step": 2490
},
{
"epoch": 0.9797443461160276,
"grad_norm": 0.8339464068412781,
"learning_rate": 6.132536565084945e-09,
"loss": 0.0343,
"step": 2491
},
{
"epoch": 0.9801376597836775,
"grad_norm": 0.7225338220596313,
"learning_rate": 5.894496319824306e-09,
"loss": 0.0373,
"step": 2492
},
{
"epoch": 0.9805309734513274,
"grad_norm": 0.7467345595359802,
"learning_rate": 5.661162402316733e-09,
"loss": 0.0294,
"step": 2493
},
{
"epoch": 0.9809242871189774,
"grad_norm": 0.7157261967658997,
"learning_rate": 5.432535252859472e-09,
"loss": 0.0388,
"step": 2494
},
{
"epoch": 0.9813176007866273,
"grad_norm": 1.0490740537643433,
"learning_rate": 5.208615302866593e-09,
"loss": 0.0552,
"step": 2495
},
{
"epoch": 0.9817109144542773,
"grad_norm": 0.9684942364692688,
"learning_rate": 4.989402974871216e-09,
"loss": 0.0482,
"step": 2496
},
{
"epoch": 0.9821042281219272,
"grad_norm": 0.7083243727684021,
"learning_rate": 4.774898682522455e-09,
"loss": 0.0354,
"step": 2497
},
{
"epoch": 0.9824975417895772,
"grad_norm": 0.6887216567993164,
"learning_rate": 4.565102830585699e-09,
"loss": 0.0555,
"step": 2498
},
{
"epoch": 0.9828908554572271,
"grad_norm": 0.9905696511268616,
"learning_rate": 4.360015814941498e-09,
"loss": 0.044,
"step": 2499
},
{
"epoch": 0.983284169124877,
"grad_norm": 1.4582995176315308,
"learning_rate": 4.159638022585011e-09,
"loss": 0.0555,
"step": 2500
},
{
"epoch": 0.9836774827925271,
"grad_norm": 0.8839958906173706,
"learning_rate": 3.96396983162517e-09,
"loss": 0.0322,
"step": 2501
},
{
"epoch": 0.984070796460177,
"grad_norm": 0.9634173512458801,
"learning_rate": 3.773011611284128e-09,
"loss": 0.0305,
"step": 2502
},
{
"epoch": 0.984464110127827,
"grad_norm": 0.9942337870597839,
"learning_rate": 3.586763721896147e-09,
"loss": 0.0725,
"step": 2503
},
{
"epoch": 0.9848574237954769,
"grad_norm": 0.8074241876602173,
"learning_rate": 3.4052265149070453e-09,
"loss": 0.048,
"step": 2504
},
{
"epoch": 0.9852507374631269,
"grad_norm": 1.1746639013290405,
"learning_rate": 3.2284003328744706e-09,
"loss": 0.0565,
"step": 2505
},
{
"epoch": 0.9856440511307768,
"grad_norm": 1.454350233078003,
"learning_rate": 3.056285509465684e-09,
"loss": 0.0462,
"step": 2506
},
{
"epoch": 0.9860373647984267,
"grad_norm": 1.0500266551971436,
"learning_rate": 2.888882369457835e-09,
"loss": 0.0229,
"step": 2507
},
{
"epoch": 0.9864306784660767,
"grad_norm": 0.5939337611198425,
"learning_rate": 2.726191228737407e-09,
"loss": 0.0441,
"step": 2508
},
{
"epoch": 0.9868239921337266,
"grad_norm": 0.7773805856704712,
"learning_rate": 2.5682123942993852e-09,
"loss": 0.0388,
"step": 2509
},
{
"epoch": 0.9872173058013766,
"grad_norm": 0.9417904019355774,
"learning_rate": 2.414946164246701e-09,
"loss": 0.0448,
"step": 2510
},
{
"epoch": 0.9876106194690265,
"grad_norm": 0.8849769830703735,
"learning_rate": 2.2663928277896763e-09,
"loss": 0.0482,
"step": 2511
},
{
"epoch": 0.9880039331366764,
"grad_norm": 1.0469379425048828,
"learning_rate": 2.122552665245747e-09,
"loss": 0.0479,
"step": 2512
},
{
"epoch": 0.9883972468043265,
"grad_norm": 0.4294953942298889,
"learning_rate": 1.9834259480380756e-09,
"loss": 0.017,
"step": 2513
},
{
"epoch": 0.9887905604719764,
"grad_norm": 1.0931810140609741,
"learning_rate": 1.8490129386963818e-09,
"loss": 0.0376,
"step": 2514
},
{
"epoch": 0.9891838741396264,
"grad_norm": 0.5045303702354431,
"learning_rate": 1.719313890855001e-09,
"loss": 0.0203,
"step": 2515
},
{
"epoch": 0.9895771878072763,
"grad_norm": 1.2506543397903442,
"learning_rate": 1.5943290492539953e-09,
"loss": 0.0415,
"step": 2516
},
{
"epoch": 0.9899705014749263,
"grad_norm": 0.6282764673233032,
"learning_rate": 1.4740586497366538e-09,
"loss": 0.043,
"step": 2517
},
{
"epoch": 0.9903638151425762,
"grad_norm": 1.0732625722885132,
"learning_rate": 1.358502919251159e-09,
"loss": 0.049,
"step": 2518
},
{
"epoch": 0.9907571288102262,
"grad_norm": 0.8076870441436768,
"learning_rate": 1.247662075848921e-09,
"loss": 0.0367,
"step": 2519
},
{
"epoch": 0.9911504424778761,
"grad_norm": 1.1323729753494263,
"learning_rate": 1.1415363286843007e-09,
"loss": 0.0549,
"step": 2520
},
{
"epoch": 0.991543756145526,
"grad_norm": 1.2635443210601807,
"learning_rate": 1.0401258780146084e-09,
"loss": 0.0375,
"step": 2521
},
{
"epoch": 0.991937069813176,
"grad_norm": 1.430897831916809,
"learning_rate": 9.434309151992727e-10,
"loss": 0.075,
"step": 2522
},
{
"epoch": 0.9923303834808259,
"grad_norm": 1.1660479307174683,
"learning_rate": 8.514516226998393e-10,
"loss": 0.0562,
"step": 2523
},
{
"epoch": 0.9927236971484759,
"grad_norm": 2.029007911682129,
"learning_rate": 7.641881740794166e-10,
"loss": 0.0481,
"step": 2524
},
{
"epoch": 0.9931170108161259,
"grad_norm": 0.7072765827178955,
"learning_rate": 6.816407340023978e-10,
"loss": 0.0188,
"step": 2525
},
{
"epoch": 0.9935103244837759,
"grad_norm": 0.8789957165718079,
"learning_rate": 6.03809458233906e-10,
"loss": 0.0573,
"step": 2526
},
{
"epoch": 0.9939036381514258,
"grad_norm": 0.7415314316749573,
"learning_rate": 5.306944936406266e-10,
"loss": 0.0458,
"step": 2527
},
{
"epoch": 0.9942969518190757,
"grad_norm": 0.6154326796531677,
"learning_rate": 4.622959781883096e-10,
"loss": 0.0236,
"step": 2528
},
{
"epoch": 0.9946902654867257,
"grad_norm": 0.810153067111969,
"learning_rate": 3.9861404094426734e-10,
"loss": 0.0443,
"step": 2529
},
{
"epoch": 0.9950835791543756,
"grad_norm": 0.743605375289917,
"learning_rate": 3.3964880207459916e-10,
"loss": 0.052,
"step": 2530
},
{
"epoch": 0.9954768928220256,
"grad_norm": 1.1516720056533813,
"learning_rate": 2.8540037284557897e-10,
"loss": 0.0729,
"step": 2531
},
{
"epoch": 0.9958702064896755,
"grad_norm": 1.1776301860809326,
"learning_rate": 2.358688556233779e-10,
"loss": 0.0401,
"step": 2532
},
{
"epoch": 0.9962635201573254,
"grad_norm": 1.0834025144577026,
"learning_rate": 1.9105434387239886e-10,
"loss": 0.0593,
"step": 2533
},
{
"epoch": 0.9966568338249754,
"grad_norm": 1.4529463052749634,
"learning_rate": 1.509569221569418e-10,
"loss": 0.0423,
"step": 2534
},
{
"epoch": 0.9970501474926253,
"grad_norm": 1.1381511688232422,
"learning_rate": 1.1557666614037122e-10,
"loss": 0.0411,
"step": 2535
},
{
"epoch": 0.9974434611602753,
"grad_norm": 1.113553762435913,
"learning_rate": 8.49136425840058e-11,
"loss": 0.0611,
"step": 2536
},
{
"epoch": 0.9978367748279253,
"grad_norm": 1.071913719177246,
"learning_rate": 5.896790934878383e-11,
"loss": 0.0609,
"step": 2537
},
{
"epoch": 0.9982300884955753,
"grad_norm": 1.7356159687042236,
"learning_rate": 3.7739515393320215e-11,
"loss": 0.0524,
"step": 2538
},
{
"epoch": 0.9986234021632252,
"grad_norm": 1.0763658285140991,
"learning_rate": 2.122850077584948e-11,
"loss": 0.0527,
"step": 2539
},
{
"epoch": 0.9990167158308751,
"grad_norm": 0.6793241500854492,
"learning_rate": 9.434896651727699e-12,
"loss": 0.0462,
"step": 2540
},
{
"epoch": 0.9994100294985251,
"grad_norm": 0.9101441502571106,
"learning_rate": 2.358725275652951e-12,
"loss": 0.0453,
"step": 2541
},
{
"epoch": 0.999803343166175,
"grad_norm": 1.0394845008850098,
"learning_rate": 0.0,
"loss": 0.0578,
"step": 2542
},
{
"epoch": 0.999803343166175,
"step": 2542,
"total_flos": 5.5848341785175654e+17,
"train_loss": 0.05740805761998535,
"train_runtime": 78224.1342,
"train_samples_per_second": 1.04,
"train_steps_per_second": 0.032
}
],
"logging_steps": 1.0,
"max_steps": 2542,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.5848341785175654e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}