{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004901960784313725,
"grad_norm": 9.949305642315444,
"learning_rate": 9.999851776425575e-06,
"loss": 0.5539,
"step": 1
},
{
"epoch": 0.00980392156862745,
"grad_norm": 4.075674507588934,
"learning_rate": 9.999407114490384e-06,
"loss": 0.4213,
"step": 2
},
{
"epoch": 0.014705882352941176,
"grad_norm": 4.867105580967725,
"learning_rate": 9.998666040558187e-06,
"loss": 0.4319,
"step": 3
},
{
"epoch": 0.0196078431372549,
"grad_norm": 1.8297624952470446,
"learning_rate": 9.99762859856683e-06,
"loss": 0.3531,
"step": 4
},
{
"epoch": 0.024509803921568627,
"grad_norm": 1.3398804411974734,
"learning_rate": 9.996294850025658e-06,
"loss": 0.3147,
"step": 5
},
{
"epoch": 0.029411764705882353,
"grad_norm": 0.9826229003116685,
"learning_rate": 9.994664874011864e-06,
"loss": 0.3227,
"step": 6
},
{
"epoch": 0.03431372549019608,
"grad_norm": 0.7586586122465202,
"learning_rate": 9.992738767165791e-06,
"loss": 0.319,
"step": 7
},
{
"epoch": 0.0392156862745098,
"grad_norm": 0.882703555100207,
"learning_rate": 9.990516643685222e-06,
"loss": 0.2825,
"step": 8
},
{
"epoch": 0.04411764705882353,
"grad_norm": 0.8996418859466875,
"learning_rate": 9.987998635318586e-06,
"loss": 0.3206,
"step": 9
},
{
"epoch": 0.049019607843137254,
"grad_norm": 0.6941482280225552,
"learning_rate": 9.985184891357165e-06,
"loss": 0.3222,
"step": 10
},
{
"epoch": 0.05392156862745098,
"grad_norm": 0.6330852150187295,
"learning_rate": 9.982075578626235e-06,
"loss": 0.3091,
"step": 11
},
{
"epoch": 0.058823529411764705,
"grad_norm": 0.6492344466203871,
"learning_rate": 9.978670881475173e-06,
"loss": 0.2981,
"step": 12
},
{
"epoch": 0.06372549019607843,
"grad_norm": 0.6749300990695969,
"learning_rate": 9.974971001766534e-06,
"loss": 0.2855,
"step": 13
},
{
"epoch": 0.06862745098039216,
"grad_norm": 0.6104328540607359,
"learning_rate": 9.970976158864074e-06,
"loss": 0.2946,
"step": 14
},
{
"epoch": 0.07352941176470588,
"grad_norm": 0.5474488618726454,
"learning_rate": 9.96668658961975e-06,
"loss": 0.2877,
"step": 15
},
{
"epoch": 0.0784313725490196,
"grad_norm": 0.5672503974973263,
"learning_rate": 9.96210254835968e-06,
"loss": 0.2744,
"step": 16
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.5351355676937355,
"learning_rate": 9.957224306869053e-06,
"loss": 0.2837,
"step": 17
},
{
"epoch": 0.08823529411764706,
"grad_norm": 0.5420649578895781,
"learning_rate": 9.952052154376027e-06,
"loss": 0.2851,
"step": 18
},
{
"epoch": 0.09313725490196079,
"grad_norm": 0.616265084827749,
"learning_rate": 9.946586397534572e-06,
"loss": 0.3106,
"step": 19
},
{
"epoch": 0.09803921568627451,
"grad_norm": 0.5357910945074501,
"learning_rate": 9.940827360406297e-06,
"loss": 0.2866,
"step": 20
},
{
"epoch": 0.10294117647058823,
"grad_norm": 0.5436018725783519,
"learning_rate": 9.93477538444123e-06,
"loss": 0.3129,
"step": 21
},
{
"epoch": 0.10784313725490197,
"grad_norm": 0.5352962246240812,
"learning_rate": 9.92843082845757e-06,
"loss": 0.2911,
"step": 22
},
{
"epoch": 0.11274509803921569,
"grad_norm": 0.512996507276675,
"learning_rate": 9.92179406862043e-06,
"loss": 0.2761,
"step": 23
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.5316783600397025,
"learning_rate": 9.91486549841951e-06,
"loss": 0.2975,
"step": 24
},
{
"epoch": 0.12254901960784313,
"grad_norm": 0.4789505118842049,
"learning_rate": 9.907645528645791e-06,
"loss": 0.2613,
"step": 25
},
{
"epoch": 0.12745098039215685,
"grad_norm": 0.5113514733732404,
"learning_rate": 9.90013458736716e-06,
"loss": 0.2662,
"step": 26
},
{
"epoch": 0.1323529411764706,
"grad_norm": 0.504304050146625,
"learning_rate": 9.892333119903045e-06,
"loss": 0.3001,
"step": 27
},
{
"epoch": 0.13725490196078433,
"grad_norm": 0.5181939763522874,
"learning_rate": 9.884241588798004e-06,
"loss": 0.2928,
"step": 28
},
{
"epoch": 0.14215686274509803,
"grad_norm": 0.5407134908885123,
"learning_rate": 9.875860473794302e-06,
"loss": 0.2743,
"step": 29
},
{
"epoch": 0.14705882352941177,
"grad_norm": 0.4918389865538274,
"learning_rate": 9.867190271803466e-06,
"loss": 0.2781,
"step": 30
},
{
"epoch": 0.15196078431372548,
"grad_norm": 0.5040301538424998,
"learning_rate": 9.85823149687683e-06,
"loss": 0.2765,
"step": 31
},
{
"epoch": 0.1568627450980392,
"grad_norm": 0.5229000429022846,
"learning_rate": 9.848984680175049e-06,
"loss": 0.2814,
"step": 32
},
{
"epoch": 0.16176470588235295,
"grad_norm": 0.5184957323080689,
"learning_rate": 9.839450369936615e-06,
"loss": 0.2855,
"step": 33
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.5069951437454678,
"learning_rate": 9.829629131445342e-06,
"loss": 0.2936,
"step": 34
},
{
"epoch": 0.1715686274509804,
"grad_norm": 0.5036566946779656,
"learning_rate": 9.819521546996864e-06,
"loss": 0.2739,
"step": 35
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.4895005980353086,
"learning_rate": 9.809128215864096e-06,
"loss": 0.2864,
"step": 36
},
{
"epoch": 0.18137254901960784,
"grad_norm": 0.5051322582648758,
"learning_rate": 9.798449754261716e-06,
"loss": 0.2543,
"step": 37
},
{
"epoch": 0.18627450980392157,
"grad_norm": 0.5606615218448902,
"learning_rate": 9.787486795309621e-06,
"loss": 0.3097,
"step": 38
},
{
"epoch": 0.19117647058823528,
"grad_norm": 0.5113656698846124,
"learning_rate": 9.776239988995401e-06,
"loss": 0.2696,
"step": 39
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.4991653304275424,
"learning_rate": 9.764710002135784e-06,
"loss": 0.2942,
"step": 40
},
{
"epoch": 0.20098039215686275,
"grad_norm": 0.4757517371235649,
"learning_rate": 9.752897518337117e-06,
"loss": 0.2715,
"step": 41
},
{
"epoch": 0.20588235294117646,
"grad_norm": 0.4969728084578434,
"learning_rate": 9.74080323795483e-06,
"loss": 0.2911,
"step": 42
},
{
"epoch": 0.2107843137254902,
"grad_norm": 0.48508387550906595,
"learning_rate": 9.72842787805191e-06,
"loss": 0.2797,
"step": 43
},
{
"epoch": 0.21568627450980393,
"grad_norm": 0.500256000950693,
"learning_rate": 9.715772172356388e-06,
"loss": 0.2648,
"step": 44
},
{
"epoch": 0.22058823529411764,
"grad_norm": 0.4842669080621426,
"learning_rate": 9.702836871217838e-06,
"loss": 0.2778,
"step": 45
},
{
"epoch": 0.22549019607843138,
"grad_norm": 0.4747803990331153,
"learning_rate": 9.689622741562891e-06,
"loss": 0.2607,
"step": 46
},
{
"epoch": 0.23039215686274508,
"grad_norm": 0.5010061541098526,
"learning_rate": 9.676130566849757e-06,
"loss": 0.2984,
"step": 47
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.47171751321285477,
"learning_rate": 9.66236114702178e-06,
"loss": 0.2614,
"step": 48
},
{
"epoch": 0.24019607843137256,
"grad_norm": 0.4486004421177675,
"learning_rate": 9.64831529846001e-06,
"loss": 0.251,
"step": 49
},
{
"epoch": 0.24509803921568626,
"grad_norm": 0.4792184131399122,
"learning_rate": 9.633993853934803e-06,
"loss": 0.2763,
"step": 50
},
{
"epoch": 0.25,
"grad_norm": 0.5286196103587543,
"learning_rate": 9.619397662556434e-06,
"loss": 0.2733,
"step": 51
},
{
"epoch": 0.2549019607843137,
"grad_norm": 0.4628990001171691,
"learning_rate": 9.60452758972477e-06,
"loss": 0.2529,
"step": 52
},
{
"epoch": 0.25980392156862747,
"grad_norm": 0.4546346553824907,
"learning_rate": 9.589384517077945e-06,
"loss": 0.2575,
"step": 53
},
{
"epoch": 0.2647058823529412,
"grad_norm": 0.4644836879571118,
"learning_rate": 9.573969342440107e-06,
"loss": 0.2659,
"step": 54
},
{
"epoch": 0.2696078431372549,
"grad_norm": 0.4564926053780548,
"learning_rate": 9.558282979768164e-06,
"loss": 0.2806,
"step": 55
},
{
"epoch": 0.27450980392156865,
"grad_norm": 0.47250787332790883,
"learning_rate": 9.542326359097619e-06,
"loss": 0.2783,
"step": 56
},
{
"epoch": 0.27941176470588236,
"grad_norm": 0.4925803264813645,
"learning_rate": 9.52610042648741e-06,
"loss": 0.2738,
"step": 57
},
{
"epoch": 0.28431372549019607,
"grad_norm": 0.5764917572941164,
"learning_rate": 9.509606143963832e-06,
"loss": 0.2946,
"step": 58
},
{
"epoch": 0.28921568627450983,
"grad_norm": 0.5027549400707922,
"learning_rate": 9.492844489463486e-06,
"loss": 0.2705,
"step": 59
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.4948958040547992,
"learning_rate": 9.475816456775313e-06,
"loss": 0.279,
"step": 60
},
{
"epoch": 0.29901960784313725,
"grad_norm": 0.5249720085150932,
"learning_rate": 9.458523055481658e-06,
"loss": 0.2775,
"step": 61
},
{
"epoch": 0.30392156862745096,
"grad_norm": 0.5160221747233424,
"learning_rate": 9.440965310898425e-06,
"loss": 0.2749,
"step": 62
},
{
"epoch": 0.3088235294117647,
"grad_norm": 0.5043522735174615,
"learning_rate": 9.423144264014278e-06,
"loss": 0.2904,
"step": 63
},
{
"epoch": 0.3137254901960784,
"grad_norm": 0.5058021983501817,
"learning_rate": 9.405060971428924e-06,
"loss": 0.2873,
"step": 64
},
{
"epoch": 0.31862745098039214,
"grad_norm": 0.4769659728548327,
"learning_rate": 9.386716505290467e-06,
"loss": 0.2746,
"step": 65
},
{
"epoch": 0.3235294117647059,
"grad_norm": 0.4805442594216887,
"learning_rate": 9.368111953231849e-06,
"loss": 0.274,
"step": 66
},
{
"epoch": 0.3284313725490196,
"grad_norm": 0.483159811379141,
"learning_rate": 9.349248418306347e-06,
"loss": 0.2688,
"step": 67
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.5011176419560011,
"learning_rate": 9.330127018922195e-06,
"loss": 0.3139,
"step": 68
},
{
"epoch": 0.3382352941176471,
"grad_norm": 0.5310341725773848,
"learning_rate": 9.310748888776254e-06,
"loss": 0.303,
"step": 69
},
{
"epoch": 0.3431372549019608,
"grad_norm": 0.47577451907323093,
"learning_rate": 9.291115176786814e-06,
"loss": 0.2696,
"step": 70
},
{
"epoch": 0.3480392156862745,
"grad_norm": 0.45184435487313246,
"learning_rate": 9.271227047025462e-06,
"loss": 0.2827,
"step": 71
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.48943856307135464,
"learning_rate": 9.251085678648072e-06,
"loss": 0.2695,
"step": 72
},
{
"epoch": 0.35784313725490197,
"grad_norm": 0.4721139530533066,
"learning_rate": 9.230692265824888e-06,
"loss": 0.2555,
"step": 73
},
{
"epoch": 0.3627450980392157,
"grad_norm": 0.47202502734236734,
"learning_rate": 9.210048017669727e-06,
"loss": 0.2926,
"step": 74
},
{
"epoch": 0.36764705882352944,
"grad_norm": 0.49767797359868676,
"learning_rate": 9.189154158168293e-06,
"loss": 0.2913,
"step": 75
},
{
"epoch": 0.37254901960784315,
"grad_norm": 0.5025843797659678,
"learning_rate": 9.168011926105598e-06,
"loss": 0.2666,
"step": 76
},
{
"epoch": 0.37745098039215685,
"grad_norm": 0.5022682553518323,
"learning_rate": 9.146622574992528e-06,
"loss": 0.2806,
"step": 77
},
{
"epoch": 0.38235294117647056,
"grad_norm": 0.45419943811853347,
"learning_rate": 9.124987372991512e-06,
"loss": 0.2907,
"step": 78
},
{
"epoch": 0.3872549019607843,
"grad_norm": 0.4595804279730611,
"learning_rate": 9.103107602841341e-06,
"loss": 0.259,
"step": 79
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.471535607395236,
"learning_rate": 9.08098456178111e-06,
"loss": 0.2728,
"step": 80
},
{
"epoch": 0.39705882352941174,
"grad_norm": 0.5184899058796497,
"learning_rate": 9.058619561473308e-06,
"loss": 0.2677,
"step": 81
},
{
"epoch": 0.4019607843137255,
"grad_norm": 0.47509634282362856,
"learning_rate": 9.036013927926049e-06,
"loss": 0.2742,
"step": 82
},
{
"epoch": 0.4068627450980392,
"grad_norm": 0.47503959889700315,
"learning_rate": 9.013169001414458e-06,
"loss": 0.2778,
"step": 83
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.47489487818225196,
"learning_rate": 8.990086136401199e-06,
"loss": 0.2498,
"step": 84
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.46801056645527583,
"learning_rate": 8.966766701456177e-06,
"loss": 0.2641,
"step": 85
},
{
"epoch": 0.4215686274509804,
"grad_norm": 0.5112905724249457,
"learning_rate": 8.943212079175392e-06,
"loss": 0.2771,
"step": 86
},
{
"epoch": 0.4264705882352941,
"grad_norm": 0.4447440095916987,
"learning_rate": 8.91942366609897e-06,
"loss": 0.2701,
"step": 87
},
{
"epoch": 0.43137254901960786,
"grad_norm": 0.4734853505195237,
"learning_rate": 8.895402872628352e-06,
"loss": 0.2919,
"step": 88
},
{
"epoch": 0.4362745098039216,
"grad_norm": 0.45704855765327873,
"learning_rate": 8.871151122942692e-06,
"loss": 0.2606,
"step": 89
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.4701885669821727,
"learning_rate": 8.846669854914395e-06,
"loss": 0.2768,
"step": 90
},
{
"epoch": 0.44607843137254904,
"grad_norm": 0.4829583138980323,
"learning_rate": 8.821960520023884e-06,
"loss": 0.2796,
"step": 91
},
{
"epoch": 0.45098039215686275,
"grad_norm": 0.487539635758669,
"learning_rate": 8.797024583273536e-06,
"loss": 0.2758,
"step": 92
},
{
"epoch": 0.45588235294117646,
"grad_norm": 0.47395890303919547,
"learning_rate": 8.771863523100821e-06,
"loss": 0.2739,
"step": 93
},
{
"epoch": 0.46078431372549017,
"grad_norm": 0.5050007846153621,
"learning_rate": 8.746478831290648e-06,
"loss": 0.2993,
"step": 94
},
{
"epoch": 0.46568627450980393,
"grad_norm": 0.5224607026899196,
"learning_rate": 8.720872012886918e-06,
"loss": 0.2863,
"step": 95
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.47538250913835894,
"learning_rate": 8.695044586103297e-06,
"loss": 0.2912,
"step": 96
},
{
"epoch": 0.47549019607843135,
"grad_norm": 0.4548594126958497,
"learning_rate": 8.668998082233186e-06,
"loss": 0.2557,
"step": 97
},
{
"epoch": 0.4803921568627451,
"grad_norm": 0.4817593707119681,
"learning_rate": 8.642734045558952e-06,
"loss": 0.2559,
"step": 98
},
{
"epoch": 0.4852941176470588,
"grad_norm": 0.49702278125831084,
"learning_rate": 8.616254033260351e-06,
"loss": 0.2797,
"step": 99
},
{
"epoch": 0.49019607843137253,
"grad_norm": 0.46937954625053363,
"learning_rate": 8.58955961532221e-06,
"loss": 0.2543,
"step": 100
},
{
"epoch": 0.4950980392156863,
"grad_norm": 0.5097981268395455,
"learning_rate": 8.56265237444135e-06,
"loss": 0.2805,
"step": 101
},
{
"epoch": 0.5,
"grad_norm": 0.5147622119362961,
"learning_rate": 8.535533905932739e-06,
"loss": 0.2669,
"step": 102
},
{
"epoch": 0.5049019607843137,
"grad_norm": 0.48075492916551527,
"learning_rate": 8.508205817634908e-06,
"loss": 0.2701,
"step": 103
},
{
"epoch": 0.5098039215686274,
"grad_norm": 0.46764850077800707,
"learning_rate": 8.480669729814635e-06,
"loss": 0.2777,
"step": 104
},
{
"epoch": 0.5147058823529411,
"grad_norm": 0.4905586064810874,
"learning_rate": 8.452927275070858e-06,
"loss": 0.2752,
"step": 105
},
{
"epoch": 0.5196078431372549,
"grad_norm": 0.5164789347154716,
"learning_rate": 8.424980098237904e-06,
"loss": 0.2929,
"step": 106
},
{
"epoch": 0.5245098039215687,
"grad_norm": 0.47222892287405277,
"learning_rate": 8.39682985628795e-06,
"loss": 0.2825,
"step": 107
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.48296479038244106,
"learning_rate": 8.368478218232787e-06,
"loss": 0.2696,
"step": 108
},
{
"epoch": 0.5343137254901961,
"grad_norm": 0.5456904576064092,
"learning_rate": 8.339926865024871e-06,
"loss": 0.2625,
"step": 109
},
{
"epoch": 0.5392156862745098,
"grad_norm": 0.46572429076517435,
"learning_rate": 8.311177489457653e-06,
"loss": 0.2682,
"step": 110
},
{
"epoch": 0.5441176470588235,
"grad_norm": 0.4845901394041396,
"learning_rate": 8.282231796065215e-06,
"loss": 0.2661,
"step": 111
},
{
"epoch": 0.5490196078431373,
"grad_norm": 0.5338366403432033,
"learning_rate": 8.25309150102121e-06,
"loss": 0.3103,
"step": 112
},
{
"epoch": 0.553921568627451,
"grad_norm": 0.49130182412984563,
"learning_rate": 8.223758332037121e-06,
"loss": 0.2715,
"step": 113
},
{
"epoch": 0.5588235294117647,
"grad_norm": 0.4910434085279551,
"learning_rate": 8.194234028259806e-06,
"loss": 0.2951,
"step": 114
},
{
"epoch": 0.5637254901960784,
"grad_norm": 0.4837447278276799,
"learning_rate": 8.164520340168404e-06,
"loss": 0.2889,
"step": 115
},
{
"epoch": 0.5686274509803921,
"grad_norm": 0.5235056748140671,
"learning_rate": 8.134619029470535e-06,
"loss": 0.3195,
"step": 116
},
{
"epoch": 0.5735294117647058,
"grad_norm": 0.5393547644876617,
"learning_rate": 8.104531868997858e-06,
"loss": 0.2635,
"step": 117
},
{
"epoch": 0.5784313725490197,
"grad_norm": 0.46319401757550593,
"learning_rate": 8.074260642600963e-06,
"loss": 0.2735,
"step": 118
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.5011014808072246,
"learning_rate": 8.043807145043604e-06,
"loss": 0.2834,
"step": 119
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.4885650619750526,
"learning_rate": 8.013173181896283e-06,
"loss": 0.28,
"step": 120
},
{
"epoch": 0.5931372549019608,
"grad_norm": 0.48818735448671163,
"learning_rate": 7.982360569429206e-06,
"loss": 0.2832,
"step": 121
},
{
"epoch": 0.5980392156862745,
"grad_norm": 0.47782983621824704,
"learning_rate": 7.951371134504599e-06,
"loss": 0.2655,
"step": 122
},
{
"epoch": 0.6029411764705882,
"grad_norm": 0.5287209528766823,
"learning_rate": 7.920206714468383e-06,
"loss": 0.2852,
"step": 123
},
{
"epoch": 0.6078431372549019,
"grad_norm": 0.5115537708833304,
"learning_rate": 7.888869157041257e-06,
"loss": 0.2838,
"step": 124
},
{
"epoch": 0.6127450980392157,
"grad_norm": 0.5014284065264025,
"learning_rate": 7.857360320209126e-06,
"loss": 0.2716,
"step": 125
},
{
"epoch": 0.6176470588235294,
"grad_norm": 0.4651603608252272,
"learning_rate": 7.82568207211296e-06,
"loss": 0.2782,
"step": 126
},
{
"epoch": 0.6225490196078431,
"grad_norm": 0.5043652198730769,
"learning_rate": 7.793836290938026e-06,
"loss": 0.2714,
"step": 127
},
{
"epoch": 0.6274509803921569,
"grad_norm": 0.5581975764740609,
"learning_rate": 7.76182486480253e-06,
"loss": 0.2936,
"step": 128
},
{
"epoch": 0.6323529411764706,
"grad_norm": 0.49460513690934,
"learning_rate": 7.729649691645673e-06,
"loss": 0.2514,
"step": 129
},
{
"epoch": 0.6372549019607843,
"grad_norm": 0.49477864005095495,
"learning_rate": 7.697312679115126e-06,
"loss": 0.3039,
"step": 130
},
{
"epoch": 0.6421568627450981,
"grad_norm": 0.43459499281810504,
"learning_rate": 7.664815744453918e-06,
"loss": 0.258,
"step": 131
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.4662127766253258,
"learning_rate": 7.63216081438678e-06,
"loss": 0.2682,
"step": 132
},
{
"epoch": 0.6519607843137255,
"grad_norm": 0.4461887881123459,
"learning_rate": 7.599349825005892e-06,
"loss": 0.2517,
"step": 133
},
{
"epoch": 0.6568627450980392,
"grad_norm": 0.48712380156922164,
"learning_rate": 7.566384721656103e-06,
"loss": 0.2741,
"step": 134
},
{
"epoch": 0.6617647058823529,
"grad_norm": 0.4954211848354499,
"learning_rate": 7.533267458819597e-06,
"loss": 0.2772,
"step": 135
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.4882543467157239,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2693,
"step": 136
},
{
"epoch": 0.6715686274509803,
"grad_norm": 0.533534907534477,
"learning_rate": 7.466584317605978e-06,
"loss": 0.2682,
"step": 137
},
{
"epoch": 0.6764705882352942,
"grad_norm": 0.47618239404048246,
"learning_rate": 7.4330223928342814e-06,
"loss": 0.2793,
"step": 138
},
{
"epoch": 0.6813725490196079,
"grad_norm": 0.45267230623426713,
"learning_rate": 7.399316215552296e-06,
"loss": 0.2676,
"step": 139
},
{
"epoch": 0.6862745098039216,
"grad_norm": 0.484264347953501,
"learning_rate": 7.365467784180051e-06,
"loss": 0.2762,
"step": 140
},
{
"epoch": 0.6911764705882353,
"grad_norm": 0.4811924950195945,
"learning_rate": 7.33147910557174e-06,
"loss": 0.256,
"step": 141
},
{
"epoch": 0.696078431372549,
"grad_norm": 0.4713139634213423,
"learning_rate": 7.297352194896738e-06,
"loss": 0.259,
"step": 142
},
{
"epoch": 0.7009803921568627,
"grad_norm": 0.4650406278525342,
"learning_rate": 7.26308907552012e-06,
"loss": 0.2589,
"step": 143
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.508611340986804,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.2846,
"step": 144
},
{
"epoch": 0.7107843137254902,
"grad_norm": 0.49931442983079793,
"learning_rate": 7.194162344380561e-06,
"loss": 0.2847,
"step": 145
},
{
"epoch": 0.7156862745098039,
"grad_norm": 0.4402469704937451,
"learning_rate": 7.159502819244206e-06,
"loss": 0.2513,
"step": 146
},
{
"epoch": 0.7205882352941176,
"grad_norm": 0.4338024772855486,
"learning_rate": 7.124715258417111e-06,
"loss": 0.2546,
"step": 147
},
{
"epoch": 0.7254901960784313,
"grad_norm": 0.4928083208518911,
"learning_rate": 7.089801724433918e-06,
"loss": 0.2712,
"step": 148
},
{
"epoch": 0.7303921568627451,
"grad_norm": 0.4477835861527179,
"learning_rate": 7.05476428729815e-06,
"loss": 0.262,
"step": 149
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.4874852700808163,
"learning_rate": 7.019605024359475e-06,
"loss": 0.2622,
"step": 150
},
{
"epoch": 0.7401960784313726,
"grad_norm": 0.4555898200997557,
"learning_rate": 6.984326020190544e-06,
"loss": 0.2616,
"step": 151
},
{
"epoch": 0.7450980392156863,
"grad_norm": 0.4950149488353659,
"learning_rate": 6.948929366463397e-06,
"loss": 0.3077,
"step": 152
},
{
"epoch": 0.75,
"grad_norm": 0.5366809202495563,
"learning_rate": 6.913417161825449e-06,
"loss": 0.2834,
"step": 153
},
{
"epoch": 0.7549019607843137,
"grad_norm": 0.48159848397294547,
"learning_rate": 6.877791511775064e-06,
"loss": 0.2547,
"step": 154
},
{
"epoch": 0.7598039215686274,
"grad_norm": 0.46460905629471105,
"learning_rate": 6.842054528536717e-06,
"loss": 0.2616,
"step": 155
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.4733666361824114,
"learning_rate": 6.806208330935766e-06,
"loss": 0.2529,
"step": 156
},
{
"epoch": 0.7696078431372549,
"grad_norm": 0.46805105182447543,
"learning_rate": 6.770255044272826e-06,
"loss": 0.2678,
"step": 157
},
{
"epoch": 0.7745098039215687,
"grad_norm": 0.47075639159563154,
"learning_rate": 6.734196800197763e-06,
"loss": 0.2554,
"step": 158
},
{
"epoch": 0.7794117647058824,
"grad_norm": 0.47216348647984946,
"learning_rate": 6.698035736583307e-06,
"loss": 0.2691,
"step": 159
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.4776754820294553,
"learning_rate": 6.6617739973982985e-06,
"loss": 0.2893,
"step": 160
},
{
"epoch": 0.7892156862745098,
"grad_norm": 0.5056737376559055,
"learning_rate": 6.625413732580577e-06,
"loss": 0.3119,
"step": 161
},
{
"epoch": 0.7941176470588235,
"grad_norm": 0.5001505320557236,
"learning_rate": 6.588957097909509e-06,
"loss": 0.2534,
"step": 162
},
{
"epoch": 0.7990196078431373,
"grad_norm": 0.4672061035111928,
"learning_rate": 6.552406254878175e-06,
"loss": 0.2656,
"step": 163
},
{
"epoch": 0.803921568627451,
"grad_norm": 0.4554461122885562,
"learning_rate": 6.515763370565218e-06,
"loss": 0.261,
"step": 164
},
{
"epoch": 0.8088235294117647,
"grad_norm": 0.4926507470830697,
"learning_rate": 6.4790306175063535e-06,
"loss": 0.2597,
"step": 165
},
{
"epoch": 0.8137254901960784,
"grad_norm": 0.4943296156206765,
"learning_rate": 6.442210173565562e-06,
"loss": 0.279,
"step": 166
},
{
"epoch": 0.8186274509803921,
"grad_norm": 0.4728785530345938,
"learning_rate": 6.405304221805972e-06,
"loss": 0.271,
"step": 167
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.5132288172128124,
"learning_rate": 6.368314950360416e-06,
"loss": 0.2778,
"step": 168
},
{
"epoch": 0.8284313725490197,
"grad_norm": 0.4840027829550652,
"learning_rate": 6.331244552301705e-06,
"loss": 0.28,
"step": 169
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.4753576271704974,
"learning_rate": 6.294095225512604e-06,
"loss": 0.2491,
"step": 170
},
{
"epoch": 0.8382352941176471,
"grad_norm": 0.46179398942588656,
"learning_rate": 6.2568691725555144e-06,
"loss": 0.2736,
"step": 171
},
{
"epoch": 0.8431372549019608,
"grad_norm": 0.4552053511887567,
"learning_rate": 6.219568600541886e-06,
"loss": 0.255,
"step": 172
},
{
"epoch": 0.8480392156862745,
"grad_norm": 0.461827774670562,
"learning_rate": 6.182195721001366e-06,
"loss": 0.2699,
"step": 173
},
{
"epoch": 0.8529411764705882,
"grad_norm": 0.45609580191383764,
"learning_rate": 6.144752749750671e-06,
"loss": 0.2618,
"step": 174
},
{
"epoch": 0.8578431372549019,
"grad_norm": 0.4977220589396911,
"learning_rate": 6.107241906762214e-06,
"loss": 0.2587,
"step": 175
},
{
"epoch": 0.8627450980392157,
"grad_norm": 0.5124222190867876,
"learning_rate": 6.0696654160324875e-06,
"loss": 0.3044,
"step": 176
},
{
"epoch": 0.8676470588235294,
"grad_norm": 0.544844105660069,
"learning_rate": 6.0320255054501985e-06,
"loss": 0.2893,
"step": 177
},
{
"epoch": 0.8725490196078431,
"grad_norm": 0.47482041593280866,
"learning_rate": 5.994324406664184e-06,
"loss": 0.2524,
"step": 178
},
{
"epoch": 0.8774509803921569,
"grad_norm": 0.5065344279012253,
"learning_rate": 5.956564354951091e-06,
"loss": 0.2656,
"step": 179
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.490249393727454,
"learning_rate": 5.918747589082853e-06,
"loss": 0.2809,
"step": 180
},
{
"epoch": 0.8872549019607843,
"grad_norm": 0.48483179337750465,
"learning_rate": 5.880876351193956e-06,
"loss": 0.2784,
"step": 181
},
{
"epoch": 0.8921568627450981,
"grad_norm": 0.4468073287249408,
"learning_rate": 5.842952886648496e-06,
"loss": 0.2478,
"step": 182
},
{
"epoch": 0.8970588235294118,
"grad_norm": 0.5394634417319949,
"learning_rate": 5.804979443907065e-06,
"loss": 0.2715,
"step": 183
},
{
"epoch": 0.9019607843137255,
"grad_norm": 0.48798110547637624,
"learning_rate": 5.766958274393428e-06,
"loss": 0.2733,
"step": 184
},
{
"epoch": 0.9068627450980392,
"grad_norm": 0.49133168083308504,
"learning_rate": 5.728891632361043e-06,
"loss": 0.256,
"step": 185
},
{
"epoch": 0.9117647058823529,
"grad_norm": 0.48254056420422375,
"learning_rate": 5.690781774759412e-06,
"loss": 0.281,
"step": 186
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.5159826456616268,
"learning_rate": 5.65263096110026e-06,
"loss": 0.3025,
"step": 187
},
{
"epoch": 0.9215686274509803,
"grad_norm": 0.4998585500181035,
"learning_rate": 5.614441453323571e-06,
"loss": 0.2641,
"step": 188
},
{
"epoch": 0.9264705882352942,
"grad_norm": 0.47408281336995545,
"learning_rate": 5.576215515663489e-06,
"loss": 0.2672,
"step": 189
},
{
"epoch": 0.9313725490196079,
"grad_norm": 0.5213843608163483,
"learning_rate": 5.537955414514058e-06,
"loss": 0.287,
"step": 190
},
{
"epoch": 0.9362745098039216,
"grad_norm": 0.5225831835576515,
"learning_rate": 5.499663418294858e-06,
"loss": 0.2719,
"step": 191
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.4879552875250876,
"learning_rate": 5.46134179731651e-06,
"loss": 0.2808,
"step": 192
},
{
"epoch": 0.946078431372549,
"grad_norm": 0.4429756422592165,
"learning_rate": 5.4229928236460705e-06,
"loss": 0.2511,
"step": 193
},
{
"epoch": 0.9509803921568627,
"grad_norm": 0.4838680164008833,
"learning_rate": 5.3846187709723195e-06,
"loss": 0.2729,
"step": 194
},
{
"epoch": 0.9558823529411765,
"grad_norm": 0.45487335735058904,
"learning_rate": 5.346221914470959e-06,
"loss": 0.2639,
"step": 195
},
{
"epoch": 0.9607843137254902,
"grad_norm": 0.47930922167465867,
"learning_rate": 5.3078045306697154e-06,
"loss": 0.2721,
"step": 196
},
{
"epoch": 0.9656862745098039,
"grad_norm": 0.48206365406271,
"learning_rate": 5.2693688973133675e-06,
"loss": 0.2844,
"step": 197
},
{
"epoch": 0.9705882352941176,
"grad_norm": 0.4426568449367021,
"learning_rate": 5.230917293228699e-06,
"loss": 0.2472,
"step": 198
},
{
"epoch": 0.9754901960784313,
"grad_norm": 0.45364504567570346,
"learning_rate": 5.192451998189392e-06,
"loss": 0.2734,
"step": 199
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.4757644973640628,
"learning_rate": 5.153975292780852e-06,
"loss": 0.2634,
"step": 200
},
{
"epoch": 0.9803921568627451,
"eval_loss": 0.2901236116886139,
"eval_runtime": 4.5192,
"eval_samples_per_second": 14.604,
"eval_steps_per_second": 3.762,
"step": 200
},
{
"epoch": 0.9852941176470589,
"grad_norm": 0.4743962403323634,
"learning_rate": 5.115489458265006e-06,
"loss": 0.2708,
"step": 201
},
{
"epoch": 0.9901960784313726,
"grad_norm": 0.4984282092923338,
"learning_rate": 5.0769967764450345e-06,
"loss": 0.2507,
"step": 202
},
{
"epoch": 0.9950980392156863,
"grad_norm": 0.4605973534637747,
"learning_rate": 5.038499529530094e-06,
"loss": 0.2604,
"step": 203
},
{
"epoch": 1.0,
"grad_norm": 0.49516075604073534,
"learning_rate": 5e-06,
"loss": 0.2695,
"step": 204
},
{
"epoch": 1.0049019607843137,
"grad_norm": 0.5082618843355043,
"learning_rate": 4.961500470469908e-06,
"loss": 0.2255,
"step": 205
},
{
"epoch": 1.0098039215686274,
"grad_norm": 0.5088751533658034,
"learning_rate": 4.923003223554967e-06,
"loss": 0.2447,
"step": 206
},
{
"epoch": 1.0147058823529411,
"grad_norm": 0.49220028140675365,
"learning_rate": 4.8845105417349955e-06,
"loss": 0.2312,
"step": 207
},
{
"epoch": 1.0196078431372548,
"grad_norm": 0.4708853500743636,
"learning_rate": 4.846024707219149e-06,
"loss": 0.2198,
"step": 208
},
{
"epoch": 1.0245098039215685,
"grad_norm": 0.5298187958047372,
"learning_rate": 4.807548001810611e-06,
"loss": 0.2366,
"step": 209
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.4526141045695304,
"learning_rate": 4.7690827067713035e-06,
"loss": 0.2137,
"step": 210
},
{
"epoch": 1.0343137254901962,
"grad_norm": 0.4587936745039723,
"learning_rate": 4.730631102686635e-06,
"loss": 0.2379,
"step": 211
},
{
"epoch": 1.0392156862745099,
"grad_norm": 0.47490380719342673,
"learning_rate": 4.692195469330286e-06,
"loss": 0.2166,
"step": 212
},
{
"epoch": 1.0441176470588236,
"grad_norm": 0.4543116276180149,
"learning_rate": 4.653778085529043e-06,
"loss": 0.2269,
"step": 213
},
{
"epoch": 1.0490196078431373,
"grad_norm": 0.4789578356476451,
"learning_rate": 4.615381229027681e-06,
"loss": 0.208,
"step": 214
},
{
"epoch": 1.053921568627451,
"grad_norm": 0.5295078000919256,
"learning_rate": 4.577007176353931e-06,
"loss": 0.2184,
"step": 215
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.5848689976910358,
"learning_rate": 4.53865820268349e-06,
"loss": 0.2275,
"step": 216
},
{
"epoch": 1.0637254901960784,
"grad_norm": 0.4738691598319379,
"learning_rate": 4.5003365817051434e-06,
"loss": 0.2307,
"step": 217
},
{
"epoch": 1.0686274509803921,
"grad_norm": 0.43950765165268024,
"learning_rate": 4.462044585485944e-06,
"loss": 0.2165,
"step": 218
},
{
"epoch": 1.0735294117647058,
"grad_norm": 0.49593785245882227,
"learning_rate": 4.4237844843365126e-06,
"loss": 0.2462,
"step": 219
},
{
"epoch": 1.0784313725490196,
"grad_norm": 0.5332165363629019,
"learning_rate": 4.3855585466764305e-06,
"loss": 0.2273,
"step": 220
},
{
"epoch": 1.0833333333333333,
"grad_norm": 0.4850004626701561,
"learning_rate": 4.347369038899744e-06,
"loss": 0.2092,
"step": 221
},
{
"epoch": 1.088235294117647,
"grad_norm": 0.5203678675930979,
"learning_rate": 4.309218225240591e-06,
"loss": 0.232,
"step": 222
},
{
"epoch": 1.093137254901961,
"grad_norm": 0.5490724478431357,
"learning_rate": 4.271108367638959e-06,
"loss": 0.2261,
"step": 223
},
{
"epoch": 1.0980392156862746,
"grad_norm": 0.4752955178586657,
"learning_rate": 4.233041725606573e-06,
"loss": 0.2103,
"step": 224
},
{
"epoch": 1.1029411764705883,
"grad_norm": 0.5271708488599877,
"learning_rate": 4.195020556092935e-06,
"loss": 0.2599,
"step": 225
},
{
"epoch": 1.107843137254902,
"grad_norm": 0.48431472139133885,
"learning_rate": 4.157047113351504e-06,
"loss": 0.2081,
"step": 226
},
{
"epoch": 1.1127450980392157,
"grad_norm": 0.4799065587797215,
"learning_rate": 4.119123648806046e-06,
"loss": 0.2239,
"step": 227
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.4843432542281396,
"learning_rate": 4.081252410917148e-06,
"loss": 0.233,
"step": 228
},
{
"epoch": 1.1225490196078431,
"grad_norm": 0.5134294387550117,
"learning_rate": 4.043435645048911e-06,
"loss": 0.2347,
"step": 229
},
{
"epoch": 1.1274509803921569,
"grad_norm": 0.4674896337352177,
"learning_rate": 4.005675593335818e-06,
"loss": 0.2107,
"step": 230
},
{
"epoch": 1.1323529411764706,
"grad_norm": 0.4612970868262565,
"learning_rate": 3.967974494549803e-06,
"loss": 0.2091,
"step": 231
},
{
"epoch": 1.1372549019607843,
"grad_norm": 0.5731954137918644,
"learning_rate": 3.930334583967514e-06,
"loss": 0.2372,
"step": 232
},
{
"epoch": 1.142156862745098,
"grad_norm": 0.5075101754997032,
"learning_rate": 3.892758093237788e-06,
"loss": 0.2304,
"step": 233
},
{
"epoch": 1.1470588235294117,
"grad_norm": 0.479594961464793,
"learning_rate": 3.855247250249331e-06,
"loss": 0.2086,
"step": 234
},
{
"epoch": 1.1519607843137254,
"grad_norm": 0.5071236660718516,
"learning_rate": 3.8178042789986355e-06,
"loss": 0.2306,
"step": 235
},
{
"epoch": 1.156862745098039,
"grad_norm": 0.46823532072465995,
"learning_rate": 3.7804313994581143e-06,
"loss": 0.2123,
"step": 236
},
{
"epoch": 1.161764705882353,
"grad_norm": 0.4963955283839783,
"learning_rate": 3.743130827444487e-06,
"loss": 0.2276,
"step": 237
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.4722357144886749,
"learning_rate": 3.705904774487396e-06,
"loss": 0.225,
"step": 238
},
{
"epoch": 1.1715686274509804,
"grad_norm": 0.4712118235647605,
"learning_rate": 3.6687554476982954e-06,
"loss": 0.2171,
"step": 239
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.4839778546728081,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.209,
"step": 240
},
{
"epoch": 1.1813725490196079,
"grad_norm": 0.4855855826367996,
"learning_rate": 3.5946957781940296e-06,
"loss": 0.215,
"step": 241
},
{
"epoch": 1.1862745098039216,
"grad_norm": 0.4661121070145884,
"learning_rate": 3.557789826434439e-06,
"loss": 0.2163,
"step": 242
},
{
"epoch": 1.1911764705882353,
"grad_norm": 0.5101563310470888,
"learning_rate": 3.5209693824936486e-06,
"loss": 0.2219,
"step": 243
},
{
"epoch": 1.196078431372549,
"grad_norm": 0.45246713488863194,
"learning_rate": 3.484236629434783e-06,
"loss": 0.22,
"step": 244
},
{
"epoch": 1.2009803921568627,
"grad_norm": 0.48803317168808835,
"learning_rate": 3.4475937451218257e-06,
"loss": 0.2421,
"step": 245
},
{
"epoch": 1.2058823529411764,
"grad_norm": 0.47070215757963546,
"learning_rate": 3.4110429020904924e-06,
"loss": 0.2112,
"step": 246
},
{
"epoch": 1.2107843137254901,
"grad_norm": 0.5433683679114224,
"learning_rate": 3.3745862674194246e-06,
"loss": 0.2318,
"step": 247
},
{
"epoch": 1.215686274509804,
"grad_norm": 0.494372431441497,
"learning_rate": 3.3382260026017027e-06,
"loss": 0.2263,
"step": 248
},
{
"epoch": 1.2205882352941178,
"grad_norm": 0.48636265742721485,
"learning_rate": 3.301964263416693e-06,
"loss": 0.2114,
"step": 249
},
{
"epoch": 1.2254901960784315,
"grad_norm": 0.4739784224799503,
"learning_rate": 3.2658031998022368e-06,
"loss": 0.2043,
"step": 250
},
{
"epoch": 1.2303921568627452,
"grad_norm": 0.4815109523313931,
"learning_rate": 3.2297449557271743e-06,
"loss": 0.2165,
"step": 251
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.45623070246733244,
"learning_rate": 3.1937916690642356e-06,
"loss": 0.2018,
"step": 252
},
{
"epoch": 1.2401960784313726,
"grad_norm": 0.5217926836177432,
"learning_rate": 3.1579454714632853e-06,
"loss": 0.242,
"step": 253
},
{
"epoch": 1.2450980392156863,
"grad_norm": 0.4853475659437706,
"learning_rate": 3.1222084882249375e-06,
"loss": 0.223,
"step": 254
},
{
"epoch": 1.25,
"grad_norm": 0.4723805429523192,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.2289,
"step": 255
},
{
"epoch": 1.2549019607843137,
"grad_norm": 0.5016994760967523,
"learning_rate": 3.0510706335366034e-06,
"loss": 0.2424,
"step": 256
},
{
"epoch": 1.2598039215686274,
"grad_norm": 0.5020856006286446,
"learning_rate": 3.015673979809457e-06,
"loss": 0.2283,
"step": 257
},
{
"epoch": 1.2647058823529411,
"grad_norm": 0.48802662651093065,
"learning_rate": 2.980394975640526e-06,
"loss": 0.2101,
"step": 258
},
{
"epoch": 1.2696078431372548,
"grad_norm": 0.5011136337277475,
"learning_rate": 2.9452357127018516e-06,
"loss": 0.2159,
"step": 259
},
{
"epoch": 1.2745098039215685,
"grad_norm": 0.5400359633757877,
"learning_rate": 2.910198275566085e-06,
"loss": 0.2277,
"step": 260
},
{
"epoch": 1.2794117647058822,
"grad_norm": 0.5137240802607623,
"learning_rate": 2.8752847415828923e-06,
"loss": 0.2215,
"step": 261
},
{
"epoch": 1.284313725490196,
"grad_norm": 0.47572170016097703,
"learning_rate": 2.8404971807557957e-06,
"loss": 0.2219,
"step": 262
},
{
"epoch": 1.2892156862745099,
"grad_norm": 0.485360542980026,
"learning_rate": 2.80583765561944e-06,
"loss": 0.2301,
"step": 263
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.5083747858931846,
"learning_rate": 2.771308221117309e-06,
"loss": 0.2148,
"step": 264
},
{
"epoch": 1.2990196078431373,
"grad_norm": 0.5607090322482143,
"learning_rate": 2.736910924479881e-06,
"loss": 0.2268,
"step": 265
},
{
"epoch": 1.303921568627451,
"grad_norm": 0.4896494954982839,
"learning_rate": 2.7026478051032625e-06,
"loss": 0.2351,
"step": 266
},
{
"epoch": 1.3088235294117647,
"grad_norm": 0.4929197961015348,
"learning_rate": 2.668520894428259e-06,
"loss": 0.2126,
"step": 267
},
{
"epoch": 1.3137254901960784,
"grad_norm": 0.4539652841374491,
"learning_rate": 2.6345322158199503e-06,
"loss": 0.2094,
"step": 268
},
{
"epoch": 1.3186274509803921,
"grad_norm": 0.48162901242603456,
"learning_rate": 2.600683784447704e-06,
"loss": 0.2084,
"step": 269
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.47818141221316735,
"learning_rate": 2.5669776071657194e-06,
"loss": 0.2156,
"step": 270
},
{
"epoch": 1.3284313725490196,
"grad_norm": 0.4928567042617787,
"learning_rate": 2.5334156823940237e-06,
"loss": 0.231,
"step": 271
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.49369116994850987,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.2221,
"step": 272
},
{
"epoch": 1.3382352941176472,
"grad_norm": 0.48305909995269875,
"learning_rate": 2.466732541180404e-06,
"loss": 0.2135,
"step": 273
},
{
"epoch": 1.343137254901961,
"grad_norm": 0.4951678749675107,
"learning_rate": 2.4336152783438984e-06,
"loss": 0.2297,
"step": 274
},
{
"epoch": 1.3480392156862746,
"grad_norm": 0.4657323792291692,
"learning_rate": 2.4006501749941097e-06,
"loss": 0.2274,
"step": 275
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.5241852758039561,
"learning_rate": 2.3678391856132203e-06,
"loss": 0.2231,
"step": 276
},
{
"epoch": 1.357843137254902,
"grad_norm": 0.48935802891939345,
"learning_rate": 2.335184255546083e-06,
"loss": 0.207,
"step": 277
},
{
"epoch": 1.3627450980392157,
"grad_norm": 0.49803971224569626,
"learning_rate": 2.302687320884876e-06,
"loss": 0.2041,
"step": 278
},
{
"epoch": 1.3676470588235294,
"grad_norm": 0.522300739994449,
"learning_rate": 2.2703503083543288e-06,
"loss": 0.2322,
"step": 279
},
{
"epoch": 1.3725490196078431,
"grad_norm": 0.5175680961463683,
"learning_rate": 2.238175135197471e-06,
"loss": 0.2292,
"step": 280
},
{
"epoch": 1.3774509803921569,
"grad_norm": 0.503707855088509,
"learning_rate": 2.206163709061976e-06,
"loss": 0.2127,
"step": 281
},
{
"epoch": 1.3823529411764706,
"grad_norm": 0.5074511535403994,
"learning_rate": 2.174317927887041e-06,
"loss": 0.229,
"step": 282
},
{
"epoch": 1.3872549019607843,
"grad_norm": 0.4925131267407971,
"learning_rate": 2.1426396797908764e-06,
"loss": 0.2165,
"step": 283
},
{
"epoch": 1.392156862745098,
"grad_norm": 0.5106602755906986,
"learning_rate": 2.1111308429587446e-06,
"loss": 0.2191,
"step": 284
},
{
"epoch": 1.3970588235294117,
"grad_norm": 0.49942574868641376,
"learning_rate": 2.0797932855316183e-06,
"loss": 0.2108,
"step": 285
},
{
"epoch": 1.4019607843137254,
"grad_norm": 0.4783779601581248,
"learning_rate": 2.048628865495403e-06,
"loss": 0.2154,
"step": 286
},
{
"epoch": 1.406862745098039,
"grad_norm": 0.4675068927861205,
"learning_rate": 2.017639430570794e-06,
"loss": 0.2132,
"step": 287
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.5045061184751272,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.2307,
"step": 288
},
{
"epoch": 1.4166666666666667,
"grad_norm": 0.48658941494641167,
"learning_rate": 1.956192854956397e-06,
"loss": 0.2213,
"step": 289
},
{
"epoch": 1.4215686274509804,
"grad_norm": 0.49002558251207806,
"learning_rate": 1.925739357399038e-06,
"loss": 0.229,
"step": 290
},
{
"epoch": 1.4264705882352942,
"grad_norm": 0.48906447602859293,
"learning_rate": 1.8954681310021434e-06,
"loss": 0.2272,
"step": 291
},
{
"epoch": 1.4313725490196079,
"grad_norm": 0.5142081803047547,
"learning_rate": 1.865380970529469e-06,
"loss": 0.2325,
"step": 292
},
{
"epoch": 1.4362745098039216,
"grad_norm": 0.4912306633034611,
"learning_rate": 1.8354796598315977e-06,
"loss": 0.2227,
"step": 293
},
{
"epoch": 1.4411764705882353,
"grad_norm": 0.5145847762913899,
"learning_rate": 1.8057659717401948e-06,
"loss": 0.2255,
"step": 294
},
{
"epoch": 1.446078431372549,
"grad_norm": 0.49819056747659707,
"learning_rate": 1.7762416679628792e-06,
"loss": 0.2176,
"step": 295
},
{
"epoch": 1.4509803921568627,
"grad_norm": 0.531159719917642,
"learning_rate": 1.746908498978791e-06,
"loss": 0.2486,
"step": 296
},
{
"epoch": 1.4558823529411764,
"grad_norm": 0.5032774150199282,
"learning_rate": 1.7177682039347875e-06,
"loss": 0.222,
"step": 297
},
{
"epoch": 1.4607843137254901,
"grad_norm": 0.5141853027734455,
"learning_rate": 1.6888225105423505e-06,
"loss": 0.2472,
"step": 298
},
{
"epoch": 1.465686274509804,
"grad_norm": 0.5159209957285815,
"learning_rate": 1.6600731349751303e-06,
"loss": 0.2137,
"step": 299
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.4548136506677179,
"learning_rate": 1.6315217817672142e-06,
"loss": 0.2028,
"step": 300
},
{
"epoch": 1.4754901960784315,
"grad_norm": 0.5217869077832439,
"learning_rate": 1.6031701437120512e-06,
"loss": 0.2282,
"step": 301
},
{
"epoch": 1.4803921568627452,
"grad_norm": 0.4743126314542162,
"learning_rate": 1.575019901762097e-06,
"loss": 0.2214,
"step": 302
},
{
"epoch": 1.4852941176470589,
"grad_norm": 0.46199991416748565,
"learning_rate": 1.5470727249291423e-06,
"loss": 0.2362,
"step": 303
},
{
"epoch": 1.4901960784313726,
"grad_norm": 0.4880202546561932,
"learning_rate": 1.5193302701853674e-06,
"loss": 0.2274,
"step": 304
},
{
"epoch": 1.4950980392156863,
"grad_norm": 0.5157149627873591,
"learning_rate": 1.4917941823650917e-06,
"loss": 0.2443,
"step": 305
},
{
"epoch": 1.5,
"grad_norm": 0.49949954036382943,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.2207,
"step": 306
},
{
"epoch": 1.5049019607843137,
"grad_norm": 0.47935960669362754,
"learning_rate": 1.4373476255586515e-06,
"loss": 0.2175,
"step": 307
},
{
"epoch": 1.5098039215686274,
"grad_norm": 0.47396201830045953,
"learning_rate": 1.410440384677791e-06,
"loss": 0.2274,
"step": 308
},
{
"epoch": 1.5147058823529411,
"grad_norm": 0.47348072824518034,
"learning_rate": 1.383745966739652e-06,
"loss": 0.2251,
"step": 309
},
{
"epoch": 1.5196078431372548,
"grad_norm": 0.49930262299209816,
"learning_rate": 1.3572659544410493e-06,
"loss": 0.2163,
"step": 310
},
{
"epoch": 1.5245098039215685,
"grad_norm": 0.4992054804915185,
"learning_rate": 1.3310019177668154e-06,
"loss": 0.2329,
"step": 311
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.4845633753575591,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.2224,
"step": 312
},
{
"epoch": 1.534313725490196,
"grad_norm": 0.4856356018612521,
"learning_rate": 1.2791279871130824e-06,
"loss": 0.2212,
"step": 313
},
{
"epoch": 1.5392156862745097,
"grad_norm": 0.5136795584316685,
"learning_rate": 1.2535211687093535e-06,
"loss": 0.2426,
"step": 314
},
{
"epoch": 1.5441176470588234,
"grad_norm": 0.4992226834558923,
"learning_rate": 1.2281364768991804e-06,
"loss": 0.2337,
"step": 315
},
{
"epoch": 1.5490196078431373,
"grad_norm": 0.46127348587510075,
"learning_rate": 1.202975416726464e-06,
"loss": 0.2206,
"step": 316
},
{
"epoch": 1.553921568627451,
"grad_norm": 0.4751601261175884,
"learning_rate": 1.1780394799761163e-06,
"loss": 0.221,
"step": 317
},
{
"epoch": 1.5588235294117647,
"grad_norm": 0.4687740072346912,
"learning_rate": 1.1533301450856054e-06,
"loss": 0.2147,
"step": 318
},
{
"epoch": 1.5637254901960784,
"grad_norm": 0.494253944161966,
"learning_rate": 1.1288488770573097e-06,
"loss": 0.2145,
"step": 319
},
{
"epoch": 1.5686274509803921,
"grad_norm": 0.45656769356436955,
"learning_rate": 1.1045971273716476e-06,
"loss": 0.2225,
"step": 320
},
{
"epoch": 1.5735294117647058,
"grad_norm": 0.4700967866343513,
"learning_rate": 1.0805763339010329e-06,
"loss": 0.2155,
"step": 321
},
{
"epoch": 1.5784313725490198,
"grad_norm": 0.4785394705087758,
"learning_rate": 1.0567879208246084e-06,
"loss": 0.2001,
"step": 322
},
{
"epoch": 1.5833333333333335,
"grad_norm": 0.4733995988013302,
"learning_rate": 1.0332332985438248e-06,
"loss": 0.2092,
"step": 323
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.4721076608628528,
"learning_rate": 1.0099138635988026e-06,
"loss": 0.2171,
"step": 324
},
{
"epoch": 1.593137254901961,
"grad_norm": 0.4760012038103648,
"learning_rate": 9.868309985855446e-07,
"loss": 0.2285,
"step": 325
},
{
"epoch": 1.5980392156862746,
"grad_norm": 0.46040599411726224,
"learning_rate": 9.639860720739524e-07,
"loss": 0.2069,
"step": 326
},
{
"epoch": 1.6029411764705883,
"grad_norm": 0.4604293862507961,
"learning_rate": 9.41380438526694e-07,
"loss": 0.2242,
"step": 327
},
{
"epoch": 1.607843137254902,
"grad_norm": 0.4757645851924005,
"learning_rate": 9.190154382188921e-07,
"loss": 0.228,
"step": 328
},
{
"epoch": 1.6127450980392157,
"grad_norm": 0.4950460680266549,
"learning_rate": 8.968923971586596e-07,
"loss": 0.2003,
"step": 329
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.49276290384706434,
"learning_rate": 8.750126270084891e-07,
"loss": 0.2225,
"step": 330
},
{
"epoch": 1.6225490196078431,
"grad_norm": 0.5021860257275205,
"learning_rate": 8.533774250074727e-07,
"loss": 0.2301,
"step": 331
},
{
"epoch": 1.6274509803921569,
"grad_norm": 0.5235703470648727,
"learning_rate": 8.31988073894403e-07,
"loss": 0.2459,
"step": 332
},
{
"epoch": 1.6323529411764706,
"grad_norm": 0.46964359302698966,
"learning_rate": 8.108458418317089e-07,
"loss": 0.2371,
"step": 333
},
{
"epoch": 1.6372549019607843,
"grad_norm": 0.4686413263785466,
"learning_rate": 7.899519823302743e-07,
"loss": 0.2243,
"step": 334
},
{
"epoch": 1.642156862745098,
"grad_norm": 0.44320620484465817,
"learning_rate": 7.693077341751138e-07,
"loss": 0.2046,
"step": 335
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.46592155506606175,
"learning_rate": 7.489143213519301e-07,
"loss": 0.2201,
"step": 336
},
{
"epoch": 1.6519607843137254,
"grad_norm": 0.4714629835797636,
"learning_rate": 7.287729529745386e-07,
"loss": 0.2333,
"step": 337
},
{
"epoch": 1.656862745098039,
"grad_norm": 0.5011962626617099,
"learning_rate": 7.088848232131862e-07,
"loss": 0.2286,
"step": 338
},
{
"epoch": 1.6617647058823528,
"grad_norm": 0.4840431535834992,
"learning_rate": 6.892511112237472e-07,
"loss": 0.2262,
"step": 339
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.49298475918099244,
"learning_rate": 6.698729810778065e-07,
"loss": 0.2423,
"step": 340
},
{
"epoch": 1.6715686274509802,
"grad_norm": 0.5150625646437968,
"learning_rate": 6.507515816936538e-07,
"loss": 0.2431,
"step": 341
},
{
"epoch": 1.6764705882352942,
"grad_norm": 0.4673229541968105,
"learning_rate": 6.318880467681527e-07,
"loss": 0.197,
"step": 342
},
{
"epoch": 1.6813725490196079,
"grad_norm": 0.4852125017912043,
"learning_rate": 6.132834947095334e-07,
"loss": 0.2163,
"step": 343
},
{
"epoch": 1.6862745098039216,
"grad_norm": 0.5016719261559505,
"learning_rate": 5.949390285710777e-07,
"loss": 0.2291,
"step": 344
},
{
"epoch": 1.6911764705882353,
"grad_norm": 0.5224531030766459,
"learning_rate": 5.768557359857241e-07,
"loss": 0.2428,
"step": 345
},
{
"epoch": 1.696078431372549,
"grad_norm": 0.5022281569239445,
"learning_rate": 5.590346891015758e-07,
"loss": 0.2311,
"step": 346
},
{
"epoch": 1.7009803921568627,
"grad_norm": 0.48366634743846737,
"learning_rate": 5.414769445183432e-07,
"loss": 0.2366,
"step": 347
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.43375840975227226,
"learning_rate": 5.241835432246888e-07,
"loss": 0.193,
"step": 348
},
{
"epoch": 1.7107843137254903,
"grad_norm": 0.48263877759982,
"learning_rate": 5.071555105365156e-07,
"loss": 0.1957,
"step": 349
},
{
"epoch": 1.715686274509804,
"grad_norm": 0.49674137654837686,
"learning_rate": 4.903938560361698e-07,
"loss": 0.226,
"step": 350
},
{
"epoch": 1.7205882352941178,
"grad_norm": 0.48486655987557653,
"learning_rate": 4.738995735125895e-07,
"loss": 0.2113,
"step": 351
},
{
"epoch": 1.7254901960784315,
"grad_norm": 0.4727249461110737,
"learning_rate": 4.576736409023813e-07,
"loss": 0.2344,
"step": 352
},
{
"epoch": 1.7303921568627452,
"grad_norm": 0.4794537140272506,
"learning_rate": 4.4171702023183663e-07,
"loss": 0.2117,
"step": 353
},
{
"epoch": 1.7352941176470589,
"grad_norm": 0.4798108438983259,
"learning_rate": 4.2603065755989493e-07,
"loss": 0.2253,
"step": 354
},
{
"epoch": 1.7401960784313726,
"grad_norm": 0.5174065293558706,
"learning_rate": 4.10615482922056e-07,
"loss": 0.2246,
"step": 355
},
{
"epoch": 1.7450980392156863,
"grad_norm": 0.4806212887917769,
"learning_rate": 3.9547241027523164e-07,
"loss": 0.2228,
"step": 356
},
{
"epoch": 1.75,
"grad_norm": 0.48418984478836974,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.2197,
"step": 357
},
{
"epoch": 1.7549019607843137,
"grad_norm": 0.46333557615327836,
"learning_rate": 3.660061460651981e-07,
"loss": 0.2194,
"step": 358
},
{
"epoch": 1.7598039215686274,
"grad_norm": 0.5008257727294606,
"learning_rate": 3.5168470153998937e-07,
"loss": 0.2435,
"step": 359
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.4551834191653335,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.2145,
"step": 360
},
{
"epoch": 1.7696078431372548,
"grad_norm": 0.48282176466836263,
"learning_rate": 3.238694331502451e-07,
"loss": 0.2114,
"step": 361
},
{
"epoch": 1.7745098039215685,
"grad_norm": 0.5041514081418971,
"learning_rate": 3.103772584371106e-07,
"loss": 0.2569,
"step": 362
},
{
"epoch": 1.7794117647058822,
"grad_norm": 0.4812170684266997,
"learning_rate": 2.9716312878216194e-07,
"loss": 0.2138,
"step": 363
},
{
"epoch": 1.784313725490196,
"grad_norm": 0.458630979087121,
"learning_rate": 2.842278276436128e-07,
"loss": 0.2066,
"step": 364
},
{
"epoch": 1.7892156862745097,
"grad_norm": 0.4589648323345021,
"learning_rate": 2.71572121948091e-07,
"loss": 0.2291,
"step": 365
},
{
"epoch": 1.7941176470588234,
"grad_norm": 0.48772513974124904,
"learning_rate": 2.5919676204517073e-07,
"loss": 0.225,
"step": 366
},
{
"epoch": 1.7990196078431373,
"grad_norm": 0.47688867452378064,
"learning_rate": 2.471024816628836e-07,
"loss": 0.2117,
"step": 367
},
{
"epoch": 1.803921568627451,
"grad_norm": 0.4970666334607962,
"learning_rate": 2.3528999786421758e-07,
"loss": 0.2203,
"step": 368
},
{
"epoch": 1.8088235294117647,
"grad_norm": 0.5068554152010685,
"learning_rate": 2.237600110046001e-07,
"loss": 0.2432,
"step": 369
},
{
"epoch": 1.8137254901960784,
"grad_norm": 0.46797846957419414,
"learning_rate": 2.1251320469037827e-07,
"loss": 0.2131,
"step": 370
},
{
"epoch": 1.8186274509803921,
"grad_norm": 0.49107452866371326,
"learning_rate": 2.0155024573828452e-07,
"loss": 0.2344,
"step": 371
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.49061343192781487,
"learning_rate": 1.908717841359048e-07,
"loss": 0.2174,
"step": 372
},
{
"epoch": 1.8284313725490198,
"grad_norm": 0.46324699413017995,
"learning_rate": 1.8047845300313726e-07,
"loss": 0.231,
"step": 373
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.4894838733242118,
"learning_rate": 1.7037086855465902e-07,
"loss": 0.2443,
"step": 374
},
{
"epoch": 1.8382352941176472,
"grad_norm": 0.4692723244866964,
"learning_rate": 1.6054963006338742e-07,
"loss": 0.2143,
"step": 375
},
{
"epoch": 1.843137254901961,
"grad_norm": 0.4698179780132864,
"learning_rate": 1.510153198249531e-07,
"loss": 0.2066,
"step": 376
},
{
"epoch": 1.8480392156862746,
"grad_norm": 0.4619932559660832,
"learning_rate": 1.4176850312317246e-07,
"loss": 0.2083,
"step": 377
},
{
"epoch": 1.8529411764705883,
"grad_norm": 0.5033482409328541,
"learning_rate": 1.328097281965357e-07,
"loss": 0.2385,
"step": 378
},
{
"epoch": 1.857843137254902,
"grad_norm": 0.49653636838509496,
"learning_rate": 1.241395262056999e-07,
"loss": 0.2259,
"step": 379
},
{
"epoch": 1.8627450980392157,
"grad_norm": 0.492657271024254,
"learning_rate": 1.157584112019966e-07,
"loss": 0.2354,
"step": 380
},
{
"epoch": 1.8676470588235294,
"grad_norm": 0.4798776902464036,
"learning_rate": 1.0766688009695548e-07,
"loss": 0.239,
"step": 381
},
{
"epoch": 1.8725490196078431,
"grad_norm": 0.4818365919471583,
"learning_rate": 9.986541263284077e-08,
"loss": 0.2329,
"step": 382
},
{
"epoch": 1.8774509803921569,
"grad_norm": 0.4969256351294847,
"learning_rate": 9.235447135421127e-08,
"loss": 0.2253,
"step": 383
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.5001352235803699,
"learning_rate": 8.513450158049109e-08,
"loss": 0.2435,
"step": 384
},
{
"epoch": 1.8872549019607843,
"grad_norm": 0.4851147711386313,
"learning_rate": 7.820593137957244e-08,
"loss": 0.2113,
"step": 385
},
{
"epoch": 1.892156862745098,
"grad_norm": 0.49751220828120635,
"learning_rate": 7.156917154243048e-08,
"loss": 0.2326,
"step": 386
},
{
"epoch": 1.8970588235294117,
"grad_norm": 0.4377191985748351,
"learning_rate": 6.522461555877213e-08,
"loss": 0.2121,
"step": 387
},
{
"epoch": 1.9019607843137254,
"grad_norm": 0.44546214474502566,
"learning_rate": 5.917263959370312e-08,
"loss": 0.2092,
"step": 388
},
{
"epoch": 1.906862745098039,
"grad_norm": 0.49712054558929825,
"learning_rate": 5.341360246542804e-08,
"loss": 0.2166,
"step": 389
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.49011006260488194,
"learning_rate": 4.794784562397459e-08,
"loss": 0.2224,
"step": 390
},
{
"epoch": 1.9166666666666665,
"grad_norm": 0.4816656251558823,
"learning_rate": 4.2775693130948094e-08,
"loss": 0.2199,
"step": 391
},
{
"epoch": 1.9215686274509802,
"grad_norm": 0.48846238423214716,
"learning_rate": 3.7897451640321326e-08,
"loss": 0.2388,
"step": 392
},
{
"epoch": 1.9264705882352942,
"grad_norm": 0.4666098884837087,
"learning_rate": 3.3313410380250157e-08,
"loss": 0.2037,
"step": 393
},
{
"epoch": 1.9313725490196079,
"grad_norm": 0.4682399224429251,
"learning_rate": 2.9023841135927822e-08,
"loss": 0.2142,
"step": 394
},
{
"epoch": 1.9362745098039216,
"grad_norm": 0.4736092237883573,
"learning_rate": 2.5028998233467272e-08,
"loss": 0.2228,
"step": 395
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.4697089850402861,
"learning_rate": 2.1329118524827662e-08,
"loss": 0.2315,
"step": 396
},
{
"epoch": 1.946078431372549,
"grad_norm": 0.4672227329719579,
"learning_rate": 1.7924421373766153e-08,
"loss": 0.2156,
"step": 397
},
{
"epoch": 1.9509803921568627,
"grad_norm": 0.497487935493445,
"learning_rate": 1.481510864283553e-08,
"loss": 0.2292,
"step": 398
},
{
"epoch": 1.9558823529411766,
"grad_norm": 0.46974714857662914,
"learning_rate": 1.200136468141544e-08,
"loss": 0.2111,
"step": 399
},
{
"epoch": 1.9607843137254903,
"grad_norm": 0.4879408121501933,
"learning_rate": 9.48335631477948e-09,
"loss": 0.2219,
"step": 400
},
{
"epoch": 1.9607843137254903,
"eval_loss": 0.2928379774093628,
"eval_runtime": 4.5224,
"eval_samples_per_second": 14.594,
"eval_steps_per_second": 3.759,
"step": 400
},
{
"epoch": 1.965686274509804,
"grad_norm": 0.5040773663379564,
"learning_rate": 7.261232834209208e-09,
"loss": 0.2405,
"step": 401
},
{
"epoch": 1.9705882352941178,
"grad_norm": 0.48149306121852936,
"learning_rate": 5.3351259881379016e-09,
"loss": 0.2352,
"step": 402
},
{
"epoch": 1.9754901960784315,
"grad_norm": 0.4749550695488964,
"learning_rate": 3.705149974342348e-09,
"loss": 0.2153,
"step": 403
},
{
"epoch": 1.9803921568627452,
"grad_norm": 0.4819452418309937,
"learning_rate": 2.371401433170495e-09,
"loss": 0.232,
"step": 404
},
{
"epoch": 1.9852941176470589,
"grad_norm": 0.46988250711319407,
"learning_rate": 1.3339594418138036e-09,
"loss": 0.2024,
"step": 405
},
{
"epoch": 1.9901960784313726,
"grad_norm": 0.4762711111697877,
"learning_rate": 5.928855096154485e-10,
"loss": 0.2009,
"step": 406
},
{
"epoch": 1.9950980392156863,
"grad_norm": 0.46631748299658515,
"learning_rate": 1.4822357442656475e-10,
"loss": 0.2188,
"step": 407
},
{
"epoch": 2.0,
"grad_norm": 0.532021803787812,
"learning_rate": 0.0,
"loss": 0.2193,
"step": 408
},
{
"epoch": 2.0,
"step": 408,
"total_flos": 38770669780992.0,
"train_loss": 0.2513315852950601,
"train_runtime": 1293.2809,
"train_samples_per_second": 10.061,
"train_steps_per_second": 0.315
}
],
"logging_steps": 1,
"max_steps": 408,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 38770669780992.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}