openr1_32B / trainer_state.json
marianna13's picture
Upload folder using huggingface_hub
51f8443 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 340,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014705882352941176,
"grad_norm": 2.547836181388219,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.693,
"step": 1
},
{
"epoch": 0.029411764705882353,
"grad_norm": 2.5270813414533366,
"learning_rate": 4.705882352941177e-06,
"loss": 0.6892,
"step": 2
},
{
"epoch": 0.04411764705882353,
"grad_norm": 2.355483907211656,
"learning_rate": 7.058823529411766e-06,
"loss": 0.6851,
"step": 3
},
{
"epoch": 0.058823529411764705,
"grad_norm": 1.8130870424308736,
"learning_rate": 9.411764705882354e-06,
"loss": 0.6513,
"step": 4
},
{
"epoch": 0.07352941176470588,
"grad_norm": 1.2216520407231852,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.6231,
"step": 5
},
{
"epoch": 0.08823529411764706,
"grad_norm": 1.503718312141216,
"learning_rate": 1.4117647058823532e-05,
"loss": 0.6169,
"step": 6
},
{
"epoch": 0.10294117647058823,
"grad_norm": 1.7964660884238277,
"learning_rate": 1.647058823529412e-05,
"loss": 0.5687,
"step": 7
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.7269912533351832,
"learning_rate": 1.8823529411764708e-05,
"loss": 0.5586,
"step": 8
},
{
"epoch": 0.1323529411764706,
"grad_norm": 0.7251113424301481,
"learning_rate": 2.1176470588235296e-05,
"loss": 0.5208,
"step": 9
},
{
"epoch": 0.14705882352941177,
"grad_norm": 1.1160352667226052,
"learning_rate": 2.3529411764705884e-05,
"loss": 0.5086,
"step": 10
},
{
"epoch": 0.16176470588235295,
"grad_norm": 0.9015674485810162,
"learning_rate": 2.5882352941176475e-05,
"loss": 0.4903,
"step": 11
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.7153137232697248,
"learning_rate": 2.8235294117647063e-05,
"loss": 0.4841,
"step": 12
},
{
"epoch": 0.19117647058823528,
"grad_norm": 0.8088993429376143,
"learning_rate": 3.0588235294117644e-05,
"loss": 0.4711,
"step": 13
},
{
"epoch": 0.20588235294117646,
"grad_norm": 0.5653053624299016,
"learning_rate": 3.294117647058824e-05,
"loss": 0.4567,
"step": 14
},
{
"epoch": 0.22058823529411764,
"grad_norm": 0.5534586516972508,
"learning_rate": 3.529411764705883e-05,
"loss": 0.4522,
"step": 15
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.5736228520847013,
"learning_rate": 3.7647058823529415e-05,
"loss": 0.4415,
"step": 16
},
{
"epoch": 0.25,
"grad_norm": 0.4337208100637674,
"learning_rate": 4e-05,
"loss": 0.4327,
"step": 17
},
{
"epoch": 0.2647058823529412,
"grad_norm": 0.4876347786470063,
"learning_rate": 4.235294117647059e-05,
"loss": 0.43,
"step": 18
},
{
"epoch": 0.27941176470588236,
"grad_norm": 0.4061097022997626,
"learning_rate": 4.470588235294118e-05,
"loss": 0.4317,
"step": 19
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.4103637458773689,
"learning_rate": 4.705882352941177e-05,
"loss": 0.43,
"step": 20
},
{
"epoch": 0.3088235294117647,
"grad_norm": 0.3888995331551954,
"learning_rate": 4.941176470588236e-05,
"loss": 0.4199,
"step": 21
},
{
"epoch": 0.3235294117647059,
"grad_norm": 0.3268371484433097,
"learning_rate": 5.176470588235295e-05,
"loss": 0.4185,
"step": 22
},
{
"epoch": 0.3382352941176471,
"grad_norm": 0.39223698739269497,
"learning_rate": 5.411764705882354e-05,
"loss": 0.4128,
"step": 23
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.2779761293219138,
"learning_rate": 5.6470588235294126e-05,
"loss": 0.4043,
"step": 24
},
{
"epoch": 0.36764705882352944,
"grad_norm": 0.3989443078505564,
"learning_rate": 5.8823529411764714e-05,
"loss": 0.4089,
"step": 25
},
{
"epoch": 0.38235294117647056,
"grad_norm": 0.38093200095401203,
"learning_rate": 6.117647058823529e-05,
"loss": 0.4017,
"step": 26
},
{
"epoch": 0.39705882352941174,
"grad_norm": 0.4064320796173135,
"learning_rate": 6.352941176470589e-05,
"loss": 0.4011,
"step": 27
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.4971422289820653,
"learning_rate": 6.588235294117648e-05,
"loss": 0.3989,
"step": 28
},
{
"epoch": 0.4264705882352941,
"grad_norm": 0.7406572881589351,
"learning_rate": 6.823529411764707e-05,
"loss": 0.4051,
"step": 29
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.8206793701783639,
"learning_rate": 7.058823529411765e-05,
"loss": 0.406,
"step": 30
},
{
"epoch": 0.45588235294117646,
"grad_norm": 0.46999206766579715,
"learning_rate": 7.294117647058824e-05,
"loss": 0.3959,
"step": 31
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.7832181937714577,
"learning_rate": 7.529411764705883e-05,
"loss": 0.3979,
"step": 32
},
{
"epoch": 0.4852941176470588,
"grad_norm": 0.6729630572440819,
"learning_rate": 7.764705882352942e-05,
"loss": 0.4037,
"step": 33
},
{
"epoch": 0.5,
"grad_norm": 0.5845843059576046,
"learning_rate": 8e-05,
"loss": 0.3981,
"step": 34
},
{
"epoch": 0.5147058823529411,
"grad_norm": 0.5179039508771461,
"learning_rate": 7.999789193948694e-05,
"loss": 0.3899,
"step": 35
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.8446247503414208,
"learning_rate": 7.999156798014366e-05,
"loss": 0.3919,
"step": 36
},
{
"epoch": 0.5441176470588235,
"grad_norm": 0.5811147704566235,
"learning_rate": 7.998102878853464e-05,
"loss": 0.3842,
"step": 37
},
{
"epoch": 0.5588235294117647,
"grad_norm": 0.7042141225204365,
"learning_rate": 7.996627547552256e-05,
"loss": 0.3887,
"step": 38
},
{
"epoch": 0.5735294117647058,
"grad_norm": 0.5162453656112366,
"learning_rate": 7.994730959615125e-05,
"loss": 0.3801,
"step": 39
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.6306069956051037,
"learning_rate": 7.992413314948177e-05,
"loss": 0.3872,
"step": 40
},
{
"epoch": 0.6029411764705882,
"grad_norm": 0.4954620508512063,
"learning_rate": 7.989674857838173e-05,
"loss": 0.3799,
"step": 41
},
{
"epoch": 0.6176470588235294,
"grad_norm": 0.4655956434769946,
"learning_rate": 7.986515876926777e-05,
"loss": 0.3795,
"step": 42
},
{
"epoch": 0.6323529411764706,
"grad_norm": 0.4180599863638677,
"learning_rate": 7.982936705180139e-05,
"loss": 0.3797,
"step": 43
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.3370827795593815,
"learning_rate": 7.978937719853786e-05,
"loss": 0.3713,
"step": 44
},
{
"epoch": 0.6617647058823529,
"grad_norm": 0.302709681434872,
"learning_rate": 7.974519342452872e-05,
"loss": 0.37,
"step": 45
},
{
"epoch": 0.6764705882352942,
"grad_norm": 0.35605679848702854,
"learning_rate": 7.969682038687744e-05,
"loss": 0.3706,
"step": 46
},
{
"epoch": 0.6911764705882353,
"grad_norm": 0.3386095408702867,
"learning_rate": 7.964426318424855e-05,
"loss": 0.3717,
"step": 47
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.3740443449230433,
"learning_rate": 7.958752735633022e-05,
"loss": 0.3678,
"step": 48
},
{
"epoch": 0.7205882352941176,
"grad_norm": 0.291877051555131,
"learning_rate": 7.952661888325038e-05,
"loss": 0.3667,
"step": 49
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.22798970572724098,
"learning_rate": 7.946154418494639e-05,
"loss": 0.3658,
"step": 50
},
{
"epoch": 0.75,
"grad_norm": 0.3017794055631277,
"learning_rate": 7.939231012048833e-05,
"loss": 0.366,
"step": 51
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.2183488887265105,
"learning_rate": 7.931892398735608e-05,
"loss": 0.364,
"step": 52
},
{
"epoch": 0.7794117647058824,
"grad_norm": 0.2651745781496878,
"learning_rate": 7.92413935206701e-05,
"loss": 0.36,
"step": 53
},
{
"epoch": 0.7941176470588235,
"grad_norm": 0.25030378975718287,
"learning_rate": 7.915972689237618e-05,
"loss": 0.3629,
"step": 54
},
{
"epoch": 0.8088235294117647,
"grad_norm": 0.2531618636697317,
"learning_rate": 7.907393271038403e-05,
"loss": 0.3548,
"step": 55
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.33260642234735066,
"learning_rate": 7.898402001766002e-05,
"loss": 0.364,
"step": 56
},
{
"epoch": 0.8382352941176471,
"grad_norm": 0.41115101334491694,
"learning_rate": 7.888999829127398e-05,
"loss": 0.3578,
"step": 57
},
{
"epoch": 0.8529411764705882,
"grad_norm": 0.44590061545961934,
"learning_rate": 7.879187744140039e-05,
"loss": 0.3607,
"step": 58
},
{
"epoch": 0.8676470588235294,
"grad_norm": 0.522962751461851,
"learning_rate": 7.868966781027367e-05,
"loss": 0.3592,
"step": 59
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.6601512591505266,
"learning_rate": 7.858338017109822e-05,
"loss": 0.3657,
"step": 60
},
{
"epoch": 0.8970588235294118,
"grad_norm": 0.5825651454852345,
"learning_rate": 7.847302572691277e-05,
"loss": 0.3625,
"step": 61
},
{
"epoch": 0.9117647058823529,
"grad_norm": 0.42428578770971376,
"learning_rate": 7.835861610940965e-05,
"loss": 0.3613,
"step": 62
},
{
"epoch": 0.9264705882352942,
"grad_norm": 0.3717888932030909,
"learning_rate": 7.824016337770872e-05,
"loss": 0.3573,
"step": 63
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.42447915248441553,
"learning_rate": 7.811768001708627e-05,
"loss": 0.362,
"step": 64
},
{
"epoch": 0.9558823529411765,
"grad_norm": 0.3876778292466403,
"learning_rate": 7.799117893765913e-05,
"loss": 0.3557,
"step": 65
},
{
"epoch": 0.9705882352941176,
"grad_norm": 0.26608755562026015,
"learning_rate": 7.786067347302379e-05,
"loss": 0.3545,
"step": 66
},
{
"epoch": 0.9852941176470589,
"grad_norm": 0.3434111610187913,
"learning_rate": 7.77261773788511e-05,
"loss": 0.353,
"step": 67
},
{
"epoch": 1.0,
"grad_norm": 0.3497537599784584,
"learning_rate": 7.758770483143634e-05,
"loss": 0.3545,
"step": 68
},
{
"epoch": 1.0147058823529411,
"grad_norm": 0.3391085192527439,
"learning_rate": 7.744527042620496e-05,
"loss": 0.3419,
"step": 69
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.5077627087325893,
"learning_rate": 7.729888917617424e-05,
"loss": 0.3471,
"step": 70
},
{
"epoch": 1.0441176470588236,
"grad_norm": 0.5532598557050799,
"learning_rate": 7.714857651037081e-05,
"loss": 0.3412,
"step": 71
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.5780960309028261,
"learning_rate": 7.699434827220448e-05,
"loss": 0.3428,
"step": 72
},
{
"epoch": 1.0735294117647058,
"grad_norm": 0.49974865026294507,
"learning_rate": 7.683622071779816e-05,
"loss": 0.3364,
"step": 73
},
{
"epoch": 1.088235294117647,
"grad_norm": 0.36747177299557693,
"learning_rate": 7.667421051427453e-05,
"loss": 0.3389,
"step": 74
},
{
"epoch": 1.1029411764705883,
"grad_norm": 0.3669907335719105,
"learning_rate": 7.650833473799922e-05,
"loss": 0.3312,
"step": 75
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.3922235437218937,
"learning_rate": 7.633861087278095e-05,
"loss": 0.3293,
"step": 76
},
{
"epoch": 1.1323529411764706,
"grad_norm": 0.3719011690300337,
"learning_rate": 7.616505680802863e-05,
"loss": 0.3375,
"step": 77
},
{
"epoch": 1.1470588235294117,
"grad_norm": 0.3377098123124342,
"learning_rate": 7.598769083686582e-05,
"loss": 0.3405,
"step": 78
},
{
"epoch": 1.161764705882353,
"grad_norm": 0.29899552501628507,
"learning_rate": 7.58065316542025e-05,
"loss": 0.3351,
"step": 79
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.32828082894171606,
"learning_rate": 7.562159835476466e-05,
"loss": 0.3389,
"step": 80
},
{
"epoch": 1.1911764705882353,
"grad_norm": 0.3445874209931112,
"learning_rate": 7.543291043108159e-05,
"loss": 0.3378,
"step": 81
},
{
"epoch": 1.2058823529411764,
"grad_norm": 0.3023978628206444,
"learning_rate": 7.524048777143139e-05,
"loss": 0.3387,
"step": 82
},
{
"epoch": 1.2205882352941178,
"grad_norm": 0.25558483521205383,
"learning_rate": 7.504435065774455e-05,
"loss": 0.3306,
"step": 83
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.32953601128265847,
"learning_rate": 7.48445197634663e-05,
"loss": 0.3349,
"step": 84
},
{
"epoch": 1.25,
"grad_norm": 0.33756326919937646,
"learning_rate": 7.464101615137756e-05,
"loss": 0.3324,
"step": 85
},
{
"epoch": 1.2647058823529411,
"grad_norm": 0.24120088917587124,
"learning_rate": 7.443386127137472e-05,
"loss": 0.3317,
"step": 86
},
{
"epoch": 1.2794117647058822,
"grad_norm": 0.2914122682482861,
"learning_rate": 7.422307695820893e-05,
"loss": 0.3346,
"step": 87
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.32020365300061576,
"learning_rate": 7.400868542918457e-05,
"loss": 0.3303,
"step": 88
},
{
"epoch": 1.3088235294117647,
"grad_norm": 0.2516025057415832,
"learning_rate": 7.379070928181747e-05,
"loss": 0.3351,
"step": 89
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.25001986331755705,
"learning_rate": 7.356917149145308e-05,
"loss": 0.3353,
"step": 90
},
{
"epoch": 1.3382352941176472,
"grad_norm": 0.2792197235528977,
"learning_rate": 7.334409540884479e-05,
"loss": 0.3294,
"step": 91
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.28078783596981044,
"learning_rate": 7.311550475769272e-05,
"loss": 0.3248,
"step": 92
},
{
"epoch": 1.3676470588235294,
"grad_norm": 0.37967936203746205,
"learning_rate": 7.288342363214313e-05,
"loss": 0.3328,
"step": 93
},
{
"epoch": 1.3823529411764706,
"grad_norm": 0.5149190442137033,
"learning_rate": 7.264787649424888e-05,
"loss": 0.3312,
"step": 94
},
{
"epoch": 1.3970588235294117,
"grad_norm": 0.6122843221799549,
"learning_rate": 7.240888817139094e-05,
"loss": 0.3348,
"step": 95
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.5886981731078635,
"learning_rate": 7.216648385366167e-05,
"loss": 0.3395,
"step": 96
},
{
"epoch": 1.4264705882352942,
"grad_norm": 0.34409848388448383,
"learning_rate": 7.192068909120959e-05,
"loss": 0.3306,
"step": 97
},
{
"epoch": 1.4411764705882353,
"grad_norm": 0.2978091494500524,
"learning_rate": 7.167152979154634e-05,
"loss": 0.3334,
"step": 98
},
{
"epoch": 1.4558823529411764,
"grad_norm": 0.4400729737039225,
"learning_rate": 7.141903221681595e-05,
"loss": 0.3404,
"step": 99
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.3694283042397561,
"learning_rate": 7.116322298102681e-05,
"loss": 0.3332,
"step": 100
},
{
"epoch": 1.4852941176470589,
"grad_norm": 0.2639067391873521,
"learning_rate": 7.090412904724636e-05,
"loss": 0.3313,
"step": 101
},
{
"epoch": 1.5,
"grad_norm": 0.37416820779021187,
"learning_rate": 7.064177772475912e-05,
"loss": 0.3285,
"step": 102
},
{
"epoch": 1.5147058823529411,
"grad_norm": 0.33275488946271775,
"learning_rate": 7.037619666618829e-05,
"loss": 0.3361,
"step": 103
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.23894606263596457,
"learning_rate": 7.010741386458099e-05,
"loss": 0.3388,
"step": 104
},
{
"epoch": 1.5441176470588234,
"grad_norm": 0.3668325550727754,
"learning_rate": 6.983545765045774e-05,
"loss": 0.3311,
"step": 105
},
{
"epoch": 1.5588235294117647,
"grad_norm": 0.2399104316842811,
"learning_rate": 6.956035668882637e-05,
"loss": 0.3297,
"step": 106
},
{
"epoch": 1.5735294117647058,
"grad_norm": 0.2357871283562126,
"learning_rate": 6.928213997616059e-05,
"loss": 0.3318,
"step": 107
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.2849131707817035,
"learning_rate": 6.900083683734372e-05,
"loss": 0.3304,
"step": 108
},
{
"epoch": 1.6029411764705883,
"grad_norm": 0.17423798370987492,
"learning_rate": 6.871647692257768e-05,
"loss": 0.3276,
"step": 109
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.2548925157793133,
"learning_rate": 6.842909020425789e-05,
"loss": 0.334,
"step": 110
},
{
"epoch": 1.6323529411764706,
"grad_norm": 0.24942793454876405,
"learning_rate": 6.8138706973814e-05,
"loss": 0.3286,
"step": 111
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.18689295644396056,
"learning_rate": 6.784535783851708e-05,
"loss": 0.3266,
"step": 112
},
{
"epoch": 1.6617647058823528,
"grad_norm": 0.2911151755362797,
"learning_rate": 6.754907371825355e-05,
"loss": 0.3262,
"step": 113
},
{
"epoch": 1.6764705882352942,
"grad_norm": 0.27837463790987516,
"learning_rate": 6.724988584226616e-05,
"loss": 0.3279,
"step": 114
},
{
"epoch": 1.6911764705882353,
"grad_norm": 0.28908336337027807,
"learning_rate": 6.69478257458623e-05,
"loss": 0.3281,
"step": 115
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.34403162565227147,
"learning_rate": 6.664292526709001e-05,
"loss": 0.3313,
"step": 116
},
{
"epoch": 1.7205882352941178,
"grad_norm": 0.29186151217355694,
"learning_rate": 6.633521654338233e-05,
"loss": 0.3334,
"step": 117
},
{
"epoch": 1.7352941176470589,
"grad_norm": 0.3213370329801203,
"learning_rate": 6.602473200816969e-05,
"loss": 0.3267,
"step": 118
},
{
"epoch": 1.75,
"grad_norm": 0.30160846132752556,
"learning_rate": 6.571150438746157e-05,
"loss": 0.3242,
"step": 119
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.26892209589390914,
"learning_rate": 6.539556669639691e-05,
"loss": 0.3244,
"step": 120
},
{
"epoch": 1.7794117647058822,
"grad_norm": 0.2440948850515908,
"learning_rate": 6.507695223576428e-05,
"loss": 0.3229,
"step": 121
},
{
"epoch": 1.7941176470588234,
"grad_norm": 0.20672290158202897,
"learning_rate": 6.475569458849178e-05,
"loss": 0.331,
"step": 122
},
{
"epoch": 1.8088235294117647,
"grad_norm": 0.2459670080944092,
"learning_rate": 6.443182761610752e-05,
"loss": 0.3321,
"step": 123
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.2704407966484609,
"learning_rate": 6.410538545517026e-05,
"loss": 0.3288,
"step": 124
},
{
"epoch": 1.8382352941176472,
"grad_norm": 0.23662938749712678,
"learning_rate": 6.377640251367148e-05,
"loss": 0.3285,
"step": 125
},
{
"epoch": 1.8529411764705883,
"grad_norm": 0.23508733004486604,
"learning_rate": 6.344491346740859e-05,
"loss": 0.3265,
"step": 126
},
{
"epoch": 1.8676470588235294,
"grad_norm": 0.15905797582955938,
"learning_rate": 6.311095325633006e-05,
"loss": 0.3287,
"step": 127
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.16194725388549308,
"learning_rate": 6.277455708085255e-05,
"loss": 0.3193,
"step": 128
},
{
"epoch": 1.8970588235294117,
"grad_norm": 0.19431781452308503,
"learning_rate": 6.24357603981508e-05,
"loss": 0.3218,
"step": 129
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.1530375931457355,
"learning_rate": 6.209459891842023e-05,
"loss": 0.3232,
"step": 130
},
{
"epoch": 1.9264705882352942,
"grad_norm": 0.20552563428946305,
"learning_rate": 6.175110860111307e-05,
"loss": 0.3291,
"step": 131
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.16800259989430927,
"learning_rate": 6.140532565114801e-05,
"loss": 0.3255,
"step": 132
},
{
"epoch": 1.9558823529411766,
"grad_norm": 0.15539283189220082,
"learning_rate": 6.105728651509424e-05,
"loss": 0.3254,
"step": 133
},
{
"epoch": 1.9705882352941178,
"grad_norm": 0.16103547056804585,
"learning_rate": 6.070702787732971e-05,
"loss": 0.3249,
"step": 134
},
{
"epoch": 1.9852941176470589,
"grad_norm": 0.16239367515131709,
"learning_rate": 6.0354586656174606e-05,
"loss": 0.3288,
"step": 135
},
{
"epoch": 2.0,
"grad_norm": 0.20166055674169706,
"learning_rate": 6.000000000000001e-05,
"loss": 0.3099,
"step": 136
},
{
"epoch": 2.014705882352941,
"grad_norm": 0.24260127096827225,
"learning_rate": 5.964330528331234e-05,
"loss": 0.3056,
"step": 137
},
{
"epoch": 2.0294117647058822,
"grad_norm": 0.33127332158017103,
"learning_rate": 5.9284540102813964e-05,
"loss": 0.3,
"step": 138
},
{
"epoch": 2.0441176470588234,
"grad_norm": 0.43353541229148657,
"learning_rate": 5.892374227344041e-05,
"loss": 0.308,
"step": 139
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.49315921894412995,
"learning_rate": 5.856094982437454e-05,
"loss": 0.3067,
"step": 140
},
{
"epoch": 2.073529411764706,
"grad_norm": 0.450512729936066,
"learning_rate": 5.819620099503818e-05,
"loss": 0.3101,
"step": 141
},
{
"epoch": 2.088235294117647,
"grad_norm": 0.343920919271517,
"learning_rate": 5.782953423106154e-05,
"loss": 0.3046,
"step": 142
},
{
"epoch": 2.1029411764705883,
"grad_norm": 0.2814078808698335,
"learning_rate": 5.746098818023093e-05,
"loss": 0.2988,
"step": 143
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.3529797938787429,
"learning_rate": 5.709060168841524e-05,
"loss": 0.3033,
"step": 144
},
{
"epoch": 2.1323529411764706,
"grad_norm": 0.33042737999638,
"learning_rate": 5.6718413795471346e-05,
"loss": 0.3028,
"step": 145
},
{
"epoch": 2.1470588235294117,
"grad_norm": 0.16519735683489525,
"learning_rate": 5.634446373112926e-05,
"loss": 0.3035,
"step": 146
},
{
"epoch": 2.161764705882353,
"grad_norm": 0.23206099454265375,
"learning_rate": 5.596879091085724e-05,
"loss": 0.3001,
"step": 147
},
{
"epoch": 2.176470588235294,
"grad_norm": 0.2771142062084271,
"learning_rate": 5.5591434931707176e-05,
"loss": 0.3005,
"step": 148
},
{
"epoch": 2.1911764705882355,
"grad_norm": 0.2431821354495366,
"learning_rate": 5.5212435568141036e-05,
"loss": 0.3045,
"step": 149
},
{
"epoch": 2.2058823529411766,
"grad_norm": 0.18676533826823546,
"learning_rate": 5.4831832767838436e-05,
"loss": 0.2993,
"step": 150
},
{
"epoch": 2.2205882352941178,
"grad_norm": 0.21179962464182328,
"learning_rate": 5.444966664748613e-05,
"loss": 0.2967,
"step": 151
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.2277766014598405,
"learning_rate": 5.406597748854947e-05,
"loss": 0.2993,
"step": 152
},
{
"epoch": 2.25,
"grad_norm": 0.19342351829324006,
"learning_rate": 5.368080573302676e-05,
"loss": 0.3044,
"step": 153
},
{
"epoch": 2.264705882352941,
"grad_norm": 0.21064512781524572,
"learning_rate": 5.329419197918639e-05,
"loss": 0.3062,
"step": 154
},
{
"epoch": 2.2794117647058822,
"grad_norm": 0.15702816077320908,
"learning_rate": 5.29061769772878e-05,
"loss": 0.2995,
"step": 155
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.17947407344934332,
"learning_rate": 5.251680162528618e-05,
"loss": 0.3013,
"step": 156
},
{
"epoch": 2.3088235294117645,
"grad_norm": 0.18466536636530687,
"learning_rate": 5.212610696452174e-05,
"loss": 0.3036,
"step": 157
},
{
"epoch": 2.323529411764706,
"grad_norm": 0.1680929482668945,
"learning_rate": 5.173413417539385e-05,
"loss": 0.3029,
"step": 158
},
{
"epoch": 2.338235294117647,
"grad_norm": 0.1654731382812225,
"learning_rate": 5.134092457302044e-05,
"loss": 0.3024,
"step": 159
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.15258036230353525,
"learning_rate": 5.0946519602883326e-05,
"loss": 0.3037,
"step": 160
},
{
"epoch": 2.3676470588235294,
"grad_norm": 0.1491955575134207,
"learning_rate": 5.0550960836459674e-05,
"loss": 0.3044,
"step": 161
},
{
"epoch": 2.3823529411764706,
"grad_norm": 0.14545439455177017,
"learning_rate": 5.0154289966840315e-05,
"loss": 0.2954,
"step": 162
},
{
"epoch": 2.3970588235294117,
"grad_norm": 0.13499905174566654,
"learning_rate": 4.975654880433509e-05,
"loss": 0.2991,
"step": 163
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.13529530394095032,
"learning_rate": 4.935777927206595e-05,
"loss": 0.301,
"step": 164
},
{
"epoch": 2.426470588235294,
"grad_norm": 0.14447147193835977,
"learning_rate": 4.895802340154813e-05,
"loss": 0.3038,
"step": 165
},
{
"epoch": 2.4411764705882355,
"grad_norm": 0.1368779663418461,
"learning_rate": 4.85573233282599e-05,
"loss": 0.3022,
"step": 166
},
{
"epoch": 2.4558823529411766,
"grad_norm": 0.13420641137376216,
"learning_rate": 4.815572128720138e-05,
"loss": 0.3049,
"step": 167
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.12163815573974876,
"learning_rate": 4.7753259608442804e-05,
"loss": 0.2998,
"step": 168
},
{
"epoch": 2.485294117647059,
"grad_norm": 0.15356753682422525,
"learning_rate": 4.734998071266282e-05,
"loss": 0.298,
"step": 169
},
{
"epoch": 2.5,
"grad_norm": 0.11474062387695687,
"learning_rate": 4.694592710667723e-05,
"loss": 0.3068,
"step": 170
},
{
"epoch": 2.514705882352941,
"grad_norm": 0.15293267687494932,
"learning_rate": 4.65411413789586e-05,
"loss": 0.3005,
"step": 171
},
{
"epoch": 2.5294117647058822,
"grad_norm": 0.13885259280209167,
"learning_rate": 4.6135666195147426e-05,
"loss": 0.2994,
"step": 172
},
{
"epoch": 2.5441176470588234,
"grad_norm": 0.11212797930990655,
"learning_rate": 4.572954429355487e-05,
"loss": 0.3026,
"step": 173
},
{
"epoch": 2.5588235294117645,
"grad_norm": 0.1441373090862667,
"learning_rate": 4.532281848065816e-05,
"loss": 0.3014,
"step": 174
},
{
"epoch": 2.5735294117647056,
"grad_norm": 0.1438877611890079,
"learning_rate": 4.491553162658857e-05,
"loss": 0.3044,
"step": 175
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.12603413712308525,
"learning_rate": 4.450772666061285e-05,
"loss": 0.301,
"step": 176
},
{
"epoch": 2.6029411764705883,
"grad_norm": 0.12633770238098366,
"learning_rate": 4.409944656660828e-05,
"loss": 0.2965,
"step": 177
},
{
"epoch": 2.6176470588235294,
"grad_norm": 0.13789374969079662,
"learning_rate": 4.369073437853208e-05,
"loss": 0.3009,
"step": 178
},
{
"epoch": 2.6323529411764706,
"grad_norm": 0.1402520516162208,
"learning_rate": 4.328163317588552e-05,
"loss": 0.298,
"step": 179
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.09904937633936901,
"learning_rate": 4.2872186079173106e-05,
"loss": 0.3013,
"step": 180
},
{
"epoch": 2.661764705882353,
"grad_norm": 0.1312793347901254,
"learning_rate": 4.2462436245357724e-05,
"loss": 0.3,
"step": 181
},
{
"epoch": 2.6764705882352944,
"grad_norm": 0.09621059323368814,
"learning_rate": 4.205242686331159e-05,
"loss": 0.3029,
"step": 182
},
{
"epoch": 2.6911764705882355,
"grad_norm": 0.10807537972219632,
"learning_rate": 4.164220114926414e-05,
"loss": 0.2978,
"step": 183
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.10722934994246927,
"learning_rate": 4.123180234224682e-05,
"loss": 0.2998,
"step": 184
},
{
"epoch": 2.7205882352941178,
"grad_norm": 0.1064704662992324,
"learning_rate": 4.0821273699535625e-05,
"loss": 0.3013,
"step": 185
},
{
"epoch": 2.735294117647059,
"grad_norm": 0.10969792661155021,
"learning_rate": 4.04106584920916e-05,
"loss": 0.3006,
"step": 186
},
{
"epoch": 2.75,
"grad_norm": 0.12146189911454472,
"learning_rate": 4e-05,
"loss": 0.3031,
"step": 187
},
{
"epoch": 2.764705882352941,
"grad_norm": 0.11305408445478238,
"learning_rate": 3.9589341507908415e-05,
"loss": 0.3037,
"step": 188
},
{
"epoch": 2.7794117647058822,
"grad_norm": 0.13033799624199852,
"learning_rate": 3.917872630046439e-05,
"loss": 0.3032,
"step": 189
},
{
"epoch": 2.7941176470588234,
"grad_norm": 0.10937208565515465,
"learning_rate": 3.8768197657753194e-05,
"loss": 0.3035,
"step": 190
},
{
"epoch": 2.8088235294117645,
"grad_norm": 0.11386446489403948,
"learning_rate": 3.835779885073588e-05,
"loss": 0.2985,
"step": 191
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.10573108090134906,
"learning_rate": 3.794757313668841e-05,
"loss": 0.3025,
"step": 192
},
{
"epoch": 2.838235294117647,
"grad_norm": 0.09522893522850687,
"learning_rate": 3.753756375464229e-05,
"loss": 0.2964,
"step": 193
},
{
"epoch": 2.8529411764705883,
"grad_norm": 0.10763504421587497,
"learning_rate": 3.71278139208269e-05,
"loss": 0.2983,
"step": 194
},
{
"epoch": 2.8676470588235294,
"grad_norm": 0.09673967344497875,
"learning_rate": 3.67183668241145e-05,
"loss": 0.303,
"step": 195
},
{
"epoch": 2.8823529411764706,
"grad_norm": 0.11019580337657149,
"learning_rate": 3.630926562146792e-05,
"loss": 0.2977,
"step": 196
},
{
"epoch": 2.8970588235294117,
"grad_norm": 0.09457792059050557,
"learning_rate": 3.5900553433391724e-05,
"loss": 0.2987,
"step": 197
},
{
"epoch": 2.911764705882353,
"grad_norm": 0.08738325053473636,
"learning_rate": 3.549227333938716e-05,
"loss": 0.2986,
"step": 198
},
{
"epoch": 2.9264705882352944,
"grad_norm": 0.09681306300823637,
"learning_rate": 3.5084468373411444e-05,
"loss": 0.3013,
"step": 199
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.09249395424589109,
"learning_rate": 3.467718151934187e-05,
"loss": 0.2972,
"step": 200
},
{
"epoch": 2.9558823529411766,
"grad_norm": 0.09703820251292773,
"learning_rate": 3.427045570644515e-05,
"loss": 0.2979,
"step": 201
},
{
"epoch": 2.9705882352941178,
"grad_norm": 0.10540258782003765,
"learning_rate": 3.386433380485258e-05,
"loss": 0.2992,
"step": 202
},
{
"epoch": 2.985294117647059,
"grad_norm": 0.08015062850340511,
"learning_rate": 3.34588586210414e-05,
"loss": 0.3011,
"step": 203
},
{
"epoch": 3.0,
"grad_norm": 0.14706722768632058,
"learning_rate": 3.305407289332279e-05,
"loss": 0.2806,
"step": 204
},
{
"epoch": 3.014705882352941,
"grad_norm": 0.10603496844146229,
"learning_rate": 3.2650019287337184e-05,
"loss": 0.2799,
"step": 205
},
{
"epoch": 3.0294117647058822,
"grad_norm": 0.15530639149269276,
"learning_rate": 3.22467403915572e-05,
"loss": 0.2781,
"step": 206
},
{
"epoch": 3.0441176470588234,
"grad_norm": 0.1421223150968687,
"learning_rate": 3.184427871279863e-05,
"loss": 0.282,
"step": 207
},
{
"epoch": 3.0588235294117645,
"grad_norm": 0.14419949652005803,
"learning_rate": 3.144267667174011e-05,
"loss": 0.2776,
"step": 208
},
{
"epoch": 3.073529411764706,
"grad_norm": 0.12994406412161302,
"learning_rate": 3.1041976598451884e-05,
"loss": 0.2781,
"step": 209
},
{
"epoch": 3.088235294117647,
"grad_norm": 0.14614171687593527,
"learning_rate": 3.064222072793407e-05,
"loss": 0.2765,
"step": 210
},
{
"epoch": 3.1029411764705883,
"grad_norm": 0.12016869114374024,
"learning_rate": 3.0243451195664914e-05,
"loss": 0.2783,
"step": 211
},
{
"epoch": 3.1176470588235294,
"grad_norm": 0.1310851029344073,
"learning_rate": 2.984571003315969e-05,
"loss": 0.2781,
"step": 212
},
{
"epoch": 3.1323529411764706,
"grad_norm": 0.11922587006224326,
"learning_rate": 2.944903916354032e-05,
"loss": 0.2795,
"step": 213
},
{
"epoch": 3.1470588235294117,
"grad_norm": 0.12710795880675843,
"learning_rate": 2.905348039711669e-05,
"loss": 0.2784,
"step": 214
},
{
"epoch": 3.161764705882353,
"grad_norm": 0.11445734817339984,
"learning_rate": 2.865907542697957e-05,
"loss": 0.2758,
"step": 215
},
{
"epoch": 3.176470588235294,
"grad_norm": 0.1300689688100464,
"learning_rate": 2.8265865824606165e-05,
"loss": 0.2758,
"step": 216
},
{
"epoch": 3.1911764705882355,
"grad_norm": 0.09831374725437492,
"learning_rate": 2.7873893035478265e-05,
"loss": 0.2748,
"step": 217
},
{
"epoch": 3.2058823529411766,
"grad_norm": 0.1272913228663667,
"learning_rate": 2.7483198374713836e-05,
"loss": 0.2746,
"step": 218
},
{
"epoch": 3.2205882352941178,
"grad_norm": 0.0993483978526809,
"learning_rate": 2.7093823022712217e-05,
"loss": 0.2739,
"step": 219
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.09886695511684795,
"learning_rate": 2.6705808020813622e-05,
"loss": 0.2832,
"step": 220
},
{
"epoch": 3.25,
"grad_norm": 0.0998986520318052,
"learning_rate": 2.6319194266973256e-05,
"loss": 0.2743,
"step": 221
},
{
"epoch": 3.264705882352941,
"grad_norm": 0.0964816428580283,
"learning_rate": 2.5934022511450528e-05,
"loss": 0.2762,
"step": 222
},
{
"epoch": 3.2794117647058822,
"grad_norm": 0.09255980694012796,
"learning_rate": 2.5550333352513885e-05,
"loss": 0.2782,
"step": 223
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.0988289109757557,
"learning_rate": 2.5168167232161574e-05,
"loss": 0.2748,
"step": 224
},
{
"epoch": 3.3088235294117645,
"grad_norm": 0.0884234456508421,
"learning_rate": 2.4787564431858977e-05,
"loss": 0.2753,
"step": 225
},
{
"epoch": 3.323529411764706,
"grad_norm": 0.09774336930093334,
"learning_rate": 2.4408565068292827e-05,
"loss": 0.2751,
"step": 226
},
{
"epoch": 3.338235294117647,
"grad_norm": 0.0883002824327121,
"learning_rate": 2.4031209089142773e-05,
"loss": 0.2773,
"step": 227
},
{
"epoch": 3.3529411764705883,
"grad_norm": 0.09215211229010013,
"learning_rate": 2.3655536268870744e-05,
"loss": 0.2752,
"step": 228
},
{
"epoch": 3.3676470588235294,
"grad_norm": 0.08094179064778044,
"learning_rate": 2.328158620452868e-05,
"loss": 0.2729,
"step": 229
},
{
"epoch": 3.3823529411764706,
"grad_norm": 0.09692135730954927,
"learning_rate": 2.2909398311584775e-05,
"loss": 0.2731,
"step": 230
},
{
"epoch": 3.3970588235294117,
"grad_norm": 0.0813183018452283,
"learning_rate": 2.2539011819769056e-05,
"loss": 0.2782,
"step": 231
},
{
"epoch": 3.411764705882353,
"grad_norm": 0.08149910156743696,
"learning_rate": 2.2170465768938473e-05,
"loss": 0.275,
"step": 232
},
{
"epoch": 3.426470588235294,
"grad_norm": 0.08503586167349093,
"learning_rate": 2.1803799004961824e-05,
"loss": 0.2766,
"step": 233
},
{
"epoch": 3.4411764705882355,
"grad_norm": 0.07599564665486494,
"learning_rate": 2.1439050175625474e-05,
"loss": 0.2759,
"step": 234
},
{
"epoch": 3.4558823529411766,
"grad_norm": 0.08451424035355402,
"learning_rate": 2.1076257726559603e-05,
"loss": 0.2795,
"step": 235
},
{
"epoch": 3.4705882352941178,
"grad_norm": 0.07467914550010295,
"learning_rate": 2.0715459897186046e-05,
"loss": 0.2767,
"step": 236
},
{
"epoch": 3.485294117647059,
"grad_norm": 0.09085541610657201,
"learning_rate": 2.0356694716687687e-05,
"loss": 0.2785,
"step": 237
},
{
"epoch": 3.5,
"grad_norm": 0.0734615712405951,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.2739,
"step": 238
},
{
"epoch": 3.514705882352941,
"grad_norm": 0.07717163991035296,
"learning_rate": 1.964541334382541e-05,
"loss": 0.2729,
"step": 239
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.07496301816518236,
"learning_rate": 1.9292972122670303e-05,
"loss": 0.2752,
"step": 240
},
{
"epoch": 3.5441176470588234,
"grad_norm": 0.08080347359960849,
"learning_rate": 1.8942713484905762e-05,
"loss": 0.2801,
"step": 241
},
{
"epoch": 3.5588235294117645,
"grad_norm": 0.0738257225960406,
"learning_rate": 1.8594674348851992e-05,
"loss": 0.2767,
"step": 242
},
{
"epoch": 3.5735294117647056,
"grad_norm": 0.07389506773645138,
"learning_rate": 1.824889139888694e-05,
"loss": 0.2773,
"step": 243
},
{
"epoch": 3.588235294117647,
"grad_norm": 0.070294801390775,
"learning_rate": 1.790540108157977e-05,
"loss": 0.2763,
"step": 244
},
{
"epoch": 3.6029411764705883,
"grad_norm": 0.068704655766595,
"learning_rate": 1.756423960184922e-05,
"loss": 0.2781,
"step": 245
},
{
"epoch": 3.6176470588235294,
"grad_norm": 0.06594242125553404,
"learning_rate": 1.7225442919147467e-05,
"loss": 0.2757,
"step": 246
},
{
"epoch": 3.6323529411764706,
"grad_norm": 0.06969707402390993,
"learning_rate": 1.6889046743669957e-05,
"loss": 0.2776,
"step": 247
},
{
"epoch": 3.6470588235294117,
"grad_norm": 0.06492677835336866,
"learning_rate": 1.6555086532591425e-05,
"loss": 0.2781,
"step": 248
},
{
"epoch": 3.661764705882353,
"grad_norm": 0.06331911499148075,
"learning_rate": 1.6223597486328534e-05,
"loss": 0.279,
"step": 249
},
{
"epoch": 3.6764705882352944,
"grad_norm": 0.06722098051730128,
"learning_rate": 1.589461454482975e-05,
"loss": 0.2802,
"step": 250
},
{
"epoch": 3.6911764705882355,
"grad_norm": 0.06266474278629547,
"learning_rate": 1.556817238389249e-05,
"loss": 0.2781,
"step": 251
},
{
"epoch": 3.7058823529411766,
"grad_norm": 0.07000155789305507,
"learning_rate": 1.5244305411508217e-05,
"loss": 0.278,
"step": 252
},
{
"epoch": 3.7205882352941178,
"grad_norm": 0.061574644987438566,
"learning_rate": 1.4923047764235752e-05,
"loss": 0.2767,
"step": 253
},
{
"epoch": 3.735294117647059,
"grad_norm": 0.0682684963649869,
"learning_rate": 1.4604433303603092e-05,
"loss": 0.2732,
"step": 254
},
{
"epoch": 3.75,
"grad_norm": 0.0642818898133848,
"learning_rate": 1.4288495612538427e-05,
"loss": 0.2743,
"step": 255
},
{
"epoch": 3.764705882352941,
"grad_norm": 0.06713851434875447,
"learning_rate": 1.3975267991830327e-05,
"loss": 0.2817,
"step": 256
},
{
"epoch": 3.7794117647058822,
"grad_norm": 0.06828899506125703,
"learning_rate": 1.3664783456617703e-05,
"loss": 0.2725,
"step": 257
},
{
"epoch": 3.7941176470588234,
"grad_norm": 0.07069453573008218,
"learning_rate": 1.3357074732909996e-05,
"loss": 0.2775,
"step": 258
},
{
"epoch": 3.8088235294117645,
"grad_norm": 0.06390051742059577,
"learning_rate": 1.3052174254137713e-05,
"loss": 0.2771,
"step": 259
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.06129666220940109,
"learning_rate": 1.275011415773383e-05,
"loss": 0.2789,
"step": 260
},
{
"epoch": 3.838235294117647,
"grad_norm": 0.065817213580326,
"learning_rate": 1.2450926281746458e-05,
"loss": 0.274,
"step": 261
},
{
"epoch": 3.8529411764705883,
"grad_norm": 0.06519904576862424,
"learning_rate": 1.2154642161482939e-05,
"loss": 0.2771,
"step": 262
},
{
"epoch": 3.8676470588235294,
"grad_norm": 0.06313901880567484,
"learning_rate": 1.1861293026186007e-05,
"loss": 0.2754,
"step": 263
},
{
"epoch": 3.8823529411764706,
"grad_norm": 0.06296057703800705,
"learning_rate": 1.1570909795742118e-05,
"loss": 0.2732,
"step": 264
},
{
"epoch": 3.8970588235294117,
"grad_norm": 0.06544715501817833,
"learning_rate": 1.1283523077422327e-05,
"loss": 0.2797,
"step": 265
},
{
"epoch": 3.911764705882353,
"grad_norm": 0.0650053871447722,
"learning_rate": 1.0999163162656296e-05,
"loss": 0.279,
"step": 266
},
{
"epoch": 3.9264705882352944,
"grad_norm": 0.06600064138970108,
"learning_rate": 1.0717860023839424e-05,
"loss": 0.276,
"step": 267
},
{
"epoch": 3.9411764705882355,
"grad_norm": 0.06210716465473454,
"learning_rate": 1.0439643311173642e-05,
"loss": 0.2768,
"step": 268
},
{
"epoch": 3.9558823529411766,
"grad_norm": 0.06401894182204264,
"learning_rate": 1.0164542349542273e-05,
"loss": 0.2788,
"step": 269
},
{
"epoch": 3.9705882352941178,
"grad_norm": 0.06372281222639016,
"learning_rate": 9.892586135419022e-06,
"loss": 0.2777,
"step": 270
},
{
"epoch": 3.985294117647059,
"grad_norm": 0.05892561579737352,
"learning_rate": 9.623803333811713e-06,
"loss": 0.2771,
"step": 271
},
{
"epoch": 4.0,
"grad_norm": 0.12086246389399692,
"learning_rate": 9.358222275240884e-06,
"loss": 0.2599,
"step": 272
},
{
"epoch": 4.014705882352941,
"grad_norm": 0.08602389508274347,
"learning_rate": 9.095870952753647e-06,
"loss": 0.2593,
"step": 273
},
{
"epoch": 4.029411764705882,
"grad_norm": 0.07197077150088015,
"learning_rate": 8.83677701897318e-06,
"loss": 0.262,
"step": 274
},
{
"epoch": 4.044117647058823,
"grad_norm": 0.0968580360252349,
"learning_rate": 8.580967783184055e-06,
"loss": 0.261,
"step": 275
},
{
"epoch": 4.0588235294117645,
"grad_norm": 0.09014393537599821,
"learning_rate": 8.328470208453683e-06,
"loss": 0.2622,
"step": 276
},
{
"epoch": 4.073529411764706,
"grad_norm": 0.0789945543394142,
"learning_rate": 8.07931090879042e-06,
"loss": 0.2577,
"step": 277
},
{
"epoch": 4.088235294117647,
"grad_norm": 0.08815856516590073,
"learning_rate": 7.833516146338329e-06,
"loss": 0.2617,
"step": 278
},
{
"epoch": 4.102941176470588,
"grad_norm": 0.08015266822256034,
"learning_rate": 7.591111828609059e-06,
"loss": 0.2641,
"step": 279
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.08154629673882077,
"learning_rate": 7.3521235057511364e-06,
"loss": 0.2638,
"step": 280
},
{
"epoch": 4.132352941176471,
"grad_norm": 0.08290035992126818,
"learning_rate": 7.116576367856871e-06,
"loss": 0.2606,
"step": 281
},
{
"epoch": 4.147058823529412,
"grad_norm": 0.07674741527367407,
"learning_rate": 6.884495242307285e-06,
"loss": 0.2613,
"step": 282
},
{
"epoch": 4.161764705882353,
"grad_norm": 0.07292754427620451,
"learning_rate": 6.655904591155224e-06,
"loss": 0.2618,
"step": 283
},
{
"epoch": 4.176470588235294,
"grad_norm": 0.0736899911159241,
"learning_rate": 6.430828508546936e-06,
"loss": 0.2637,
"step": 284
},
{
"epoch": 4.1911764705882355,
"grad_norm": 0.07217050587422956,
"learning_rate": 6.209290718182539e-06,
"loss": 0.2615,
"step": 285
},
{
"epoch": 4.205882352941177,
"grad_norm": 0.07553232921098052,
"learning_rate": 5.991314570815441e-06,
"loss": 0.265,
"step": 286
},
{
"epoch": 4.220588235294118,
"grad_norm": 0.06886285952389541,
"learning_rate": 5.776923041791076e-06,
"loss": 0.2602,
"step": 287
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.06173078424810273,
"learning_rate": 5.566138728625294e-06,
"loss": 0.2575,
"step": 288
},
{
"epoch": 4.25,
"grad_norm": 0.06592763687997258,
"learning_rate": 5.358983848622452e-06,
"loss": 0.2566,
"step": 289
},
{
"epoch": 4.264705882352941,
"grad_norm": 0.06767164583052036,
"learning_rate": 5.15548023653369e-06,
"loss": 0.2595,
"step": 290
},
{
"epoch": 4.279411764705882,
"grad_norm": 0.06893875689961178,
"learning_rate": 4.955649342255462e-06,
"loss": 0.2622,
"step": 291
},
{
"epoch": 4.294117647058823,
"grad_norm": 0.0627731194283305,
"learning_rate": 4.7595122285686215e-06,
"loss": 0.2605,
"step": 292
},
{
"epoch": 4.3088235294117645,
"grad_norm": 0.06183425105989223,
"learning_rate": 4.567089568918403e-06,
"loss": 0.262,
"step": 293
},
{
"epoch": 4.323529411764706,
"grad_norm": 0.08304190651695828,
"learning_rate": 4.3784016452353526e-06,
"loss": 0.2577,
"step": 294
},
{
"epoch": 4.338235294117647,
"grad_norm": 0.06162011756699104,
"learning_rate": 4.193468345797511e-06,
"loss": 0.2626,
"step": 295
},
{
"epoch": 4.352941176470588,
"grad_norm": 0.06077784990223975,
"learning_rate": 4.012309163134194e-06,
"loss": 0.2631,
"step": 296
},
{
"epoch": 4.367647058823529,
"grad_norm": 0.059344232351091784,
"learning_rate": 3.8349431919713655e-06,
"loss": 0.2606,
"step": 297
},
{
"epoch": 4.382352941176471,
"grad_norm": 0.053110563118761965,
"learning_rate": 3.6613891272190506e-06,
"loss": 0.2584,
"step": 298
},
{
"epoch": 4.397058823529412,
"grad_norm": 0.0550813714546073,
"learning_rate": 3.49166526200079e-06,
"loss": 0.2623,
"step": 299
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.05497952308193732,
"learning_rate": 3.325789485725488e-06,
"loss": 0.2607,
"step": 300
},
{
"epoch": 4.426470588235294,
"grad_norm": 0.056505023992606394,
"learning_rate": 3.163779282201853e-06,
"loss": 0.2648,
"step": 301
},
{
"epoch": 4.4411764705882355,
"grad_norm": 0.055167426784272444,
"learning_rate": 3.0056517277955357e-06,
"loss": 0.2612,
"step": 302
},
{
"epoch": 4.455882352941177,
"grad_norm": 0.052033455176057224,
"learning_rate": 2.8514234896291904e-06,
"loss": 0.2617,
"step": 303
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.05049118136446942,
"learning_rate": 2.7011108238257723e-06,
"loss": 0.2656,
"step": 304
},
{
"epoch": 4.485294117647059,
"grad_norm": 0.04973325645683792,
"learning_rate": 2.5547295737950475e-06,
"loss": 0.2651,
"step": 305
},
{
"epoch": 4.5,
"grad_norm": 0.05470312702591553,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.2647,
"step": 306
},
{
"epoch": 4.514705882352941,
"grad_norm": 0.04975562581567009,
"learning_rate": 2.2738226211489024e-06,
"loss": 0.2588,
"step": 307
},
{
"epoch": 4.529411764705882,
"grad_norm": 0.04990586736807332,
"learning_rate": 2.1393265269762197e-06,
"loss": 0.2628,
"step": 308
},
{
"epoch": 4.544117647058823,
"grad_norm": 0.048763186017272454,
"learning_rate": 2.008821062340891e-06,
"loss": 0.2621,
"step": 309
},
{
"epoch": 4.5588235294117645,
"grad_norm": 0.04772811522858683,
"learning_rate": 1.8823199829137406e-06,
"loss": 0.2604,
"step": 310
},
{
"epoch": 4.573529411764706,
"grad_norm": 0.04514011191017183,
"learning_rate": 1.7598366222912933e-06,
"loss": 0.2626,
"step": 311
},
{
"epoch": 4.588235294117647,
"grad_norm": 0.0447568436808381,
"learning_rate": 1.6413838905903556e-06,
"loss": 0.2567,
"step": 312
},
{
"epoch": 4.602941176470588,
"grad_norm": 0.04828872191610463,
"learning_rate": 1.5269742730872384e-06,
"loss": 0.2618,
"step": 313
},
{
"epoch": 4.617647058823529,
"grad_norm": 0.048070892531791296,
"learning_rate": 1.4166198289017952e-06,
"loss": 0.2624,
"step": 314
},
{
"epoch": 4.632352941176471,
"grad_norm": 0.0481947096054082,
"learning_rate": 1.3103321897263421e-06,
"loss": 0.2624,
"step": 315
},
{
"epoch": 4.647058823529412,
"grad_norm": 0.0498006739485343,
"learning_rate": 1.2081225585996248e-06,
"loss": 0.2594,
"step": 316
},
{
"epoch": 4.661764705882353,
"grad_norm": 0.046785729335567995,
"learning_rate": 1.1100017087260205e-06,
"loss": 0.2622,
"step": 317
},
{
"epoch": 4.676470588235294,
"grad_norm": 0.04518076179941004,
"learning_rate": 1.015979982339994e-06,
"loss": 0.2636,
"step": 318
},
{
"epoch": 4.6911764705882355,
"grad_norm": 0.04627910929165888,
"learning_rate": 9.260672896159728e-07,
"loss": 0.2603,
"step": 319
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.04505892744233274,
"learning_rate": 8.402731076238191e-07,
"loss": 0.2606,
"step": 320
},
{
"epoch": 4.720588235294118,
"grad_norm": 0.04552316822622753,
"learning_rate": 7.586064793298998e-07,
"loss": 0.2593,
"step": 321
},
{
"epoch": 4.735294117647059,
"grad_norm": 0.045512369955118724,
"learning_rate": 6.810760126439287e-07,
"loss": 0.2654,
"step": 322
},
{
"epoch": 4.75,
"grad_norm": 0.0451772973693403,
"learning_rate": 6.076898795116792e-07,
"loss": 0.2621,
"step": 323
},
{
"epoch": 4.764705882352941,
"grad_norm": 0.044079852747311694,
"learning_rate": 5.384558150536201e-07,
"loss": 0.2615,
"step": 324
},
{
"epoch": 4.779411764705882,
"grad_norm": 0.04455441736236055,
"learning_rate": 4.7338111674962495e-07,
"loss": 0.2606,
"step": 325
},
{
"epoch": 4.794117647058823,
"grad_norm": 0.04405689466623866,
"learning_rate": 4.124726436697879e-07,
"loss": 0.2621,
"step": 326
},
{
"epoch": 4.8088235294117645,
"grad_norm": 0.044529538816460816,
"learning_rate": 3.557368157514596e-07,
"loss": 0.2646,
"step": 327
},
{
"epoch": 4.823529411764706,
"grad_norm": 0.04553425966459204,
"learning_rate": 3.031796131225706e-07,
"loss": 0.2625,
"step": 328
},
{
"epoch": 4.838235294117647,
"grad_norm": 0.04331163075687581,
"learning_rate": 2.548065754712914e-07,
"loss": 0.2592,
"step": 329
},
{
"epoch": 4.852941176470588,
"grad_norm": 0.045474788941501976,
"learning_rate": 2.1062280146215252e-07,
"loss": 0.2625,
"step": 330
},
{
"epoch": 4.867647058823529,
"grad_norm": 0.04615519000321619,
"learning_rate": 1.706329481986213e-07,
"loss": 0.2635,
"step": 331
},
{
"epoch": 4.882352941176471,
"grad_norm": 0.04427563364549322,
"learning_rate": 1.3484123073222332e-07,
"loss": 0.2637,
"step": 332
},
{
"epoch": 4.897058823529412,
"grad_norm": 0.04265147614885937,
"learning_rate": 1.0325142161827561e-07,
"loss": 0.2637,
"step": 333
},
{
"epoch": 4.911764705882353,
"grad_norm": 0.04295525693989313,
"learning_rate": 7.586685051823584e-08,
"loss": 0.2586,
"step": 334
},
{
"epoch": 4.926470588235294,
"grad_norm": 0.04243683788724124,
"learning_rate": 5.2690403848760785e-08,
"loss": 0.2603,
"step": 335
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.043654171470091575,
"learning_rate": 3.3724524477447564e-08,
"loss": 0.2622,
"step": 336
},
{
"epoch": 4.955882352941177,
"grad_norm": 0.04272527321797204,
"learning_rate": 1.897121146536396e-08,
"loss": 0.2542,
"step": 337
},
{
"epoch": 4.970588235294118,
"grad_norm": 0.04827925979026659,
"learning_rate": 8.432019856345896e-09,
"loss": 0.2582,
"step": 338
},
{
"epoch": 4.985294117647059,
"grad_norm": 0.042423900810093146,
"learning_rate": 2.1080605130752162e-09,
"loss": 0.2564,
"step": 339
},
{
"epoch": 5.0,
"grad_norm": 0.07128915321863381,
"learning_rate": 0.0,
"loss": 0.2511,
"step": 340
},
{
"epoch": 5.0,
"step": 340,
"total_flos": 1.915314895847424e+16,
"train_loss": 0.3188589934040518,
"train_runtime": 19815.4609,
"train_samples_per_second": 8.67,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 340,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.915314895847424e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}