{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.977457168620378,
"eval_steps": 500,
"global_step": 345,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014427412082957619,
"grad_norm": 6.30643255752035,
"learning_rate": 2.285714285714286e-06,
"loss": 0.8669,
"step": 1
},
{
"epoch": 0.028854824165915238,
"grad_norm": 6.348193987490592,
"learning_rate": 4.571428571428572e-06,
"loss": 0.87,
"step": 2
},
{
"epoch": 0.04328223624887286,
"grad_norm": 5.780015353700753,
"learning_rate": 6.857142857142858e-06,
"loss": 0.8477,
"step": 3
},
{
"epoch": 0.057709648331830475,
"grad_norm": 4.234197549907419,
"learning_rate": 9.142857142857144e-06,
"loss": 0.8118,
"step": 4
},
{
"epoch": 0.0721370604147881,
"grad_norm": 2.3155214212097306,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.7747,
"step": 5
},
{
"epoch": 0.08656447249774572,
"grad_norm": 5.1118273278397846,
"learning_rate": 1.3714285714285716e-05,
"loss": 0.8053,
"step": 6
},
{
"epoch": 0.10099188458070334,
"grad_norm": 7.1277227926615,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7794,
"step": 7
},
{
"epoch": 0.11541929666366095,
"grad_norm": 8.187688384697006,
"learning_rate": 1.8285714285714288e-05,
"loss": 0.7978,
"step": 8
},
{
"epoch": 0.12984670874661858,
"grad_norm": 5.093195709417533,
"learning_rate": 2.057142857142857e-05,
"loss": 0.7518,
"step": 9
},
{
"epoch": 0.1442741208295762,
"grad_norm": 3.1265792874681977,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.7116,
"step": 10
},
{
"epoch": 0.1587015329125338,
"grad_norm": 2.5188366289278323,
"learning_rate": 2.5142857142857143e-05,
"loss": 0.6783,
"step": 11
},
{
"epoch": 0.17312894499549145,
"grad_norm": 1.4940573526303949,
"learning_rate": 2.742857142857143e-05,
"loss": 0.6532,
"step": 12
},
{
"epoch": 0.18755635707844906,
"grad_norm": 1.6352594758559187,
"learning_rate": 2.9714285714285717e-05,
"loss": 0.6347,
"step": 13
},
{
"epoch": 0.20198376916140667,
"grad_norm": 1.2187588508966425,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.6195,
"step": 14
},
{
"epoch": 0.2164111812443643,
"grad_norm": 1.2755607008946352,
"learning_rate": 3.4285714285714284e-05,
"loss": 0.6142,
"step": 15
},
{
"epoch": 0.2308385933273219,
"grad_norm": 1.0456746979084692,
"learning_rate": 3.6571428571428576e-05,
"loss": 0.6038,
"step": 16
},
{
"epoch": 0.24526600541027954,
"grad_norm": 1.4162214811220066,
"learning_rate": 3.885714285714286e-05,
"loss": 0.5997,
"step": 17
},
{
"epoch": 0.25969341749323716,
"grad_norm": 1.123092592959995,
"learning_rate": 4.114285714285714e-05,
"loss": 0.5855,
"step": 18
},
{
"epoch": 0.27412082957619477,
"grad_norm": 1.247093949716292,
"learning_rate": 4.342857142857143e-05,
"loss": 0.5783,
"step": 19
},
{
"epoch": 0.2885482416591524,
"grad_norm": 0.9162444696210892,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.5762,
"step": 20
},
{
"epoch": 0.30297565374211,
"grad_norm": 1.7011597008607717,
"learning_rate": 4.8e-05,
"loss": 0.5788,
"step": 21
},
{
"epoch": 0.3174030658250676,
"grad_norm": 1.0313827493696333,
"learning_rate": 5.0285714285714286e-05,
"loss": 0.5711,
"step": 22
},
{
"epoch": 0.3318304779080252,
"grad_norm": 2.0638257126228083,
"learning_rate": 5.257142857142858e-05,
"loss": 0.589,
"step": 23
},
{
"epoch": 0.3462578899909829,
"grad_norm": 1.2864831655829803,
"learning_rate": 5.485714285714286e-05,
"loss": 0.5638,
"step": 24
},
{
"epoch": 0.3606853020739405,
"grad_norm": 1.9339557478184641,
"learning_rate": 5.714285714285715e-05,
"loss": 0.58,
"step": 25
},
{
"epoch": 0.3751127141568981,
"grad_norm": 1.5799611967220424,
"learning_rate": 5.9428571428571434e-05,
"loss": 0.5647,
"step": 26
},
{
"epoch": 0.38954012623985573,
"grad_norm": 1.251826447578104,
"learning_rate": 6.171428571428573e-05,
"loss": 0.5586,
"step": 27
},
{
"epoch": 0.40396753832281335,
"grad_norm": 1.5898160634262704,
"learning_rate": 6.400000000000001e-05,
"loss": 0.5526,
"step": 28
},
{
"epoch": 0.41839495040577096,
"grad_norm": 1.1139638002205856,
"learning_rate": 6.62857142857143e-05,
"loss": 0.5503,
"step": 29
},
{
"epoch": 0.4328223624887286,
"grad_norm": 1.2940676544230694,
"learning_rate": 6.857142857142857e-05,
"loss": 0.556,
"step": 30
},
{
"epoch": 0.4472497745716862,
"grad_norm": 1.4125777791117147,
"learning_rate": 7.085714285714287e-05,
"loss": 0.5429,
"step": 31
},
{
"epoch": 0.4616771866546438,
"grad_norm": 0.6917885537888634,
"learning_rate": 7.314285714285715e-05,
"loss": 0.537,
"step": 32
},
{
"epoch": 0.47610459873760147,
"grad_norm": 1.0491224041512421,
"learning_rate": 7.542857142857144e-05,
"loss": 0.5431,
"step": 33
},
{
"epoch": 0.4905320108205591,
"grad_norm": 1.1841266996810977,
"learning_rate": 7.771428571428572e-05,
"loss": 0.5408,
"step": 34
},
{
"epoch": 0.5049594229035167,
"grad_norm": 1.3485916713931527,
"learning_rate": 8e-05,
"loss": 0.5369,
"step": 35
},
{
"epoch": 0.5193868349864743,
"grad_norm": 1.381633742212091,
"learning_rate": 7.999794598960815e-05,
"loss": 0.5447,
"step": 36
},
{
"epoch": 0.5338142470694319,
"grad_norm": 1.113965104171002,
"learning_rate": 7.999178416938051e-05,
"loss": 0.5343,
"step": 37
},
{
"epoch": 0.5482416591523895,
"grad_norm": 3.2942969304440877,
"learning_rate": 7.998151517213926e-05,
"loss": 0.5223,
"step": 38
},
{
"epoch": 0.5626690712353472,
"grad_norm": 1.8632536229114898,
"learning_rate": 7.996714005251569e-05,
"loss": 0.5358,
"step": 39
},
{
"epoch": 0.5770964833183048,
"grad_norm": 1.3196787168296893,
"learning_rate": 7.994866028684212e-05,
"loss": 0.5372,
"step": 40
},
{
"epoch": 0.5915238954012624,
"grad_norm": 1.1177800163283473,
"learning_rate": 7.992607777300004e-05,
"loss": 0.5274,
"step": 41
},
{
"epoch": 0.60595130748422,
"grad_norm": 1.016337599180268,
"learning_rate": 7.989939483022537e-05,
"loss": 0.5209,
"step": 42
},
{
"epoch": 0.6203787195671776,
"grad_norm": 1.395873276215236,
"learning_rate": 7.98686141988702e-05,
"loss": 0.5209,
"step": 43
},
{
"epoch": 0.6348061316501352,
"grad_norm": 0.5976851739788052,
"learning_rate": 7.983373904012138e-05,
"loss": 0.5189,
"step": 44
},
{
"epoch": 0.6492335437330928,
"grad_norm": 0.8805510948652211,
"learning_rate": 7.97947729356758e-05,
"loss": 0.5158,
"step": 45
},
{
"epoch": 0.6636609558160504,
"grad_norm": 0.9279297933278495,
"learning_rate": 7.975171988737267e-05,
"loss": 0.5237,
"step": 46
},
{
"epoch": 0.6780883678990082,
"grad_norm": 1.2997587022594848,
"learning_rate": 7.970458431678239e-05,
"loss": 0.5426,
"step": 47
},
{
"epoch": 0.6925157799819658,
"grad_norm": 0.7359557953904674,
"learning_rate": 7.965337106475256e-05,
"loss": 0.5146,
"step": 48
},
{
"epoch": 0.7069431920649234,
"grad_norm": 0.950140709774444,
"learning_rate": 7.959808539091077e-05,
"loss": 0.5207,
"step": 49
},
{
"epoch": 0.721370604147881,
"grad_norm": 0.9262413860600672,
"learning_rate": 7.953873297312447e-05,
"loss": 0.5114,
"step": 50
},
{
"epoch": 0.7357980162308386,
"grad_norm": 0.659161414732944,
"learning_rate": 7.947531990691778e-05,
"loss": 0.5065,
"step": 51
},
{
"epoch": 0.7502254283137962,
"grad_norm": 0.6193539698881468,
"learning_rate": 7.940785270484556e-05,
"loss": 0.5082,
"step": 52
},
{
"epoch": 0.7646528403967539,
"grad_norm": 0.6484786655929663,
"learning_rate": 7.933633829582451e-05,
"loss": 0.5073,
"step": 53
},
{
"epoch": 0.7790802524797115,
"grad_norm": 0.5246514937970682,
"learning_rate": 7.926078402442161e-05,
"loss": 0.5034,
"step": 54
},
{
"epoch": 0.7935076645626691,
"grad_norm": 0.7555014101024001,
"learning_rate": 7.918119765009979e-05,
"loss": 0.5011,
"step": 55
},
{
"epoch": 0.8079350766456267,
"grad_norm": 0.7908819876305151,
"learning_rate": 7.909758734642103e-05,
"loss": 0.5034,
"step": 56
},
{
"epoch": 0.8223624887285843,
"grad_norm": 0.8314592679002677,
"learning_rate": 7.900996170020697e-05,
"loss": 0.4941,
"step": 57
},
{
"epoch": 0.8367899008115419,
"grad_norm": 0.6541181086969657,
"learning_rate": 7.8918329710657e-05,
"loss": 0.4971,
"step": 58
},
{
"epoch": 0.8512173128944995,
"grad_norm": 0.572541817797756,
"learning_rate": 7.882270078842407e-05,
"loss": 0.4945,
"step": 59
},
{
"epoch": 0.8656447249774571,
"grad_norm": 0.7068090025887374,
"learning_rate": 7.872308475464818e-05,
"loss": 0.496,
"step": 60
},
{
"epoch": 0.8800721370604148,
"grad_norm": 0.5503187486959946,
"learning_rate": 7.861949183994774e-05,
"loss": 0.4921,
"step": 61
},
{
"epoch": 0.8944995491433724,
"grad_norm": 0.6381310310432255,
"learning_rate": 7.851193268336894e-05,
"loss": 0.4993,
"step": 62
},
{
"epoch": 0.90892696122633,
"grad_norm": 0.7118765648942602,
"learning_rate": 7.840041833129304e-05,
"loss": 0.488,
"step": 63
},
{
"epoch": 0.9233543733092876,
"grad_norm": 0.8229902231799235,
"learning_rate": 7.828496023630193e-05,
"loss": 0.4886,
"step": 64
},
{
"epoch": 0.9377817853922452,
"grad_norm": 0.8472076467453001,
"learning_rate": 7.816557025600196e-05,
"loss": 0.4954,
"step": 65
},
{
"epoch": 0.9522091974752029,
"grad_norm": 0.9110400155251186,
"learning_rate": 7.804226065180615e-05,
"loss": 0.4869,
"step": 66
},
{
"epoch": 0.9666366095581606,
"grad_norm": 1.0607731617003107,
"learning_rate": 7.791504408767492e-05,
"loss": 0.4867,
"step": 67
},
{
"epoch": 0.9810640216411182,
"grad_norm": 0.8705421661545191,
"learning_rate": 7.778393362881549e-05,
"loss": 0.4873,
"step": 68
},
{
"epoch": 0.9954914337240758,
"grad_norm": 0.598223438503932,
"learning_rate": 7.764894274034014e-05,
"loss": 0.4866,
"step": 69
},
{
"epoch": 1.0099188458070334,
"grad_norm": 1.3726649121858878,
"learning_rate": 7.751008528588322e-05,
"loss": 0.8287,
"step": 70
},
{
"epoch": 1.024346257889991,
"grad_norm": 1.4843499216749163,
"learning_rate": 7.736737552617749e-05,
"loss": 0.4874,
"step": 71
},
{
"epoch": 1.0387736699729486,
"grad_norm": 0.5218559993997877,
"learning_rate": 7.722082811758939e-05,
"loss": 0.4768,
"step": 72
},
{
"epoch": 1.0532010820559061,
"grad_norm": 1.3743390311137267,
"learning_rate": 7.707045811061396e-05,
"loss": 0.4805,
"step": 73
},
{
"epoch": 1.0676284941388638,
"grad_norm": 0.6090236020230758,
"learning_rate": 7.691628094832901e-05,
"loss": 0.4731,
"step": 74
},
{
"epoch": 1.0820559062218216,
"grad_norm": 0.897847351023635,
"learning_rate": 7.675831246480923e-05,
"loss": 0.4821,
"step": 75
},
{
"epoch": 1.096483318304779,
"grad_norm": 0.7029565517391541,
"learning_rate": 7.659656888349997e-05,
"loss": 0.4724,
"step": 76
},
{
"epoch": 1.1109107303877368,
"grad_norm": 0.6183945563644426,
"learning_rate": 7.643106681555106e-05,
"loss": 0.4763,
"step": 77
},
{
"epoch": 1.1253381424706943,
"grad_norm": 0.5847084533277753,
"learning_rate": 7.626182325811089e-05,
"loss": 0.4664,
"step": 78
},
{
"epoch": 1.139765554553652,
"grad_norm": 0.6749737142635733,
"learning_rate": 7.60888555925807e-05,
"loss": 0.4671,
"step": 79
},
{
"epoch": 1.1541929666366095,
"grad_norm": 0.4481798380260001,
"learning_rate": 7.591218158282968e-05,
"loss": 0.4656,
"step": 80
},
{
"epoch": 1.1686203787195673,
"grad_norm": 0.649578910722019,
"learning_rate": 7.573181937337037e-05,
"loss": 0.4685,
"step": 81
},
{
"epoch": 1.1830477908025248,
"grad_norm": 0.511575614745596,
"learning_rate": 7.554778748749543e-05,
"loss": 0.4608,
"step": 82
},
{
"epoch": 1.1974752028854825,
"grad_norm": 0.5161021268023156,
"learning_rate": 7.536010482537514e-05,
"loss": 0.4613,
"step": 83
},
{
"epoch": 1.21190261496844,
"grad_norm": 0.46897677283090455,
"learning_rate": 7.516879066211644e-05,
"loss": 0.4691,
"step": 84
},
{
"epoch": 1.2263300270513977,
"grad_norm": 0.5762897639302133,
"learning_rate": 7.497386464578329e-05,
"loss": 0.4654,
"step": 85
},
{
"epoch": 1.2407574391343552,
"grad_norm": 0.3969665274048659,
"learning_rate": 7.477534679537885e-05,
"loss": 0.4587,
"step": 86
},
{
"epoch": 1.255184851217313,
"grad_norm": 0.4524782612023369,
"learning_rate": 7.457325749878951e-05,
"loss": 0.4534,
"step": 87
},
{
"epoch": 1.2696122633002704,
"grad_norm": 0.5470294599409099,
"learning_rate": 7.436761751069103e-05,
"loss": 0.4643,
"step": 88
},
{
"epoch": 1.2840396753832282,
"grad_norm": 0.5658245365895949,
"learning_rate": 7.415844795041704e-05,
"loss": 0.4602,
"step": 89
},
{
"epoch": 1.2984670874661859,
"grad_norm": 0.6284954594621484,
"learning_rate": 7.394577029979004e-05,
"loss": 0.4676,
"step": 90
},
{
"epoch": 1.3128944995491434,
"grad_norm": 0.7345913995003851,
"learning_rate": 7.372960640091529e-05,
"loss": 0.4606,
"step": 91
},
{
"epoch": 1.327321911632101,
"grad_norm": 0.8342633496573308,
"learning_rate": 7.350997845393752e-05,
"loss": 0.4557,
"step": 92
},
{
"epoch": 1.3417493237150586,
"grad_norm": 0.8330096859025692,
"learning_rate": 7.328690901476095e-05,
"loss": 0.4647,
"step": 93
},
{
"epoch": 1.3561767357980163,
"grad_norm": 0.6546676985057208,
"learning_rate": 7.306042099273297e-05,
"loss": 0.4592,
"step": 94
},
{
"epoch": 1.3706041478809738,
"grad_norm": 0.47502637705371126,
"learning_rate": 7.283053764829106e-05,
"loss": 0.4605,
"step": 95
},
{
"epoch": 1.3850315599639313,
"grad_norm": 0.5531078683869538,
"learning_rate": 7.259728259057417e-05,
"loss": 0.4567,
"step": 96
},
{
"epoch": 1.399458972046889,
"grad_norm": 0.515899958416822,
"learning_rate": 7.236067977499791e-05,
"loss": 0.4578,
"step": 97
},
{
"epoch": 1.4138863841298468,
"grad_norm": 0.3492664441384964,
"learning_rate": 7.212075350079437e-05,
"loss": 0.4561,
"step": 98
},
{
"epoch": 1.4283137962128043,
"grad_norm": 0.42413300170898927,
"learning_rate": 7.187752840851661e-05,
"loss": 0.4569,
"step": 99
},
{
"epoch": 1.442741208295762,
"grad_norm": 0.4947663891832909,
"learning_rate": 7.163102947750794e-05,
"loss": 0.456,
"step": 100
},
{
"epoch": 1.4571686203787195,
"grad_norm": 0.36507776313239376,
"learning_rate": 7.13812820233367e-05,
"loss": 0.4592,
"step": 101
},
{
"epoch": 1.4715960324616773,
"grad_norm": 0.37547804843247373,
"learning_rate": 7.112831169519617e-05,
"loss": 0.459,
"step": 102
},
{
"epoch": 1.4860234445446348,
"grad_norm": 0.36635807000670995,
"learning_rate": 7.087214447327049e-05,
"loss": 0.4561,
"step": 103
},
{
"epoch": 1.5004508566275925,
"grad_norm": 0.315478417939894,
"learning_rate": 7.061280666606646e-05,
"loss": 0.4563,
"step": 104
},
{
"epoch": 1.5148782687105502,
"grad_norm": 0.4096625613828037,
"learning_rate": 7.035032490771165e-05,
"loss": 0.4541,
"step": 105
},
{
"epoch": 1.5293056807935077,
"grad_norm": 0.4422620826291203,
"learning_rate": 7.008472615521898e-05,
"loss": 0.4508,
"step": 106
},
{
"epoch": 1.5437330928764652,
"grad_norm": 0.3213468597989991,
"learning_rate": 6.98160376857184e-05,
"loss": 0.458,
"step": 107
},
{
"epoch": 1.558160504959423,
"grad_norm": 0.35471415827924724,
"learning_rate": 6.954428709365527e-05,
"loss": 0.4563,
"step": 108
},
{
"epoch": 1.5725879170423807,
"grad_norm": 0.4247233136060684,
"learning_rate": 6.926950228795663e-05,
"loss": 0.4516,
"step": 109
},
{
"epoch": 1.5870153291253382,
"grad_norm": 0.31840084731849594,
"learning_rate": 6.89917114891648e-05,
"loss": 0.4547,
"step": 110
},
{
"epoch": 1.6014427412082957,
"grad_norm": 0.3573055805732088,
"learning_rate": 6.871094322653916e-05,
"loss": 0.4574,
"step": 111
},
{
"epoch": 1.6158701532912534,
"grad_norm": 0.33089511640034097,
"learning_rate": 6.842722633512614e-05,
"loss": 0.4568,
"step": 112
},
{
"epoch": 1.630297565374211,
"grad_norm": 0.32234159444311866,
"learning_rate": 6.814058995279793e-05,
"loss": 0.4506,
"step": 113
},
{
"epoch": 1.6447249774571686,
"grad_norm": 0.2842035714714675,
"learning_rate": 6.785106351725992e-05,
"loss": 0.4451,
"step": 114
},
{
"epoch": 1.6591523895401261,
"grad_norm": 0.24782641096472402,
"learning_rate": 6.755867676302747e-05,
"loss": 0.4524,
"step": 115
},
{
"epoch": 1.6735798016230838,
"grad_norm": 0.29530488172037256,
"learning_rate": 6.726345971837217e-05,
"loss": 0.4523,
"step": 116
},
{
"epoch": 1.6880072137060416,
"grad_norm": 0.29231108013584617,
"learning_rate": 6.69654427022379e-05,
"loss": 0.448,
"step": 117
},
{
"epoch": 1.702434625788999,
"grad_norm": 0.3209263624489444,
"learning_rate": 6.666465632112707e-05,
"loss": 0.4523,
"step": 118
},
{
"epoch": 1.7168620378719566,
"grad_norm": 0.4315596822756952,
"learning_rate": 6.636113146595729e-05,
"loss": 0.4491,
"step": 119
},
{
"epoch": 1.7312894499549143,
"grad_norm": 0.4570225349432179,
"learning_rate": 6.60548993088889e-05,
"loss": 0.4464,
"step": 120
},
{
"epoch": 1.745716862037872,
"grad_norm": 0.44762480786064185,
"learning_rate": 6.574599130012355e-05,
"loss": 0.4548,
"step": 121
},
{
"epoch": 1.7601442741208295,
"grad_norm": 0.4937434929135096,
"learning_rate": 6.543443916467426e-05,
"loss": 0.4503,
"step": 122
},
{
"epoch": 1.7745716862037872,
"grad_norm": 0.606568052119448,
"learning_rate": 6.512027489910718e-05,
"loss": 0.4486,
"step": 123
},
{
"epoch": 1.788999098286745,
"grad_norm": 0.6858758315433683,
"learning_rate": 6.480353076825566e-05,
"loss": 0.449,
"step": 124
},
{
"epoch": 1.8034265103697025,
"grad_norm": 0.5123808792652511,
"learning_rate": 6.448423930190653e-05,
"loss": 0.4464,
"step": 125
},
{
"epoch": 1.81785392245266,
"grad_norm": 0.38964320431553595,
"learning_rate": 6.416243329145923e-05,
"loss": 0.4475,
"step": 126
},
{
"epoch": 1.8322813345356177,
"grad_norm": 0.35099016991264836,
"learning_rate": 6.383814578655829e-05,
"loss": 0.4547,
"step": 127
},
{
"epoch": 1.8467087466185754,
"grad_norm": 0.3451471240491199,
"learning_rate": 6.351141009169893e-05,
"loss": 0.4502,
"step": 128
},
{
"epoch": 1.861136158701533,
"grad_norm": 0.33153601295599006,
"learning_rate": 6.31822597628068e-05,
"loss": 0.4487,
"step": 129
},
{
"epoch": 1.8755635707844904,
"grad_norm": 0.34266592441777854,
"learning_rate": 6.28507286037917e-05,
"loss": 0.4477,
"step": 130
},
{
"epoch": 1.8899909828674482,
"grad_norm": 0.3492224166038735,
"learning_rate": 6.251685066307592e-05,
"loss": 0.4577,
"step": 131
},
{
"epoch": 1.9044183949504059,
"grad_norm": 0.2600600833378922,
"learning_rate": 6.218066023009743e-05,
"loss": 0.4491,
"step": 132
},
{
"epoch": 1.9188458070333634,
"grad_norm": 0.2930478733859803,
"learning_rate": 6.184219183178842e-05,
"loss": 0.4378,
"step": 133
},
{
"epoch": 1.9332732191163209,
"grad_norm": 0.344123397095677,
"learning_rate": 6.150148022902922e-05,
"loss": 0.4486,
"step": 134
},
{
"epoch": 1.9477006311992786,
"grad_norm": 0.32732494053257644,
"learning_rate": 6.11585604130785e-05,
"loss": 0.4451,
"step": 135
},
{
"epoch": 1.9621280432822363,
"grad_norm": 0.25454887232448653,
"learning_rate": 6.081346760197953e-05,
"loss": 0.4435,
"step": 136
},
{
"epoch": 1.9765554553651938,
"grad_norm": 0.21336525188734806,
"learning_rate": 6.04662372369433e-05,
"loss": 0.4459,
"step": 137
},
{
"epoch": 1.9909828674481513,
"grad_norm": 0.21510264038063648,
"learning_rate": 6.0116904978708716e-05,
"loss": 0.4451,
"step": 138
},
{
"epoch": 2.0054102795311093,
"grad_norm": 0.3886088967850276,
"learning_rate": 5.976550670388023e-05,
"loss": 0.7365,
"step": 139
},
{
"epoch": 2.019837691614067,
"grad_norm": 0.5461141945560231,
"learning_rate": 5.941207850124325e-05,
"loss": 0.4274,
"step": 140
},
{
"epoch": 2.0342651036970243,
"grad_norm": 0.7233438360497401,
"learning_rate": 5.9056656668057806e-05,
"loss": 0.4257,
"step": 141
},
{
"epoch": 2.048692515779982,
"grad_norm": 0.902604447839341,
"learning_rate": 5.8699277706330854e-05,
"loss": 0.4327,
"step": 142
},
{
"epoch": 2.0631199278629397,
"grad_norm": 0.9842345625256362,
"learning_rate": 5.833997831906746e-05,
"loss": 0.4206,
"step": 143
},
{
"epoch": 2.0775473399458972,
"grad_norm": 0.7550138291557669,
"learning_rate": 5.7978795406501365e-05,
"loss": 0.4213,
"step": 144
},
{
"epoch": 2.0919747520288547,
"grad_norm": 0.5725375243656562,
"learning_rate": 5.761576606230538e-05,
"loss": 0.4232,
"step": 145
},
{
"epoch": 2.1064021641118122,
"grad_norm": 0.5871563051625412,
"learning_rate": 5.725092756978177e-05,
"loss": 0.4268,
"step": 146
},
{
"epoch": 2.12082957619477,
"grad_norm": 0.6848078352834541,
"learning_rate": 5.688431739803328e-05,
"loss": 0.4231,
"step": 147
},
{
"epoch": 2.1352569882777277,
"grad_norm": 0.47360287031992565,
"learning_rate": 5.651597319811505e-05,
"loss": 0.4245,
"step": 148
},
{
"epoch": 2.149684400360685,
"grad_norm": 0.43267908202913546,
"learning_rate": 5.6145932799167795e-05,
"loss": 0.421,
"step": 149
},
{
"epoch": 2.164111812443643,
"grad_norm": 0.5225940009578477,
"learning_rate": 5.5774234204532746e-05,
"loss": 0.4171,
"step": 150
},
{
"epoch": 2.1785392245266006,
"grad_norm": 0.345292795118154,
"learning_rate": 5.5400915587848713e-05,
"loss": 0.4176,
"step": 151
},
{
"epoch": 2.192966636609558,
"grad_norm": 0.37397788119190706,
"learning_rate": 5.502601528913161e-05,
"loss": 0.4185,
"step": 152
},
{
"epoch": 2.2073940486925157,
"grad_norm": 0.33142951490345385,
"learning_rate": 5.464957181083692e-05,
"loss": 0.4185,
"step": 153
},
{
"epoch": 2.2218214607754736,
"grad_norm": 0.2921058390866845,
"learning_rate": 5.427162381390543e-05,
"loss": 0.417,
"step": 154
},
{
"epoch": 2.236248872858431,
"grad_norm": 0.34198696626119557,
"learning_rate": 5.389221011379281e-05,
"loss": 0.4165,
"step": 155
},
{
"epoch": 2.2506762849413886,
"grad_norm": 0.26908479849148176,
"learning_rate": 5.351136967648323e-05,
"loss": 0.4193,
"step": 156
},
{
"epoch": 2.265103697024346,
"grad_norm": 0.31962185227765055,
"learning_rate": 5.3129141614487456e-05,
"loss": 0.4279,
"step": 157
},
{
"epoch": 2.279531109107304,
"grad_norm": 0.376211538661627,
"learning_rate": 5.274556518282607e-05,
"loss": 0.4195,
"step": 158
},
{
"epoch": 2.2939585211902616,
"grad_norm": 0.28546559354766526,
"learning_rate": 5.23606797749979e-05,
"loss": 0.4199,
"step": 159
},
{
"epoch": 2.308385933273219,
"grad_norm": 0.35404717031780875,
"learning_rate": 5.1974524918934336e-05,
"loss": 0.4194,
"step": 160
},
{
"epoch": 2.3228133453561766,
"grad_norm": 0.32804234637360613,
"learning_rate": 5.15871402729397e-05,
"loss": 0.4215,
"step": 161
},
{
"epoch": 2.3372407574391345,
"grad_norm": 0.25853378935309307,
"learning_rate": 5.1198565621618444e-05,
"loss": 0.42,
"step": 162
},
{
"epoch": 2.351668169522092,
"grad_norm": 0.29254513463752485,
"learning_rate": 5.0808840871789155e-05,
"loss": 0.4137,
"step": 163
},
{
"epoch": 2.3660955816050495,
"grad_norm": 0.2324430211066698,
"learning_rate": 5.0418006048386134e-05,
"loss": 0.4174,
"step": 164
},
{
"epoch": 2.3805229936880075,
"grad_norm": 0.22977260261166277,
"learning_rate": 5.002610129034883e-05,
"loss": 0.418,
"step": 165
},
{
"epoch": 2.394950405770965,
"grad_norm": 0.25178175225388516,
"learning_rate": 4.963316684649951e-05,
"loss": 0.4215,
"step": 166
},
{
"epoch": 2.4093778178539225,
"grad_norm": 0.18022661655296157,
"learning_rate": 4.923924307140974e-05,
"loss": 0.414,
"step": 167
},
{
"epoch": 2.42380522993688,
"grad_norm": 0.23950853172671158,
"learning_rate": 4.8844370421255886e-05,
"loss": 0.419,
"step": 168
},
{
"epoch": 2.4382326420198375,
"grad_norm": 0.19718161732313788,
"learning_rate": 4.8448589449664305e-05,
"loss": 0.4124,
"step": 169
},
{
"epoch": 2.4526600541027954,
"grad_norm": 0.1804834440563653,
"learning_rate": 4.805194080354641e-05,
"loss": 0.4179,
"step": 170
},
{
"epoch": 2.467087466185753,
"grad_norm": 0.20353053079969263,
"learning_rate": 4.765446521892426e-05,
"loss": 0.4104,
"step": 171
},
{
"epoch": 2.4815148782687104,
"grad_norm": 0.16177819342894753,
"learning_rate": 4.725620351674693e-05,
"loss": 0.4202,
"step": 172
},
{
"epoch": 2.4959422903516684,
"grad_norm": 0.16071769506654357,
"learning_rate": 4.685719659869815e-05,
"loss": 0.4083,
"step": 173
},
{
"epoch": 2.510369702434626,
"grad_norm": 0.1725361486750181,
"learning_rate": 4.645748544299574e-05,
"loss": 0.4153,
"step": 174
},
{
"epoch": 2.5247971145175834,
"grad_norm": 0.16753825050295582,
"learning_rate": 4.605711110018307e-05,
"loss": 0.4123,
"step": 175
},
{
"epoch": 2.539224526600541,
"grad_norm": 0.17717032081528933,
"learning_rate": 4.565611468891318e-05,
"loss": 0.4129,
"step": 176
},
{
"epoch": 2.5536519386834984,
"grad_norm": 0.1564598566236543,
"learning_rate": 4.525453739172586e-05,
"loss": 0.4117,
"step": 177
},
{
"epoch": 2.5680793507664563,
"grad_norm": 0.15287663289000603,
"learning_rate": 4.48524204508182e-05,
"loss": 0.4183,
"step": 178
},
{
"epoch": 2.582506762849414,
"grad_norm": 0.18206218031669835,
"learning_rate": 4.444980516380895e-05,
"loss": 0.4117,
"step": 179
},
{
"epoch": 2.5969341749323718,
"grad_norm": 0.16895498094131148,
"learning_rate": 4.4046732879497295e-05,
"loss": 0.4148,
"step": 180
},
{
"epoch": 2.6113615870153293,
"grad_norm": 0.20384116046961775,
"learning_rate": 4.364324499361626e-05,
"loss": 0.4121,
"step": 181
},
{
"epoch": 2.625788999098287,
"grad_norm": 0.18201505177744084,
"learning_rate": 4.3239382944581384e-05,
"loss": 0.4154,
"step": 182
},
{
"epoch": 2.6402164111812443,
"grad_norm": 0.16531279832670212,
"learning_rate": 4.283518820923492e-05,
"loss": 0.4134,
"step": 183
},
{
"epoch": 2.654643823264202,
"grad_norm": 0.17869608399055636,
"learning_rate": 4.243070229858624e-05,
"loss": 0.4167,
"step": 184
},
{
"epoch": 2.6690712353471597,
"grad_norm": 0.15659192579938305,
"learning_rate": 4.202596675354851e-05,
"loss": 0.415,
"step": 185
},
{
"epoch": 2.6834986474301172,
"grad_norm": 0.1729110016630772,
"learning_rate": 4.1621023140672524e-05,
"loss": 0.4149,
"step": 186
},
{
"epoch": 2.6979260595130747,
"grad_norm": 0.17987624911793657,
"learning_rate": 4.121591304787772e-05,
"loss": 0.4128,
"step": 187
},
{
"epoch": 2.7123534715960327,
"grad_norm": 0.16277022431055213,
"learning_rate": 4.081067808018111e-05,
"loss": 0.4115,
"step": 188
},
{
"epoch": 2.72678088367899,
"grad_norm": 0.1614060894054725,
"learning_rate": 4.040535985542445e-05,
"loss": 0.4188,
"step": 189
},
{
"epoch": 2.7412082957619477,
"grad_norm": 0.1498519807080618,
"learning_rate": 4e-05,
"loss": 0.4172,
"step": 190
},
{
"epoch": 2.755635707844905,
"grad_norm": 0.1604036678202687,
"learning_rate": 3.959464014457557e-05,
"loss": 0.4077,
"step": 191
},
{
"epoch": 2.7700631199278627,
"grad_norm": 0.13770932722249057,
"learning_rate": 3.91893219198189e-05,
"loss": 0.4195,
"step": 192
},
{
"epoch": 2.7844905320108206,
"grad_norm": 0.15035210016285183,
"learning_rate": 3.87840869521223e-05,
"loss": 0.4134,
"step": 193
},
{
"epoch": 2.798917944093778,
"grad_norm": 0.15201640612716522,
"learning_rate": 3.837897685932748e-05,
"loss": 0.4106,
"step": 194
},
{
"epoch": 2.8133453561767356,
"grad_norm": 0.13650157280906988,
"learning_rate": 3.7974033246451496e-05,
"loss": 0.4156,
"step": 195
},
{
"epoch": 2.8277727682596936,
"grad_norm": 0.17964938669673042,
"learning_rate": 3.7569297701413765e-05,
"loss": 0.4154,
"step": 196
},
{
"epoch": 2.842200180342651,
"grad_norm": 0.1243561060549184,
"learning_rate": 3.716481179076509e-05,
"loss": 0.4197,
"step": 197
},
{
"epoch": 2.8566275924256086,
"grad_norm": 0.17089769484487582,
"learning_rate": 3.676061705541864e-05,
"loss": 0.4152,
"step": 198
},
{
"epoch": 2.871055004508566,
"grad_norm": 0.17561155960318975,
"learning_rate": 3.635675500638375e-05,
"loss": 0.4167,
"step": 199
},
{
"epoch": 2.885482416591524,
"grad_norm": 0.16307396978150157,
"learning_rate": 3.595326712050272e-05,
"loss": 0.418,
"step": 200
},
{
"epoch": 2.8999098286744815,
"grad_norm": 0.18681533479112983,
"learning_rate": 3.555019483619106e-05,
"loss": 0.418,
"step": 201
},
{
"epoch": 2.914337240757439,
"grad_norm": 0.1692680534023291,
"learning_rate": 3.5147579549181805e-05,
"loss": 0.4095,
"step": 202
},
{
"epoch": 2.928764652840397,
"grad_norm": 0.1647112968457325,
"learning_rate": 3.4745462608274143e-05,
"loss": 0.421,
"step": 203
},
{
"epoch": 2.9431920649233545,
"grad_norm": 0.1645824019282664,
"learning_rate": 3.434388531108683e-05,
"loss": 0.4201,
"step": 204
},
{
"epoch": 2.957619477006312,
"grad_norm": 0.16193821543079018,
"learning_rate": 3.394288889981695e-05,
"loss": 0.4144,
"step": 205
},
{
"epoch": 2.9720468890892695,
"grad_norm": 0.15576654979169963,
"learning_rate": 3.354251455700427e-05,
"loss": 0.421,
"step": 206
},
{
"epoch": 2.986474301172227,
"grad_norm": 0.11204665102016201,
"learning_rate": 3.314280340130187e-05,
"loss": 0.4169,
"step": 207
},
{
"epoch": 3.000901713255185,
"grad_norm": 0.2597744580379488,
"learning_rate": 3.274379648325308e-05,
"loss": 0.7047,
"step": 208
},
{
"epoch": 3.0153291253381425,
"grad_norm": 0.29242080182868324,
"learning_rate": 3.234553478107575e-05,
"loss": 0.3922,
"step": 209
},
{
"epoch": 3.0297565374211,
"grad_norm": 0.15554632560519327,
"learning_rate": 3.194805919645359e-05,
"loss": 0.3914,
"step": 210
},
{
"epoch": 3.044183949504058,
"grad_norm": 0.22638176078144323,
"learning_rate": 3.155141055033571e-05,
"loss": 0.389,
"step": 211
},
{
"epoch": 3.0586113615870154,
"grad_norm": 0.22235251051875934,
"learning_rate": 3.115562957874413e-05,
"loss": 0.3894,
"step": 212
},
{
"epoch": 3.073038773669973,
"grad_norm": 0.14895254239929756,
"learning_rate": 3.0760756928590265e-05,
"loss": 0.3855,
"step": 213
},
{
"epoch": 3.0874661857529304,
"grad_norm": 0.21985837426895496,
"learning_rate": 3.0366833153500502e-05,
"loss": 0.3899,
"step": 214
},
{
"epoch": 3.1018935978358884,
"grad_norm": 0.1448532296100453,
"learning_rate": 2.997389870965118e-05,
"loss": 0.3853,
"step": 215
},
{
"epoch": 3.116321009918846,
"grad_norm": 0.18340169272282977,
"learning_rate": 2.958199395161388e-05,
"loss": 0.3885,
"step": 216
},
{
"epoch": 3.1307484220018034,
"grad_norm": 0.16252308646393857,
"learning_rate": 2.9191159128210865e-05,
"loss": 0.388,
"step": 217
},
{
"epoch": 3.145175834084761,
"grad_norm": 0.15643803474572993,
"learning_rate": 2.8801434378381566e-05,
"loss": 0.3918,
"step": 218
},
{
"epoch": 3.159603246167719,
"grad_norm": 0.16477382717354483,
"learning_rate": 2.841285972706032e-05,
"loss": 0.3848,
"step": 219
},
{
"epoch": 3.1740306582506763,
"grad_norm": 0.1428234224200868,
"learning_rate": 2.8025475081065684e-05,
"loss": 0.3916,
"step": 220
},
{
"epoch": 3.188458070333634,
"grad_norm": 0.15459593532143248,
"learning_rate": 2.7639320225002108e-05,
"loss": 0.3868,
"step": 221
},
{
"epoch": 3.2028854824165913,
"grad_norm": 0.1376918823828853,
"learning_rate": 2.725443481717394e-05,
"loss": 0.3869,
"step": 222
},
{
"epoch": 3.2173128944995493,
"grad_norm": 0.12950376396508245,
"learning_rate": 2.687085838551255e-05,
"loss": 0.391,
"step": 223
},
{
"epoch": 3.2317403065825068,
"grad_norm": 0.15236052575941866,
"learning_rate": 2.6488630323516785e-05,
"loss": 0.3854,
"step": 224
},
{
"epoch": 3.2461677186654643,
"grad_norm": 0.12413662200660247,
"learning_rate": 2.6107789886207195e-05,
"loss": 0.3932,
"step": 225
},
{
"epoch": 3.260595130748422,
"grad_norm": 0.12948714851227347,
"learning_rate": 2.5728376186094582e-05,
"loss": 0.392,
"step": 226
},
{
"epoch": 3.2750225428313797,
"grad_norm": 0.13509083763614343,
"learning_rate": 2.5350428189163095e-05,
"loss": 0.3893,
"step": 227
},
{
"epoch": 3.2894499549143372,
"grad_norm": 0.11596299194935494,
"learning_rate": 2.4973984710868394e-05,
"loss": 0.3853,
"step": 228
},
{
"epoch": 3.3038773669972947,
"grad_norm": 0.11495064647362904,
"learning_rate": 2.4599084412151283e-05,
"loss": 0.3881,
"step": 229
},
{
"epoch": 3.3183047790802522,
"grad_norm": 0.11377790156854924,
"learning_rate": 2.4225765795467267e-05,
"loss": 0.3881,
"step": 230
},
{
"epoch": 3.33273219116321,
"grad_norm": 0.11176541174980999,
"learning_rate": 2.3854067200832226e-05,
"loss": 0.3849,
"step": 231
},
{
"epoch": 3.3471596032461677,
"grad_norm": 0.10932782133038507,
"learning_rate": 2.348402680188496e-05,
"loss": 0.3913,
"step": 232
},
{
"epoch": 3.361587015329125,
"grad_norm": 0.12116739999194517,
"learning_rate": 2.3115682601966726e-05,
"loss": 0.3909,
"step": 233
},
{
"epoch": 3.376014427412083,
"grad_norm": 0.11683332854779228,
"learning_rate": 2.274907243021824e-05,
"loss": 0.384,
"step": 234
},
{
"epoch": 3.3904418394950406,
"grad_norm": 0.10329122415194655,
"learning_rate": 2.2384233937694626e-05,
"loss": 0.3891,
"step": 235
},
{
"epoch": 3.404869251577998,
"grad_norm": 0.11676332764357526,
"learning_rate": 2.202120459349864e-05,
"loss": 0.3879,
"step": 236
},
{
"epoch": 3.4192966636609556,
"grad_norm": 0.11043415196225377,
"learning_rate": 2.1660021680932565e-05,
"loss": 0.3907,
"step": 237
},
{
"epoch": 3.4337240757439136,
"grad_norm": 0.10352103209720392,
"learning_rate": 2.130072229366916e-05,
"loss": 0.3868,
"step": 238
},
{
"epoch": 3.448151487826871,
"grad_norm": 0.11106271281959253,
"learning_rate": 2.0943343331942208e-05,
"loss": 0.3872,
"step": 239
},
{
"epoch": 3.4625788999098286,
"grad_norm": 0.100859861129825,
"learning_rate": 2.0587921498756768e-05,
"loss": 0.3841,
"step": 240
},
{
"epoch": 3.4770063119927865,
"grad_norm": 0.11902184783806945,
"learning_rate": 2.0234493296119776e-05,
"loss": 0.389,
"step": 241
},
{
"epoch": 3.491433724075744,
"grad_norm": 0.09752054045307186,
"learning_rate": 1.9883095021291294e-05,
"loss": 0.3894,
"step": 242
},
{
"epoch": 3.5058611361587015,
"grad_norm": 0.1157887524298405,
"learning_rate": 1.9533762763056714e-05,
"loss": 0.3864,
"step": 243
},
{
"epoch": 3.520288548241659,
"grad_norm": 0.0962545228356216,
"learning_rate": 1.918653239802048e-05,
"loss": 0.3911,
"step": 244
},
{
"epoch": 3.5347159603246165,
"grad_norm": 0.11589978846585437,
"learning_rate": 1.8841439586921515e-05,
"loss": 0.3873,
"step": 245
},
{
"epoch": 3.5491433724075745,
"grad_norm": 0.10235501875925748,
"learning_rate": 1.849851977097078e-05,
"loss": 0.3919,
"step": 246
},
{
"epoch": 3.563570784490532,
"grad_norm": 0.10642762647275054,
"learning_rate": 1.8157808168211605e-05,
"loss": 0.3862,
"step": 247
},
{
"epoch": 3.5779981965734895,
"grad_norm": 0.10705125409234852,
"learning_rate": 1.7819339769902568e-05,
"loss": 0.3826,
"step": 248
},
{
"epoch": 3.5924256086564474,
"grad_norm": 0.11011000435589068,
"learning_rate": 1.7483149336924105e-05,
"loss": 0.3896,
"step": 249
},
{
"epoch": 3.606853020739405,
"grad_norm": 0.10299367912221409,
"learning_rate": 1.71492713962083e-05,
"loss": 0.3818,
"step": 250
},
{
"epoch": 3.6212804328223624,
"grad_norm": 0.09896534243305305,
"learning_rate": 1.6817740237193213e-05,
"loss": 0.3899,
"step": 251
},
{
"epoch": 3.63570784490532,
"grad_norm": 0.10057029872247607,
"learning_rate": 1.648858990830108e-05,
"loss": 0.3865,
"step": 252
},
{
"epoch": 3.6501352569882775,
"grad_norm": 0.10556137735012057,
"learning_rate": 1.6161854213441724e-05,
"loss": 0.3857,
"step": 253
},
{
"epoch": 3.6645626690712354,
"grad_norm": 0.09912849463045817,
"learning_rate": 1.5837566708540776e-05,
"loss": 0.3882,
"step": 254
},
{
"epoch": 3.678990081154193,
"grad_norm": 0.10873331871358806,
"learning_rate": 1.5515760698093485e-05,
"loss": 0.3913,
"step": 255
},
{
"epoch": 3.693417493237151,
"grad_norm": 0.10135375429134282,
"learning_rate": 1.5196469231744338e-05,
"loss": 0.3918,
"step": 256
},
{
"epoch": 3.7078449053201084,
"grad_norm": 0.101442765978251,
"learning_rate": 1.4879725100892821e-05,
"loss": 0.3898,
"step": 257
},
{
"epoch": 3.722272317403066,
"grad_norm": 0.09944828474807176,
"learning_rate": 1.456556083532577e-05,
"loss": 0.3888,
"step": 258
},
{
"epoch": 3.7366997294860234,
"grad_norm": 0.10063514199400612,
"learning_rate": 1.4254008699876468e-05,
"loss": 0.3875,
"step": 259
},
{
"epoch": 3.751127141568981,
"grad_norm": 0.1073202284969319,
"learning_rate": 1.394510069111112e-05,
"loss": 0.3825,
"step": 260
},
{
"epoch": 3.765554553651939,
"grad_norm": 0.11199636802016412,
"learning_rate": 1.3638868534042732e-05,
"loss": 0.3912,
"step": 261
},
{
"epoch": 3.7799819657348963,
"grad_norm": 0.09460154248342248,
"learning_rate": 1.3335343678872947e-05,
"loss": 0.3919,
"step": 262
},
{
"epoch": 3.794409377817854,
"grad_norm": 0.10030251095406782,
"learning_rate": 1.3034557297762108e-05,
"loss": 0.3897,
"step": 263
},
{
"epoch": 3.8088367899008118,
"grad_norm": 0.09707674946485532,
"learning_rate": 1.2736540281627833e-05,
"loss": 0.3882,
"step": 264
},
{
"epoch": 3.8232642019837693,
"grad_norm": 0.10066191197693501,
"learning_rate": 1.2441323236972536e-05,
"loss": 0.3838,
"step": 265
},
{
"epoch": 3.8376916140667268,
"grad_norm": 0.09882561767806158,
"learning_rate": 1.2148936482740106e-05,
"loss": 0.3876,
"step": 266
},
{
"epoch": 3.8521190261496843,
"grad_norm": 0.09393577574639751,
"learning_rate": 1.1859410047202076e-05,
"loss": 0.3949,
"step": 267
},
{
"epoch": 3.8665464382326418,
"grad_norm": 0.10491601613830169,
"learning_rate": 1.1572773664873877e-05,
"loss": 0.3945,
"step": 268
},
{
"epoch": 3.8809738503155997,
"grad_norm": 0.09433909557518863,
"learning_rate": 1.1289056773460848e-05,
"loss": 0.3907,
"step": 269
},
{
"epoch": 3.895401262398557,
"grad_norm": 0.09718276334267877,
"learning_rate": 1.100828851083521e-05,
"loss": 0.3892,
"step": 270
},
{
"epoch": 3.9098286744815147,
"grad_norm": 0.09130729699370443,
"learning_rate": 1.0730497712043375e-05,
"loss": 0.3877,
"step": 271
},
{
"epoch": 3.9242560865644727,
"grad_norm": 0.0989960086350818,
"learning_rate": 1.0455712906344742e-05,
"loss": 0.3905,
"step": 272
},
{
"epoch": 3.93868349864743,
"grad_norm": 0.08478658948822386,
"learning_rate": 1.0183962314281616e-05,
"loss": 0.3809,
"step": 273
},
{
"epoch": 3.9531109107303877,
"grad_norm": 0.08732293651393247,
"learning_rate": 9.91527384478102e-06,
"loss": 0.3909,
"step": 274
},
{
"epoch": 3.967538322813345,
"grad_norm": 0.09248422321017552,
"learning_rate": 9.649675092288366e-06,
"loss": 0.3904,
"step": 275
},
{
"epoch": 3.981965734896303,
"grad_norm": 0.08874068919252195,
"learning_rate": 9.387193333933542e-06,
"loss": 0.3901,
"step": 276
},
{
"epoch": 3.9963931469792606,
"grad_norm": 0.10386496694166722,
"learning_rate": 9.127855526729518e-06,
"loss": 0.4421,
"step": 277
},
{
"epoch": 4.010820559062219,
"grad_norm": 0.17465874605366377,
"learning_rate": 8.87168830480385e-06,
"loss": 0.5908,
"step": 278
},
{
"epoch": 4.025247971145176,
"grad_norm": 0.10653039126787628,
"learning_rate": 8.618717976663316e-06,
"loss": 0.3731,
"step": 279
},
{
"epoch": 4.039675383228134,
"grad_norm": 0.09575070816517416,
"learning_rate": 8.368970522492064e-06,
"loss": 0.368,
"step": 280
},
{
"epoch": 4.054102795311091,
"grad_norm": 0.10014800890252488,
"learning_rate": 8.122471591483405e-06,
"loss": 0.379,
"step": 281
},
{
"epoch": 4.068530207394049,
"grad_norm": 0.10719719334181581,
"learning_rate": 7.879246499205635e-06,
"loss": 0.3747,
"step": 282
},
{
"epoch": 4.082957619477006,
"grad_norm": 0.1034029506118448,
"learning_rate": 7.639320225002106e-06,
"loss": 0.3675,
"step": 283
},
{
"epoch": 4.097385031559964,
"grad_norm": 0.09719883839292859,
"learning_rate": 7.402717409425846e-06,
"loss": 0.3745,
"step": 284
},
{
"epoch": 4.111812443642922,
"grad_norm": 0.09348527541393592,
"learning_rate": 7.169462351708958e-06,
"loss": 0.3746,
"step": 285
},
{
"epoch": 4.1262398557258795,
"grad_norm": 0.09622234742870885,
"learning_rate": 6.939579007267041e-06,
"loss": 0.3669,
"step": 286
},
{
"epoch": 4.140667267808837,
"grad_norm": 0.10062068129651956,
"learning_rate": 6.7130909852390504e-06,
"loss": 0.377,
"step": 287
},
{
"epoch": 4.1550946798917945,
"grad_norm": 0.09063820087374816,
"learning_rate": 6.490021546062495e-06,
"loss": 0.3725,
"step": 288
},
{
"epoch": 4.169522091974752,
"grad_norm": 0.0978449706487065,
"learning_rate": 6.270393599084719e-06,
"loss": 0.3701,
"step": 289
},
{
"epoch": 4.1839495040577095,
"grad_norm": 0.09420394070648874,
"learning_rate": 6.054229700209959e-06,
"loss": 0.3686,
"step": 290
},
{
"epoch": 4.198376916140667,
"grad_norm": 0.09135183952593588,
"learning_rate": 5.841552049582979e-06,
"loss": 0.3668,
"step": 291
},
{
"epoch": 4.2128043282236245,
"grad_norm": 0.08941854382744684,
"learning_rate": 5.632382489308983e-06,
"loss": 0.3753,
"step": 292
},
{
"epoch": 4.227231740306583,
"grad_norm": 0.09033071999727058,
"learning_rate": 5.4267425012105e-06,
"loss": 0.371,
"step": 293
},
{
"epoch": 4.24165915238954,
"grad_norm": 0.08363022499101917,
"learning_rate": 5.224653204621155e-06,
"loss": 0.3699,
"step": 294
},
{
"epoch": 4.256086564472498,
"grad_norm": 0.0794997380043983,
"learning_rate": 5.026135354216717e-06,
"loss": 0.3703,
"step": 295
},
{
"epoch": 4.270513976555455,
"grad_norm": 0.08331150232989441,
"learning_rate": 4.8312093378835645e-06,
"loss": 0.3729,
"step": 296
},
{
"epoch": 4.284941388638413,
"grad_norm": 0.08516826877199297,
"learning_rate": 4.63989517462486e-06,
"loss": 0.3757,
"step": 297
},
{
"epoch": 4.29936880072137,
"grad_norm": 0.08386630568073708,
"learning_rate": 4.452212512504579e-06,
"loss": 0.3766,
"step": 298
},
{
"epoch": 4.313796212804328,
"grad_norm": 0.08120526732790356,
"learning_rate": 4.268180626629641e-06,
"loss": 0.3751,
"step": 299
},
{
"epoch": 4.328223624887286,
"grad_norm": 0.0797417427617793,
"learning_rate": 4.087818417170337e-06,
"loss": 0.3711,
"step": 300
},
{
"epoch": 4.342651036970244,
"grad_norm": 0.08091306914486351,
"learning_rate": 3.9111444074193e-06,
"loss": 0.3704,
"step": 301
},
{
"epoch": 4.357078449053201,
"grad_norm": 0.08277906868820106,
"learning_rate": 3.7381767418891303e-06,
"loss": 0.3736,
"step": 302
},
{
"epoch": 4.371505861136159,
"grad_norm": 0.08109844725998167,
"learning_rate": 3.568933184448944e-06,
"loss": 0.3679,
"step": 303
},
{
"epoch": 4.385933273219116,
"grad_norm": 0.076043565671384,
"learning_rate": 3.403431116500038e-06,
"loss": 0.3737,
"step": 304
},
{
"epoch": 4.400360685302074,
"grad_norm": 0.0786325856472425,
"learning_rate": 3.241687535190776e-06,
"loss": 0.3722,
"step": 305
},
{
"epoch": 4.414788097385031,
"grad_norm": 0.07882441543843179,
"learning_rate": 3.08371905167101e-06,
"loss": 0.3746,
"step": 306
},
{
"epoch": 4.429215509467989,
"grad_norm": 0.0813528283180034,
"learning_rate": 2.929541889386056e-06,
"loss": 0.3698,
"step": 307
},
{
"epoch": 4.443642921550947,
"grad_norm": 0.07778147610676125,
"learning_rate": 2.7791718824106186e-06,
"loss": 0.3747,
"step": 308
},
{
"epoch": 4.458070333633905,
"grad_norm": 0.07497215994153009,
"learning_rate": 2.6326244738225183e-06,
"loss": 0.3793,
"step": 309
},
{
"epoch": 4.472497745716862,
"grad_norm": 0.0751254260494879,
"learning_rate": 2.489914714116788e-06,
"loss": 0.3707,
"step": 310
},
{
"epoch": 4.48692515779982,
"grad_norm": 0.0748304985473765,
"learning_rate": 2.3510572596598678e-06,
"loss": 0.3728,
"step": 311
},
{
"epoch": 4.501352569882777,
"grad_norm": 0.07793338755657392,
"learning_rate": 2.2160663711845176e-06,
"loss": 0.3733,
"step": 312
},
{
"epoch": 4.515779981965735,
"grad_norm": 0.07545418335066799,
"learning_rate": 2.084955912325093e-06,
"loss": 0.3663,
"step": 313
},
{
"epoch": 4.530207394048692,
"grad_norm": 0.0784362383534773,
"learning_rate": 1.957739348193859e-06,
"loss": 0.3694,
"step": 314
},
{
"epoch": 4.544634806131651,
"grad_norm": 0.07501282928300265,
"learning_rate": 1.8344297439980475e-06,
"loss": 0.3739,
"step": 315
},
{
"epoch": 4.559062218214608,
"grad_norm": 0.07184746061800508,
"learning_rate": 1.715039763698081e-06,
"loss": 0.372,
"step": 316
},
{
"epoch": 4.573489630297566,
"grad_norm": 0.0768286859431056,
"learning_rate": 1.5995816687069687e-06,
"loss": 0.367,
"step": 317
},
{
"epoch": 4.587917042380523,
"grad_norm": 0.07399515759551432,
"learning_rate": 1.4880673166310612e-06,
"loss": 0.3734,
"step": 318
},
{
"epoch": 4.602344454463481,
"grad_norm": 0.07232964813350647,
"learning_rate": 1.3805081600522585e-06,
"loss": 0.3697,
"step": 319
},
{
"epoch": 4.616771866546438,
"grad_norm": 0.07389169724144744,
"learning_rate": 1.276915245351833e-06,
"loss": 0.3666,
"step": 320
},
{
"epoch": 4.631199278629396,
"grad_norm": 0.071013638237935,
"learning_rate": 1.1772992115759351e-06,
"loss": 0.3704,
"step": 321
},
{
"epoch": 4.645626690712353,
"grad_norm": 0.07478471657750946,
"learning_rate": 1.081670289343002e-06,
"loss": 0.372,
"step": 322
},
{
"epoch": 4.660054102795311,
"grad_norm": 0.0713434887145259,
"learning_rate": 9.900382997930413e-07,
"loss": 0.3754,
"step": 323
},
{
"epoch": 4.674481514878269,
"grad_norm": 0.0730981387918326,
"learning_rate": 9.024126535789812e-07,
"loss": 0.3684,
"step": 324
},
{
"epoch": 4.6889089269612265,
"grad_norm": 0.07158489318241744,
"learning_rate": 8.188023499002206e-07,
"loss": 0.3808,
"step": 325
},
{
"epoch": 4.703336339044184,
"grad_norm": 0.07013751377947393,
"learning_rate": 7.392159755783957e-07,
"loss": 0.3626,
"step": 326
},
{
"epoch": 4.7177637511271415,
"grad_norm": 0.07150689349177662,
"learning_rate": 6.636617041754978e-07,
"loss": 0.3723,
"step": 327
},
{
"epoch": 4.732191163210099,
"grad_norm": 0.07003675588222115,
"learning_rate": 5.921472951544527e-07,
"loss": 0.3689,
"step": 328
},
{
"epoch": 4.7466185752930565,
"grad_norm": 0.06957634634931112,
"learning_rate": 5.246800930822371e-07,
"loss": 0.3751,
"step": 329
},
{
"epoch": 4.761045987376015,
"grad_norm": 0.07044531133634477,
"learning_rate": 4.6126702687554483e-07,
"loss": 0.371,
"step": 330
},
{
"epoch": 4.775473399458972,
"grad_norm": 0.07077686025594564,
"learning_rate": 4.0191460908923563e-07,
"loss": 0.3676,
"step": 331
},
{
"epoch": 4.78990081154193,
"grad_norm": 0.07253004421887527,
"learning_rate": 3.4662893524745276e-07,
"loss": 0.3781,
"step": 332
},
{
"epoch": 4.804328223624887,
"grad_norm": 0.07510496456067554,
"learning_rate": 2.954156832176214e-07,
"loss": 0.3783,
"step": 333
},
{
"epoch": 4.818755635707845,
"grad_norm": 0.0706180896307413,
"learning_rate": 2.482801126273371e-07,
"loss": 0.371,
"step": 334
},
{
"epoch": 4.833183047790802,
"grad_norm": 0.06847497940017412,
"learning_rate": 2.0522706432419382e-07,
"loss": 0.3702,
"step": 335
},
{
"epoch": 4.84761045987376,
"grad_norm": 0.07040073406374138,
"learning_rate": 1.6626095987862134e-07,
"loss": 0.3703,
"step": 336
},
{
"epoch": 4.862037871956717,
"grad_norm": 0.06922886880619938,
"learning_rate": 1.3138580112979083e-07,
"loss": 0.3693,
"step": 337
},
{
"epoch": 4.876465284039675,
"grad_norm": 0.07071379713825318,
"learning_rate": 1.0060516977462797e-07,
"loss": 0.3683,
"step": 338
},
{
"epoch": 4.890892696122633,
"grad_norm": 0.0707082821934966,
"learning_rate": 7.39222269999651e-08,
"loss": 0.3795,
"step": 339
},
{
"epoch": 4.905320108205591,
"grad_norm": 0.06885902870580198,
"learning_rate": 5.133971315788966e-08,
"loss": 0.3671,
"step": 340
},
{
"epoch": 4.919747520288548,
"grad_norm": 0.06873108316208869,
"learning_rate": 3.285994748430721e-08,
"loss": 0.3738,
"step": 341
},
{
"epoch": 4.934174932371506,
"grad_norm": 0.06944038886061686,
"learning_rate": 1.8484827860754118e-08,
"loss": 0.3691,
"step": 342
},
{
"epoch": 4.948602344454463,
"grad_norm": 0.07137425763584286,
"learning_rate": 8.215830619486831e-09,
"loss": 0.3709,
"step": 343
},
{
"epoch": 4.963029756537421,
"grad_norm": 0.0716171836993154,
"learning_rate": 2.054010391856487e-09,
"loss": 0.3704,
"step": 344
},
{
"epoch": 4.977457168620378,
"grad_norm": 0.0713787661901706,
"learning_rate": 0.0,
"loss": 0.3736,
"step": 345
},
{
"epoch": 4.977457168620378,
"step": 345,
"total_flos": 9.173613467414823e+18,
"train_loss": 0.4462836230146712,
"train_runtime": 80545.5288,
"train_samples_per_second": 2.202,
"train_steps_per_second": 0.004
}
],
"logging_steps": 1,
"max_steps": 345,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.173613467414823e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}