{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.992511233150275,
"eval_steps": 500,
"global_step": 625,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00798801797304044,
"grad_norm": 5.8922959558083035,
"learning_rate": 1.26984126984127e-06,
"loss": 0.9284,
"step": 1
},
{
"epoch": 0.01597603594608088,
"grad_norm": 5.937587864934546,
"learning_rate": 2.53968253968254e-06,
"loss": 0.9318,
"step": 2
},
{
"epoch": 0.023964053919121316,
"grad_norm": 5.861772382161128,
"learning_rate": 3.80952380952381e-06,
"loss": 0.9331,
"step": 3
},
{
"epoch": 0.03195207189216176,
"grad_norm": 5.239005126601421,
"learning_rate": 5.07936507936508e-06,
"loss": 0.9119,
"step": 4
},
{
"epoch": 0.0399400898652022,
"grad_norm": 3.629499849213271,
"learning_rate": 6.349206349206349e-06,
"loss": 0.8754,
"step": 5
},
{
"epoch": 0.04792810783824263,
"grad_norm": 2.106015204146543,
"learning_rate": 7.61904761904762e-06,
"loss": 0.836,
"step": 6
},
{
"epoch": 0.05591612581128307,
"grad_norm": 4.356383106407689,
"learning_rate": 8.888888888888888e-06,
"loss": 0.8711,
"step": 7
},
{
"epoch": 0.06390414378432352,
"grad_norm": 4.748038669119492,
"learning_rate": 1.015873015873016e-05,
"loss": 0.8657,
"step": 8
},
{
"epoch": 0.07189216175736396,
"grad_norm": 4.437164847463165,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.8249,
"step": 9
},
{
"epoch": 0.0798801797304044,
"grad_norm": 4.231505558889787,
"learning_rate": 1.2698412698412699e-05,
"loss": 0.8205,
"step": 10
},
{
"epoch": 0.08786819770344484,
"grad_norm": 2.771780905554085,
"learning_rate": 1.3968253968253968e-05,
"loss": 0.8071,
"step": 11
},
{
"epoch": 0.09585621567648527,
"grad_norm": 1.7918482116297212,
"learning_rate": 1.523809523809524e-05,
"loss": 0.7653,
"step": 12
},
{
"epoch": 0.1038442336495257,
"grad_norm": 1.6236141779129738,
"learning_rate": 1.6507936507936507e-05,
"loss": 0.7437,
"step": 13
},
{
"epoch": 0.11183225162256615,
"grad_norm": 1.2870146428263272,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.736,
"step": 14
},
{
"epoch": 0.11982026959560658,
"grad_norm": 1.0068702786417012,
"learning_rate": 1.904761904761905e-05,
"loss": 0.7124,
"step": 15
},
{
"epoch": 0.12780828756864704,
"grad_norm": 1.1636059875738414,
"learning_rate": 2.031746031746032e-05,
"loss": 0.7004,
"step": 16
},
{
"epoch": 0.13579630554168748,
"grad_norm": 0.8899548891950194,
"learning_rate": 2.158730158730159e-05,
"loss": 0.6953,
"step": 17
},
{
"epoch": 0.14378432351472792,
"grad_norm": 0.8171634825879731,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.6899,
"step": 18
},
{
"epoch": 0.15177234148776836,
"grad_norm": 0.8423601505147725,
"learning_rate": 2.4126984126984128e-05,
"loss": 0.6759,
"step": 19
},
{
"epoch": 0.1597603594608088,
"grad_norm": 0.9144240660639567,
"learning_rate": 2.5396825396825397e-05,
"loss": 0.6768,
"step": 20
},
{
"epoch": 0.16774837743384924,
"grad_norm": 0.7527042679957461,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.6664,
"step": 21
},
{
"epoch": 0.17573639540688968,
"grad_norm": 0.9115589252395023,
"learning_rate": 2.7936507936507936e-05,
"loss": 0.6685,
"step": 22
},
{
"epoch": 0.18372441337993012,
"grad_norm": 0.7794511419641769,
"learning_rate": 2.9206349206349206e-05,
"loss": 0.6476,
"step": 23
},
{
"epoch": 0.19171243135297053,
"grad_norm": 0.8206145936410231,
"learning_rate": 3.047619047619048e-05,
"loss": 0.6555,
"step": 24
},
{
"epoch": 0.19970044932601097,
"grad_norm": 0.8660748611689925,
"learning_rate": 3.1746031746031745e-05,
"loss": 0.6504,
"step": 25
},
{
"epoch": 0.2076884672990514,
"grad_norm": 1.09005656089158,
"learning_rate": 3.3015873015873014e-05,
"loss": 0.6468,
"step": 26
},
{
"epoch": 0.21567648527209185,
"grad_norm": 1.2233269812335474,
"learning_rate": 3.4285714285714284e-05,
"loss": 0.6554,
"step": 27
},
{
"epoch": 0.2236645032451323,
"grad_norm": 0.7202441107469458,
"learning_rate": 3.555555555555555e-05,
"loss": 0.6351,
"step": 28
},
{
"epoch": 0.23165252121817273,
"grad_norm": 1.549610538416556,
"learning_rate": 3.682539682539683e-05,
"loss": 0.6386,
"step": 29
},
{
"epoch": 0.23964053919121317,
"grad_norm": 0.7964826077261805,
"learning_rate": 3.80952380952381e-05,
"loss": 0.6282,
"step": 30
},
{
"epoch": 0.2476285571642536,
"grad_norm": 0.6903264777596222,
"learning_rate": 3.936507936507937e-05,
"loss": 0.6281,
"step": 31
},
{
"epoch": 0.2556165751372941,
"grad_norm": 1.2761326884044875,
"learning_rate": 4.063492063492064e-05,
"loss": 0.6216,
"step": 32
},
{
"epoch": 0.2636045931103345,
"grad_norm": 1.3286354473207003,
"learning_rate": 4.190476190476191e-05,
"loss": 0.6196,
"step": 33
},
{
"epoch": 0.27159261108337496,
"grad_norm": 0.6908894226839724,
"learning_rate": 4.317460317460318e-05,
"loss": 0.6144,
"step": 34
},
{
"epoch": 0.2795806290564154,
"grad_norm": 1.0386450814645398,
"learning_rate": 4.444444444444445e-05,
"loss": 0.6155,
"step": 35
},
{
"epoch": 0.28756864702945584,
"grad_norm": 0.7231485406568985,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.6081,
"step": 36
},
{
"epoch": 0.2955566650024963,
"grad_norm": 1.0158040603959178,
"learning_rate": 4.698412698412699e-05,
"loss": 0.6071,
"step": 37
},
{
"epoch": 0.3035446829755367,
"grad_norm": 1.5638712924845808,
"learning_rate": 4.8253968253968255e-05,
"loss": 0.6011,
"step": 38
},
{
"epoch": 0.31153270094857716,
"grad_norm": 0.9158856622661424,
"learning_rate": 4.952380952380953e-05,
"loss": 0.6029,
"step": 39
},
{
"epoch": 0.3195207189216176,
"grad_norm": 1.533932723524169,
"learning_rate": 5.0793650793650794e-05,
"loss": 0.6007,
"step": 40
},
{
"epoch": 0.32750873689465804,
"grad_norm": 1.1151203871839255,
"learning_rate": 5.206349206349207e-05,
"loss": 0.614,
"step": 41
},
{
"epoch": 0.3354967548676985,
"grad_norm": 1.7936006261869704,
"learning_rate": 5.333333333333333e-05,
"loss": 0.5964,
"step": 42
},
{
"epoch": 0.3434847728407389,
"grad_norm": 1.6373220709210505,
"learning_rate": 5.460317460317461e-05,
"loss": 0.6048,
"step": 43
},
{
"epoch": 0.35147279081377936,
"grad_norm": 1.1820238351172419,
"learning_rate": 5.587301587301587e-05,
"loss": 0.5983,
"step": 44
},
{
"epoch": 0.3594608087868198,
"grad_norm": 1.0448595195163097,
"learning_rate": 5.714285714285715e-05,
"loss": 0.6015,
"step": 45
},
{
"epoch": 0.36744882675986024,
"grad_norm": 0.9595564806215681,
"learning_rate": 5.841269841269841e-05,
"loss": 0.5845,
"step": 46
},
{
"epoch": 0.3754368447329007,
"grad_norm": 1.5962786237575002,
"learning_rate": 5.968253968253969e-05,
"loss": 0.5995,
"step": 47
},
{
"epoch": 0.38342486270594106,
"grad_norm": 1.5871239780794693,
"learning_rate": 6.095238095238096e-05,
"loss": 0.5884,
"step": 48
},
{
"epoch": 0.3914128806789815,
"grad_norm": 1.1244276800474557,
"learning_rate": 6.222222222222223e-05,
"loss": 0.597,
"step": 49
},
{
"epoch": 0.39940089865202194,
"grad_norm": 1.971108394067141,
"learning_rate": 6.349206349206349e-05,
"loss": 0.5959,
"step": 50
},
{
"epoch": 0.4073889166250624,
"grad_norm": 1.119155483104472,
"learning_rate": 6.476190476190477e-05,
"loss": 0.595,
"step": 51
},
{
"epoch": 0.4153769345981028,
"grad_norm": 2.3293959233637813,
"learning_rate": 6.603174603174603e-05,
"loss": 0.5968,
"step": 52
},
{
"epoch": 0.42336495257114326,
"grad_norm": 1.7690872710201135,
"learning_rate": 6.730158730158731e-05,
"loss": 0.5942,
"step": 53
},
{
"epoch": 0.4313529705441837,
"grad_norm": 1.5082563438406895,
"learning_rate": 6.857142857142857e-05,
"loss": 0.5929,
"step": 54
},
{
"epoch": 0.43934098851722414,
"grad_norm": 1.705505860185178,
"learning_rate": 6.984126984126985e-05,
"loss": 0.5886,
"step": 55
},
{
"epoch": 0.4473290064902646,
"grad_norm": 1.311562682930509,
"learning_rate": 7.11111111111111e-05,
"loss": 0.5942,
"step": 56
},
{
"epoch": 0.455317024463305,
"grad_norm": 1.3702806631104458,
"learning_rate": 7.238095238095239e-05,
"loss": 0.5918,
"step": 57
},
{
"epoch": 0.46330504243634546,
"grad_norm": 1.3735083834456305,
"learning_rate": 7.365079365079366e-05,
"loss": 0.5944,
"step": 58
},
{
"epoch": 0.4712930604093859,
"grad_norm": 1.5109411814380815,
"learning_rate": 7.492063492063493e-05,
"loss": 0.5859,
"step": 59
},
{
"epoch": 0.47928107838242634,
"grad_norm": 1.8414765598754854,
"learning_rate": 7.61904761904762e-05,
"loss": 0.5932,
"step": 60
},
{
"epoch": 0.4872690963554668,
"grad_norm": 1.1402061244328228,
"learning_rate": 7.746031746031747e-05,
"loss": 0.5828,
"step": 61
},
{
"epoch": 0.4952571143285072,
"grad_norm": 1.794539731996526,
"learning_rate": 7.873015873015874e-05,
"loss": 0.5792,
"step": 62
},
{
"epoch": 0.5032451323015477,
"grad_norm": 1.4047554942240879,
"learning_rate": 8e-05,
"loss": 0.5804,
"step": 63
},
{
"epoch": 0.5112331502745882,
"grad_norm": 1.3931552496353343,
"learning_rate": 7.999937503459301e-05,
"loss": 0.5775,
"step": 64
},
{
"epoch": 0.5192211682476285,
"grad_norm": 1.1656900196646254,
"learning_rate": 7.999750015790111e-05,
"loss": 0.5909,
"step": 65
},
{
"epoch": 0.527209186220669,
"grad_norm": 1.1493581998452567,
"learning_rate": 7.999437542851095e-05,
"loss": 0.5754,
"step": 66
},
{
"epoch": 0.5351972041937094,
"grad_norm": 1.9412467459743252,
"learning_rate": 7.999000094406493e-05,
"loss": 0.5932,
"step": 67
},
{
"epoch": 0.5431852221667499,
"grad_norm": 1.3410903514703634,
"learning_rate": 7.998437684125812e-05,
"loss": 0.5849,
"step": 68
},
{
"epoch": 0.5511732401397903,
"grad_norm": 1.1599213167605864,
"learning_rate": 7.997750329583402e-05,
"loss": 0.5779,
"step": 69
},
{
"epoch": 0.5591612581128308,
"grad_norm": 2.611492770456904,
"learning_rate": 7.9969380522579e-05,
"loss": 0.5936,
"step": 70
},
{
"epoch": 0.5671492760858712,
"grad_norm": 1.5087257150690652,
"learning_rate": 7.996000877531569e-05,
"loss": 0.5884,
"step": 71
},
{
"epoch": 0.5751372940589117,
"grad_norm": 2.6141462248634086,
"learning_rate": 7.9949388346895e-05,
"loss": 0.5951,
"step": 72
},
{
"epoch": 0.5831253120319521,
"grad_norm": 1.9836349293790256,
"learning_rate": 7.993751956918693e-05,
"loss": 0.5874,
"step": 73
},
{
"epoch": 0.5911133300049926,
"grad_norm": 1.5274699003911547,
"learning_rate": 7.992440281307027e-05,
"loss": 0.5962,
"step": 74
},
{
"epoch": 0.5991013479780329,
"grad_norm": 1.2158179637702575,
"learning_rate": 7.991003848842093e-05,
"loss": 0.5801,
"step": 75
},
{
"epoch": 0.6070893659510734,
"grad_norm": 1.2430162793293555,
"learning_rate": 7.989442704409925e-05,
"loss": 0.5757,
"step": 76
},
{
"epoch": 0.6150773839241138,
"grad_norm": 0.9546052456533828,
"learning_rate": 7.987756896793583e-05,
"loss": 0.5836,
"step": 77
},
{
"epoch": 0.6230654018971543,
"grad_norm": 1.051061984198158,
"learning_rate": 7.985946478671642e-05,
"loss": 0.575,
"step": 78
},
{
"epoch": 0.6310534198701947,
"grad_norm": 1.025823090309492,
"learning_rate": 7.984011506616534e-05,
"loss": 0.5792,
"step": 79
},
{
"epoch": 0.6390414378432352,
"grad_norm": 1.0879892769571216,
"learning_rate": 7.981952041092792e-05,
"loss": 0.575,
"step": 80
},
{
"epoch": 0.6470294558162756,
"grad_norm": 1.3203984543837413,
"learning_rate": 7.979768146455148e-05,
"loss": 0.5725,
"step": 81
},
{
"epoch": 0.6550174737893161,
"grad_norm": 0.8793400599633049,
"learning_rate": 7.977459890946534e-05,
"loss": 0.5643,
"step": 82
},
{
"epoch": 0.6630054917623565,
"grad_norm": 0.9839614386276342,
"learning_rate": 7.975027346695943e-05,
"loss": 0.5609,
"step": 83
},
{
"epoch": 0.670993509735397,
"grad_norm": 1.087269282291481,
"learning_rate": 7.972470589716175e-05,
"loss": 0.5706,
"step": 84
},
{
"epoch": 0.6789815277084373,
"grad_norm": 0.8949957037226873,
"learning_rate": 7.969789699901462e-05,
"loss": 0.5718,
"step": 85
},
{
"epoch": 0.6869695456814778,
"grad_norm": 0.5685210804043624,
"learning_rate": 7.966984761024974e-05,
"loss": 0.5651,
"step": 86
},
{
"epoch": 0.6949575636545182,
"grad_norm": 0.7365421304468946,
"learning_rate": 7.964055860736199e-05,
"loss": 0.5625,
"step": 87
},
{
"epoch": 0.7029455816275587,
"grad_norm": 0.6519155688073771,
"learning_rate": 7.961003090558208e-05,
"loss": 0.5602,
"step": 88
},
{
"epoch": 0.7109335996005991,
"grad_norm": 0.47928031192412984,
"learning_rate": 7.957826545884786e-05,
"loss": 0.5549,
"step": 89
},
{
"epoch": 0.7189216175736396,
"grad_norm": 0.7685907979864348,
"learning_rate": 7.95452632597746e-05,
"loss": 0.5558,
"step": 90
},
{
"epoch": 0.72690963554668,
"grad_norm": 0.7342212353643156,
"learning_rate": 7.951102533962393e-05,
"loss": 0.5539,
"step": 91
},
{
"epoch": 0.7348976535197205,
"grad_norm": 0.526378766562186,
"learning_rate": 7.947555276827166e-05,
"loss": 0.5604,
"step": 92
},
{
"epoch": 0.7428856714927609,
"grad_norm": 0.763635167097638,
"learning_rate": 7.94388466541743e-05,
"loss": 0.5604,
"step": 93
},
{
"epoch": 0.7508736894658014,
"grad_norm": 1.1133910887739713,
"learning_rate": 7.940090814433437e-05,
"loss": 0.5502,
"step": 94
},
{
"epoch": 0.7588617074388417,
"grad_norm": 1.350450301452925,
"learning_rate": 7.936173842426473e-05,
"loss": 0.5607,
"step": 95
},
{
"epoch": 0.7668497254118821,
"grad_norm": 0.47766209706502316,
"learning_rate": 7.932133871795136e-05,
"loss": 0.5584,
"step": 96
},
{
"epoch": 0.7748377433849226,
"grad_norm": 0.8964819495426043,
"learning_rate": 7.927971028781522e-05,
"loss": 0.5533,
"step": 97
},
{
"epoch": 0.782825761357963,
"grad_norm": 1.2844756885032345,
"learning_rate": 7.923685443467275e-05,
"loss": 0.5439,
"step": 98
},
{
"epoch": 0.7908137793310035,
"grad_norm": 0.7076588316414215,
"learning_rate": 7.919277249769522e-05,
"loss": 0.5516,
"step": 99
},
{
"epoch": 0.7988017973040439,
"grad_norm": 0.9548748366290979,
"learning_rate": 7.914746585436692e-05,
"loss": 0.5622,
"step": 100
},
{
"epoch": 0.8067898152770844,
"grad_norm": 1.0033397557294186,
"learning_rate": 7.91009359204421e-05,
"loss": 0.55,
"step": 101
},
{
"epoch": 0.8147778332501248,
"grad_norm": 0.8058153670114928,
"learning_rate": 7.90531841499007e-05,
"loss": 0.5472,
"step": 102
},
{
"epoch": 0.8227658512231653,
"grad_norm": 0.7522684804995226,
"learning_rate": 7.900421203490295e-05,
"loss": 0.5475,
"step": 103
},
{
"epoch": 0.8307538691962056,
"grad_norm": 0.8260701286176672,
"learning_rate": 7.895402110574277e-05,
"loss": 0.546,
"step": 104
},
{
"epoch": 0.8387418871692461,
"grad_norm": 0.9294034148971123,
"learning_rate": 7.890261293079985e-05,
"loss": 0.5486,
"step": 105
},
{
"epoch": 0.8467299051422865,
"grad_norm": 0.6210859012554373,
"learning_rate": 7.884998911649077e-05,
"loss": 0.5565,
"step": 106
},
{
"epoch": 0.854717923115327,
"grad_norm": 0.6446646964930844,
"learning_rate": 7.879615130721868e-05,
"loss": 0.539,
"step": 107
},
{
"epoch": 0.8627059410883674,
"grad_norm": 0.8996911090197094,
"learning_rate": 7.8741101185322e-05,
"loss": 0.5422,
"step": 108
},
{
"epoch": 0.8706939590614079,
"grad_norm": 0.9338087827721026,
"learning_rate": 7.868484047102183e-05,
"loss": 0.5535,
"step": 109
},
{
"epoch": 0.8786819770344483,
"grad_norm": 1.1026810388479344,
"learning_rate": 7.862737092236818e-05,
"loss": 0.5453,
"step": 110
},
{
"epoch": 0.8866699950074888,
"grad_norm": 0.9663842431402072,
"learning_rate": 7.856869433518506e-05,
"loss": 0.5452,
"step": 111
},
{
"epoch": 0.8946580129805292,
"grad_norm": 1.0210253102387117,
"learning_rate": 7.850881254301432e-05,
"loss": 0.5568,
"step": 112
},
{
"epoch": 0.9026460309535697,
"grad_norm": 0.8477567856764551,
"learning_rate": 7.844772741705835e-05,
"loss": 0.545,
"step": 113
},
{
"epoch": 0.91063404892661,
"grad_norm": 0.5613356829580358,
"learning_rate": 7.838544086612174e-05,
"loss": 0.5438,
"step": 114
},
{
"epoch": 0.9186220668996505,
"grad_norm": 0.6248181380373118,
"learning_rate": 7.832195483655144e-05,
"loss": 0.5366,
"step": 115
},
{
"epoch": 0.9266100848726909,
"grad_norm": 0.8519302343250585,
"learning_rate": 7.825727131217609e-05,
"loss": 0.5401,
"step": 116
},
{
"epoch": 0.9345981028457314,
"grad_norm": 0.45919068712258837,
"learning_rate": 7.81913923142439e-05,
"loss": 0.5518,
"step": 117
},
{
"epoch": 0.9425861208187718,
"grad_norm": 0.5491942320357649,
"learning_rate": 7.812431990135965e-05,
"loss": 0.545,
"step": 118
},
{
"epoch": 0.9505741387918123,
"grad_norm": 0.7204970814629463,
"learning_rate": 7.805605616942023e-05,
"loss": 0.5502,
"step": 119
},
{
"epoch": 0.9585621567648527,
"grad_norm": 0.624445399157028,
"learning_rate": 7.798660325154917e-05,
"loss": 0.5465,
"step": 120
},
{
"epoch": 0.9665501747378932,
"grad_norm": 0.43723265221924457,
"learning_rate": 7.791596331803003e-05,
"loss": 0.5387,
"step": 121
},
{
"epoch": 0.9745381927109336,
"grad_norm": 0.40296635700807665,
"learning_rate": 7.784413857623856e-05,
"loss": 0.5384,
"step": 122
},
{
"epoch": 0.982526210683974,
"grad_norm": 0.4355607269982166,
"learning_rate": 7.77711312705737e-05,
"loss": 0.5391,
"step": 123
},
{
"epoch": 0.9905142286570144,
"grad_norm": 0.37094543758250353,
"learning_rate": 7.769694368238746e-05,
"loss": 0.534,
"step": 124
},
{
"epoch": 0.9985022466300549,
"grad_norm": 0.36678250566452825,
"learning_rate": 7.762157812991369e-05,
"loss": 0.535,
"step": 125
},
{
"epoch": 1.0064902646030953,
"grad_norm": 0.7573100076200363,
"learning_rate": 7.754503696819553e-05,
"loss": 0.955,
"step": 126
},
{
"epoch": 1.0144782825761358,
"grad_norm": 1.0167041671110564,
"learning_rate": 7.74673225890119e-05,
"loss": 0.5181,
"step": 127
},
{
"epoch": 1.0224663005491763,
"grad_norm": 1.0181250181107355,
"learning_rate": 7.738843742080269e-05,
"loss": 0.5237,
"step": 128
},
{
"epoch": 1.0304543185222166,
"grad_norm": 1.1080605772500498,
"learning_rate": 7.730838392859303e-05,
"loss": 0.5312,
"step": 129
},
{
"epoch": 1.038442336495257,
"grad_norm": 0.7638562997222614,
"learning_rate": 7.722716461391603e-05,
"loss": 0.5338,
"step": 130
},
{
"epoch": 1.0464303544682976,
"grad_norm": 0.8619620628236141,
"learning_rate": 7.714478201473483e-05,
"loss": 0.5249,
"step": 131
},
{
"epoch": 1.054418372441338,
"grad_norm": 1.1654304124994774,
"learning_rate": 7.706123870536315e-05,
"loss": 0.5208,
"step": 132
},
{
"epoch": 1.0624063904143783,
"grad_norm": 0.5330720342927018,
"learning_rate": 7.697653729638489e-05,
"loss": 0.5184,
"step": 133
},
{
"epoch": 1.0703944083874188,
"grad_norm": 1.020325885284434,
"learning_rate": 7.689068043457261e-05,
"loss": 0.5128,
"step": 134
},
{
"epoch": 1.0783824263604593,
"grad_norm": 0.6983781848617573,
"learning_rate": 7.68036708028047e-05,
"loss": 0.518,
"step": 135
},
{
"epoch": 1.0863704443334998,
"grad_norm": 0.6057523169656847,
"learning_rate": 7.671551111998169e-05,
"loss": 0.5196,
"step": 136
},
{
"epoch": 1.09435846230654,
"grad_norm": 0.5211411106516707,
"learning_rate": 7.662620414094117e-05,
"loss": 0.5199,
"step": 137
},
{
"epoch": 1.1023464802795806,
"grad_norm": 0.5166573997289899,
"learning_rate": 7.653575265637177e-05,
"loss": 0.5154,
"step": 138
},
{
"epoch": 1.110334498252621,
"grad_norm": 0.4470708726865469,
"learning_rate": 7.644415949272591e-05,
"loss": 0.5098,
"step": 139
},
{
"epoch": 1.1183225162256616,
"grad_norm": 0.5357218094920962,
"learning_rate": 7.635142751213156e-05,
"loss": 0.5196,
"step": 140
},
{
"epoch": 1.1263105341987019,
"grad_norm": 0.48982578714373154,
"learning_rate": 7.62575596123027e-05,
"loss": 0.5112,
"step": 141
},
{
"epoch": 1.1342985521717424,
"grad_norm": 0.3953911478616972,
"learning_rate": 7.616255872644888e-05,
"loss": 0.5022,
"step": 142
},
{
"epoch": 1.1422865701447829,
"grad_norm": 0.46599322968658796,
"learning_rate": 7.60664278231834e-05,
"loss": 0.5067,
"step": 143
},
{
"epoch": 1.1502745881178233,
"grad_norm": 0.47850160868681485,
"learning_rate": 7.596916990643077e-05,
"loss": 0.5028,
"step": 144
},
{
"epoch": 1.1582626060908636,
"grad_norm": 0.42978953466708475,
"learning_rate": 7.587078801533262e-05,
"loss": 0.5015,
"step": 145
},
{
"epoch": 1.1662506240639041,
"grad_norm": 0.3540055333518291,
"learning_rate": 7.577128522415292e-05,
"loss": 0.5076,
"step": 146
},
{
"epoch": 1.1742386420369446,
"grad_norm": 0.3351153000601574,
"learning_rate": 7.567066464218178e-05,
"loss": 0.4989,
"step": 147
},
{
"epoch": 1.182226660009985,
"grad_norm": 0.3005800301999229,
"learning_rate": 7.556892941363833e-05,
"loss": 0.4967,
"step": 148
},
{
"epoch": 1.1902146779830254,
"grad_norm": 0.3563502792477842,
"learning_rate": 7.546608271757251e-05,
"loss": 0.5107,
"step": 149
},
{
"epoch": 1.1982026959560659,
"grad_norm": 0.38770493909399334,
"learning_rate": 7.536212776776567e-05,
"loss": 0.5104,
"step": 150
},
{
"epoch": 1.2061907139291064,
"grad_norm": 0.3767151991317555,
"learning_rate": 7.525706781263023e-05,
"loss": 0.5102,
"step": 151
},
{
"epoch": 1.2141787319021469,
"grad_norm": 0.4105950587040687,
"learning_rate": 7.515090613510801e-05,
"loss": 0.4986,
"step": 152
},
{
"epoch": 1.2221667498751871,
"grad_norm": 0.42936249879191585,
"learning_rate": 7.504364605256784e-05,
"loss": 0.5035,
"step": 153
},
{
"epoch": 1.2301547678482276,
"grad_norm": 0.4346225237944244,
"learning_rate": 7.493529091670181e-05,
"loss": 0.4988,
"step": 154
},
{
"epoch": 1.2381427858212681,
"grad_norm": 0.4396844168311194,
"learning_rate": 7.482584411342043e-05,
"loss": 0.5077,
"step": 155
},
{
"epoch": 1.2461308037943086,
"grad_norm": 0.431746092302867,
"learning_rate": 7.471530906274704e-05,
"loss": 0.4983,
"step": 156
},
{
"epoch": 1.254118821767349,
"grad_norm": 0.5889910567664702,
"learning_rate": 7.460368921871077e-05,
"loss": 0.5122,
"step": 157
},
{
"epoch": 1.2621068397403894,
"grad_norm": 0.6830133790630488,
"learning_rate": 7.44909880692387e-05,
"loss": 0.5073,
"step": 158
},
{
"epoch": 1.27009485771343,
"grad_norm": 0.6354350767066138,
"learning_rate": 7.437720913604681e-05,
"loss": 0.5117,
"step": 159
},
{
"epoch": 1.2780828756864704,
"grad_norm": 0.4963286720098572,
"learning_rate": 7.426235597452995e-05,
"loss": 0.4993,
"step": 160
},
{
"epoch": 1.2860708936595107,
"grad_norm": 0.418831779419711,
"learning_rate": 7.41464321736508e-05,
"loss": 0.5021,
"step": 161
},
{
"epoch": 1.2940589116325512,
"grad_norm": 0.4787432347277129,
"learning_rate": 7.402944135582758e-05,
"loss": 0.502,
"step": 162
},
{
"epoch": 1.3020469296055917,
"grad_norm": 0.50062737801301,
"learning_rate": 7.391138717682103e-05,
"loss": 0.4937,
"step": 163
},
{
"epoch": 1.310034947578632,
"grad_norm": 0.39201954318713855,
"learning_rate": 7.379227332562005e-05,
"loss": 0.5003,
"step": 164
},
{
"epoch": 1.3180229655516724,
"grad_norm": 0.31007216413114186,
"learning_rate": 7.367210352432645e-05,
"loss": 0.502,
"step": 165
},
{
"epoch": 1.326010983524713,
"grad_norm": 0.42076785863557453,
"learning_rate": 7.355088152803866e-05,
"loss": 0.501,
"step": 166
},
{
"epoch": 1.3339990014977534,
"grad_norm": 0.4745296323176778,
"learning_rate": 7.342861112473442e-05,
"loss": 0.4979,
"step": 167
},
{
"epoch": 1.341987019470794,
"grad_norm": 0.4199718916823893,
"learning_rate": 7.330529613515232e-05,
"loss": 0.4984,
"step": 168
},
{
"epoch": 1.3499750374438342,
"grad_norm": 0.3814943625708202,
"learning_rate": 7.318094041267253e-05,
"loss": 0.4946,
"step": 169
},
{
"epoch": 1.3579630554168747,
"grad_norm": 0.3584958844621985,
"learning_rate": 7.305554784319625e-05,
"loss": 0.4945,
"step": 170
},
{
"epoch": 1.3659510733899152,
"grad_norm": 0.3258027404514737,
"learning_rate": 7.29291223450244e-05,
"loss": 0.4936,
"step": 171
},
{
"epoch": 1.3739390913629554,
"grad_norm": 0.3304823682468289,
"learning_rate": 7.280166786873514e-05,
"loss": 0.4957,
"step": 172
},
{
"epoch": 1.381927109335996,
"grad_norm": 0.285695277322611,
"learning_rate": 7.267318839706038e-05,
"loss": 0.5004,
"step": 173
},
{
"epoch": 1.3899151273090364,
"grad_norm": 0.360711874339804,
"learning_rate": 7.25436879447614e-05,
"loss": 0.4946,
"step": 174
},
{
"epoch": 1.397903145282077,
"grad_norm": 0.4690067762041838,
"learning_rate": 7.241317055850336e-05,
"loss": 0.4933,
"step": 175
},
{
"epoch": 1.4058911632551174,
"grad_norm": 0.48954294072750454,
"learning_rate": 7.228164031672879e-05,
"loss": 0.4958,
"step": 176
},
{
"epoch": 1.4138791812281577,
"grad_norm": 0.5871985410108085,
"learning_rate": 7.214910132953027e-05,
"loss": 0.495,
"step": 177
},
{
"epoch": 1.4218671992011982,
"grad_norm": 0.720040324723498,
"learning_rate": 7.201555773852189e-05,
"loss": 0.4989,
"step": 178
},
{
"epoch": 1.4298552171742387,
"grad_norm": 0.8159522745469254,
"learning_rate": 7.188101371670991e-05,
"loss": 0.5006,
"step": 179
},
{
"epoch": 1.437843235147279,
"grad_norm": 0.8363865485901019,
"learning_rate": 7.174547346836228e-05,
"loss": 0.5069,
"step": 180
},
{
"epoch": 1.4458312531203195,
"grad_norm": 0.7345453619769279,
"learning_rate": 7.160894122887733e-05,
"loss": 0.4927,
"step": 181
},
{
"epoch": 1.45381927109336,
"grad_norm": 0.589527692471703,
"learning_rate": 7.147142126465138e-05,
"loss": 0.4955,
"step": 182
},
{
"epoch": 1.4618072890664005,
"grad_norm": 0.4423587194525166,
"learning_rate": 7.133291787294547e-05,
"loss": 0.5094,
"step": 183
},
{
"epoch": 1.469795307039441,
"grad_norm": 0.40340279142628255,
"learning_rate": 7.119343538175102e-05,
"loss": 0.4967,
"step": 184
},
{
"epoch": 1.4777833250124812,
"grad_norm": 0.4982976531352129,
"learning_rate": 7.10529781496546e-05,
"loss": 0.4951,
"step": 185
},
{
"epoch": 1.4857713429855217,
"grad_norm": 0.45741686448136076,
"learning_rate": 7.09115505657018e-05,
"loss": 0.4839,
"step": 186
},
{
"epoch": 1.4937593609585622,
"grad_norm": 0.32134532426731377,
"learning_rate": 7.076915704926e-05,
"loss": 0.4947,
"step": 187
},
{
"epoch": 1.5017473789316025,
"grad_norm": 0.2578730665869774,
"learning_rate": 7.062580204988028e-05,
"loss": 0.4885,
"step": 188
},
{
"epoch": 1.509735396904643,
"grad_norm": 0.3424320920246288,
"learning_rate": 7.048149004715843e-05,
"loss": 0.4968,
"step": 189
},
{
"epoch": 1.5177234148776835,
"grad_norm": 0.40215949965851383,
"learning_rate": 7.033622555059491e-05,
"loss": 0.4964,
"step": 190
},
{
"epoch": 1.525711432850724,
"grad_norm": 0.3989533402101727,
"learning_rate": 7.0190013099454e-05,
"loss": 0.4993,
"step": 191
},
{
"epoch": 1.5336994508237645,
"grad_norm": 0.2863829598271095,
"learning_rate": 7.004285726262188e-05,
"loss": 0.5058,
"step": 192
},
{
"epoch": 1.5416874687968047,
"grad_norm": 0.24052248409440963,
"learning_rate": 6.989476263846396e-05,
"loss": 0.4861,
"step": 193
},
{
"epoch": 1.5496754867698452,
"grad_norm": 0.4598329169035325,
"learning_rate": 6.974573385468105e-05,
"loss": 0.5007,
"step": 194
},
{
"epoch": 1.5576635047428855,
"grad_norm": 0.6079055307812807,
"learning_rate": 6.95957755681649e-05,
"loss": 0.5008,
"step": 195
},
{
"epoch": 1.565651522715926,
"grad_norm": 0.5580132412627938,
"learning_rate": 6.944489246485257e-05,
"loss": 0.4962,
"step": 196
},
{
"epoch": 1.5736395406889665,
"grad_norm": 0.42994805656529084,
"learning_rate": 6.929308925958009e-05,
"loss": 0.5076,
"step": 197
},
{
"epoch": 1.581627558662007,
"grad_norm": 0.3842832421038355,
"learning_rate": 6.914037069593504e-05,
"loss": 0.4924,
"step": 198
},
{
"epoch": 1.5896155766350475,
"grad_norm": 0.32699055905703517,
"learning_rate": 6.898674154610839e-05,
"loss": 0.4921,
"step": 199
},
{
"epoch": 1.597603594608088,
"grad_norm": 0.42528398283904756,
"learning_rate": 6.883220661074534e-05,
"loss": 0.4928,
"step": 200
},
{
"epoch": 1.6055916125811283,
"grad_norm": 0.6183497108648602,
"learning_rate": 6.867677071879535e-05,
"loss": 0.4993,
"step": 201
},
{
"epoch": 1.6135796305541688,
"grad_norm": 0.7584925576329896,
"learning_rate": 6.852043872736116e-05,
"loss": 0.4846,
"step": 202
},
{
"epoch": 1.621567648527209,
"grad_norm": 0.6243564361060799,
"learning_rate": 6.836321552154714e-05,
"loss": 0.5007,
"step": 203
},
{
"epoch": 1.6295556665002495,
"grad_norm": 0.3651441665883393,
"learning_rate": 6.820510601430649e-05,
"loss": 0.4936,
"step": 204
},
{
"epoch": 1.63754368447329,
"grad_norm": 0.23834669483267124,
"learning_rate": 6.804611514628788e-05,
"loss": 0.4857,
"step": 205
},
{
"epoch": 1.6455317024463305,
"grad_norm": 0.3073254289591667,
"learning_rate": 6.78862478856809e-05,
"loss": 0.4974,
"step": 206
},
{
"epoch": 1.653519720419371,
"grad_norm": 0.3183758714531585,
"learning_rate": 6.772550922806096e-05,
"loss": 0.4915,
"step": 207
},
{
"epoch": 1.6615077383924115,
"grad_norm": 0.2710320114390746,
"learning_rate": 6.756390419623307e-05,
"loss": 0.4901,
"step": 208
},
{
"epoch": 1.6694957563654518,
"grad_norm": 0.27532630096114225,
"learning_rate": 6.740143784007495e-05,
"loss": 0.4885,
"step": 209
},
{
"epoch": 1.6774837743384923,
"grad_norm": 0.24949516998489749,
"learning_rate": 6.723811523637923e-05,
"loss": 0.4948,
"step": 210
},
{
"epoch": 1.6854717923115325,
"grad_norm": 0.27385769367337703,
"learning_rate": 6.707394148869479e-05,
"loss": 0.4963,
"step": 211
},
{
"epoch": 1.693459810284573,
"grad_norm": 0.3041551075828834,
"learning_rate": 6.690892172716726e-05,
"loss": 0.486,
"step": 212
},
{
"epoch": 1.7014478282576135,
"grad_norm": 0.3555632959677351,
"learning_rate": 6.674306110837881e-05,
"loss": 0.499,
"step": 213
},
{
"epoch": 1.709435846230654,
"grad_norm": 0.3329437137508577,
"learning_rate": 6.657636481518683e-05,
"loss": 0.4949,
"step": 214
},
{
"epoch": 1.7174238642036945,
"grad_norm": 0.3417126321251888,
"learning_rate": 6.640883805656221e-05,
"loss": 0.4913,
"step": 215
},
{
"epoch": 1.725411882176735,
"grad_norm": 0.3989241732557222,
"learning_rate": 6.624048606742636e-05,
"loss": 0.4911,
"step": 216
},
{
"epoch": 1.7333999001497753,
"grad_norm": 0.45014562286637283,
"learning_rate": 6.607131410848777e-05,
"loss": 0.4932,
"step": 217
},
{
"epoch": 1.7413879181228158,
"grad_norm": 0.4927365755110579,
"learning_rate": 6.590132746607755e-05,
"loss": 0.4929,
"step": 218
},
{
"epoch": 1.749375936095856,
"grad_norm": 0.5486106005274718,
"learning_rate": 6.573053145198422e-05,
"loss": 0.4924,
"step": 219
},
{
"epoch": 1.7573639540688966,
"grad_norm": 0.5493013804791822,
"learning_rate": 6.555893140328787e-05,
"loss": 0.5029,
"step": 220
},
{
"epoch": 1.765351972041937,
"grad_norm": 0.4921038998096511,
"learning_rate": 6.538653268219316e-05,
"loss": 0.501,
"step": 221
},
{
"epoch": 1.7733399900149776,
"grad_norm": 0.36708379922405937,
"learning_rate": 6.521334067586194e-05,
"loss": 0.4912,
"step": 222
},
{
"epoch": 1.781328007988018,
"grad_norm": 0.2934447036565008,
"learning_rate": 6.503936079624486e-05,
"loss": 0.4924,
"step": 223
},
{
"epoch": 1.7893160259610585,
"grad_norm": 0.41971512428606667,
"learning_rate": 6.486459847991226e-05,
"loss": 0.4867,
"step": 224
},
{
"epoch": 1.7973040439340988,
"grad_norm": 0.38954075869198324,
"learning_rate": 6.46890591878842e-05,
"loss": 0.4833,
"step": 225
},
{
"epoch": 1.8052920619071393,
"grad_norm": 0.34504882506932716,
"learning_rate": 6.451274840545995e-05,
"loss": 0.4952,
"step": 226
},
{
"epoch": 1.8132800798801796,
"grad_norm": 0.3115751552302506,
"learning_rate": 6.433567164204652e-05,
"loss": 0.4838,
"step": 227
},
{
"epoch": 1.82126809785322,
"grad_norm": 0.3412485251072806,
"learning_rate": 6.415783443098645e-05,
"loss": 0.4855,
"step": 228
},
{
"epoch": 1.8292561158262606,
"grad_norm": 0.4108218843875664,
"learning_rate": 6.397924232938504e-05,
"loss": 0.4911,
"step": 229
},
{
"epoch": 1.837244133799301,
"grad_norm": 0.348838980704177,
"learning_rate": 6.379990091793653e-05,
"loss": 0.4924,
"step": 230
},
{
"epoch": 1.8452321517723416,
"grad_norm": 0.2727569106903297,
"learning_rate": 6.361981580074983e-05,
"loss": 0.4875,
"step": 231
},
{
"epoch": 1.853220169745382,
"grad_norm": 0.31966296310063425,
"learning_rate": 6.343899260517339e-05,
"loss": 0.4929,
"step": 232
},
{
"epoch": 1.8612081877184223,
"grad_norm": 0.2973479822646696,
"learning_rate": 6.325743698161927e-05,
"loss": 0.4929,
"step": 233
},
{
"epoch": 1.8691962056914628,
"grad_norm": 0.34272092476530364,
"learning_rate": 6.307515460338672e-05,
"loss": 0.4896,
"step": 234
},
{
"epoch": 1.877184223664503,
"grad_norm": 0.3581061926529654,
"learning_rate": 6.289215116648477e-05,
"loss": 0.486,
"step": 235
},
{
"epoch": 1.8851722416375436,
"grad_norm": 0.2528403776001991,
"learning_rate": 6.270843238945426e-05,
"loss": 0.4941,
"step": 236
},
{
"epoch": 1.893160259610584,
"grad_norm": 0.2684767914087712,
"learning_rate": 6.252400401318924e-05,
"loss": 0.495,
"step": 237
},
{
"epoch": 1.9011482775836246,
"grad_norm": 0.3089206948515233,
"learning_rate": 6.233887180075744e-05,
"loss": 0.4952,
"step": 238
},
{
"epoch": 1.909136295556665,
"grad_norm": 0.30351254889018653,
"learning_rate": 6.21530415372203e-05,
"loss": 0.4846,
"step": 239
},
{
"epoch": 1.9171243135297056,
"grad_norm": 0.4047998399516971,
"learning_rate": 6.196651902945213e-05,
"loss": 0.4961,
"step": 240
},
{
"epoch": 1.9251123315027459,
"grad_norm": 0.34718079097807986,
"learning_rate": 6.17793101059587e-05,
"loss": 0.4784,
"step": 241
},
{
"epoch": 1.9331003494757864,
"grad_norm": 0.23676859947641374,
"learning_rate": 6.159142061669504e-05,
"loss": 0.4816,
"step": 242
},
{
"epoch": 1.9410883674488266,
"grad_norm": 0.3083982484226228,
"learning_rate": 6.14028564328827e-05,
"loss": 0.4846,
"step": 243
},
{
"epoch": 1.9490763854218671,
"grad_norm": 0.23280924719224474,
"learning_rate": 6.12136234468263e-05,
"loss": 0.4901,
"step": 244
},
{
"epoch": 1.9570644033949076,
"grad_norm": 0.23217318367899584,
"learning_rate": 6.1023727571729334e-05,
"loss": 0.4922,
"step": 245
},
{
"epoch": 1.965052421367948,
"grad_norm": 0.3110861621844553,
"learning_rate": 6.083317474150943e-05,
"loss": 0.4897,
"step": 246
},
{
"epoch": 1.9730404393409886,
"grad_norm": 0.2740981225422537,
"learning_rate": 6.0641970910612966e-05,
"loss": 0.4884,
"step": 247
},
{
"epoch": 1.981028457314029,
"grad_norm": 0.30045631025591646,
"learning_rate": 6.045012205382894e-05,
"loss": 0.4842,
"step": 248
},
{
"epoch": 1.9890164752870694,
"grad_norm": 0.3426504942977091,
"learning_rate": 6.025763416610229e-05,
"loss": 0.4805,
"step": 249
},
{
"epoch": 1.9970044932601099,
"grad_norm": 0.2696833408525596,
"learning_rate": 6.006451326234656e-05,
"loss": 0.4955,
"step": 250
},
{
"epoch": 2.00499251123315,
"grad_norm": 0.5162311215778072,
"learning_rate": 5.987076537725598e-05,
"loss": 0.8356,
"step": 251
},
{
"epoch": 2.0129805292061906,
"grad_norm": 0.8755278174646857,
"learning_rate": 5.9676396565116814e-05,
"loss": 0.4597,
"step": 252
},
{
"epoch": 2.020968547179231,
"grad_norm": 1.2654521868820567,
"learning_rate": 5.9481412899618286e-05,
"loss": 0.4832,
"step": 253
},
{
"epoch": 2.0289565651522716,
"grad_norm": 0.7005128945439788,
"learning_rate": 5.9285820473662676e-05,
"loss": 0.4576,
"step": 254
},
{
"epoch": 2.036944583125312,
"grad_norm": 0.8900852330925937,
"learning_rate": 5.9089625399174975e-05,
"loss": 0.4677,
"step": 255
},
{
"epoch": 2.0449326010983526,
"grad_norm": 0.9295293387128268,
"learning_rate": 5.8892833806911934e-05,
"loss": 0.4581,
"step": 256
},
{
"epoch": 2.052920619071393,
"grad_norm": 0.7632251727706844,
"learning_rate": 5.869545184627041e-05,
"loss": 0.4564,
"step": 257
},
{
"epoch": 2.060908637044433,
"grad_norm": 0.606887179521497,
"learning_rate": 5.849748568509529e-05,
"loss": 0.4446,
"step": 258
},
{
"epoch": 2.0688966550174737,
"grad_norm": 0.7617777810480713,
"learning_rate": 5.829894150948668e-05,
"loss": 0.4501,
"step": 259
},
{
"epoch": 2.076884672990514,
"grad_norm": 0.6040763884991026,
"learning_rate": 5.8099825523606675e-05,
"loss": 0.4468,
"step": 260
},
{
"epoch": 2.0848726909635547,
"grad_norm": 0.6051469481172999,
"learning_rate": 5.790014394948542e-05,
"loss": 0.4543,
"step": 261
},
{
"epoch": 2.092860708936595,
"grad_norm": 0.478413783344682,
"learning_rate": 5.769990302682672e-05,
"loss": 0.4506,
"step": 262
},
{
"epoch": 2.1008487269096356,
"grad_norm": 0.562558957244333,
"learning_rate": 5.749910901281309e-05,
"loss": 0.453,
"step": 263
},
{
"epoch": 2.108836744882676,
"grad_norm": 0.4282466955885263,
"learning_rate": 5.729776818191014e-05,
"loss": 0.4545,
"step": 264
},
{
"epoch": 2.1168247628557166,
"grad_norm": 0.5285703751553213,
"learning_rate": 5.709588682567059e-05,
"loss": 0.4479,
"step": 265
},
{
"epoch": 2.1248127808287567,
"grad_norm": 0.40043659559155015,
"learning_rate": 5.689347125253765e-05,
"loss": 0.4442,
"step": 266
},
{
"epoch": 2.132800798801797,
"grad_norm": 0.45748239783102446,
"learning_rate": 5.6690527787647856e-05,
"loss": 0.4507,
"step": 267
},
{
"epoch": 2.1407888167748377,
"grad_norm": 0.4448537769428446,
"learning_rate": 5.6487062772633455e-05,
"loss": 0.4518,
"step": 268
},
{
"epoch": 2.148776834747878,
"grad_norm": 0.3496452875829841,
"learning_rate": 5.628308256542428e-05,
"loss": 0.4511,
"step": 269
},
{
"epoch": 2.1567648527209187,
"grad_norm": 0.36851827820489447,
"learning_rate": 5.607859354004897e-05,
"loss": 0.4475,
"step": 270
},
{
"epoch": 2.164752870693959,
"grad_norm": 0.3581014245926748,
"learning_rate": 5.5873602086435876e-05,
"loss": 0.4559,
"step": 271
},
{
"epoch": 2.1727408886669997,
"grad_norm": 0.3124251429586786,
"learning_rate": 5.566811461021335e-05,
"loss": 0.4507,
"step": 272
},
{
"epoch": 2.1807289066400397,
"grad_norm": 0.363939895859037,
"learning_rate": 5.5462137532509624e-05,
"loss": 0.4488,
"step": 273
},
{
"epoch": 2.18871692461308,
"grad_norm": 0.26872286843640025,
"learning_rate": 5.5255677289752086e-05,
"loss": 0.445,
"step": 274
},
{
"epoch": 2.1967049425861207,
"grad_norm": 0.31188711856580686,
"learning_rate": 5.504874033346623e-05,
"loss": 0.4518,
"step": 275
},
{
"epoch": 2.204692960559161,
"grad_norm": 0.27440306176835016,
"learning_rate": 5.4841333130074015e-05,
"loss": 0.4398,
"step": 276
},
{
"epoch": 2.2126809785322017,
"grad_norm": 0.2443244556857597,
"learning_rate": 5.4633462160691793e-05,
"loss": 0.4496,
"step": 277
},
{
"epoch": 2.220668996505242,
"grad_norm": 0.3469310336287689,
"learning_rate": 5.442513392092783e-05,
"loss": 0.4434,
"step": 278
},
{
"epoch": 2.2286570144782827,
"grad_norm": 0.2103072041810048,
"learning_rate": 5.4216354920679256e-05,
"loss": 0.4536,
"step": 279
},
{
"epoch": 2.236645032451323,
"grad_norm": 0.302897592656899,
"learning_rate": 5.400713168392874e-05,
"loss": 0.4469,
"step": 280
},
{
"epoch": 2.2446330504243637,
"grad_norm": 0.26907620566043555,
"learning_rate": 5.379747074854054e-05,
"loss": 0.4429,
"step": 281
},
{
"epoch": 2.2526210683974037,
"grad_norm": 0.242767529010096,
"learning_rate": 5.358737866605624e-05,
"loss": 0.4526,
"step": 282
},
{
"epoch": 2.260609086370444,
"grad_norm": 0.24059729283753153,
"learning_rate": 5.337686200149004e-05,
"loss": 0.4496,
"step": 283
},
{
"epoch": 2.2685971043434847,
"grad_norm": 0.16892626513698825,
"learning_rate": 5.316592733312359e-05,
"loss": 0.4444,
"step": 284
},
{
"epoch": 2.276585122316525,
"grad_norm": 0.2428921866442825,
"learning_rate": 5.2954581252300416e-05,
"loss": 0.4475,
"step": 285
},
{
"epoch": 2.2845731402895657,
"grad_norm": 0.24079102043869002,
"learning_rate": 5.2742830363220014e-05,
"loss": 0.4443,
"step": 286
},
{
"epoch": 2.292561158262606,
"grad_norm": 0.1691131754858366,
"learning_rate": 5.25306812827314e-05,
"loss": 0.4423,
"step": 287
},
{
"epoch": 2.3005491762356467,
"grad_norm": 0.26332279757319926,
"learning_rate": 5.231814064012639e-05,
"loss": 0.4482,
"step": 288
},
{
"epoch": 2.3085371942086867,
"grad_norm": 0.30874064763423864,
"learning_rate": 5.210521507693245e-05,
"loss": 0.4439,
"step": 289
},
{
"epoch": 2.3165252121817272,
"grad_norm": 0.22311973873687838,
"learning_rate": 5.189191124670514e-05,
"loss": 0.4402,
"step": 290
},
{
"epoch": 2.3245132301547677,
"grad_norm": 0.1922497454060213,
"learning_rate": 5.167823581482022e-05,
"loss": 0.4409,
"step": 291
},
{
"epoch": 2.3325012481278082,
"grad_norm": 0.16710905147214794,
"learning_rate": 5.146419545826535e-05,
"loss": 0.4471,
"step": 292
},
{
"epoch": 2.3404892661008487,
"grad_norm": 0.18694588888380953,
"learning_rate": 5.124979686543145e-05,
"loss": 0.4514,
"step": 293
},
{
"epoch": 2.3484772840738892,
"grad_norm": 0.19041976798949875,
"learning_rate": 5.103504673590372e-05,
"loss": 0.4385,
"step": 294
},
{
"epoch": 2.3564653020469297,
"grad_norm": 0.20694395753288766,
"learning_rate": 5.081995178025228e-05,
"loss": 0.4486,
"step": 295
},
{
"epoch": 2.36445332001997,
"grad_norm": 0.16778281147710722,
"learning_rate": 5.060451871982242e-05,
"loss": 0.455,
"step": 296
},
{
"epoch": 2.3724413379930107,
"grad_norm": 0.17343940615670786,
"learning_rate": 5.038875428652468e-05,
"loss": 0.447,
"step": 297
},
{
"epoch": 2.3804293559660508,
"grad_norm": 0.17734566622982126,
"learning_rate": 5.0172665222624395e-05,
"loss": 0.4481,
"step": 298
},
{
"epoch": 2.3884173739390913,
"grad_norm": 0.1766718931672107,
"learning_rate": 4.995625828053106e-05,
"loss": 0.4524,
"step": 299
},
{
"epoch": 2.3964053919121318,
"grad_norm": 0.19583636193380063,
"learning_rate": 4.973954022258729e-05,
"loss": 0.4547,
"step": 300
},
{
"epoch": 2.4043934098851723,
"grad_norm": 0.17026857168289744,
"learning_rate": 4.952251782085757e-05,
"loss": 0.448,
"step": 301
},
{
"epoch": 2.4123814278582127,
"grad_norm": 0.1394946256958487,
"learning_rate": 4.930519785691657e-05,
"loss": 0.4482,
"step": 302
},
{
"epoch": 2.4203694458312532,
"grad_norm": 0.1507130531191368,
"learning_rate": 4.9087587121637284e-05,
"loss": 0.4489,
"step": 303
},
{
"epoch": 2.4283574638042937,
"grad_norm": 0.19875894846238537,
"learning_rate": 4.886969241497878e-05,
"loss": 0.4445,
"step": 304
},
{
"epoch": 2.436345481777334,
"grad_norm": 0.23769686285223604,
"learning_rate": 4.865152054577379e-05,
"loss": 0.4524,
"step": 305
},
{
"epoch": 2.4443334997503743,
"grad_norm": 0.22560472662810682,
"learning_rate": 4.843307833151583e-05,
"loss": 0.4473,
"step": 306
},
{
"epoch": 2.452321517723415,
"grad_norm": 0.15975420253786612,
"learning_rate": 4.82143725981463e-05,
"loss": 0.4474,
"step": 307
},
{
"epoch": 2.4603095356964553,
"grad_norm": 0.1453747344586306,
"learning_rate": 4.7995410179841065e-05,
"loss": 0.4496,
"step": 308
},
{
"epoch": 2.4682975536694958,
"grad_norm": 0.15320122247522389,
"learning_rate": 4.777619791879698e-05,
"loss": 0.4445,
"step": 309
},
{
"epoch": 2.4762855716425363,
"grad_norm": 0.20898054566985402,
"learning_rate": 4.755674266501802e-05,
"loss": 0.4557,
"step": 310
},
{
"epoch": 2.4842735896155768,
"grad_norm": 0.21741215675606498,
"learning_rate": 4.73370512761013e-05,
"loss": 0.4417,
"step": 311
},
{
"epoch": 2.4922616075886173,
"grad_norm": 0.16889794561130403,
"learning_rate": 4.711713061702274e-05,
"loss": 0.4443,
"step": 312
},
{
"epoch": 2.5002496255616578,
"grad_norm": 0.17993307076723922,
"learning_rate": 4.689698755992255e-05,
"loss": 0.4479,
"step": 313
},
{
"epoch": 2.508237643534698,
"grad_norm": 0.19257453660181062,
"learning_rate": 4.667662898389048e-05,
"loss": 0.4491,
"step": 314
},
{
"epoch": 2.5162256615077383,
"grad_norm": 0.1472085090976699,
"learning_rate": 4.645606177475089e-05,
"loss": 0.4373,
"step": 315
},
{
"epoch": 2.524213679480779,
"grad_norm": 0.19033455613068187,
"learning_rate": 4.6235292824847575e-05,
"loss": 0.4544,
"step": 316
},
{
"epoch": 2.5322016974538193,
"grad_norm": 0.18170601952075063,
"learning_rate": 4.601432903282836e-05,
"loss": 0.4412,
"step": 317
},
{
"epoch": 2.54018971542686,
"grad_norm": 0.15727860647785666,
"learning_rate": 4.579317730342955e-05,
"loss": 0.4399,
"step": 318
},
{
"epoch": 2.5481777333999003,
"grad_norm": 0.17970878529305648,
"learning_rate": 4.5571844547260184e-05,
"loss": 0.4403,
"step": 319
},
{
"epoch": 2.5561657513729408,
"grad_norm": 0.15429718810042514,
"learning_rate": 4.535033768058604e-05,
"loss": 0.4485,
"step": 320
},
{
"epoch": 2.564153769345981,
"grad_norm": 0.15715864822910056,
"learning_rate": 4.512866362511361e-05,
"loss": 0.4467,
"step": 321
},
{
"epoch": 2.5721417873190213,
"grad_norm": 0.14222629722842062,
"learning_rate": 4.490682930777368e-05,
"loss": 0.4374,
"step": 322
},
{
"epoch": 2.580129805292062,
"grad_norm": 0.16416055580887054,
"learning_rate": 4.468484166050499e-05,
"loss": 0.4429,
"step": 323
},
{
"epoch": 2.5881178232651023,
"grad_norm": 0.1378665667643313,
"learning_rate": 4.446270762003754e-05,
"loss": 0.4439,
"step": 324
},
{
"epoch": 2.596105841238143,
"grad_norm": 0.14749790568854468,
"learning_rate": 4.424043412767589e-05,
"loss": 0.4466,
"step": 325
},
{
"epoch": 2.6040938592111833,
"grad_norm": 0.146540138127552,
"learning_rate": 4.401802812908221e-05,
"loss": 0.4419,
"step": 326
},
{
"epoch": 2.612081877184224,
"grad_norm": 0.17339116836008553,
"learning_rate": 4.379549657405928e-05,
"loss": 0.4467,
"step": 327
},
{
"epoch": 2.620069895157264,
"grad_norm": 0.18348099975421248,
"learning_rate": 4.35728464163333e-05,
"loss": 0.4416,
"step": 328
},
{
"epoch": 2.628057913130305,
"grad_norm": 0.13620309620113327,
"learning_rate": 4.335008461333657e-05,
"loss": 0.4427,
"step": 329
},
{
"epoch": 2.636045931103345,
"grad_norm": 0.1709480972281254,
"learning_rate": 4.312721812599016e-05,
"loss": 0.4414,
"step": 330
},
{
"epoch": 2.6440339490763853,
"grad_norm": 0.16164451064940724,
"learning_rate": 4.2904253918486295e-05,
"loss": 0.4535,
"step": 331
},
{
"epoch": 2.652021967049426,
"grad_norm": 0.14081917286088105,
"learning_rate": 4.268119895807084e-05,
"loss": 0.4429,
"step": 332
},
{
"epoch": 2.6600099850224663,
"grad_norm": 0.18137180021156257,
"learning_rate": 4.245806021482547e-05,
"loss": 0.4427,
"step": 333
},
{
"epoch": 2.667998002995507,
"grad_norm": 0.13800609298110714,
"learning_rate": 4.2234844661449964e-05,
"loss": 0.44,
"step": 334
},
{
"epoch": 2.6759860209685473,
"grad_norm": 0.1551146252415665,
"learning_rate": 4.20115592730443e-05,
"loss": 0.4507,
"step": 335
},
{
"epoch": 2.683974038941588,
"grad_norm": 0.15173038583107296,
"learning_rate": 4.178821102689064e-05,
"loss": 0.4426,
"step": 336
},
{
"epoch": 2.691962056914628,
"grad_norm": 0.15116080328062176,
"learning_rate": 4.156480690223537e-05,
"loss": 0.447,
"step": 337
},
{
"epoch": 2.6999500748876684,
"grad_norm": 0.17450805671193279,
"learning_rate": 4.134135388007097e-05,
"loss": 0.4469,
"step": 338
},
{
"epoch": 2.707938092860709,
"grad_norm": 0.17281860373285934,
"learning_rate": 4.111785894291789e-05,
"loss": 0.4427,
"step": 339
},
{
"epoch": 2.7159261108337494,
"grad_norm": 0.13324453353593427,
"learning_rate": 4.089432907460634e-05,
"loss": 0.45,
"step": 340
},
{
"epoch": 2.72391412880679,
"grad_norm": 0.15126807617639215,
"learning_rate": 4.0670771260058106e-05,
"loss": 0.4486,
"step": 341
},
{
"epoch": 2.7319021467798303,
"grad_norm": 0.16029221354477333,
"learning_rate": 4.044719248506819e-05,
"loss": 0.4408,
"step": 342
},
{
"epoch": 2.739890164752871,
"grad_norm": 0.1463219695798821,
"learning_rate": 4.0223599736086596e-05,
"loss": 0.4479,
"step": 343
},
{
"epoch": 2.747878182725911,
"grad_norm": 0.14595637335852438,
"learning_rate": 4e-05,
"loss": 0.4473,
"step": 344
},
{
"epoch": 2.755866200698952,
"grad_norm": 0.13738467367514962,
"learning_rate": 3.9776400263913404e-05,
"loss": 0.4541,
"step": 345
},
{
"epoch": 2.763854218671992,
"grad_norm": 0.1439562510526391,
"learning_rate": 3.9552807514931824e-05,
"loss": 0.4436,
"step": 346
},
{
"epoch": 2.7718422366450324,
"grad_norm": 0.13006608621756496,
"learning_rate": 3.93292287399419e-05,
"loss": 0.4397,
"step": 347
},
{
"epoch": 2.779830254618073,
"grad_norm": 0.14041358992697037,
"learning_rate": 3.9105670925393665e-05,
"loss": 0.4322,
"step": 348
},
{
"epoch": 2.7878182725911134,
"grad_norm": 0.1495382630742624,
"learning_rate": 3.8882141057082117e-05,
"loss": 0.449,
"step": 349
},
{
"epoch": 2.795806290564154,
"grad_norm": 0.13422760316245289,
"learning_rate": 3.8658646119929046e-05,
"loss": 0.4481,
"step": 350
},
{
"epoch": 2.8037943085371944,
"grad_norm": 0.16641223994983959,
"learning_rate": 3.843519309776464e-05,
"loss": 0.4454,
"step": 351
},
{
"epoch": 2.811782326510235,
"grad_norm": 0.12812350342466014,
"learning_rate": 3.821178897310938e-05,
"loss": 0.4535,
"step": 352
},
{
"epoch": 2.819770344483275,
"grad_norm": 0.15337686560279318,
"learning_rate": 3.798844072695571e-05,
"loss": 0.4455,
"step": 353
},
{
"epoch": 2.8277583624563154,
"grad_norm": 0.13561487109024523,
"learning_rate": 3.776515533855004e-05,
"loss": 0.4421,
"step": 354
},
{
"epoch": 2.835746380429356,
"grad_norm": 0.12405473708728454,
"learning_rate": 3.7541939785174545e-05,
"loss": 0.4433,
"step": 355
},
{
"epoch": 2.8437343984023964,
"grad_norm": 0.12633600414835006,
"learning_rate": 3.731880104192917e-05,
"loss": 0.4432,
"step": 356
},
{
"epoch": 2.851722416375437,
"grad_norm": 0.1317080752956006,
"learning_rate": 3.709574608151371e-05,
"loss": 0.4465,
"step": 357
},
{
"epoch": 2.8597104343484774,
"grad_norm": 0.1475249153982226,
"learning_rate": 3.687278187400985e-05,
"loss": 0.4401,
"step": 358
},
{
"epoch": 2.867698452321518,
"grad_norm": 0.1458288492905671,
"learning_rate": 3.664991538666344e-05,
"loss": 0.4344,
"step": 359
},
{
"epoch": 2.875686470294558,
"grad_norm": 0.11939958255100196,
"learning_rate": 3.6427153583666715e-05,
"loss": 0.4367,
"step": 360
},
{
"epoch": 2.883674488267599,
"grad_norm": 0.16554239338436524,
"learning_rate": 3.620450342594073e-05,
"loss": 0.4418,
"step": 361
},
{
"epoch": 2.891662506240639,
"grad_norm": 0.1187974636724584,
"learning_rate": 3.59819718709178e-05,
"loss": 0.45,
"step": 362
},
{
"epoch": 2.8996505242136794,
"grad_norm": 0.15936228812392336,
"learning_rate": 3.575956587232413e-05,
"loss": 0.4508,
"step": 363
},
{
"epoch": 2.90763854218672,
"grad_norm": 0.13367105463505505,
"learning_rate": 3.5537292379962474e-05,
"loss": 0.4465,
"step": 364
},
{
"epoch": 2.9156265601597604,
"grad_norm": 0.14243006994077556,
"learning_rate": 3.5315158339495015e-05,
"loss": 0.4464,
"step": 365
},
{
"epoch": 2.923614578132801,
"grad_norm": 0.1399001261869002,
"learning_rate": 3.509317069222633e-05,
"loss": 0.4502,
"step": 366
},
{
"epoch": 2.9316025961058414,
"grad_norm": 0.13108273735056272,
"learning_rate": 3.487133637488639e-05,
"loss": 0.4369,
"step": 367
},
{
"epoch": 2.939590614078882,
"grad_norm": 0.14943325684519726,
"learning_rate": 3.464966231941397e-05,
"loss": 0.4415,
"step": 368
},
{
"epoch": 2.947578632051922,
"grad_norm": 0.13558373438864768,
"learning_rate": 3.442815545273983e-05,
"loss": 0.4382,
"step": 369
},
{
"epoch": 2.9555666500249624,
"grad_norm": 0.12912584792295748,
"learning_rate": 3.420682269657047e-05,
"loss": 0.4363,
"step": 370
},
{
"epoch": 2.963554667998003,
"grad_norm": 0.12458007215100302,
"learning_rate": 3.398567096717165e-05,
"loss": 0.4409,
"step": 371
},
{
"epoch": 2.9715426859710434,
"grad_norm": 0.12840111428281253,
"learning_rate": 3.376470717515244e-05,
"loss": 0.4407,
"step": 372
},
{
"epoch": 2.979530703944084,
"grad_norm": 0.13058809738960123,
"learning_rate": 3.354393822524913e-05,
"loss": 0.4407,
"step": 373
},
{
"epoch": 2.9875187219171244,
"grad_norm": 0.15613845334671814,
"learning_rate": 3.332337101610953e-05,
"loss": 0.4473,
"step": 374
},
{
"epoch": 2.995506739890165,
"grad_norm": 0.13389617942366203,
"learning_rate": 3.310301244007747e-05,
"loss": 0.4352,
"step": 375
},
{
"epoch": 3.0034947578632054,
"grad_norm": 0.30944417126328405,
"learning_rate": 3.2882869382977265e-05,
"loss": 0.7723,
"step": 376
},
{
"epoch": 3.0114827758362455,
"grad_norm": 0.29354627871039446,
"learning_rate": 3.266294872389871e-05,
"loss": 0.4025,
"step": 377
},
{
"epoch": 3.019470793809286,
"grad_norm": 0.2010591684487564,
"learning_rate": 3.2443257334981985e-05,
"loss": 0.4024,
"step": 378
},
{
"epoch": 3.0274588117823265,
"grad_norm": 0.27298247297612654,
"learning_rate": 3.222380208120304e-05,
"loss": 0.4089,
"step": 379
},
{
"epoch": 3.035446829755367,
"grad_norm": 0.23270934832932566,
"learning_rate": 3.200458982015894e-05,
"loss": 0.4072,
"step": 380
},
{
"epoch": 3.0434348477284074,
"grad_norm": 0.20268308202991778,
"learning_rate": 3.178562740185372e-05,
"loss": 0.4022,
"step": 381
},
{
"epoch": 3.051422865701448,
"grad_norm": 0.20766736812021794,
"learning_rate": 3.156692166848418e-05,
"loss": 0.4024,
"step": 382
},
{
"epoch": 3.0594108836744884,
"grad_norm": 0.2547479854625852,
"learning_rate": 3.134847945422622e-05,
"loss": 0.4072,
"step": 383
},
{
"epoch": 3.067398901647529,
"grad_norm": 0.1969866280565691,
"learning_rate": 3.113030758502123e-05,
"loss": 0.4118,
"step": 384
},
{
"epoch": 3.075386919620569,
"grad_norm": 0.23153499880928385,
"learning_rate": 3.091241287836272e-05,
"loss": 0.4077,
"step": 385
},
{
"epoch": 3.0833749375936095,
"grad_norm": 0.20503882652518132,
"learning_rate": 3.0694802143083436e-05,
"loss": 0.4132,
"step": 386
},
{
"epoch": 3.09136295556665,
"grad_norm": 0.17320798113782282,
"learning_rate": 3.0477482179142432e-05,
"loss": 0.4097,
"step": 387
},
{
"epoch": 3.0993509735396905,
"grad_norm": 0.20168474769945824,
"learning_rate": 3.026045977741272e-05,
"loss": 0.3965,
"step": 388
},
{
"epoch": 3.107338991512731,
"grad_norm": 0.19398918365065387,
"learning_rate": 3.004374171946895e-05,
"loss": 0.402,
"step": 389
},
{
"epoch": 3.1153270094857715,
"grad_norm": 0.16700046485980305,
"learning_rate": 2.9827334777375622e-05,
"loss": 0.4136,
"step": 390
},
{
"epoch": 3.123315027458812,
"grad_norm": 0.25279169008131713,
"learning_rate": 2.9611245713475328e-05,
"loss": 0.4003,
"step": 391
},
{
"epoch": 3.131303045431852,
"grad_norm": 0.16080528287954057,
"learning_rate": 2.9395481280177596e-05,
"loss": 0.4011,
"step": 392
},
{
"epoch": 3.1392910634048925,
"grad_norm": 0.22759163441812938,
"learning_rate": 2.9180048219747736e-05,
"loss": 0.4034,
"step": 393
},
{
"epoch": 3.147279081377933,
"grad_norm": 0.17841534466968145,
"learning_rate": 2.8964953264096277e-05,
"loss": 0.4086,
"step": 394
},
{
"epoch": 3.1552670993509735,
"grad_norm": 0.17487802806512123,
"learning_rate": 2.8750203134568564e-05,
"loss": 0.408,
"step": 395
},
{
"epoch": 3.163255117324014,
"grad_norm": 0.18241625540198192,
"learning_rate": 2.8535804541734663e-05,
"loss": 0.4077,
"step": 396
},
{
"epoch": 3.1712431352970545,
"grad_norm": 0.16398724549614757,
"learning_rate": 2.832176418517979e-05,
"loss": 0.4098,
"step": 397
},
{
"epoch": 3.179231153270095,
"grad_norm": 0.16170229114317095,
"learning_rate": 2.8108088753294864e-05,
"loss": 0.4,
"step": 398
},
{
"epoch": 3.1872191712431355,
"grad_norm": 0.14606650542275093,
"learning_rate": 2.7894784923067563e-05,
"loss": 0.4081,
"step": 399
},
{
"epoch": 3.195207189216176,
"grad_norm": 0.154688060281461,
"learning_rate": 2.768185935987362e-05,
"loss": 0.4095,
"step": 400
},
{
"epoch": 3.203195207189216,
"grad_norm": 0.14458385897335363,
"learning_rate": 2.7469318717268622e-05,
"loss": 0.4083,
"step": 401
},
{
"epoch": 3.2111832251622565,
"grad_norm": 0.14953811526297756,
"learning_rate": 2.7257169636779992e-05,
"loss": 0.4082,
"step": 402
},
{
"epoch": 3.219171243135297,
"grad_norm": 0.13312099784173914,
"learning_rate": 2.704541874769958e-05,
"loss": 0.4068,
"step": 403
},
{
"epoch": 3.2271592611083375,
"grad_norm": 0.1386674411611782,
"learning_rate": 2.6834072666876427e-05,
"loss": 0.402,
"step": 404
},
{
"epoch": 3.235147279081378,
"grad_norm": 0.12924251838188583,
"learning_rate": 2.6623137998509964e-05,
"loss": 0.4113,
"step": 405
},
{
"epoch": 3.2431352970544185,
"grad_norm": 0.13222743176356805,
"learning_rate": 2.641262133394378e-05,
"loss": 0.4093,
"step": 406
},
{
"epoch": 3.251123315027459,
"grad_norm": 0.13021912109847186,
"learning_rate": 2.6202529251459475e-05,
"loss": 0.4104,
"step": 407
},
{
"epoch": 3.259111333000499,
"grad_norm": 0.13606000089551518,
"learning_rate": 2.599286831607127e-05,
"loss": 0.4089,
"step": 408
},
{
"epoch": 3.2670993509735395,
"grad_norm": 0.13357003115707924,
"learning_rate": 2.5783645079320757e-05,
"loss": 0.4055,
"step": 409
},
{
"epoch": 3.27508736894658,
"grad_norm": 0.1232470250676397,
"learning_rate": 2.5574866079072188e-05,
"loss": 0.4133,
"step": 410
},
{
"epoch": 3.2830753869196205,
"grad_norm": 0.14061126711951444,
"learning_rate": 2.5366537839308213e-05,
"loss": 0.4023,
"step": 411
},
{
"epoch": 3.291063404892661,
"grad_norm": 0.12020419683198272,
"learning_rate": 2.515866686992599e-05,
"loss": 0.406,
"step": 412
},
{
"epoch": 3.2990514228657015,
"grad_norm": 0.13624018306536384,
"learning_rate": 2.4951259666533778e-05,
"loss": 0.4137,
"step": 413
},
{
"epoch": 3.307039440838742,
"grad_norm": 0.13470595005125394,
"learning_rate": 2.4744322710247914e-05,
"loss": 0.4072,
"step": 414
},
{
"epoch": 3.3150274588117825,
"grad_norm": 0.11406991036845995,
"learning_rate": 2.4537862467490393e-05,
"loss": 0.4032,
"step": 415
},
{
"epoch": 3.323015476784823,
"grad_norm": 0.12469392558548403,
"learning_rate": 2.4331885389786648e-05,
"loss": 0.4061,
"step": 416
},
{
"epoch": 3.331003494757863,
"grad_norm": 0.11240496673470576,
"learning_rate": 2.4126397913564138e-05,
"loss": 0.3972,
"step": 417
},
{
"epoch": 3.3389915127309036,
"grad_norm": 0.11440176304944144,
"learning_rate": 2.3921406459951038e-05,
"loss": 0.401,
"step": 418
},
{
"epoch": 3.346979530703944,
"grad_norm": 0.12061267695807164,
"learning_rate": 2.371691743457573e-05,
"loss": 0.4042,
"step": 419
},
{
"epoch": 3.3549675486769845,
"grad_norm": 0.12408924452739928,
"learning_rate": 2.3512937227366548e-05,
"loss": 0.4042,
"step": 420
},
{
"epoch": 3.362955566650025,
"grad_norm": 0.119324320832681,
"learning_rate": 2.330947221235217e-05,
"loss": 0.3999,
"step": 421
},
{
"epoch": 3.3709435846230655,
"grad_norm": 0.11372319294009971,
"learning_rate": 2.3106528747462374e-05,
"loss": 0.411,
"step": 422
},
{
"epoch": 3.378931602596106,
"grad_norm": 0.11440578627516848,
"learning_rate": 2.290411317432942e-05,
"loss": 0.4103,
"step": 423
},
{
"epoch": 3.386919620569146,
"grad_norm": 0.11396557333843903,
"learning_rate": 2.270223181808988e-05,
"loss": 0.4056,
"step": 424
},
{
"epoch": 3.3949076385421866,
"grad_norm": 0.1073175497389294,
"learning_rate": 2.250089098718692e-05,
"loss": 0.4001,
"step": 425
},
{
"epoch": 3.402895656515227,
"grad_norm": 0.11142545752473547,
"learning_rate": 2.2300096973173276e-05,
"loss": 0.4013,
"step": 426
},
{
"epoch": 3.4108836744882676,
"grad_norm": 0.11528253053702402,
"learning_rate": 2.2099856050514593e-05,
"loss": 0.4074,
"step": 427
},
{
"epoch": 3.418871692461308,
"grad_norm": 0.1075239061798206,
"learning_rate": 2.1900174476393335e-05,
"loss": 0.4035,
"step": 428
},
{
"epoch": 3.4268597104343486,
"grad_norm": 0.10808021553369461,
"learning_rate": 2.170105849051332e-05,
"loss": 0.4052,
"step": 429
},
{
"epoch": 3.434847728407389,
"grad_norm": 0.11387661467604573,
"learning_rate": 2.1502514314904723e-05,
"loss": 0.4011,
"step": 430
},
{
"epoch": 3.442835746380429,
"grad_norm": 0.10171924087995715,
"learning_rate": 2.1304548153729596e-05,
"loss": 0.4077,
"step": 431
},
{
"epoch": 3.4508237643534696,
"grad_norm": 0.1285002444781682,
"learning_rate": 2.1107166193088073e-05,
"loss": 0.4063,
"step": 432
},
{
"epoch": 3.45881178232651,
"grad_norm": 0.11335168282371334,
"learning_rate": 2.091037460082503e-05,
"loss": 0.4154,
"step": 433
},
{
"epoch": 3.4667998002995506,
"grad_norm": 0.11343444669438019,
"learning_rate": 2.0714179526337334e-05,
"loss": 0.41,
"step": 434
},
{
"epoch": 3.474787818272591,
"grad_norm": 0.1217156602130217,
"learning_rate": 2.0518587100381727e-05,
"loss": 0.4075,
"step": 435
},
{
"epoch": 3.4827758362456316,
"grad_norm": 0.10725293992167916,
"learning_rate": 2.0323603434883186e-05,
"loss": 0.4066,
"step": 436
},
{
"epoch": 3.490763854218672,
"grad_norm": 0.12028103178489573,
"learning_rate": 2.0129234622744044e-05,
"loss": 0.4103,
"step": 437
},
{
"epoch": 3.4987518721917126,
"grad_norm": 0.1029854987347421,
"learning_rate": 1.9935486737653452e-05,
"loss": 0.4038,
"step": 438
},
{
"epoch": 3.506739890164753,
"grad_norm": 0.11857347505878003,
"learning_rate": 1.9742365833897733e-05,
"loss": 0.4074,
"step": 439
},
{
"epoch": 3.514727908137793,
"grad_norm": 0.1105825700379065,
"learning_rate": 1.954987794617107e-05,
"loss": 0.4105,
"step": 440
},
{
"epoch": 3.5227159261108336,
"grad_norm": 0.11511596034752838,
"learning_rate": 1.9358029089387034e-05,
"loss": 0.4131,
"step": 441
},
{
"epoch": 3.530703944083874,
"grad_norm": 0.11612657903144337,
"learning_rate": 1.916682525849058e-05,
"loss": 0.4068,
"step": 442
},
{
"epoch": 3.5386919620569146,
"grad_norm": 0.10575599755099882,
"learning_rate": 1.897627242827068e-05,
"loss": 0.4038,
"step": 443
},
{
"epoch": 3.546679980029955,
"grad_norm": 0.11088748332110426,
"learning_rate": 1.878637655317372e-05,
"loss": 0.4078,
"step": 444
},
{
"epoch": 3.5546679980029956,
"grad_norm": 0.11466223345296331,
"learning_rate": 1.859714356711731e-05,
"loss": 0.3939,
"step": 445
},
{
"epoch": 3.562656015976036,
"grad_norm": 0.11673865175002288,
"learning_rate": 1.8408579383304985e-05,
"loss": 0.4049,
"step": 446
},
{
"epoch": 3.570644033949076,
"grad_norm": 0.11577952607867907,
"learning_rate": 1.8220689894041314e-05,
"loss": 0.4088,
"step": 447
},
{
"epoch": 3.578632051922117,
"grad_norm": 0.10690091900937719,
"learning_rate": 1.8033480970547872e-05,
"loss": 0.4056,
"step": 448
},
{
"epoch": 3.586620069895157,
"grad_norm": 0.11541573082426308,
"learning_rate": 1.7846958462779716e-05,
"loss": 0.4007,
"step": 449
},
{
"epoch": 3.5946080878681976,
"grad_norm": 0.1100114302346526,
"learning_rate": 1.7661128199242576e-05,
"loss": 0.4089,
"step": 450
},
{
"epoch": 3.602596105841238,
"grad_norm": 0.10956511339867736,
"learning_rate": 1.7475995986810775e-05,
"loss": 0.4018,
"step": 451
},
{
"epoch": 3.6105841238142786,
"grad_norm": 0.10850454028936493,
"learning_rate": 1.7291567610545738e-05,
"loss": 0.4051,
"step": 452
},
{
"epoch": 3.618572141787319,
"grad_norm": 0.1131878747175685,
"learning_rate": 1.7107848833515244e-05,
"loss": 0.4079,
"step": 453
},
{
"epoch": 3.6265601597603596,
"grad_norm": 0.09884020665129564,
"learning_rate": 1.6924845396613275e-05,
"loss": 0.407,
"step": 454
},
{
"epoch": 3.6345481777334,
"grad_norm": 0.11216709502149264,
"learning_rate": 1.6742563018380734e-05,
"loss": 0.4087,
"step": 455
},
{
"epoch": 3.64253619570644,
"grad_norm": 0.0996580768122796,
"learning_rate": 1.6561007394826623e-05,
"loss": 0.4039,
"step": 456
},
{
"epoch": 3.6505242136794807,
"grad_norm": 0.10651639312645377,
"learning_rate": 1.638018419925018e-05,
"loss": 0.3996,
"step": 457
},
{
"epoch": 3.658512231652521,
"grad_norm": 0.09841162160967377,
"learning_rate": 1.6200099082063477e-05,
"loss": 0.4055,
"step": 458
},
{
"epoch": 3.6665002496255616,
"grad_norm": 0.11559374542937897,
"learning_rate": 1.602075767061497e-05,
"loss": 0.4088,
"step": 459
},
{
"epoch": 3.674488267598602,
"grad_norm": 0.11049592658320795,
"learning_rate": 1.584216556901355e-05,
"loss": 0.4053,
"step": 460
},
{
"epoch": 3.6824762855716426,
"grad_norm": 0.09690459875455099,
"learning_rate": 1.566432835795349e-05,
"loss": 0.4052,
"step": 461
},
{
"epoch": 3.690464303544683,
"grad_norm": 0.11084043420560455,
"learning_rate": 1.5487251594540062e-05,
"loss": 0.4013,
"step": 462
},
{
"epoch": 3.698452321517723,
"grad_norm": 0.11145942008644477,
"learning_rate": 1.5310940812115812e-05,
"loss": 0.404,
"step": 463
},
{
"epoch": 3.706440339490764,
"grad_norm": 0.09702858045834936,
"learning_rate": 1.5135401520087757e-05,
"loss": 0.4033,
"step": 464
},
{
"epoch": 3.714428357463804,
"grad_norm": 0.10073536549329104,
"learning_rate": 1.4960639203755136e-05,
"loss": 0.4046,
"step": 465
},
{
"epoch": 3.7224163754368447,
"grad_norm": 0.09948648507952308,
"learning_rate": 1.4786659324138075e-05,
"loss": 0.4041,
"step": 466
},
{
"epoch": 3.730404393409885,
"grad_norm": 0.09373041246826647,
"learning_rate": 1.4613467317806861e-05,
"loss": 0.4075,
"step": 467
},
{
"epoch": 3.7383924113829257,
"grad_norm": 0.10208986391007283,
"learning_rate": 1.4441068596712157e-05,
"loss": 0.3999,
"step": 468
},
{
"epoch": 3.746380429355966,
"grad_norm": 0.10239549924151786,
"learning_rate": 1.4269468548015785e-05,
"loss": 0.3954,
"step": 469
},
{
"epoch": 3.7543684473290067,
"grad_norm": 0.10434926470085772,
"learning_rate": 1.4098672533922471e-05,
"loss": 0.4103,
"step": 470
},
{
"epoch": 3.762356465302047,
"grad_norm": 0.1022671854724037,
"learning_rate": 1.3928685891512248e-05,
"loss": 0.4068,
"step": 471
},
{
"epoch": 3.770344483275087,
"grad_norm": 0.10372672313209318,
"learning_rate": 1.375951393257365e-05,
"loss": 0.4063,
"step": 472
},
{
"epoch": 3.7783325012481277,
"grad_norm": 0.1001467709798247,
"learning_rate": 1.35911619434378e-05,
"loss": 0.3982,
"step": 473
},
{
"epoch": 3.786320519221168,
"grad_norm": 0.10848171250475616,
"learning_rate": 1.3423635184813182e-05,
"loss": 0.3994,
"step": 474
},
{
"epoch": 3.7943085371942087,
"grad_norm": 0.10297059459791853,
"learning_rate": 1.3256938891621208e-05,
"loss": 0.4051,
"step": 475
},
{
"epoch": 3.802296555167249,
"grad_norm": 0.09850487513786725,
"learning_rate": 1.3091078272832732e-05,
"loss": 0.4039,
"step": 476
},
{
"epoch": 3.8102845731402897,
"grad_norm": 0.09654837279347964,
"learning_rate": 1.2926058511305221e-05,
"loss": 0.4027,
"step": 477
},
{
"epoch": 3.81827259111333,
"grad_norm": 0.10106233469187086,
"learning_rate": 1.2761884763620773e-05,
"loss": 0.4028,
"step": 478
},
{
"epoch": 3.8262606090863702,
"grad_norm": 0.10521144963578496,
"learning_rate": 1.2598562159925068e-05,
"loss": 0.4047,
"step": 479
},
{
"epoch": 3.8342486270594107,
"grad_norm": 0.10055170090272858,
"learning_rate": 1.2436095803766946e-05,
"loss": 0.408,
"step": 480
},
{
"epoch": 3.842236645032451,
"grad_norm": 0.10030963723827738,
"learning_rate": 1.2274490771939047e-05,
"loss": 0.4139,
"step": 481
},
{
"epoch": 3.8502246630054917,
"grad_norm": 0.11291159209305866,
"learning_rate": 1.2113752114319107e-05,
"loss": 0.4075,
"step": 482
},
{
"epoch": 3.858212680978532,
"grad_norm": 0.09711749318081525,
"learning_rate": 1.195388485371213e-05,
"loss": 0.4008,
"step": 483
},
{
"epoch": 3.8662006989515727,
"grad_norm": 0.09587092246627478,
"learning_rate": 1.1794893985693517e-05,
"loss": 0.4072,
"step": 484
},
{
"epoch": 3.874188716924613,
"grad_norm": 0.10842252534792915,
"learning_rate": 1.1636784478452872e-05,
"loss": 0.3983,
"step": 485
},
{
"epoch": 3.8821767348976532,
"grad_norm": 0.10030989962078998,
"learning_rate": 1.1479561272638851e-05,
"loss": 0.405,
"step": 486
},
{
"epoch": 3.890164752870694,
"grad_norm": 0.09668292596476558,
"learning_rate": 1.1323229281204667e-05,
"loss": 0.4046,
"step": 487
},
{
"epoch": 3.8981527708437342,
"grad_norm": 0.11229884300303226,
"learning_rate": 1.1167793389254671e-05,
"loss": 0.4077,
"step": 488
},
{
"epoch": 3.9061407888167747,
"grad_norm": 0.1007265262970734,
"learning_rate": 1.1013258453891624e-05,
"loss": 0.4079,
"step": 489
},
{
"epoch": 3.9141288067898152,
"grad_norm": 0.09800596022091544,
"learning_rate": 1.0859629304064966e-05,
"loss": 0.4124,
"step": 490
},
{
"epoch": 3.9221168247628557,
"grad_norm": 0.0972237859271068,
"learning_rate": 1.0706910740419927e-05,
"loss": 0.3995,
"step": 491
},
{
"epoch": 3.930104842735896,
"grad_norm": 0.09568160375704794,
"learning_rate": 1.055510753514744e-05,
"loss": 0.4044,
"step": 492
},
{
"epoch": 3.9380928607089367,
"grad_norm": 0.10293942587001009,
"learning_rate": 1.0404224431835127e-05,
"loss": 0.3999,
"step": 493
},
{
"epoch": 3.946080878681977,
"grad_norm": 0.09547704742606819,
"learning_rate": 1.025426614531897e-05,
"loss": 0.4012,
"step": 494
},
{
"epoch": 3.9540688966550173,
"grad_norm": 0.09843903422495338,
"learning_rate": 1.0105237361536058e-05,
"loss": 0.4029,
"step": 495
},
{
"epoch": 3.9620569146280578,
"grad_norm": 0.0995011244677626,
"learning_rate": 9.957142737378128e-06,
"loss": 0.4084,
"step": 496
},
{
"epoch": 3.9700449326010983,
"grad_norm": 0.10559619287684664,
"learning_rate": 9.809986900546011e-06,
"loss": 0.4031,
"step": 497
},
{
"epoch": 3.9780329505741387,
"grad_norm": 0.09619833393540202,
"learning_rate": 9.663774449405095e-06,
"loss": 0.3986,
"step": 498
},
{
"epoch": 3.9860209685471792,
"grad_norm": 0.09183866726575214,
"learning_rate": 9.518509952841586e-06,
"loss": 0.4066,
"step": 499
},
{
"epoch": 3.9940089865202197,
"grad_norm": 0.09366222741801747,
"learning_rate": 9.374197950119726e-06,
"loss": 0.4039,
"step": 500
},
{
"epoch": 4.00199700449326,
"grad_norm": 0.2243873122878777,
"learning_rate": 9.230842950740002e-06,
"loss": 0.7111,
"step": 501
},
{
"epoch": 4.0099850224663,
"grad_norm": 0.16901611231637365,
"learning_rate": 9.088449434298204e-06,
"loss": 0.3809,
"step": 502
},
{
"epoch": 4.017973040439341,
"grad_norm": 0.13423991777929192,
"learning_rate": 8.947021850345398e-06,
"loss": 0.3726,
"step": 503
},
{
"epoch": 4.025961058412381,
"grad_norm": 0.1178503561561421,
"learning_rate": 8.806564618248999e-06,
"loss": 0.3808,
"step": 504
},
{
"epoch": 4.033949076385422,
"grad_norm": 0.14732236266291146,
"learning_rate": 8.667082127054533e-06,
"loss": 0.3832,
"step": 505
},
{
"epoch": 4.041937094358462,
"grad_norm": 0.15778749749862814,
"learning_rate": 8.52857873534862e-06,
"loss": 0.3779,
"step": 506
},
{
"epoch": 4.049925112331502,
"grad_norm": 0.1386627835810346,
"learning_rate": 8.391058771122673e-06,
"loss": 0.3831,
"step": 507
},
{
"epoch": 4.057913130304543,
"grad_norm": 0.12467125366104446,
"learning_rate": 8.254526531637727e-06,
"loss": 0.3874,
"step": 508
},
{
"epoch": 4.065901148277583,
"grad_norm": 0.1255433393864893,
"learning_rate": 8.118986283290096e-06,
"loss": 0.3873,
"step": 509
},
{
"epoch": 4.073889166250624,
"grad_norm": 0.13259474782114336,
"learning_rate": 7.984442261478108e-06,
"loss": 0.3779,
"step": 510
},
{
"epoch": 4.081877184223664,
"grad_norm": 0.1313024483917481,
"learning_rate": 7.850898670469745e-06,
"loss": 0.3796,
"step": 511
},
{
"epoch": 4.089865202196705,
"grad_norm": 0.12062485435615429,
"learning_rate": 7.718359683271224e-06,
"loss": 0.3801,
"step": 512
},
{
"epoch": 4.097853220169745,
"grad_norm": 0.11323572660175975,
"learning_rate": 7.586829441496668e-06,
"loss": 0.3692,
"step": 513
},
{
"epoch": 4.105841238142786,
"grad_norm": 0.12334975202412422,
"learning_rate": 7.456312055238606e-06,
"loss": 0.3792,
"step": 514
},
{
"epoch": 4.113829256115826,
"grad_norm": 0.12055598843637728,
"learning_rate": 7.326811602939634e-06,
"loss": 0.3825,
"step": 515
},
{
"epoch": 4.121817274088866,
"grad_norm": 0.11922302158014507,
"learning_rate": 7.198332131264876e-06,
"loss": 0.3827,
"step": 516
},
{
"epoch": 4.129805292061907,
"grad_norm": 0.1197396216655153,
"learning_rate": 7.070877654975614e-06,
"loss": 0.3858,
"step": 517
},
{
"epoch": 4.137793310034947,
"grad_norm": 0.10303380534845168,
"learning_rate": 6.944452156803763e-06,
"loss": 0.3763,
"step": 518
},
{
"epoch": 4.145781328007988,
"grad_norm": 0.10771564322360738,
"learning_rate": 6.819059587327479e-06,
"loss": 0.3798,
"step": 519
},
{
"epoch": 4.153769345981028,
"grad_norm": 0.11083630377147478,
"learning_rate": 6.694703864847673e-06,
"loss": 0.3812,
"step": 520
},
{
"epoch": 4.161757363954069,
"grad_norm": 0.1036678793429057,
"learning_rate": 6.571388875265592e-06,
"loss": 0.3804,
"step": 521
},
{
"epoch": 4.169745381927109,
"grad_norm": 0.10290514415858039,
"learning_rate": 6.449118471961342e-06,
"loss": 0.3815,
"step": 522
},
{
"epoch": 4.177733399900149,
"grad_norm": 0.09999602037947594,
"learning_rate": 6.327896475673561e-06,
"loss": 0.3796,
"step": 523
},
{
"epoch": 4.18572141787319,
"grad_norm": 0.10176649553175782,
"learning_rate": 6.207726674379961e-06,
"loss": 0.3802,
"step": 524
},
{
"epoch": 4.19370943584623,
"grad_norm": 0.10341756297649503,
"learning_rate": 6.088612823178968e-06,
"loss": 0.3752,
"step": 525
},
{
"epoch": 4.201697453819271,
"grad_norm": 0.10010538520744762,
"learning_rate": 5.970558644172424e-06,
"loss": 0.3772,
"step": 526
},
{
"epoch": 4.209685471792311,
"grad_norm": 0.09383564748055143,
"learning_rate": 5.853567826349213e-06,
"loss": 0.3738,
"step": 527
},
{
"epoch": 4.217673489765352,
"grad_norm": 0.09458974198014311,
"learning_rate": 5.737644025470057e-06,
"loss": 0.3752,
"step": 528
},
{
"epoch": 4.225661507738392,
"grad_norm": 0.10200091444940393,
"learning_rate": 5.6227908639532045e-06,
"loss": 0.3822,
"step": 529
},
{
"epoch": 4.233649525711433,
"grad_norm": 0.09730500481091861,
"learning_rate": 5.509011930761308e-06,
"loss": 0.381,
"step": 530
},
{
"epoch": 4.241637543684473,
"grad_norm": 0.09532139450671104,
"learning_rate": 5.396310781289243e-06,
"loss": 0.3816,
"step": 531
},
{
"epoch": 4.249625561657513,
"grad_norm": 0.09644789239600618,
"learning_rate": 5.284690937252977e-06,
"loss": 0.3696,
"step": 532
},
{
"epoch": 4.257613579630554,
"grad_norm": 0.10066108394874461,
"learning_rate": 5.1741558865795906e-06,
"loss": 0.3859,
"step": 533
},
{
"epoch": 4.265601597603594,
"grad_norm": 0.09693373503450557,
"learning_rate": 5.064709083298214e-06,
"loss": 0.3822,
"step": 534
},
{
"epoch": 4.273589615576635,
"grad_norm": 0.08926912859612744,
"learning_rate": 4.95635394743216e-06,
"loss": 0.3782,
"step": 535
},
{
"epoch": 4.281577633549675,
"grad_norm": 0.09076499502790894,
"learning_rate": 4.849093864891994e-06,
"loss": 0.3822,
"step": 536
},
{
"epoch": 4.289565651522716,
"grad_norm": 0.08773426720097247,
"learning_rate": 4.7429321873697865e-06,
"loss": 0.3783,
"step": 537
},
{
"epoch": 4.297553669495756,
"grad_norm": 0.0879998661027265,
"learning_rate": 4.637872232234326e-06,
"loss": 0.3805,
"step": 538
},
{
"epoch": 4.305541687468796,
"grad_norm": 0.09409764491066522,
"learning_rate": 4.5339172824274955e-06,
"loss": 0.3795,
"step": 539
},
{
"epoch": 4.313529705441837,
"grad_norm": 0.089547761049764,
"learning_rate": 4.4310705863616835e-06,
"loss": 0.3794,
"step": 540
},
{
"epoch": 4.321517723414877,
"grad_norm": 0.09066889486649515,
"learning_rate": 4.329335357818236e-06,
"loss": 0.3759,
"step": 541
},
{
"epoch": 4.329505741387918,
"grad_norm": 0.09166858551173564,
"learning_rate": 4.228714775847084e-06,
"loss": 0.3877,
"step": 542
},
{
"epoch": 4.337493759360958,
"grad_norm": 0.09606401143384108,
"learning_rate": 4.129211984667385e-06,
"loss": 0.3803,
"step": 543
},
{
"epoch": 4.345481777333999,
"grad_norm": 0.08718065900580216,
"learning_rate": 4.030830093569247e-06,
"loss": 0.3764,
"step": 544
},
{
"epoch": 4.353469795307039,
"grad_norm": 0.09279816372084171,
"learning_rate": 3.933572176816602e-06,
"loss": 0.3818,
"step": 545
},
{
"epoch": 4.361457813280079,
"grad_norm": 0.0895802314032739,
"learning_rate": 3.837441273551137e-06,
"loss": 0.3749,
"step": 546
},
{
"epoch": 4.36944583125312,
"grad_norm": 0.08925866632093443,
"learning_rate": 3.7424403876972924e-06,
"loss": 0.3741,
"step": 547
},
{
"epoch": 4.37743384922616,
"grad_norm": 0.09077149176473304,
"learning_rate": 3.6485724878684382e-06,
"loss": 0.3889,
"step": 548
},
{
"epoch": 4.385421867199201,
"grad_norm": 0.08624641665702638,
"learning_rate": 3.555840507274093e-06,
"loss": 0.3788,
"step": 549
},
{
"epoch": 4.393409885172241,
"grad_norm": 0.09155307608035071,
"learning_rate": 3.464247343628242e-06,
"loss": 0.3833,
"step": 550
},
{
"epoch": 4.401397903145282,
"grad_norm": 0.08659198159245704,
"learning_rate": 3.373795859058837e-06,
"loss": 0.3756,
"step": 551
},
{
"epoch": 4.409385921118322,
"grad_norm": 0.08959149189104454,
"learning_rate": 3.284488880018315e-06,
"loss": 0.3809,
"step": 552
},
{
"epoch": 4.417373939091363,
"grad_norm": 0.08570866197339067,
"learning_rate": 3.196329197195307e-06,
"loss": 0.379,
"step": 553
},
{
"epoch": 4.425361957064403,
"grad_norm": 0.08585759689716206,
"learning_rate": 3.1093195654274024e-06,
"loss": 0.3844,
"step": 554
},
{
"epoch": 4.433349975037443,
"grad_norm": 0.08851894364844058,
"learning_rate": 3.0234627036151186e-06,
"loss": 0.3754,
"step": 555
},
{
"epoch": 4.441337993010484,
"grad_norm": 0.08546993455255109,
"learning_rate": 2.9387612946368647e-06,
"loss": 0.3767,
"step": 556
},
{
"epoch": 4.449326010983524,
"grad_norm": 0.08689133858962513,
"learning_rate": 2.855217985265184e-06,
"loss": 0.3818,
"step": 557
},
{
"epoch": 4.457314028956565,
"grad_norm": 0.08705508747400349,
"learning_rate": 2.7728353860839763e-06,
"loss": 0.3789,
"step": 558
},
{
"epoch": 4.465302046929605,
"grad_norm": 0.08598514484683649,
"learning_rate": 2.6916160714069817e-06,
"loss": 0.3721,
"step": 559
},
{
"epoch": 4.473290064902646,
"grad_norm": 0.08768951265999986,
"learning_rate": 2.6115625791973155e-06,
"loss": 0.3777,
"step": 560
},
{
"epoch": 4.481278082875686,
"grad_norm": 0.08479223708104064,
"learning_rate": 2.5326774109881223e-06,
"loss": 0.3805,
"step": 561
},
{
"epoch": 4.489266100848727,
"grad_norm": 0.08131123805163427,
"learning_rate": 2.454963031804485e-06,
"loss": 0.3746,
"step": 562
},
{
"epoch": 4.497254118821767,
"grad_norm": 0.08329047935604311,
"learning_rate": 2.378421870086314e-06,
"loss": 0.3761,
"step": 563
},
{
"epoch": 4.5052421367948075,
"grad_norm": 0.08462162107210089,
"learning_rate": 2.3030563176125444e-06,
"loss": 0.3738,
"step": 564
},
{
"epoch": 4.513230154767848,
"grad_norm": 0.09812143956960612,
"learning_rate": 2.228868729426319e-06,
"loss": 0.3765,
"step": 565
},
{
"epoch": 4.521218172740888,
"grad_norm": 0.08490273500457897,
"learning_rate": 2.1558614237614516e-06,
"loss": 0.3778,
"step": 566
},
{
"epoch": 4.529206190713929,
"grad_norm": 0.08570430572140957,
"learning_rate": 2.0840366819699788e-06,
"loss": 0.3857,
"step": 567
},
{
"epoch": 4.537194208686969,
"grad_norm": 0.08300561137308456,
"learning_rate": 2.013396748450842e-06,
"loss": 0.3761,
"step": 568
},
{
"epoch": 4.54518222666001,
"grad_norm": 0.08443227783552133,
"learning_rate": 1.9439438305797776e-06,
"loss": 0.3756,
"step": 569
},
{
"epoch": 4.55317024463305,
"grad_norm": 0.08135395570142633,
"learning_rate": 1.8756800986403466e-06,
"loss": 0.3782,
"step": 570
},
{
"epoch": 4.5611582626060905,
"grad_norm": 0.08279967533402854,
"learning_rate": 1.808607685756103e-06,
"loss": 0.3776,
"step": 571
},
{
"epoch": 4.569146280579131,
"grad_norm": 0.0834623870625263,
"learning_rate": 1.7427286878239247e-06,
"loss": 0.3713,
"step": 572
},
{
"epoch": 4.5771342985521715,
"grad_norm": 0.08512591892730595,
"learning_rate": 1.6780451634485606e-06,
"loss": 0.3781,
"step": 573
},
{
"epoch": 4.585122316525212,
"grad_norm": 0.08121169235017031,
"learning_rate": 1.614559133878264e-06,
"loss": 0.3822,
"step": 574
},
{
"epoch": 4.5931103344982525,
"grad_norm": 0.0815454483422227,
"learning_rate": 1.5522725829416474e-06,
"loss": 0.3789,
"step": 575
},
{
"epoch": 4.601098352471293,
"grad_norm": 0.0819923460505712,
"learning_rate": 1.4911874569856965e-06,
"loss": 0.3777,
"step": 576
},
{
"epoch": 4.6090863704443334,
"grad_norm": 0.08276809528907374,
"learning_rate": 1.4313056648149393e-06,
"loss": 0.3818,
"step": 577
},
{
"epoch": 4.6170743884173735,
"grad_norm": 0.08123407989783393,
"learning_rate": 1.3726290776318175e-06,
"loss": 0.3752,
"step": 578
},
{
"epoch": 4.625062406390414,
"grad_norm": 0.08137283984240884,
"learning_rate": 1.3151595289781738e-06,
"loss": 0.3846,
"step": 579
},
{
"epoch": 4.6330504243634545,
"grad_norm": 0.08150026114578374,
"learning_rate": 1.2588988146780135e-06,
"loss": 0.3884,
"step": 580
},
{
"epoch": 4.641038442336495,
"grad_norm": 0.08281920320562544,
"learning_rate": 1.2038486927813354e-06,
"loss": 0.3841,
"step": 581
},
{
"epoch": 4.6490264603095355,
"grad_norm": 0.08355306503400638,
"learning_rate": 1.1500108835092472e-06,
"loss": 0.3812,
"step": 582
},
{
"epoch": 4.657014478282576,
"grad_norm": 0.08418060141581976,
"learning_rate": 1.0973870692001554e-06,
"loss": 0.3792,
"step": 583
},
{
"epoch": 4.6650024962556165,
"grad_norm": 0.08223524263153421,
"learning_rate": 1.0459788942572423e-06,
"loss": 0.3843,
"step": 584
},
{
"epoch": 4.6729905142286565,
"grad_norm": 0.08271968804038993,
"learning_rate": 9.957879650970549e-07,
"loss": 0.3857,
"step": 585
},
{
"epoch": 4.6809785322016975,
"grad_norm": 0.08244656434489289,
"learning_rate": 9.468158500993207e-07,
"loss": 0.3874,
"step": 586
},
{
"epoch": 4.6889665501747375,
"grad_norm": 0.0819506533129172,
"learning_rate": 8.990640795579186e-07,
"loss": 0.3808,
"step": 587
},
{
"epoch": 4.6969545681477785,
"grad_norm": 0.08149745500782653,
"learning_rate": 8.525341456330883e-07,
"loss": 0.3727,
"step": 588
},
{
"epoch": 4.7049425861208185,
"grad_norm": 0.08076187044142838,
"learning_rate": 8.072275023047926e-07,
"loss": 0.3761,
"step": 589
},
{
"epoch": 4.712930604093859,
"grad_norm": 0.08151591065997134,
"learning_rate": 7.631455653272613e-07,
"loss": 0.3832,
"step": 590
},
{
"epoch": 4.7209186220668995,
"grad_norm": 0.08462271380326744,
"learning_rate": 7.202897121847852e-07,
"loss": 0.3749,
"step": 591
},
{
"epoch": 4.72890664003994,
"grad_norm": 0.08308313300815548,
"learning_rate": 6.786612820486449e-07,
"loss": 0.3742,
"step": 592
},
{
"epoch": 4.7368946580129805,
"grad_norm": 0.08421663571704587,
"learning_rate": 6.382615757352817e-07,
"loss": 0.383,
"step": 593
},
{
"epoch": 4.744882675986021,
"grad_norm": 0.08208417725816322,
"learning_rate": 5.990918556656411e-07,
"loss": 0.3802,
"step": 594
},
{
"epoch": 4.7528706939590615,
"grad_norm": 0.08235652164981158,
"learning_rate": 5.611533458257245e-07,
"loss": 0.3826,
"step": 595
},
{
"epoch": 4.7608587119321015,
"grad_norm": 0.0823525460961533,
"learning_rate": 5.2444723172834e-07,
"loss": 0.375,
"step": 596
},
{
"epoch": 4.7688467299051425,
"grad_norm": 0.08291828155167397,
"learning_rate": 4.889746603760693e-07,
"loss": 0.3841,
"step": 597
},
{
"epoch": 4.7768347478781825,
"grad_norm": 0.0809741018796145,
"learning_rate": 4.5473674022541213e-07,
"loss": 0.3753,
"step": 598
},
{
"epoch": 4.7848227658512235,
"grad_norm": 0.08124124038278724,
"learning_rate": 4.2173454115214783e-07,
"loss": 0.3838,
"step": 599
},
{
"epoch": 4.7928107838242635,
"grad_norm": 0.08103520713384339,
"learning_rate": 3.899690944179257e-07,
"loss": 0.3765,
"step": 600
},
{
"epoch": 4.8007988017973044,
"grad_norm": 0.08227217638870313,
"learning_rate": 3.5944139263800694e-07,
"loss": 0.3834,
"step": 601
},
{
"epoch": 4.8087868197703445,
"grad_norm": 0.07899228317121158,
"learning_rate": 3.3015238975026675e-07,
"loss": 0.3694,
"step": 602
},
{
"epoch": 4.8167748377433846,
"grad_norm": 0.09227389493594652,
"learning_rate": 3.021030009853876e-07,
"loss": 0.3783,
"step": 603
},
{
"epoch": 4.8247628557164255,
"grad_norm": 0.08106531182197436,
"learning_rate": 2.752941028382594e-07,
"loss": 0.3773,
"step": 604
},
{
"epoch": 4.8327508736894655,
"grad_norm": 0.08015145752167932,
"learning_rate": 2.4972653304057073e-07,
"loss": 0.3777,
"step": 605
},
{
"epoch": 4.8407388916625065,
"grad_norm": 0.08160453860592876,
"learning_rate": 2.25401090534656e-07,
"loss": 0.3808,
"step": 606
},
{
"epoch": 4.8487269096355465,
"grad_norm": 0.07966427336497452,
"learning_rate": 2.0231853544852465e-07,
"loss": 0.3744,
"step": 607
},
{
"epoch": 4.8567149276085875,
"grad_norm": 0.08123242623536424,
"learning_rate": 1.8047958907209339e-07,
"loss": 0.3825,
"step": 608
},
{
"epoch": 4.8647029455816275,
"grad_norm": 0.0805412707928896,
"learning_rate": 1.5988493383466198e-07,
"loss": 0.3749,
"step": 609
},
{
"epoch": 4.872690963554668,
"grad_norm": 0.08036474123731352,
"learning_rate": 1.40535213283588e-07,
"loss": 0.3748,
"step": 610
},
{
"epoch": 4.8806789815277085,
"grad_norm": 0.08213950898863626,
"learning_rate": 1.2243103206417418e-07,
"loss": 0.3819,
"step": 611
},
{
"epoch": 4.888666999500749,
"grad_norm": 0.07935174486804004,
"learning_rate": 1.05572955900759e-07,
"loss": 0.3827,
"step": 612
},
{
"epoch": 4.8966550174737895,
"grad_norm": 0.07731873027438858,
"learning_rate": 8.996151157907306e-08,
"loss": 0.3674,
"step": 613
},
{
"epoch": 4.90464303544683,
"grad_norm": 0.07905308777211134,
"learning_rate": 7.559718692974116e-08,
"loss": 0.3755,
"step": 614
},
{
"epoch": 4.9126310534198705,
"grad_norm": 0.08188223266669394,
"learning_rate": 6.248043081307664e-08,
"loss": 0.3848,
"step": 615
},
{
"epoch": 4.9206190713929105,
"grad_norm": 0.07960614583875532,
"learning_rate": 5.0611653105003824e-08,
"loss": 0.3754,
"step": 616
},
{
"epoch": 4.928607089365951,
"grad_norm": 0.08159036451816658,
"learning_rate": 3.99912246843126e-08,
"loss": 0.384,
"step": 617
},
{
"epoch": 4.9365951073389915,
"grad_norm": 0.08068916515639828,
"learning_rate": 3.061947742101001e-08,
"loss": 0.3797,
"step": 618
},
{
"epoch": 4.944583125312032,
"grad_norm": 0.07945334308304049,
"learning_rate": 2.2496704165995142e-08,
"loss": 0.378,
"step": 619
},
{
"epoch": 4.9525711432850725,
"grad_norm": 0.08051278989431843,
"learning_rate": 1.5623158741884247e-08,
"loss": 0.3804,
"step": 620
},
{
"epoch": 4.960559161258113,
"grad_norm": 0.07952219440080063,
"learning_rate": 9.999055935074887e-09,
"loss": 0.3661,
"step": 621
},
{
"epoch": 4.9685471792311535,
"grad_norm": 0.08077553977056519,
"learning_rate": 5.624571489053488e-09,
"loss": 0.3829,
"step": 622
},
{
"epoch": 4.976535197204194,
"grad_norm": 0.08255739947277718,
"learning_rate": 2.499842098901972e-09,
"loss": 0.3842,
"step": 623
},
{
"epoch": 4.9845232151772345,
"grad_norm": 0.08097383031020737,
"learning_rate": 6.249654069989674e-10,
"loss": 0.3817,
"step": 624
},
{
"epoch": 4.992511233150275,
"grad_norm": 0.08105708331755175,
"learning_rate": 0.0,
"loss": 0.377,
"step": 625
},
{
"epoch": 4.992511233150275,
"step": 625,
"total_flos": 1.6083110655493669e+19,
"train_loss": 0.47168621559143065,
"train_runtime": 96267.4715,
"train_samples_per_second": 3.329,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1.0,
"max_steps": 625,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6083110655493669e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}