PEFT
Safetensors
llama
alignment-handbook
trl
sft
Generated from Trainer
sft_r1_barc_pot_10k / trainer_state.json
aadityap's picture
Model save
eabb649 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 492,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006097560975609756,
"grad_norm": 13.761144051368902,
"learning_rate": 4.0000000000000003e-07,
"loss": 4.6438,
"step": 1
},
{
"epoch": 0.012195121951219513,
"grad_norm": 13.183326369804801,
"learning_rate": 8.000000000000001e-07,
"loss": 4.8577,
"step": 2
},
{
"epoch": 0.018292682926829267,
"grad_norm": 11.29732095469755,
"learning_rate": 1.2000000000000002e-06,
"loss": 4.7954,
"step": 3
},
{
"epoch": 0.024390243902439025,
"grad_norm": 11.769209187889713,
"learning_rate": 1.6000000000000001e-06,
"loss": 4.7998,
"step": 4
},
{
"epoch": 0.03048780487804878,
"grad_norm": 12.522749869416966,
"learning_rate": 2.0000000000000003e-06,
"loss": 4.7958,
"step": 5
},
{
"epoch": 0.036585365853658534,
"grad_norm": 13.40825064276547,
"learning_rate": 2.4000000000000003e-06,
"loss": 4.5537,
"step": 6
},
{
"epoch": 0.042682926829268296,
"grad_norm": 13.224745779198466,
"learning_rate": 2.8000000000000003e-06,
"loss": 4.5891,
"step": 7
},
{
"epoch": 0.04878048780487805,
"grad_norm": 11.459531949346005,
"learning_rate": 3.2000000000000003e-06,
"loss": 4.8212,
"step": 8
},
{
"epoch": 0.054878048780487805,
"grad_norm": 12.942263223893391,
"learning_rate": 3.6000000000000003e-06,
"loss": 4.6557,
"step": 9
},
{
"epoch": 0.06097560975609756,
"grad_norm": 11.67916919497089,
"learning_rate": 4.000000000000001e-06,
"loss": 4.6643,
"step": 10
},
{
"epoch": 0.06707317073170732,
"grad_norm": 11.256361402107963,
"learning_rate": 4.4e-06,
"loss": 4.6234,
"step": 11
},
{
"epoch": 0.07317073170731707,
"grad_norm": 10.918667814241992,
"learning_rate": 4.800000000000001e-06,
"loss": 4.5428,
"step": 12
},
{
"epoch": 0.07926829268292683,
"grad_norm": 11.18941706027207,
"learning_rate": 5.2e-06,
"loss": 4.1207,
"step": 13
},
{
"epoch": 0.08536585365853659,
"grad_norm": 10.834833962041147,
"learning_rate": 5.600000000000001e-06,
"loss": 4.2727,
"step": 14
},
{
"epoch": 0.09146341463414634,
"grad_norm": 11.193544605148698,
"learning_rate": 6e-06,
"loss": 4.3305,
"step": 15
},
{
"epoch": 0.0975609756097561,
"grad_norm": 11.156213868367029,
"learning_rate": 6.4000000000000006e-06,
"loss": 3.855,
"step": 16
},
{
"epoch": 0.10365853658536585,
"grad_norm": 11.03392463912042,
"learning_rate": 6.800000000000001e-06,
"loss": 3.8251,
"step": 17
},
{
"epoch": 0.10975609756097561,
"grad_norm": 10.92516220698292,
"learning_rate": 7.2000000000000005e-06,
"loss": 3.6552,
"step": 18
},
{
"epoch": 0.11585365853658537,
"grad_norm": 9.360934765475477,
"learning_rate": 7.600000000000001e-06,
"loss": 3.4844,
"step": 19
},
{
"epoch": 0.12195121951219512,
"grad_norm": 8.698890724234088,
"learning_rate": 8.000000000000001e-06,
"loss": 3.4775,
"step": 20
},
{
"epoch": 0.12804878048780488,
"grad_norm": 8.307940622795766,
"learning_rate": 8.400000000000001e-06,
"loss": 3.1046,
"step": 21
},
{
"epoch": 0.13414634146341464,
"grad_norm": 7.873509354464809,
"learning_rate": 8.8e-06,
"loss": 2.8967,
"step": 22
},
{
"epoch": 0.1402439024390244,
"grad_norm": 6.74058652993515,
"learning_rate": 9.200000000000002e-06,
"loss": 2.7398,
"step": 23
},
{
"epoch": 0.14634146341463414,
"grad_norm": 5.6682482038936985,
"learning_rate": 9.600000000000001e-06,
"loss": 2.554,
"step": 24
},
{
"epoch": 0.1524390243902439,
"grad_norm": 5.196466384583255,
"learning_rate": 1e-05,
"loss": 2.4104,
"step": 25
},
{
"epoch": 0.15853658536585366,
"grad_norm": 4.379173110371979,
"learning_rate": 1.04e-05,
"loss": 2.0813,
"step": 26
},
{
"epoch": 0.16463414634146342,
"grad_norm": 4.041770670660932,
"learning_rate": 1.0800000000000002e-05,
"loss": 2.1974,
"step": 27
},
{
"epoch": 0.17073170731707318,
"grad_norm": 3.7890756018085083,
"learning_rate": 1.1200000000000001e-05,
"loss": 2.1635,
"step": 28
},
{
"epoch": 0.17682926829268292,
"grad_norm": 2.760454794268313,
"learning_rate": 1.16e-05,
"loss": 1.7247,
"step": 29
},
{
"epoch": 0.18292682926829268,
"grad_norm": 2.084883247086018,
"learning_rate": 1.2e-05,
"loss": 1.7248,
"step": 30
},
{
"epoch": 0.18902439024390244,
"grad_norm": 1.8946267393458731,
"learning_rate": 1.2400000000000002e-05,
"loss": 1.6519,
"step": 31
},
{
"epoch": 0.1951219512195122,
"grad_norm": 1.4683657417452,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.5153,
"step": 32
},
{
"epoch": 0.20121951219512196,
"grad_norm": 1.1510156850186328,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.4752,
"step": 33
},
{
"epoch": 0.2073170731707317,
"grad_norm": 0.9975006671404031,
"learning_rate": 1.3600000000000002e-05,
"loss": 1.3437,
"step": 34
},
{
"epoch": 0.21341463414634146,
"grad_norm": 1.175487116789021,
"learning_rate": 1.4e-05,
"loss": 1.5412,
"step": 35
},
{
"epoch": 0.21951219512195122,
"grad_norm": 1.0595320461986457,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.3513,
"step": 36
},
{
"epoch": 0.22560975609756098,
"grad_norm": 1.1536993931361366,
"learning_rate": 1.48e-05,
"loss": 1.387,
"step": 37
},
{
"epoch": 0.23170731707317074,
"grad_norm": 1.166131895120981,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.3192,
"step": 38
},
{
"epoch": 0.23780487804878048,
"grad_norm": 1.1083262024444887,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.3045,
"step": 39
},
{
"epoch": 0.24390243902439024,
"grad_norm": 1.0376815768281262,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3058,
"step": 40
},
{
"epoch": 0.25,
"grad_norm": 0.87640355368596,
"learning_rate": 1.64e-05,
"loss": 1.1079,
"step": 41
},
{
"epoch": 0.25609756097560976,
"grad_norm": 0.8012590361351394,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.0897,
"step": 42
},
{
"epoch": 0.2621951219512195,
"grad_norm": 0.7274182856521663,
"learning_rate": 1.72e-05,
"loss": 1.1044,
"step": 43
},
{
"epoch": 0.2682926829268293,
"grad_norm": 0.6432052930355101,
"learning_rate": 1.76e-05,
"loss": 1.071,
"step": 44
},
{
"epoch": 0.27439024390243905,
"grad_norm": 0.6162901066533818,
"learning_rate": 1.8e-05,
"loss": 1.0739,
"step": 45
},
{
"epoch": 0.2804878048780488,
"grad_norm": 0.4936365689365201,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.9854,
"step": 46
},
{
"epoch": 0.2865853658536585,
"grad_norm": 0.4413825753074836,
"learning_rate": 1.88e-05,
"loss": 0.9589,
"step": 47
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.3835552689369408,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.9157,
"step": 48
},
{
"epoch": 0.29878048780487804,
"grad_norm": 0.36169118496626246,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.9501,
"step": 49
},
{
"epoch": 0.3048780487804878,
"grad_norm": 0.3201102977202935,
"learning_rate": 2e-05,
"loss": 0.8649,
"step": 50
},
{
"epoch": 0.31097560975609756,
"grad_norm": 0.3301312685545821,
"learning_rate": 1.9999747405795057e-05,
"loss": 0.9233,
"step": 51
},
{
"epoch": 0.3170731707317073,
"grad_norm": 0.31971021695823615,
"learning_rate": 1.9998989635940996e-05,
"loss": 0.8435,
"step": 52
},
{
"epoch": 0.3231707317073171,
"grad_norm": 0.35515389044587536,
"learning_rate": 1.9997726728719468e-05,
"loss": 0.8589,
"step": 53
},
{
"epoch": 0.32926829268292684,
"grad_norm": 0.36743148858881,
"learning_rate": 1.9995958747931083e-05,
"loss": 0.8576,
"step": 54
},
{
"epoch": 0.3353658536585366,
"grad_norm": 0.42811375031790766,
"learning_rate": 1.9993685782892184e-05,
"loss": 0.9279,
"step": 55
},
{
"epoch": 0.34146341463414637,
"grad_norm": 0.41721558479353726,
"learning_rate": 1.9990907948430327e-05,
"loss": 0.8907,
"step": 56
},
{
"epoch": 0.3475609756097561,
"grad_norm": 0.35352961285727363,
"learning_rate": 1.9987625384878493e-05,
"loss": 0.8291,
"step": 57
},
{
"epoch": 0.35365853658536583,
"grad_norm": 0.2804864451654209,
"learning_rate": 1.998383825806799e-05,
"loss": 0.7566,
"step": 58
},
{
"epoch": 0.3597560975609756,
"grad_norm": 0.2882591085430372,
"learning_rate": 1.997954675932006e-05,
"loss": 0.8485,
"step": 59
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.2607140485168168,
"learning_rate": 1.9974751105436266e-05,
"loss": 0.8366,
"step": 60
},
{
"epoch": 0.3719512195121951,
"grad_norm": 0.23074985418491212,
"learning_rate": 1.9969451538687474e-05,
"loss": 0.8274,
"step": 61
},
{
"epoch": 0.3780487804878049,
"grad_norm": 0.23223595784320752,
"learning_rate": 1.9963648326801653e-05,
"loss": 0.9039,
"step": 62
},
{
"epoch": 0.38414634146341464,
"grad_norm": 0.17404598061817236,
"learning_rate": 1.9957341762950346e-05,
"loss": 0.6557,
"step": 63
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.20998407134341585,
"learning_rate": 1.9950532165733847e-05,
"loss": 0.7985,
"step": 64
},
{
"epoch": 0.39634146341463417,
"grad_norm": 0.194171035037221,
"learning_rate": 1.9943219879165113e-05,
"loss": 0.7393,
"step": 65
},
{
"epoch": 0.4024390243902439,
"grad_norm": 0.19325266400118835,
"learning_rate": 1.993540527265239e-05,
"loss": 0.7448,
"step": 66
},
{
"epoch": 0.40853658536585363,
"grad_norm": 0.22349087274155047,
"learning_rate": 1.992708874098054e-05,
"loss": 0.9037,
"step": 67
},
{
"epoch": 0.4146341463414634,
"grad_norm": 0.1952833479528782,
"learning_rate": 1.9918270704291104e-05,
"loss": 0.7685,
"step": 68
},
{
"epoch": 0.42073170731707316,
"grad_norm": 0.18405443762754753,
"learning_rate": 1.9908951608061078e-05,
"loss": 0.6956,
"step": 69
},
{
"epoch": 0.4268292682926829,
"grad_norm": 0.18503738755792795,
"learning_rate": 1.98991319230804e-05,
"loss": 0.7063,
"step": 70
},
{
"epoch": 0.4329268292682927,
"grad_norm": 0.19690421628538282,
"learning_rate": 1.9888812145428172e-05,
"loss": 0.7793,
"step": 71
},
{
"epoch": 0.43902439024390244,
"grad_norm": 0.16284982763895423,
"learning_rate": 1.9877992796447604e-05,
"loss": 0.6833,
"step": 72
},
{
"epoch": 0.4451219512195122,
"grad_norm": 0.14309181240903507,
"learning_rate": 1.9866674422719666e-05,
"loss": 0.6706,
"step": 73
},
{
"epoch": 0.45121951219512196,
"grad_norm": 0.15477185844290706,
"learning_rate": 1.9854857596035476e-05,
"loss": 0.7312,
"step": 74
},
{
"epoch": 0.4573170731707317,
"grad_norm": 0.1293717417561759,
"learning_rate": 1.984254291336743e-05,
"loss": 0.6589,
"step": 75
},
{
"epoch": 0.4634146341463415,
"grad_norm": 0.12123882539222287,
"learning_rate": 1.982973099683902e-05,
"loss": 0.62,
"step": 76
},
{
"epoch": 0.4695121951219512,
"grad_norm": 0.13924219962219428,
"learning_rate": 1.9816422493693417e-05,
"loss": 0.7501,
"step": 77
},
{
"epoch": 0.47560975609756095,
"grad_norm": 0.11917935845470132,
"learning_rate": 1.9802618076260784e-05,
"loss": 0.6819,
"step": 78
},
{
"epoch": 0.4817073170731707,
"grad_norm": 0.11800076531829735,
"learning_rate": 1.9788318441924276e-05,
"loss": 0.615,
"step": 79
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.11971198977185014,
"learning_rate": 1.9773524313084857e-05,
"loss": 0.6414,
"step": 80
},
{
"epoch": 0.49390243902439024,
"grad_norm": 0.13027819783391864,
"learning_rate": 1.9758236437124768e-05,
"loss": 0.6463,
"step": 81
},
{
"epoch": 0.5,
"grad_norm": 0.12460980978522168,
"learning_rate": 1.9742455586369786e-05,
"loss": 0.6529,
"step": 82
},
{
"epoch": 0.5060975609756098,
"grad_norm": 0.13756050372643888,
"learning_rate": 1.972618255805019e-05,
"loss": 0.7114,
"step": 83
},
{
"epoch": 0.5121951219512195,
"grad_norm": 0.12522657328282585,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.6289,
"step": 84
},
{
"epoch": 0.5182926829268293,
"grad_norm": 0.12334790238705769,
"learning_rate": 1.9692163281918016e-05,
"loss": 0.6985,
"step": 85
},
{
"epoch": 0.524390243902439,
"grad_norm": 0.10972400668603455,
"learning_rate": 1.9674418752719835e-05,
"loss": 0.6453,
"step": 86
},
{
"epoch": 0.5304878048780488,
"grad_norm": 0.10309779127439034,
"learning_rate": 1.9656185483099027e-05,
"loss": 0.6347,
"step": 87
},
{
"epoch": 0.5365853658536586,
"grad_norm": 0.10717651442117264,
"learning_rate": 1.963746439417924e-05,
"loss": 0.6389,
"step": 88
},
{
"epoch": 0.5426829268292683,
"grad_norm": 0.10108518878859295,
"learning_rate": 1.961825643172819e-05,
"loss": 0.6449,
"step": 89
},
{
"epoch": 0.5487804878048781,
"grad_norm": 0.10582770991792313,
"learning_rate": 1.959856256610988e-05,
"loss": 0.6407,
"step": 90
},
{
"epoch": 0.5548780487804879,
"grad_norm": 0.09805540518718314,
"learning_rate": 1.9578383792235573e-05,
"loss": 0.6146,
"step": 91
},
{
"epoch": 0.5609756097560976,
"grad_norm": 0.09178135236393883,
"learning_rate": 1.9557721129513538e-05,
"loss": 0.5477,
"step": 92
},
{
"epoch": 0.5670731707317073,
"grad_norm": 0.09658702034736838,
"learning_rate": 1.9536575621797546e-05,
"loss": 0.5892,
"step": 93
},
{
"epoch": 0.573170731707317,
"grad_norm": 0.09736462601933246,
"learning_rate": 1.9514948337334144e-05,
"loss": 0.6138,
"step": 94
},
{
"epoch": 0.5792682926829268,
"grad_norm": 0.08905830955745823,
"learning_rate": 1.9492840368708668e-05,
"loss": 0.5399,
"step": 95
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.09660552709108973,
"learning_rate": 1.947025283279008e-05,
"loss": 0.6364,
"step": 96
},
{
"epoch": 0.5914634146341463,
"grad_norm": 0.09133915004182258,
"learning_rate": 1.9447186870674505e-05,
"loss": 0.5921,
"step": 97
},
{
"epoch": 0.5975609756097561,
"grad_norm": 0.0933997331456134,
"learning_rate": 1.9423643647627625e-05,
"loss": 0.6915,
"step": 98
},
{
"epoch": 0.6036585365853658,
"grad_norm": 0.08353569640772877,
"learning_rate": 1.9399624353025774e-05,
"loss": 0.6408,
"step": 99
},
{
"epoch": 0.6097560975609756,
"grad_norm": 0.08634151989354441,
"learning_rate": 1.937513020029588e-05,
"loss": 0.5963,
"step": 100
},
{
"epoch": 0.6158536585365854,
"grad_norm": 0.08898929542438962,
"learning_rate": 1.9350162426854152e-05,
"loss": 0.595,
"step": 101
},
{
"epoch": 0.6219512195121951,
"grad_norm": 0.08305460595097429,
"learning_rate": 1.932472229404356e-05,
"loss": 0.5669,
"step": 102
},
{
"epoch": 0.6280487804878049,
"grad_norm": 0.08888872021259317,
"learning_rate": 1.9298811087070134e-05,
"loss": 0.6165,
"step": 103
},
{
"epoch": 0.6341463414634146,
"grad_norm": 0.08058144202586265,
"learning_rate": 1.9272430114938018e-05,
"loss": 0.5728,
"step": 104
},
{
"epoch": 0.6402439024390244,
"grad_norm": 0.08217790638268045,
"learning_rate": 1.9245580710383344e-05,
"loss": 0.577,
"step": 105
},
{
"epoch": 0.6463414634146342,
"grad_norm": 0.07659807407519503,
"learning_rate": 1.9218264229806917e-05,
"loss": 0.5881,
"step": 106
},
{
"epoch": 0.6524390243902439,
"grad_norm": 0.07540223196226505,
"learning_rate": 1.9190482053205673e-05,
"loss": 0.62,
"step": 107
},
{
"epoch": 0.6585365853658537,
"grad_norm": 0.08107411301661235,
"learning_rate": 1.9162235584102973e-05,
"loss": 0.6488,
"step": 108
},
{
"epoch": 0.6646341463414634,
"grad_norm": 0.07719107791626204,
"learning_rate": 1.91335262494777e-05,
"loss": 0.5771,
"step": 109
},
{
"epoch": 0.6707317073170732,
"grad_norm": 0.08173053132540807,
"learning_rate": 1.9104355499692166e-05,
"loss": 0.5666,
"step": 110
},
{
"epoch": 0.676829268292683,
"grad_norm": 0.07965621160015979,
"learning_rate": 1.9074724808418837e-05,
"loss": 0.6113,
"step": 111
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.08980818058271649,
"learning_rate": 1.9044635672565898e-05,
"loss": 0.6089,
"step": 112
},
{
"epoch": 0.6890243902439024,
"grad_norm": 0.07194337673119468,
"learning_rate": 1.9014089612201612e-05,
"loss": 0.5728,
"step": 113
},
{
"epoch": 0.6951219512195121,
"grad_norm": 0.08706992381814065,
"learning_rate": 1.8983088170477556e-05,
"loss": 0.7144,
"step": 114
},
{
"epoch": 0.7012195121951219,
"grad_norm": 0.06652366663030163,
"learning_rate": 1.8951632913550625e-05,
"loss": 0.5026,
"step": 115
},
{
"epoch": 0.7073170731707317,
"grad_norm": 0.07192252823061802,
"learning_rate": 1.8919725430503946e-05,
"loss": 0.5533,
"step": 116
},
{
"epoch": 0.7134146341463414,
"grad_norm": 0.08014714412171235,
"learning_rate": 1.888736733326658e-05,
"loss": 0.6077,
"step": 117
},
{
"epoch": 0.7195121951219512,
"grad_norm": 0.0751151009465322,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.5554,
"step": 118
},
{
"epoch": 0.725609756097561,
"grad_norm": 0.08384104993084439,
"learning_rate": 1.8821305857675997e-05,
"loss": 0.6079,
"step": 119
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.07596092975802397,
"learning_rate": 1.8787605816671956e-05,
"loss": 0.6262,
"step": 120
},
{
"epoch": 0.7378048780487805,
"grad_norm": 0.06984378368031652,
"learning_rate": 1.875346183600699e-05,
"loss": 0.5579,
"step": 121
},
{
"epoch": 0.7439024390243902,
"grad_norm": 0.06972708877396938,
"learning_rate": 1.8718875640595432e-05,
"loss": 0.5568,
"step": 122
},
{
"epoch": 0.75,
"grad_norm": 0.0708625905901818,
"learning_rate": 1.8683848977691784e-05,
"loss": 0.582,
"step": 123
},
{
"epoch": 0.7560975609756098,
"grad_norm": 0.07712676436551813,
"learning_rate": 1.864838361680247e-05,
"loss": 0.5935,
"step": 124
},
{
"epoch": 0.7621951219512195,
"grad_norm": 0.06823828416067228,
"learning_rate": 1.8612481349596406e-05,
"loss": 0.5503,
"step": 125
},
{
"epoch": 0.7682926829268293,
"grad_norm": 0.07681189237975657,
"learning_rate": 1.8576143989814524e-05,
"loss": 0.6412,
"step": 126
},
{
"epoch": 0.774390243902439,
"grad_norm": 0.06850293773437466,
"learning_rate": 1.8539373373178126e-05,
"loss": 0.5771,
"step": 127
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.06791710528546226,
"learning_rate": 1.8502171357296144e-05,
"loss": 0.6076,
"step": 128
},
{
"epoch": 0.7865853658536586,
"grad_norm": 0.06599767271998445,
"learning_rate": 1.8464539821571302e-05,
"loss": 0.5583,
"step": 129
},
{
"epoch": 0.7926829268292683,
"grad_norm": 0.07021764659304032,
"learning_rate": 1.8426480667105178e-05,
"loss": 0.5439,
"step": 130
},
{
"epoch": 0.7987804878048781,
"grad_norm": 0.06809097796108884,
"learning_rate": 1.8387995816602137e-05,
"loss": 0.5584,
"step": 131
},
{
"epoch": 0.8048780487804879,
"grad_norm": 0.07552768082187959,
"learning_rate": 1.8349087214272222e-05,
"loss": 0.6235,
"step": 132
},
{
"epoch": 0.8109756097560976,
"grad_norm": 0.07388542257010466,
"learning_rate": 1.830975682573293e-05,
"loss": 0.5605,
"step": 133
},
{
"epoch": 0.8170731707317073,
"grad_norm": 0.0734139769106561,
"learning_rate": 1.8270006637909907e-05,
"loss": 0.4911,
"step": 134
},
{
"epoch": 0.823170731707317,
"grad_norm": 0.06661902834297227,
"learning_rate": 1.8229838658936566e-05,
"loss": 0.5263,
"step": 135
},
{
"epoch": 0.8292682926829268,
"grad_norm": 0.08000530324170357,
"learning_rate": 1.818925491805265e-05,
"loss": 0.6063,
"step": 136
},
{
"epoch": 0.8353658536585366,
"grad_norm": 0.06955587209390625,
"learning_rate": 1.8148257465501718e-05,
"loss": 0.5664,
"step": 137
},
{
"epoch": 0.8414634146341463,
"grad_norm": 0.06999764411415345,
"learning_rate": 1.810684837242755e-05,
"loss": 0.5731,
"step": 138
},
{
"epoch": 0.8475609756097561,
"grad_norm": 0.07392487537186451,
"learning_rate": 1.8065029730769534e-05,
"loss": 0.5771,
"step": 139
},
{
"epoch": 0.8536585365853658,
"grad_norm": 0.07023462293275694,
"learning_rate": 1.8022803653156983e-05,
"loss": 0.5586,
"step": 140
},
{
"epoch": 0.8597560975609756,
"grad_norm": 0.0754370714846295,
"learning_rate": 1.7980172272802398e-05,
"loss": 0.5386,
"step": 141
},
{
"epoch": 0.8658536585365854,
"grad_norm": 0.06014064520411485,
"learning_rate": 1.7937137743393695e-05,
"loss": 0.5019,
"step": 142
},
{
"epoch": 0.8719512195121951,
"grad_norm": 0.0684039280130895,
"learning_rate": 1.7893702238985433e-05,
"loss": 0.5593,
"step": 143
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.07523983909087964,
"learning_rate": 1.784986795388895e-05,
"loss": 0.608,
"step": 144
},
{
"epoch": 0.8841463414634146,
"grad_norm": 0.06631906454003386,
"learning_rate": 1.7805637102561516e-05,
"loss": 0.5496,
"step": 145
},
{
"epoch": 0.8902439024390244,
"grad_norm": 0.06861615150079985,
"learning_rate": 1.776101191949449e-05,
"loss": 0.543,
"step": 146
},
{
"epoch": 0.8963414634146342,
"grad_norm": 0.06396979787588344,
"learning_rate": 1.771599465910039e-05,
"loss": 0.565,
"step": 147
},
{
"epoch": 0.9024390243902439,
"grad_norm": 0.06363428283758014,
"learning_rate": 1.7670587595599034e-05,
"loss": 0.5657,
"step": 148
},
{
"epoch": 0.9085365853658537,
"grad_norm": 0.06397911394577013,
"learning_rate": 1.7624793022902648e-05,
"loss": 0.5343,
"step": 149
},
{
"epoch": 0.9146341463414634,
"grad_norm": 0.06974823526096927,
"learning_rate": 1.757861325449997e-05,
"loss": 0.5022,
"step": 150
},
{
"epoch": 0.9207317073170732,
"grad_norm": 0.06210202598875651,
"learning_rate": 1.753205062333937e-05,
"loss": 0.486,
"step": 151
},
{
"epoch": 0.926829268292683,
"grad_norm": 0.07077403367537935,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.4869,
"step": 152
},
{
"epoch": 0.9329268292682927,
"grad_norm": 0.06531115051828462,
"learning_rate": 1.7437786201128003e-05,
"loss": 0.5544,
"step": 153
},
{
"epoch": 0.9390243902439024,
"grad_norm": 0.0705614281634583,
"learning_rate": 1.7390089172206594e-05,
"loss": 0.5951,
"step": 154
},
{
"epoch": 0.9451219512195121,
"grad_norm": 0.08044467420017791,
"learning_rate": 1.73420188045454e-05,
"loss": 0.5882,
"step": 155
},
{
"epoch": 0.9512195121951219,
"grad_norm": 0.06778031094228984,
"learning_rate": 1.7293577526603684e-05,
"loss": 0.5307,
"step": 156
},
{
"epoch": 0.9573170731707317,
"grad_norm": 0.07735829065491621,
"learning_rate": 1.724476778557866e-05,
"loss": 0.5803,
"step": 157
},
{
"epoch": 0.9634146341463414,
"grad_norm": 0.06740940804822154,
"learning_rate": 1.719559204728188e-05,
"loss": 0.517,
"step": 158
},
{
"epoch": 0.9695121951219512,
"grad_norm": 0.07639960491412072,
"learning_rate": 1.7146052796014646e-05,
"loss": 0.5753,
"step": 159
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.061081285281898906,
"learning_rate": 1.7096152534442515e-05,
"loss": 0.4686,
"step": 160
},
{
"epoch": 0.9817073170731707,
"grad_norm": 0.06793168030335978,
"learning_rate": 1.704589378346886e-05,
"loss": 0.5447,
"step": 161
},
{
"epoch": 0.9878048780487805,
"grad_norm": 0.07395695811692952,
"learning_rate": 1.6995279082107537e-05,
"loss": 0.5657,
"step": 162
},
{
"epoch": 0.9939024390243902,
"grad_norm": 0.065555783375453,
"learning_rate": 1.6944310987354597e-05,
"loss": 0.5449,
"step": 163
},
{
"epoch": 1.0,
"grad_norm": 0.07050013910319028,
"learning_rate": 1.689299207405911e-05,
"loss": 0.5184,
"step": 164
},
{
"epoch": 1.0060975609756098,
"grad_norm": 0.0733901169113276,
"learning_rate": 1.6841324934793096e-05,
"loss": 0.5226,
"step": 165
},
{
"epoch": 1.0121951219512195,
"grad_norm": 0.06589566948346295,
"learning_rate": 1.678931217972055e-05,
"loss": 0.4873,
"step": 166
},
{
"epoch": 1.0182926829268293,
"grad_norm": 0.07475734276172114,
"learning_rate": 1.6736956436465573e-05,
"loss": 0.4827,
"step": 167
},
{
"epoch": 1.024390243902439,
"grad_norm": 0.058903086819527835,
"learning_rate": 1.6684260349979637e-05,
"loss": 0.5053,
"step": 168
},
{
"epoch": 1.0304878048780488,
"grad_norm": 0.06353582735567607,
"learning_rate": 1.6631226582407954e-05,
"loss": 0.5482,
"step": 169
},
{
"epoch": 1.0365853658536586,
"grad_norm": 0.06380787517800868,
"learning_rate": 1.6577857812954994e-05,
"loss": 0.5248,
"step": 170
},
{
"epoch": 1.0426829268292683,
"grad_norm": 0.06730058327208745,
"learning_rate": 1.6524156737749132e-05,
"loss": 0.4964,
"step": 171
},
{
"epoch": 1.048780487804878,
"grad_norm": 0.06293892448460658,
"learning_rate": 1.6470126069706456e-05,
"loss": 0.5168,
"step": 172
},
{
"epoch": 1.0548780487804879,
"grad_norm": 0.0694624094267741,
"learning_rate": 1.641576853839369e-05,
"loss": 0.5526,
"step": 173
},
{
"epoch": 1.0609756097560976,
"grad_norm": 0.06478295672497261,
"learning_rate": 1.6361086889890307e-05,
"loss": 0.4853,
"step": 174
},
{
"epoch": 1.0670731707317074,
"grad_norm": 0.06608027299921394,
"learning_rate": 1.6306083886649823e-05,
"loss": 0.5226,
"step": 175
},
{
"epoch": 1.0731707317073171,
"grad_norm": 0.06681662898135733,
"learning_rate": 1.6250762307360206e-05,
"loss": 0.537,
"step": 176
},
{
"epoch": 1.079268292682927,
"grad_norm": 0.06053711484659685,
"learning_rate": 1.6195124946803527e-05,
"loss": 0.4683,
"step": 177
},
{
"epoch": 1.0853658536585367,
"grad_norm": 0.07013371267663553,
"learning_rate": 1.6139174615714753e-05,
"loss": 0.5767,
"step": 178
},
{
"epoch": 1.0914634146341464,
"grad_norm": 0.06617676868427722,
"learning_rate": 1.6082914140639768e-05,
"loss": 0.5357,
"step": 179
},
{
"epoch": 1.0975609756097562,
"grad_norm": 0.06805254845313483,
"learning_rate": 1.6026346363792565e-05,
"loss": 0.5179,
"step": 180
},
{
"epoch": 1.103658536585366,
"grad_norm": 0.06882718691143014,
"learning_rate": 1.596947414291167e-05,
"loss": 0.5665,
"step": 181
},
{
"epoch": 1.1097560975609757,
"grad_norm": 0.06329003072823183,
"learning_rate": 1.591230035111576e-05,
"loss": 0.512,
"step": 182
},
{
"epoch": 1.1158536585365855,
"grad_norm": 0.06713658217392786,
"learning_rate": 1.5854827876758535e-05,
"loss": 0.4958,
"step": 183
},
{
"epoch": 1.1219512195121952,
"grad_norm": 0.0677868436901709,
"learning_rate": 1.5797059623282787e-05,
"loss": 0.4715,
"step": 184
},
{
"epoch": 1.1280487804878048,
"grad_norm": 0.06613758460632664,
"learning_rate": 1.573899850907373e-05,
"loss": 0.4829,
"step": 185
},
{
"epoch": 1.1341463414634148,
"grad_norm": 0.06887716201911288,
"learning_rate": 1.568064746731156e-05,
"loss": 0.5418,
"step": 186
},
{
"epoch": 1.1402439024390243,
"grad_norm": 0.07682982503987941,
"learning_rate": 1.5622009445823274e-05,
"loss": 0.5929,
"step": 187
},
{
"epoch": 1.146341463414634,
"grad_norm": 0.06571420924574008,
"learning_rate": 1.5563087406933762e-05,
"loss": 0.511,
"step": 188
},
{
"epoch": 1.1524390243902438,
"grad_norm": 0.0664511649725902,
"learning_rate": 1.550388432731613e-05,
"loss": 0.4558,
"step": 189
},
{
"epoch": 1.1585365853658536,
"grad_norm": 0.07492574855512298,
"learning_rate": 1.5444403197841345e-05,
"loss": 0.5396,
"step": 190
},
{
"epoch": 1.1646341463414633,
"grad_norm": 0.07122982585751268,
"learning_rate": 1.5384647023427136e-05,
"loss": 0.5301,
"step": 191
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.0658921691477124,
"learning_rate": 1.5324618822886167e-05,
"loss": 0.4947,
"step": 192
},
{
"epoch": 1.1768292682926829,
"grad_norm": 0.07813967262256015,
"learning_rate": 1.526432162877356e-05,
"loss": 0.5522,
"step": 193
},
{
"epoch": 1.1829268292682926,
"grad_norm": 0.06731988936901052,
"learning_rate": 1.5203758487233677e-05,
"loss": 0.476,
"step": 194
},
{
"epoch": 1.1890243902439024,
"grad_norm": 0.07228505085779448,
"learning_rate": 1.514293245784623e-05,
"loss": 0.5278,
"step": 195
},
{
"epoch": 1.1951219512195121,
"grad_norm": 0.07452980948980172,
"learning_rate": 1.5081846613471736e-05,
"loss": 0.5773,
"step": 196
},
{
"epoch": 1.201219512195122,
"grad_norm": 0.06955858620563475,
"learning_rate": 1.5020504040096241e-05,
"loss": 0.5147,
"step": 197
},
{
"epoch": 1.2073170731707317,
"grad_norm": 0.07065384450910228,
"learning_rate": 1.4958907836675467e-05,
"loss": 0.5275,
"step": 198
},
{
"epoch": 1.2134146341463414,
"grad_norm": 0.07110195242202547,
"learning_rate": 1.489706111497821e-05,
"loss": 0.4819,
"step": 199
},
{
"epoch": 1.2195121951219512,
"grad_norm": 0.06820779355050262,
"learning_rate": 1.4834966999429179e-05,
"loss": 0.521,
"step": 200
},
{
"epoch": 1.225609756097561,
"grad_norm": 0.06964782085920465,
"learning_rate": 1.4772628626951114e-05,
"loss": 0.5234,
"step": 201
},
{
"epoch": 1.2317073170731707,
"grad_norm": 0.06930582742745629,
"learning_rate": 1.4710049146806348e-05,
"loss": 0.4911,
"step": 202
},
{
"epoch": 1.2378048780487805,
"grad_norm": 0.06741877286597113,
"learning_rate": 1.4647231720437687e-05,
"loss": 0.5215,
"step": 203
},
{
"epoch": 1.2439024390243902,
"grad_norm": 0.06792423855223992,
"learning_rate": 1.4584179521308703e-05,
"loss": 0.5117,
"step": 204
},
{
"epoch": 1.25,
"grad_norm": 0.07167343063555995,
"learning_rate": 1.4520895734743419e-05,
"loss": 0.538,
"step": 205
},
{
"epoch": 1.2560975609756098,
"grad_norm": 0.07256282666966574,
"learning_rate": 1.4457383557765385e-05,
"loss": 0.5529,
"step": 206
},
{
"epoch": 1.2621951219512195,
"grad_norm": 0.06757284467612035,
"learning_rate": 1.4393646198936169e-05,
"loss": 0.4892,
"step": 207
},
{
"epoch": 1.2682926829268293,
"grad_norm": 0.07188557520197532,
"learning_rate": 1.4329686878193271e-05,
"loss": 0.5602,
"step": 208
},
{
"epoch": 1.274390243902439,
"grad_norm": 0.07747770661719462,
"learning_rate": 1.4265508826687442e-05,
"loss": 0.5658,
"step": 209
},
{
"epoch": 1.2804878048780488,
"grad_norm": 0.06809716748651344,
"learning_rate": 1.4201115286619464e-05,
"loss": 0.4713,
"step": 210
},
{
"epoch": 1.2865853658536586,
"grad_norm": 0.07448043410586613,
"learning_rate": 1.4136509511076347e-05,
"loss": 0.5311,
"step": 211
},
{
"epoch": 1.2926829268292683,
"grad_norm": 0.08085706529770824,
"learning_rate": 1.4071694763866988e-05,
"loss": 0.5617,
"step": 212
},
{
"epoch": 1.298780487804878,
"grad_norm": 0.0728263083382011,
"learning_rate": 1.4006674319357298e-05,
"loss": 0.4792,
"step": 213
},
{
"epoch": 1.3048780487804879,
"grad_norm": 0.0670268791274602,
"learning_rate": 1.3941451462304778e-05,
"loss": 0.4675,
"step": 214
},
{
"epoch": 1.3109756097560976,
"grad_norm": 0.08039512209656642,
"learning_rate": 1.387602948769257e-05,
"loss": 0.5056,
"step": 215
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.06817800650730356,
"learning_rate": 1.3810411700563005e-05,
"loss": 0.4739,
"step": 216
},
{
"epoch": 1.3231707317073171,
"grad_norm": 0.07057112884463199,
"learning_rate": 1.3744601415850637e-05,
"loss": 0.5573,
"step": 217
},
{
"epoch": 1.329268292682927,
"grad_norm": 0.06981321032667759,
"learning_rate": 1.3678601958214779e-05,
"loss": 0.5014,
"step": 218
},
{
"epoch": 1.3353658536585367,
"grad_norm": 0.06993357941167098,
"learning_rate": 1.3612416661871532e-05,
"loss": 0.524,
"step": 219
},
{
"epoch": 1.3414634146341464,
"grad_norm": 0.06483182405879527,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.4806,
"step": 220
},
{
"epoch": 1.3475609756097562,
"grad_norm": 0.07027433088183081,
"learning_rate": 1.3479501936700161e-05,
"loss": 0.4944,
"step": 221
},
{
"epoch": 1.3536585365853657,
"grad_norm": 0.08129330060634665,
"learning_rate": 1.3412779222569907e-05,
"loss": 0.5541,
"step": 222
},
{
"epoch": 1.3597560975609757,
"grad_norm": 0.06825577692381518,
"learning_rate": 1.3345884098788775e-05,
"loss": 0.473,
"step": 223
},
{
"epoch": 1.3658536585365852,
"grad_norm": 0.06613216751504289,
"learning_rate": 1.3278819944820893e-05,
"loss": 0.4318,
"step": 224
},
{
"epoch": 1.3719512195121952,
"grad_norm": 0.07283827698992242,
"learning_rate": 1.3211590148669586e-05,
"loss": 0.5125,
"step": 225
},
{
"epoch": 1.3780487804878048,
"grad_norm": 0.06892969965848932,
"learning_rate": 1.314419810670624e-05,
"loss": 0.4533,
"step": 226
},
{
"epoch": 1.3841463414634148,
"grad_norm": 0.07855732769188979,
"learning_rate": 1.3076647223498703e-05,
"loss": 0.5461,
"step": 227
},
{
"epoch": 1.3902439024390243,
"grad_norm": 0.07382325404677605,
"learning_rate": 1.3008940911639302e-05,
"loss": 0.4379,
"step": 228
},
{
"epoch": 1.3963414634146343,
"grad_norm": 0.07190670195425049,
"learning_rate": 1.2941082591572443e-05,
"loss": 0.533,
"step": 229
},
{
"epoch": 1.4024390243902438,
"grad_norm": 0.06888657163333817,
"learning_rate": 1.2873075691421808e-05,
"loss": 0.5146,
"step": 230
},
{
"epoch": 1.4085365853658536,
"grad_norm": 0.06879068933807653,
"learning_rate": 1.2804923646817169e-05,
"loss": 0.542,
"step": 231
},
{
"epoch": 1.4146341463414633,
"grad_norm": 0.06316663754312257,
"learning_rate": 1.2736629900720832e-05,
"loss": 0.4763,
"step": 232
},
{
"epoch": 1.420731707317073,
"grad_norm": 0.0745177281343944,
"learning_rate": 1.2668197903253694e-05,
"loss": 0.5063,
"step": 233
},
{
"epoch": 1.4268292682926829,
"grad_norm": 0.07549236104105322,
"learning_rate": 1.2599631111520956e-05,
"loss": 0.4871,
"step": 234
},
{
"epoch": 1.4329268292682926,
"grad_norm": 0.07608890942436555,
"learning_rate": 1.2530932989437463e-05,
"loss": 0.5216,
"step": 235
},
{
"epoch": 1.4390243902439024,
"grad_norm": 0.08852428564425496,
"learning_rate": 1.2462107007552726e-05,
"loss": 0.5814,
"step": 236
},
{
"epoch": 1.4451219512195121,
"grad_norm": 0.07169927489263321,
"learning_rate": 1.2393156642875579e-05,
"loss": 0.5097,
"step": 237
},
{
"epoch": 1.451219512195122,
"grad_norm": 0.0714844858843735,
"learning_rate": 1.2324085378698529e-05,
"loss": 0.4943,
"step": 238
},
{
"epoch": 1.4573170731707317,
"grad_norm": 0.07303490526979263,
"learning_rate": 1.2254896704421789e-05,
"loss": 0.5254,
"step": 239
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.07649861873490388,
"learning_rate": 1.2185594115376991e-05,
"loss": 0.4628,
"step": 240
},
{
"epoch": 1.4695121951219512,
"grad_norm": 0.07968432654727967,
"learning_rate": 1.211618111265061e-05,
"loss": 0.5311,
"step": 241
},
{
"epoch": 1.475609756097561,
"grad_norm": 0.08164340764032027,
"learning_rate": 1.2046661202907101e-05,
"loss": 0.5082,
"step": 242
},
{
"epoch": 1.4817073170731707,
"grad_norm": 0.07536455518754334,
"learning_rate": 1.1977037898211723e-05,
"loss": 0.4963,
"step": 243
},
{
"epoch": 1.4878048780487805,
"grad_norm": 0.07838988741024766,
"learning_rate": 1.1907314715853138e-05,
"loss": 0.4964,
"step": 244
},
{
"epoch": 1.4939024390243902,
"grad_norm": 0.07681652298019655,
"learning_rate": 1.1837495178165706e-05,
"loss": 0.531,
"step": 245
},
{
"epoch": 1.5,
"grad_norm": 0.07585351296872528,
"learning_rate": 1.176758281235155e-05,
"loss": 0.4971,
"step": 246
},
{
"epoch": 1.5060975609756098,
"grad_norm": 0.07391818296648663,
"learning_rate": 1.1697581150302362e-05,
"loss": 0.5189,
"step": 247
},
{
"epoch": 1.5121951219512195,
"grad_norm": 0.07240466805493384,
"learning_rate": 1.1627493728420978e-05,
"loss": 0.4696,
"step": 248
},
{
"epoch": 1.5182926829268293,
"grad_norm": 0.07336052530563156,
"learning_rate": 1.1557324087442719e-05,
"loss": 0.5158,
"step": 249
},
{
"epoch": 1.524390243902439,
"grad_norm": 0.0734922713913184,
"learning_rate": 1.1487075772256517e-05,
"loss": 0.5013,
"step": 250
},
{
"epoch": 1.5304878048780488,
"grad_norm": 0.07041146771920755,
"learning_rate": 1.1416752331725842e-05,
"loss": 0.4925,
"step": 251
},
{
"epoch": 1.5365853658536586,
"grad_norm": 0.07749242303496245,
"learning_rate": 1.1346357318509395e-05,
"loss": 0.5115,
"step": 252
},
{
"epoch": 1.5426829268292683,
"grad_norm": 0.06688029575585173,
"learning_rate": 1.1275894288881664e-05,
"loss": 0.434,
"step": 253
},
{
"epoch": 1.548780487804878,
"grad_norm": 0.07880736062509318,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.513,
"step": 254
},
{
"epoch": 1.5548780487804879,
"grad_norm": 0.07925969227352526,
"learning_rate": 1.1134778422490971e-05,
"loss": 0.5467,
"step": 255
},
{
"epoch": 1.5609756097560976,
"grad_norm": 0.07331952124747425,
"learning_rate": 1.1064132714738024e-05,
"loss": 0.5394,
"step": 256
},
{
"epoch": 1.5670731707317072,
"grad_norm": 0.06881324378789994,
"learning_rate": 1.0993433248233672e-05,
"loss": 0.481,
"step": 257
},
{
"epoch": 1.5731707317073171,
"grad_norm": 0.0760805406679711,
"learning_rate": 1.092268359463302e-05,
"loss": 0.4998,
"step": 258
},
{
"epoch": 1.5792682926829267,
"grad_norm": 0.0723622186584405,
"learning_rate": 1.0851887328126569e-05,
"loss": 0.4989,
"step": 259
},
{
"epoch": 1.5853658536585367,
"grad_norm": 0.0691805814488331,
"learning_rate": 1.0781048025259648e-05,
"loss": 0.4491,
"step": 260
},
{
"epoch": 1.5914634146341462,
"grad_norm": 0.07332736517126122,
"learning_rate": 1.0710169264751733e-05,
"loss": 0.4767,
"step": 261
},
{
"epoch": 1.5975609756097562,
"grad_norm": 0.07503167590781622,
"learning_rate": 1.0639254627315658e-05,
"loss": 0.5108,
"step": 262
},
{
"epoch": 1.6036585365853657,
"grad_norm": 0.07676152916231047,
"learning_rate": 1.0568307695476712e-05,
"loss": 0.5324,
"step": 263
},
{
"epoch": 1.6097560975609757,
"grad_norm": 0.0834575992691424,
"learning_rate": 1.049733205339167e-05,
"loss": 0.5628,
"step": 264
},
{
"epoch": 1.6158536585365852,
"grad_norm": 0.07979828063308889,
"learning_rate": 1.0426331286667701e-05,
"loss": 0.5017,
"step": 265
},
{
"epoch": 1.6219512195121952,
"grad_norm": 0.07227807983837574,
"learning_rate": 1.0355308982181254e-05,
"loss": 0.4286,
"step": 266
},
{
"epoch": 1.6280487804878048,
"grad_norm": 0.08033151020781615,
"learning_rate": 1.0284268727896833e-05,
"loss": 0.4991,
"step": 267
},
{
"epoch": 1.6341463414634148,
"grad_norm": 0.07726754814987509,
"learning_rate": 1.0213214112685747e-05,
"loss": 0.5663,
"step": 268
},
{
"epoch": 1.6402439024390243,
"grad_norm": 0.06975489529697236,
"learning_rate": 1.0142148726144807e-05,
"loss": 0.4509,
"step": 269
},
{
"epoch": 1.6463414634146343,
"grad_norm": 0.08607941878366727,
"learning_rate": 1.0071076158414977e-05,
"loss": 0.5012,
"step": 270
},
{
"epoch": 1.6524390243902438,
"grad_norm": 0.07924808288315173,
"learning_rate": 1e-05,
"loss": 0.4968,
"step": 271
},
{
"epoch": 1.6585365853658538,
"grad_norm": 0.07521641587131223,
"learning_rate": 9.928923841585025e-06,
"loss": 0.5333,
"step": 272
},
{
"epoch": 1.6646341463414633,
"grad_norm": 0.0810914472998851,
"learning_rate": 9.857851273855195e-06,
"loss": 0.5256,
"step": 273
},
{
"epoch": 1.6707317073170733,
"grad_norm": 0.07570611716859411,
"learning_rate": 9.786785887314255e-06,
"loss": 0.4844,
"step": 274
},
{
"epoch": 1.6768292682926829,
"grad_norm": 0.08692043191937647,
"learning_rate": 9.715731272103172e-06,
"loss": 0.55,
"step": 275
},
{
"epoch": 1.6829268292682928,
"grad_norm": 0.07549433485077096,
"learning_rate": 9.644691017818752e-06,
"loss": 0.4599,
"step": 276
},
{
"epoch": 1.6890243902439024,
"grad_norm": 0.07013095295478262,
"learning_rate": 9.573668713332305e-06,
"loss": 0.4641,
"step": 277
},
{
"epoch": 1.6951219512195121,
"grad_norm": 0.085732485667367,
"learning_rate": 9.502667946608332e-06,
"loss": 0.5409,
"step": 278
},
{
"epoch": 1.701219512195122,
"grad_norm": 0.0786732581180552,
"learning_rate": 9.43169230452329e-06,
"loss": 0.5047,
"step": 279
},
{
"epoch": 1.7073170731707317,
"grad_norm": 0.06973904067642213,
"learning_rate": 9.360745372684346e-06,
"loss": 0.4611,
"step": 280
},
{
"epoch": 1.7134146341463414,
"grad_norm": 0.07149507221102347,
"learning_rate": 9.289830735248269e-06,
"loss": 0.5249,
"step": 281
},
{
"epoch": 1.7195121951219512,
"grad_norm": 0.07598878917991338,
"learning_rate": 9.218951974740354e-06,
"loss": 0.53,
"step": 282
},
{
"epoch": 1.725609756097561,
"grad_norm": 0.07880251948989025,
"learning_rate": 9.148112671873433e-06,
"loss": 0.5195,
"step": 283
},
{
"epoch": 1.7317073170731707,
"grad_norm": 0.08006942318123801,
"learning_rate": 9.07731640536698e-06,
"loss": 0.4935,
"step": 284
},
{
"epoch": 1.7378048780487805,
"grad_norm": 0.07419436980709716,
"learning_rate": 9.00656675176633e-06,
"loss": 0.5,
"step": 285
},
{
"epoch": 1.7439024390243902,
"grad_norm": 0.07335821062357814,
"learning_rate": 8.935867285261977e-06,
"loss": 0.4689,
"step": 286
},
{
"epoch": 1.75,
"grad_norm": 0.08300432948525868,
"learning_rate": 8.865221577509034e-06,
"loss": 0.5499,
"step": 287
},
{
"epoch": 1.7560975609756098,
"grad_norm": 0.07414556420580612,
"learning_rate": 8.79463319744677e-06,
"loss": 0.5016,
"step": 288
},
{
"epoch": 1.7621951219512195,
"grad_norm": 0.0777082606308814,
"learning_rate": 8.724105711118342e-06,
"loss": 0.5094,
"step": 289
},
{
"epoch": 1.7682926829268293,
"grad_norm": 0.07879787724881869,
"learning_rate": 8.653642681490608e-06,
"loss": 0.504,
"step": 290
},
{
"epoch": 1.774390243902439,
"grad_norm": 0.07782466491724375,
"learning_rate": 8.583247668274163e-06,
"loss": 0.4871,
"step": 291
},
{
"epoch": 1.7804878048780488,
"grad_norm": 0.07959753822335783,
"learning_rate": 8.512924227743482e-06,
"loss": 0.4637,
"step": 292
},
{
"epoch": 1.7865853658536586,
"grad_norm": 0.08403574176326123,
"learning_rate": 8.442675912557281e-06,
"loss": 0.4978,
"step": 293
},
{
"epoch": 1.7926829268292683,
"grad_norm": 0.07506838029170206,
"learning_rate": 8.372506271579022e-06,
"loss": 0.4801,
"step": 294
},
{
"epoch": 1.798780487804878,
"grad_norm": 0.08007620776198685,
"learning_rate": 8.30241884969764e-06,
"loss": 0.5467,
"step": 295
},
{
"epoch": 1.8048780487804879,
"grad_norm": 0.07302138656473144,
"learning_rate": 8.232417187648454e-06,
"loss": 0.4591,
"step": 296
},
{
"epoch": 1.8109756097560976,
"grad_norm": 0.07968477173539414,
"learning_rate": 8.162504821834296e-06,
"loss": 0.4869,
"step": 297
},
{
"epoch": 1.8170731707317072,
"grad_norm": 0.07295556591701204,
"learning_rate": 8.092685284146865e-06,
"loss": 0.4857,
"step": 298
},
{
"epoch": 1.8231707317073171,
"grad_norm": 0.0687852684162483,
"learning_rate": 8.02296210178828e-06,
"loss": 0.4376,
"step": 299
},
{
"epoch": 1.8292682926829267,
"grad_norm": 0.07614301377824628,
"learning_rate": 7.953338797092902e-06,
"loss": 0.4687,
"step": 300
},
{
"epoch": 1.8353658536585367,
"grad_norm": 0.08065344887657697,
"learning_rate": 7.883818887349391e-06,
"loss": 0.558,
"step": 301
},
{
"epoch": 1.8414634146341462,
"grad_norm": 0.07537828658738212,
"learning_rate": 7.814405884623012e-06,
"loss": 0.4641,
"step": 302
},
{
"epoch": 1.8475609756097562,
"grad_norm": 0.07520932682727934,
"learning_rate": 7.745103295578216e-06,
"loss": 0.4807,
"step": 303
},
{
"epoch": 1.8536585365853657,
"grad_norm": 0.08194352024084861,
"learning_rate": 7.675914621301476e-06,
"loss": 0.5249,
"step": 304
},
{
"epoch": 1.8597560975609757,
"grad_norm": 0.06548230309414133,
"learning_rate": 7.606843357124426e-06,
"loss": 0.4296,
"step": 305
},
{
"epoch": 1.8658536585365852,
"grad_norm": 0.07539507791922381,
"learning_rate": 7.5378929924472735e-06,
"loss": 0.4906,
"step": 306
},
{
"epoch": 1.8719512195121952,
"grad_norm": 0.07757046744416946,
"learning_rate": 7.469067010562538e-06,
"loss": 0.4764,
"step": 307
},
{
"epoch": 1.8780487804878048,
"grad_norm": 0.08347451677435065,
"learning_rate": 7.400368888479048e-06,
"loss": 0.5079,
"step": 308
},
{
"epoch": 1.8841463414634148,
"grad_norm": 0.09459522265909277,
"learning_rate": 7.331802096746309e-06,
"loss": 0.5622,
"step": 309
},
{
"epoch": 1.8902439024390243,
"grad_norm": 0.07271276680988117,
"learning_rate": 7.263370099279173e-06,
"loss": 0.4646,
"step": 310
},
{
"epoch": 1.8963414634146343,
"grad_norm": 0.07270944162582102,
"learning_rate": 7.195076353182834e-06,
"loss": 0.4824,
"step": 311
},
{
"epoch": 1.9024390243902438,
"grad_norm": 0.07557182291040342,
"learning_rate": 7.126924308578196e-06,
"loss": 0.4434,
"step": 312
},
{
"epoch": 1.9085365853658538,
"grad_norm": 0.07838104917336293,
"learning_rate": 7.058917408427559e-06,
"loss": 0.4969,
"step": 313
},
{
"epoch": 1.9146341463414633,
"grad_norm": 0.0772118612308542,
"learning_rate": 6.9910590883607e-06,
"loss": 0.4897,
"step": 314
},
{
"epoch": 1.9207317073170733,
"grad_norm": 0.06733566470669253,
"learning_rate": 6.923352776501302e-06,
"loss": 0.4541,
"step": 315
},
{
"epoch": 1.9268292682926829,
"grad_norm": 0.07768052936426381,
"learning_rate": 6.855801893293765e-06,
"loss": 0.4746,
"step": 316
},
{
"epoch": 1.9329268292682928,
"grad_norm": 0.07601549840390559,
"learning_rate": 6.788409851330419e-06,
"loss": 0.5037,
"step": 317
},
{
"epoch": 1.9390243902439024,
"grad_norm": 0.08311022242797193,
"learning_rate": 6.721180055179113e-06,
"loss": 0.5478,
"step": 318
},
{
"epoch": 1.9451219512195121,
"grad_norm": 0.07527262749855876,
"learning_rate": 6.654115901211229e-06,
"loss": 0.4801,
"step": 319
},
{
"epoch": 1.951219512195122,
"grad_norm": 0.0819908930356081,
"learning_rate": 6.587220777430097e-06,
"loss": 0.5252,
"step": 320
},
{
"epoch": 1.9573170731707317,
"grad_norm": 0.07276145316957822,
"learning_rate": 6.5204980632998394e-06,
"loss": 0.411,
"step": 321
},
{
"epoch": 1.9634146341463414,
"grad_norm": 0.06904185307886326,
"learning_rate": 6.453951129574644e-06,
"loss": 0.4813,
"step": 322
},
{
"epoch": 1.9695121951219512,
"grad_norm": 0.07458141714965788,
"learning_rate": 6.387583338128471e-06,
"loss": 0.5033,
"step": 323
},
{
"epoch": 1.975609756097561,
"grad_norm": 0.07431900473667878,
"learning_rate": 6.321398041785225e-06,
"loss": 0.4907,
"step": 324
},
{
"epoch": 1.9817073170731707,
"grad_norm": 0.07780066087542951,
"learning_rate": 6.255398584149366e-06,
"loss": 0.4902,
"step": 325
},
{
"epoch": 1.9878048780487805,
"grad_norm": 0.07639692261752619,
"learning_rate": 6.189588299436997e-06,
"loss": 0.4978,
"step": 326
},
{
"epoch": 1.9939024390243902,
"grad_norm": 0.07548995093210441,
"learning_rate": 6.123970512307433e-06,
"loss": 0.4664,
"step": 327
},
{
"epoch": 2.0,
"grad_norm": 0.0747178162965431,
"learning_rate": 6.058548537695225e-06,
"loss": 0.474,
"step": 328
},
{
"epoch": 2.0060975609756095,
"grad_norm": 0.07256188615230309,
"learning_rate": 5.9933256806427056e-06,
"loss": 0.45,
"step": 329
},
{
"epoch": 2.0121951219512195,
"grad_norm": 0.07890015744026495,
"learning_rate": 5.928305236133016e-06,
"loss": 0.5278,
"step": 330
},
{
"epoch": 2.018292682926829,
"grad_norm": 0.0743608081300144,
"learning_rate": 5.86349048892366e-06,
"loss": 0.5151,
"step": 331
},
{
"epoch": 2.024390243902439,
"grad_norm": 0.07744372435832222,
"learning_rate": 5.798884713380542e-06,
"loss": 0.4706,
"step": 332
},
{
"epoch": 2.0304878048780486,
"grad_norm": 0.07446342377276646,
"learning_rate": 5.734491173312559e-06,
"loss": 0.3936,
"step": 333
},
{
"epoch": 2.0365853658536586,
"grad_norm": 0.07700683633251168,
"learning_rate": 5.67031312180673e-06,
"loss": 0.4931,
"step": 334
},
{
"epoch": 2.042682926829268,
"grad_norm": 0.07438951456585641,
"learning_rate": 5.60635380106383e-06,
"loss": 0.4958,
"step": 335
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.07659907187784923,
"learning_rate": 5.542616442234618e-06,
"loss": 0.4846,
"step": 336
},
{
"epoch": 2.0548780487804876,
"grad_norm": 0.06905536948425527,
"learning_rate": 5.479104265256583e-06,
"loss": 0.4426,
"step": 337
},
{
"epoch": 2.0609756097560976,
"grad_norm": 0.07334778911378619,
"learning_rate": 5.415820478691301e-06,
"loss": 0.5074,
"step": 338
},
{
"epoch": 2.067073170731707,
"grad_norm": 0.08664025624260953,
"learning_rate": 5.352768279562315e-06,
"loss": 0.5383,
"step": 339
},
{
"epoch": 2.073170731707317,
"grad_norm": 0.08278651278238408,
"learning_rate": 5.2899508531936526e-06,
"loss": 0.4713,
"step": 340
},
{
"epoch": 2.0792682926829267,
"grad_norm": 0.0705392226402741,
"learning_rate": 5.2273713730488886e-06,
"loss": 0.403,
"step": 341
},
{
"epoch": 2.0853658536585367,
"grad_norm": 0.0717929565778019,
"learning_rate": 5.165033000570825e-06,
"loss": 0.472,
"step": 342
},
{
"epoch": 2.091463414634146,
"grad_norm": 0.07551856711067856,
"learning_rate": 5.1029388850217935e-06,
"loss": 0.4945,
"step": 343
},
{
"epoch": 2.097560975609756,
"grad_norm": 0.07438201387306197,
"learning_rate": 5.041092163324537e-06,
"loss": 0.4939,
"step": 344
},
{
"epoch": 2.1036585365853657,
"grad_norm": 0.07626903753672695,
"learning_rate": 4.979495959903759e-06,
"loss": 0.4662,
"step": 345
},
{
"epoch": 2.1097560975609757,
"grad_norm": 0.07374673606028373,
"learning_rate": 4.918153386528271e-06,
"loss": 0.4792,
"step": 346
},
{
"epoch": 2.1158536585365852,
"grad_norm": 0.08229855647674697,
"learning_rate": 4.8570675421537685e-06,
"loss": 0.5428,
"step": 347
},
{
"epoch": 2.1219512195121952,
"grad_norm": 0.07472947580500576,
"learning_rate": 4.7962415127663265e-06,
"loss": 0.5573,
"step": 348
},
{
"epoch": 2.1280487804878048,
"grad_norm": 0.07720860403921907,
"learning_rate": 4.7356783712264405e-06,
"loss": 0.5366,
"step": 349
},
{
"epoch": 2.1341463414634148,
"grad_norm": 0.07671762679161022,
"learning_rate": 4.675381177113837e-06,
"loss": 0.4991,
"step": 350
},
{
"epoch": 2.1402439024390243,
"grad_norm": 0.0698611175055525,
"learning_rate": 4.615352976572867e-06,
"loss": 0.463,
"step": 351
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.08168992782435783,
"learning_rate": 4.555596802158653e-06,
"loss": 0.5243,
"step": 352
},
{
"epoch": 2.152439024390244,
"grad_norm": 0.07634240659584558,
"learning_rate": 4.4961156726838725e-06,
"loss": 0.4832,
"step": 353
},
{
"epoch": 2.158536585365854,
"grad_norm": 0.07065742577897564,
"learning_rate": 4.436912593066241e-06,
"loss": 0.5121,
"step": 354
},
{
"epoch": 2.1646341463414633,
"grad_norm": 0.07396087251257588,
"learning_rate": 4.377990554176729e-06,
"loss": 0.4896,
"step": 355
},
{
"epoch": 2.1707317073170733,
"grad_norm": 0.07474646282993128,
"learning_rate": 4.319352532688444e-06,
"loss": 0.4612,
"step": 356
},
{
"epoch": 2.176829268292683,
"grad_norm": 0.08069543680226443,
"learning_rate": 4.261001490926272e-06,
"loss": 0.5218,
"step": 357
},
{
"epoch": 2.182926829268293,
"grad_norm": 0.07346735875767992,
"learning_rate": 4.2029403767172175e-06,
"loss": 0.435,
"step": 358
},
{
"epoch": 2.1890243902439024,
"grad_norm": 0.07522119438266486,
"learning_rate": 4.14517212324147e-06,
"loss": 0.4956,
"step": 359
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.08032494668646596,
"learning_rate": 4.087699648884248e-06,
"loss": 0.4752,
"step": 360
},
{
"epoch": 2.201219512195122,
"grad_norm": 0.08192204498128373,
"learning_rate": 4.0305258570883336e-06,
"loss": 0.5108,
"step": 361
},
{
"epoch": 2.207317073170732,
"grad_norm": 0.08394146118190073,
"learning_rate": 3.973653636207437e-06,
"loss": 0.5567,
"step": 362
},
{
"epoch": 2.2134146341463414,
"grad_norm": 0.07596087618305393,
"learning_rate": 3.917085859360234e-06,
"loss": 0.4685,
"step": 363
},
{
"epoch": 2.2195121951219514,
"grad_norm": 0.07887155715773822,
"learning_rate": 3.860825384285247e-06,
"loss": 0.5206,
"step": 364
},
{
"epoch": 2.225609756097561,
"grad_norm": 0.07296513823227467,
"learning_rate": 3.804875053196477e-06,
"loss": 0.4469,
"step": 365
},
{
"epoch": 2.231707317073171,
"grad_norm": 0.08190684855847946,
"learning_rate": 3.7492376926397966e-06,
"loss": 0.5094,
"step": 366
},
{
"epoch": 2.2378048780487805,
"grad_norm": 0.07617526219017642,
"learning_rate": 3.6939161133501823e-06,
"loss": 0.4479,
"step": 367
},
{
"epoch": 2.2439024390243905,
"grad_norm": 0.08063324451878306,
"learning_rate": 3.6389131101096953e-06,
"loss": 0.5099,
"step": 368
},
{
"epoch": 2.25,
"grad_norm": 0.07739599643572288,
"learning_rate": 3.5842314616063134e-06,
"loss": 0.491,
"step": 369
},
{
"epoch": 2.2560975609756095,
"grad_norm": 0.07768918830309231,
"learning_rate": 3.529873930293546e-06,
"loss": 0.5417,
"step": 370
},
{
"epoch": 2.2621951219512195,
"grad_norm": 0.0822012033362632,
"learning_rate": 3.4758432622508677e-06,
"loss": 0.5186,
"step": 371
},
{
"epoch": 2.2682926829268295,
"grad_norm": 0.0764020181839071,
"learning_rate": 3.422142187045011e-06,
"loss": 0.4754,
"step": 372
},
{
"epoch": 2.274390243902439,
"grad_norm": 0.08335314099643498,
"learning_rate": 3.3687734175920505e-06,
"loss": 0.5537,
"step": 373
},
{
"epoch": 2.2804878048780486,
"grad_norm": 0.0803475394628154,
"learning_rate": 3.3157396500203655e-06,
"loss": 0.4212,
"step": 374
},
{
"epoch": 2.2865853658536586,
"grad_norm": 0.06853604150629149,
"learning_rate": 3.2630435635344283e-06,
"loss": 0.4197,
"step": 375
},
{
"epoch": 2.292682926829268,
"grad_norm": 0.07027233118743227,
"learning_rate": 3.2106878202794513e-06,
"loss": 0.426,
"step": 376
},
{
"epoch": 2.298780487804878,
"grad_norm": 0.08035482501355977,
"learning_rate": 3.1586750652069077e-06,
"loss": 0.4768,
"step": 377
},
{
"epoch": 2.3048780487804876,
"grad_norm": 0.0767033268066497,
"learning_rate": 3.1070079259408934e-06,
"loss": 0.4298,
"step": 378
},
{
"epoch": 2.3109756097560976,
"grad_norm": 0.07871530893320917,
"learning_rate": 3.0556890126454075e-06,
"loss": 0.5194,
"step": 379
},
{
"epoch": 2.317073170731707,
"grad_norm": 0.0694907669966186,
"learning_rate": 3.004720917892464e-06,
"loss": 0.4458,
"step": 380
},
{
"epoch": 2.323170731707317,
"grad_norm": 0.07550036175573449,
"learning_rate": 2.954106216531141e-06,
"loss": 0.4877,
"step": 381
},
{
"epoch": 2.3292682926829267,
"grad_norm": 0.06828766275227673,
"learning_rate": 2.90384746555749e-06,
"loss": 0.4694,
"step": 382
},
{
"epoch": 2.3353658536585367,
"grad_norm": 0.07957885134154746,
"learning_rate": 2.8539472039853557e-06,
"loss": 0.4549,
"step": 383
},
{
"epoch": 2.341463414634146,
"grad_norm": 0.07312612756103479,
"learning_rate": 2.804407952718119e-06,
"loss": 0.4717,
"step": 384
},
{
"epoch": 2.347560975609756,
"grad_norm": 0.07462194695242244,
"learning_rate": 2.7552322144213405e-06,
"loss": 0.4681,
"step": 385
},
{
"epoch": 2.3536585365853657,
"grad_norm": 0.07382029470746747,
"learning_rate": 2.7064224733963197e-06,
"loss": 0.4455,
"step": 386
},
{
"epoch": 2.3597560975609757,
"grad_norm": 0.07566404170504752,
"learning_rate": 2.6579811954546054e-06,
"loss": 0.4601,
"step": 387
},
{
"epoch": 2.3658536585365852,
"grad_norm": 0.06650889658374204,
"learning_rate": 2.6099108277934105e-06,
"loss": 0.403,
"step": 388
},
{
"epoch": 2.3719512195121952,
"grad_norm": 0.08128051733864035,
"learning_rate": 2.5622137988719985e-06,
"loss": 0.5062,
"step": 389
},
{
"epoch": 2.3780487804878048,
"grad_norm": 0.07645763895183058,
"learning_rate": 2.514892518288988e-06,
"loss": 0.4992,
"step": 390
},
{
"epoch": 2.3841463414634148,
"grad_norm": 0.08185922748732076,
"learning_rate": 2.46794937666063e-06,
"loss": 0.4998,
"step": 391
},
{
"epoch": 2.3902439024390243,
"grad_norm": 0.07724446363577575,
"learning_rate": 2.421386745500034e-06,
"loss": 0.4832,
"step": 392
},
{
"epoch": 2.3963414634146343,
"grad_norm": 0.0719202945692499,
"learning_rate": 2.375206977097353e-06,
"loss": 0.4625,
"step": 393
},
{
"epoch": 2.402439024390244,
"grad_norm": 0.07160181702178699,
"learning_rate": 2.329412404400969e-06,
"loss": 0.4786,
"step": 394
},
{
"epoch": 2.408536585365854,
"grad_norm": 0.07705465155073153,
"learning_rate": 2.2840053408996154e-06,
"loss": 0.4873,
"step": 395
},
{
"epoch": 2.4146341463414633,
"grad_norm": 0.06734740120536699,
"learning_rate": 2.238988080505513e-06,
"loss": 0.4268,
"step": 396
},
{
"epoch": 2.4207317073170733,
"grad_norm": 0.07171806752940019,
"learning_rate": 2.1943628974384858e-06,
"loss": 0.4657,
"step": 397
},
{
"epoch": 2.426829268292683,
"grad_norm": 0.06712821968746505,
"learning_rate": 2.150132046111054e-06,
"loss": 0.4201,
"step": 398
},
{
"epoch": 2.432926829268293,
"grad_norm": 0.08316643198749764,
"learning_rate": 2.1062977610145697e-06,
"loss": 0.513,
"step": 399
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.0730957075364869,
"learning_rate": 2.0628622566063063e-06,
"loss": 0.4895,
"step": 400
},
{
"epoch": 2.4451219512195124,
"grad_norm": 0.07287061567762979,
"learning_rate": 2.019827727197605e-06,
"loss": 0.4306,
"step": 401
},
{
"epoch": 2.451219512195122,
"grad_norm": 0.06700730358392487,
"learning_rate": 1.977196346843019e-06,
"loss": 0.4141,
"step": 402
},
{
"epoch": 2.457317073170732,
"grad_norm": 0.07927651728219412,
"learning_rate": 1.934970269230464e-06,
"loss": 0.4702,
"step": 403
},
{
"epoch": 2.4634146341463414,
"grad_norm": 0.07966939559181735,
"learning_rate": 1.8931516275724527e-06,
"loss": 0.4209,
"step": 404
},
{
"epoch": 2.4695121951219514,
"grad_norm": 0.07505835152415707,
"learning_rate": 1.8517425344982831e-06,
"loss": 0.5004,
"step": 405
},
{
"epoch": 2.475609756097561,
"grad_norm": 0.0792696269268693,
"learning_rate": 1.8107450819473505e-06,
"loss": 0.4954,
"step": 406
},
{
"epoch": 2.4817073170731705,
"grad_norm": 0.07162945978931057,
"learning_rate": 1.7701613410634367e-06,
"loss": 0.496,
"step": 407
},
{
"epoch": 2.4878048780487805,
"grad_norm": 0.07893944712223014,
"learning_rate": 1.7299933620900945e-06,
"loss": 0.4774,
"step": 408
},
{
"epoch": 2.4939024390243905,
"grad_norm": 0.06827623677585598,
"learning_rate": 1.690243174267071e-06,
"loss": 0.4177,
"step": 409
},
{
"epoch": 2.5,
"grad_norm": 0.0754447597879692,
"learning_rate": 1.6509127857277784e-06,
"loss": 0.4889,
"step": 410
},
{
"epoch": 2.5060975609756095,
"grad_norm": 0.08857036691886101,
"learning_rate": 1.6120041833978662e-06,
"loss": 0.5317,
"step": 411
},
{
"epoch": 2.5121951219512195,
"grad_norm": 0.07177001116277269,
"learning_rate": 1.573519332894824e-06,
"loss": 0.414,
"step": 412
},
{
"epoch": 2.5182926829268295,
"grad_norm": 0.07831205185485109,
"learning_rate": 1.535460178428697e-06,
"loss": 0.5028,
"step": 413
},
{
"epoch": 2.524390243902439,
"grad_norm": 0.07285605695660612,
"learning_rate": 1.4978286427038602e-06,
"loss": 0.5031,
"step": 414
},
{
"epoch": 2.5304878048780486,
"grad_norm": 0.08720951847793187,
"learning_rate": 1.4606266268218783e-06,
"loss": 0.5084,
"step": 415
},
{
"epoch": 2.5365853658536586,
"grad_norm": 0.06975140711559835,
"learning_rate": 1.4238560101854815e-06,
"loss": 0.4253,
"step": 416
},
{
"epoch": 2.5426829268292686,
"grad_norm": 0.07714853710418437,
"learning_rate": 1.3875186504035965e-06,
"loss": 0.4744,
"step": 417
},
{
"epoch": 2.548780487804878,
"grad_norm": 0.07565492373996721,
"learning_rate": 1.3516163831975337e-06,
"loss": 0.5152,
"step": 418
},
{
"epoch": 2.5548780487804876,
"grad_norm": 0.07030057664082874,
"learning_rate": 1.3161510223082152e-06,
"loss": 0.4461,
"step": 419
},
{
"epoch": 2.5609756097560976,
"grad_norm": 0.0800726969605912,
"learning_rate": 1.2811243594045697e-06,
"loss": 0.5135,
"step": 420
},
{
"epoch": 2.567073170731707,
"grad_norm": 0.07816897719762364,
"learning_rate": 1.246538163993013e-06,
"loss": 0.4999,
"step": 421
},
{
"epoch": 2.573170731707317,
"grad_norm": 0.07745184122312047,
"learning_rate": 1.2123941833280472e-06,
"loss": 0.4847,
"step": 422
},
{
"epoch": 2.5792682926829267,
"grad_norm": 0.07419017119462436,
"learning_rate": 1.1786941423240072e-06,
"loss": 0.4843,
"step": 423
},
{
"epoch": 2.5853658536585367,
"grad_norm": 0.07931455390788596,
"learning_rate": 1.1454397434679022e-06,
"loss": 0.4946,
"step": 424
},
{
"epoch": 2.591463414634146,
"grad_norm": 0.07615796865199526,
"learning_rate": 1.1126326667334196e-06,
"loss": 0.4524,
"step": 425
},
{
"epoch": 2.597560975609756,
"grad_norm": 0.0772418363352449,
"learning_rate": 1.080274569496057e-06,
"loss": 0.5152,
"step": 426
},
{
"epoch": 2.6036585365853657,
"grad_norm": 0.07025077296325957,
"learning_rate": 1.0483670864493777e-06,
"loss": 0.4332,
"step": 427
},
{
"epoch": 2.6097560975609757,
"grad_norm": 0.07566288563595869,
"learning_rate": 1.0169118295224488e-06,
"loss": 0.5029,
"step": 428
},
{
"epoch": 2.6158536585365852,
"grad_norm": 0.0758658441769188,
"learning_rate": 9.85910387798389e-07,
"loss": 0.4573,
"step": 429
},
{
"epoch": 2.6219512195121952,
"grad_norm": 0.07964262946952728,
"learning_rate": 9.55364327434105e-07,
"loss": 0.4933,
"step": 430
},
{
"epoch": 2.6280487804878048,
"grad_norm": 0.07874589016859943,
"learning_rate": 9.252751915811642e-07,
"loss": 0.473,
"step": 431
},
{
"epoch": 2.6341463414634148,
"grad_norm": 0.0766185482028681,
"learning_rate": 8.956445003078351e-07,
"loss": 0.5018,
"step": 432
},
{
"epoch": 2.6402439024390243,
"grad_norm": 0.08140880877769818,
"learning_rate": 8.664737505223009e-07,
"loss": 0.5203,
"step": 433
},
{
"epoch": 2.6463414634146343,
"grad_norm": 0.08015141287053174,
"learning_rate": 8.377644158970277e-07,
"loss": 0.5215,
"step": 434
},
{
"epoch": 2.652439024390244,
"grad_norm": 0.07563574533469265,
"learning_rate": 8.095179467943293e-07,
"loss": 0.4877,
"step": 435
},
{
"epoch": 2.658536585365854,
"grad_norm": 0.08532123143914754,
"learning_rate": 7.81735770193085e-07,
"loss": 0.5027,
"step": 436
},
{
"epoch": 2.6646341463414633,
"grad_norm": 0.06972127152615569,
"learning_rate": 7.544192896166569e-07,
"loss": 0.4691,
"step": 437
},
{
"epoch": 2.6707317073170733,
"grad_norm": 0.0748196016294252,
"learning_rate": 7.275698850619861e-07,
"loss": 0.5059,
"step": 438
},
{
"epoch": 2.676829268292683,
"grad_norm": 0.07757312698493772,
"learning_rate": 7.011889129298688e-07,
"loss": 0.5559,
"step": 439
},
{
"epoch": 2.682926829268293,
"grad_norm": 0.07577704718768018,
"learning_rate": 6.752777059564431e-07,
"loss": 0.4718,
"step": 440
},
{
"epoch": 2.6890243902439024,
"grad_norm": 0.07357519669905033,
"learning_rate": 6.498375731458529e-07,
"loss": 0.4876,
"step": 441
},
{
"epoch": 2.6951219512195124,
"grad_norm": 0.07445682597283106,
"learning_rate": 6.248697997041219e-07,
"loss": 0.4833,
"step": 442
},
{
"epoch": 2.701219512195122,
"grad_norm": 0.07241140052205494,
"learning_rate": 6.003756469742294e-07,
"loss": 0.4713,
"step": 443
},
{
"epoch": 2.7073170731707314,
"grad_norm": 0.07656055745393084,
"learning_rate": 5.763563523723769e-07,
"loss": 0.4525,
"step": 444
},
{
"epoch": 2.7134146341463414,
"grad_norm": 0.07364895292854746,
"learning_rate": 5.528131293254957e-07,
"loss": 0.477,
"step": 445
},
{
"epoch": 2.7195121951219514,
"grad_norm": 0.07520667622845931,
"learning_rate": 5.29747167209923e-07,
"loss": 0.4747,
"step": 446
},
{
"epoch": 2.725609756097561,
"grad_norm": 0.08218001537128848,
"learning_rate": 5.071596312913329e-07,
"loss": 0.54,
"step": 447
},
{
"epoch": 2.7317073170731705,
"grad_norm": 0.0867606649208939,
"learning_rate": 4.850516626658585e-07,
"loss": 0.5081,
"step": 448
},
{
"epoch": 2.7378048780487805,
"grad_norm": 0.07210738314053156,
"learning_rate": 4.634243782024539e-07,
"loss": 0.4431,
"step": 449
},
{
"epoch": 2.7439024390243905,
"grad_norm": 0.06582252551133536,
"learning_rate": 4.4227887048646335e-07,
"loss": 0.4192,
"step": 450
},
{
"epoch": 2.75,
"grad_norm": 0.07567885015733454,
"learning_rate": 4.216162077644281e-07,
"loss": 0.4887,
"step": 451
},
{
"epoch": 2.7560975609756095,
"grad_norm": 0.07689928271198733,
"learning_rate": 4.014374338901206e-07,
"loss": 0.4683,
"step": 452
},
{
"epoch": 2.7621951219512195,
"grad_norm": 0.07188606850501078,
"learning_rate": 3.817435682718096e-07,
"loss": 0.467,
"step": 453
},
{
"epoch": 2.7682926829268295,
"grad_norm": 0.07163831506927039,
"learning_rate": 3.6253560582076075e-07,
"loss": 0.4539,
"step": 454
},
{
"epoch": 2.774390243902439,
"grad_norm": 0.07592375615552627,
"learning_rate": 3.4381451690097653e-07,
"loss": 0.4736,
"step": 455
},
{
"epoch": 2.7804878048780486,
"grad_norm": 0.0814766286311381,
"learning_rate": 3.255812472801689e-07,
"loss": 0.532,
"step": 456
},
{
"epoch": 2.7865853658536586,
"grad_norm": 0.0717352396757339,
"learning_rate": 3.078367180819863e-07,
"loss": 0.4316,
"step": 457
},
{
"epoch": 2.7926829268292686,
"grad_norm": 0.0746931546839919,
"learning_rate": 2.905818257394799e-07,
"loss": 0.5206,
"step": 458
},
{
"epoch": 2.798780487804878,
"grad_norm": 0.07272697545910387,
"learning_rate": 2.7381744194980963e-07,
"loss": 0.4832,
"step": 459
},
{
"epoch": 2.8048780487804876,
"grad_norm": 0.07230252882563974,
"learning_rate": 2.5754441363021854e-07,
"loss": 0.4778,
"step": 460
},
{
"epoch": 2.8109756097560976,
"grad_norm": 0.08033515297716644,
"learning_rate": 2.417635628752324e-07,
"loss": 0.5301,
"step": 461
},
{
"epoch": 2.817073170731707,
"grad_norm": 0.08891143823100789,
"learning_rate": 2.264756869151441e-07,
"loss": 0.5255,
"step": 462
},
{
"epoch": 2.823170731707317,
"grad_norm": 0.07069972457447841,
"learning_rate": 2.1168155807572476e-07,
"loss": 0.431,
"step": 463
},
{
"epoch": 2.8292682926829267,
"grad_norm": 0.0735272098530535,
"learning_rate": 1.973819237392205e-07,
"loss": 0.4968,
"step": 464
},
{
"epoch": 2.8353658536585367,
"grad_norm": 0.08455153800014176,
"learning_rate": 1.8357750630658367e-07,
"loss": 0.4924,
"step": 465
},
{
"epoch": 2.841463414634146,
"grad_norm": 0.06931732980810784,
"learning_rate": 1.7026900316098217e-07,
"loss": 0.4309,
"step": 466
},
{
"epoch": 2.847560975609756,
"grad_norm": 0.07200984122635784,
"learning_rate": 1.5745708663257199e-07,
"loss": 0.4667,
"step": 467
},
{
"epoch": 2.8536585365853657,
"grad_norm": 0.07677994816913122,
"learning_rate": 1.4514240396452438e-07,
"loss": 0.4834,
"step": 468
},
{
"epoch": 2.8597560975609757,
"grad_norm": 0.08178248437061655,
"learning_rate": 1.333255772803377e-07,
"loss": 0.5251,
"step": 469
},
{
"epoch": 2.8658536585365852,
"grad_norm": 0.07923538700790901,
"learning_rate": 1.2200720355239893e-07,
"loss": 0.5171,
"step": 470
},
{
"epoch": 2.8719512195121952,
"grad_norm": 0.06761152489499198,
"learning_rate": 1.1118785457183034e-07,
"loss": 0.4615,
"step": 471
},
{
"epoch": 2.8780487804878048,
"grad_norm": 0.0767262597066072,
"learning_rate": 1.0086807691960243e-07,
"loss": 0.4976,
"step": 472
},
{
"epoch": 2.8841463414634148,
"grad_norm": 0.08266454055489816,
"learning_rate": 9.104839193892379e-08,
"loss": 0.5389,
"step": 473
},
{
"epoch": 2.8902439024390243,
"grad_norm": 0.08086361495523785,
"learning_rate": 8.172929570889553e-08,
"loss": 0.4929,
"step": 474
},
{
"epoch": 2.8963414634146343,
"grad_norm": 0.0745089911333017,
"learning_rate": 7.291125901946027e-08,
"loss": 0.4939,
"step": 475
},
{
"epoch": 2.902439024390244,
"grad_norm": 0.08025945659207359,
"learning_rate": 6.459472734760997e-08,
"loss": 0.4876,
"step": 476
},
{
"epoch": 2.908536585365854,
"grad_norm": 0.07053947220102097,
"learning_rate": 5.6780120834887264e-08,
"loss": 0.4611,
"step": 477
},
{
"epoch": 2.9146341463414633,
"grad_norm": 0.0803969476007233,
"learning_rate": 4.9467834266154756e-08,
"loss": 0.5419,
"step": 478
},
{
"epoch": 2.9207317073170733,
"grad_norm": 0.07482955639907388,
"learning_rate": 4.2658237049655325e-08,
"loss": 0.4889,
"step": 479
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.07752759847951908,
"learning_rate": 3.635167319834709e-08,
"loss": 0.4749,
"step": 480
},
{
"epoch": 2.932926829268293,
"grad_norm": 0.08359023148499886,
"learning_rate": 3.054846131252731e-08,
"loss": 0.5334,
"step": 481
},
{
"epoch": 2.9390243902439024,
"grad_norm": 0.08139586618522474,
"learning_rate": 2.524889456373525e-08,
"loss": 0.5523,
"step": 482
},
{
"epoch": 2.9451219512195124,
"grad_norm": 0.07935921771678194,
"learning_rate": 2.045324067993959e-08,
"loss": 0.4853,
"step": 483
},
{
"epoch": 2.951219512195122,
"grad_norm": 0.07808298733873976,
"learning_rate": 1.6161741932017026e-08,
"loss": 0.5005,
"step": 484
},
{
"epoch": 2.9573170731707314,
"grad_norm": 0.0649213930995393,
"learning_rate": 1.2374615121508726e-08,
"loss": 0.4098,
"step": 485
},
{
"epoch": 2.9634146341463414,
"grad_norm": 0.08655284099627421,
"learning_rate": 9.092051569674632e-09,
"loss": 0.4856,
"step": 486
},
{
"epoch": 2.9695121951219514,
"grad_norm": 0.07707319839718253,
"learning_rate": 6.314217107817877e-09,
"loss": 0.5193,
"step": 487
},
{
"epoch": 2.975609756097561,
"grad_norm": 0.07715334723099698,
"learning_rate": 4.041252068918145e-09,
"loss": 0.5263,
"step": 488
},
{
"epoch": 2.9817073170731705,
"grad_norm": 0.07908531746017289,
"learning_rate": 2.273271280534006e-09,
"loss": 0.4823,
"step": 489
},
{
"epoch": 2.9878048780487805,
"grad_norm": 0.07306053297024867,
"learning_rate": 1.0103640590064524e-09,
"loss": 0.4543,
"step": 490
},
{
"epoch": 2.9939024390243905,
"grad_norm": 0.08516212104693965,
"learning_rate": 2.525942049436125e-10,
"loss": 0.5,
"step": 491
},
{
"epoch": 3.0,
"grad_norm": 0.07775010775380628,
"learning_rate": 0.0,
"loss": 0.4651,
"step": 492
},
{
"epoch": 3.0,
"step": 492,
"total_flos": 7801758436818944.0,
"train_loss": 0.7506088816780385,
"train_runtime": 5228.6286,
"train_samples_per_second": 6.005,
"train_steps_per_second": 0.094
}
],
"logging_steps": 1,
"max_steps": 492,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7801758436818944.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}