9b-50 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
dde6785 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1464,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00546448087431694,
"grad_norm": 0.2140842080116272,
"learning_rate": 1.3513513513513515e-07,
"loss": 1.926304817199707,
"step": 2
},
{
"epoch": 0.01092896174863388,
"grad_norm": 0.24205853044986725,
"learning_rate": 4.0540540540540546e-07,
"loss": 2.0275325775146484,
"step": 4
},
{
"epoch": 0.01639344262295082,
"grad_norm": 0.18398630619049072,
"learning_rate": 6.756756756756758e-07,
"loss": 1.8153012990951538,
"step": 6
},
{
"epoch": 0.02185792349726776,
"grad_norm": 0.20006528496742249,
"learning_rate": 9.459459459459461e-07,
"loss": 2.0009360313415527,
"step": 8
},
{
"epoch": 0.0273224043715847,
"grad_norm": 0.4238962233066559,
"learning_rate": 1.2162162162162164e-06,
"loss": 2.262256383895874,
"step": 10
},
{
"epoch": 0.03278688524590164,
"grad_norm": 3.5671658515930176,
"learning_rate": 1.4864864864864868e-06,
"loss": 3.091909885406494,
"step": 12
},
{
"epoch": 0.03825136612021858,
"grad_norm": 0.22672173380851746,
"learning_rate": 1.756756756756757e-06,
"loss": 1.999807357788086,
"step": 14
},
{
"epoch": 0.04371584699453552,
"grad_norm": 0.18996493518352509,
"learning_rate": 2.0270270270270273e-06,
"loss": 1.8604943752288818,
"step": 16
},
{
"epoch": 0.04918032786885246,
"grad_norm": 0.534092366695404,
"learning_rate": 2.297297297297298e-06,
"loss": 2.669407844543457,
"step": 18
},
{
"epoch": 0.0546448087431694,
"grad_norm": 0.5320157408714294,
"learning_rate": 2.5675675675675675e-06,
"loss": 2.0528454780578613,
"step": 20
},
{
"epoch": 0.060109289617486336,
"grad_norm": 2.134730339050293,
"learning_rate": 2.837837837837838e-06,
"loss": 2.3460330963134766,
"step": 22
},
{
"epoch": 0.06557377049180328,
"grad_norm": 0.272125780582428,
"learning_rate": 3.1081081081081082e-06,
"loss": 1.8946585655212402,
"step": 24
},
{
"epoch": 0.07103825136612021,
"grad_norm": 0.1925988346338272,
"learning_rate": 3.3783783783783788e-06,
"loss": 1.9396032094955444,
"step": 26
},
{
"epoch": 0.07650273224043716,
"grad_norm": 0.18215565383434296,
"learning_rate": 3.648648648648649e-06,
"loss": 1.90200936794281,
"step": 28
},
{
"epoch": 0.08196721311475409,
"grad_norm": 0.3920046389102936,
"learning_rate": 3.918918918918919e-06,
"loss": 1.774632453918457,
"step": 30
},
{
"epoch": 0.08743169398907104,
"grad_norm": 1.389986515045166,
"learning_rate": 4.189189189189189e-06,
"loss": 2.4618608951568604,
"step": 32
},
{
"epoch": 0.09289617486338798,
"grad_norm": 0.1785120666027069,
"learning_rate": 4.45945945945946e-06,
"loss": 1.8398792743682861,
"step": 34
},
{
"epoch": 0.09836065573770492,
"grad_norm": 1.0021958351135254,
"learning_rate": 4.72972972972973e-06,
"loss": 2.869662046432495,
"step": 36
},
{
"epoch": 0.10382513661202186,
"grad_norm": 0.8340303897857666,
"learning_rate": 5e-06,
"loss": 2.1690709590911865,
"step": 38
},
{
"epoch": 0.1092896174863388,
"grad_norm": 0.24475215375423431,
"learning_rate": 5.2702702702702705e-06,
"loss": 2.0148115158081055,
"step": 40
},
{
"epoch": 0.11475409836065574,
"grad_norm": 0.18627606332302094,
"learning_rate": 5.540540540540541e-06,
"loss": 1.9455300569534302,
"step": 42
},
{
"epoch": 0.12021857923497267,
"grad_norm": 0.2668582499027252,
"learning_rate": 5.810810810810811e-06,
"loss": 1.8533397912979126,
"step": 44
},
{
"epoch": 0.12568306010928962,
"grad_norm": 0.16417773067951202,
"learning_rate": 6.081081081081082e-06,
"loss": 1.5005569458007812,
"step": 46
},
{
"epoch": 0.13114754098360656,
"grad_norm": 0.16320785880088806,
"learning_rate": 6.351351351351351e-06,
"loss": 1.8427382707595825,
"step": 48
},
{
"epoch": 0.1366120218579235,
"grad_norm": 0.17526981234550476,
"learning_rate": 6.621621621621622e-06,
"loss": 1.8968942165374756,
"step": 50
},
{
"epoch": 0.14207650273224043,
"grad_norm": 0.13793262839317322,
"learning_rate": 6.891891891891892e-06,
"loss": 1.932059645652771,
"step": 52
},
{
"epoch": 0.14754098360655737,
"grad_norm": 0.2548584043979645,
"learning_rate": 7.162162162162163e-06,
"loss": 2.06876540184021,
"step": 54
},
{
"epoch": 0.15300546448087432,
"grad_norm": 0.1797744482755661,
"learning_rate": 7.4324324324324324e-06,
"loss": 1.816495418548584,
"step": 56
},
{
"epoch": 0.15846994535519127,
"grad_norm": 1.0909459590911865,
"learning_rate": 7.702702702702704e-06,
"loss": 1.9190263748168945,
"step": 58
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.6329110264778137,
"learning_rate": 7.972972972972974e-06,
"loss": 1.9341117143630981,
"step": 60
},
{
"epoch": 0.16939890710382513,
"grad_norm": 0.4930436313152313,
"learning_rate": 8.243243243243245e-06,
"loss": 1.8207262754440308,
"step": 62
},
{
"epoch": 0.17486338797814208,
"grad_norm": 0.14814579486846924,
"learning_rate": 8.513513513513514e-06,
"loss": 1.4946051836013794,
"step": 64
},
{
"epoch": 0.18032786885245902,
"grad_norm": 0.4138646423816681,
"learning_rate": 8.783783783783785e-06,
"loss": 2.005674362182617,
"step": 66
},
{
"epoch": 0.18579234972677597,
"grad_norm": 0.3491123616695404,
"learning_rate": 9.054054054054054e-06,
"loss": 1.7716784477233887,
"step": 68
},
{
"epoch": 0.1912568306010929,
"grad_norm": 0.10840397328138351,
"learning_rate": 9.324324324324325e-06,
"loss": 1.6829311847686768,
"step": 70
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.6797943115234375,
"learning_rate": 9.594594594594594e-06,
"loss": 1.577182412147522,
"step": 72
},
{
"epoch": 0.20218579234972678,
"grad_norm": 0.15648193657398224,
"learning_rate": 9.864864864864865e-06,
"loss": 1.649492859840393,
"step": 74
},
{
"epoch": 0.20765027322404372,
"grad_norm": 1.0870651006698608,
"learning_rate": 9.999882306974701e-06,
"loss": 1.6824759244918823,
"step": 76
},
{
"epoch": 0.21311475409836064,
"grad_norm": 0.25709015130996704,
"learning_rate": 9.998940799709714e-06,
"loss": 1.6956148147583008,
"step": 78
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.1276382952928543,
"learning_rate": 9.997057982170656e-06,
"loss": 1.6741969585418701,
"step": 80
},
{
"epoch": 0.22404371584699453,
"grad_norm": 0.3517210781574249,
"learning_rate": 9.994234248298137e-06,
"loss": 1.6751466989517212,
"step": 82
},
{
"epoch": 0.22950819672131148,
"grad_norm": 0.14190161228179932,
"learning_rate": 9.990470188900045e-06,
"loss": 1.647499442100525,
"step": 84
},
{
"epoch": 0.23497267759562843,
"grad_norm": 0.12604814767837524,
"learning_rate": 9.985766591527924e-06,
"loss": 1.6619607210159302,
"step": 86
},
{
"epoch": 0.24043715846994534,
"grad_norm": 0.2547169625759125,
"learning_rate": 9.980124440312204e-06,
"loss": 1.3526731729507446,
"step": 88
},
{
"epoch": 0.2459016393442623,
"grad_norm": 1.3758513927459717,
"learning_rate": 9.973544915756283e-06,
"loss": 1.5609016418457031,
"step": 90
},
{
"epoch": 0.25136612021857924,
"grad_norm": 0.2603493928909302,
"learning_rate": 9.966029394489537e-06,
"loss": 1.5576223134994507,
"step": 92
},
{
"epoch": 0.2568306010928962,
"grad_norm": 2.4040775299072266,
"learning_rate": 9.957579448979287e-06,
"loss": 1.3147119283676147,
"step": 94
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.2225703001022339,
"learning_rate": 9.948196847201791e-06,
"loss": 1.5775783061981201,
"step": 96
},
{
"epoch": 0.2677595628415301,
"grad_norm": 0.12260692566633224,
"learning_rate": 9.937883552272334e-06,
"loss": 1.5510284900665283,
"step": 98
},
{
"epoch": 0.273224043715847,
"grad_norm": 0.15643945336341858,
"learning_rate": 9.926641722034484e-06,
"loss": 1.591309666633606,
"step": 100
},
{
"epoch": 0.2786885245901639,
"grad_norm": 0.20280535519123077,
"learning_rate": 9.914473708608616e-06,
"loss": 1.1799218654632568,
"step": 102
},
{
"epoch": 0.28415300546448086,
"grad_norm": 0.1971111297607422,
"learning_rate": 9.901382057899766e-06,
"loss": 1.5698447227478027,
"step": 104
},
{
"epoch": 0.2896174863387978,
"grad_norm": 0.11269693076610565,
"learning_rate": 9.887369509064966e-06,
"loss": 1.5561062097549438,
"step": 106
},
{
"epoch": 0.29508196721311475,
"grad_norm": 0.231752410531044,
"learning_rate": 9.872438993940122e-06,
"loss": 1.5972791910171509,
"step": 108
},
{
"epoch": 0.3005464480874317,
"grad_norm": 0.13726752996444702,
"learning_rate": 9.856593636426595e-06,
"loss": 1.5522114038467407,
"step": 110
},
{
"epoch": 0.30601092896174864,
"grad_norm": 0.17194727063179016,
"learning_rate": 9.839836751837586e-06,
"loss": 1.5287986993789673,
"step": 112
},
{
"epoch": 0.3114754098360656,
"grad_norm": 0.35292795300483704,
"learning_rate": 9.822171846204474e-06,
"loss": 1.209470510482788,
"step": 114
},
{
"epoch": 0.31693989071038253,
"grad_norm": 0.1626555174589157,
"learning_rate": 9.80360261554326e-06,
"loss": 1.511365532875061,
"step": 116
},
{
"epoch": 0.3224043715846995,
"grad_norm": 0.12288705259561539,
"learning_rate": 9.784132945081237e-06,
"loss": 1.5239923000335693,
"step": 118
},
{
"epoch": 0.32786885245901637,
"grad_norm": 2.931826114654541,
"learning_rate": 9.7637669084441e-06,
"loss": 1.7835521697998047,
"step": 120
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.3326365649700165,
"learning_rate": 9.74250876680362e-06,
"loss": 1.778532862663269,
"step": 122
},
{
"epoch": 0.33879781420765026,
"grad_norm": 0.1233556792140007,
"learning_rate": 9.720362967986078e-06,
"loss": 1.4904721975326538,
"step": 124
},
{
"epoch": 0.3442622950819672,
"grad_norm": 0.1673988252878189,
"learning_rate": 9.697334145541656e-06,
"loss": 1.4772865772247314,
"step": 126
},
{
"epoch": 0.34972677595628415,
"grad_norm": 0.15732745826244354,
"learning_rate": 9.67342711777496e-06,
"loss": 1.0505764484405518,
"step": 128
},
{
"epoch": 0.3551912568306011,
"grad_norm": 0.16721411049365997,
"learning_rate": 9.64864688673688e-06,
"loss": 1.191750407218933,
"step": 130
},
{
"epoch": 0.36065573770491804,
"grad_norm": 0.32817137241363525,
"learning_rate": 9.622998637178035e-06,
"loss": 1.204863429069519,
"step": 132
},
{
"epoch": 0.366120218579235,
"grad_norm": 0.23698288202285767,
"learning_rate": 9.59648773546395e-06,
"loss": 1.460710048675537,
"step": 134
},
{
"epoch": 0.37158469945355194,
"grad_norm": 0.15079301595687866,
"learning_rate": 9.569119728452268e-06,
"loss": 1.4395004510879517,
"step": 136
},
{
"epoch": 0.3770491803278688,
"grad_norm": 0.15452022850513458,
"learning_rate": 9.54090034233218e-06,
"loss": 1.4421769380569458,
"step": 138
},
{
"epoch": 0.3825136612021858,
"grad_norm": 0.3786206841468811,
"learning_rate": 9.511835481426343e-06,
"loss": 0.9576031565666199,
"step": 140
},
{
"epoch": 0.3879781420765027,
"grad_norm": 0.19373159110546112,
"learning_rate": 9.481931226955514e-06,
"loss": 0.9874456524848938,
"step": 142
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.17608597874641418,
"learning_rate": 9.451193835766189e-06,
"loss": 0.9980979561805725,
"step": 144
},
{
"epoch": 0.3989071038251366,
"grad_norm": 0.42837488651275635,
"learning_rate": 9.419629739021479e-06,
"loss": 0.8302822709083557,
"step": 146
},
{
"epoch": 0.40437158469945356,
"grad_norm": 0.17534486949443817,
"learning_rate": 9.387245540855535e-06,
"loss": 1.4698011875152588,
"step": 148
},
{
"epoch": 0.4098360655737705,
"grad_norm": 0.16244103014469147,
"learning_rate": 9.354048016991752e-06,
"loss": 1.3908226490020752,
"step": 150
},
{
"epoch": 0.41530054644808745,
"grad_norm": 0.12504081428050995,
"learning_rate": 9.320044113325108e-06,
"loss": 1.4349119663238525,
"step": 152
},
{
"epoch": 0.4207650273224044,
"grad_norm": 0.12712658941745758,
"learning_rate": 9.285240944468863e-06,
"loss": 1.3961336612701416,
"step": 154
},
{
"epoch": 0.4262295081967213,
"grad_norm": 0.1524275839328766,
"learning_rate": 9.249645792265982e-06,
"loss": 1.4216071367263794,
"step": 156
},
{
"epoch": 0.43169398907103823,
"grad_norm": 0.5565000772476196,
"learning_rate": 9.213266104265559e-06,
"loss": 1.1973795890808105,
"step": 158
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.30147379636764526,
"learning_rate": 9.17610949216457e-06,
"loss": 1.4227999448776245,
"step": 160
},
{
"epoch": 0.4426229508196721,
"grad_norm": 0.2992405891418457,
"learning_rate": 9.138183730215288e-06,
"loss": 0.9599087238311768,
"step": 162
},
{
"epoch": 0.44808743169398907,
"grad_norm": 0.1224522739648819,
"learning_rate": 9.09949675359867e-06,
"loss": 1.4286303520202637,
"step": 164
},
{
"epoch": 0.453551912568306,
"grad_norm": 0.45538783073425293,
"learning_rate": 9.060056656764092e-06,
"loss": 0.9792911410331726,
"step": 166
},
{
"epoch": 0.45901639344262296,
"grad_norm": 0.09756561368703842,
"learning_rate": 9.01987169173576e-06,
"loss": 1.4167883396148682,
"step": 168
},
{
"epoch": 0.4644808743169399,
"grad_norm": 0.1794954091310501,
"learning_rate": 8.978950266386132e-06,
"loss": 1.681199312210083,
"step": 170
},
{
"epoch": 0.46994535519125685,
"grad_norm": 0.2935521900653839,
"learning_rate": 8.937300942676752e-06,
"loss": 0.9957009553909302,
"step": 172
},
{
"epoch": 0.47540983606557374,
"grad_norm": 0.24504221975803375,
"learning_rate": 8.89493243486683e-06,
"loss": 1.4166711568832397,
"step": 174
},
{
"epoch": 0.4808743169398907,
"grad_norm": 0.11474514752626419,
"learning_rate": 8.851853607689975e-06,
"loss": 1.3909870386123657,
"step": 176
},
{
"epoch": 0.48633879781420764,
"grad_norm": 0.36876991391181946,
"learning_rate": 8.808073474499423e-06,
"loss": 1.5189208984375,
"step": 178
},
{
"epoch": 0.4918032786885246,
"grad_norm": 1.9038840532302856,
"learning_rate": 8.763601195382183e-06,
"loss": 1.1209758520126343,
"step": 180
},
{
"epoch": 0.4972677595628415,
"grad_norm": 0.12000768631696701,
"learning_rate": 8.718446075242476e-06,
"loss": 1.4405888319015503,
"step": 182
},
{
"epoch": 0.5027322404371585,
"grad_norm": 0.12939400970935822,
"learning_rate": 8.672617561854881e-06,
"loss": 1.3677761554718018,
"step": 184
},
{
"epoch": 0.5081967213114754,
"grad_norm": 0.12435033172369003,
"learning_rate": 8.626125243887578e-06,
"loss": 1.3820520639419556,
"step": 186
},
{
"epoch": 0.5136612021857924,
"grad_norm": 0.4554789662361145,
"learning_rate": 8.578978848896125e-06,
"loss": 0.576648473739624,
"step": 188
},
{
"epoch": 0.5191256830601093,
"grad_norm": 0.1811702996492386,
"learning_rate": 8.531188241288158e-06,
"loss": 1.3497576713562012,
"step": 190
},
{
"epoch": 0.5245901639344263,
"grad_norm": 0.2503994107246399,
"learning_rate": 8.48276342025948e-06,
"loss": 1.3807411193847656,
"step": 192
},
{
"epoch": 0.5300546448087432,
"grad_norm": 0.14375697076320648,
"learning_rate": 8.43371451770192e-06,
"loss": 1.364574670791626,
"step": 194
},
{
"epoch": 0.5355191256830601,
"grad_norm": 0.22804562747478485,
"learning_rate": 8.384051796083464e-06,
"loss": 1.0098246335983276,
"step": 196
},
{
"epoch": 0.5409836065573771,
"grad_norm": 0.1258535385131836,
"learning_rate": 8.333785646301032e-06,
"loss": 1.1203646659851074,
"step": 198
},
{
"epoch": 0.546448087431694,
"grad_norm": 0.1543567180633545,
"learning_rate": 8.2829265855064e-06,
"loss": 1.1332200765609741,
"step": 200
},
{
"epoch": 0.5519125683060109,
"grad_norm": 0.16096453368663788,
"learning_rate": 8.231485254905705e-06,
"loss": 1.101601243019104,
"step": 202
},
{
"epoch": 0.5573770491803278,
"grad_norm": 0.11994829773902893,
"learning_rate": 8.179472417533e-06,
"loss": 1.336147427558899,
"step": 204
},
{
"epoch": 0.5628415300546448,
"grad_norm": 0.2874319553375244,
"learning_rate": 8.126898955998294e-06,
"loss": 1.3753966093063354,
"step": 206
},
{
"epoch": 0.5683060109289617,
"grad_norm": 0.23725414276123047,
"learning_rate": 8.073775870210613e-06,
"loss": 1.1924476623535156,
"step": 208
},
{
"epoch": 0.5737704918032787,
"grad_norm": 0.147891104221344,
"learning_rate": 8.020114275076475e-06,
"loss": 1.4420928955078125,
"step": 210
},
{
"epoch": 0.5792349726775956,
"grad_norm": 0.3109690546989441,
"learning_rate": 7.965925398174345e-06,
"loss": 1.3464856147766113,
"step": 212
},
{
"epoch": 0.5846994535519126,
"grad_norm": 0.12849116325378418,
"learning_rate": 7.911220577405485e-06,
"loss": 1.4423210620880127,
"step": 214
},
{
"epoch": 0.5901639344262295,
"grad_norm": 0.10997510701417923,
"learning_rate": 7.856011258621733e-06,
"loss": 1.3494584560394287,
"step": 216
},
{
"epoch": 0.5956284153005464,
"grad_norm": 0.14700964093208313,
"learning_rate": 7.800308993230703e-06,
"loss": 1.2998048067092896,
"step": 218
},
{
"epoch": 0.6010928961748634,
"grad_norm": 0.1137683093547821,
"learning_rate": 7.744125435778877e-06,
"loss": 1.3147251605987549,
"step": 220
},
{
"epoch": 0.6065573770491803,
"grad_norm": 0.13836072385311127,
"learning_rate": 7.687472341513158e-06,
"loss": 0.8411474227905273,
"step": 222
},
{
"epoch": 0.6120218579234973,
"grad_norm": 0.16769063472747803,
"learning_rate": 7.6303615639213e-06,
"loss": 1.3210705518722534,
"step": 224
},
{
"epoch": 0.6174863387978142,
"grad_norm": 0.3754584789276123,
"learning_rate": 7.572805052251824e-06,
"loss": 1.3517686128616333,
"step": 226
},
{
"epoch": 0.6229508196721312,
"grad_norm": 0.49983546137809753,
"learning_rate": 7.5148148490138875e-06,
"loss": 0.8512200713157654,
"step": 228
},
{
"epoch": 0.6284153005464481,
"grad_norm": 0.14400480687618256,
"learning_rate": 7.456403087457637e-06,
"loss": 1.3087819814682007,
"step": 230
},
{
"epoch": 0.6338797814207651,
"grad_norm": 0.14718657732009888,
"learning_rate": 7.397581989035567e-06,
"loss": 1.3561756610870361,
"step": 232
},
{
"epoch": 0.639344262295082,
"grad_norm": 0.10177802294492722,
"learning_rate": 7.3383638608454495e-06,
"loss": 1.0540159940719604,
"step": 234
},
{
"epoch": 0.644808743169399,
"grad_norm": 0.20562835037708282,
"learning_rate": 7.278761093055316e-06,
"loss": 1.6283648014068604,
"step": 236
},
{
"epoch": 0.6502732240437158,
"grad_norm": 0.14201946556568146,
"learning_rate": 7.218786156311075e-06,
"loss": 1.327020287513733,
"step": 238
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.13667063415050507,
"learning_rate": 7.1584515991272926e-06,
"loss": 1.2042888402938843,
"step": 240
},
{
"epoch": 0.6612021857923497,
"grad_norm": 0.15532344579696655,
"learning_rate": 7.097770045261667e-06,
"loss": 0.7934575080871582,
"step": 242
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.2028380036354065,
"learning_rate": 7.036754191073771e-06,
"loss": 1.4401774406433105,
"step": 244
},
{
"epoch": 0.6721311475409836,
"grad_norm": 0.3311957120895386,
"learning_rate": 6.975416802868608e-06,
"loss": 1.2962630987167358,
"step": 246
},
{
"epoch": 0.6775956284153005,
"grad_norm": 0.1708596646785736,
"learning_rate": 6.9137707142255175e-06,
"loss": 1.3066641092300415,
"step": 248
},
{
"epoch": 0.6830601092896175,
"grad_norm": 0.12483019381761551,
"learning_rate": 6.8518288233130204e-06,
"loss": 0.4209732711315155,
"step": 250
},
{
"epoch": 0.6885245901639344,
"grad_norm": 0.1456981897354126,
"learning_rate": 6.789604090190138e-06,
"loss": 1.3618273735046387,
"step": 252
},
{
"epoch": 0.6939890710382514,
"grad_norm": 0.11124172061681747,
"learning_rate": 6.72710953409478e-06,
"loss": 1.3271857500076294,
"step": 254
},
{
"epoch": 0.6994535519125683,
"grad_norm": 0.17787770926952362,
"learning_rate": 6.664358230719721e-06,
"loss": 1.0543129444122314,
"step": 256
},
{
"epoch": 0.7049180327868853,
"grad_norm": 0.15766650438308716,
"learning_rate": 6.601363309476797e-06,
"loss": 1.3333730697631836,
"step": 258
},
{
"epoch": 0.7103825136612022,
"grad_norm": 0.15368694067001343,
"learning_rate": 6.53813795074984e-06,
"loss": 1.1295578479766846,
"step": 260
},
{
"epoch": 0.7158469945355191,
"grad_norm": 0.20204530656337738,
"learning_rate": 6.47469538313696e-06,
"loss": 1.3260233402252197,
"step": 262
},
{
"epoch": 0.7213114754098361,
"grad_norm": 0.202342689037323,
"learning_rate": 6.411048880682727e-06,
"loss": 1.2969331741333008,
"step": 264
},
{
"epoch": 0.726775956284153,
"grad_norm": 0.16061684489250183,
"learning_rate": 6.347211760100863e-06,
"loss": 0.8417341709136963,
"step": 266
},
{
"epoch": 0.73224043715847,
"grad_norm": 0.2238452136516571,
"learning_rate": 6.283197377987984e-06,
"loss": 1.3142722845077515,
"step": 268
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.2547805905342102,
"learning_rate": 6.219019128029017e-06,
"loss": 1.3463314771652222,
"step": 270
},
{
"epoch": 0.7431693989071039,
"grad_norm": 0.2618471682071686,
"learning_rate": 6.154690438194845e-06,
"loss": 1.3877606391906738,
"step": 272
},
{
"epoch": 0.7486338797814208,
"grad_norm": 1.0897411108016968,
"learning_rate": 6.090224767932778e-06,
"loss": 1.3851262331008911,
"step": 274
},
{
"epoch": 0.7540983606557377,
"grad_norm": 0.28123998641967773,
"learning_rate": 6.025635605350451e-06,
"loss": 1.3963508605957031,
"step": 276
},
{
"epoch": 0.7595628415300546,
"grad_norm": 0.1303873360157013,
"learning_rate": 5.960936464393712e-06,
"loss": 0.9244080781936646,
"step": 278
},
{
"epoch": 0.7650273224043715,
"grad_norm": 0.41428306698799133,
"learning_rate": 5.896140882019106e-06,
"loss": 1.3301913738250732,
"step": 280
},
{
"epoch": 0.7704918032786885,
"grad_norm": 0.11913089454174042,
"learning_rate": 5.831262415361546e-06,
"loss": 1.2311517000198364,
"step": 282
},
{
"epoch": 0.7759562841530054,
"grad_norm": 0.12226550281047821,
"learning_rate": 5.7663146388977746e-06,
"loss": 1.3435078859329224,
"step": 284
},
{
"epoch": 0.7814207650273224,
"grad_norm": 0.17994911968708038,
"learning_rate": 5.701311141606169e-06,
"loss": 1.374443769454956,
"step": 286
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.6281813979148865,
"learning_rate": 5.636265524123543e-06,
"loss": 0.7908742427825928,
"step": 288
},
{
"epoch": 0.7923497267759563,
"grad_norm": 0.16089321672916412,
"learning_rate": 5.571191395899484e-06,
"loss": 0.8485991358757019,
"step": 290
},
{
"epoch": 0.7978142076502732,
"grad_norm": 0.1296064853668213,
"learning_rate": 5.506102372348866e-06,
"loss": 1.0751264095306396,
"step": 292
},
{
"epoch": 0.8032786885245902,
"grad_norm": 0.36786502599716187,
"learning_rate": 5.441012072003098e-06,
"loss": 1.3324687480926514,
"step": 294
},
{
"epoch": 0.8087431693989071,
"grad_norm": 0.12958282232284546,
"learning_rate": 5.375934113660736e-06,
"loss": 1.3289207220077515,
"step": 296
},
{
"epoch": 0.8142076502732241,
"grad_norm": 0.1548251509666443,
"learning_rate": 5.310882113538023e-06,
"loss": 1.3217027187347412,
"step": 298
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.1366654634475708,
"learning_rate": 5.245869682419986e-06,
"loss": 0.9328991174697876,
"step": 300
},
{
"epoch": 0.825136612021858,
"grad_norm": 0.18621976673603058,
"learning_rate": 5.180910422812654e-06,
"loss": 1.3296598196029663,
"step": 302
},
{
"epoch": 0.8306010928961749,
"grad_norm": 0.2505550980567932,
"learning_rate": 5.116017926097018e-06,
"loss": 1.1915266513824463,
"step": 304
},
{
"epoch": 0.8360655737704918,
"grad_norm": 0.23381677269935608,
"learning_rate": 5.051205769685313e-06,
"loss": 1.3220409154891968,
"step": 306
},
{
"epoch": 0.8415300546448088,
"grad_norm": 0.15024042129516602,
"learning_rate": 4.986487514180234e-06,
"loss": 1.357560634613037,
"step": 308
},
{
"epoch": 0.8469945355191257,
"grad_norm": 0.11693745851516724,
"learning_rate": 4.9218767005376466e-06,
"loss": 1.306337594985962,
"step": 310
},
{
"epoch": 0.8524590163934426,
"grad_norm": 0.17347416281700134,
"learning_rate": 4.857386847233436e-06,
"loss": 1.3176946640014648,
"step": 312
},
{
"epoch": 0.8579234972677595,
"grad_norm": 0.2934313714504242,
"learning_rate": 4.7930314474350335e-06,
"loss": 1.3381907939910889,
"step": 314
},
{
"epoch": 0.8633879781420765,
"grad_norm": 0.12957894802093506,
"learning_rate": 4.728823966178264e-06,
"loss": 1.3620543479919434,
"step": 316
},
{
"epoch": 0.8688524590163934,
"grad_norm": 0.7882974743843079,
"learning_rate": 4.664777837550049e-06,
"loss": 0.7972342371940613,
"step": 318
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.16149283945560455,
"learning_rate": 4.600906461877621e-06,
"loss": 1.301277756690979,
"step": 320
},
{
"epoch": 0.8797814207650273,
"grad_norm": 0.14908155798912048,
"learning_rate": 4.537223202924762e-06,
"loss": 0.8377884030342102,
"step": 322
},
{
"epoch": 0.8852459016393442,
"grad_norm": 0.1579539179801941,
"learning_rate": 4.47374138509573e-06,
"loss": 1.5736957788467407,
"step": 324
},
{
"epoch": 0.8907103825136612,
"grad_norm": 0.34149059653282166,
"learning_rate": 4.410474290647395e-06,
"loss": 1.3020200729370117,
"step": 326
},
{
"epoch": 0.8961748633879781,
"grad_norm": 0.2213527411222458,
"learning_rate": 4.3474351569102006e-06,
"loss": 1.2246307134628296,
"step": 328
},
{
"epoch": 0.9016393442622951,
"grad_norm": 0.22809267044067383,
"learning_rate": 4.284637173518528e-06,
"loss": 1.3429615497589111,
"step": 330
},
{
"epoch": 0.907103825136612,
"grad_norm": 0.2011166512966156,
"learning_rate": 4.222093479651039e-06,
"loss": 1.3419407606124878,
"step": 332
},
{
"epoch": 0.912568306010929,
"grad_norm": 0.3965287208557129,
"learning_rate": 4.159817161281578e-06,
"loss": 1.5418695211410522,
"step": 334
},
{
"epoch": 0.9180327868852459,
"grad_norm": 0.35886895656585693,
"learning_rate": 4.09782124844119e-06,
"loss": 1.285651445388794,
"step": 336
},
{
"epoch": 0.9234972677595629,
"grad_norm": 0.201736718416214,
"learning_rate": 4.036118712491872e-06,
"loss": 1.352068543434143,
"step": 338
},
{
"epoch": 0.9289617486338798,
"grad_norm": 0.9024937748908997,
"learning_rate": 3.9747224634125754e-06,
"loss": 0.9026221632957458,
"step": 340
},
{
"epoch": 0.9344262295081968,
"grad_norm": 0.25523102283477783,
"learning_rate": 3.913645347098064e-06,
"loss": 1.2583445310592651,
"step": 342
},
{
"epoch": 0.9398907103825137,
"grad_norm": 0.3619498908519745,
"learning_rate": 3.852900142671163e-06,
"loss": 0.7962759137153625,
"step": 344
},
{
"epoch": 0.9453551912568307,
"grad_norm": 0.23090143501758575,
"learning_rate": 3.7924995598090176e-06,
"loss": 1.310127854347229,
"step": 346
},
{
"epoch": 0.9508196721311475,
"grad_norm": 0.29544466733932495,
"learning_rate": 3.7324562360838303e-06,
"loss": 1.0680677890777588,
"step": 348
},
{
"epoch": 0.9562841530054644,
"grad_norm": 0.13714782893657684,
"learning_rate": 3.6727827343187284e-06,
"loss": 1.2995237112045288,
"step": 350
},
{
"epoch": 0.9617486338797814,
"grad_norm": 0.28946390748023987,
"learning_rate": 3.6134915399592374e-06,
"loss": 1.4416049718856812,
"step": 352
},
{
"epoch": 0.9672131147540983,
"grad_norm": 0.17794133722782135,
"learning_rate": 3.554595058460979e-06,
"loss": 0.7796456813812256,
"step": 354
},
{
"epoch": 0.9726775956284153,
"grad_norm": 0.08328505605459213,
"learning_rate": 3.496105612694068e-06,
"loss": 0.7880629897117615,
"step": 356
},
{
"epoch": 0.9781420765027322,
"grad_norm": 0.34119200706481934,
"learning_rate": 3.4380354403648185e-06,
"loss": 1.2842247486114502,
"step": 358
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.1201978251338005,
"learning_rate": 3.3803966914552487e-06,
"loss": 0.9038922190666199,
"step": 360
},
{
"epoch": 0.9890710382513661,
"grad_norm": 0.142914816737175,
"learning_rate": 3.3232014256809626e-06,
"loss": 1.283494234085083,
"step": 362
},
{
"epoch": 0.994535519125683,
"grad_norm": 0.3413692116737366,
"learning_rate": 3.266461609967879e-06,
"loss": 1.4224170446395874,
"step": 364
},
{
"epoch": 1.0,
"grad_norm": 0.16403569281101227,
"learning_rate": 3.2101891159484293e-06,
"loss": 1.3757880926132202,
"step": 366
},
{
"epoch": 1.005464480874317,
"grad_norm": 0.1281866729259491,
"learning_rate": 3.154395717477643e-06,
"loss": 1.1197513341903687,
"step": 368
},
{
"epoch": 1.010928961748634,
"grad_norm": 0.14042557775974274,
"learning_rate": 3.099093088169719e-06,
"loss": 1.4380804300308228,
"step": 370
},
{
"epoch": 1.0163934426229508,
"grad_norm": 0.23390677571296692,
"learning_rate": 3.0442927989555694e-06,
"loss": 1.370072364807129,
"step": 372
},
{
"epoch": 1.0218579234972678,
"grad_norm": 0.16064327955245972,
"learning_rate": 2.990006315661825e-06,
"loss": 1.291727900505066,
"step": 374
},
{
"epoch": 1.0273224043715847,
"grad_norm": 0.13082079589366913,
"learning_rate": 2.9362449966118688e-06,
"loss": 1.284584403038025,
"step": 376
},
{
"epoch": 1.0327868852459017,
"grad_norm": 0.12635229527950287,
"learning_rate": 2.883020090249329e-06,
"loss": 1.274359107017517,
"step": 378
},
{
"epoch": 1.0382513661202186,
"grad_norm": 0.24426782131195068,
"learning_rate": 2.830342732784575e-06,
"loss": 1.2423336505889893,
"step": 380
},
{
"epoch": 1.0437158469945356,
"grad_norm": 0.1784898191690445,
"learning_rate": 2.7782239458647028e-06,
"loss": 0.8372669816017151,
"step": 382
},
{
"epoch": 1.0491803278688525,
"grad_norm": 0.20933738350868225,
"learning_rate": 2.7266746342674767e-06,
"loss": 1.3317946195602417,
"step": 384
},
{
"epoch": 1.0546448087431695,
"grad_norm": 0.1455197036266327,
"learning_rate": 2.6757055836197343e-06,
"loss": 1.3095340728759766,
"step": 386
},
{
"epoch": 1.0601092896174864,
"grad_norm": 2.36281156539917,
"learning_rate": 2.6253274581407146e-06,
"loss": 1.6835390329360962,
"step": 388
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.11096840351819992,
"learning_rate": 2.575550798410792e-06,
"loss": 1.0233980417251587,
"step": 390
},
{
"epoch": 1.0710382513661203,
"grad_norm": 0.11589275300502777,
"learning_rate": 2.526386019166081e-06,
"loss": 0.7074400186538696,
"step": 392
},
{
"epoch": 1.0765027322404372,
"grad_norm": 0.15736694633960724,
"learning_rate": 2.477843407119361e-06,
"loss": 1.3130176067352295,
"step": 394
},
{
"epoch": 1.0819672131147542,
"grad_norm": 0.10321754217147827,
"learning_rate": 2.4299331188078016e-06,
"loss": 1.2882981300354004,
"step": 396
},
{
"epoch": 1.0874316939890711,
"grad_norm": 0.15577873587608337,
"learning_rate": 2.382665178467911e-06,
"loss": 1.2673397064208984,
"step": 398
},
{
"epoch": 1.092896174863388,
"grad_norm": 0.16171161830425262,
"learning_rate": 2.3360494759381785e-06,
"loss": 1.3687695264816284,
"step": 400
},
{
"epoch": 1.098360655737705,
"grad_norm": 0.12707988917827606,
"learning_rate": 2.2900957645898257e-06,
"loss": 1.2529113292694092,
"step": 402
},
{
"epoch": 1.1038251366120218,
"grad_norm": 0.1637069433927536,
"learning_rate": 2.2448136592861165e-06,
"loss": 1.2688065767288208,
"step": 404
},
{
"epoch": 1.1092896174863387,
"grad_norm": 0.1506628394126892,
"learning_rate": 2.2002126343706455e-06,
"loss": 1.2624497413635254,
"step": 406
},
{
"epoch": 1.1147540983606556,
"grad_norm": 0.08956722170114517,
"learning_rate": 2.1563020216850345e-06,
"loss": 1.525004267692566,
"step": 408
},
{
"epoch": 1.1202185792349726,
"grad_norm": 0.1432586908340454,
"learning_rate": 2.1130910086164203e-06,
"loss": 1.2763861417770386,
"step": 410
},
{
"epoch": 1.1256830601092895,
"grad_norm": 0.1573595106601715,
"learning_rate": 2.070588636175207e-06,
"loss": 1.283130168914795,
"step": 412
},
{
"epoch": 1.1311475409836065,
"grad_norm": 0.16168393194675446,
"learning_rate": 2.028803797103403e-06,
"loss": 0.7431031465530396,
"step": 414
},
{
"epoch": 1.1366120218579234,
"grad_norm": 0.14463603496551514,
"learning_rate": 1.987745234014006e-06,
"loss": 1.5580140352249146,
"step": 416
},
{
"epoch": 1.1420765027322404,
"grad_norm": 0.1040046215057373,
"learning_rate": 1.947421537561798e-06,
"loss": 1.3747049570083618,
"step": 418
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.1442074328660965,
"learning_rate": 1.9078411446459125e-06,
"loss": 1.2827320098876953,
"step": 420
},
{
"epoch": 1.1530054644808743,
"grad_norm": 0.20063269138336182,
"learning_rate": 1.8690123366446072e-06,
"loss": 1.3030825853347778,
"step": 422
},
{
"epoch": 1.1584699453551912,
"grad_norm": 0.23416046798229218,
"learning_rate": 1.8309432376825431e-06,
"loss": 1.270554542541504,
"step": 424
},
{
"epoch": 1.1639344262295082,
"grad_norm": 0.21537384390830994,
"learning_rate": 1.7936418129309819e-06,
"loss": 0.8575707674026489,
"step": 426
},
{
"epoch": 1.169398907103825,
"grad_norm": 0.23633311688899994,
"learning_rate": 1.7571158669412457e-06,
"loss": 1.3536752462387085,
"step": 428
},
{
"epoch": 1.174863387978142,
"grad_norm": 0.1539081335067749,
"learning_rate": 1.721373042011768e-06,
"loss": 1.2999320030212402,
"step": 430
},
{
"epoch": 1.180327868852459,
"grad_norm": 0.17259754240512848,
"learning_rate": 1.6864208165891072e-06,
"loss": 1.3465114831924438,
"step": 432
},
{
"epoch": 1.185792349726776,
"grad_norm": 0.209925577044487,
"learning_rate": 1.6522665037032281e-06,
"loss": 1.3036139011383057,
"step": 434
},
{
"epoch": 1.1912568306010929,
"grad_norm": 0.2237657904624939,
"learning_rate": 1.6189172494374109e-06,
"loss": 0.657602071762085,
"step": 436
},
{
"epoch": 1.1967213114754098,
"grad_norm": 0.11950195580720901,
"learning_rate": 1.5863800314330774e-06,
"loss": 0.6512199640274048,
"step": 438
},
{
"epoch": 1.2021857923497268,
"grad_norm": 0.18348389863967896,
"learning_rate": 1.5546616574298631e-06,
"loss": 1.179289698600769,
"step": 440
},
{
"epoch": 1.2076502732240437,
"grad_norm": 0.17645582556724548,
"learning_rate": 1.5237687638412348e-06,
"loss": 1.5611987113952637,
"step": 442
},
{
"epoch": 1.2131147540983607,
"grad_norm": 0.530087947845459,
"learning_rate": 1.4937078143659705e-06,
"loss": 1.4589874744415283,
"step": 444
},
{
"epoch": 1.2185792349726776,
"grad_norm": 0.25132983922958374,
"learning_rate": 1.4644850986357493e-06,
"loss": 0.7513068914413452,
"step": 446
},
{
"epoch": 1.2240437158469946,
"grad_norm": 0.15347470343112946,
"learning_rate": 1.4361067308991885e-06,
"loss": 1.1420836448669434,
"step": 448
},
{
"epoch": 1.2295081967213115,
"grad_norm": 0.12291496247053146,
"learning_rate": 1.4085786487425546e-06,
"loss": 1.2970597743988037,
"step": 450
},
{
"epoch": 1.2349726775956285,
"grad_norm": 0.48593783378601074,
"learning_rate": 1.3819066118474497e-06,
"loss": 0.8086254596710205,
"step": 452
},
{
"epoch": 1.2404371584699454,
"grad_norm": 0.14925441145896912,
"learning_rate": 1.3560962007857217e-06,
"loss": 1.3039979934692383,
"step": 454
},
{
"epoch": 1.2459016393442623,
"grad_norm": 0.11459294706583023,
"learning_rate": 1.3311528158518374e-06,
"loss": 1.3036037683486938,
"step": 456
},
{
"epoch": 1.2513661202185793,
"grad_norm": 0.1934930980205536,
"learning_rate": 1.3070816759329848e-06,
"loss": 0.7336347699165344,
"step": 458
},
{
"epoch": 1.2568306010928962,
"grad_norm": 0.18466132879257202,
"learning_rate": 1.2838878174171352e-06,
"loss": 1.3072516918182373,
"step": 460
},
{
"epoch": 1.2622950819672132,
"grad_norm": 0.14232006669044495,
"learning_rate": 1.2615760931392729e-06,
"loss": 1.2593871355056763,
"step": 462
},
{
"epoch": 1.2677595628415301,
"grad_norm": 0.6463603973388672,
"learning_rate": 1.2401511713660493e-06,
"loss": 0.9677884578704834,
"step": 464
},
{
"epoch": 1.273224043715847,
"grad_norm": 0.14358870685100555,
"learning_rate": 1.2196175348190402e-06,
"loss": 1.3230767250061035,
"step": 466
},
{
"epoch": 1.278688524590164,
"grad_norm": 0.4848594069480896,
"learning_rate": 1.1999794797368252e-06,
"loss": 0.5629112720489502,
"step": 468
},
{
"epoch": 1.2841530054644807,
"grad_norm": 0.45330166816711426,
"learning_rate": 1.1812411149760988e-06,
"loss": 0.5099667906761169,
"step": 470
},
{
"epoch": 1.289617486338798,
"grad_norm": 0.24476909637451172,
"learning_rate": 1.1634063611519677e-06,
"loss": 1.178823709487915,
"step": 472
},
{
"epoch": 1.2950819672131146,
"grad_norm": 1.0615447759628296,
"learning_rate": 1.1464789498176495e-06,
"loss": 1.0447105169296265,
"step": 474
},
{
"epoch": 1.3005464480874318,
"grad_norm": 0.15947242081165314,
"learning_rate": 1.1304624226837166e-06,
"loss": 1.2712408304214478,
"step": 476
},
{
"epoch": 1.3060109289617485,
"grad_norm": 0.2022111713886261,
"learning_rate": 1.1153601308770712e-06,
"loss": 1.2936254739761353,
"step": 478
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.3169823884963989,
"learning_rate": 1.1011752342397915e-06,
"loss": 1.396268367767334,
"step": 480
},
{
"epoch": 1.3169398907103824,
"grad_norm": 0.2586687207221985,
"learning_rate": 1.0879107006679971e-06,
"loss": 1.287591576576233,
"step": 482
},
{
"epoch": 1.3224043715846996,
"grad_norm": 0.15245981514453888,
"learning_rate": 1.0755693054908775e-06,
"loss": 1.2732950448989868,
"step": 484
},
{
"epoch": 1.3278688524590163,
"grad_norm": 0.15711617469787598,
"learning_rate": 1.0641536308900174e-06,
"loss": 0.24730153381824493,
"step": 486
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.24083568155765533,
"learning_rate": 1.053666065359123e-06,
"loss": 1.2901641130447388,
"step": 488
},
{
"epoch": 1.3387978142076502,
"grad_norm": 0.16232901811599731,
"learning_rate": 1.0441088032042823e-06,
"loss": 1.2359588146209717,
"step": 490
},
{
"epoch": 1.3442622950819672,
"grad_norm": 0.15927782654762268,
"learning_rate": 1.0354838440848502e-06,
"loss": 1.2897329330444336,
"step": 492
},
{
"epoch": 1.349726775956284,
"grad_norm": 0.41561204195022583,
"learning_rate": 1.0277929925950617e-06,
"loss": 0.7826656103134155,
"step": 494
},
{
"epoch": 1.355191256830601,
"grad_norm": 0.10889182239770889,
"learning_rate": 1.0210378578864586e-06,
"loss": 1.286065697669983,
"step": 496
},
{
"epoch": 1.360655737704918,
"grad_norm": 0.12036287039518356,
"learning_rate": 1.0152198533312078e-06,
"loss": 1.0256068706512451,
"step": 498
},
{
"epoch": 1.366120218579235,
"grad_norm": 0.12828181684017181,
"learning_rate": 1.0103401962263806e-06,
"loss": 1.2886908054351807,
"step": 500
},
{
"epoch": 1.3715846994535519,
"grad_norm": 0.12206350266933441,
"learning_rate": 1.0063999075392606e-06,
"loss": 1.3998095989227295,
"step": 502
},
{
"epoch": 1.3770491803278688,
"grad_norm": 0.3745965361595154,
"learning_rate": 1.0033998116937264e-06,
"loss": 1.2694718837738037,
"step": 504
},
{
"epoch": 1.3825136612021858,
"grad_norm": 0.12047137320041656,
"learning_rate": 1.0013405363977588e-06,
"loss": 0.9076957702636719,
"step": 506
},
{
"epoch": 1.3879781420765027,
"grad_norm": 0.12304738163948059,
"learning_rate": 1.000222512512104e-06,
"loss": 0.835605263710022,
"step": 508
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.1796962320804596,
"learning_rate": 1.000045973960127e-06,
"loss": 1.3057663440704346,
"step": 510
},
{
"epoch": 1.3989071038251366,
"grad_norm": 0.19033008813858032,
"learning_rate": 1.0008109576788677e-06,
"loss": 1.3245333433151245,
"step": 512
},
{
"epoch": 1.4043715846994536,
"grad_norm": 0.2826150059700012,
"learning_rate": 1.0025173036113104e-06,
"loss": 0.7605847120285034,
"step": 514
},
{
"epoch": 1.4098360655737705,
"grad_norm": 0.35649386048316956,
"learning_rate": 1.0051646547398755e-06,
"loss": 1.6634979248046875,
"step": 516
},
{
"epoch": 1.4153005464480874,
"grad_norm": 0.21099106967449188,
"learning_rate": 1.008752457161116e-06,
"loss": 0.9581305980682373,
"step": 518
},
{
"epoch": 1.4207650273224044,
"grad_norm": 0.09999340772628784,
"learning_rate": 1.013279960201611e-06,
"loss": 0.928807258605957,
"step": 520
},
{
"epoch": 1.4262295081967213,
"grad_norm": 0.1980668604373932,
"learning_rate": 1.0187462165750289e-06,
"loss": 0.7656407952308655,
"step": 522
},
{
"epoch": 1.4316939890710383,
"grad_norm": 0.21324089169502258,
"learning_rate": 1.0251500825803257e-06,
"loss": 0.9017817378044128,
"step": 524
},
{
"epoch": 1.4371584699453552,
"grad_norm": 0.21465957164764404,
"learning_rate": 1.0324902183410438e-06,
"loss": 1.1432918310165405,
"step": 526
},
{
"epoch": 1.4426229508196722,
"grad_norm": 0.1391940414905548,
"learning_rate": 1.0407650880856522e-06,
"loss": 1.2729235887527466,
"step": 528
},
{
"epoch": 1.4480874316939891,
"grad_norm": 0.11870556324720383,
"learning_rate": 1.0499729604688718e-06,
"loss": 0.879092276096344,
"step": 530
},
{
"epoch": 1.453551912568306,
"grad_norm": 0.13376283645629883,
"learning_rate": 1.0601119089339282e-06,
"loss": 1.3114432096481323,
"step": 532
},
{
"epoch": 1.459016393442623,
"grad_norm": 0.21610385179519653,
"learning_rate": 1.0711798121156419e-06,
"loss": 1.3001433610916138,
"step": 534
},
{
"epoch": 1.46448087431694,
"grad_norm": 0.1800413727760315,
"learning_rate": 1.0831743542842801e-06,
"loss": 1.276673436164856,
"step": 536
},
{
"epoch": 1.469945355191257,
"grad_norm": 0.1097259372472763,
"learning_rate": 1.0960930258300747e-06,
"loss": 0.7756356000900269,
"step": 538
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.2578687369823456,
"learning_rate": 1.109933123788306e-06,
"loss": 1.098586082458496,
"step": 540
},
{
"epoch": 1.4808743169398908,
"grad_norm": 0.5190820097923279,
"learning_rate": 1.1246917524048448e-06,
"loss": 1.3340952396392822,
"step": 542
},
{
"epoch": 1.4863387978142075,
"grad_norm": 0.13077673316001892,
"learning_rate": 1.1403658237420264e-06,
"loss": 1.283229112625122,
"step": 544
},
{
"epoch": 1.4918032786885247,
"grad_norm": 0.25998857617378235,
"learning_rate": 1.1569520583247379e-06,
"loss": 0.990090548992157,
"step": 546
},
{
"epoch": 1.4972677595628414,
"grad_norm": 0.28979551792144775,
"learning_rate": 1.1744469858265805e-06,
"loss": 1.1415678262710571,
"step": 548
},
{
"epoch": 1.5027322404371586,
"grad_norm": 0.11428312957286835,
"learning_rate": 1.1928469457959649e-06,
"loss": 1.297635555267334,
"step": 550
},
{
"epoch": 1.5081967213114753,
"grad_norm": 0.43478986620903015,
"learning_rate": 1.2121480884219837e-06,
"loss": 0.9222514033317566,
"step": 552
},
{
"epoch": 1.5136612021857925,
"grad_norm": 0.14648959040641785,
"learning_rate": 1.2323463753399065e-06,
"loss": 1.262050986289978,
"step": 554
},
{
"epoch": 1.5191256830601092,
"grad_norm": 0.1419551521539688,
"learning_rate": 1.25343758047612e-06,
"loss": 0.8583607077598572,
"step": 556
},
{
"epoch": 1.5245901639344264,
"grad_norm": 0.11025940626859665,
"learning_rate": 1.2754172909323507e-06,
"loss": 1.295128345489502,
"step": 558
},
{
"epoch": 1.530054644808743,
"grad_norm": 0.14832042157649994,
"learning_rate": 1.2982809079089729e-06,
"loss": 0.47225067019462585,
"step": 560
},
{
"epoch": 1.5355191256830603,
"grad_norm": 0.11338595300912857,
"learning_rate": 1.3220236476672071e-06,
"loss": 1.2546048164367676,
"step": 562
},
{
"epoch": 1.540983606557377,
"grad_norm": 0.17865526676177979,
"learning_rate": 1.3466405425300252e-06,
"loss": 0.7668097019195557,
"step": 564
},
{
"epoch": 1.5464480874316942,
"grad_norm": 0.17245270311832428,
"learning_rate": 1.3721264419215266e-06,
"loss": 1.3768823146820068,
"step": 566
},
{
"epoch": 1.5519125683060109,
"grad_norm": 0.13957710564136505,
"learning_rate": 1.3984760134445984e-06,
"loss": 1.3080743551254272,
"step": 568
},
{
"epoch": 1.5573770491803278,
"grad_norm": 0.15658064186573029,
"learning_rate": 1.4256837439966024e-06,
"loss": 1.2786517143249512,
"step": 570
},
{
"epoch": 1.5628415300546448,
"grad_norm": 0.3794512152671814,
"learning_rate": 1.4537439409228783e-06,
"loss": 0.7999430894851685,
"step": 572
},
{
"epoch": 1.5683060109289617,
"grad_norm": 1.6785387992858887,
"learning_rate": 1.4826507332078171e-06,
"loss": 1.122337818145752,
"step": 574
},
{
"epoch": 1.5737704918032787,
"grad_norm": 0.12103781849145889,
"learning_rate": 1.5123980727032505e-06,
"loss": 1.2726154327392578,
"step": 576
},
{
"epoch": 1.5792349726775956,
"grad_norm": 0.1678047478199005,
"learning_rate": 1.5429797353938934e-06,
"loss": 1.2915924787521362,
"step": 578
},
{
"epoch": 1.5846994535519126,
"grad_norm": 0.17610567808151245,
"learning_rate": 1.5743893226995987e-06,
"loss": 1.2801222801208496,
"step": 580
},
{
"epoch": 1.5901639344262295,
"grad_norm": 0.15169797837734222,
"learning_rate": 1.6066202628141209e-06,
"loss": 1.1805187463760376,
"step": 582
},
{
"epoch": 1.5956284153005464,
"grad_norm": 0.34766384959220886,
"learning_rate": 1.6396658120801397e-06,
"loss": 0.7313578128814697,
"step": 584
},
{
"epoch": 1.6010928961748634,
"grad_norm": 0.21868975460529327,
"learning_rate": 1.6735190564002212e-06,
"loss": 1.2763304710388184,
"step": 586
},
{
"epoch": 1.6065573770491803,
"grad_norm": 0.14653269946575165,
"learning_rate": 1.7081729126834518e-06,
"loss": 1.393769383430481,
"step": 588
},
{
"epoch": 1.6120218579234973,
"grad_norm": 0.1569858342409134,
"learning_rate": 1.7436201303274325e-06,
"loss": 1.56880784034729,
"step": 590
},
{
"epoch": 1.6174863387978142,
"grad_norm": 0.16917826235294342,
"learning_rate": 1.7798532927353185e-06,
"loss": 1.2501717805862427,
"step": 592
},
{
"epoch": 1.6229508196721312,
"grad_norm": 0.12139948457479477,
"learning_rate": 1.8168648188675788e-06,
"loss": 1.2535643577575684,
"step": 594
},
{
"epoch": 1.6284153005464481,
"grad_norm": 0.13256296515464783,
"learning_rate": 1.8546469648281828e-06,
"loss": 1.2605574131011963,
"step": 596
},
{
"epoch": 1.633879781420765,
"grad_norm": 0.17139297723770142,
"learning_rate": 1.8931918254848425e-06,
"loss": 1.4653030633926392,
"step": 598
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.5706589221954346,
"learning_rate": 1.932491336123003e-06,
"loss": 1.168147325515747,
"step": 600
},
{
"epoch": 1.644808743169399,
"grad_norm": 0.1335100382566452,
"learning_rate": 1.9725372741332193e-06,
"loss": 1.581040620803833,
"step": 602
},
{
"epoch": 1.650273224043716,
"grad_norm": 0.12017619609832764,
"learning_rate": 2.013321260731555e-06,
"loss": 1.2702972888946533,
"step": 604
},
{
"epoch": 1.6557377049180326,
"grad_norm": 0.16135352849960327,
"learning_rate": 2.0548347627126854e-06,
"loss": 1.3027235269546509,
"step": 606
},
{
"epoch": 1.6612021857923498,
"grad_norm": 0.15468773245811462,
"learning_rate": 2.0970690942352783e-06,
"loss": 1.2904077768325806,
"step": 608
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.15808522701263428,
"learning_rate": 2.1400154186393448e-06,
"loss": 1.3017288446426392,
"step": 610
},
{
"epoch": 1.6721311475409837,
"grad_norm": 0.19535169005393982,
"learning_rate": 2.1836647502951103e-06,
"loss": 0.7248042225837708,
"step": 612
},
{
"epoch": 1.6775956284153004,
"grad_norm": 0.1245855838060379,
"learning_rate": 2.2280079564830796e-06,
"loss": 1.1138478517532349,
"step": 614
},
{
"epoch": 1.6830601092896176,
"grad_norm": 0.2644435167312622,
"learning_rate": 2.27303575930487e-06,
"loss": 0.7883511185646057,
"step": 616
},
{
"epoch": 1.6885245901639343,
"grad_norm": 0.16241517663002014,
"learning_rate": 2.3187387376244214e-06,
"loss": 1.2666425704956055,
"step": 618
},
{
"epoch": 1.6939890710382515,
"grad_norm": 0.12214373052120209,
"learning_rate": 2.365107329039164e-06,
"loss": 1.2682583332061768,
"step": 620
},
{
"epoch": 1.6994535519125682,
"grad_norm": 0.13607722520828247,
"learning_rate": 2.412131831880767e-06,
"loss": 1.1618129014968872,
"step": 622
},
{
"epoch": 1.7049180327868854,
"grad_norm": 0.5881685614585876,
"learning_rate": 2.4598024072449967e-06,
"loss": 0.7930600047111511,
"step": 624
},
{
"epoch": 1.710382513661202,
"grad_norm": 0.22248564660549164,
"learning_rate": 2.508109081050322e-06,
"loss": 1.3016993999481201,
"step": 626
},
{
"epoch": 1.7158469945355193,
"grad_norm": 0.12585905194282532,
"learning_rate": 2.55704174612477e-06,
"loss": 1.2601913213729858,
"step": 628
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.4879342317581177,
"learning_rate": 2.6065901643206427e-06,
"loss": 0.836104154586792,
"step": 630
},
{
"epoch": 1.7267759562841531,
"grad_norm": 0.1584855616092682,
"learning_rate": 2.656743968656652e-06,
"loss": 1.105610728263855,
"step": 632
},
{
"epoch": 1.7322404371584699,
"grad_norm": 0.3009145259857178,
"learning_rate": 2.7074926654869752e-06,
"loss": 1.3899565935134888,
"step": 634
},
{
"epoch": 1.737704918032787,
"grad_norm": 0.18539711833000183,
"learning_rate": 2.7588256366968553e-06,
"loss": 1.1802400350570679,
"step": 636
},
{
"epoch": 1.7431693989071038,
"grad_norm": 0.18684296309947968,
"learning_rate": 2.810732141924202e-06,
"loss": 1.2819732427597046,
"step": 638
},
{
"epoch": 1.748633879781421,
"grad_norm": 0.13436821103096008,
"learning_rate": 2.8632013208067977e-06,
"loss": 1.241382360458374,
"step": 640
},
{
"epoch": 1.7540983606557377,
"grad_norm": 0.19662614166736603,
"learning_rate": 2.9162221952546167e-06,
"loss": 0.8158561587333679,
"step": 642
},
{
"epoch": 1.7595628415300546,
"grad_norm": 0.8080497980117798,
"learning_rate": 2.9697836717467367e-06,
"loss": 0.5533775091171265,
"step": 644
},
{
"epoch": 1.7650273224043715,
"grad_norm": 0.18492548167705536,
"learning_rate": 3.023874543652457e-06,
"loss": 1.3162730932235718,
"step": 646
},
{
"epoch": 1.7704918032786885,
"grad_norm": 0.13424889743328094,
"learning_rate": 3.07848349357603e-06,
"loss": 1.1636426448822021,
"step": 648
},
{
"epoch": 1.7759562841530054,
"grad_norm": 0.2516067922115326,
"learning_rate": 3.1335990957246042e-06,
"loss": 1.043823003768921,
"step": 650
},
{
"epoch": 1.7814207650273224,
"grad_norm": 0.49497902393341064,
"learning_rate": 3.189209818298837e-06,
"loss": 0.8271763920783997,
"step": 652
},
{
"epoch": 1.7868852459016393,
"grad_norm": 0.20021268725395203,
"learning_rate": 3.245304025905684e-06,
"loss": 1.3043122291564941,
"step": 654
},
{
"epoch": 1.7923497267759563,
"grad_norm": 0.14751023054122925,
"learning_rate": 3.3018699819928534e-06,
"loss": 1.266282081604004,
"step": 656
},
{
"epoch": 1.7978142076502732,
"grad_norm": 0.5914691686630249,
"learning_rate": 3.358895851304446e-06,
"loss": 1.0464860200881958,
"step": 658
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.20123934745788574,
"learning_rate": 3.4163697023572455e-06,
"loss": 1.2859833240509033,
"step": 660
},
{
"epoch": 1.8087431693989071,
"grad_norm": 0.13834412395954132,
"learning_rate": 3.474279509937113e-06,
"loss": 1.0676109790802002,
"step": 662
},
{
"epoch": 1.814207650273224,
"grad_norm": 0.23103635013103485,
"learning_rate": 3.5326131576150404e-06,
"loss": 0.8037886023521423,
"step": 664
},
{
"epoch": 1.819672131147541,
"grad_norm": 0.18413887917995453,
"learning_rate": 3.5913584402822323e-06,
"loss": 1.2824831008911133,
"step": 666
},
{
"epoch": 1.825136612021858,
"grad_norm": 0.12944690883159637,
"learning_rate": 3.650503066703807e-06,
"loss": 1.2873767614364624,
"step": 668
},
{
"epoch": 1.830601092896175,
"grad_norm": 0.1951185166835785,
"learning_rate": 3.71003466209045e-06,
"loss": 1.2321735620498657,
"step": 670
},
{
"epoch": 1.8360655737704918,
"grad_norm": 0.32999613881111145,
"learning_rate": 3.7699407706876057e-06,
"loss": 1.2794973850250244,
"step": 672
},
{
"epoch": 1.8415300546448088,
"grad_norm": 0.4653666913509369,
"learning_rate": 3.830208858381575e-06,
"loss": 0.9594930410385132,
"step": 674
},
{
"epoch": 1.8469945355191257,
"grad_norm": 0.13141269981861115,
"learning_rate": 3.890826315322031e-06,
"loss": 1.2942712306976318,
"step": 676
},
{
"epoch": 1.8524590163934427,
"grad_norm": 0.1315234899520874,
"learning_rate": 3.951780458560347e-06,
"loss": 1.280110478401184,
"step": 678
},
{
"epoch": 1.8579234972677594,
"grad_norm": 0.15407615900039673,
"learning_rate": 4.0130585347032655e-06,
"loss": 1.3106074333190918,
"step": 680
},
{
"epoch": 1.8633879781420766,
"grad_norm": 0.1454591304063797,
"learning_rate": 4.074647722581262e-06,
"loss": 1.3081083297729492,
"step": 682
},
{
"epoch": 1.8688524590163933,
"grad_norm": 0.16022169589996338,
"learning_rate": 4.136535135931128e-06,
"loss": 0.9257675409317017,
"step": 684
},
{
"epoch": 1.8743169398907105,
"grad_norm": 0.23520433902740479,
"learning_rate": 4.198707826092146e-06,
"loss": 0.8019899725914001,
"step": 686
},
{
"epoch": 1.8797814207650272,
"grad_norm": 0.14440514147281647,
"learning_rate": 4.261152784715319e-06,
"loss": 1.1938315629959106,
"step": 688
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.1498691886663437,
"learning_rate": 4.323856946485116e-06,
"loss": 0.9831755757331848,
"step": 690
},
{
"epoch": 1.890710382513661,
"grad_norm": 0.17087140679359436,
"learning_rate": 4.3868071918531015e-06,
"loss": 1.2909032106399536,
"step": 692
},
{
"epoch": 1.8961748633879782,
"grad_norm": 0.5026926398277283,
"learning_rate": 4.449990349782951e-06,
"loss": 0.9881775975227356,
"step": 694
},
{
"epoch": 1.901639344262295,
"grad_norm": 0.1403045505285263,
"learning_rate": 4.513393200506203e-06,
"loss": 1.2623631954193115,
"step": 696
},
{
"epoch": 1.9071038251366121,
"grad_norm": 0.12408439815044403,
"learning_rate": 4.5770024782882185e-06,
"loss": 1.5725769996643066,
"step": 698
},
{
"epoch": 1.9125683060109289,
"grad_norm": 0.12653514742851257,
"learning_rate": 4.640804874203767e-06,
"loss": 1.2823572158813477,
"step": 700
},
{
"epoch": 1.918032786885246,
"grad_norm": 0.13838998973369598,
"learning_rate": 4.704787038921654e-06,
"loss": 1.2183997631072998,
"step": 702
},
{
"epoch": 1.9234972677595628,
"grad_norm": 0.12682996690273285,
"learning_rate": 4.7689355854977725e-06,
"loss": 1.2953612804412842,
"step": 704
},
{
"epoch": 1.92896174863388,
"grad_norm": 0.25785738229751587,
"learning_rate": 4.833237092176063e-06,
"loss": 1.2848292589187622,
"step": 706
},
{
"epoch": 1.9344262295081966,
"grad_norm": 0.14749932289123535,
"learning_rate": 4.89767810519672e-06,
"loss": 1.222738265991211,
"step": 708
},
{
"epoch": 1.9398907103825138,
"grad_norm": 0.3948100507259369,
"learning_rate": 4.962245141611136e-06,
"loss": 1.483533501625061,
"step": 710
},
{
"epoch": 1.9453551912568305,
"grad_norm": 0.11861786246299744,
"learning_rate": 5.026924692102901e-06,
"loss": 1.313765287399292,
"step": 712
},
{
"epoch": 1.9508196721311475,
"grad_norm": 0.3464006781578064,
"learning_rate": 5.091703223814366e-06,
"loss": 1.000444769859314,
"step": 714
},
{
"epoch": 1.9562841530054644,
"grad_norm": 0.16236136853694916,
"learning_rate": 5.156567183178118e-06,
"loss": 1.3515253067016602,
"step": 716
},
{
"epoch": 1.9617486338797814,
"grad_norm": 0.14432047307491302,
"learning_rate": 5.221502998752774e-06,
"loss": 0.7439847588539124,
"step": 718
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.18786761164665222,
"learning_rate": 5.286497084062517e-06,
"loss": 0.7950851917266846,
"step": 720
},
{
"epoch": 1.9726775956284153,
"grad_norm": 0.35462602972984314,
"learning_rate": 5.351535840439799e-06,
"loss": 1.1589150428771973,
"step": 722
},
{
"epoch": 1.9781420765027322,
"grad_norm": 0.2843046188354492,
"learning_rate": 5.41660565987057e-06,
"loss": 1.2756617069244385,
"step": 724
},
{
"epoch": 1.9836065573770492,
"grad_norm": 1.1716567277908325,
"learning_rate": 5.481692927841487e-06,
"loss": 0.7865095138549805,
"step": 726
},
{
"epoch": 1.989071038251366,
"grad_norm": 0.18071912229061127,
"learning_rate": 5.546784026188465e-06,
"loss": 1.0641995668411255,
"step": 728
},
{
"epoch": 1.994535519125683,
"grad_norm": 0.12297914922237396,
"learning_rate": 5.611865335945977e-06,
"loss": 1.2688380479812622,
"step": 730
},
{
"epoch": 2.0,
"grad_norm": 0.20381133258342743,
"learning_rate": 5.676923240196566e-06,
"loss": 1.2503268718719482,
"step": 732
},
{
"epoch": 2.0054644808743167,
"grad_norm": 0.17776302993297577,
"learning_rate": 5.741944126919885e-06,
"loss": 1.0253487825393677,
"step": 734
},
{
"epoch": 2.010928961748634,
"grad_norm": 0.11820948868989944,
"learning_rate": 5.806914391840748e-06,
"loss": 1.2554274797439575,
"step": 736
},
{
"epoch": 2.0163934426229506,
"grad_norm": 0.14874251186847687,
"learning_rate": 5.871820441275534e-06,
"loss": 1.2443053722381592,
"step": 738
},
{
"epoch": 2.021857923497268,
"grad_norm": 0.3678041398525238,
"learning_rate": 5.936648694976378e-06,
"loss": 1.2668925523757935,
"step": 740
},
{
"epoch": 2.0273224043715845,
"grad_norm": 0.18291163444519043,
"learning_rate": 6.001385588972557e-06,
"loss": 1.2606185674667358,
"step": 742
},
{
"epoch": 2.0327868852459017,
"grad_norm": 0.3447842001914978,
"learning_rate": 6.066017578408494e-06,
"loss": 1.5479013919830322,
"step": 744
},
{
"epoch": 2.0382513661202184,
"grad_norm": 0.14480522274971008,
"learning_rate": 6.130531140377701e-06,
"loss": 1.5561115741729736,
"step": 746
},
{
"epoch": 2.0437158469945356,
"grad_norm": 0.1427302062511444,
"learning_rate": 6.194912776752203e-06,
"loss": 1.2826040983200073,
"step": 748
},
{
"epoch": 2.0491803278688523,
"grad_norm": 0.2801133096218109,
"learning_rate": 6.259149017006711e-06,
"loss": 1.2386608123779297,
"step": 750
},
{
"epoch": 2.0546448087431695,
"grad_norm": 0.14323477447032928,
"learning_rate": 6.323226421037082e-06,
"loss": 1.2862393856048584,
"step": 752
},
{
"epoch": 2.060109289617486,
"grad_norm": 0.1799071878194809,
"learning_rate": 6.387131581972354e-06,
"loss": 0.9528294205665588,
"step": 754
},
{
"epoch": 2.0655737704918034,
"grad_norm": 0.28468427062034607,
"learning_rate": 6.450851128979868e-06,
"loss": 0.3149171471595764,
"step": 756
},
{
"epoch": 2.07103825136612,
"grad_norm": 0.15873941779136658,
"learning_rate": 6.5143717300628486e-06,
"loss": 1.224860668182373,
"step": 758
},
{
"epoch": 2.0765027322404372,
"grad_norm": 0.1589648723602295,
"learning_rate": 6.577680094849836e-06,
"loss": 1.094529628753662,
"step": 760
},
{
"epoch": 2.081967213114754,
"grad_norm": 0.17585955560207367,
"learning_rate": 6.64076297737542e-06,
"loss": 1.2662822008132935,
"step": 762
},
{
"epoch": 2.087431693989071,
"grad_norm": 0.2207726389169693,
"learning_rate": 6.703607178851683e-06,
"loss": 1.2135547399520874,
"step": 764
},
{
"epoch": 2.092896174863388,
"grad_norm": 0.2255478799343109,
"learning_rate": 6.766199550429794e-06,
"loss": 1.2652828693389893,
"step": 766
},
{
"epoch": 2.098360655737705,
"grad_norm": 0.19602195918560028,
"learning_rate": 6.828526995951102e-06,
"loss": 1.2324138879776,
"step": 768
},
{
"epoch": 2.1038251366120218,
"grad_norm": 0.12533804774284363,
"learning_rate": 6.890576474687263e-06,
"loss": 1.2735555171966553,
"step": 770
},
{
"epoch": 2.109289617486339,
"grad_norm": 0.3546103835105896,
"learning_rate": 6.9523350040687255e-06,
"loss": 1.1258909702301025,
"step": 772
},
{
"epoch": 2.1147540983606556,
"grad_norm": 0.16880619525909424,
"learning_rate": 7.013789662401067e-06,
"loss": 1.1692241430282593,
"step": 774
},
{
"epoch": 2.120218579234973,
"grad_norm": 0.48631495237350464,
"learning_rate": 7.074927591568607e-06,
"loss": 0.6158280968666077,
"step": 776
},
{
"epoch": 2.1256830601092895,
"grad_norm": 0.2696533501148224,
"learning_rate": 7.135735999724676e-06,
"loss": 1.3194879293441772,
"step": 778
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.15946127474308014,
"learning_rate": 7.196202163968057e-06,
"loss": 0.6818429827690125,
"step": 780
},
{
"epoch": 2.1366120218579234,
"grad_norm": 0.19602926075458527,
"learning_rate": 7.256313433005011e-06,
"loss": 0.9832362532615662,
"step": 782
},
{
"epoch": 2.1420765027322406,
"grad_norm": 0.3311297297477722,
"learning_rate": 7.3160572297962574e-06,
"loss": 0.9352646470069885,
"step": 784
},
{
"epoch": 2.1475409836065573,
"grad_norm": 0.344480961561203,
"learning_rate": 7.375421054188479e-06,
"loss": 0.7446603775024414,
"step": 786
},
{
"epoch": 2.1530054644808745,
"grad_norm": 0.38910025358200073,
"learning_rate": 7.43439248552974e-06,
"loss": 0.7190406322479248,
"step": 788
},
{
"epoch": 2.158469945355191,
"grad_norm": 0.17629633843898773,
"learning_rate": 7.492959185268212e-06,
"loss": 1.2873114347457886,
"step": 790
},
{
"epoch": 2.1639344262295084,
"grad_norm": 0.22042587399482727,
"learning_rate": 7.5511088995337855e-06,
"loss": 1.2835544347763062,
"step": 792
},
{
"epoch": 2.169398907103825,
"grad_norm": 0.2709881663322449,
"learning_rate": 7.608829461701935e-06,
"loss": 1.0393563508987427,
"step": 794
},
{
"epoch": 2.1748633879781423,
"grad_norm": 0.2166048139333725,
"learning_rate": 7.666108794939333e-06,
"loss": 0.7807997465133667,
"step": 796
},
{
"epoch": 2.180327868852459,
"grad_norm": 0.13187411427497864,
"learning_rate": 7.722934914730682e-06,
"loss": 1.015637755393982,
"step": 798
},
{
"epoch": 2.185792349726776,
"grad_norm": 0.18242332339286804,
"learning_rate": 7.779295931386199e-06,
"loss": 1.2419551610946655,
"step": 800
},
{
"epoch": 2.191256830601093,
"grad_norm": 0.28827011585235596,
"learning_rate": 7.835180052529337e-06,
"loss": 1.3786053657531738,
"step": 802
},
{
"epoch": 2.19672131147541,
"grad_norm": 0.1471245139837265,
"learning_rate": 7.89057558556405e-06,
"loss": 1.6328306198120117,
"step": 804
},
{
"epoch": 2.202185792349727,
"grad_norm": 0.1748630255460739,
"learning_rate": 7.945470940121241e-06,
"loss": 1.2367662191390991,
"step": 806
},
{
"epoch": 2.2076502732240435,
"grad_norm": 0.18178026378154755,
"learning_rate": 7.999854630483825e-06,
"loss": 1.2759439945220947,
"step": 808
},
{
"epoch": 2.2131147540983607,
"grad_norm": 0.12333730608224869,
"learning_rate": 8.053715277989854e-06,
"loss": 1.1919105052947998,
"step": 810
},
{
"epoch": 2.2185792349726774,
"grad_norm": 0.15479306876659393,
"learning_rate": 8.107041613413265e-06,
"loss": 1.266588568687439,
"step": 812
},
{
"epoch": 2.2240437158469946,
"grad_norm": 0.4253162741661072,
"learning_rate": 8.15982247932177e-06,
"loss": 0.8012547492980957,
"step": 814
},
{
"epoch": 2.2295081967213113,
"grad_norm": 0.1888294517993927,
"learning_rate": 8.212046832411267e-06,
"loss": 1.3143601417541504,
"step": 816
},
{
"epoch": 2.2349726775956285,
"grad_norm": 0.28106892108917236,
"learning_rate": 8.263703745816452e-06,
"loss": 0.8637550473213196,
"step": 818
},
{
"epoch": 2.240437158469945,
"grad_norm": 0.22309575974941254,
"learning_rate": 8.314782411397043e-06,
"loss": 1.2210556268692017,
"step": 820
},
{
"epoch": 2.2459016393442623,
"grad_norm": 0.21936748921871185,
"learning_rate": 8.365272141999137e-06,
"loss": 1.2734110355377197,
"step": 822
},
{
"epoch": 2.251366120218579,
"grad_norm": 0.16116738319396973,
"learning_rate": 8.415162373691298e-06,
"loss": 1.2576873302459717,
"step": 824
},
{
"epoch": 2.2568306010928962,
"grad_norm": 0.13302792608737946,
"learning_rate": 8.46444266797483e-06,
"loss": 1.274696707725525,
"step": 826
},
{
"epoch": 2.262295081967213,
"grad_norm": 1.1599286794662476,
"learning_rate": 8.513102713967824e-06,
"loss": 0.8628762364387512,
"step": 828
},
{
"epoch": 2.26775956284153,
"grad_norm": 0.14883320033550262,
"learning_rate": 8.56113233056248e-06,
"loss": 1.2841596603393555,
"step": 830
},
{
"epoch": 2.273224043715847,
"grad_norm": 0.14526048302650452,
"learning_rate": 8.608521468555326e-06,
"loss": 0.8904628753662109,
"step": 832
},
{
"epoch": 2.278688524590164,
"grad_norm": 0.1946345716714859,
"learning_rate": 8.655260212749764e-06,
"loss": 1.0389220714569092,
"step": 834
},
{
"epoch": 2.2841530054644807,
"grad_norm": 0.13552814722061157,
"learning_rate": 8.701338784030653e-06,
"loss": 1.2453982830047607,
"step": 836
},
{
"epoch": 2.289617486338798,
"grad_norm": 0.14240728318691254,
"learning_rate": 8.746747541410367e-06,
"loss": 1.2398008108139038,
"step": 838
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.1497233361005783,
"learning_rate": 8.791476984045993e-06,
"loss": 0.6871110200881958,
"step": 840
},
{
"epoch": 2.300546448087432,
"grad_norm": 0.16756655275821686,
"learning_rate": 8.835517753227154e-06,
"loss": 1.0810468196868896,
"step": 842
},
{
"epoch": 2.3060109289617485,
"grad_norm": 0.12103111296892166,
"learning_rate": 8.878860634334156e-06,
"loss": 0.7038662433624268,
"step": 844
},
{
"epoch": 2.3114754098360657,
"grad_norm": 0.12554596364498138,
"learning_rate": 8.921496558765938e-06,
"loss": 1.332837462425232,
"step": 846
},
{
"epoch": 2.3169398907103824,
"grad_norm": 0.15018968284130096,
"learning_rate": 8.963416605837507e-06,
"loss": 1.2201088666915894,
"step": 848
},
{
"epoch": 2.3224043715846996,
"grad_norm": 0.22610308229923248,
"learning_rate": 9.004612004646382e-06,
"loss": 1.2539888620376587,
"step": 850
},
{
"epoch": 2.3278688524590163,
"grad_norm": 0.2273012101650238,
"learning_rate": 9.045074135907744e-06,
"loss": 1.2355636358261108,
"step": 852
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.16146086156368256,
"learning_rate": 9.084794533757836e-06,
"loss": 0.8059526681900024,
"step": 854
},
{
"epoch": 2.33879781420765,
"grad_norm": 0.22058957815170288,
"learning_rate": 9.123764887525265e-06,
"loss": 1.0137945413589478,
"step": 856
},
{
"epoch": 2.3442622950819674,
"grad_norm": 0.3819839358329773,
"learning_rate": 9.161977043469848e-06,
"loss": 1.217822551727295,
"step": 858
},
{
"epoch": 2.349726775956284,
"grad_norm": 0.17834967374801636,
"learning_rate": 9.199423006488608e-06,
"loss": 0.9274788498878479,
"step": 860
},
{
"epoch": 2.3551912568306013,
"grad_norm": 1.3659512996673584,
"learning_rate": 9.236094941788585e-06,
"loss": 1.1104533672332764,
"step": 862
},
{
"epoch": 2.360655737704918,
"grad_norm": 0.160284161567688,
"learning_rate": 9.271985176526095e-06,
"loss": 1.2397129535675049,
"step": 864
},
{
"epoch": 2.366120218579235,
"grad_norm": 0.15545697510242462,
"learning_rate": 9.30708620141214e-06,
"loss": 0.7818785309791565,
"step": 866
},
{
"epoch": 2.371584699453552,
"grad_norm": 0.3776909410953522,
"learning_rate": 9.341390672283538e-06,
"loss": 1.5543640851974487,
"step": 868
},
{
"epoch": 2.3770491803278686,
"grad_norm": 0.28088635206222534,
"learning_rate": 9.374891411639553e-06,
"loss": 1.241223692893982,
"step": 870
},
{
"epoch": 2.3825136612021858,
"grad_norm": 0.4876302182674408,
"learning_rate": 9.407581410143652e-06,
"loss": 0.7597371935844421,
"step": 872
},
{
"epoch": 2.387978142076503,
"grad_norm": 0.29702451825141907,
"learning_rate": 9.439453828090041e-06,
"loss": 1.347943663597107,
"step": 874
},
{
"epoch": 2.3934426229508197,
"grad_norm": 0.11804826557636261,
"learning_rate": 9.470501996834735e-06,
"loss": 1.2856158018112183,
"step": 876
},
{
"epoch": 2.3989071038251364,
"grad_norm": 0.28625768423080444,
"learning_rate": 9.500719420190852e-06,
"loss": 0.8894764184951782,
"step": 878
},
{
"epoch": 2.4043715846994536,
"grad_norm": 0.3130474090576172,
"learning_rate": 9.530099775787786e-06,
"loss": 1.2336865663528442,
"step": 880
},
{
"epoch": 2.4098360655737707,
"grad_norm": 0.2206021100282669,
"learning_rate": 9.558636916394043e-06,
"loss": 1.2526772022247314,
"step": 882
},
{
"epoch": 2.4153005464480874,
"grad_norm": 0.16443626582622528,
"learning_rate": 9.586324871203418e-06,
"loss": 1.2226040363311768,
"step": 884
},
{
"epoch": 2.420765027322404,
"grad_norm": 0.589164674282074,
"learning_rate": 9.613157847084261e-06,
"loss": 1.2555053234100342,
"step": 886
},
{
"epoch": 2.4262295081967213,
"grad_norm": 0.4435969293117523,
"learning_rate": 9.639130229791576e-06,
"loss": 1.2479197978973389,
"step": 888
},
{
"epoch": 2.431693989071038,
"grad_norm": 0.421928733587265,
"learning_rate": 9.664236585141678e-06,
"loss": 0.876587450504303,
"step": 890
},
{
"epoch": 2.4371584699453552,
"grad_norm": 0.2530652582645416,
"learning_rate": 9.68847166014919e-06,
"loss": 1.592029094696045,
"step": 892
},
{
"epoch": 2.442622950819672,
"grad_norm": 0.2849081754684448,
"learning_rate": 9.711830384126119e-06,
"loss": 1.1860142946243286,
"step": 894
},
{
"epoch": 2.448087431693989,
"grad_norm": 0.16560609638690948,
"learning_rate": 9.734307869742788e-06,
"loss": 0.9020236730575562,
"step": 896
},
{
"epoch": 2.453551912568306,
"grad_norm": 0.22628025710582733,
"learning_rate": 9.755899414050425e-06,
"loss": 0.46257585287094116,
"step": 898
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.14033140242099762,
"learning_rate": 9.77660049946513e-06,
"loss": 1.2592275142669678,
"step": 900
},
{
"epoch": 2.4644808743169397,
"grad_norm": 0.16893093287944794,
"learning_rate": 9.796406794713113e-06,
"loss": 0.5414184331893921,
"step": 902
},
{
"epoch": 2.469945355191257,
"grad_norm": 0.16352859139442444,
"learning_rate": 9.815314155736906e-06,
"loss": 1.5088986158370972,
"step": 904
},
{
"epoch": 2.4754098360655736,
"grad_norm": 0.7746697664260864,
"learning_rate": 9.833318626562433e-06,
"loss": 0.6330769062042236,
"step": 906
},
{
"epoch": 2.480874316939891,
"grad_norm": 0.4806567132472992,
"learning_rate": 9.850416440126704e-06,
"loss": 0.7819712162017822,
"step": 908
},
{
"epoch": 2.4863387978142075,
"grad_norm": 0.1313900500535965,
"learning_rate": 9.866604019066004e-06,
"loss": 1.2398015260696411,
"step": 910
},
{
"epoch": 2.4918032786885247,
"grad_norm": 0.2975386381149292,
"learning_rate": 9.881877976464384e-06,
"loss": 0.7635778784751892,
"step": 912
},
{
"epoch": 2.4972677595628414,
"grad_norm": 0.19941112399101257,
"learning_rate": 9.896235116562287e-06,
"loss": 1.298919916152954,
"step": 914
},
{
"epoch": 2.5027322404371586,
"grad_norm": 0.2005673348903656,
"learning_rate": 9.909672435425221e-06,
"loss": 1.25544273853302,
"step": 916
},
{
"epoch": 2.5081967213114753,
"grad_norm": 0.17507828772068024,
"learning_rate": 9.922187121572241e-06,
"loss": 1.1938058137893677,
"step": 918
},
{
"epoch": 2.5136612021857925,
"grad_norm": 0.15096338093280792,
"learning_rate": 9.933776556564219e-06,
"loss": 1.1339476108551025,
"step": 920
},
{
"epoch": 2.519125683060109,
"grad_norm": 0.168264701962471,
"learning_rate": 9.944438315551677e-06,
"loss": 1.2415448427200317,
"step": 922
},
{
"epoch": 2.5245901639344264,
"grad_norm": 0.4377535581588745,
"learning_rate": 9.954170167782156e-06,
"loss": 0.6661954522132874,
"step": 924
},
{
"epoch": 2.530054644808743,
"grad_norm": 0.1548946648836136,
"learning_rate": 9.962970077066938e-06,
"loss": 1.2278692722320557,
"step": 926
},
{
"epoch": 2.5355191256830603,
"grad_norm": 0.45442453026771545,
"learning_rate": 9.970836202207084e-06,
"loss": 1.1313743591308594,
"step": 928
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.27604687213897705,
"learning_rate": 9.977766897378666e-06,
"loss": 0.7971192598342896,
"step": 930
},
{
"epoch": 2.546448087431694,
"grad_norm": 0.33876216411590576,
"learning_rate": 9.98376071247713e-06,
"loss": 0.8196964859962463,
"step": 932
},
{
"epoch": 2.551912568306011,
"grad_norm": 0.2421918511390686,
"learning_rate": 9.98881639342068e-06,
"loss": 1.0490155220031738,
"step": 934
},
{
"epoch": 2.557377049180328,
"grad_norm": 0.32190215587615967,
"learning_rate": 9.992932882412686e-06,
"loss": 1.2149512767791748,
"step": 936
},
{
"epoch": 2.5628415300546448,
"grad_norm": 0.15382827818393707,
"learning_rate": 9.996109318163007e-06,
"loss": 1.242163896560669,
"step": 938
},
{
"epoch": 2.5683060109289615,
"grad_norm": 0.2394203394651413,
"learning_rate": 9.99834503606818e-06,
"loss": 1.2160629034042358,
"step": 940
},
{
"epoch": 2.5737704918032787,
"grad_norm": 0.16462068259716034,
"learning_rate": 9.999639568350495e-06,
"loss": 1.2307050228118896,
"step": 942
},
{
"epoch": 2.579234972677596,
"grad_norm": 0.1651630997657776,
"learning_rate": 9.99999264415586e-06,
"loss": 1.238526701927185,
"step": 944
},
{
"epoch": 2.5846994535519126,
"grad_norm": 0.4005145728588104,
"learning_rate": 9.999404189610464e-06,
"loss": 1.4206143617630005,
"step": 946
},
{
"epoch": 2.5901639344262293,
"grad_norm": 0.12670938670635223,
"learning_rate": 9.997874327836247e-06,
"loss": 1.091995358467102,
"step": 948
},
{
"epoch": 2.5956284153005464,
"grad_norm": 0.7716709971427917,
"learning_rate": 9.995403378925133e-06,
"loss": 0.8635514378547668,
"step": 950
},
{
"epoch": 2.6010928961748636,
"grad_norm": 0.11424434930086136,
"learning_rate": 9.991991859872051e-06,
"loss": 1.2319921255111694,
"step": 952
},
{
"epoch": 2.6065573770491803,
"grad_norm": 0.4743253290653229,
"learning_rate": 9.987640484466776e-06,
"loss": 1.2438640594482422,
"step": 954
},
{
"epoch": 2.612021857923497,
"grad_norm": 0.12911947071552277,
"learning_rate": 9.982350163144578e-06,
"loss": 0.7316928505897522,
"step": 956
},
{
"epoch": 2.6174863387978142,
"grad_norm": 0.1389317363500595,
"learning_rate": 9.976122002795727e-06,
"loss": 0.8430681824684143,
"step": 958
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.5408509969711304,
"learning_rate": 9.96895730653391e-06,
"loss": 1.2399392127990723,
"step": 960
},
{
"epoch": 2.628415300546448,
"grad_norm": 0.7027162313461304,
"learning_rate": 9.960857573423577e-06,
"loss": 1.210518479347229,
"step": 962
},
{
"epoch": 2.633879781420765,
"grad_norm": 0.2593608498573303,
"learning_rate": 9.951824498166288e-06,
"loss": 1.077878475189209,
"step": 964
},
{
"epoch": 2.639344262295082,
"grad_norm": 0.18102502822875977,
"learning_rate": 9.941859970746131e-06,
"loss": 1.21505606174469,
"step": 966
},
{
"epoch": 2.644808743169399,
"grad_norm": 0.15608328580856323,
"learning_rate": 9.930966076034294e-06,
"loss": 1.2278097867965698,
"step": 968
},
{
"epoch": 2.650273224043716,
"grad_norm": 0.14660756289958954,
"learning_rate": 9.91914509335284e-06,
"loss": 1.225424885749817,
"step": 970
},
{
"epoch": 2.6557377049180326,
"grad_norm": 0.4219532608985901,
"learning_rate": 9.906399495997802e-06,
"loss": 1.1617122888565063,
"step": 972
},
{
"epoch": 2.66120218579235,
"grad_norm": 0.1909582018852234,
"learning_rate": 9.892731950721709e-06,
"loss": 1.1968237161636353,
"step": 974
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.36970287561416626,
"learning_rate": 9.878145317175608e-06,
"loss": 1.2798595428466797,
"step": 976
},
{
"epoch": 2.6721311475409837,
"grad_norm": 0.3367147743701935,
"learning_rate": 9.86264264731076e-06,
"loss": 0.5630533695220947,
"step": 978
},
{
"epoch": 2.6775956284153004,
"grad_norm": 0.26080775260925293,
"learning_rate": 9.846227184740064e-06,
"loss": 1.2143992185592651,
"step": 980
},
{
"epoch": 2.6830601092896176,
"grad_norm": 0.5430212020874023,
"learning_rate": 9.828902364059405e-06,
"loss": 1.2745838165283203,
"step": 982
},
{
"epoch": 2.6885245901639343,
"grad_norm": 0.1231357529759407,
"learning_rate": 9.810671810129045e-06,
"loss": 1.2157704830169678,
"step": 984
},
{
"epoch": 2.6939890710382515,
"grad_norm": 0.27228134870529175,
"learning_rate": 9.791539337315177e-06,
"loss": 1.2553633451461792,
"step": 986
},
{
"epoch": 2.699453551912568,
"grad_norm": 0.23602519929409027,
"learning_rate": 9.771508948691868e-06,
"loss": 1.1951829195022583,
"step": 988
},
{
"epoch": 2.7049180327868854,
"grad_norm": 0.3290325105190277,
"learning_rate": 9.750584835203486e-06,
"loss": 1.2601996660232544,
"step": 990
},
{
"epoch": 2.710382513661202,
"grad_norm": 0.14537322521209717,
"learning_rate": 9.728771374787833e-06,
"loss": 1.1963472366333008,
"step": 992
},
{
"epoch": 2.7158469945355193,
"grad_norm": 0.28898876905441284,
"learning_rate": 9.706073131460157e-06,
"loss": 1.1315364837646484,
"step": 994
},
{
"epoch": 2.721311475409836,
"grad_norm": 0.9577022790908813,
"learning_rate": 9.682494854358218e-06,
"loss": 1.2983077764511108,
"step": 996
},
{
"epoch": 2.726775956284153,
"grad_norm": 0.1064349114894867,
"learning_rate": 9.658041476748646e-06,
"loss": 1.2252497673034668,
"step": 998
},
{
"epoch": 2.73224043715847,
"grad_norm": 0.21378850936889648,
"learning_rate": 9.632718114994727e-06,
"loss": 1.2109845876693726,
"step": 1000
},
{
"epoch": 2.737704918032787,
"grad_norm": 0.25137951970100403,
"learning_rate": 9.606530067485943e-06,
"loss": 1.191309928894043,
"step": 1002
},
{
"epoch": 2.7431693989071038,
"grad_norm": 0.219583198428154,
"learning_rate": 9.579482813529375e-06,
"loss": 1.0501433610916138,
"step": 1004
},
{
"epoch": 2.748633879781421,
"grad_norm": 0.4766612648963928,
"learning_rate": 9.551582012203274e-06,
"loss": 1.192663311958313,
"step": 1006
},
{
"epoch": 2.7540983606557377,
"grad_norm": 0.18655823171138763,
"learning_rate": 9.522833501173018e-06,
"loss": 1.385924220085144,
"step": 1008
},
{
"epoch": 2.7595628415300544,
"grad_norm": 0.1709374487400055,
"learning_rate": 9.493243295469702e-06,
"loss": 0.7094982266426086,
"step": 1010
},
{
"epoch": 2.7650273224043715,
"grad_norm": 0.12649549543857574,
"learning_rate": 9.462817586231608e-06,
"loss": 1.231178641319275,
"step": 1012
},
{
"epoch": 2.7704918032786887,
"grad_norm": 1.2715520858764648,
"learning_rate": 9.431562739408857e-06,
"loss": 0.9694324731826782,
"step": 1014
},
{
"epoch": 2.7759562841530054,
"grad_norm": 0.4418940246105194,
"learning_rate": 9.399485294431449e-06,
"loss": 1.2025630474090576,
"step": 1016
},
{
"epoch": 2.781420765027322,
"grad_norm": 1.3623688220977783,
"learning_rate": 9.36659196284102e-06,
"loss": 0.6023474335670471,
"step": 1018
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.4146368205547333,
"learning_rate": 9.332889626886606e-06,
"loss": 1.2228423357009888,
"step": 1020
},
{
"epoch": 2.7923497267759565,
"grad_norm": 0.15537501871585846,
"learning_rate": 9.298385338084671e-06,
"loss": 1.1514747142791748,
"step": 1022
},
{
"epoch": 2.797814207650273,
"grad_norm": 0.19204667210578918,
"learning_rate": 9.263086315743706e-06,
"loss": 1.2309038639068604,
"step": 1024
},
{
"epoch": 2.80327868852459,
"grad_norm": 10.16671371459961,
"learning_rate": 9.22699994545376e-06,
"loss": 1.24494206905365,
"step": 1026
},
{
"epoch": 2.808743169398907,
"grad_norm": 0.2645006775856018,
"learning_rate": 9.190133777541152e-06,
"loss": 1.2163110971450806,
"step": 1028
},
{
"epoch": 2.8142076502732243,
"grad_norm": 0.19391575455665588,
"learning_rate": 9.152495525488717e-06,
"loss": 1.217330813407898,
"step": 1030
},
{
"epoch": 2.819672131147541,
"grad_norm": 0.13392172753810883,
"learning_rate": 9.114093064321905e-06,
"loss": 1.2340772151947021,
"step": 1032
},
{
"epoch": 2.8251366120218577,
"grad_norm": 0.17339184880256653,
"learning_rate": 9.074934428961133e-06,
"loss": 1.3524383306503296,
"step": 1034
},
{
"epoch": 2.830601092896175,
"grad_norm": 0.21963149309158325,
"learning_rate": 9.035027812540615e-06,
"loss": 1.2574824094772339,
"step": 1036
},
{
"epoch": 2.836065573770492,
"grad_norm": 0.29987937211990356,
"learning_rate": 8.994381564694118e-06,
"loss": 0.45878124237060547,
"step": 1038
},
{
"epoch": 2.841530054644809,
"grad_norm": 0.1113235130906105,
"learning_rate": 8.953004189807984e-06,
"loss": 1.2674195766448975,
"step": 1040
},
{
"epoch": 2.8469945355191255,
"grad_norm": 0.1546678990125656,
"learning_rate": 8.910904345241772e-06,
"loss": 1.1786985397338867,
"step": 1042
},
{
"epoch": 2.8524590163934427,
"grad_norm": 0.33827704191207886,
"learning_rate": 8.868090839516855e-06,
"loss": 1.2287678718566895,
"step": 1044
},
{
"epoch": 2.8579234972677594,
"grad_norm": 0.1689954549074173,
"learning_rate": 8.824572630473447e-06,
"loss": 1.286142349243164,
"step": 1046
},
{
"epoch": 2.8633879781420766,
"grad_norm": 0.1907668560743332,
"learning_rate": 8.780358823396356e-06,
"loss": 1.1970845460891724,
"step": 1048
},
{
"epoch": 2.8688524590163933,
"grad_norm": 0.1794430911540985,
"learning_rate": 8.735458669109865e-06,
"loss": 1.1694499254226685,
"step": 1050
},
{
"epoch": 2.8743169398907105,
"grad_norm": 0.1645282357931137,
"learning_rate": 8.689881562042228e-06,
"loss": 0.8635576367378235,
"step": 1052
},
{
"epoch": 2.879781420765027,
"grad_norm": 0.15801328420639038,
"learning_rate": 8.643637038260062e-06,
"loss": 1.215505599975586,
"step": 1054
},
{
"epoch": 2.8852459016393444,
"grad_norm": 0.43582382798194885,
"learning_rate": 8.596734773473108e-06,
"loss": 1.1912931203842163,
"step": 1056
},
{
"epoch": 2.890710382513661,
"grad_norm": 0.13898280262947083,
"learning_rate": 8.549184581009813e-06,
"loss": 0.9503419995307922,
"step": 1058
},
{
"epoch": 2.8961748633879782,
"grad_norm": 0.1446235030889511,
"learning_rate": 8.500996409764083e-06,
"loss": 1.2274373769760132,
"step": 1060
},
{
"epoch": 2.901639344262295,
"grad_norm": 0.15515106916427612,
"learning_rate": 8.452180342113686e-06,
"loss": 0.9633349776268005,
"step": 1062
},
{
"epoch": 2.907103825136612,
"grad_norm": 1.2907086610794067,
"learning_rate": 8.402746591810711e-06,
"loss": 0.7252454161643982,
"step": 1064
},
{
"epoch": 2.912568306010929,
"grad_norm": 0.4740718603134155,
"learning_rate": 8.352705501844572e-06,
"loss": 0.993940532207489,
"step": 1066
},
{
"epoch": 2.918032786885246,
"grad_norm": 0.14726273715496063,
"learning_rate": 8.302067542277931e-06,
"loss": 0.7903440594673157,
"step": 1068
},
{
"epoch": 2.9234972677595628,
"grad_norm": 0.10241622477769852,
"learning_rate": 8.250843308056071e-06,
"loss": 1.4365119934082031,
"step": 1070
},
{
"epoch": 2.92896174863388,
"grad_norm": 0.5861666202545166,
"learning_rate": 8.199043516790119e-06,
"loss": 1.1525179147720337,
"step": 1072
},
{
"epoch": 2.9344262295081966,
"grad_norm": 0.16144782304763794,
"learning_rate": 8.1466790065146e-06,
"loss": 1.4913655519485474,
"step": 1074
},
{
"epoch": 2.939890710382514,
"grad_norm": 0.1444198042154312,
"learning_rate": 8.0937607334198e-06,
"loss": 0.8650710582733154,
"step": 1076
},
{
"epoch": 2.9453551912568305,
"grad_norm": 0.1328548938035965,
"learning_rate": 8.040299769559432e-06,
"loss": 1.1815372705459595,
"step": 1078
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.10948026925325394,
"learning_rate": 7.986307300534018e-06,
"loss": 0.6624533534049988,
"step": 1080
},
{
"epoch": 2.9562841530054644,
"grad_norm": 0.40588322281837463,
"learning_rate": 7.931794623150519e-06,
"loss": 0.34090322256088257,
"step": 1082
},
{
"epoch": 2.9617486338797816,
"grad_norm": 0.15707112848758698,
"learning_rate": 7.87677314305876e-06,
"loss": 1.2241772413253784,
"step": 1084
},
{
"epoch": 2.9672131147540983,
"grad_norm": 0.2559359669685364,
"learning_rate": 7.821254372364994e-06,
"loss": 0.7741996049880981,
"step": 1086
},
{
"epoch": 2.972677595628415,
"grad_norm": 0.37332427501678467,
"learning_rate": 7.765249927223232e-06,
"loss": 1.1781744956970215,
"step": 1088
},
{
"epoch": 2.978142076502732,
"grad_norm": 0.12948814034461975,
"learning_rate": 7.708771525404829e-06,
"loss": 1.4896328449249268,
"step": 1090
},
{
"epoch": 2.9836065573770494,
"grad_norm": 0.3209353983402252,
"learning_rate": 7.651830983846761e-06,
"loss": 1.364580750465393,
"step": 1092
},
{
"epoch": 2.989071038251366,
"grad_norm": 1.060642123222351,
"learning_rate": 7.59444021617919e-06,
"loss": 0.6782019138336182,
"step": 1094
},
{
"epoch": 2.994535519125683,
"grad_norm": 0.10502546280622482,
"learning_rate": 7.536611230232757e-06,
"loss": 1.2083946466445923,
"step": 1096
},
{
"epoch": 3.0,
"grad_norm": 0.17473645508289337,
"learning_rate": 7.478356125526241e-06,
"loss": 0.7375582456588745,
"step": 1098
},
{
"epoch": 3.0054644808743167,
"grad_norm": 0.23671270906925201,
"learning_rate": 7.419687090734916e-06,
"loss": 1.1668708324432373,
"step": 1100
},
{
"epoch": 3.010928961748634,
"grad_norm": 0.3294418454170227,
"learning_rate": 7.360616401140392e-06,
"loss": 0.7690999507904053,
"step": 1102
},
{
"epoch": 3.0163934426229506,
"grad_norm": 0.4549873173236847,
"learning_rate": 7.301156416062226e-06,
"loss": 0.955780029296875,
"step": 1104
},
{
"epoch": 3.021857923497268,
"grad_norm": 0.18644486367702484,
"learning_rate": 7.241319576272002e-06,
"loss": 1.1656275987625122,
"step": 1106
},
{
"epoch": 3.0273224043715845,
"grad_norm": 0.15544608235359192,
"learning_rate": 7.181118401390349e-06,
"loss": 1.1792598962783813,
"step": 1108
},
{
"epoch": 3.0327868852459017,
"grad_norm": 0.14606495201587677,
"learning_rate": 7.12056548726748e-06,
"loss": 1.1974881887435913,
"step": 1110
},
{
"epoch": 3.0382513661202184,
"grad_norm": 0.13032948970794678,
"learning_rate": 7.059673503347772e-06,
"loss": 1.1487013101577759,
"step": 1112
},
{
"epoch": 3.0437158469945356,
"grad_norm": 0.14073766767978668,
"learning_rate": 6.998455190018928e-06,
"loss": 1.164973497390747,
"step": 1114
},
{
"epoch": 3.0491803278688523,
"grad_norm": 0.12033533304929733,
"learning_rate": 6.936923355946332e-06,
"loss": 1.1871498823165894,
"step": 1116
},
{
"epoch": 3.0546448087431695,
"grad_norm": 0.13993288576602936,
"learning_rate": 6.875090875393108e-06,
"loss": 1.1747748851776123,
"step": 1118
},
{
"epoch": 3.060109289617486,
"grad_norm": 0.23406071960926056,
"learning_rate": 6.812970685526419e-06,
"loss": 0.9869977235794067,
"step": 1120
},
{
"epoch": 3.0655737704918034,
"grad_norm": 0.5565271377563477,
"learning_rate": 6.750575783710657e-06,
"loss": 1.0610896348953247,
"step": 1122
},
{
"epoch": 3.07103825136612,
"grad_norm": 0.2001454383134842,
"learning_rate": 6.68791922478801e-06,
"loss": 1.1544849872589111,
"step": 1124
},
{
"epoch": 3.0765027322404372,
"grad_norm": 0.1516963094472885,
"learning_rate": 6.625014118346995e-06,
"loss": 1.0433255434036255,
"step": 1126
},
{
"epoch": 3.081967213114754,
"grad_norm": 0.17434273660182953,
"learning_rate": 6.56187362597955e-06,
"loss": 0.8805356025695801,
"step": 1128
},
{
"epoch": 3.087431693989071,
"grad_norm": 0.299541711807251,
"learning_rate": 6.4985109585272575e-06,
"loss": 0.8970946669578552,
"step": 1130
},
{
"epoch": 3.092896174863388,
"grad_norm": 0.2725391983985901,
"learning_rate": 6.434939373317243e-06,
"loss": 1.1752498149871826,
"step": 1132
},
{
"epoch": 3.098360655737705,
"grad_norm": 0.6264455318450928,
"learning_rate": 6.371172171388326e-06,
"loss": 0.21441805362701416,
"step": 1134
},
{
"epoch": 3.1038251366120218,
"grad_norm": 0.806502640247345,
"learning_rate": 6.307222694708102e-06,
"loss": 0.5281939506530762,
"step": 1136
},
{
"epoch": 3.109289617486339,
"grad_norm": 0.3635815382003784,
"learning_rate": 6.24310432338138e-06,
"loss": 1.512853980064392,
"step": 1138
},
{
"epoch": 3.1147540983606556,
"grad_norm": 0.14387531578540802,
"learning_rate": 6.178830472850673e-06,
"loss": 1.1343353986740112,
"step": 1140
},
{
"epoch": 3.120218579234973,
"grad_norm": 0.26559537649154663,
"learning_rate": 6.114414591089306e-06,
"loss": 0.6498351693153381,
"step": 1142
},
{
"epoch": 3.1256830601092895,
"grad_norm": 0.13467195630073547,
"learning_rate": 6.049870155787701e-06,
"loss": 0.7768141031265259,
"step": 1144
},
{
"epoch": 3.1311475409836067,
"grad_norm": 0.402292400598526,
"learning_rate": 5.985210671533442e-06,
"loss": 1.0391204357147217,
"step": 1146
},
{
"epoch": 3.1366120218579234,
"grad_norm": 0.19904881715774536,
"learning_rate": 5.920449666985737e-06,
"loss": 1.1840667724609375,
"step": 1148
},
{
"epoch": 3.1420765027322406,
"grad_norm": 0.1308041214942932,
"learning_rate": 5.855600692044802e-06,
"loss": 1.1587271690368652,
"step": 1150
},
{
"epoch": 3.1475409836065573,
"grad_norm": 0.4252830445766449,
"learning_rate": 5.79067731501684e-06,
"loss": 1.1577224731445312,
"step": 1152
},
{
"epoch": 3.1530054644808745,
"grad_norm": 0.12468914687633514,
"learning_rate": 5.725693119775161e-06,
"loss": 1.483253002166748,
"step": 1154
},
{
"epoch": 3.158469945355191,
"grad_norm": 0.1550026535987854,
"learning_rate": 5.660661702918021e-06,
"loss": 1.1574190855026245,
"step": 1156
},
{
"epoch": 3.1639344262295084,
"grad_norm": 0.2560645341873169,
"learning_rate": 5.595596670923819e-06,
"loss": 1.1417001485824585,
"step": 1158
},
{
"epoch": 3.169398907103825,
"grad_norm": 0.2430458813905716,
"learning_rate": 5.530511637304231e-06,
"loss": 1.1911888122558594,
"step": 1160
},
{
"epoch": 3.1748633879781423,
"grad_norm": 0.3422093689441681,
"learning_rate": 5.465420219755858e-06,
"loss": 0.7014384865760803,
"step": 1162
},
{
"epoch": 3.180327868852459,
"grad_norm": 0.2611311674118042,
"learning_rate": 5.4003360373110135e-06,
"loss": 1.1390665769577026,
"step": 1164
},
{
"epoch": 3.185792349726776,
"grad_norm": 0.40065422654151917,
"learning_rate": 5.3352727074881914e-06,
"loss": 1.1658681631088257,
"step": 1166
},
{
"epoch": 3.191256830601093,
"grad_norm": 0.16395936906337738,
"learning_rate": 5.27024384344293e-06,
"loss": 0.7962675094604492,
"step": 1168
},
{
"epoch": 3.19672131147541,
"grad_norm": 0.32020851969718933,
"learning_rate": 5.205263051119514e-06,
"loss": 0.8730796575546265,
"step": 1170
},
{
"epoch": 3.202185792349727,
"grad_norm": 0.15397824347019196,
"learning_rate": 5.140343926404199e-06,
"loss": 0.599664568901062,
"step": 1172
},
{
"epoch": 3.2076502732240435,
"grad_norm": 0.12873400747776031,
"learning_rate": 5.075500052280585e-06,
"loss": 0.4004330039024353,
"step": 1174
},
{
"epoch": 3.2131147540983607,
"grad_norm": 0.1628398895263672,
"learning_rate": 5.010744995987643e-06,
"loss": 1.1797914505004883,
"step": 1176
},
{
"epoch": 3.2185792349726774,
"grad_norm": 0.20412485301494598,
"learning_rate": 4.946092306181037e-06,
"loss": 0.7300031185150146,
"step": 1178
},
{
"epoch": 3.2240437158469946,
"grad_norm": 0.4510882496833801,
"learning_rate": 4.881555510098376e-06,
"loss": 1.172342300415039,
"step": 1180
},
{
"epoch": 3.2295081967213113,
"grad_norm": 0.09762854129076004,
"learning_rate": 4.817148110728904e-06,
"loss": 0.9902958273887634,
"step": 1182
},
{
"epoch": 3.2349726775956285,
"grad_norm": 0.19788962602615356,
"learning_rate": 4.752883583988266e-06,
"loss": 1.1401504278182983,
"step": 1184
},
{
"epoch": 3.240437158469945,
"grad_norm": 0.19763797521591187,
"learning_rate": 4.688775375898972e-06,
"loss": 1.186100721359253,
"step": 1186
},
{
"epoch": 3.2459016393442623,
"grad_norm": 0.24222883582115173,
"learning_rate": 4.624836899777107e-06,
"loss": 1.1595841646194458,
"step": 1188
},
{
"epoch": 3.251366120218579,
"grad_norm": 0.42586594820022583,
"learning_rate": 4.561081533425824e-06,
"loss": 1.1622511148452759,
"step": 1190
},
{
"epoch": 3.2568306010928962,
"grad_norm": 0.4112228751182556,
"learning_rate": 4.497522616336374e-06,
"loss": 1.106964349746704,
"step": 1192
},
{
"epoch": 3.262295081967213,
"grad_norm": 0.22122681140899658,
"learning_rate": 4.4341734468970454e-06,
"loss": 0.81840580701828,
"step": 1194
},
{
"epoch": 3.26775956284153,
"grad_norm": 0.16680166125297546,
"learning_rate": 4.371047279610766e-06,
"loss": 0.32536399364471436,
"step": 1196
},
{
"epoch": 3.273224043715847,
"grad_norm": 0.30231085419654846,
"learning_rate": 4.308157322321911e-06,
"loss": 1.14083993434906,
"step": 1198
},
{
"epoch": 3.278688524590164,
"grad_norm": 0.2778013348579407,
"learning_rate": 4.245516733452747e-06,
"loss": 0.6304877400398254,
"step": 1200
},
{
"epoch": 3.2841530054644807,
"grad_norm": 0.12325984239578247,
"learning_rate": 4.183138619250412e-06,
"loss": 1.1525787115097046,
"step": 1202
},
{
"epoch": 3.289617486338798,
"grad_norm": 0.27475157380104065,
"learning_rate": 4.121036031044621e-06,
"loss": 1.2028400897979736,
"step": 1204
},
{
"epoch": 3.2950819672131146,
"grad_norm": 0.21590927243232727,
"learning_rate": 4.059221962516973e-06,
"loss": 1.1575835943222046,
"step": 1206
},
{
"epoch": 3.300546448087432,
"grad_norm": 1.6404995918273926,
"learning_rate": 3.997709346982319e-06,
"loss": 0.7506100535392761,
"step": 1208
},
{
"epoch": 3.3060109289617485,
"grad_norm": 0.12230981886386871,
"learning_rate": 3.936511054682678e-06,
"loss": 1.13595449924469,
"step": 1210
},
{
"epoch": 3.3114754098360657,
"grad_norm": 0.16316284239292145,
"learning_rate": 3.875639890094465e-06,
"loss": 1.1835432052612305,
"step": 1212
},
{
"epoch": 3.3169398907103824,
"grad_norm": 0.15454798936843872,
"learning_rate": 3.815108589249368e-06,
"loss": 1.0592103004455566,
"step": 1214
},
{
"epoch": 3.3224043715846996,
"grad_norm": 0.2797645926475525,
"learning_rate": 3.7549298170696247e-06,
"loss": 1.2536249160766602,
"step": 1216
},
{
"epoch": 3.3278688524590163,
"grad_norm": 0.8903380036354065,
"learning_rate": 3.69511616471815e-06,
"loss": 0.7049106359481812,
"step": 1218
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.14891834557056427,
"learning_rate": 3.6356801469640746e-06,
"loss": 1.1736491918563843,
"step": 1220
},
{
"epoch": 3.33879781420765,
"grad_norm": 0.23303283751010895,
"learning_rate": 3.5766341995643296e-06,
"loss": 0.735037088394165,
"step": 1222
},
{
"epoch": 3.3442622950819674,
"grad_norm": 0.32323116064071655,
"learning_rate": 3.5179906766616713e-06,
"loss": 0.9724880456924438,
"step": 1224
},
{
"epoch": 3.349726775956284,
"grad_norm": 0.41624340415000916,
"learning_rate": 3.4597618481998595e-06,
"loss": 1.1626942157745361,
"step": 1226
},
{
"epoch": 3.3551912568306013,
"grad_norm": 0.1781196892261505,
"learning_rate": 3.4019598973564415e-06,
"loss": 0.6936111450195312,
"step": 1228
},
{
"epoch": 3.360655737704918,
"grad_norm": 0.3224226236343384,
"learning_rate": 3.3445969179936356e-06,
"loss": 1.5072102546691895,
"step": 1230
},
{
"epoch": 3.366120218579235,
"grad_norm": 0.6502639651298523,
"learning_rate": 3.287684912127967e-06,
"loss": 1.0385020971298218,
"step": 1232
},
{
"epoch": 3.371584699453552,
"grad_norm": 0.19714456796646118,
"learning_rate": 3.2312357874190814e-06,
"loss": 1.1386412382125854,
"step": 1234
},
{
"epoch": 3.3770491803278686,
"grad_norm": 0.36271029710769653,
"learning_rate": 3.1752613546783405e-06,
"loss": 0.7547073364257812,
"step": 1236
},
{
"epoch": 3.3825136612021858,
"grad_norm": 0.20249810814857483,
"learning_rate": 3.1197733253976137e-06,
"loss": 0.8926489949226379,
"step": 1238
},
{
"epoch": 3.387978142076503,
"grad_norm": 0.4233534634113312,
"learning_rate": 3.064783309298909e-06,
"loss": 1.1394166946411133,
"step": 1240
},
{
"epoch": 3.3934426229508197,
"grad_norm": 0.3082427680492401,
"learning_rate": 3.0103028119053024e-06,
"loss": 0.7299805283546448,
"step": 1242
},
{
"epoch": 3.3989071038251364,
"grad_norm": 0.06073412671685219,
"learning_rate": 2.9563432321336018e-06,
"loss": 1.0126986503601074,
"step": 1244
},
{
"epoch": 3.4043715846994536,
"grad_norm": 0.3158816397190094,
"learning_rate": 2.902915859909382e-06,
"loss": 1.0959231853485107,
"step": 1246
},
{
"epoch": 3.4098360655737707,
"grad_norm": 0.10029984265565872,
"learning_rate": 2.8500318738048284e-06,
"loss": 0.7657914757728577,
"step": 1248
},
{
"epoch": 3.4153005464480874,
"grad_norm": 0.4239802956581116,
"learning_rate": 2.7977023386997743e-06,
"loss": 0.6873133182525635,
"step": 1250
},
{
"epoch": 3.420765027322404,
"grad_norm": 0.167138010263443,
"learning_rate": 2.7459382034667037e-06,
"loss": 1.156643271446228,
"step": 1252
},
{
"epoch": 3.4262295081967213,
"grad_norm": 0.20836292207241058,
"learning_rate": 2.6947502986798396e-06,
"loss": 0.7242846488952637,
"step": 1254
},
{
"epoch": 3.431693989071038,
"grad_norm": 0.1728839874267578,
"learning_rate": 2.6441493343491366e-06,
"loss": 0.6718332171440125,
"step": 1256
},
{
"epoch": 3.4371584699453552,
"grad_norm": 0.21569444239139557,
"learning_rate": 2.594145897679381e-06,
"loss": 1.1607874631881714,
"step": 1258
},
{
"epoch": 3.442622950819672,
"grad_norm": 0.32350954413414,
"learning_rate": 2.5447504508550626e-06,
"loss": 1.022996425628662,
"step": 1260
},
{
"epoch": 3.448087431693989,
"grad_norm": 0.22501367330551147,
"learning_rate": 2.495973328851391e-06,
"loss": 1.1078190803527832,
"step": 1262
},
{
"epoch": 3.453551912568306,
"grad_norm": 0.18108385801315308,
"learning_rate": 2.4478247372718857e-06,
"loss": 1.1930431127548218,
"step": 1264
},
{
"epoch": 3.459016393442623,
"grad_norm": 0.1784006655216217,
"learning_rate": 2.4003147502130847e-06,
"loss": 1.1563763618469238,
"step": 1266
},
{
"epoch": 3.4644808743169397,
"grad_norm": 0.21632422506809235,
"learning_rate": 2.353453308156744e-06,
"loss": 0.5923645496368408,
"step": 1268
},
{
"epoch": 3.469945355191257,
"grad_norm": 0.15631955862045288,
"learning_rate": 2.30725021588999e-06,
"loss": 1.1780391931533813,
"step": 1270
},
{
"epoch": 3.4754098360655736,
"grad_norm": 0.6973638534545898,
"learning_rate": 2.2617151404538967e-06,
"loss": 1.207474946975708,
"step": 1272
},
{
"epoch": 3.480874316939891,
"grad_norm": 0.15252643823623657,
"learning_rate": 2.21685760912082e-06,
"loss": 1.214716911315918,
"step": 1274
},
{
"epoch": 3.4863387978142075,
"grad_norm": 0.11059801280498505,
"learning_rate": 2.1726870074010315e-06,
"loss": 0.9561800360679626,
"step": 1276
},
{
"epoch": 3.4918032786885247,
"grad_norm": 0.5404808521270752,
"learning_rate": 2.129212577079012e-06,
"loss": 1.235242247581482,
"step": 1278
},
{
"epoch": 3.4972677595628414,
"grad_norm": 0.254938006401062,
"learning_rate": 2.0864434142797595e-06,
"loss": 1.1324114799499512,
"step": 1280
},
{
"epoch": 3.5027322404371586,
"grad_norm": 0.17700043320655823,
"learning_rate": 2.044388467565661e-06,
"loss": 1.1970921754837036,
"step": 1282
},
{
"epoch": 3.5081967213114753,
"grad_norm": 0.16740496456623077,
"learning_rate": 2.0030565360641325e-06,
"loss": 0.8614110350608826,
"step": 1284
},
{
"epoch": 3.5136612021857925,
"grad_norm": 0.3018406629562378,
"learning_rate": 1.9624562676266495e-06,
"loss": 1.0356260538101196,
"step": 1286
},
{
"epoch": 3.519125683060109,
"grad_norm": 1.1560367345809937,
"learning_rate": 1.922596157019311e-06,
"loss": 0.6663897633552551,
"step": 1288
},
{
"epoch": 3.5245901639344264,
"grad_norm": 0.16047994792461395,
"learning_rate": 1.8834845441455064e-06,
"loss": 1.156725287437439,
"step": 1290
},
{
"epoch": 3.530054644808743,
"grad_norm": 0.27034106850624084,
"learning_rate": 1.845129612300973e-06,
"loss": 1.1946804523468018,
"step": 1292
},
{
"epoch": 3.5355191256830603,
"grad_norm": 0.2400462031364441,
"learning_rate": 1.807539386461588e-06,
"loss": 1.1415934562683105,
"step": 1294
},
{
"epoch": 3.540983606557377,
"grad_norm": 0.1972731202840805,
"learning_rate": 1.770721731604319e-06,
"loss": 1.137350082397461,
"step": 1296
},
{
"epoch": 3.546448087431694,
"grad_norm": 0.31727540493011475,
"learning_rate": 1.734684351061653e-06,
"loss": 1.4584566354751587,
"step": 1298
},
{
"epoch": 3.551912568306011,
"grad_norm": 0.23261825740337372,
"learning_rate": 1.6994347849098103e-06,
"loss": 1.1674689054489136,
"step": 1300
},
{
"epoch": 3.557377049180328,
"grad_norm": 0.18792183697223663,
"learning_rate": 1.664980408391153e-06,
"loss": 0.7082504630088806,
"step": 1302
},
{
"epoch": 3.5628415300546448,
"grad_norm": 0.14191050827503204,
"learning_rate": 1.6313284303710635e-06,
"loss": 1.1603806018829346,
"step": 1304
},
{
"epoch": 3.5683060109289615,
"grad_norm": 0.18446889519691467,
"learning_rate": 1.5984858918296529e-06,
"loss": 1.4281741380691528,
"step": 1306
},
{
"epoch": 3.5737704918032787,
"grad_norm": 0.18436121940612793,
"learning_rate": 1.566459664388556e-06,
"loss": 1.1683400869369507,
"step": 1308
},
{
"epoch": 3.579234972677596,
"grad_norm": 0.13035498559474945,
"learning_rate": 1.535256448873202e-06,
"loss": 0.7431635856628418,
"step": 1310
},
{
"epoch": 3.5846994535519126,
"grad_norm": 0.2386208474636078,
"learning_rate": 1.5048827739108146e-06,
"loss": 0.8260349035263062,
"step": 1312
},
{
"epoch": 3.5901639344262293,
"grad_norm": 0.1561446338891983,
"learning_rate": 1.4753449945644094e-06,
"loss": 0.7370667457580566,
"step": 1314
},
{
"epoch": 3.5956284153005464,
"grad_norm": 0.28548333048820496,
"learning_rate": 1.4466492910031438e-06,
"loss": 0.8161956667900085,
"step": 1316
},
{
"epoch": 3.6010928961748636,
"grad_norm": 0.6451618671417236,
"learning_rate": 1.4188016672092412e-06,
"loss": 1.30082368850708,
"step": 1318
},
{
"epoch": 3.6065573770491803,
"grad_norm": 0.2740461826324463,
"learning_rate": 1.3918079497217778e-06,
"loss": 1.0646051168441772,
"step": 1320
},
{
"epoch": 3.612021857923497,
"grad_norm": 0.1899060159921646,
"learning_rate": 1.3656737864176132e-06,
"loss": 1.1546478271484375,
"step": 1322
},
{
"epoch": 3.6174863387978142,
"grad_norm": 0.14665991067886353,
"learning_rate": 1.3404046453296666e-06,
"loss": 1.1401379108428955,
"step": 1324
},
{
"epoch": 3.6229508196721314,
"grad_norm": 0.39782294631004333,
"learning_rate": 1.3160058135028686e-06,
"loss": 1.1493659019470215,
"step": 1326
},
{
"epoch": 3.628415300546448,
"grad_norm": 0.17628178000450134,
"learning_rate": 1.2924823958879367e-06,
"loss": 0.6381902098655701,
"step": 1328
},
{
"epoch": 3.633879781420765,
"grad_norm": 0.790067195892334,
"learning_rate": 1.2698393142732774e-06,
"loss": 1.2038449048995972,
"step": 1330
},
{
"epoch": 3.639344262295082,
"grad_norm": 0.1480078101158142,
"learning_rate": 1.2480813062552152e-06,
"loss": 1.1669472455978394,
"step": 1332
},
{
"epoch": 3.644808743169399,
"grad_norm": 0.22172480821609497,
"learning_rate": 1.2272129242467186e-06,
"loss": 0.6880963444709778,
"step": 1334
},
{
"epoch": 3.650273224043716,
"grad_norm": 0.13903914391994476,
"learning_rate": 1.20723853452494e-06,
"loss": 1.1238532066345215,
"step": 1336
},
{
"epoch": 3.6557377049180326,
"grad_norm": 0.28902727365493774,
"learning_rate": 1.188162316317633e-06,
"loss": 1.1814414262771606,
"step": 1338
},
{
"epoch": 3.66120218579235,
"grad_norm": 0.1453699916601181,
"learning_rate": 1.1699882609287466e-06,
"loss": 1.1549361944198608,
"step": 1340
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.13342957198619843,
"learning_rate": 1.152720170903333e-06,
"loss": 1.15314781665802,
"step": 1342
},
{
"epoch": 3.6721311475409837,
"grad_norm": 0.13496847450733185,
"learning_rate": 1.136361659231929e-06,
"loss": 1.173022747039795,
"step": 1344
},
{
"epoch": 3.6775956284153004,
"grad_norm": 0.4596235454082489,
"learning_rate": 1.12091614859463e-06,
"loss": 0.915589451789856,
"step": 1346
},
{
"epoch": 3.6830601092896176,
"grad_norm": 0.15929074585437775,
"learning_rate": 1.1063868706449486e-06,
"loss": 1.1599005460739136,
"step": 1348
},
{
"epoch": 3.6885245901639343,
"grad_norm": 0.505683958530426,
"learning_rate": 1.0927768653336685e-06,
"loss": 1.28230881690979,
"step": 1350
},
{
"epoch": 3.6939890710382515,
"grad_norm": 0.15837951004505157,
"learning_rate": 1.080088980272795e-06,
"loss": 1.1631075143814087,
"step": 1352
},
{
"epoch": 3.699453551912568,
"grad_norm": 0.16127492487430573,
"learning_rate": 1.0683258701397478e-06,
"loss": 1.1622624397277832,
"step": 1354
},
{
"epoch": 3.7049180327868854,
"grad_norm": 0.3719762861728668,
"learning_rate": 1.0574899961219337e-06,
"loss": 1.0233434438705444,
"step": 1356
},
{
"epoch": 3.710382513661202,
"grad_norm": 0.23049604892730713,
"learning_rate": 1.0475836254017824e-06,
"loss": 1.1272181272506714,
"step": 1358
},
{
"epoch": 3.7158469945355193,
"grad_norm": 0.1138298511505127,
"learning_rate": 1.0386088306823915e-06,
"loss": 1.1475929021835327,
"step": 1360
},
{
"epoch": 3.721311475409836,
"grad_norm": 0.2246314287185669,
"learning_rate": 1.0305674897538596e-06,
"loss": 1.2239562273025513,
"step": 1362
},
{
"epoch": 3.726775956284153,
"grad_norm": 0.899022102355957,
"learning_rate": 1.02346128510039e-06,
"loss": 0.5836929082870483,
"step": 1364
},
{
"epoch": 3.73224043715847,
"grad_norm": 0.34119629859924316,
"learning_rate": 1.017291703548272e-06,
"loss": 1.0720897912979126,
"step": 1366
},
{
"epoch": 3.737704918032787,
"grad_norm": 0.11816765367984772,
"learning_rate": 1.0120600359547874e-06,
"loss": 1.1483832597732544,
"step": 1368
},
{
"epoch": 3.7431693989071038,
"grad_norm": 0.22198091447353363,
"learning_rate": 1.0077673769381334e-06,
"loss": 1.1218035221099854,
"step": 1370
},
{
"epoch": 3.748633879781421,
"grad_norm": 0.20811736583709717,
"learning_rate": 1.0044146246483882e-06,
"loss": 0.47951236367225647,
"step": 1372
},
{
"epoch": 3.7540983606557377,
"grad_norm": 0.338046133518219,
"learning_rate": 1.0020024805795956e-06,
"loss": 1.1831530332565308,
"step": 1374
},
{
"epoch": 3.7595628415300544,
"grad_norm": 0.15060603618621826,
"learning_rate": 1.0005314494229926e-06,
"loss": 1.1326758861541748,
"step": 1376
},
{
"epoch": 3.7650273224043715,
"grad_norm": 0.5109639763832092,
"learning_rate": 1.0000018389614113e-06,
"loss": 0.6270037889480591,
"step": 1378
},
{
"epoch": 3.7704918032786887,
"grad_norm": 0.24945014715194702,
"learning_rate": 1.0004137600048822e-06,
"loss": 0.8350727558135986,
"step": 1380
},
{
"epoch": 3.7759562841530054,
"grad_norm": 0.17239828407764435,
"learning_rate": 1.0017671263674504e-06,
"loss": 1.155866265296936,
"step": 1382
},
{
"epoch": 3.781420765027322,
"grad_norm": 0.5272848606109619,
"learning_rate": 1.0040616548852077e-06,
"loss": 1.3420521020889282,
"step": 1384
},
{
"epoch": 3.7868852459016393,
"grad_norm": 0.22848758101463318,
"learning_rate": 1.007296865475537e-06,
"loss": 1.411726713180542,
"step": 1386
},
{
"epoch": 3.7923497267759565,
"grad_norm": 0.4009351134300232,
"learning_rate": 1.0114720812375635e-06,
"loss": 1.1592403650283813,
"step": 1388
},
{
"epoch": 3.797814207650273,
"grad_norm": 0.18678544461727142,
"learning_rate": 1.0165864285937786e-06,
"loss": 0.8567480444908142,
"step": 1390
},
{
"epoch": 3.80327868852459,
"grad_norm": 0.17061619460582733,
"learning_rate": 1.0226388374728194e-06,
"loss": 1.189640998840332,
"step": 1392
},
{
"epoch": 3.808743169398907,
"grad_norm": 0.2989545464515686,
"learning_rate": 1.029628041533361e-06,
"loss": 1.1550894975662231,
"step": 1394
},
{
"epoch": 3.8142076502732243,
"grad_norm": 0.24996203184127808,
"learning_rate": 1.037552578429066e-06,
"loss": 1.4984477758407593,
"step": 1396
},
{
"epoch": 3.819672131147541,
"grad_norm": 0.1401793211698532,
"learning_rate": 1.0464107901145578e-06,
"loss": 1.1697229146957397,
"step": 1398
},
{
"epoch": 3.8251366120218577,
"grad_norm": 0.1961352527141571,
"learning_rate": 1.056200823192328e-06,
"loss": 1.1261929273605347,
"step": 1400
},
{
"epoch": 3.830601092896175,
"grad_norm": 0.14081653952598572,
"learning_rate": 1.0669206293005217e-06,
"loss": 0.6766828298568726,
"step": 1402
},
{
"epoch": 3.836065573770492,
"grad_norm": 0.3913050591945648,
"learning_rate": 1.0785679655415176e-06,
"loss": 1.1940261125564575,
"step": 1404
},
{
"epoch": 3.841530054644809,
"grad_norm": 0.2754296362400055,
"learning_rate": 1.0911403949512015e-06,
"loss": 0.9278432130813599,
"step": 1406
},
{
"epoch": 3.8469945355191255,
"grad_norm": 0.13041289150714874,
"learning_rate": 1.1046352870088589e-06,
"loss": 0.42367491126060486,
"step": 1408
},
{
"epoch": 3.8524590163934427,
"grad_norm": 0.1813327521085739,
"learning_rate": 1.1190498181875525e-06,
"loss": 1.0740183591842651,
"step": 1410
},
{
"epoch": 3.8579234972677594,
"grad_norm": 0.22110266983509064,
"learning_rate": 1.134380972544884e-06,
"loss": 1.1528666019439697,
"step": 1412
},
{
"epoch": 3.8633879781420766,
"grad_norm": 0.15557754039764404,
"learning_rate": 1.1506255423540263e-06,
"loss": 1.1179693937301636,
"step": 1414
},
{
"epoch": 3.8688524590163933,
"grad_norm": 0.1647985577583313,
"learning_rate": 1.16778012877486e-06,
"loss": 0.7647815346717834,
"step": 1416
},
{
"epoch": 3.8743169398907105,
"grad_norm": 0.24811968207359314,
"learning_rate": 1.185841142565133e-06,
"loss": 1.1542158126831055,
"step": 1418
},
{
"epoch": 3.879781420765027,
"grad_norm": 5.862763404846191,
"learning_rate": 1.2048048048314066e-06,
"loss": 0.2956826984882355,
"step": 1420
},
{
"epoch": 3.8852459016393444,
"grad_norm": 0.1758033186197281,
"learning_rate": 1.2246671478197331e-06,
"loss": 1.0049794912338257,
"step": 1422
},
{
"epoch": 3.890710382513661,
"grad_norm": 1.1392064094543457,
"learning_rate": 1.2454240157458205e-06,
"loss": 1.1001185178756714,
"step": 1424
},
{
"epoch": 3.8961748633879782,
"grad_norm": 0.15054211020469666,
"learning_rate": 1.2670710656645328e-06,
"loss": 1.150123953819275,
"step": 1426
},
{
"epoch": 3.901639344262295,
"grad_norm": 0.2475654035806656,
"learning_rate": 1.2896037683785801e-06,
"loss": 0.5788159966468811,
"step": 1428
},
{
"epoch": 3.907103825136612,
"grad_norm": 0.4780398905277252,
"learning_rate": 1.313017409386149e-06,
"loss": 1.1813514232635498,
"step": 1430
},
{
"epoch": 3.912568306010929,
"grad_norm": 0.22174669802188873,
"learning_rate": 1.3373070898673093e-06,
"loss": 1.0555204153060913,
"step": 1432
},
{
"epoch": 3.918032786885246,
"grad_norm": 0.1643574833869934,
"learning_rate": 1.3624677277090036e-06,
"loss": 1.1652286052703857,
"step": 1434
},
{
"epoch": 3.9234972677595628,
"grad_norm": 0.11196662485599518,
"learning_rate": 1.3884940585683671e-06,
"loss": 0.685890793800354,
"step": 1436
},
{
"epoch": 3.92896174863388,
"grad_norm": 1.3236440420150757,
"learning_rate": 1.4153806369741852e-06,
"loss": 1.1726253032684326,
"step": 1438
},
{
"epoch": 3.9344262295081966,
"grad_norm": 0.22437423467636108,
"learning_rate": 1.4431218374662337e-06,
"loss": 1.1598002910614014,
"step": 1440
},
{
"epoch": 3.939890710382514,
"grad_norm": 0.1412169635295868,
"learning_rate": 1.4717118557723104e-06,
"loss": 1.166081428527832,
"step": 1442
},
{
"epoch": 3.9453551912568305,
"grad_norm": 0.23314853012561798,
"learning_rate": 1.5011447100226491e-06,
"loss": 1.1711547374725342,
"step": 1444
},
{
"epoch": 3.9508196721311473,
"grad_norm": 0.2032768577337265,
"learning_rate": 1.5314142420014904e-06,
"loss": 1.1563503742218018,
"step": 1446
},
{
"epoch": 3.9562841530054644,
"grad_norm": 0.16453734040260315,
"learning_rate": 1.5625141184355825e-06,
"loss": 1.1572513580322266,
"step": 1448
},
{
"epoch": 3.9617486338797816,
"grad_norm": 0.20619571208953857,
"learning_rate": 1.5944378323192742e-06,
"loss": 0.855080246925354,
"step": 1450
},
{
"epoch": 3.9672131147540983,
"grad_norm": 0.18206338584423065,
"learning_rate": 1.6271787042759732e-06,
"loss": 1.1654366254806519,
"step": 1452
},
{
"epoch": 3.972677595628415,
"grad_norm": 0.2553770840167999,
"learning_rate": 1.6607298839556735e-06,
"loss": 1.138973355293274,
"step": 1454
},
{
"epoch": 3.978142076502732,
"grad_norm": 0.1462344080209732,
"learning_rate": 1.6950843514682313e-06,
"loss": 0.6342450380325317,
"step": 1456
},
{
"epoch": 3.9836065573770494,
"grad_norm": 0.9649009108543396,
"learning_rate": 1.7302349188521617e-06,
"loss": 0.9733319878578186,
"step": 1458
},
{
"epoch": 3.989071038251366,
"grad_norm": 0.13179883360862732,
"learning_rate": 1.7661742315785368e-06,
"loss": 1.1369507312774658,
"step": 1460
},
{
"epoch": 3.994535519125683,
"grad_norm": 0.1754077821969986,
"learning_rate": 1.802894770089798e-06,
"loss": 0.6347918510437012,
"step": 1462
},
{
"epoch": 4.0,
"grad_norm": 0.32492130994796753,
"learning_rate": 1.8403888513730571e-06,
"loss": 1.5204390287399292,
"step": 1464
},
{
"epoch": 4.0,
"step": 1464,
"total_flos": 6.460565555370787e+18,
"train_loss": 1.1815453414993533,
"train_runtime": 50857.5595,
"train_samples_per_second": 1.727,
"train_steps_per_second": 0.029
}
],
"logging_steps": 2,
"max_steps": 1464,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.460565555370787e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}