9b-48 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
af8a32c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1098,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00546448087431694,
"grad_norm": 0.23017044365406036,
"learning_rate": 3.6363636363636366e-07,
"loss": 1.9264214038848877,
"step": 2
},
{
"epoch": 0.01092896174863388,
"grad_norm": 0.29104915261268616,
"learning_rate": 1.090909090909091e-06,
"loss": 2.0267927646636963,
"step": 4
},
{
"epoch": 0.01639344262295082,
"grad_norm": 0.17760822176933289,
"learning_rate": 1.8181818181818183e-06,
"loss": 1.8153414726257324,
"step": 6
},
{
"epoch": 0.02185792349726776,
"grad_norm": 0.17689602077007294,
"learning_rate": 2.5454545454545456e-06,
"loss": 1.998296856880188,
"step": 8
},
{
"epoch": 0.0273224043715847,
"grad_norm": 0.3168371915817261,
"learning_rate": 3.272727272727273e-06,
"loss": 2.2566328048706055,
"step": 10
},
{
"epoch": 0.03278688524590164,
"grad_norm": 6.834624290466309,
"learning_rate": 4.000000000000001e-06,
"loss": 3.0779101848602295,
"step": 12
},
{
"epoch": 0.03825136612021858,
"grad_norm": 0.23897646367549896,
"learning_rate": 4.727272727272728e-06,
"loss": 1.984060525894165,
"step": 14
},
{
"epoch": 0.04371584699453552,
"grad_norm": 0.17694684863090515,
"learning_rate": 5.4545454545454545e-06,
"loss": 1.8395605087280273,
"step": 16
},
{
"epoch": 0.04918032786885246,
"grad_norm": 0.5123416185379028,
"learning_rate": 6.181818181818182e-06,
"loss": 2.626169443130493,
"step": 18
},
{
"epoch": 0.0546448087431694,
"grad_norm": 0.5538182258605957,
"learning_rate": 6.90909090909091e-06,
"loss": 2.009780168533325,
"step": 20
},
{
"epoch": 0.060109289617486336,
"grad_norm": 1.7161580324172974,
"learning_rate": 7.636363636363638e-06,
"loss": 2.2599620819091797,
"step": 22
},
{
"epoch": 0.06557377049180328,
"grad_norm": 0.20273339748382568,
"learning_rate": 8.363636363636365e-06,
"loss": 1.8350398540496826,
"step": 24
},
{
"epoch": 0.07103825136612021,
"grad_norm": 0.1498890072107315,
"learning_rate": 9.090909090909091e-06,
"loss": 1.871530294418335,
"step": 26
},
{
"epoch": 0.07650273224043716,
"grad_norm": 0.16071414947509766,
"learning_rate": 9.81818181818182e-06,
"loss": 1.820814847946167,
"step": 28
},
{
"epoch": 0.08196721311475409,
"grad_norm": 0.3428778350353241,
"learning_rate": 1.0545454545454546e-05,
"loss": 1.6833339929580688,
"step": 30
},
{
"epoch": 0.08743169398907104,
"grad_norm": 1.4728891849517822,
"learning_rate": 1.1272727272727272e-05,
"loss": 2.2020010948181152,
"step": 32
},
{
"epoch": 0.09289617486338798,
"grad_norm": 0.1522647738456726,
"learning_rate": 1.2e-05,
"loss": 1.7234824895858765,
"step": 34
},
{
"epoch": 0.09836065573770492,
"grad_norm": 0.3773060142993927,
"learning_rate": 1.2727272727272728e-05,
"loss": 2.4320225715637207,
"step": 36
},
{
"epoch": 0.10382513661202186,
"grad_norm": 0.489437073469162,
"learning_rate": 1.3454545454545455e-05,
"loss": 1.9707622528076172,
"step": 38
},
{
"epoch": 0.1092896174863388,
"grad_norm": 0.17363367974758148,
"learning_rate": 1.4181818181818183e-05,
"loss": 1.6727865934371948,
"step": 40
},
{
"epoch": 0.11475409836065574,
"grad_norm": 0.19784805178642273,
"learning_rate": 1.4909090909090911e-05,
"loss": 1.621885061264038,
"step": 42
},
{
"epoch": 0.12021857923497267,
"grad_norm": 0.16416551172733307,
"learning_rate": 1.563636363636364e-05,
"loss": 1.6609282493591309,
"step": 44
},
{
"epoch": 0.12568306010928962,
"grad_norm": 0.1397552490234375,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.3601855039596558,
"step": 46
},
{
"epoch": 0.13114754098360656,
"grad_norm": 0.16565820574760437,
"learning_rate": 1.7090909090909092e-05,
"loss": 1.6233359575271606,
"step": 48
},
{
"epoch": 0.1366120218579235,
"grad_norm": 0.18556559085845947,
"learning_rate": 1.781818181818182e-05,
"loss": 1.619398832321167,
"step": 50
},
{
"epoch": 0.14207650273224043,
"grad_norm": 0.13526372611522675,
"learning_rate": 1.8545454545454545e-05,
"loss": 1.6659531593322754,
"step": 52
},
{
"epoch": 0.14754098360655737,
"grad_norm": 0.14406871795654297,
"learning_rate": 1.9272727272727275e-05,
"loss": 1.7703707218170166,
"step": 54
},
{
"epoch": 0.15300546448087432,
"grad_norm": 0.13732993602752686,
"learning_rate": 2e-05,
"loss": 1.5436078310012817,
"step": 56
},
{
"epoch": 0.15846994535519127,
"grad_norm": 0.5554673671722412,
"learning_rate": 1.9998327792599505e-05,
"loss": 1.1738401651382446,
"step": 58
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.2894555330276489,
"learning_rate": 1.999331179179304e-05,
"loss": 1.1613794565200806,
"step": 60
},
{
"epoch": 0.16939890710382513,
"grad_norm": 0.2021346539258957,
"learning_rate": 1.9984953861534752e-05,
"loss": 1.5094225406646729,
"step": 62
},
{
"epoch": 0.17486338797814208,
"grad_norm": 0.13112854957580566,
"learning_rate": 1.997325710764527e-05,
"loss": 1.2485407590866089,
"step": 64
},
{
"epoch": 0.18032786885245902,
"grad_norm": 0.37511691451072693,
"learning_rate": 1.9958225876657575e-05,
"loss": 1.0438565015792847,
"step": 66
},
{
"epoch": 0.18579234972677597,
"grad_norm": 0.26088741421699524,
"learning_rate": 1.9939865754201825e-05,
"loss": 1.4915159940719604,
"step": 68
},
{
"epoch": 0.1912568306010929,
"grad_norm": 0.3718518018722534,
"learning_rate": 1.9918183562929717e-05,
"loss": 1.3824762105941772,
"step": 70
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.7932000160217285,
"learning_rate": 1.9893187359979183e-05,
"loss": 0.9995588660240173,
"step": 72
},
{
"epoch": 0.20218579234972678,
"grad_norm": 0.33288639783859253,
"learning_rate": 1.986488643398035e-05,
"loss": 1.1461442708969116,
"step": 74
},
{
"epoch": 0.20765027322404372,
"grad_norm": 1.125511646270752,
"learning_rate": 1.9833291301603863e-05,
"loss": 1.3429020643234253,
"step": 76
},
{
"epoch": 0.21311475409836064,
"grad_norm": 0.1951577067375183,
"learning_rate": 1.9798413703652867e-05,
"loss": 1.4258100986480713,
"step": 78
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.2255648821592331,
"learning_rate": 1.976026660070012e-05,
"loss": 1.4055513143539429,
"step": 80
},
{
"epoch": 0.22404371584699453,
"grad_norm": 0.15050630271434784,
"learning_rate": 1.9718864168271823e-05,
"loss": 1.4016798734664917,
"step": 82
},
{
"epoch": 0.22950819672131148,
"grad_norm": 0.1393994390964508,
"learning_rate": 1.9674221791579946e-05,
"loss": 1.362338662147522,
"step": 84
},
{
"epoch": 0.23497267759562843,
"grad_norm": 0.13078755140304565,
"learning_rate": 1.9626356059805085e-05,
"loss": 1.2792019844055176,
"step": 86
},
{
"epoch": 0.24043715846994534,
"grad_norm": 0.12456239014863968,
"learning_rate": 1.957528475993189e-05,
"loss": 0.8783624768257141,
"step": 88
},
{
"epoch": 0.2459016393442623,
"grad_norm": 0.45559144020080566,
"learning_rate": 1.952102687013938e-05,
"loss": 0.9341011643409729,
"step": 90
},
{
"epoch": 0.25136612021857924,
"grad_norm": 0.2474871426820755,
"learning_rate": 1.946360255274863e-05,
"loss": 1.2796369791030884,
"step": 92
},
{
"epoch": 0.2568306010928962,
"grad_norm": 0.1871662735939026,
"learning_rate": 1.9403033146730424e-05,
"loss": 0.7436278462409973,
"step": 94
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.1684049814939499,
"learning_rate": 1.9339341159775647e-05,
"loss": 1.3334097862243652,
"step": 96
},
{
"epoch": 0.2677595628415301,
"grad_norm": 0.08828236162662506,
"learning_rate": 1.9272550259931398e-05,
"loss": 1.3062154054641724,
"step": 98
},
{
"epoch": 0.273224043715847,
"grad_norm": 0.12633274495601654,
"learning_rate": 1.9202685266805896e-05,
"loss": 1.183910846710205,
"step": 100
},
{
"epoch": 0.2786885245901639,
"grad_norm": 0.14135880768299103,
"learning_rate": 1.9129772142345484e-05,
"loss": 0.8230882883071899,
"step": 102
},
{
"epoch": 0.28415300546448086,
"grad_norm": 0.20387791097164154,
"learning_rate": 1.9053837981187125e-05,
"loss": 1.356655478477478,
"step": 104
},
{
"epoch": 0.2896174863387978,
"grad_norm": 0.09430485963821411,
"learning_rate": 1.897491100058998e-05,
"loss": 1.318677306175232,
"step": 106
},
{
"epoch": 0.29508196721311475,
"grad_norm": 0.21191561222076416,
"learning_rate": 1.8893020529949838e-05,
"loss": 1.4180920124053955,
"step": 108
},
{
"epoch": 0.3005464480874317,
"grad_norm": 0.1686364710330963,
"learning_rate": 1.880819699990027e-05,
"loss": 1.3188916444778442,
"step": 110
},
{
"epoch": 0.30601092896174864,
"grad_norm": 0.17423325777053833,
"learning_rate": 1.8720471931004526e-05,
"loss": 1.3028515577316284,
"step": 112
},
{
"epoch": 0.3114754098360656,
"grad_norm": 0.16852694749832153,
"learning_rate": 1.8629877922042485e-05,
"loss": 1.0075663328170776,
"step": 114
},
{
"epoch": 0.31693989071038253,
"grad_norm": 0.16135092079639435,
"learning_rate": 1.8536448637896866e-05,
"loss": 1.2840803861618042,
"step": 116
},
{
"epoch": 0.3224043715846995,
"grad_norm": 0.14016355574131012,
"learning_rate": 1.84402187970433e-05,
"loss": 1.2989857196807861,
"step": 118
},
{
"epoch": 0.32786885245901637,
"grad_norm": 1.949554204940796,
"learning_rate": 1.834122415864891e-05,
"loss": 1.011613130569458,
"step": 120
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.3110104203224182,
"learning_rate": 1.8239501509284123e-05,
"loss": 1.6019999980926514,
"step": 122
},
{
"epoch": 0.33879781420765026,
"grad_norm": 0.12443661689758301,
"learning_rate": 1.8135088649252725e-05,
"loss": 1.2844353914260864,
"step": 124
},
{
"epoch": 0.3442622950819672,
"grad_norm": 0.15131407976150513,
"learning_rate": 1.8028024378545224e-05,
"loss": 1.2516601085662842,
"step": 126
},
{
"epoch": 0.34972677595628415,
"grad_norm": 0.19532890617847443,
"learning_rate": 1.7918348482420692e-05,
"loss": 0.8701586723327637,
"step": 128
},
{
"epoch": 0.3551912568306011,
"grad_norm": 0.14577443897724152,
"learning_rate": 1.7806101716622486e-05,
"loss": 1.0323655605316162,
"step": 130
},
{
"epoch": 0.36065573770491804,
"grad_norm": 0.21451306343078613,
"learning_rate": 1.7691325792233378e-05,
"loss": 1.0055516958236694,
"step": 132
},
{
"epoch": 0.366120218579235,
"grad_norm": 0.14420630037784576,
"learning_rate": 1.7574063360175625e-05,
"loss": 1.2558668851852417,
"step": 134
},
{
"epoch": 0.37158469945355194,
"grad_norm": 0.1369626522064209,
"learning_rate": 1.745435799536183e-05,
"loss": 1.25625741481781,
"step": 136
},
{
"epoch": 0.3770491803278688,
"grad_norm": 0.23049037158489227,
"learning_rate": 1.7332254180502407e-05,
"loss": 1.2635902166366577,
"step": 138
},
{
"epoch": 0.3825136612021858,
"grad_norm": 0.1365140974521637,
"learning_rate": 1.7207797289575777e-05,
"loss": 0.7525888085365295,
"step": 140
},
{
"epoch": 0.3879781420765027,
"grad_norm": 0.2103356122970581,
"learning_rate": 1.708103357096728e-05,
"loss": 0.8429027199745178,
"step": 142
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.4111814796924591,
"learning_rate": 1.695201013028322e-05,
"loss": 0.8284322619438171,
"step": 144
},
{
"epoch": 0.3989071038251366,
"grad_norm": 0.7463256120681763,
"learning_rate": 1.6820774912846335e-05,
"loss": 0.7427368760108948,
"step": 146
},
{
"epoch": 0.40437158469945356,
"grad_norm": 0.1264885663986206,
"learning_rate": 1.668737668587926e-05,
"loss": 1.304437518119812,
"step": 148
},
{
"epoch": 0.4098360655737705,
"grad_norm": 0.16682182252407074,
"learning_rate": 1.655186502038251e-05,
"loss": 1.2201122045516968,
"step": 150
},
{
"epoch": 0.41530054644808745,
"grad_norm": 0.10499599575996399,
"learning_rate": 1.641429027271384e-05,
"loss": 1.2546273469924927,
"step": 152
},
{
"epoch": 0.4207650273224044,
"grad_norm": 0.12318555265665054,
"learning_rate": 1.6274703565875736e-05,
"loss": 1.2278828620910645,
"step": 154
},
{
"epoch": 0.4262295081967213,
"grad_norm": 1.7749344110488892,
"learning_rate": 1.613315677051801e-05,
"loss": 1.2571786642074585,
"step": 156
},
{
"epoch": 0.43169398907103823,
"grad_norm": 0.3517024517059326,
"learning_rate": 1.598970248566261e-05,
"loss": 1.0339151620864868,
"step": 158
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.21650607883930206,
"learning_rate": 1.5844394019157697e-05,
"loss": 1.2541024684906006,
"step": 160
},
{
"epoch": 0.4426229508196721,
"grad_norm": 0.4434749186038971,
"learning_rate": 1.5697285367868393e-05,
"loss": 0.8460209965705872,
"step": 162
},
{
"epoch": 0.44808743169398907,
"grad_norm": 0.13023316860198975,
"learning_rate": 1.5548431197611448e-05,
"loss": 1.2656488418579102,
"step": 164
},
{
"epoch": 0.453551912568306,
"grad_norm": 5.6986212730407715,
"learning_rate": 1.539788682284133e-05,
"loss": 0.8476435542106628,
"step": 166
},
{
"epoch": 0.45901639344262296,
"grad_norm": 0.09535890817642212,
"learning_rate": 1.5245708186095275e-05,
"loss": 1.2499439716339111,
"step": 168
},
{
"epoch": 0.4644808743169399,
"grad_norm": 0.4119236469268799,
"learning_rate": 1.5091951837204973e-05,
"loss": 1.5728163719177246,
"step": 170
},
{
"epoch": 0.46994535519125685,
"grad_norm": 0.41598179936408997,
"learning_rate": 1.4936674912282525e-05,
"loss": 0.907516360282898,
"step": 172
},
{
"epoch": 0.47540983606557374,
"grad_norm": 0.16133107244968414,
"learning_rate": 1.4779935112488597e-05,
"loss": 1.2518316507339478,
"step": 174
},
{
"epoch": 0.4808743169398907,
"grad_norm": 0.12417590618133545,
"learning_rate": 1.4621790682590556e-05,
"loss": 1.2215498685836792,
"step": 176
},
{
"epoch": 0.48633879781420764,
"grad_norm": 0.37595632672309875,
"learning_rate": 1.4462300389318635e-05,
"loss": 1.3558915853500366,
"step": 178
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.4406963288784027,
"learning_rate": 1.4301523499528099e-05,
"loss": 0.9912778735160828,
"step": 180
},
{
"epoch": 0.4972677595628415,
"grad_norm": 0.5479982495307922,
"learning_rate": 1.4139519758175602e-05,
"loss": 1.2772217988967896,
"step": 182
},
{
"epoch": 0.5027322404371585,
"grad_norm": 0.11196983605623245,
"learning_rate": 1.3976349366117861e-05,
"loss": 1.2128099203109741,
"step": 184
},
{
"epoch": 0.5081967213114754,
"grad_norm": 0.11452614516019821,
"learning_rate": 1.3812072957740898e-05,
"loss": 1.2313976287841797,
"step": 186
},
{
"epoch": 0.5136612021857924,
"grad_norm": 0.7617996335029602,
"learning_rate": 1.3646751578428231e-05,
"loss": 0.4828013777732849,
"step": 188
},
{
"epoch": 0.5191256830601093,
"grad_norm": 0.16669736802577972,
"learning_rate": 1.3480446661876295e-05,
"loss": 1.2047618627548218,
"step": 190
},
{
"epoch": 0.5245901639344263,
"grad_norm": 0.1152682974934578,
"learning_rate": 1.3313220007265572e-05,
"loss": 1.2386715412139893,
"step": 192
},
{
"epoch": 0.5300546448087432,
"grad_norm": 0.10782720148563385,
"learning_rate": 1.3145133756295936e-05,
"loss": 1.2308696508407593,
"step": 194
},
{
"epoch": 0.5355191256830601,
"grad_norm": 0.19120632112026215,
"learning_rate": 1.2976250370094668e-05,
"loss": 0.9248079657554626,
"step": 196
},
{
"epoch": 0.5409836065573771,
"grad_norm": 0.14845414459705353,
"learning_rate": 1.2806632606005822e-05,
"loss": 1.0146936178207397,
"step": 198
},
{
"epoch": 0.546448087431694,
"grad_norm": 0.26761510968208313,
"learning_rate": 1.2636343494269479e-05,
"loss": 1.045541763305664,
"step": 200
},
{
"epoch": 0.5519125683060109,
"grad_norm": 0.08655080199241638,
"learning_rate": 1.2465446314599609e-05,
"loss": 1.0569703578948975,
"step": 202
},
{
"epoch": 0.5573770491803278,
"grad_norm": 0.17617474496364594,
"learning_rate": 1.2294004572669228e-05,
"loss": 1.194622278213501,
"step": 204
},
{
"epoch": 0.5628415300546448,
"grad_norm": 0.22285908460617065,
"learning_rate": 1.2122081976511581e-05,
"loss": 1.2368446588516235,
"step": 206
},
{
"epoch": 0.5683060109289617,
"grad_norm": 0.18463526666164398,
"learning_rate": 1.1949742412846142e-05,
"loss": 1.119215488433838,
"step": 208
},
{
"epoch": 0.5737704918032787,
"grad_norm": 0.14102661609649658,
"learning_rate": 1.177704992333818e-05,
"loss": 1.2999699115753174,
"step": 210
},
{
"epoch": 0.5792349726775956,
"grad_norm": 0.14098992943763733,
"learning_rate": 1.1604068680800809e-05,
"loss": 1.216047763824463,
"step": 212
},
{
"epoch": 0.5846994535519126,
"grad_norm": 0.1435597836971283,
"learning_rate": 1.1430862965348224e-05,
"loss": 1.3120372295379639,
"step": 214
},
{
"epoch": 0.5901639344262295,
"grad_norm": 0.0981784462928772,
"learning_rate": 1.1257497140509141e-05,
"loss": 1.212526798248291,
"step": 216
},
{
"epoch": 0.5956284153005464,
"grad_norm": 0.1765955686569214,
"learning_rate": 1.1084035629309176e-05,
"loss": 1.2513571977615356,
"step": 218
},
{
"epoch": 0.6010928961748634,
"grad_norm": 0.1305796355009079,
"learning_rate": 1.0910542890331162e-05,
"loss": 1.1725019216537476,
"step": 220
},
{
"epoch": 0.6065573770491803,
"grad_norm": 0.14715325832366943,
"learning_rate": 1.0737083393762213e-05,
"loss": 0.7729817032814026,
"step": 222
},
{
"epoch": 0.6120218579234973,
"grad_norm": 0.183350071310997,
"learning_rate": 1.0563721597436525e-05,
"loss": 1.1960976123809814,
"step": 224
},
{
"epoch": 0.6174863387978142,
"grad_norm": 0.12949278950691223,
"learning_rate": 1.039052192288271e-05,
"loss": 1.2234892845153809,
"step": 226
},
{
"epoch": 0.6229508196721312,
"grad_norm": 0.3930734694004059,
"learning_rate": 1.0217548731384677e-05,
"loss": 0.7811821699142456,
"step": 228
},
{
"epoch": 0.6284153005464481,
"grad_norm": 0.24914288520812988,
"learning_rate": 1.0044866300064842e-05,
"loss": 1.1955829858779907,
"step": 230
},
{
"epoch": 0.6338797814207651,
"grad_norm": 0.13359300792217255,
"learning_rate": 9.872538797998672e-06,
"loss": 1.229300856590271,
"step": 232
},
{
"epoch": 0.639344262295082,
"grad_norm": 0.13108184933662415,
"learning_rate": 9.700630262369337e-06,
"loss": 0.9536800980567932,
"step": 234
},
{
"epoch": 0.644808743169399,
"grad_norm": 0.20512312650680542,
"learning_rate": 9.529204574671391e-06,
"loss": 1.5177414417266846,
"step": 236
},
{
"epoch": 0.6502732240437158,
"grad_norm": 0.13552652299404144,
"learning_rate": 9.3583254369723e-06,
"loss": 1.1782840490341187,
"step": 238
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.14942114055156708,
"learning_rate": 9.188056348240655e-06,
"loss": 1.1067190170288086,
"step": 240
},
{
"epoch": 0.6612021857923497,
"grad_norm": 0.15014563500881195,
"learning_rate": 9.018460580749842e-06,
"loss": 0.7160718441009521,
"step": 242
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.25949665904045105,
"learning_rate": 8.849601156565972e-06,
"loss": 1.3130247592926025,
"step": 244
},
{
"epoch": 0.6721311475409836,
"grad_norm": 1.2000055313110352,
"learning_rate": 8.68154082412877e-06,
"loss": 1.1773432493209839,
"step": 246
},
{
"epoch": 0.6775956284153005,
"grad_norm": 0.12505550682544708,
"learning_rate": 8.514342034934159e-06,
"loss": 1.1854091882705688,
"step": 248
},
{
"epoch": 0.6830601092896175,
"grad_norm": 0.2346983551979065,
"learning_rate": 8.348066920327163e-06,
"loss": 0.3909367322921753,
"step": 250
},
{
"epoch": 0.6885245901639344,
"grad_norm": 0.10142967849969864,
"learning_rate": 8.182777268413822e-06,
"loss": 1.246644377708435,
"step": 252
},
{
"epoch": 0.6939890710382514,
"grad_norm": 0.10150952637195587,
"learning_rate": 8.018534501100611e-06,
"loss": 1.2059601545333862,
"step": 254
},
{
"epoch": 0.6994535519125683,
"grad_norm": 0.1962408721446991,
"learning_rate": 7.855399651269982e-06,
"loss": 0.9686606526374817,
"step": 256
},
{
"epoch": 0.7049180327868853,
"grad_norm": 0.2061769962310791,
"learning_rate": 7.6934333401004e-06,
"loss": 1.2205626964569092,
"step": 258
},
{
"epoch": 0.7103825136612022,
"grad_norm": 0.15705908834934235,
"learning_rate": 7.53269575453947e-06,
"loss": 1.035886526107788,
"step": 260
},
{
"epoch": 0.7158469945355191,
"grad_norm": 0.4774817228317261,
"learning_rate": 7.373246624938324e-06,
"loss": 1.2100485563278198,
"step": 262
},
{
"epoch": 0.7213114754098361,
"grad_norm": 0.14761529862880707,
"learning_rate": 7.215145202855746e-06,
"loss": 1.1908841133117676,
"step": 264
},
{
"epoch": 0.726775956284153,
"grad_norm": 0.1933698058128357,
"learning_rate": 7.0584502390401865e-06,
"loss": 0.7691932916641235,
"step": 266
},
{
"epoch": 0.73224043715847,
"grad_norm": 0.16835708916187286,
"learning_rate": 6.903219961597891e-06,
"loss": 1.1964633464813232,
"step": 268
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.2084718942642212,
"learning_rate": 6.7495120543552475e-06,
"loss": 1.2518548965454102,
"step": 270
},
{
"epoch": 0.7431693989071039,
"grad_norm": 0.1384730041027069,
"learning_rate": 6.59738363542336e-06,
"loss": 1.2805969715118408,
"step": 272
},
{
"epoch": 0.7486338797814208,
"grad_norm": 0.14290660619735718,
"learning_rate": 6.446891235972894e-06,
"loss": 1.3189456462860107,
"step": 274
},
{
"epoch": 0.7540983606557377,
"grad_norm": 0.31153976917266846,
"learning_rate": 6.298090779226977e-06,
"loss": 1.2968159914016724,
"step": 276
},
{
"epoch": 0.7595628415300546,
"grad_norm": 0.11012361198663712,
"learning_rate": 6.151037559680047e-06,
"loss": 0.846051037311554,
"step": 278
},
{
"epoch": 0.7650273224043715,
"grad_norm": 0.17122408747673035,
"learning_rate": 6.005786222550319e-06,
"loss": 1.2251654863357544,
"step": 280
},
{
"epoch": 0.7704918032786885,
"grad_norm": 0.11512715369462967,
"learning_rate": 5.8623907434735515e-06,
"loss": 1.1258071660995483,
"step": 282
},
{
"epoch": 0.7759562841530054,
"grad_norm": 0.18557684123516083,
"learning_rate": 5.720904408445589e-06,
"loss": 1.2396138906478882,
"step": 284
},
{
"epoch": 0.7814207650273224,
"grad_norm": 0.10340467095375061,
"learning_rate": 5.581379794021202e-06,
"loss": 1.2666516304016113,
"step": 286
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.32197245955467224,
"learning_rate": 5.443868747776579e-06,
"loss": 0.7289301156997681,
"step": 288
},
{
"epoch": 0.7923497267759563,
"grad_norm": 0.11029759049415588,
"learning_rate": 5.308422369042644e-06,
"loss": 0.7729415893554688,
"step": 290
},
{
"epoch": 0.7978142076502732,
"grad_norm": 0.1230657696723938,
"learning_rate": 5.175090989916483e-06,
"loss": 0.9814428091049194,
"step": 292
},
{
"epoch": 0.8032786885245902,
"grad_norm": 0.1301792412996292,
"learning_rate": 5.043924156557844e-06,
"loss": 1.2187029123306274,
"step": 294
},
{
"epoch": 0.8087431693989071,
"grad_norm": 0.13773725926876068,
"learning_rate": 4.914970610777725e-06,
"loss": 1.2197258472442627,
"step": 296
},
{
"epoch": 0.8142076502732241,
"grad_norm": 0.14241813123226166,
"learning_rate": 4.788278271925802e-06,
"loss": 1.2182695865631104,
"step": 298
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.14574883878231049,
"learning_rate": 4.663894219083548e-06,
"loss": 0.8696047067642212,
"step": 300
},
{
"epoch": 0.825136612021858,
"grad_norm": 0.16248951852321625,
"learning_rate": 4.541864673569551e-06,
"loss": 1.220901370048523,
"step": 302
},
{
"epoch": 0.8306010928961749,
"grad_norm": 0.17606770992279053,
"learning_rate": 4.422234981763613e-06,
"loss": 1.1022499799728394,
"step": 304
},
{
"epoch": 0.8360655737704918,
"grad_norm": 0.16798973083496094,
"learning_rate": 4.305049598255946e-06,
"loss": 1.2149680852890015,
"step": 306
},
{
"epoch": 0.8415300546448088,
"grad_norm": 0.15118278563022614,
"learning_rate": 4.190352069327777e-06,
"loss": 1.2510839700698853,
"step": 308
},
{
"epoch": 0.8469945355191257,
"grad_norm": 0.18143419921398163,
"learning_rate": 4.078185016769484e-06,
"loss": 1.1982481479644775,
"step": 310
},
{
"epoch": 0.8524590163934426,
"grad_norm": 0.1455654352903366,
"learning_rate": 3.968590122042265e-06,
"loss": 1.2104380130767822,
"step": 312
},
{
"epoch": 0.8579234972677595,
"grad_norm": 0.13336078822612762,
"learning_rate": 3.861608110789228e-06,
"loss": 1.232424259185791,
"step": 314
},
{
"epoch": 0.8633879781420765,
"grad_norm": 0.172093465924263,
"learning_rate": 3.757278737701697e-06,
"loss": 1.2476005554199219,
"step": 316
},
{
"epoch": 0.8688524590163934,
"grad_norm": 0.4827375113964081,
"learning_rate": 3.6556407717462856e-06,
"loss": 0.7419775128364563,
"step": 318
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.2118140161037445,
"learning_rate": 3.5567319817582944e-06,
"loss": 1.1996129751205444,
"step": 320
},
{
"epoch": 0.8797814207650273,
"grad_norm": 0.10368747264146805,
"learning_rate": 3.4605891224067423e-06,
"loss": 0.7767283916473389,
"step": 322
},
{
"epoch": 0.8852459016393442,
"grad_norm": 0.18676143884658813,
"learning_rate": 3.3672479205362764e-06,
"loss": 1.491099238395691,
"step": 324
},
{
"epoch": 0.8907103825136612,
"grad_norm": 0.1528121680021286,
"learning_rate": 3.276743061891014e-06,
"loss": 1.1976739168167114,
"step": 326
},
{
"epoch": 0.8961748633879781,
"grad_norm": 0.1898827999830246,
"learning_rate": 3.1891081782252726e-06,
"loss": 1.144290566444397,
"step": 328
},
{
"epoch": 0.9016393442622951,
"grad_norm": 0.18415604531764984,
"learning_rate": 3.1043758348059384e-06,
"loss": 1.2545756101608276,
"step": 330
},
{
"epoch": 0.907103825136612,
"grad_norm": 0.141061931848526,
"learning_rate": 3.0225775183111784e-06,
"loss": 1.2385872602462769,
"step": 332
},
{
"epoch": 0.912568306010929,
"grad_norm": 0.3184133470058441,
"learning_rate": 2.943743625129917e-06,
"loss": 1.4611538648605347,
"step": 334
},
{
"epoch": 0.9180327868852459,
"grad_norm": 0.35060685873031616,
"learning_rate": 2.867903450066513e-06,
"loss": 1.186466932296753,
"step": 336
},
{
"epoch": 0.9234972677595629,
"grad_norm": 0.13374803960323334,
"learning_rate": 2.795085175454741e-06,
"loss": 1.2442353963851929,
"step": 338
},
{
"epoch": 0.9289617486338798,
"grad_norm": 0.8894411325454712,
"learning_rate": 2.7253158606851983e-06,
"loss": 0.7970354557037354,
"step": 340
},
{
"epoch": 0.9344262295081968,
"grad_norm": 0.39505577087402344,
"learning_rate": 2.6586214321499952e-06,
"loss": 1.1527299880981445,
"step": 342
},
{
"epoch": 0.9398907103825137,
"grad_norm": 0.21840043365955353,
"learning_rate": 2.5950266736084558e-06,
"loss": 0.7329099774360657,
"step": 344
},
{
"epoch": 0.9453551912568307,
"grad_norm": 0.18624389171600342,
"learning_rate": 2.5345552169774413e-06,
"loss": 1.213990569114685,
"step": 346
},
{
"epoch": 0.9508196721311475,
"grad_norm": 0.20502322912216187,
"learning_rate": 2.477229533549685e-06,
"loss": 1.0040937662124634,
"step": 348
},
{
"epoch": 0.9562841530054644,
"grad_norm": 0.13913527131080627,
"learning_rate": 2.423070925643422e-06,
"loss": 1.195319652557373,
"step": 350
},
{
"epoch": 0.9617486338797814,
"grad_norm": 0.25502294301986694,
"learning_rate": 2.372099518686416e-06,
"loss": 1.3571830987930298,
"step": 352
},
{
"epoch": 0.9672131147540983,
"grad_norm": 0.48177042603492737,
"learning_rate": 2.324334253737321e-06,
"loss": 0.7296788692474365,
"step": 354
},
{
"epoch": 0.9726775956284153,
"grad_norm": 0.2695556581020355,
"learning_rate": 2.2797928804471413e-06,
"loss": 0.7443707585334778,
"step": 356
},
{
"epoch": 0.9781420765027322,
"grad_norm": 0.2975868880748749,
"learning_rate": 2.2384919504634465e-06,
"loss": 1.2335455417633057,
"step": 358
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.13546700775623322,
"learning_rate": 2.2004468112797345e-06,
"loss": 0.8478338718414307,
"step": 360
},
{
"epoch": 0.9890710382513661,
"grad_norm": 0.15997223556041718,
"learning_rate": 2.165671600532298e-06,
"loss": 1.1819065809249878,
"step": 362
},
{
"epoch": 0.994535519125683,
"grad_norm": 0.26818767189979553,
"learning_rate": 2.134179240746638e-06,
"loss": 1.3250752687454224,
"step": 364
},
{
"epoch": 1.0,
"grad_norm": 0.16711430251598358,
"learning_rate": 2.1059814345354434e-06,
"loss": 1.2777149677276611,
"step": 366
},
{
"epoch": 1.005464480874317,
"grad_norm": 0.13456079363822937,
"learning_rate": 2.0810886602498733e-06,
"loss": 0.9346177577972412,
"step": 368
},
{
"epoch": 1.010928961748634,
"grad_norm": 0.2345515638589859,
"learning_rate": 2.059510168085791e-06,
"loss": 1.343198537826538,
"step": 370
},
{
"epoch": 1.0163934426229508,
"grad_norm": 0.18461638689041138,
"learning_rate": 2.0412539766463697e-06,
"loss": 1.2866058349609375,
"step": 372
},
{
"epoch": 1.0218579234972678,
"grad_norm": 0.1437111347913742,
"learning_rate": 2.0263268699623746e-06,
"loss": 1.1869018077850342,
"step": 374
},
{
"epoch": 1.0273224043715847,
"grad_norm": 0.13092809915542603,
"learning_rate": 2.0147343949711965e-06,
"loss": 1.1603018045425415,
"step": 376
},
{
"epoch": 1.0327868852459017,
"grad_norm": 0.24336589872837067,
"learning_rate": 2.0064808594556066e-06,
"loss": 1.1444275379180908,
"step": 378
},
{
"epoch": 1.0382513661202186,
"grad_norm": 0.13655312359333038,
"learning_rate": 2.0015693304429757e-06,
"loss": 1.1514266729354858,
"step": 380
},
{
"epoch": 1.0437158469945356,
"grad_norm": 0.09100303798913956,
"learning_rate": 2.000001633065562e-06,
"loss": 0.7742247581481934,
"step": 382
},
{
"epoch": 1.0491803278688525,
"grad_norm": 0.18667501211166382,
"learning_rate": 2.0017783498822896e-06,
"loss": 1.1750892400741577,
"step": 384
},
{
"epoch": 1.0546448087431695,
"grad_norm": 0.14683479070663452,
"learning_rate": 2.006898820662268e-06,
"loss": 1.1899375915527344,
"step": 386
},
{
"epoch": 1.0601092896174864,
"grad_norm": 0.17781662940979004,
"learning_rate": 2.0153611426301325e-06,
"loss": 1.5731885433197021,
"step": 388
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.09566520154476166,
"learning_rate": 2.027162171173126e-06,
"loss": 0.9452205300331116,
"step": 390
},
{
"epoch": 1.0710382513661203,
"grad_norm": 0.1786738634109497,
"learning_rate": 2.0422975210096317e-06,
"loss": 0.6096203327178955,
"step": 392
},
{
"epoch": 1.0765027322404372,
"grad_norm": 0.15426206588745117,
"learning_rate": 2.0607615678187605e-06,
"loss": 1.1949257850646973,
"step": 394
},
{
"epoch": 1.0819672131147542,
"grad_norm": 0.1298629641532898,
"learning_rate": 2.082547450330353e-06,
"loss": 1.1203322410583496,
"step": 396
},
{
"epoch": 1.0874316939890711,
"grad_norm": 0.1290188431739807,
"learning_rate": 2.1076470728746407e-06,
"loss": 1.1237056255340576,
"step": 398
},
{
"epoch": 1.092896174863388,
"grad_norm": 0.2040422558784485,
"learning_rate": 2.136051108390608e-06,
"loss": 1.2583763599395752,
"step": 400
},
{
"epoch": 1.098360655737705,
"grad_norm": 0.10099250823259354,
"learning_rate": 2.167749001891944e-06,
"loss": 1.1252448558807373,
"step": 402
},
{
"epoch": 1.1038251366120218,
"grad_norm": 0.11583796888589859,
"learning_rate": 2.202728974389296e-06,
"loss": 1.1236039400100708,
"step": 404
},
{
"epoch": 1.1092896174863387,
"grad_norm": 0.13602401316165924,
"learning_rate": 2.240978027267357e-06,
"loss": 1.15111243724823,
"step": 406
},
{
"epoch": 1.1147540983606556,
"grad_norm": 0.09002802520990372,
"learning_rate": 2.2824819471151736e-06,
"loss": 1.4592684507369995,
"step": 408
},
{
"epoch": 1.1202185792349726,
"grad_norm": 0.2053132951259613,
"learning_rate": 2.327225311007878e-06,
"loss": 1.1027615070343018,
"step": 410
},
{
"epoch": 1.1256830601092895,
"grad_norm": 0.16059550642967224,
"learning_rate": 2.3751914922378623e-06,
"loss": 1.101325273513794,
"step": 412
},
{
"epoch": 1.1311475409836065,
"grad_norm": 0.13088057935237885,
"learning_rate": 2.4263626664932998e-06,
"loss": 0.57912278175354,
"step": 414
},
{
"epoch": 1.1366120218579234,
"grad_norm": 0.1548115313053131,
"learning_rate": 2.4807198184816817e-06,
"loss": 1.3488638401031494,
"step": 416
},
{
"epoch": 1.1420765027322404,
"grad_norm": 0.4698297083377838,
"learning_rate": 2.5382427489959373e-06,
"loss": 1.2834604978561401,
"step": 418
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.1722850799560547,
"learning_rate": 2.5989100824204876e-06,
"loss": 1.126566767692566,
"step": 420
},
{
"epoch": 1.1530054644808743,
"grad_norm": 0.1067054495215416,
"learning_rate": 2.662699274674462e-06,
"loss": 1.2055656909942627,
"step": 422
},
{
"epoch": 1.1584699453551912,
"grad_norm": 0.10827223211526871,
"learning_rate": 2.7295866215891107e-06,
"loss": 1.1322380304336548,
"step": 424
},
{
"epoch": 1.1639344262295082,
"grad_norm": 0.22720572352409363,
"learning_rate": 2.799547267716326e-06,
"loss": 0.7926866412162781,
"step": 426
},
{
"epoch": 1.169398907103825,
"grad_norm": 0.14912304282188416,
"learning_rate": 2.872555215564946e-06,
"loss": 1.257475733757019,
"step": 428
},
{
"epoch": 1.174863387978142,
"grad_norm": 0.12470504641532898,
"learning_rate": 2.9485833352614895e-06,
"loss": 1.196222186088562,
"step": 430
},
{
"epoch": 1.180327868852459,
"grad_norm": 0.2074936479330063,
"learning_rate": 3.027603374631647e-06,
"loss": 1.1812493801116943,
"step": 432
},
{
"epoch": 1.185792349726776,
"grad_norm": 0.15828382968902588,
"learning_rate": 3.1095859696988273e-06,
"loss": 1.1702839136123657,
"step": 434
},
{
"epoch": 1.1912568306010929,
"grad_norm": 0.1257786899805069,
"learning_rate": 3.1945006555958885e-06,
"loss": 0.592043399810791,
"step": 436
},
{
"epoch": 1.1967213114754098,
"grad_norm": 0.0843435600399971,
"learning_rate": 3.2823158778858976e-06,
"loss": 0.6085972785949707,
"step": 438
},
{
"epoch": 1.2021857923497268,
"grad_norm": 0.1315852850675583,
"learning_rate": 3.372999004287839e-06,
"loss": 1.0785596370697021,
"step": 440
},
{
"epoch": 1.2076502732240437,
"grad_norm": 0.15027481317520142,
"learning_rate": 3.4665163368028044e-06,
"loss": 1.4450383186340332,
"step": 442
},
{
"epoch": 1.2131147540983607,
"grad_norm": 0.3036656379699707,
"learning_rate": 3.562833124236238e-06,
"loss": 1.367746353149414,
"step": 444
},
{
"epoch": 1.2185792349726776,
"grad_norm": 0.13871243596076965,
"learning_rate": 3.6619135751115325e-06,
"loss": 0.6442332863807678,
"step": 446
},
{
"epoch": 1.2240437158469946,
"grad_norm": 0.22103987634181976,
"learning_rate": 3.763720870970201e-06,
"loss": 0.9271941184997559,
"step": 448
},
{
"epoch": 1.2295081967213115,
"grad_norm": 0.13876283168792725,
"learning_rate": 3.86821718005367e-06,
"loss": 1.1712263822555542,
"step": 450
},
{
"epoch": 1.2349726775956285,
"grad_norm": 0.1687919646501541,
"learning_rate": 3.975363671361641e-06,
"loss": 0.7494930028915405,
"step": 452
},
{
"epoch": 1.2404371584699454,
"grad_norm": 0.16324791312217712,
"learning_rate": 4.0851205290817254e-06,
"loss": 1.1281697750091553,
"step": 454
},
{
"epoch": 1.2459016393442623,
"grad_norm": 0.11219220608472824,
"learning_rate": 4.197446967385105e-06,
"loss": 1.198438286781311,
"step": 456
},
{
"epoch": 1.2513661202185793,
"grad_norm": 0.1335090547800064,
"learning_rate": 4.312301245582571e-06,
"loss": 0.6554253697395325,
"step": 458
},
{
"epoch": 1.2568306010928962,
"grad_norm": 0.2589283883571625,
"learning_rate": 4.429640683635466e-06,
"loss": 1.20187246799469,
"step": 460
},
{
"epoch": 1.2622950819672132,
"grad_norm": 0.1348496526479721,
"learning_rate": 4.549421678015633e-06,
"loss": 1.147897720336914,
"step": 462
},
{
"epoch": 1.2677595628415301,
"grad_norm": 0.28325796127319336,
"learning_rate": 4.671599717908582e-06,
"loss": 0.9092267155647278,
"step": 464
},
{
"epoch": 1.273224043715847,
"grad_norm": 0.16627341508865356,
"learning_rate": 4.796129401753752e-06,
"loss": 1.2192769050598145,
"step": 466
},
{
"epoch": 1.278688524590164,
"grad_norm": 0.23972608149051666,
"learning_rate": 4.922964454115837e-06,
"loss": 0.45847344398498535,
"step": 468
},
{
"epoch": 1.2841530054644807,
"grad_norm": 0.5658842921257019,
"learning_rate": 5.0520577428807835e-06,
"loss": 0.4286736845970154,
"step": 470
},
{
"epoch": 1.289617486338798,
"grad_norm": 0.46968233585357666,
"learning_rate": 5.183361296770197e-06,
"loss": 1.0588371753692627,
"step": 472
},
{
"epoch": 1.2950819672131146,
"grad_norm": 0.38157007098197937,
"learning_rate": 5.316826323167505e-06,
"loss": 0.9430091977119446,
"step": 474
},
{
"epoch": 1.3005464480874318,
"grad_norm": 0.15258565545082092,
"learning_rate": 5.4524032262494175e-06,
"loss": 1.13564932346344,
"step": 476
},
{
"epoch": 1.3060109289617485,
"grad_norm": 0.2158811241388321,
"learning_rate": 5.590041625415783e-06,
"loss": 1.1955578327178955,
"step": 478
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.4422401189804077,
"learning_rate": 5.7296903740111076e-06,
"loss": 1.2549294233322144,
"step": 480
},
{
"epoch": 1.3169398907103824,
"grad_norm": 0.11041804403066635,
"learning_rate": 5.87129757833077e-06,
"loss": 1.1942386627197266,
"step": 482
},
{
"epoch": 1.3224043715846996,
"grad_norm": 0.14040009677410126,
"learning_rate": 6.014810616904747e-06,
"loss": 1.1555407047271729,
"step": 484
},
{
"epoch": 1.3278688524590163,
"grad_norm": 0.09884827584028244,
"learning_rate": 6.160176160051906e-06,
"loss": 0.1766074001789093,
"step": 486
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.18465925753116608,
"learning_rate": 6.307340189697344e-06,
"loss": 1.1929932832717896,
"step": 488
},
{
"epoch": 1.3387978142076502,
"grad_norm": 0.1754603087902069,
"learning_rate": 6.456248019445626e-06,
"loss": 1.0935235023498535,
"step": 490
},
{
"epoch": 1.3442622950819672,
"grad_norm": 0.18355412781238556,
"learning_rate": 6.606844314902321e-06,
"loss": 1.175545334815979,
"step": 492
},
{
"epoch": 1.349726775956284,
"grad_norm": 0.2099565863609314,
"learning_rate": 6.7590731142363915e-06,
"loss": 0.6870489716529846,
"step": 494
},
{
"epoch": 1.355191256830601,
"grad_norm": 0.1687641441822052,
"learning_rate": 6.912877848975638e-06,
"loss": 1.1621768474578857,
"step": 496
},
{
"epoch": 1.360655737704918,
"grad_norm": 0.24288491904735565,
"learning_rate": 7.068201365027712e-06,
"loss": 0.9009864330291748,
"step": 498
},
{
"epoch": 1.366120218579235,
"grad_norm": 0.164619579911232,
"learning_rate": 7.2249859439185875e-06,
"loss": 1.132088541984558,
"step": 500
},
{
"epoch": 1.3715846994535519,
"grad_norm": 0.12828585505485535,
"learning_rate": 7.3831733242409285e-06,
"loss": 1.2778956890106201,
"step": 502
},
{
"epoch": 1.3770491803278688,
"grad_norm": 0.2682889997959137,
"learning_rate": 7.5427047233040485e-06,
"loss": 1.1193040609359741,
"step": 504
},
{
"epoch": 1.3825136612021858,
"grad_norm": 0.12167899310588837,
"learning_rate": 7.703520858977702e-06,
"loss": 0.73407381772995,
"step": 506
},
{
"epoch": 1.3879781420765027,
"grad_norm": 0.15630660951137543,
"learning_rate": 7.865561971721389e-06,
"loss": 0.7426860928535461,
"step": 508
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.19107486307621002,
"learning_rate": 8.02876784679115e-06,
"loss": 1.1898062229156494,
"step": 510
},
{
"epoch": 1.3989071038251366,
"grad_norm": 0.176279217004776,
"learning_rate": 8.193077836615386e-06,
"loss": 1.1608000993728638,
"step": 512
},
{
"epoch": 1.4043715846994536,
"grad_norm": 0.18996112048625946,
"learning_rate": 8.35843088333168e-06,
"loss": 0.6805540323257446,
"step": 514
},
{
"epoch": 1.4098360655737705,
"grad_norm": 0.3888643682003021,
"learning_rate": 8.524765541475935e-06,
"loss": 1.573140025138855,
"step": 516
},
{
"epoch": 1.4153005464480874,
"grad_norm": 0.1736215204000473,
"learning_rate": 8.692020000815627e-06,
"loss": 0.8413932919502258,
"step": 518
},
{
"epoch": 1.4207650273224044,
"grad_norm": 0.1383085995912552,
"learning_rate": 8.860132109318622e-06,
"loss": 0.7804769277572632,
"step": 520
},
{
"epoch": 1.4262295081967213,
"grad_norm": 0.18307553231716156,
"learning_rate": 9.029039396248916e-06,
"loss": 0.7059910893440247,
"step": 522
},
{
"epoch": 1.4316939890710383,
"grad_norm": 0.15533241629600525,
"learning_rate": 9.198679095380924e-06,
"loss": 0.8162409663200378,
"step": 524
},
{
"epoch": 1.4371584699453552,
"grad_norm": 0.3435671329498291,
"learning_rate": 9.368988168323451e-06,
"loss": 1.0322041511535645,
"step": 526
},
{
"epoch": 1.4426229508196722,
"grad_norm": 0.10851337015628815,
"learning_rate": 9.539903327944926e-06,
"loss": 1.1319749355316162,
"step": 528
},
{
"epoch": 1.4480874316939891,
"grad_norm": 0.13830029964447021,
"learning_rate": 9.711361061890942e-06,
"loss": 0.7779232263565063,
"step": 530
},
{
"epoch": 1.453551912568306,
"grad_norm": 0.187329962849617,
"learning_rate": 9.8832976561856e-06,
"loss": 1.1993160247802734,
"step": 532
},
{
"epoch": 1.459016393442623,
"grad_norm": 0.1746772676706314,
"learning_rate": 1.0055649218907688e-05,
"loss": 1.1548646688461304,
"step": 534
},
{
"epoch": 1.46448087431694,
"grad_norm": 0.10917941480875015,
"learning_rate": 1.0228351703933075e-05,
"loss": 1.146438479423523,
"step": 536
},
{
"epoch": 1.469945355191257,
"grad_norm": 0.13628044724464417,
"learning_rate": 1.0401340934734287e-05,
"loss": 0.6834872364997864,
"step": 538
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.1356225311756134,
"learning_rate": 1.0574552628228691e-05,
"loss": 0.9159919619560242,
"step": 540
},
{
"epoch": 1.4808743169398908,
"grad_norm": 0.40020471811294556,
"learning_rate": 1.0747922418666115e-05,
"loss": 1.231278896331787,
"step": 542
},
{
"epoch": 1.4863387978142075,
"grad_norm": 0.13868877291679382,
"learning_rate": 1.0921385881547311e-05,
"loss": 1.125664234161377,
"step": 544
},
{
"epoch": 1.4918032786885247,
"grad_norm": 0.19291168451309204,
"learning_rate": 1.1094878557564217e-05,
"loss": 0.7880281805992126,
"step": 546
},
{
"epoch": 1.4972677595628414,
"grad_norm": 0.22060799598693848,
"learning_rate": 1.1268335976553098e-05,
"loss": 0.9573584198951721,
"step": 548
},
{
"epoch": 1.5027322404371586,
"grad_norm": 0.11164124310016632,
"learning_rate": 1.144169368145179e-05,
"loss": 1.1322665214538574,
"step": 550
},
{
"epoch": 1.5081967213114753,
"grad_norm": 0.5197082757949829,
"learning_rate": 1.1614887252252076e-05,
"loss": 0.8590179085731506,
"step": 552
},
{
"epoch": 1.5136612021857925,
"grad_norm": 0.15628303587436676,
"learning_rate": 1.1787852329938198e-05,
"loss": 1.131445288658142,
"step": 554
},
{
"epoch": 1.5191256830601092,
"grad_norm": 0.18890385329723358,
"learning_rate": 1.1960524640402862e-05,
"loss": 0.8027105927467346,
"step": 556
},
{
"epoch": 1.5245901639344264,
"grad_norm": 0.2331659346818924,
"learning_rate": 1.2132840018331514e-05,
"loss": 1.1426656246185303,
"step": 558
},
{
"epoch": 1.530054644808743,
"grad_norm": 0.15703825652599335,
"learning_rate": 1.2304734431046335e-05,
"loss": 0.3221997618675232,
"step": 560
},
{
"epoch": 1.5355191256830603,
"grad_norm": 0.10984613001346588,
"learning_rate": 1.2476144002300864e-05,
"loss": 1.136183500289917,
"step": 562
},
{
"epoch": 1.540983606557377,
"grad_norm": 0.14987057447433472,
"learning_rate": 1.264700503601655e-05,
"loss": 0.7029743194580078,
"step": 564
},
{
"epoch": 1.5464480874316942,
"grad_norm": 0.13446514308452606,
"learning_rate": 1.2817254039952253e-05,
"loss": 1.243178367614746,
"step": 566
},
{
"epoch": 1.5519125683060109,
"grad_norm": 0.13616947829723358,
"learning_rate": 1.2986827749298138e-05,
"loss": 1.218723177909851,
"step": 568
},
{
"epoch": 1.5573770491803278,
"grad_norm": 0.11935320496559143,
"learning_rate": 1.3155663150184942e-05,
"loss": 1.1722185611724854,
"step": 570
},
{
"epoch": 1.5628415300546448,
"grad_norm": 0.10870077461004257,
"learning_rate": 1.3323697503100035e-05,
"loss": 0.7070199251174927,
"step": 572
},
{
"epoch": 1.5683060109289617,
"grad_norm": 0.4351557791233063,
"learning_rate": 1.3490868366201527e-05,
"loss": 1.0434682369232178,
"step": 574
},
{
"epoch": 1.5737704918032787,
"grad_norm": 0.10035145282745361,
"learning_rate": 1.3657113618521763e-05,
"loss": 1.1506720781326294,
"step": 576
},
{
"epoch": 1.5792349726775956,
"grad_norm": 0.13236872851848602,
"learning_rate": 1.3822371483051593e-05,
"loss": 1.1399495601654053,
"step": 578
},
{
"epoch": 1.5846994535519126,
"grad_norm": 0.11729606240987778,
"learning_rate": 1.3986580549696777e-05,
"loss": 1.114902138710022,
"step": 580
},
{
"epoch": 1.5901639344262295,
"grad_norm": 0.16270145773887634,
"learning_rate": 1.4149679798098097e-05,
"loss": 1.1003979444503784,
"step": 582
},
{
"epoch": 1.5956284153005464,
"grad_norm": 0.3030289113521576,
"learning_rate": 1.4311608620306626e-05,
"loss": 0.6834750771522522,
"step": 584
},
{
"epoch": 1.6010928961748634,
"grad_norm": 0.11277345567941666,
"learning_rate": 1.447230684330573e-05,
"loss": 1.1540107727050781,
"step": 586
},
{
"epoch": 1.6065573770491803,
"grad_norm": 0.1363985538482666,
"learning_rate": 1.4631714751371456e-05,
"loss": 1.2158739566802979,
"step": 588
},
{
"epoch": 1.6120218579234973,
"grad_norm": 0.12835562229156494,
"learning_rate": 1.4789773108263016e-05,
"loss": 1.4278290271759033,
"step": 590
},
{
"epoch": 1.6174863387978142,
"grad_norm": 0.1124146431684494,
"learning_rate": 1.4946423179235068e-05,
"loss": 1.1424548625946045,
"step": 592
},
{
"epoch": 1.6229508196721312,
"grad_norm": 0.15784600377082825,
"learning_rate": 1.5101606752863606e-05,
"loss": 1.1741244792938232,
"step": 594
},
{
"epoch": 1.6284153005464481,
"grad_norm": 0.1587875932455063,
"learning_rate": 1.5255266162677466e-05,
"loss": 1.113938570022583,
"step": 596
},
{
"epoch": 1.633879781420765,
"grad_norm": 0.1339414268732071,
"learning_rate": 1.540734430858725e-05,
"loss": 1.3817849159240723,
"step": 598
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.24032869935035706,
"learning_rate": 1.5557784678103852e-05,
"loss": 1.0632057189941406,
"step": 600
},
{
"epoch": 1.644808743169399,
"grad_norm": 0.14984670281410217,
"learning_rate": 1.5706531367338546e-05,
"loss": 1.4408550262451172,
"step": 602
},
{
"epoch": 1.650273224043716,
"grad_norm": 0.11051689833402634,
"learning_rate": 1.5853529101776985e-05,
"loss": 1.1191422939300537,
"step": 604
},
{
"epoch": 1.6557377049180326,
"grad_norm": 0.2666247487068176,
"learning_rate": 1.5998723256819298e-05,
"loss": 1.1819491386413574,
"step": 606
},
{
"epoch": 1.6612021857923498,
"grad_norm": 0.1419493705034256,
"learning_rate": 1.614205987807872e-05,
"loss": 1.1393964290618896,
"step": 608
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.20164312422275543,
"learning_rate": 1.628348570143105e-05,
"loss": 1.1867141723632812,
"step": 610
},
{
"epoch": 1.6721311475409837,
"grad_norm": 0.4470498561859131,
"learning_rate": 1.6422948172807745e-05,
"loss": 0.6968726515769958,
"step": 612
},
{
"epoch": 1.6775956284153004,
"grad_norm": 0.3151567876338959,
"learning_rate": 1.6560395467725086e-05,
"loss": 0.984643816947937,
"step": 614
},
{
"epoch": 1.6830601092896176,
"grad_norm": 0.1973286122083664,
"learning_rate": 1.6695776510542253e-05,
"loss": 0.73722904920578,
"step": 616
},
{
"epoch": 1.6885245901639343,
"grad_norm": 0.11708299070596695,
"learning_rate": 1.6829040993441085e-05,
"loss": 1.1374552249908447,
"step": 618
},
{
"epoch": 1.6939890710382515,
"grad_norm": 0.12260973453521729,
"learning_rate": 1.696013939512057e-05,
"loss": 1.111509084701538,
"step": 620
},
{
"epoch": 1.6994535519125682,
"grad_norm": 0.13327494263648987,
"learning_rate": 1.7089022999199064e-05,
"loss": 1.0331177711486816,
"step": 622
},
{
"epoch": 1.7049180327868854,
"grad_norm": 2.275272846221924,
"learning_rate": 1.7215643912317323e-05,
"loss": 0.7297571301460266,
"step": 624
},
{
"epoch": 1.710382513661202,
"grad_norm": 0.16397327184677124,
"learning_rate": 1.73399550819358e-05,
"loss": 1.1528923511505127,
"step": 626
},
{
"epoch": 1.7158469945355193,
"grad_norm": 0.1343916803598404,
"learning_rate": 1.746191031381943e-05,
"loss": 1.107448935508728,
"step": 628
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.2460424154996872,
"learning_rate": 1.7581464289203475e-05,
"loss": 0.7108749151229858,
"step": 630
},
{
"epoch": 1.7267759562841531,
"grad_norm": 0.46268230676651,
"learning_rate": 1.7698572581634083e-05,
"loss": 0.9818768501281738,
"step": 632
},
{
"epoch": 1.7322404371584699,
"grad_norm": 0.15652601420879364,
"learning_rate": 1.781319167347718e-05,
"loss": 1.3066364526748657,
"step": 634
},
{
"epoch": 1.737704918032787,
"grad_norm": 0.24166584014892578,
"learning_rate": 1.7925278972089748e-05,
"loss": 1.037507176399231,
"step": 636
},
{
"epoch": 1.7431693989071038,
"grad_norm": 0.12377048283815384,
"learning_rate": 1.8034792825647287e-05,
"loss": 1.14212965965271,
"step": 638
},
{
"epoch": 1.748633879781421,
"grad_norm": 0.10295464843511581,
"learning_rate": 1.8141692538621716e-05,
"loss": 1.1561766862869263,
"step": 640
},
{
"epoch": 1.7540983606557377,
"grad_norm": 0.17102456092834473,
"learning_rate": 1.8245938386903896e-05,
"loss": 0.7420101761817932,
"step": 642
},
{
"epoch": 1.7595628415300546,
"grad_norm": 0.19951768219470978,
"learning_rate": 1.8347491632565156e-05,
"loss": 0.43516218662261963,
"step": 644
},
{
"epoch": 1.7650273224043715,
"grad_norm": 0.667060136795044,
"learning_rate": 1.8446314538252407e-05,
"loss": 1.194848656654358,
"step": 646
},
{
"epoch": 1.7704918032786885,
"grad_norm": 0.1322481632232666,
"learning_rate": 1.8542370381211374e-05,
"loss": 0.982461154460907,
"step": 648
},
{
"epoch": 1.7759562841530054,
"grad_norm": 0.1529482752084732,
"learning_rate": 1.8635623466932843e-05,
"loss": 0.8924828767776489,
"step": 650
},
{
"epoch": 1.7814207650273224,
"grad_norm": 0.3255729079246521,
"learning_rate": 1.8726039142416796e-05,
"loss": 0.7710011601448059,
"step": 652
},
{
"epoch": 1.7868852459016393,
"grad_norm": 0.14727091789245605,
"learning_rate": 1.881358380904954e-05,
"loss": 1.1044501066207886,
"step": 654
},
{
"epoch": 1.7923497267759563,
"grad_norm": 0.1614963561296463,
"learning_rate": 1.889822493508897e-05,
"loss": 1.1497408151626587,
"step": 656
},
{
"epoch": 1.7978142076502732,
"grad_norm": 0.26955926418304443,
"learning_rate": 1.897993106775346e-05,
"loss": 0.9794219136238098,
"step": 658
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.2158735692501068,
"learning_rate": 1.9058671844909742e-05,
"loss": 1.1939728260040283,
"step": 660
},
{
"epoch": 1.8087431693989071,
"grad_norm": 0.11255908012390137,
"learning_rate": 1.9134418006355532e-05,
"loss": 0.9392801523208618,
"step": 662
},
{
"epoch": 1.814207650273224,
"grad_norm": 0.17491623759269714,
"learning_rate": 1.9207141404692667e-05,
"loss": 0.7158020734786987,
"step": 664
},
{
"epoch": 1.819672131147541,
"grad_norm": 0.12073390185832977,
"learning_rate": 1.927681501578672e-05,
"loss": 1.1663721799850464,
"step": 666
},
{
"epoch": 1.825136612021858,
"grad_norm": 0.09961558878421783,
"learning_rate": 1.934341294880924e-05,
"loss": 1.1403790712356567,
"step": 668
},
{
"epoch": 1.830601092896175,
"grad_norm": 0.13533425331115723,
"learning_rate": 1.9406910455858783e-05,
"loss": 1.110253095626831,
"step": 670
},
{
"epoch": 1.8360655737704918,
"grad_norm": 0.3127456605434418,
"learning_rate": 1.9467283941157304e-05,
"loss": 1.1353299617767334,
"step": 672
},
{
"epoch": 1.8415300546448088,
"grad_norm": 2.787572145462036,
"learning_rate": 1.952451096981838e-05,
"loss": 0.8424018025398254,
"step": 674
},
{
"epoch": 1.8469945355191257,
"grad_norm": 0.33079859614372253,
"learning_rate": 1.957857027618405e-05,
"loss": 1.183168888092041,
"step": 676
},
{
"epoch": 1.8524590163934427,
"grad_norm": 0.1199263259768486,
"learning_rate": 1.9629441771727166e-05,
"loss": 1.0803476572036743,
"step": 678
},
{
"epoch": 1.8579234972677594,
"grad_norm": 0.1475621610879898,
"learning_rate": 1.9677106552516317e-05,
"loss": 1.1051766872406006,
"step": 680
},
{
"epoch": 1.8633879781420766,
"grad_norm": 0.1131962314248085,
"learning_rate": 1.9721546906240577e-05,
"loss": 1.1643602848052979,
"step": 682
},
{
"epoch": 1.8688524590163933,
"grad_norm": 0.23215758800506592,
"learning_rate": 1.976274631879142e-05,
"loss": 0.8713716268539429,
"step": 684
},
{
"epoch": 1.8743169398907105,
"grad_norm": 0.2560221254825592,
"learning_rate": 1.9800689480399383e-05,
"loss": 0.7212733626365662,
"step": 686
},
{
"epoch": 1.8797814207650272,
"grad_norm": 0.17682117223739624,
"learning_rate": 1.9835362291323222e-05,
"loss": 1.1008837223052979,
"step": 688
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.1772257536649704,
"learning_rate": 1.9866751867089363e-05,
"loss": 0.8719238638877869,
"step": 690
},
{
"epoch": 1.890710382513661,
"grad_norm": 0.15099941194057465,
"learning_rate": 1.9894846543279838e-05,
"loss": 1.1498489379882812,
"step": 692
},
{
"epoch": 1.8961748633879782,
"grad_norm": 0.321058452129364,
"learning_rate": 1.991963587986677e-05,
"loss": 0.842879593372345,
"step": 694
},
{
"epoch": 1.901639344262295,
"grad_norm": 0.11210023611783981,
"learning_rate": 1.9941110665091922e-05,
"loss": 1.1328097581863403,
"step": 696
},
{
"epoch": 1.9071038251366121,
"grad_norm": 0.1561606526374817,
"learning_rate": 1.9959262918889774e-05,
"loss": 1.4744820594787598,
"step": 698
},
{
"epoch": 1.9125683060109289,
"grad_norm": 0.13894771039485931,
"learning_rate": 1.9974085895852973e-05,
"loss": 1.1326099634170532,
"step": 700
},
{
"epoch": 1.918032786885246,
"grad_norm": 0.09440940618515015,
"learning_rate": 1.99855740877389e-05,
"loss": 1.1104779243469238,
"step": 702
},
{
"epoch": 1.9234972677595628,
"grad_norm": 0.11499873548746109,
"learning_rate": 1.9993723225516553e-05,
"loss": 1.1850953102111816,
"step": 704
},
{
"epoch": 1.92896174863388,
"grad_norm": 0.2087257206439972,
"learning_rate": 1.9998530280952938e-05,
"loss": 1.1748231649398804,
"step": 706
},
{
"epoch": 1.9344262295081966,
"grad_norm": 0.16280730068683624,
"learning_rate": 1.9999993467738345e-05,
"loss": 1.1230883598327637,
"step": 708
},
{
"epoch": 1.9398907103825138,
"grad_norm": 0.5092416405677795,
"learning_rate": 1.9998112242150162e-05,
"loss": 1.339207649230957,
"step": 710
},
{
"epoch": 1.9453551912568305,
"grad_norm": 0.12435825169086456,
"learning_rate": 1.999288730325491e-05,
"loss": 1.1549785137176514,
"step": 712
},
{
"epoch": 1.9508196721311475,
"grad_norm": 0.19420410692691803,
"learning_rate": 1.9984320592648474e-05,
"loss": 0.9229910373687744,
"step": 714
},
{
"epoch": 1.9562841530054644,
"grad_norm": 0.15405113995075226,
"learning_rate": 1.9972415293734607e-05,
"loss": 1.2319244146347046,
"step": 716
},
{
"epoch": 1.9617486338797814,
"grad_norm": 0.14918380975723267,
"learning_rate": 1.995717583054196e-05,
"loss": 0.707750678062439,
"step": 718
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.11523278057575226,
"learning_rate": 1.9938607866080114e-05,
"loss": 0.7229039669036865,
"step": 720
},
{
"epoch": 1.9726775956284153,
"grad_norm": 0.1472606509923935,
"learning_rate": 1.991671830023521e-05,
"loss": 1.039513349533081,
"step": 722
},
{
"epoch": 1.9781420765027322,
"grad_norm": 0.2472400814294815,
"learning_rate": 1.989151526720591e-05,
"loss": 1.1361088752746582,
"step": 724
},
{
"epoch": 1.9836065573770492,
"grad_norm": 0.3535289764404297,
"learning_rate": 1.986300813248073e-05,
"loss": 0.7156858444213867,
"step": 726
},
{
"epoch": 1.989071038251366,
"grad_norm": 0.14543037116527557,
"learning_rate": 1.9831207489357825e-05,
"loss": 0.9188486933708191,
"step": 728
},
{
"epoch": 1.994535519125683,
"grad_norm": 0.12422552704811096,
"learning_rate": 1.979612515500847e-05,
"loss": 1.148845911026001,
"step": 730
},
{
"epoch": 2.0,
"grad_norm": 0.19218623638153076,
"learning_rate": 1.97577741660858e-05,
"loss": 1.157519817352295,
"step": 732
},
{
"epoch": 2.0054644808743167,
"grad_norm": 0.10968166589736938,
"learning_rate": 1.9716168773880382e-05,
"loss": 0.7050259113311768,
"step": 734
},
{
"epoch": 2.010928961748634,
"grad_norm": 0.14246150851249695,
"learning_rate": 1.9671324439024374e-05,
"loss": 0.9686898589134216,
"step": 736
},
{
"epoch": 2.0163934426229506,
"grad_norm": 0.17498300969600677,
"learning_rate": 1.9623257825746357e-05,
"loss": 1.0406219959259033,
"step": 738
},
{
"epoch": 2.021857923497268,
"grad_norm": 0.11363784968852997,
"learning_rate": 1.9571986795678878e-05,
"loss": 1.0660529136657715,
"step": 740
},
{
"epoch": 2.0273224043715845,
"grad_norm": 0.14233727753162384,
"learning_rate": 1.951753040122102e-05,
"loss": 1.067291498184204,
"step": 742
},
{
"epoch": 2.0327868852459017,
"grad_norm": 0.33568593859672546,
"learning_rate": 1.9459908878458532e-05,
"loss": 1.3619149923324585,
"step": 744
},
{
"epoch": 2.0382513661202184,
"grad_norm": 0.11692183464765549,
"learning_rate": 1.939914363964402e-05,
"loss": 1.4234706163406372,
"step": 746
},
{
"epoch": 2.0437158469945356,
"grad_norm": 0.13339044153690338,
"learning_rate": 1.9335257265240168e-05,
"loss": 1.137938380241394,
"step": 748
},
{
"epoch": 2.0491803278688523,
"grad_norm": 0.1429172307252884,
"learning_rate": 1.9268273495528768e-05,
"loss": 1.0415153503417969,
"step": 750
},
{
"epoch": 2.0546448087431695,
"grad_norm": 0.1754903793334961,
"learning_rate": 1.9198217221788806e-05,
"loss": 1.0955044031143188,
"step": 752
},
{
"epoch": 2.060109289617486,
"grad_norm": 0.13161030411720276,
"learning_rate": 1.9125114477046807e-05,
"loss": 0.6989483833312988,
"step": 754
},
{
"epoch": 2.0655737704918034,
"grad_norm": 0.23866786062717438,
"learning_rate": 1.9048992426402947e-05,
"loss": 0.20047175884246826,
"step": 756
},
{
"epoch": 2.07103825136612,
"grad_norm": 0.2189582735300064,
"learning_rate": 1.896987935693643e-05,
"loss": 1.1039602756500244,
"step": 758
},
{
"epoch": 2.0765027322404372,
"grad_norm": 0.14158737659454346,
"learning_rate": 1.888780466719397e-05,
"loss": 0.8127365708351135,
"step": 760
},
{
"epoch": 2.081967213114754,
"grad_norm": 0.16718794405460358,
"learning_rate": 1.8802798856265254e-05,
"loss": 1.090496301651001,
"step": 762
},
{
"epoch": 2.087431693989071,
"grad_norm": 0.646533727645874,
"learning_rate": 1.8714893512449424e-05,
"loss": 0.9807750582695007,
"step": 764
},
{
"epoch": 2.092896174863388,
"grad_norm": 0.16664795577526093,
"learning_rate": 1.8624121301516808e-05,
"loss": 1.148633599281311,
"step": 766
},
{
"epoch": 2.098360655737705,
"grad_norm": 0.15415580570697784,
"learning_rate": 1.853051595457026e-05,
"loss": 1.0199898481369019,
"step": 768
},
{
"epoch": 2.1038251366120218,
"grad_norm": 0.12932369112968445,
"learning_rate": 1.843411225551065e-05,
"loss": 1.1014589071273804,
"step": 770
},
{
"epoch": 2.109289617486339,
"grad_norm": 0.234808549284935,
"learning_rate": 1.8334946028111088e-05,
"loss": 1.0307773351669312,
"step": 772
},
{
"epoch": 2.1147540983606556,
"grad_norm": 0.15817847847938538,
"learning_rate": 1.8233054122704765e-05,
"loss": 0.892197847366333,
"step": 774
},
{
"epoch": 2.120218579234973,
"grad_norm": 0.18785926699638367,
"learning_rate": 1.8128474402491286e-05,
"loss": 0.5686047673225403,
"step": 776
},
{
"epoch": 2.1256830601092895,
"grad_norm": 0.20626573264598846,
"learning_rate": 1.802124572946668e-05,
"loss": 1.1014513969421387,
"step": 778
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.1809910535812378,
"learning_rate": 1.791140794998219e-05,
"loss": 0.5480612516403198,
"step": 780
},
{
"epoch": 2.1366120218579234,
"grad_norm": 0.1974882036447525,
"learning_rate": 1.7799001879937294e-05,
"loss": 0.7820758819580078,
"step": 782
},
{
"epoch": 2.1420765027322406,
"grad_norm": 0.2597973346710205,
"learning_rate": 1.768406928961248e-05,
"loss": 0.8670818209648132,
"step": 784
},
{
"epoch": 2.1475409836065573,
"grad_norm": 0.12401880323886871,
"learning_rate": 1.7566652888147328e-05,
"loss": 0.637206494808197,
"step": 786
},
{
"epoch": 2.1530054644808745,
"grad_norm": 0.22445400059223175,
"learning_rate": 1.7446796307669725e-05,
"loss": 0.6058897972106934,
"step": 788
},
{
"epoch": 2.158469945355191,
"grad_norm": 0.14324024319648743,
"learning_rate": 1.732454408708209e-05,
"loss": 1.1209547519683838,
"step": 790
},
{
"epoch": 2.1639344262295084,
"grad_norm": 0.1719355434179306,
"learning_rate": 1.719994165551063e-05,
"loss": 1.0784960985183716,
"step": 792
},
{
"epoch": 2.169398907103825,
"grad_norm": 0.4398317337036133,
"learning_rate": 1.7073035315423838e-05,
"loss": 0.8808025121688843,
"step": 794
},
{
"epoch": 2.1748633879781423,
"grad_norm": 0.14957763254642487,
"learning_rate": 1.6943872225426396e-05,
"loss": 0.6589257717132568,
"step": 796
},
{
"epoch": 2.180327868852459,
"grad_norm": 0.223003551363945,
"learning_rate": 1.6812500382734977e-05,
"loss": 0.740198016166687,
"step": 798
},
{
"epoch": 2.185792349726776,
"grad_norm": 0.1016513854265213,
"learning_rate": 1.6678968605342348e-05,
"loss": 1.0570908784866333,
"step": 800
},
{
"epoch": 2.191256830601093,
"grad_norm": 0.16483862698078156,
"learning_rate": 1.6543326513876602e-05,
"loss": 1.1871097087860107,
"step": 802
},
{
"epoch": 2.19672131147541,
"grad_norm": 0.14547647535800934,
"learning_rate": 1.6405624513162002e-05,
"loss": 1.3363229036331177,
"step": 804
},
{
"epoch": 2.202185792349727,
"grad_norm": 0.12882763147354126,
"learning_rate": 1.6265913773488456e-05,
"loss": 1.0369070768356323,
"step": 806
},
{
"epoch": 2.2076502732240435,
"grad_norm": 0.12072896212339401,
"learning_rate": 1.6124246211596606e-05,
"loss": 0.9998791217803955,
"step": 808
},
{
"epoch": 2.2131147540983607,
"grad_norm": 0.16041557490825653,
"learning_rate": 1.598067447138542e-05,
"loss": 1.0319167375564575,
"step": 810
},
{
"epoch": 2.2185792349726774,
"grad_norm": 0.12052586674690247,
"learning_rate": 1.5835251904349688e-05,
"loss": 1.079565405845642,
"step": 812
},
{
"epoch": 2.2240437158469946,
"grad_norm": 0.2981847822666168,
"learning_rate": 1.5688032549754453e-05,
"loss": 0.6331281661987305,
"step": 814
},
{
"epoch": 2.2295081967213113,
"grad_norm": 0.21407558023929596,
"learning_rate": 1.553907111455401e-05,
"loss": 0.9996432662010193,
"step": 816
},
{
"epoch": 2.2349726775956285,
"grad_norm": 0.38344627618789673,
"learning_rate": 1.538842295306264e-05,
"loss": 0.7279675006866455,
"step": 818
},
{
"epoch": 2.240437158469945,
"grad_norm": 0.23144850134849548,
"learning_rate": 1.5236144046384917e-05,
"loss": 1.0548104047775269,
"step": 820
},
{
"epoch": 2.2459016393442623,
"grad_norm": 0.16722913086414337,
"learning_rate": 1.5082290981612987e-05,
"loss": 1.0882266759872437,
"step": 822
},
{
"epoch": 2.251366120218579,
"grad_norm": 0.19076433777809143,
"learning_rate": 1.4926920930798736e-05,
"loss": 0.9657437801361084,
"step": 824
},
{
"epoch": 2.2568306010928962,
"grad_norm": 0.17660953104496002,
"learning_rate": 1.4770091629708562e-05,
"loss": 1.0023385286331177,
"step": 826
},
{
"epoch": 2.262295081967213,
"grad_norm": 0.3156924247741699,
"learning_rate": 1.461186135636868e-05,
"loss": 0.49919602274894714,
"step": 828
},
{
"epoch": 2.26775956284153,
"grad_norm": 0.13933399319648743,
"learning_rate": 1.4452288909408864e-05,
"loss": 0.9852725267410278,
"step": 830
},
{
"epoch": 2.273224043715847,
"grad_norm": 0.20533017814159393,
"learning_rate": 1.4291433586212831e-05,
"loss": 0.7310548424720764,
"step": 832
},
{
"epoch": 2.278688524590164,
"grad_norm": 0.21601639688014984,
"learning_rate": 1.4129355160883216e-05,
"loss": 0.7628719210624695,
"step": 834
},
{
"epoch": 2.2841530054644807,
"grad_norm": 0.150185227394104,
"learning_rate": 1.3966113862029429e-05,
"loss": 1.0764801502227783,
"step": 836
},
{
"epoch": 2.289617486338798,
"grad_norm": 0.1973620504140854,
"learning_rate": 1.3801770350386568e-05,
"loss": 1.0453038215637207,
"step": 838
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.12196861952543259,
"learning_rate": 1.363638569627384e-05,
"loss": 0.5716758370399475,
"step": 840
},
{
"epoch": 2.300546448087432,
"grad_norm": 0.2215929925441742,
"learning_rate": 1.3470021356900696e-05,
"loss": 0.8548388481140137,
"step": 842
},
{
"epoch": 2.3060109289617485,
"grad_norm": 0.0590071864426136,
"learning_rate": 1.3302739153529252e-05,
"loss": 0.6019071936607361,
"step": 844
},
{
"epoch": 2.3114754098360657,
"grad_norm": 0.41101735830307007,
"learning_rate": 1.3134601248501366e-05,
"loss": 1.1974279880523682,
"step": 846
},
{
"epoch": 2.3169398907103824,
"grad_norm": 0.13029514253139496,
"learning_rate": 1.2965670122139071e-05,
"loss": 1.1063951253890991,
"step": 848
},
{
"epoch": 2.3224043715846996,
"grad_norm": 0.129970520734787,
"learning_rate": 1.2796008549526752e-05,
"loss": 1.0337756872177124,
"step": 850
},
{
"epoch": 2.3278688524590163,
"grad_norm": 0.13291127979755402,
"learning_rate": 1.262567957718378e-05,
"loss": 0.9729894399642944,
"step": 852
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.18364933133125305,
"learning_rate": 1.2454746499636408e-05,
"loss": 0.6949310302734375,
"step": 854
},
{
"epoch": 2.33879781420765,
"grad_norm": 0.11144661903381348,
"learning_rate": 1.2283272835897359e-05,
"loss": 0.7358170747756958,
"step": 856
},
{
"epoch": 2.3442622950819674,
"grad_norm": 0.09950226545333862,
"learning_rate": 1.2111322305862088e-05,
"loss": 1.0024327039718628,
"step": 858
},
{
"epoch": 2.349726775956284,
"grad_norm": 0.13857939839363098,
"learning_rate": 1.1938958806630322e-05,
"loss": 0.7280409336090088,
"step": 860
},
{
"epoch": 2.3551912568306013,
"grad_norm": 0.20425045490264893,
"learning_rate": 1.1766246388761841e-05,
"loss": 0.9389795660972595,
"step": 862
},
{
"epoch": 2.360655737704918,
"grad_norm": 0.1896969974040985,
"learning_rate": 1.1593249232475162e-05,
"loss": 1.030674695968628,
"step": 864
},
{
"epoch": 2.366120218579235,
"grad_norm": 0.1484568864107132,
"learning_rate": 1.142003162379808e-05,
"loss": 0.6855581402778625,
"step": 866
},
{
"epoch": 2.371584699453552,
"grad_norm": 0.2438308596611023,
"learning_rate": 1.1246657930678817e-05,
"loss": 1.3209505081176758,
"step": 868
},
{
"epoch": 2.3770491803278686,
"grad_norm": 0.19440999627113342,
"learning_rate": 1.1073192579066867e-05,
"loss": 0.9309288263320923,
"step": 870
},
{
"epoch": 2.3825136612021858,
"grad_norm": 0.29453930258750916,
"learning_rate": 1.0899700028972169e-05,
"loss": 0.5769140124320984,
"step": 872
},
{
"epoch": 2.387978142076503,
"grad_norm": 0.25008222460746765,
"learning_rate": 1.072624475051166e-05,
"loss": 1.0771747827529907,
"step": 874
},
{
"epoch": 2.3934426229508197,
"grad_norm": 0.09936109930276871,
"learning_rate": 1.055289119995206e-05,
"loss": 1.0050560235977173,
"step": 876
},
{
"epoch": 2.3989071038251364,
"grad_norm": 0.4316991865634918,
"learning_rate": 1.0379703795757853e-05,
"loss": 0.5357435941696167,
"step": 878
},
{
"epoch": 2.4043715846994536,
"grad_norm": 0.23820410668849945,
"learning_rate": 1.0206746894653252e-05,
"loss": 1.1548678874969482,
"step": 880
},
{
"epoch": 2.4098360655737707,
"grad_norm": 0.1519540250301361,
"learning_rate": 1.0034084767707164e-05,
"loss": 0.9099193811416626,
"step": 882
},
{
"epoch": 2.4153005464480874,
"grad_norm": 0.14186547696590424,
"learning_rate": 9.861781576449879e-06,
"loss": 0.9032131433486938,
"step": 884
},
{
"epoch": 2.420765027322404,
"grad_norm": 0.744184672832489,
"learning_rate": 9.689901349030646e-06,
"loss": 1.0177228450775146,
"step": 886
},
{
"epoch": 2.4262295081967213,
"grad_norm": 0.17870917916297913,
"learning_rate": 9.518507956424643e-06,
"loss": 1.0855988264083862,
"step": 888
},
{
"epoch": 2.431693989071038,
"grad_norm": 0.42778703570365906,
"learning_rate": 9.347665088698444e-06,
"loss": 0.6894425749778748,
"step": 890
},
{
"epoch": 2.4371584699453552,
"grad_norm": 0.24986915290355682,
"learning_rate": 9.177436231342623e-06,
"loss": 1.3268741369247437,
"step": 892
},
{
"epoch": 2.442622950819672,
"grad_norm": 0.40933093428611755,
"learning_rate": 9.00788464168054e-06,
"loss": 0.9357931017875671,
"step": 894
},
{
"epoch": 2.448087431693989,
"grad_norm": 0.3109062910079956,
"learning_rate": 8.839073325361751e-06,
"loss": 0.6064870953559875,
"step": 896
},
{
"epoch": 2.453551912568306,
"grad_norm": 0.12668392062187195,
"learning_rate": 8.67106501294902e-06,
"loss": 0.20398537814617157,
"step": 898
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.14268121123313904,
"learning_rate": 8.503922136607536e-06,
"loss": 1.065590739250183,
"step": 900
},
{
"epoch": 2.4644808743169397,
"grad_norm": 0.17221775650978088,
"learning_rate": 8.337706806905029e-06,
"loss": 0.495491087436676,
"step": 902
},
{
"epoch": 2.469945355191257,
"grad_norm": 0.15687991678714752,
"learning_rate": 8.172480789731374e-06,
"loss": 1.2099788188934326,
"step": 904
},
{
"epoch": 2.4754098360655736,
"grad_norm": 0.6032702326774597,
"learning_rate": 8.00830548334625e-06,
"loss": 0.4631669223308563,
"step": 906
},
{
"epoch": 2.480874316939891,
"grad_norm": 0.5850704312324524,
"learning_rate": 7.84524189556352e-06,
"loss": 0.6608245968818665,
"step": 908
},
{
"epoch": 2.4863387978142075,
"grad_norm": 0.12262270599603653,
"learning_rate": 7.68335062108057e-06,
"loss": 1.0410443544387817,
"step": 910
},
{
"epoch": 2.4918032786885247,
"grad_norm": 0.8080756068229675,
"learning_rate": 7.522691818961252e-06,
"loss": 0.5971605181694031,
"step": 912
},
{
"epoch": 2.4972677595628414,
"grad_norm": 0.20784160494804382,
"learning_rate": 7.3633251902806165e-06,
"loss": 1.0929663181304932,
"step": 914
},
{
"epoch": 2.5027322404371586,
"grad_norm": 0.1316247284412384,
"learning_rate": 7.205309955939983e-06,
"loss": 1.1377352476119995,
"step": 916
},
{
"epoch": 2.5081967213114753,
"grad_norm": 0.13229675590991974,
"learning_rate": 7.048704834660296e-06,
"loss": 1.0250879526138306,
"step": 918
},
{
"epoch": 2.5136612021857925,
"grad_norm": 0.17270365357398987,
"learning_rate": 6.8935680211621715e-06,
"loss": 0.9176226854324341,
"step": 920
},
{
"epoch": 2.519125683060109,
"grad_norm": 0.16042472422122955,
"learning_rate": 6.739957164540634e-06,
"loss": 1.0503426790237427,
"step": 922
},
{
"epoch": 2.5245901639344264,
"grad_norm": 0.3965965509414673,
"learning_rate": 6.587929346842625e-06,
"loss": 0.4507668912410736,
"step": 924
},
{
"epoch": 2.530054644808743,
"grad_norm": 0.14760476350784302,
"learning_rate": 6.437541061855222e-06,
"loss": 1.0362180471420288,
"step": 926
},
{
"epoch": 2.5355191256830603,
"grad_norm": 0.7348153591156006,
"learning_rate": 6.288848194112459e-06,
"loss": 1.0108616352081299,
"step": 928
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.23355959355831146,
"learning_rate": 6.141905998128495e-06,
"loss": 0.7464023232460022,
"step": 930
},
{
"epoch": 2.546448087431694,
"grad_norm": 0.7200878858566284,
"learning_rate": 5.996769077865029e-06,
"loss": 0.6252878308296204,
"step": 932
},
{
"epoch": 2.551912568306011,
"grad_norm": 0.23962250351905823,
"learning_rate": 5.853491366440313e-06,
"loss": 0.9192193150520325,
"step": 934
},
{
"epoch": 2.557377049180328,
"grad_norm": 0.23657557368278503,
"learning_rate": 5.712126106087557e-06,
"loss": 1.0096158981323242,
"step": 936
},
{
"epoch": 2.5628415300546448,
"grad_norm": 0.14654363691806793,
"learning_rate": 5.572725828369961e-06,
"loss": 1.076252818107605,
"step": 938
},
{
"epoch": 2.5683060109289615,
"grad_norm": 0.1286906749010086,
"learning_rate": 5.4353423346599944e-06,
"loss": 1.0361932516098022,
"step": 940
},
{
"epoch": 2.5737704918032787,
"grad_norm": 0.14579473435878754,
"learning_rate": 5.30002667688986e-06,
"loss": 1.0813875198364258,
"step": 942
},
{
"epoch": 2.579234972677596,
"grad_norm": 0.10800908505916595,
"learning_rate": 5.1668291385804995e-06,
"loss": 1.1076337099075317,
"step": 944
},
{
"epoch": 2.5846994535519126,
"grad_norm": 0.31755056977272034,
"learning_rate": 5.03579921615621e-06,
"loss": 1.0581474304199219,
"step": 946
},
{
"epoch": 2.5901639344262293,
"grad_norm": 0.14950014650821686,
"learning_rate": 4.906985600551651e-06,
"loss": 0.858165979385376,
"step": 948
},
{
"epoch": 2.5956284153005464,
"grad_norm": 0.5994306802749634,
"learning_rate": 4.780436159118221e-06,
"loss": 0.8401349186897278,
"step": 950
},
{
"epoch": 2.6010928961748636,
"grad_norm": 0.12480851262807846,
"learning_rate": 4.656197917836474e-06,
"loss": 1.0394039154052734,
"step": 952
},
{
"epoch": 2.6065573770491803,
"grad_norm": 0.19965514540672302,
"learning_rate": 4.5343170438411885e-06,
"loss": 0.9946895241737366,
"step": 954
},
{
"epoch": 2.612021857923497,
"grad_norm": 0.1912984848022461,
"learning_rate": 4.414838828265581e-06,
"loss": 0.5789214372634888,
"step": 956
},
{
"epoch": 2.6174863387978142,
"grad_norm": 0.12282451242208481,
"learning_rate": 4.297807669411057e-06,
"loss": 0.7763662338256836,
"step": 958
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.10073775053024292,
"learning_rate": 4.183267056248689e-06,
"loss": 0.9413790106773376,
"step": 960
},
{
"epoch": 2.628415300546448,
"grad_norm": 0.7957872748374939,
"learning_rate": 4.071259552258709e-06,
"loss": 1.1018624305725098,
"step": 962
},
{
"epoch": 2.633879781420765,
"grad_norm": 0.12228737026453018,
"learning_rate": 3.961826779613801e-06,
"loss": 0.9796239733695984,
"step": 964
},
{
"epoch": 2.639344262295082,
"grad_norm": 0.16614055633544922,
"learning_rate": 3.85500940371226e-06,
"loss": 0.9639811515808105,
"step": 966
},
{
"epoch": 2.644808743169399,
"grad_norm": 0.2138717919588089,
"learning_rate": 3.750847118066614e-06,
"loss": 1.065807819366455,
"step": 968
},
{
"epoch": 2.650273224043716,
"grad_norm": 0.18565969169139862,
"learning_rate": 3.6493786295535234e-06,
"loss": 1.111021876335144,
"step": 970
},
{
"epoch": 2.6557377049180326,
"grad_norm": 0.32286983728408813,
"learning_rate": 3.5506416440301885e-06,
"loss": 1.085739254951477,
"step": 972
},
{
"epoch": 2.66120218579235,
"grad_norm": 0.15602485835552216,
"learning_rate": 3.4546728523228067e-06,
"loss": 1.0275617837905884,
"step": 974
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.2214060127735138,
"learning_rate": 3.361507916592206e-06,
"loss": 1.0728002786636353,
"step": 976
},
{
"epoch": 2.6721311475409837,
"grad_norm": 0.19005049765110016,
"learning_rate": 3.271181457081715e-06,
"loss": 0.47213196754455566,
"step": 978
},
{
"epoch": 2.6775956284153004,
"grad_norm": 0.10323573648929596,
"learning_rate": 3.1837270392522456e-06,
"loss": 0.9167113304138184,
"step": 980
},
{
"epoch": 2.6830601092896176,
"grad_norm": 0.7527140378952026,
"learning_rate": 3.0991771613092686e-06,
"loss": 1.1480381488800049,
"step": 982
},
{
"epoch": 2.6885245901639343,
"grad_norm": 0.1350458264350891,
"learning_rate": 3.017563242126483e-06,
"loss": 1.1025961637496948,
"step": 984
},
{
"epoch": 2.6939890710382515,
"grad_norm": 0.2891963720321655,
"learning_rate": 2.9389156095704764e-06,
"loss": 1.15847909450531,
"step": 986
},
{
"epoch": 2.699453551912568,
"grad_norm": 0.11150713264942169,
"learning_rate": 2.8632634892308535e-06,
"loss": 1.0358167886734009,
"step": 988
},
{
"epoch": 2.7049180327868854,
"grad_norm": 0.18181155622005463,
"learning_rate": 2.7906349935599326e-06,
"loss": 0.9927688837051392,
"step": 990
},
{
"epoch": 2.710382513661202,
"grad_norm": 0.14437374472618103,
"learning_rate": 2.721057111426154e-06,
"loss": 0.9766374230384827,
"step": 992
},
{
"epoch": 2.7158469945355193,
"grad_norm": 0.32675376534461975,
"learning_rate": 2.6545556980849417e-06,
"loss": 1.0123059749603271,
"step": 994
},
{
"epoch": 2.721311475409836,
"grad_norm": 0.3038513958454132,
"learning_rate": 2.591155465570866e-06,
"loss": 1.0865612030029297,
"step": 996
},
{
"epoch": 2.726775956284153,
"grad_norm": 0.17611156404018402,
"learning_rate": 2.5308799735145813e-06,
"loss": 1.0625133514404297,
"step": 998
},
{
"epoch": 2.73224043715847,
"grad_norm": 0.12994976341724396,
"learning_rate": 2.473751620388069e-06,
"loss": 1.0845409631729126,
"step": 1000
},
{
"epoch": 2.737704918032787,
"grad_norm": 2.454756736755371,
"learning_rate": 2.419791635181301e-06,
"loss": 1.0169895887374878,
"step": 1002
},
{
"epoch": 2.7431693989071038,
"grad_norm": 0.14442019164562225,
"learning_rate": 2.369020069513521e-06,
"loss": 0.7047387361526489,
"step": 1004
},
{
"epoch": 2.748633879781421,
"grad_norm": 0.2404894083738327,
"learning_rate": 2.3214557901820258e-06,
"loss": 0.9643245339393616,
"step": 1006
},
{
"epoch": 2.7540983606557377,
"grad_norm": 0.18125297129154205,
"learning_rate": 2.27711647215124e-06,
"loss": 1.286293387413025,
"step": 1008
},
{
"epoch": 2.7595628415300544,
"grad_norm": 0.24531304836273193,
"learning_rate": 2.2360185919846593e-06,
"loss": 0.6167261004447937,
"step": 1010
},
{
"epoch": 2.7650273224043715,
"grad_norm": 0.12282934784889221,
"learning_rate": 2.1981774217221474e-06,
"loss": 1.0038611888885498,
"step": 1012
},
{
"epoch": 2.7704918032786887,
"grad_norm": 0.5574485659599304,
"learning_rate": 2.1636070232047966e-06,
"loss": 0.9050815105438232,
"step": 1014
},
{
"epoch": 2.7759562841530054,
"grad_norm": 0.16559119522571564,
"learning_rate": 2.1323202428495544e-06,
"loss": 0.986128568649292,
"step": 1016
},
{
"epoch": 2.781420765027322,
"grad_norm": 0.3175508677959442,
"learning_rate": 2.104328706875452e-06,
"loss": 0.5148718953132629,
"step": 1018
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.15311799943447113,
"learning_rate": 2.079642816983293e-06,
"loss": 1.0339350700378418,
"step": 1020
},
{
"epoch": 2.7923497267759565,
"grad_norm": 0.20977604389190674,
"learning_rate": 2.0582717464903546e-06,
"loss": 1.0123640298843384,
"step": 1022
},
{
"epoch": 2.797814207650273,
"grad_norm": 0.23887036740779877,
"learning_rate": 2.040223436921581e-06,
"loss": 0.9732429385185242,
"step": 1024
},
{
"epoch": 2.80327868852459,
"grad_norm": 0.17824672162532806,
"learning_rate": 2.025504595058489e-06,
"loss": 1.0330421924591064,
"step": 1026
},
{
"epoch": 2.808743169398907,
"grad_norm": 0.13717371225357056,
"learning_rate": 2.0141206904469206e-06,
"loss": 1.1064571142196655,
"step": 1028
},
{
"epoch": 2.8142076502732243,
"grad_norm": 0.1265968233346939,
"learning_rate": 2.006075953364551e-06,
"loss": 1.0096856355667114,
"step": 1030
},
{
"epoch": 2.819672131147541,
"grad_norm": 0.12728005647659302,
"learning_rate": 2.0013733732489103e-06,
"loss": 0.9550838470458984,
"step": 1032
},
{
"epoch": 2.8251366120218577,
"grad_norm": 0.17370416224002838,
"learning_rate": 2.000014697586502e-06,
"loss": 1.2471691370010376,
"step": 1034
},
{
"epoch": 2.830601092896175,
"grad_norm": 0.3669784367084503,
"learning_rate": 2.0020004312634374e-06,
"loss": 1.1650069952011108,
"step": 1036
},
{
"epoch": 2.836065573770492,
"grad_norm": 0.13446620106697083,
"learning_rate": 2.0073298363778166e-06,
"loss": 0.3134404122829437,
"step": 1038
},
{
"epoch": 2.841530054644809,
"grad_norm": 0.6104834675788879,
"learning_rate": 2.016000932513934e-06,
"loss": 1.1606090068817139,
"step": 1040
},
{
"epoch": 2.8469945355191255,
"grad_norm": 0.11602997034788132,
"learning_rate": 2.0280104974782058e-06,
"loss": 0.9772266149520874,
"step": 1042
},
{
"epoch": 2.8524590163934427,
"grad_norm": 0.2954210638999939,
"learning_rate": 2.043354068496541e-06,
"loss": 1.0794169902801514,
"step": 1044
},
{
"epoch": 2.8579234972677594,
"grad_norm": 0.1831609308719635,
"learning_rate": 2.0620259438727168e-06,
"loss": 1.1897934675216675,
"step": 1046
},
{
"epoch": 2.8633879781420766,
"grad_norm": 0.14778214693069458,
"learning_rate": 2.084019185107135e-06,
"loss": 1.0881187915802002,
"step": 1048
},
{
"epoch": 2.8688524590163933,
"grad_norm": 0.13310196995735168,
"learning_rate": 2.1093256194751822e-06,
"loss": 1.029591679573059,
"step": 1050
},
{
"epoch": 2.8743169398907105,
"grad_norm": 0.08348928391933441,
"learning_rate": 2.137935843064233e-06,
"loss": 0.6022316813468933,
"step": 1052
},
{
"epoch": 2.879781420765027,
"grad_norm": 0.11385923624038696,
"learning_rate": 2.1698392242681502e-06,
"loss": 1.1325082778930664,
"step": 1054
},
{
"epoch": 2.8852459016393444,
"grad_norm": 0.1381348967552185,
"learning_rate": 2.2050239077380097e-06,
"loss": 1.0211372375488281,
"step": 1056
},
{
"epoch": 2.890710382513661,
"grad_norm": 0.17021217942237854,
"learning_rate": 2.2434768187875723e-06,
"loss": 0.8306444883346558,
"step": 1058
},
{
"epoch": 2.8961748633879782,
"grad_norm": 0.16226349771022797,
"learning_rate": 2.285183668251853e-06,
"loss": 1.0502756834030151,
"step": 1060
},
{
"epoch": 2.901639344262295,
"grad_norm": 0.16681478917598724,
"learning_rate": 2.3301289577970028e-06,
"loss": 0.8215010762214661,
"step": 1062
},
{
"epoch": 2.907103825136612,
"grad_norm": 0.4038049876689911,
"learning_rate": 2.3782959856795113e-06,
"loss": 0.5339687466621399,
"step": 1064
},
{
"epoch": 2.912568306010929,
"grad_norm": 0.19572515785694122,
"learning_rate": 2.4296668529525998e-06,
"loss": 0.8967607617378235,
"step": 1066
},
{
"epoch": 2.918032786885246,
"grad_norm": 0.1520427167415619,
"learning_rate": 2.4842224701175147e-06,
"loss": 0.7001104950904846,
"step": 1068
},
{
"epoch": 2.9234972677595628,
"grad_norm": 0.12228185683488846,
"learning_rate": 2.541942564217196e-06,
"loss": 1.1667863130569458,
"step": 1070
},
{
"epoch": 2.92896174863388,
"grad_norm": 0.1114974319934845,
"learning_rate": 2.6028056863697506e-06,
"loss": 1.008376955986023,
"step": 1072
},
{
"epoch": 2.9344262295081966,
"grad_norm": 0.163143128156662,
"learning_rate": 2.6667892197388884e-06,
"loss": 1.4090521335601807,
"step": 1074
},
{
"epoch": 2.939890710382514,
"grad_norm": 0.11509400606155396,
"learning_rate": 2.7338693879383967e-06,
"loss": 0.6322512030601501,
"step": 1076
},
{
"epoch": 2.9453551912568305,
"grad_norm": 0.10466722398996353,
"learning_rate": 2.8040212638674506e-06,
"loss": 1.0424083471298218,
"step": 1078
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.42490720748901367,
"learning_rate": 2.877218778973578e-06,
"loss": 0.6144481301307678,
"step": 1080
},
{
"epoch": 2.9562841530054644,
"grad_norm": 0.09882606565952301,
"learning_rate": 2.9534347329398027e-06,
"loss": 0.17148929834365845,
"step": 1082
},
{
"epoch": 2.9617486338797816,
"grad_norm": 0.22523100674152374,
"learning_rate": 3.0326408037922827e-06,
"loss": 1.109324336051941,
"step": 1084
},
{
"epoch": 2.9672131147540983,
"grad_norm": 0.09981999546289444,
"learning_rate": 3.1148075584248306e-06,
"loss": 0.7445070147514343,
"step": 1086
},
{
"epoch": 2.972677595628415,
"grad_norm": 1.5596518516540527,
"learning_rate": 3.199904463536296e-06,
"loss": 0.970592737197876,
"step": 1088
},
{
"epoch": 2.978142076502732,
"grad_norm": 0.1070963516831398,
"learning_rate": 3.2878998969767954e-06,
"loss": 1.2897882461547852,
"step": 1090
},
{
"epoch": 2.9836065573770494,
"grad_norm": 0.3974516987800598,
"learning_rate": 3.378761159498547e-06,
"loss": 1.1402771472930908,
"step": 1092
},
{
"epoch": 2.989071038251366,
"grad_norm": 0.4319957196712494,
"learning_rate": 3.472454486906972e-06,
"loss": 0.5495699644088745,
"step": 1094
},
{
"epoch": 2.994535519125683,
"grad_norm": 0.1675819605588913,
"learning_rate": 3.5689450626075132e-06,
"loss": 1.0523113012313843,
"step": 1096
},
{
"epoch": 3.0,
"grad_norm": 0.25202980637550354,
"learning_rate": 3.668197030543573e-06,
"loss": 0.6573507785797119,
"step": 1098
},
{
"epoch": 3.0,
"step": 1098,
"total_flos": 4.845424164514824e+18,
"train_loss": 1.0781439861150388,
"train_runtime": 38170.6856,
"train_samples_per_second": 1.726,
"train_steps_per_second": 0.029
}
],
"logging_steps": 2,
"max_steps": 1098,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.845424164514824e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}