{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 10,
"global_step": 7808,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00012807377049180329,
"grad_norm": 13.030157089233398,
"learning_rate": 0.0,
"loss": 0.6785,
"step": 1
},
{
"epoch": 0.0012807377049180327,
"grad_norm": 22.160924911499023,
"learning_rate": 3.837953091684436e-07,
"loss": 0.7023,
"step": 10
},
{
"epoch": 0.0025614754098360654,
"grad_norm": 28.483007431030273,
"learning_rate": 8.102345415778253e-07,
"loss": 0.5318,
"step": 20
},
{
"epoch": 0.0038422131147540983,
"grad_norm": 14.279166221618652,
"learning_rate": 1.236673773987207e-06,
"loss": 0.7028,
"step": 30
},
{
"epoch": 0.005122950819672131,
"grad_norm": 19.40032196044922,
"learning_rate": 1.6631130063965886e-06,
"loss": 0.7097,
"step": 40
},
{
"epoch": 0.006403688524590164,
"grad_norm": 17.553495407104492,
"learning_rate": 2.08955223880597e-06,
"loss": 0.7789,
"step": 50
},
{
"epoch": 0.007684426229508197,
"grad_norm": 5.795664310455322,
"learning_rate": 2.515991471215352e-06,
"loss": 0.5146,
"step": 60
},
{
"epoch": 0.008965163934426229,
"grad_norm": 15.086921691894531,
"learning_rate": 2.9424307036247335e-06,
"loss": 0.703,
"step": 70
},
{
"epoch": 0.010245901639344262,
"grad_norm": 11.061614990234375,
"learning_rate": 3.3688699360341154e-06,
"loss": 0.5794,
"step": 80
},
{
"epoch": 0.011526639344262296,
"grad_norm": 11.43583869934082,
"learning_rate": 3.7953091684434973e-06,
"loss": 0.6144,
"step": 90
},
{
"epoch": 0.012807377049180328,
"grad_norm": 5.861094951629639,
"learning_rate": 4.221748400852878e-06,
"loss": 0.5769,
"step": 100
},
{
"epoch": 0.01408811475409836,
"grad_norm": 28.796695709228516,
"learning_rate": 4.64818763326226e-06,
"loss": 0.5953,
"step": 110
},
{
"epoch": 0.015368852459016393,
"grad_norm": 17.27574348449707,
"learning_rate": 5.074626865671642e-06,
"loss": 0.4116,
"step": 120
},
{
"epoch": 0.016649590163934427,
"grad_norm": 20.032840728759766,
"learning_rate": 5.501066098081024e-06,
"loss": 0.7965,
"step": 130
},
{
"epoch": 0.017930327868852458,
"grad_norm": 35.11494827270508,
"learning_rate": 5.927505330490405e-06,
"loss": 0.8488,
"step": 140
},
{
"epoch": 0.019211065573770492,
"grad_norm": 17.658639907836914,
"learning_rate": 6.353944562899788e-06,
"loss": 0.446,
"step": 150
},
{
"epoch": 0.020491803278688523,
"grad_norm": 10.555081367492676,
"learning_rate": 6.780383795309169e-06,
"loss": 0.4964,
"step": 160
},
{
"epoch": 0.021772540983606557,
"grad_norm": 30.8939266204834,
"learning_rate": 7.20682302771855e-06,
"loss": 0.5762,
"step": 170
},
{
"epoch": 0.02305327868852459,
"grad_norm": 14.771651268005371,
"learning_rate": 7.633262260127933e-06,
"loss": 0.5545,
"step": 180
},
{
"epoch": 0.024334016393442622,
"grad_norm": 0.9534880518913269,
"learning_rate": 8.059701492537314e-06,
"loss": 0.3119,
"step": 190
},
{
"epoch": 0.025614754098360656,
"grad_norm": 13.96252727508545,
"learning_rate": 8.486140724946695e-06,
"loss": 0.6571,
"step": 200
},
{
"epoch": 0.026895491803278687,
"grad_norm": 6.706875801086426,
"learning_rate": 8.912579957356077e-06,
"loss": 0.8117,
"step": 210
},
{
"epoch": 0.02817622950819672,
"grad_norm": 57.71232604980469,
"learning_rate": 9.339019189765458e-06,
"loss": 0.4906,
"step": 220
},
{
"epoch": 0.029456967213114756,
"grad_norm": 15.123934745788574,
"learning_rate": 9.765458422174841e-06,
"loss": 0.4204,
"step": 230
},
{
"epoch": 0.030737704918032786,
"grad_norm": 3.7344789505004883,
"learning_rate": 1.0191897654584222e-05,
"loss": 0.6303,
"step": 240
},
{
"epoch": 0.03201844262295082,
"grad_norm": 0.1564660370349884,
"learning_rate": 1.0618336886993603e-05,
"loss": 0.3297,
"step": 250
},
{
"epoch": 0.033299180327868855,
"grad_norm": 22.37810516357422,
"learning_rate": 1.1044776119402986e-05,
"loss": 0.3784,
"step": 260
},
{
"epoch": 0.034579918032786885,
"grad_norm": 4.665563106536865,
"learning_rate": 1.1471215351812369e-05,
"loss": 0.7696,
"step": 270
},
{
"epoch": 0.035860655737704916,
"grad_norm": 9.491741180419922,
"learning_rate": 1.189765458422175e-05,
"loss": 0.5464,
"step": 280
},
{
"epoch": 0.037141393442622954,
"grad_norm": 18.859682083129883,
"learning_rate": 1.2324093816631131e-05,
"loss": 0.5767,
"step": 290
},
{
"epoch": 0.038422131147540985,
"grad_norm": 7.065849304199219,
"learning_rate": 1.2750533049040512e-05,
"loss": 0.5723,
"step": 300
},
{
"epoch": 0.039702868852459015,
"grad_norm": 43.178043365478516,
"learning_rate": 1.3176972281449893e-05,
"loss": 0.6343,
"step": 310
},
{
"epoch": 0.040983606557377046,
"grad_norm": 9.827512741088867,
"learning_rate": 1.3603411513859277e-05,
"loss": 0.5718,
"step": 320
},
{
"epoch": 0.042264344262295084,
"grad_norm": 2.420236349105835,
"learning_rate": 1.4029850746268658e-05,
"loss": 0.5491,
"step": 330
},
{
"epoch": 0.043545081967213115,
"grad_norm": 8.602315902709961,
"learning_rate": 1.445628997867804e-05,
"loss": 0.5566,
"step": 340
},
{
"epoch": 0.044825819672131145,
"grad_norm": 19.52743148803711,
"learning_rate": 1.488272921108742e-05,
"loss": 0.4385,
"step": 350
},
{
"epoch": 0.04610655737704918,
"grad_norm": 59.86263656616211,
"learning_rate": 1.5309168443496803e-05,
"loss": 0.6635,
"step": 360
},
{
"epoch": 0.047387295081967214,
"grad_norm": 35.44069290161133,
"learning_rate": 1.5735607675906184e-05,
"loss": 0.7269,
"step": 370
},
{
"epoch": 0.048668032786885244,
"grad_norm": 20.887710571289062,
"learning_rate": 1.616204690831557e-05,
"loss": 0.5448,
"step": 380
},
{
"epoch": 0.04994877049180328,
"grad_norm": 22.93721580505371,
"learning_rate": 1.658848614072495e-05,
"loss": 0.7712,
"step": 390
},
{
"epoch": 0.05122950819672131,
"grad_norm": 11.434581756591797,
"learning_rate": 1.701492537313433e-05,
"loss": 0.4954,
"step": 400
},
{
"epoch": 0.052510245901639344,
"grad_norm": 3.9810707569122314,
"learning_rate": 1.7441364605543712e-05,
"loss": 0.4475,
"step": 410
},
{
"epoch": 0.053790983606557374,
"grad_norm": 8.25676155090332,
"learning_rate": 1.7867803837953093e-05,
"loss": 0.5226,
"step": 420
},
{
"epoch": 0.05507172131147541,
"grad_norm": 40.57249069213867,
"learning_rate": 1.8294243070362474e-05,
"loss": 0.5507,
"step": 430
},
{
"epoch": 0.05635245901639344,
"grad_norm": 0.5660319924354553,
"learning_rate": 1.872068230277186e-05,
"loss": 0.4327,
"step": 440
},
{
"epoch": 0.057633196721311473,
"grad_norm": 37.062320709228516,
"learning_rate": 1.914712153518124e-05,
"loss": 0.3592,
"step": 450
},
{
"epoch": 0.05891393442622951,
"grad_norm": 22.973651885986328,
"learning_rate": 1.957356076759062e-05,
"loss": 0.4596,
"step": 460
},
{
"epoch": 0.06019467213114754,
"grad_norm": 24.05460548400879,
"learning_rate": 2e-05,
"loss": 0.5558,
"step": 470
},
{
"epoch": 0.06147540983606557,
"grad_norm": 0.19256171584129333,
"learning_rate": 1.9972748330835264e-05,
"loss": 0.3182,
"step": 480
},
{
"epoch": 0.0627561475409836,
"grad_norm": 3.1222236156463623,
"learning_rate": 1.994549666167053e-05,
"loss": 0.6922,
"step": 490
},
{
"epoch": 0.06403688524590163,
"grad_norm": 35.97098922729492,
"learning_rate": 1.9918244992505793e-05,
"loss": 0.9556,
"step": 500
},
{
"epoch": 0.06531762295081968,
"grad_norm": 0.13992930948734283,
"learning_rate": 1.9890993323341056e-05,
"loss": 0.6617,
"step": 510
},
{
"epoch": 0.06659836065573771,
"grad_norm": 5.176881313323975,
"learning_rate": 1.986374165417632e-05,
"loss": 0.8168,
"step": 520
},
{
"epoch": 0.06787909836065574,
"grad_norm": 43.677433013916016,
"learning_rate": 1.9836489985011584e-05,
"loss": 0.7252,
"step": 530
},
{
"epoch": 0.06915983606557377,
"grad_norm": 15.75368881225586,
"learning_rate": 1.9809238315846847e-05,
"loss": 0.6547,
"step": 540
},
{
"epoch": 0.0704405737704918,
"grad_norm": 14.22448444366455,
"learning_rate": 1.9781986646682113e-05,
"loss": 0.4269,
"step": 550
},
{
"epoch": 0.07172131147540983,
"grad_norm": 20.48627471923828,
"learning_rate": 1.9754734977517372e-05,
"loss": 1.0193,
"step": 560
},
{
"epoch": 0.07300204918032786,
"grad_norm": 51.78612518310547,
"learning_rate": 1.9727483308352638e-05,
"loss": 0.5702,
"step": 570
},
{
"epoch": 0.07428278688524591,
"grad_norm": 0.18359607458114624,
"learning_rate": 1.97002316391879e-05,
"loss": 0.3638,
"step": 580
},
{
"epoch": 0.07556352459016394,
"grad_norm": 74.03116607666016,
"learning_rate": 1.9672979970023163e-05,
"loss": 0.7977,
"step": 590
},
{
"epoch": 0.07684426229508197,
"grad_norm": 12.116443634033203,
"learning_rate": 1.964572830085843e-05,
"loss": 0.8353,
"step": 600
},
{
"epoch": 0.078125,
"grad_norm": 36.37770080566406,
"learning_rate": 1.9618476631693692e-05,
"loss": 0.534,
"step": 610
},
{
"epoch": 0.07940573770491803,
"grad_norm": 1.0840022563934326,
"learning_rate": 1.9591224962528958e-05,
"loss": 0.6757,
"step": 620
},
{
"epoch": 0.08068647540983606,
"grad_norm": 18.524744033813477,
"learning_rate": 1.956397329336422e-05,
"loss": 0.6852,
"step": 630
},
{
"epoch": 0.08196721311475409,
"grad_norm": 0.5453315377235413,
"learning_rate": 1.9536721624199483e-05,
"loss": 0.3746,
"step": 640
},
{
"epoch": 0.08324795081967214,
"grad_norm": 3.647247076034546,
"learning_rate": 1.950946995503475e-05,
"loss": 0.3404,
"step": 650
},
{
"epoch": 0.08452868852459017,
"grad_norm": 2.8789007663726807,
"learning_rate": 1.9482218285870012e-05,
"loss": 0.558,
"step": 660
},
{
"epoch": 0.0858094262295082,
"grad_norm": 59.30935287475586,
"learning_rate": 1.9454966616705274e-05,
"loss": 0.2678,
"step": 670
},
{
"epoch": 0.08709016393442623,
"grad_norm": 36.327701568603516,
"learning_rate": 1.942771494754054e-05,
"loss": 0.8112,
"step": 680
},
{
"epoch": 0.08837090163934426,
"grad_norm": 30.401525497436523,
"learning_rate": 1.9400463278375803e-05,
"loss": 0.8637,
"step": 690
},
{
"epoch": 0.08965163934426229,
"grad_norm": 65.09701538085938,
"learning_rate": 1.9373211609211066e-05,
"loss": 0.4575,
"step": 700
},
{
"epoch": 0.09093237704918032,
"grad_norm": 0.17984363436698914,
"learning_rate": 1.9345959940046332e-05,
"loss": 0.6336,
"step": 710
},
{
"epoch": 0.09221311475409837,
"grad_norm": 8.531198501586914,
"learning_rate": 1.931870827088159e-05,
"loss": 0.6553,
"step": 720
},
{
"epoch": 0.0934938524590164,
"grad_norm": 1.3908320665359497,
"learning_rate": 1.9291456601716857e-05,
"loss": 0.5388,
"step": 730
},
{
"epoch": 0.09477459016393443,
"grad_norm": 27.024486541748047,
"learning_rate": 1.926420493255212e-05,
"loss": 0.5134,
"step": 740
},
{
"epoch": 0.09605532786885246,
"grad_norm": 1.363821268081665,
"learning_rate": 1.9236953263387382e-05,
"loss": 0.4295,
"step": 750
},
{
"epoch": 0.09733606557377049,
"grad_norm": 18.301353454589844,
"learning_rate": 1.9209701594222648e-05,
"loss": 0.8292,
"step": 760
},
{
"epoch": 0.09861680327868852,
"grad_norm": 7.517091751098633,
"learning_rate": 1.918244992505791e-05,
"loss": 0.5639,
"step": 770
},
{
"epoch": 0.09989754098360656,
"grad_norm": 0.8409481048583984,
"learning_rate": 1.9155198255893174e-05,
"loss": 0.4416,
"step": 780
},
{
"epoch": 0.1011782786885246,
"grad_norm": 10.660968780517578,
"learning_rate": 1.912794658672844e-05,
"loss": 0.6898,
"step": 790
},
{
"epoch": 0.10245901639344263,
"grad_norm": 13.175348281860352,
"learning_rate": 1.9100694917563702e-05,
"loss": 0.4468,
"step": 800
},
{
"epoch": 0.10373975409836066,
"grad_norm": 11.351682662963867,
"learning_rate": 1.9073443248398965e-05,
"loss": 1.0168,
"step": 810
},
{
"epoch": 0.10502049180327869,
"grad_norm": 2.7584354877471924,
"learning_rate": 1.904619157923423e-05,
"loss": 0.4303,
"step": 820
},
{
"epoch": 0.10630122950819672,
"grad_norm": 34.519954681396484,
"learning_rate": 1.9018939910069493e-05,
"loss": 0.1582,
"step": 830
},
{
"epoch": 0.10758196721311475,
"grad_norm": 1.5243237018585205,
"learning_rate": 1.8991688240904756e-05,
"loss": 0.5226,
"step": 840
},
{
"epoch": 0.1088627049180328,
"grad_norm": 12.513932228088379,
"learning_rate": 1.8964436571740022e-05,
"loss": 0.6845,
"step": 850
},
{
"epoch": 0.11014344262295082,
"grad_norm": 32.45783996582031,
"learning_rate": 1.8937184902575285e-05,
"loss": 1.0624,
"step": 860
},
{
"epoch": 0.11142418032786885,
"grad_norm": 0.5410599112510681,
"learning_rate": 1.8909933233410547e-05,
"loss": 0.7797,
"step": 870
},
{
"epoch": 0.11270491803278689,
"grad_norm": 40.27082443237305,
"learning_rate": 1.888268156424581e-05,
"loss": 0.6134,
"step": 880
},
{
"epoch": 0.11398565573770492,
"grad_norm": 14.060335159301758,
"learning_rate": 1.8855429895081076e-05,
"loss": 0.4326,
"step": 890
},
{
"epoch": 0.11526639344262295,
"grad_norm": 13.475322723388672,
"learning_rate": 1.882817822591634e-05,
"loss": 0.7133,
"step": 900
},
{
"epoch": 0.11654713114754098,
"grad_norm": 3.2171595096588135,
"learning_rate": 1.88009265567516e-05,
"loss": 0.6357,
"step": 910
},
{
"epoch": 0.11782786885245902,
"grad_norm": 34.33395767211914,
"learning_rate": 1.8773674887586867e-05,
"loss": 0.6751,
"step": 920
},
{
"epoch": 0.11910860655737705,
"grad_norm": 0.17749445140361786,
"learning_rate": 1.874642321842213e-05,
"loss": 0.3697,
"step": 930
},
{
"epoch": 0.12038934426229508,
"grad_norm": 49.89470291137695,
"learning_rate": 1.8719171549257392e-05,
"loss": 0.5144,
"step": 940
},
{
"epoch": 0.12167008196721311,
"grad_norm": 15.842961311340332,
"learning_rate": 1.869191988009266e-05,
"loss": 0.7409,
"step": 950
},
{
"epoch": 0.12295081967213115,
"grad_norm": 5.076769828796387,
"learning_rate": 1.866466821092792e-05,
"loss": 0.2575,
"step": 960
},
{
"epoch": 0.12423155737704918,
"grad_norm": 6.906425476074219,
"learning_rate": 1.8637416541763184e-05,
"loss": 0.4676,
"step": 970
},
{
"epoch": 0.1255122950819672,
"grad_norm": 0.2420882135629654,
"learning_rate": 1.861016487259845e-05,
"loss": 0.5278,
"step": 980
},
{
"epoch": 0.12679303278688525,
"grad_norm": 42.10707473754883,
"learning_rate": 1.8582913203433712e-05,
"loss": 0.2199,
"step": 990
},
{
"epoch": 0.12807377049180327,
"grad_norm": 67.881103515625,
"learning_rate": 1.8555661534268975e-05,
"loss": 0.7105,
"step": 1000
},
{
"epoch": 0.1293545081967213,
"grad_norm": 0.4502294361591339,
"learning_rate": 1.852840986510424e-05,
"loss": 0.7214,
"step": 1010
},
{
"epoch": 0.13063524590163936,
"grad_norm": 0.19563625752925873,
"learning_rate": 1.8501158195939504e-05,
"loss": 0.344,
"step": 1020
},
{
"epoch": 0.13191598360655737,
"grad_norm": 70.44747161865234,
"learning_rate": 1.8473906526774766e-05,
"loss": 0.5475,
"step": 1030
},
{
"epoch": 0.13319672131147542,
"grad_norm": 68.5734634399414,
"learning_rate": 1.844665485761003e-05,
"loss": 0.866,
"step": 1040
},
{
"epoch": 0.13447745901639344,
"grad_norm": 5.665011405944824,
"learning_rate": 1.841940318844529e-05,
"loss": 0.4777,
"step": 1050
},
{
"epoch": 0.13575819672131148,
"grad_norm": 34.88306427001953,
"learning_rate": 1.8392151519280557e-05,
"loss": 0.6516,
"step": 1060
},
{
"epoch": 0.1370389344262295,
"grad_norm": 5.857304096221924,
"learning_rate": 1.836489985011582e-05,
"loss": 0.3298,
"step": 1070
},
{
"epoch": 0.13831967213114754,
"grad_norm": 0.40846720337867737,
"learning_rate": 1.8337648180951083e-05,
"loss": 0.6535,
"step": 1080
},
{
"epoch": 0.1396004098360656,
"grad_norm": 26.644474029541016,
"learning_rate": 1.831039651178635e-05,
"loss": 0.5543,
"step": 1090
},
{
"epoch": 0.1408811475409836,
"grad_norm": 1.7488807439804077,
"learning_rate": 1.828314484262161e-05,
"loss": 0.6122,
"step": 1100
},
{
"epoch": 0.14216188524590165,
"grad_norm": 63.28523254394531,
"learning_rate": 1.8255893173456874e-05,
"loss": 0.805,
"step": 1110
},
{
"epoch": 0.14344262295081966,
"grad_norm": 56.30666732788086,
"learning_rate": 1.822864150429214e-05,
"loss": 0.8171,
"step": 1120
},
{
"epoch": 0.1447233606557377,
"grad_norm": 52.1702880859375,
"learning_rate": 1.8201389835127403e-05,
"loss": 0.5012,
"step": 1130
},
{
"epoch": 0.14600409836065573,
"grad_norm": 6.9870452880859375,
"learning_rate": 1.817413816596267e-05,
"loss": 0.3778,
"step": 1140
},
{
"epoch": 0.14728483606557377,
"grad_norm": 48.00603103637695,
"learning_rate": 1.814688649679793e-05,
"loss": 0.4939,
"step": 1150
},
{
"epoch": 0.14856557377049182,
"grad_norm": 0.6154949069023132,
"learning_rate": 1.8119634827633194e-05,
"loss": 0.3668,
"step": 1160
},
{
"epoch": 0.14984631147540983,
"grad_norm": 1.350846529006958,
"learning_rate": 1.809238315846846e-05,
"loss": 0.8219,
"step": 1170
},
{
"epoch": 0.15112704918032788,
"grad_norm": 17.47528648376465,
"learning_rate": 1.8065131489303723e-05,
"loss": 0.643,
"step": 1180
},
{
"epoch": 0.1524077868852459,
"grad_norm": 2.0453598499298096,
"learning_rate": 1.8037879820138985e-05,
"loss": 0.6053,
"step": 1190
},
{
"epoch": 0.15368852459016394,
"grad_norm": 28.069385528564453,
"learning_rate": 1.8010628150974248e-05,
"loss": 0.7856,
"step": 1200
},
{
"epoch": 0.15496926229508196,
"grad_norm": 4.573185920715332,
"learning_rate": 1.798337648180951e-05,
"loss": 0.4821,
"step": 1210
},
{
"epoch": 0.15625,
"grad_norm": 49.39838790893555,
"learning_rate": 1.7956124812644776e-05,
"loss": 0.596,
"step": 1220
},
{
"epoch": 0.15753073770491804,
"grad_norm": 5.040111064910889,
"learning_rate": 1.792887314348004e-05,
"loss": 0.5817,
"step": 1230
},
{
"epoch": 0.15881147540983606,
"grad_norm": 75.80863189697266,
"learning_rate": 1.79016214743153e-05,
"loss": 1.0482,
"step": 1240
},
{
"epoch": 0.1600922131147541,
"grad_norm": 8.544283866882324,
"learning_rate": 1.7874369805150568e-05,
"loss": 0.616,
"step": 1250
},
{
"epoch": 0.16137295081967212,
"grad_norm": 11.687309265136719,
"learning_rate": 1.784711813598583e-05,
"loss": 0.4421,
"step": 1260
},
{
"epoch": 0.16265368852459017,
"grad_norm": 11.043490409851074,
"learning_rate": 1.7819866466821093e-05,
"loss": 0.2868,
"step": 1270
},
{
"epoch": 0.16393442622950818,
"grad_norm": 3.897243022918701,
"learning_rate": 1.779261479765636e-05,
"loss": 0.4681,
"step": 1280
},
{
"epoch": 0.16521516393442623,
"grad_norm": 0.32223525643348694,
"learning_rate": 1.776536312849162e-05,
"loss": 0.3196,
"step": 1290
},
{
"epoch": 0.16649590163934427,
"grad_norm": 0.1265946328639984,
"learning_rate": 1.7738111459326884e-05,
"loss": 0.638,
"step": 1300
},
{
"epoch": 0.1677766393442623,
"grad_norm": 40.56721115112305,
"learning_rate": 1.771085979016215e-05,
"loss": 0.5043,
"step": 1310
},
{
"epoch": 0.16905737704918034,
"grad_norm": 6.785134315490723,
"learning_rate": 1.7683608120997413e-05,
"loss": 0.4148,
"step": 1320
},
{
"epoch": 0.17033811475409835,
"grad_norm": 33.5522346496582,
"learning_rate": 1.7656356451832675e-05,
"loss": 1.1591,
"step": 1330
},
{
"epoch": 0.1716188524590164,
"grad_norm": 7.858984470367432,
"learning_rate": 1.762910478266794e-05,
"loss": 0.9522,
"step": 1340
},
{
"epoch": 0.1728995901639344,
"grad_norm": 32.17461013793945,
"learning_rate": 1.7601853113503204e-05,
"loss": 0.5926,
"step": 1350
},
{
"epoch": 0.17418032786885246,
"grad_norm": 11.334968566894531,
"learning_rate": 1.7574601444338467e-05,
"loss": 0.7914,
"step": 1360
},
{
"epoch": 0.1754610655737705,
"grad_norm": 13.335744857788086,
"learning_rate": 1.754734977517373e-05,
"loss": 0.8537,
"step": 1370
},
{
"epoch": 0.17674180327868852,
"grad_norm": 19.00205421447754,
"learning_rate": 1.7520098106008992e-05,
"loss": 0.4838,
"step": 1380
},
{
"epoch": 0.17802254098360656,
"grad_norm": 8.699183464050293,
"learning_rate": 1.7492846436844258e-05,
"loss": 0.666,
"step": 1390
},
{
"epoch": 0.17930327868852458,
"grad_norm": 1.6320335865020752,
"learning_rate": 1.746559476767952e-05,
"loss": 0.5107,
"step": 1400
},
{
"epoch": 0.18058401639344263,
"grad_norm": 1.2799221277236938,
"learning_rate": 1.7438343098514787e-05,
"loss": 0.4847,
"step": 1410
},
{
"epoch": 0.18186475409836064,
"grad_norm": 2.808711528778076,
"learning_rate": 1.741109142935005e-05,
"loss": 0.4733,
"step": 1420
},
{
"epoch": 0.1831454918032787,
"grad_norm": 18.037717819213867,
"learning_rate": 1.7383839760185312e-05,
"loss": 0.6985,
"step": 1430
},
{
"epoch": 0.18442622950819673,
"grad_norm": 2.3388659954071045,
"learning_rate": 1.7356588091020578e-05,
"loss": 0.2029,
"step": 1440
},
{
"epoch": 0.18570696721311475,
"grad_norm": 15.241260528564453,
"learning_rate": 1.732933642185584e-05,
"loss": 0.2651,
"step": 1450
},
{
"epoch": 0.1869877049180328,
"grad_norm": 27.643362045288086,
"learning_rate": 1.7302084752691103e-05,
"loss": 0.6641,
"step": 1460
},
{
"epoch": 0.1882684426229508,
"grad_norm": 51.026947021484375,
"learning_rate": 1.727483308352637e-05,
"loss": 0.5786,
"step": 1470
},
{
"epoch": 0.18954918032786885,
"grad_norm": 62.00007247924805,
"learning_rate": 1.7247581414361632e-05,
"loss": 0.3976,
"step": 1480
},
{
"epoch": 0.19082991803278687,
"grad_norm": 80.54548645019531,
"learning_rate": 1.7220329745196894e-05,
"loss": 0.4812,
"step": 1490
},
{
"epoch": 0.19211065573770492,
"grad_norm": 108.28478240966797,
"learning_rate": 1.719307807603216e-05,
"loss": 0.3106,
"step": 1500
},
{
"epoch": 0.19339139344262296,
"grad_norm": 31.335493087768555,
"learning_rate": 1.7165826406867423e-05,
"loss": 0.4389,
"step": 1510
},
{
"epoch": 0.19467213114754098,
"grad_norm": 2.4842689037323,
"learning_rate": 1.7138574737702686e-05,
"loss": 0.4561,
"step": 1520
},
{
"epoch": 0.19595286885245902,
"grad_norm": 34.57732391357422,
"learning_rate": 1.7111323068537948e-05,
"loss": 0.6538,
"step": 1530
},
{
"epoch": 0.19723360655737704,
"grad_norm": 89.62613677978516,
"learning_rate": 1.708407139937321e-05,
"loss": 0.3808,
"step": 1540
},
{
"epoch": 0.19851434426229508,
"grad_norm": 0.6716292500495911,
"learning_rate": 1.7056819730208477e-05,
"loss": 0.5189,
"step": 1550
},
{
"epoch": 0.19979508196721313,
"grad_norm": 32.78571319580078,
"learning_rate": 1.702956806104374e-05,
"loss": 1.2536,
"step": 1560
},
{
"epoch": 0.20107581967213115,
"grad_norm": 5.39422607421875,
"learning_rate": 1.7002316391879002e-05,
"loss": 0.4674,
"step": 1570
},
{
"epoch": 0.2023565573770492,
"grad_norm": 0.295356422662735,
"learning_rate": 1.6975064722714268e-05,
"loss": 0.9923,
"step": 1580
},
{
"epoch": 0.2036372950819672,
"grad_norm": 2.7056820392608643,
"learning_rate": 1.694781305354953e-05,
"loss": 0.1946,
"step": 1590
},
{
"epoch": 0.20491803278688525,
"grad_norm": 2.453801393508911,
"learning_rate": 1.6920561384384793e-05,
"loss": 0.4442,
"step": 1600
},
{
"epoch": 0.20619877049180327,
"grad_norm": 5.696882247924805,
"learning_rate": 1.689330971522006e-05,
"loss": 0.5623,
"step": 1610
},
{
"epoch": 0.2074795081967213,
"grad_norm": 17.160661697387695,
"learning_rate": 1.6866058046055322e-05,
"loss": 1.097,
"step": 1620
},
{
"epoch": 0.20876024590163936,
"grad_norm": 23.408737182617188,
"learning_rate": 1.6838806376890585e-05,
"loss": 0.5359,
"step": 1630
},
{
"epoch": 0.21004098360655737,
"grad_norm": 0.7226897478103638,
"learning_rate": 1.681155470772585e-05,
"loss": 0.3844,
"step": 1640
},
{
"epoch": 0.21132172131147542,
"grad_norm": 28.273542404174805,
"learning_rate": 1.6784303038561113e-05,
"loss": 0.6221,
"step": 1650
},
{
"epoch": 0.21260245901639344,
"grad_norm": 0.6800060272216797,
"learning_rate": 1.6757051369396376e-05,
"loss": 0.4647,
"step": 1660
},
{
"epoch": 0.21388319672131148,
"grad_norm": 7.838409423828125,
"learning_rate": 1.6729799700231642e-05,
"loss": 0.7482,
"step": 1670
},
{
"epoch": 0.2151639344262295,
"grad_norm": 68.58909606933594,
"learning_rate": 1.6702548031066905e-05,
"loss": 0.7191,
"step": 1680
},
{
"epoch": 0.21644467213114754,
"grad_norm": 10.408316612243652,
"learning_rate": 1.6675296361902167e-05,
"loss": 0.9812,
"step": 1690
},
{
"epoch": 0.2177254098360656,
"grad_norm": 45.571781158447266,
"learning_rate": 1.664804469273743e-05,
"loss": 0.4901,
"step": 1700
},
{
"epoch": 0.2190061475409836,
"grad_norm": 14.653166770935059,
"learning_rate": 1.6620793023572696e-05,
"loss": 0.4544,
"step": 1710
},
{
"epoch": 0.22028688524590165,
"grad_norm": 75.12469482421875,
"learning_rate": 1.659354135440796e-05,
"loss": 0.5393,
"step": 1720
},
{
"epoch": 0.22156762295081966,
"grad_norm": 20.70387077331543,
"learning_rate": 1.656628968524322e-05,
"loss": 0.7864,
"step": 1730
},
{
"epoch": 0.2228483606557377,
"grad_norm": 2.0562734603881836,
"learning_rate": 1.6539038016078487e-05,
"loss": 0.6331,
"step": 1740
},
{
"epoch": 0.22412909836065573,
"grad_norm": 13.042604446411133,
"learning_rate": 1.651178634691375e-05,
"loss": 0.5563,
"step": 1750
},
{
"epoch": 0.22540983606557377,
"grad_norm": 33.89776611328125,
"learning_rate": 1.6484534677749012e-05,
"loss": 0.4452,
"step": 1760
},
{
"epoch": 0.22669057377049182,
"grad_norm": 16.996103286743164,
"learning_rate": 1.645728300858428e-05,
"loss": 0.6087,
"step": 1770
},
{
"epoch": 0.22797131147540983,
"grad_norm": 2.9814796447753906,
"learning_rate": 1.643003133941954e-05,
"loss": 0.3813,
"step": 1780
},
{
"epoch": 0.22925204918032788,
"grad_norm": 0.20661257207393646,
"learning_rate": 1.6402779670254804e-05,
"loss": 0.3531,
"step": 1790
},
{
"epoch": 0.2305327868852459,
"grad_norm": 0.23248881101608276,
"learning_rate": 1.637552800109007e-05,
"loss": 0.4496,
"step": 1800
},
{
"epoch": 0.23181352459016394,
"grad_norm": 55.3471565246582,
"learning_rate": 1.6348276331925332e-05,
"loss": 0.5625,
"step": 1810
},
{
"epoch": 0.23309426229508196,
"grad_norm": 11.669384002685547,
"learning_rate": 1.6321024662760595e-05,
"loss": 0.4075,
"step": 1820
},
{
"epoch": 0.234375,
"grad_norm": 65.76184844970703,
"learning_rate": 1.629377299359586e-05,
"loss": 0.3711,
"step": 1830
},
{
"epoch": 0.23565573770491804,
"grad_norm": 1.0016331672668457,
"learning_rate": 1.6266521324431124e-05,
"loss": 0.1958,
"step": 1840
},
{
"epoch": 0.23693647540983606,
"grad_norm": 9.233772277832031,
"learning_rate": 1.6239269655266386e-05,
"loss": 0.5992,
"step": 1850
},
{
"epoch": 0.2382172131147541,
"grad_norm": 7.4546732902526855,
"learning_rate": 1.621201798610165e-05,
"loss": 0.775,
"step": 1860
},
{
"epoch": 0.23949795081967212,
"grad_norm": 0.771056056022644,
"learning_rate": 1.618476631693691e-05,
"loss": 0.6516,
"step": 1870
},
{
"epoch": 0.24077868852459017,
"grad_norm": 13.350895881652832,
"learning_rate": 1.6157514647772177e-05,
"loss": 0.6574,
"step": 1880
},
{
"epoch": 0.24205942622950818,
"grad_norm": 1.9616976976394653,
"learning_rate": 1.613026297860744e-05,
"loss": 0.7321,
"step": 1890
},
{
"epoch": 0.24334016393442623,
"grad_norm": 0.2918919622898102,
"learning_rate": 1.6103011309442703e-05,
"loss": 1.1059,
"step": 1900
},
{
"epoch": 0.24462090163934427,
"grad_norm": 13.870285987854004,
"learning_rate": 1.607575964027797e-05,
"loss": 0.4844,
"step": 1910
},
{
"epoch": 0.2459016393442623,
"grad_norm": 19.64275360107422,
"learning_rate": 1.604850797111323e-05,
"loss": 0.3566,
"step": 1920
},
{
"epoch": 0.24718237704918034,
"grad_norm": 54.27963638305664,
"learning_rate": 1.6021256301948497e-05,
"loss": 0.5747,
"step": 1930
},
{
"epoch": 0.24846311475409835,
"grad_norm": 163.11248779296875,
"learning_rate": 1.599400463278376e-05,
"loss": 1.0376,
"step": 1940
},
{
"epoch": 0.2497438524590164,
"grad_norm": 3.2400197982788086,
"learning_rate": 1.5966752963619023e-05,
"loss": 0.3435,
"step": 1950
},
{
"epoch": 0.2510245901639344,
"grad_norm": 0.17113502323627472,
"learning_rate": 1.593950129445429e-05,
"loss": 0.5393,
"step": 1960
},
{
"epoch": 0.25230532786885246,
"grad_norm": 22.859413146972656,
"learning_rate": 1.591224962528955e-05,
"loss": 0.3003,
"step": 1970
},
{
"epoch": 0.2535860655737705,
"grad_norm": 1.3010896444320679,
"learning_rate": 1.5884997956124814e-05,
"loss": 0.626,
"step": 1980
},
{
"epoch": 0.25486680327868855,
"grad_norm": 2.824781656265259,
"learning_rate": 1.585774628696008e-05,
"loss": 0.5887,
"step": 1990
},
{
"epoch": 0.25614754098360654,
"grad_norm": 52.8790397644043,
"learning_rate": 1.5830494617795342e-05,
"loss": 0.5767,
"step": 2000
},
{
"epoch": 0.2574282786885246,
"grad_norm": 10.472972869873047,
"learning_rate": 1.5803242948630605e-05,
"loss": 0.5818,
"step": 2010
},
{
"epoch": 0.2587090163934426,
"grad_norm": 0.7781365513801575,
"learning_rate": 1.5775991279465868e-05,
"loss": 0.9479,
"step": 2020
},
{
"epoch": 0.25998975409836067,
"grad_norm": 5.116518974304199,
"learning_rate": 1.574873961030113e-05,
"loss": 0.6473,
"step": 2030
},
{
"epoch": 0.2612704918032787,
"grad_norm": 31.682783126831055,
"learning_rate": 1.5721487941136396e-05,
"loss": 1.0271,
"step": 2040
},
{
"epoch": 0.2625512295081967,
"grad_norm": 0.6573253273963928,
"learning_rate": 1.569423627197166e-05,
"loss": 0.3799,
"step": 2050
},
{
"epoch": 0.26383196721311475,
"grad_norm": 5.006514072418213,
"learning_rate": 1.566698460280692e-05,
"loss": 0.406,
"step": 2060
},
{
"epoch": 0.2651127049180328,
"grad_norm": 30.3986873626709,
"learning_rate": 1.5639732933642188e-05,
"loss": 0.7609,
"step": 2070
},
{
"epoch": 0.26639344262295084,
"grad_norm": 8.392414093017578,
"learning_rate": 1.561248126447745e-05,
"loss": 0.3631,
"step": 2080
},
{
"epoch": 0.2676741803278688,
"grad_norm": 0.6506038308143616,
"learning_rate": 1.5585229595312713e-05,
"loss": 0.5563,
"step": 2090
},
{
"epoch": 0.26895491803278687,
"grad_norm": 34.08297348022461,
"learning_rate": 1.555797792614798e-05,
"loss": 0.3359,
"step": 2100
},
{
"epoch": 0.2702356557377049,
"grad_norm": 30.52340316772461,
"learning_rate": 1.553072625698324e-05,
"loss": 0.4175,
"step": 2110
},
{
"epoch": 0.27151639344262296,
"grad_norm": 19.159420013427734,
"learning_rate": 1.5503474587818504e-05,
"loss": 0.5232,
"step": 2120
},
{
"epoch": 0.272797131147541,
"grad_norm": 1.3067234754562378,
"learning_rate": 1.547622291865377e-05,
"loss": 0.2685,
"step": 2130
},
{
"epoch": 0.274077868852459,
"grad_norm": 29.783512115478516,
"learning_rate": 1.5448971249489033e-05,
"loss": 0.7583,
"step": 2140
},
{
"epoch": 0.27535860655737704,
"grad_norm": 55.58544921875,
"learning_rate": 1.5421719580324295e-05,
"loss": 0.7714,
"step": 2150
},
{
"epoch": 0.2766393442622951,
"grad_norm": 6.930970191955566,
"learning_rate": 1.539446791115956e-05,
"loss": 0.4346,
"step": 2160
},
{
"epoch": 0.27792008196721313,
"grad_norm": 7.723865509033203,
"learning_rate": 1.5367216241994824e-05,
"loss": 0.6486,
"step": 2170
},
{
"epoch": 0.2792008196721312,
"grad_norm": 0.23200243711471558,
"learning_rate": 1.5339964572830087e-05,
"loss": 0.514,
"step": 2180
},
{
"epoch": 0.28048155737704916,
"grad_norm": 29.773784637451172,
"learning_rate": 1.531271290366535e-05,
"loss": 0.9953,
"step": 2190
},
{
"epoch": 0.2817622950819672,
"grad_norm": 19.467941284179688,
"learning_rate": 1.5285461234500615e-05,
"loss": 0.6698,
"step": 2200
},
{
"epoch": 0.28304303278688525,
"grad_norm": 0.44849446415901184,
"learning_rate": 1.5258209565335878e-05,
"loss": 0.3486,
"step": 2210
},
{
"epoch": 0.2843237704918033,
"grad_norm": 3.40317702293396,
"learning_rate": 1.523095789617114e-05,
"loss": 0.3062,
"step": 2220
},
{
"epoch": 0.2856045081967213,
"grad_norm": 23.58439826965332,
"learning_rate": 1.5203706227006405e-05,
"loss": 0.4546,
"step": 2230
},
{
"epoch": 0.28688524590163933,
"grad_norm": 0.14240220189094543,
"learning_rate": 1.517645455784167e-05,
"loss": 0.466,
"step": 2240
},
{
"epoch": 0.2881659836065574,
"grad_norm": 22.152645111083984,
"learning_rate": 1.5149202888676932e-05,
"loss": 0.9443,
"step": 2250
},
{
"epoch": 0.2894467213114754,
"grad_norm": 40.078433990478516,
"learning_rate": 1.5121951219512196e-05,
"loss": 0.7763,
"step": 2260
},
{
"epoch": 0.29072745901639346,
"grad_norm": 22.58036231994629,
"learning_rate": 1.509469955034746e-05,
"loss": 0.5156,
"step": 2270
},
{
"epoch": 0.29200819672131145,
"grad_norm": 9.161020278930664,
"learning_rate": 1.5067447881182725e-05,
"loss": 0.5463,
"step": 2280
},
{
"epoch": 0.2932889344262295,
"grad_norm": 8.112720489501953,
"learning_rate": 1.5040196212017987e-05,
"loss": 0.8208,
"step": 2290
},
{
"epoch": 0.29456967213114754,
"grad_norm": 2.3632164001464844,
"learning_rate": 1.5012944542853252e-05,
"loss": 0.5992,
"step": 2300
},
{
"epoch": 0.2958504098360656,
"grad_norm": 5.630832672119141,
"learning_rate": 1.4985692873688516e-05,
"loss": 0.6949,
"step": 2310
},
{
"epoch": 0.29713114754098363,
"grad_norm": 75.62430572509766,
"learning_rate": 1.4958441204523779e-05,
"loss": 0.5873,
"step": 2320
},
{
"epoch": 0.2984118852459016,
"grad_norm": 11.58348274230957,
"learning_rate": 1.4931189535359043e-05,
"loss": 0.704,
"step": 2330
},
{
"epoch": 0.29969262295081966,
"grad_norm": 20.816808700561523,
"learning_rate": 1.4903937866194304e-05,
"loss": 0.2416,
"step": 2340
},
{
"epoch": 0.3009733606557377,
"grad_norm": 0.5709815621376038,
"learning_rate": 1.4876686197029568e-05,
"loss": 0.4159,
"step": 2350
},
{
"epoch": 0.30225409836065575,
"grad_norm": 0.5212659239768982,
"learning_rate": 1.4849434527864833e-05,
"loss": 0.4472,
"step": 2360
},
{
"epoch": 0.30353483606557374,
"grad_norm": 10.903100967407227,
"learning_rate": 1.4822182858700095e-05,
"loss": 0.5868,
"step": 2370
},
{
"epoch": 0.3048155737704918,
"grad_norm": 60.755706787109375,
"learning_rate": 1.479493118953536e-05,
"loss": 0.8967,
"step": 2380
},
{
"epoch": 0.30609631147540983,
"grad_norm": 0.22794629633426666,
"learning_rate": 1.4767679520370624e-05,
"loss": 0.7226,
"step": 2390
},
{
"epoch": 0.3073770491803279,
"grad_norm": 52.29710006713867,
"learning_rate": 1.4740427851205888e-05,
"loss": 0.7622,
"step": 2400
},
{
"epoch": 0.3086577868852459,
"grad_norm": 0.6769666075706482,
"learning_rate": 1.471317618204115e-05,
"loss": 0.6245,
"step": 2410
},
{
"epoch": 0.3099385245901639,
"grad_norm": 1.508181095123291,
"learning_rate": 1.4685924512876415e-05,
"loss": 0.3395,
"step": 2420
},
{
"epoch": 0.31121926229508196,
"grad_norm": 78.36157989501953,
"learning_rate": 1.465867284371168e-05,
"loss": 0.6791,
"step": 2430
},
{
"epoch": 0.3125,
"grad_norm": 0.1883663535118103,
"learning_rate": 1.4631421174546942e-05,
"loss": 0.2169,
"step": 2440
},
{
"epoch": 0.31378073770491804,
"grad_norm": 42.14516067504883,
"learning_rate": 1.4604169505382206e-05,
"loss": 1.2172,
"step": 2450
},
{
"epoch": 0.3150614754098361,
"grad_norm": 11.31810474395752,
"learning_rate": 1.457691783621747e-05,
"loss": 0.5506,
"step": 2460
},
{
"epoch": 0.3163422131147541,
"grad_norm": 5.650265216827393,
"learning_rate": 1.4549666167052733e-05,
"loss": 0.595,
"step": 2470
},
{
"epoch": 0.3176229508196721,
"grad_norm": 1.0849229097366333,
"learning_rate": 1.4522414497887998e-05,
"loss": 0.6437,
"step": 2480
},
{
"epoch": 0.31890368852459017,
"grad_norm": 25.959819793701172,
"learning_rate": 1.4495162828723262e-05,
"loss": 0.6475,
"step": 2490
},
{
"epoch": 0.3201844262295082,
"grad_norm": 0.33578041195869446,
"learning_rate": 1.4467911159558523e-05,
"loss": 0.7596,
"step": 2500
},
{
"epoch": 0.32146516393442626,
"grad_norm": 3.292280673980713,
"learning_rate": 1.4440659490393787e-05,
"loss": 0.6943,
"step": 2510
},
{
"epoch": 0.32274590163934425,
"grad_norm": 12.404119491577148,
"learning_rate": 1.4413407821229052e-05,
"loss": 0.8402,
"step": 2520
},
{
"epoch": 0.3240266393442623,
"grad_norm": 23.83495330810547,
"learning_rate": 1.4386156152064314e-05,
"loss": 0.8554,
"step": 2530
},
{
"epoch": 0.32530737704918034,
"grad_norm": 13.377483367919922,
"learning_rate": 1.4358904482899578e-05,
"loss": 0.7793,
"step": 2540
},
{
"epoch": 0.3265881147540984,
"grad_norm": 73.42285919189453,
"learning_rate": 1.4331652813734843e-05,
"loss": 0.4669,
"step": 2550
},
{
"epoch": 0.32786885245901637,
"grad_norm": 40.041080474853516,
"learning_rate": 1.4304401144570105e-05,
"loss": 0.721,
"step": 2560
},
{
"epoch": 0.3291495901639344,
"grad_norm": 0.547555148601532,
"learning_rate": 1.427714947540537e-05,
"loss": 0.4564,
"step": 2570
},
{
"epoch": 0.33043032786885246,
"grad_norm": 1.5647186040878296,
"learning_rate": 1.4249897806240634e-05,
"loss": 0.4615,
"step": 2580
},
{
"epoch": 0.3317110655737705,
"grad_norm": 35.03215789794922,
"learning_rate": 1.4222646137075897e-05,
"loss": 0.8154,
"step": 2590
},
{
"epoch": 0.33299180327868855,
"grad_norm": 2.925804615020752,
"learning_rate": 1.4195394467911161e-05,
"loss": 0.4514,
"step": 2600
},
{
"epoch": 0.33427254098360654,
"grad_norm": 67.04120635986328,
"learning_rate": 1.4168142798746425e-05,
"loss": 0.3808,
"step": 2610
},
{
"epoch": 0.3355532786885246,
"grad_norm": 44.40123748779297,
"learning_rate": 1.4140891129581688e-05,
"loss": 0.4096,
"step": 2620
},
{
"epoch": 0.3368340163934426,
"grad_norm": 0.8358442187309265,
"learning_rate": 1.4113639460416952e-05,
"loss": 0.5851,
"step": 2630
},
{
"epoch": 0.33811475409836067,
"grad_norm": 0.4117409884929657,
"learning_rate": 1.4086387791252217e-05,
"loss": 0.4103,
"step": 2640
},
{
"epoch": 0.3393954918032787,
"grad_norm": 34.275489807128906,
"learning_rate": 1.405913612208748e-05,
"loss": 0.5521,
"step": 2650
},
{
"epoch": 0.3406762295081967,
"grad_norm": 0.12396706640720367,
"learning_rate": 1.4031884452922742e-05,
"loss": 0.1471,
"step": 2660
},
{
"epoch": 0.34195696721311475,
"grad_norm": 26.100513458251953,
"learning_rate": 1.4004632783758006e-05,
"loss": 0.6545,
"step": 2670
},
{
"epoch": 0.3432377049180328,
"grad_norm": 1.4054203033447266,
"learning_rate": 1.3977381114593269e-05,
"loss": 0.3345,
"step": 2680
},
{
"epoch": 0.34451844262295084,
"grad_norm": 18.780344009399414,
"learning_rate": 1.3950129445428533e-05,
"loss": 0.5751,
"step": 2690
},
{
"epoch": 0.3457991803278688,
"grad_norm": 2.4345474243164062,
"learning_rate": 1.3922877776263797e-05,
"loss": 0.518,
"step": 2700
},
{
"epoch": 0.34707991803278687,
"grad_norm": 153.76368713378906,
"learning_rate": 1.389562610709906e-05,
"loss": 0.9605,
"step": 2710
},
{
"epoch": 0.3483606557377049,
"grad_norm": 23.214303970336914,
"learning_rate": 1.3868374437934324e-05,
"loss": 0.5311,
"step": 2720
},
{
"epoch": 0.34964139344262296,
"grad_norm": 3.1090455055236816,
"learning_rate": 1.3841122768769589e-05,
"loss": 0.7492,
"step": 2730
},
{
"epoch": 0.350922131147541,
"grad_norm": 18.95741081237793,
"learning_rate": 1.3813871099604851e-05,
"loss": 0.8132,
"step": 2740
},
{
"epoch": 0.352202868852459,
"grad_norm": 35.78852081298828,
"learning_rate": 1.3786619430440116e-05,
"loss": 0.4757,
"step": 2750
},
{
"epoch": 0.35348360655737704,
"grad_norm": 0.2885892391204834,
"learning_rate": 1.375936776127538e-05,
"loss": 0.4375,
"step": 2760
},
{
"epoch": 0.3547643442622951,
"grad_norm": 32.26221466064453,
"learning_rate": 1.3732116092110643e-05,
"loss": 0.7023,
"step": 2770
},
{
"epoch": 0.35604508196721313,
"grad_norm": 23.65122413635254,
"learning_rate": 1.3704864422945907e-05,
"loss": 0.1926,
"step": 2780
},
{
"epoch": 0.3573258196721312,
"grad_norm": 22.145179748535156,
"learning_rate": 1.3677612753781171e-05,
"loss": 0.7968,
"step": 2790
},
{
"epoch": 0.35860655737704916,
"grad_norm": 15.272971153259277,
"learning_rate": 1.3650361084616435e-05,
"loss": 0.4302,
"step": 2800
},
{
"epoch": 0.3598872950819672,
"grad_norm": 69.59125518798828,
"learning_rate": 1.3623109415451698e-05,
"loss": 0.5592,
"step": 2810
},
{
"epoch": 0.36116803278688525,
"grad_norm": 0.19557702541351318,
"learning_rate": 1.359585774628696e-05,
"loss": 0.5795,
"step": 2820
},
{
"epoch": 0.3624487704918033,
"grad_norm": 28.615272521972656,
"learning_rate": 1.3568606077122223e-05,
"loss": 0.517,
"step": 2830
},
{
"epoch": 0.3637295081967213,
"grad_norm": 4.395263195037842,
"learning_rate": 1.3541354407957488e-05,
"loss": 0.6146,
"step": 2840
},
{
"epoch": 0.36501024590163933,
"grad_norm": 0.7227006554603577,
"learning_rate": 1.3514102738792752e-05,
"loss": 0.5504,
"step": 2850
},
{
"epoch": 0.3662909836065574,
"grad_norm": 15.734036445617676,
"learning_rate": 1.3486851069628015e-05,
"loss": 0.59,
"step": 2860
},
{
"epoch": 0.3675717213114754,
"grad_norm": 1.6639937162399292,
"learning_rate": 1.3459599400463279e-05,
"loss": 0.5332,
"step": 2870
},
{
"epoch": 0.36885245901639346,
"grad_norm": 0.7330634593963623,
"learning_rate": 1.3432347731298543e-05,
"loss": 0.4509,
"step": 2880
},
{
"epoch": 0.37013319672131145,
"grad_norm": 42.200531005859375,
"learning_rate": 1.3405096062133806e-05,
"loss": 0.5252,
"step": 2890
},
{
"epoch": 0.3714139344262295,
"grad_norm": 65.34646606445312,
"learning_rate": 1.337784439296907e-05,
"loss": 0.4143,
"step": 2900
},
{
"epoch": 0.37269467213114754,
"grad_norm": 0.17863045632839203,
"learning_rate": 1.3350592723804335e-05,
"loss": 0.4167,
"step": 2910
},
{
"epoch": 0.3739754098360656,
"grad_norm": 28.605680465698242,
"learning_rate": 1.3323341054639599e-05,
"loss": 0.6769,
"step": 2920
},
{
"epoch": 0.37525614754098363,
"grad_norm": 0.0853688195347786,
"learning_rate": 1.3296089385474861e-05,
"loss": 0.5126,
"step": 2930
},
{
"epoch": 0.3765368852459016,
"grad_norm": 63.26204299926758,
"learning_rate": 1.3268837716310126e-05,
"loss": 1.1696,
"step": 2940
},
{
"epoch": 0.37781762295081966,
"grad_norm": 45.06633377075195,
"learning_rate": 1.324158604714539e-05,
"loss": 0.6522,
"step": 2950
},
{
"epoch": 0.3790983606557377,
"grad_norm": 36.450233459472656,
"learning_rate": 1.3214334377980653e-05,
"loss": 0.91,
"step": 2960
},
{
"epoch": 0.38037909836065575,
"grad_norm": 58.59020233154297,
"learning_rate": 1.3187082708815917e-05,
"loss": 0.5549,
"step": 2970
},
{
"epoch": 0.38165983606557374,
"grad_norm": 13.287269592285156,
"learning_rate": 1.3159831039651181e-05,
"loss": 0.4198,
"step": 2980
},
{
"epoch": 0.3829405737704918,
"grad_norm": 20.24810218811035,
"learning_rate": 1.3132579370486442e-05,
"loss": 0.6179,
"step": 2990
},
{
"epoch": 0.38422131147540983,
"grad_norm": 18.099557876586914,
"learning_rate": 1.3105327701321707e-05,
"loss": 0.8484,
"step": 3000
},
{
"epoch": 0.3855020491803279,
"grad_norm": 41.92770004272461,
"learning_rate": 1.307807603215697e-05,
"loss": 0.929,
"step": 3010
},
{
"epoch": 0.3867827868852459,
"grad_norm": 11.101128578186035,
"learning_rate": 1.3050824362992234e-05,
"loss": 0.6886,
"step": 3020
},
{
"epoch": 0.3880635245901639,
"grad_norm": 0.5516038537025452,
"learning_rate": 1.3023572693827498e-05,
"loss": 0.232,
"step": 3030
},
{
"epoch": 0.38934426229508196,
"grad_norm": 19.20160675048828,
"learning_rate": 1.299632102466276e-05,
"loss": 0.632,
"step": 3040
},
{
"epoch": 0.390625,
"grad_norm": 89.39508056640625,
"learning_rate": 1.2969069355498025e-05,
"loss": 0.5,
"step": 3050
},
{
"epoch": 0.39190573770491804,
"grad_norm": 3.156262159347534,
"learning_rate": 1.294181768633329e-05,
"loss": 0.5471,
"step": 3060
},
{
"epoch": 0.3931864754098361,
"grad_norm": 1.8074102401733398,
"learning_rate": 1.2914566017168553e-05,
"loss": 0.4993,
"step": 3070
},
{
"epoch": 0.3944672131147541,
"grad_norm": 10.57691764831543,
"learning_rate": 1.2887314348003816e-05,
"loss": 0.3536,
"step": 3080
},
{
"epoch": 0.3957479508196721,
"grad_norm": 31.425968170166016,
"learning_rate": 1.286006267883908e-05,
"loss": 0.4929,
"step": 3090
},
{
"epoch": 0.39702868852459017,
"grad_norm": 1.107421636581421,
"learning_rate": 1.2832811009674345e-05,
"loss": 0.5302,
"step": 3100
},
{
"epoch": 0.3983094262295082,
"grad_norm": 31.851308822631836,
"learning_rate": 1.2805559340509607e-05,
"loss": 0.6524,
"step": 3110
},
{
"epoch": 0.39959016393442626,
"grad_norm": 33.0150146484375,
"learning_rate": 1.2778307671344872e-05,
"loss": 0.5296,
"step": 3120
},
{
"epoch": 0.40087090163934425,
"grad_norm": 60.5539665222168,
"learning_rate": 1.2751056002180136e-05,
"loss": 0.7329,
"step": 3130
},
{
"epoch": 0.4021516393442623,
"grad_norm": 26.929574966430664,
"learning_rate": 1.2723804333015399e-05,
"loss": 0.5035,
"step": 3140
},
{
"epoch": 0.40343237704918034,
"grad_norm": 28.021299362182617,
"learning_rate": 1.2696552663850661e-05,
"loss": 0.603,
"step": 3150
},
{
"epoch": 0.4047131147540984,
"grad_norm": 59.49539566040039,
"learning_rate": 1.2669300994685924e-05,
"loss": 0.5007,
"step": 3160
},
{
"epoch": 0.40599385245901637,
"grad_norm": 31.815570831298828,
"learning_rate": 1.2642049325521188e-05,
"loss": 0.4406,
"step": 3170
},
{
"epoch": 0.4072745901639344,
"grad_norm": 60.27109146118164,
"learning_rate": 1.2614797656356453e-05,
"loss": 0.5205,
"step": 3180
},
{
"epoch": 0.40855532786885246,
"grad_norm": 3.3493058681488037,
"learning_rate": 1.2587545987191717e-05,
"loss": 0.6267,
"step": 3190
},
{
"epoch": 0.4098360655737705,
"grad_norm": 23.72585678100586,
"learning_rate": 1.256029431802698e-05,
"loss": 0.6263,
"step": 3200
},
{
"epoch": 0.41111680327868855,
"grad_norm": 24.219833374023438,
"learning_rate": 1.2533042648862244e-05,
"loss": 0.4589,
"step": 3210
},
{
"epoch": 0.41239754098360654,
"grad_norm": 0.2840415835380554,
"learning_rate": 1.2505790979697508e-05,
"loss": 0.3652,
"step": 3220
},
{
"epoch": 0.4136782786885246,
"grad_norm": 17.429651260375977,
"learning_rate": 1.247853931053277e-05,
"loss": 0.7563,
"step": 3230
},
{
"epoch": 0.4149590163934426,
"grad_norm": 0.5108852386474609,
"learning_rate": 1.2451287641368035e-05,
"loss": 0.3132,
"step": 3240
},
{
"epoch": 0.41623975409836067,
"grad_norm": 50.98451614379883,
"learning_rate": 1.24240359722033e-05,
"loss": 0.5077,
"step": 3250
},
{
"epoch": 0.4175204918032787,
"grad_norm": 1.3974177837371826,
"learning_rate": 1.2396784303038562e-05,
"loss": 0.4276,
"step": 3260
},
{
"epoch": 0.4188012295081967,
"grad_norm": 12.84176254272461,
"learning_rate": 1.2369532633873826e-05,
"loss": 0.5647,
"step": 3270
},
{
"epoch": 0.42008196721311475,
"grad_norm": 21.05103302001953,
"learning_rate": 1.234228096470909e-05,
"loss": 0.1421,
"step": 3280
},
{
"epoch": 0.4213627049180328,
"grad_norm": 0.3647187352180481,
"learning_rate": 1.2315029295544353e-05,
"loss": 0.6179,
"step": 3290
},
{
"epoch": 0.42264344262295084,
"grad_norm": 90.5313720703125,
"learning_rate": 1.2287777626379618e-05,
"loss": 0.8233,
"step": 3300
},
{
"epoch": 0.4239241803278688,
"grad_norm": 19.75844955444336,
"learning_rate": 1.226052595721488e-05,
"loss": 0.5078,
"step": 3310
},
{
"epoch": 0.42520491803278687,
"grad_norm": 0.42248353362083435,
"learning_rate": 1.2233274288050143e-05,
"loss": 0.3436,
"step": 3320
},
{
"epoch": 0.4264856557377049,
"grad_norm": 59.313232421875,
"learning_rate": 1.2206022618885407e-05,
"loss": 0.5244,
"step": 3330
},
{
"epoch": 0.42776639344262296,
"grad_norm": 14.109567642211914,
"learning_rate": 1.2178770949720671e-05,
"loss": 0.7947,
"step": 3340
},
{
"epoch": 0.429047131147541,
"grad_norm": 16.229310989379883,
"learning_rate": 1.2151519280555934e-05,
"loss": 0.5386,
"step": 3350
},
{
"epoch": 0.430327868852459,
"grad_norm": 25.23029136657715,
"learning_rate": 1.2124267611391198e-05,
"loss": 0.5719,
"step": 3360
},
{
"epoch": 0.43160860655737704,
"grad_norm": 1.4985939264297485,
"learning_rate": 1.2097015942226463e-05,
"loss": 0.3424,
"step": 3370
},
{
"epoch": 0.4328893442622951,
"grad_norm": 24.808349609375,
"learning_rate": 1.2069764273061725e-05,
"loss": 0.8804,
"step": 3380
},
{
"epoch": 0.43417008196721313,
"grad_norm": 30.150056838989258,
"learning_rate": 1.204251260389699e-05,
"loss": 0.3999,
"step": 3390
},
{
"epoch": 0.4354508196721312,
"grad_norm": 59.782325744628906,
"learning_rate": 1.2015260934732254e-05,
"loss": 0.4612,
"step": 3400
},
{
"epoch": 0.43673155737704916,
"grad_norm": 55.766117095947266,
"learning_rate": 1.1988009265567517e-05,
"loss": 0.3971,
"step": 3410
},
{
"epoch": 0.4380122950819672,
"grad_norm": 69.8100814819336,
"learning_rate": 1.1960757596402781e-05,
"loss": 0.5867,
"step": 3420
},
{
"epoch": 0.43929303278688525,
"grad_norm": 24.89929962158203,
"learning_rate": 1.1933505927238045e-05,
"loss": 0.793,
"step": 3430
},
{
"epoch": 0.4405737704918033,
"grad_norm": 21.96668243408203,
"learning_rate": 1.1906254258073308e-05,
"loss": 0.8675,
"step": 3440
},
{
"epoch": 0.4418545081967213,
"grad_norm": 59.37974548339844,
"learning_rate": 1.1879002588908572e-05,
"loss": 0.5162,
"step": 3450
},
{
"epoch": 0.44313524590163933,
"grad_norm": 0.49646639823913574,
"learning_rate": 1.1851750919743837e-05,
"loss": 0.6127,
"step": 3460
},
{
"epoch": 0.4444159836065574,
"grad_norm": 8.308236122131348,
"learning_rate": 1.1824499250579097e-05,
"loss": 0.6185,
"step": 3470
},
{
"epoch": 0.4456967213114754,
"grad_norm": 2.5998694896698,
"learning_rate": 1.1797247581414362e-05,
"loss": 0.5386,
"step": 3480
},
{
"epoch": 0.44697745901639346,
"grad_norm": 39.297706604003906,
"learning_rate": 1.1769995912249626e-05,
"loss": 0.7987,
"step": 3490
},
{
"epoch": 0.44825819672131145,
"grad_norm": 7.121617794036865,
"learning_rate": 1.1742744243084889e-05,
"loss": 0.8963,
"step": 3500
},
{
"epoch": 0.4495389344262295,
"grad_norm": 1.0637052059173584,
"learning_rate": 1.1715492573920153e-05,
"loss": 0.3344,
"step": 3510
},
{
"epoch": 0.45081967213114754,
"grad_norm": 65.03225708007812,
"learning_rate": 1.1688240904755417e-05,
"loss": 0.6484,
"step": 3520
},
{
"epoch": 0.4521004098360656,
"grad_norm": 0.4046671986579895,
"learning_rate": 1.166098923559068e-05,
"loss": 0.2954,
"step": 3530
},
{
"epoch": 0.45338114754098363,
"grad_norm": 10.253545761108398,
"learning_rate": 1.1633737566425944e-05,
"loss": 0.1934,
"step": 3540
},
{
"epoch": 0.4546618852459016,
"grad_norm": 99.9068832397461,
"learning_rate": 1.1606485897261209e-05,
"loss": 0.7702,
"step": 3550
},
{
"epoch": 0.45594262295081966,
"grad_norm": 58.01685333251953,
"learning_rate": 1.1579234228096471e-05,
"loss": 0.5098,
"step": 3560
},
{
"epoch": 0.4572233606557377,
"grad_norm": 116.0182876586914,
"learning_rate": 1.1551982558931736e-05,
"loss": 0.7526,
"step": 3570
},
{
"epoch": 0.45850409836065575,
"grad_norm": 0.7602908611297607,
"learning_rate": 1.1524730889767e-05,
"loss": 0.3513,
"step": 3580
},
{
"epoch": 0.45978483606557374,
"grad_norm": 23.507183074951172,
"learning_rate": 1.1497479220602264e-05,
"loss": 0.5627,
"step": 3590
},
{
"epoch": 0.4610655737704918,
"grad_norm": 0.25320929288864136,
"learning_rate": 1.1470227551437527e-05,
"loss": 0.7757,
"step": 3600
},
{
"epoch": 0.46234631147540983,
"grad_norm": 2.4358434677124023,
"learning_rate": 1.1442975882272791e-05,
"loss": 0.5189,
"step": 3610
},
{
"epoch": 0.4636270491803279,
"grad_norm": 3.7247753143310547,
"learning_rate": 1.1415724213108055e-05,
"loss": 0.5093,
"step": 3620
},
{
"epoch": 0.4649077868852459,
"grad_norm": 39.57719421386719,
"learning_rate": 1.1388472543943316e-05,
"loss": 0.7375,
"step": 3630
},
{
"epoch": 0.4661885245901639,
"grad_norm": 68.47445678710938,
"learning_rate": 1.136122087477858e-05,
"loss": 0.5694,
"step": 3640
},
{
"epoch": 0.46746926229508196,
"grad_norm": 18.36240577697754,
"learning_rate": 1.1333969205613843e-05,
"loss": 0.668,
"step": 3650
},
{
"epoch": 0.46875,
"grad_norm": 38.88651657104492,
"learning_rate": 1.1306717536449108e-05,
"loss": 0.6662,
"step": 3660
},
{
"epoch": 0.47003073770491804,
"grad_norm": 22.401813507080078,
"learning_rate": 1.1279465867284372e-05,
"loss": 0.7211,
"step": 3670
},
{
"epoch": 0.4713114754098361,
"grad_norm": 0.3502928912639618,
"learning_rate": 1.1252214198119635e-05,
"loss": 0.4916,
"step": 3680
},
{
"epoch": 0.4725922131147541,
"grad_norm": 4.397254467010498,
"learning_rate": 1.1224962528954899e-05,
"loss": 0.7776,
"step": 3690
},
{
"epoch": 0.4738729508196721,
"grad_norm": 3.871940851211548,
"learning_rate": 1.1197710859790163e-05,
"loss": 0.707,
"step": 3700
},
{
"epoch": 0.47515368852459017,
"grad_norm": 33.0516242980957,
"learning_rate": 1.1170459190625428e-05,
"loss": 0.383,
"step": 3710
},
{
"epoch": 0.4764344262295082,
"grad_norm": 26.215961456298828,
"learning_rate": 1.114320752146069e-05,
"loss": 0.4088,
"step": 3720
},
{
"epoch": 0.47771516393442626,
"grad_norm": 32.82633972167969,
"learning_rate": 1.1115955852295954e-05,
"loss": 0.4577,
"step": 3730
},
{
"epoch": 0.47899590163934425,
"grad_norm": 186.45492553710938,
"learning_rate": 1.1088704183131219e-05,
"loss": 0.4099,
"step": 3740
},
{
"epoch": 0.4802766393442623,
"grad_norm": 129.46585083007812,
"learning_rate": 1.1061452513966481e-05,
"loss": 0.6375,
"step": 3750
},
{
"epoch": 0.48155737704918034,
"grad_norm": 0.7614141702651978,
"learning_rate": 1.1034200844801746e-05,
"loss": 0.5316,
"step": 3760
},
{
"epoch": 0.4828381147540984,
"grad_norm": 34.36369323730469,
"learning_rate": 1.100694917563701e-05,
"loss": 0.7375,
"step": 3770
},
{
"epoch": 0.48411885245901637,
"grad_norm": 0.140080064535141,
"learning_rate": 1.0979697506472273e-05,
"loss": 0.6295,
"step": 3780
},
{
"epoch": 0.4853995901639344,
"grad_norm": 7.306354999542236,
"learning_rate": 1.0952445837307537e-05,
"loss": 0.9806,
"step": 3790
},
{
"epoch": 0.48668032786885246,
"grad_norm": 85.7445068359375,
"learning_rate": 1.0925194168142798e-05,
"loss": 0.7028,
"step": 3800
},
{
"epoch": 0.4879610655737705,
"grad_norm": 1.7156010866165161,
"learning_rate": 1.0897942498978062e-05,
"loss": 0.3287,
"step": 3810
},
{
"epoch": 0.48924180327868855,
"grad_norm": 4.566237926483154,
"learning_rate": 1.0870690829813327e-05,
"loss": 0.4033,
"step": 3820
},
{
"epoch": 0.49052254098360654,
"grad_norm": 46.89541244506836,
"learning_rate": 1.084343916064859e-05,
"loss": 0.823,
"step": 3830
},
{
"epoch": 0.4918032786885246,
"grad_norm": 12.144411087036133,
"learning_rate": 1.0816187491483854e-05,
"loss": 0.5362,
"step": 3840
},
{
"epoch": 0.4930840163934426,
"grad_norm": 18.448686599731445,
"learning_rate": 1.0788935822319118e-05,
"loss": 0.4256,
"step": 3850
},
{
"epoch": 0.49436475409836067,
"grad_norm": 0.24063384532928467,
"learning_rate": 1.0761684153154382e-05,
"loss": 0.2343,
"step": 3860
},
{
"epoch": 0.4956454918032787,
"grad_norm": 65.46757507324219,
"learning_rate": 1.0734432483989645e-05,
"loss": 0.5689,
"step": 3870
},
{
"epoch": 0.4969262295081967,
"grad_norm": 11.055042266845703,
"learning_rate": 1.0707180814824909e-05,
"loss": 0.6077,
"step": 3880
},
{
"epoch": 0.49820696721311475,
"grad_norm": 0.7104390263557434,
"learning_rate": 1.0679929145660173e-05,
"loss": 0.4516,
"step": 3890
},
{
"epoch": 0.4994877049180328,
"grad_norm": 33.67184066772461,
"learning_rate": 1.0652677476495436e-05,
"loss": 0.895,
"step": 3900
},
{
"epoch": 0.5007684426229508,
"grad_norm": 2.971726417541504,
"learning_rate": 1.06254258073307e-05,
"loss": 0.6966,
"step": 3910
},
{
"epoch": 0.5020491803278688,
"grad_norm": 0.6927921772003174,
"learning_rate": 1.0598174138165965e-05,
"loss": 0.6475,
"step": 3920
},
{
"epoch": 0.5033299180327869,
"grad_norm": 18.608713150024414,
"learning_rate": 1.0570922469001227e-05,
"loss": 0.6699,
"step": 3930
},
{
"epoch": 0.5046106557377049,
"grad_norm": 4.135254859924316,
"learning_rate": 1.0543670799836492e-05,
"loss": 0.4654,
"step": 3940
},
{
"epoch": 0.5058913934426229,
"grad_norm": 18.821929931640625,
"learning_rate": 1.0516419130671756e-05,
"loss": 0.8085,
"step": 3950
},
{
"epoch": 0.507172131147541,
"grad_norm": 45.03554916381836,
"learning_rate": 1.0489167461507017e-05,
"loss": 0.4745,
"step": 3960
},
{
"epoch": 0.508452868852459,
"grad_norm": 170.6229705810547,
"learning_rate": 1.0461915792342281e-05,
"loss": 0.6601,
"step": 3970
},
{
"epoch": 0.5097336065573771,
"grad_norm": 23.49982261657715,
"learning_rate": 1.0434664123177546e-05,
"loss": 0.379,
"step": 3980
},
{
"epoch": 0.5110143442622951,
"grad_norm": 2.7527880668640137,
"learning_rate": 1.0407412454012808e-05,
"loss": 0.427,
"step": 3990
},
{
"epoch": 0.5122950819672131,
"grad_norm": 0.854061484336853,
"learning_rate": 1.0380160784848072e-05,
"loss": 0.8099,
"step": 4000
},
{
"epoch": 0.5135758196721312,
"grad_norm": 77.43912506103516,
"learning_rate": 1.0352909115683337e-05,
"loss": 0.2461,
"step": 4010
},
{
"epoch": 0.5148565573770492,
"grad_norm": 0.2251901924610138,
"learning_rate": 1.03256574465186e-05,
"loss": 0.6852,
"step": 4020
},
{
"epoch": 0.5161372950819673,
"grad_norm": 0.30500558018684387,
"learning_rate": 1.0298405777353864e-05,
"loss": 0.2133,
"step": 4030
},
{
"epoch": 0.5174180327868853,
"grad_norm": 258.4718017578125,
"learning_rate": 1.0271154108189128e-05,
"loss": 0.6098,
"step": 4040
},
{
"epoch": 0.5186987704918032,
"grad_norm": 38.535884857177734,
"learning_rate": 1.024390243902439e-05,
"loss": 1.0855,
"step": 4050
},
{
"epoch": 0.5199795081967213,
"grad_norm": 13.258109092712402,
"learning_rate": 1.0216650769859655e-05,
"loss": 0.8256,
"step": 4060
},
{
"epoch": 0.5212602459016393,
"grad_norm": 45.26698684692383,
"learning_rate": 1.018939910069492e-05,
"loss": 0.579,
"step": 4070
},
{
"epoch": 0.5225409836065574,
"grad_norm": 9.766562461853027,
"learning_rate": 1.0162147431530182e-05,
"loss": 0.3964,
"step": 4080
},
{
"epoch": 0.5238217213114754,
"grad_norm": 12.843767166137695,
"learning_rate": 1.0134895762365446e-05,
"loss": 0.5889,
"step": 4090
},
{
"epoch": 0.5251024590163934,
"grad_norm": 10.034939765930176,
"learning_rate": 1.010764409320071e-05,
"loss": 0.5689,
"step": 4100
},
{
"epoch": 0.5263831967213115,
"grad_norm": 18.635753631591797,
"learning_rate": 1.0080392424035975e-05,
"loss": 0.3298,
"step": 4110
},
{
"epoch": 0.5276639344262295,
"grad_norm": 6.539854049682617,
"learning_rate": 1.0053140754871236e-05,
"loss": 0.8252,
"step": 4120
},
{
"epoch": 0.5289446721311475,
"grad_norm": 19.9680118560791,
"learning_rate": 1.00258890857065e-05,
"loss": 0.5432,
"step": 4130
},
{
"epoch": 0.5302254098360656,
"grad_norm": 38.84269714355469,
"learning_rate": 9.998637416541764e-06,
"loss": 1.0371,
"step": 4140
},
{
"epoch": 0.5315061475409836,
"grad_norm": 8.018956184387207,
"learning_rate": 9.971385747377029e-06,
"loss": 1.1152,
"step": 4150
},
{
"epoch": 0.5327868852459017,
"grad_norm": 1.0766541957855225,
"learning_rate": 9.944134078212291e-06,
"loss": 0.5306,
"step": 4160
},
{
"epoch": 0.5340676229508197,
"grad_norm": 0.5119646787643433,
"learning_rate": 9.916882409047554e-06,
"loss": 0.4113,
"step": 4170
},
{
"epoch": 0.5353483606557377,
"grad_norm": 60.749359130859375,
"learning_rate": 9.889630739882818e-06,
"loss": 0.4195,
"step": 4180
},
{
"epoch": 0.5366290983606558,
"grad_norm": 65.9601058959961,
"learning_rate": 9.862379070718083e-06,
"loss": 0.3612,
"step": 4190
},
{
"epoch": 0.5379098360655737,
"grad_norm": 10.21090316772461,
"learning_rate": 9.835127401553345e-06,
"loss": 0.6783,
"step": 4200
},
{
"epoch": 0.5391905737704918,
"grad_norm": 1.9999886751174927,
"learning_rate": 9.80787573238861e-06,
"loss": 0.1912,
"step": 4210
},
{
"epoch": 0.5404713114754098,
"grad_norm": 1.5724451541900635,
"learning_rate": 9.780624063223874e-06,
"loss": 0.5334,
"step": 4220
},
{
"epoch": 0.5417520491803278,
"grad_norm": 64.44462585449219,
"learning_rate": 9.753372394059137e-06,
"loss": 0.6993,
"step": 4230
},
{
"epoch": 0.5430327868852459,
"grad_norm": 61.30992126464844,
"learning_rate": 9.726120724894401e-06,
"loss": 0.4775,
"step": 4240
},
{
"epoch": 0.5443135245901639,
"grad_norm": 0.6172360777854919,
"learning_rate": 9.698869055729663e-06,
"loss": 0.829,
"step": 4250
},
{
"epoch": 0.545594262295082,
"grad_norm": 73.66020202636719,
"learning_rate": 9.671617386564928e-06,
"loss": 0.6753,
"step": 4260
},
{
"epoch": 0.546875,
"grad_norm": 14.051901817321777,
"learning_rate": 9.644365717400192e-06,
"loss": 0.9101,
"step": 4270
},
{
"epoch": 0.548155737704918,
"grad_norm": 8.695210456848145,
"learning_rate": 9.617114048235455e-06,
"loss": 0.3771,
"step": 4280
},
{
"epoch": 0.5494364754098361,
"grad_norm": 0.41656801104545593,
"learning_rate": 9.589862379070719e-06,
"loss": 0.498,
"step": 4290
},
{
"epoch": 0.5507172131147541,
"grad_norm": 0.6697580814361572,
"learning_rate": 9.562610709905983e-06,
"loss": 0.6485,
"step": 4300
},
{
"epoch": 0.5519979508196722,
"grad_norm": 7.877650260925293,
"learning_rate": 9.535359040741246e-06,
"loss": 0.6239,
"step": 4310
},
{
"epoch": 0.5532786885245902,
"grad_norm": 7.576630115509033,
"learning_rate": 9.508107371576509e-06,
"loss": 0.9483,
"step": 4320
},
{
"epoch": 0.5545594262295082,
"grad_norm": 21.719369888305664,
"learning_rate": 9.480855702411773e-06,
"loss": 0.4436,
"step": 4330
},
{
"epoch": 0.5558401639344263,
"grad_norm": 21.08763885498047,
"learning_rate": 9.453604033247037e-06,
"loss": 0.4597,
"step": 4340
},
{
"epoch": 0.5571209016393442,
"grad_norm": 18.030412673950195,
"learning_rate": 9.4263523640823e-06,
"loss": 0.77,
"step": 4350
},
{
"epoch": 0.5584016393442623,
"grad_norm": 18.394670486450195,
"learning_rate": 9.399100694917564e-06,
"loss": 0.2567,
"step": 4360
},
{
"epoch": 0.5596823770491803,
"grad_norm": 9.325862884521484,
"learning_rate": 9.371849025752829e-06,
"loss": 0.514,
"step": 4370
},
{
"epoch": 0.5609631147540983,
"grad_norm": 0.574291467666626,
"learning_rate": 9.344597356588093e-06,
"loss": 0.4198,
"step": 4380
},
{
"epoch": 0.5622438524590164,
"grad_norm": 0.7731497883796692,
"learning_rate": 9.317345687423355e-06,
"loss": 0.5641,
"step": 4390
},
{
"epoch": 0.5635245901639344,
"grad_norm": 10.017977714538574,
"learning_rate": 9.290094018258618e-06,
"loss": 0.4151,
"step": 4400
},
{
"epoch": 0.5648053278688525,
"grad_norm": 9.897109031677246,
"learning_rate": 9.262842349093882e-06,
"loss": 0.5924,
"step": 4410
},
{
"epoch": 0.5660860655737705,
"grad_norm": 0.5375286936759949,
"learning_rate": 9.235590679929147e-06,
"loss": 0.2296,
"step": 4420
},
{
"epoch": 0.5673668032786885,
"grad_norm": 15.379401206970215,
"learning_rate": 9.20833901076441e-06,
"loss": 0.3977,
"step": 4430
},
{
"epoch": 0.5686475409836066,
"grad_norm": 33.24900436401367,
"learning_rate": 9.181087341599674e-06,
"loss": 0.6875,
"step": 4440
},
{
"epoch": 0.5699282786885246,
"grad_norm": 17.754283905029297,
"learning_rate": 9.153835672434938e-06,
"loss": 0.7589,
"step": 4450
},
{
"epoch": 0.5712090163934426,
"grad_norm": 9.958178520202637,
"learning_rate": 9.1265840032702e-06,
"loss": 1.0191,
"step": 4460
},
{
"epoch": 0.5724897540983607,
"grad_norm": 0.880713701248169,
"learning_rate": 9.099332334105465e-06,
"loss": 0.3109,
"step": 4470
},
{
"epoch": 0.5737704918032787,
"grad_norm": 19.9377498626709,
"learning_rate": 9.072080664940728e-06,
"loss": 1.0805,
"step": 4480
},
{
"epoch": 0.5750512295081968,
"grad_norm": 45.10675811767578,
"learning_rate": 9.044828995775992e-06,
"loss": 0.485,
"step": 4490
},
{
"epoch": 0.5763319672131147,
"grad_norm": 2.320873498916626,
"learning_rate": 9.017577326611256e-06,
"loss": 0.7725,
"step": 4500
},
{
"epoch": 0.5776127049180327,
"grad_norm": 8.428739547729492,
"learning_rate": 8.990325657446519e-06,
"loss": 0.9644,
"step": 4510
},
{
"epoch": 0.5788934426229508,
"grad_norm": 31.189594268798828,
"learning_rate": 8.963073988281783e-06,
"loss": 1.0522,
"step": 4520
},
{
"epoch": 0.5801741803278688,
"grad_norm": 3.0397989749908447,
"learning_rate": 8.935822319117047e-06,
"loss": 0.7336,
"step": 4530
},
{
"epoch": 0.5814549180327869,
"grad_norm": 28.37086296081543,
"learning_rate": 8.90857064995231e-06,
"loss": 1.0115,
"step": 4540
},
{
"epoch": 0.5827356557377049,
"grad_norm": 24.746339797973633,
"learning_rate": 8.881318980787574e-06,
"loss": 0.6436,
"step": 4550
},
{
"epoch": 0.5840163934426229,
"grad_norm": 41.54196548461914,
"learning_rate": 8.854067311622839e-06,
"loss": 0.7121,
"step": 4560
},
{
"epoch": 0.585297131147541,
"grad_norm": 45.90923309326172,
"learning_rate": 8.826815642458101e-06,
"loss": 0.8686,
"step": 4570
},
{
"epoch": 0.586577868852459,
"grad_norm": 16.441612243652344,
"learning_rate": 8.799563973293364e-06,
"loss": 0.8231,
"step": 4580
},
{
"epoch": 0.5878586065573771,
"grad_norm": 16.66089630126953,
"learning_rate": 8.772312304128628e-06,
"loss": 0.5949,
"step": 4590
},
{
"epoch": 0.5891393442622951,
"grad_norm": 23.114477157592773,
"learning_rate": 8.745060634963893e-06,
"loss": 0.8395,
"step": 4600
},
{
"epoch": 0.5904200819672131,
"grad_norm": 22.976099014282227,
"learning_rate": 8.717808965799155e-06,
"loss": 0.8844,
"step": 4610
},
{
"epoch": 0.5917008196721312,
"grad_norm": 12.82754898071289,
"learning_rate": 8.69055729663442e-06,
"loss": 0.8147,
"step": 4620
},
{
"epoch": 0.5929815573770492,
"grad_norm": 43.78225326538086,
"learning_rate": 8.663305627469684e-06,
"loss": 0.7544,
"step": 4630
},
{
"epoch": 0.5942622950819673,
"grad_norm": 19.483823776245117,
"learning_rate": 8.636053958304948e-06,
"loss": 0.6818,
"step": 4640
},
{
"epoch": 0.5955430327868853,
"grad_norm": 8.231918334960938,
"learning_rate": 8.60880228914021e-06,
"loss": 0.4572,
"step": 4650
},
{
"epoch": 0.5968237704918032,
"grad_norm": 8.501511573791504,
"learning_rate": 8.581550619975473e-06,
"loss": 0.7072,
"step": 4660
},
{
"epoch": 0.5981045081967213,
"grad_norm": 28.3646297454834,
"learning_rate": 8.554298950810738e-06,
"loss": 0.7487,
"step": 4670
},
{
"epoch": 0.5993852459016393,
"grad_norm": 10.618340492248535,
"learning_rate": 8.527047281646002e-06,
"loss": 0.4582,
"step": 4680
},
{
"epoch": 0.6006659836065574,
"grad_norm": 48.34426498413086,
"learning_rate": 8.499795612481265e-06,
"loss": 0.743,
"step": 4690
},
{
"epoch": 0.6019467213114754,
"grad_norm": 19.851808547973633,
"learning_rate": 8.472543943316529e-06,
"loss": 1.0467,
"step": 4700
},
{
"epoch": 0.6032274590163934,
"grad_norm": 14.339262962341309,
"learning_rate": 8.445292274151793e-06,
"loss": 0.7851,
"step": 4710
},
{
"epoch": 0.6045081967213115,
"grad_norm": 28.62281608581543,
"learning_rate": 8.418040604987056e-06,
"loss": 0.7838,
"step": 4720
},
{
"epoch": 0.6057889344262295,
"grad_norm": 19.882169723510742,
"learning_rate": 8.390788935822319e-06,
"loss": 0.8792,
"step": 4730
},
{
"epoch": 0.6070696721311475,
"grad_norm": 6.609494686126709,
"learning_rate": 8.363537266657583e-06,
"loss": 0.4227,
"step": 4740
},
{
"epoch": 0.6083504098360656,
"grad_norm": 15.172801971435547,
"learning_rate": 8.336285597492847e-06,
"loss": 0.8444,
"step": 4750
},
{
"epoch": 0.6096311475409836,
"grad_norm": 46.852413177490234,
"learning_rate": 8.309033928328112e-06,
"loss": 0.5798,
"step": 4760
},
{
"epoch": 0.6109118852459017,
"grad_norm": 41.42491912841797,
"learning_rate": 8.281782259163374e-06,
"loss": 0.8357,
"step": 4770
},
{
"epoch": 0.6121926229508197,
"grad_norm": 19.07272720336914,
"learning_rate": 8.254530589998639e-06,
"loss": 0.7716,
"step": 4780
},
{
"epoch": 0.6134733606557377,
"grad_norm": 6.932359218597412,
"learning_rate": 8.227278920833903e-06,
"loss": 1.1022,
"step": 4790
},
{
"epoch": 0.6147540983606558,
"grad_norm": 39.23098373413086,
"learning_rate": 8.200027251669165e-06,
"loss": 0.7454,
"step": 4800
},
{
"epoch": 0.6160348360655737,
"grad_norm": 25.000465393066406,
"learning_rate": 8.172775582504428e-06,
"loss": 0.6045,
"step": 4810
},
{
"epoch": 0.6173155737704918,
"grad_norm": 16.970958709716797,
"learning_rate": 8.145523913339692e-06,
"loss": 0.8267,
"step": 4820
},
{
"epoch": 0.6185963114754098,
"grad_norm": 15.70919132232666,
"learning_rate": 8.118272244174957e-06,
"loss": 0.6148,
"step": 4830
},
{
"epoch": 0.6198770491803278,
"grad_norm": 25.68458366394043,
"learning_rate": 8.09102057501022e-06,
"loss": 0.6463,
"step": 4840
},
{
"epoch": 0.6211577868852459,
"grad_norm": 13.340360641479492,
"learning_rate": 8.063768905845484e-06,
"loss": 0.645,
"step": 4850
},
{
"epoch": 0.6224385245901639,
"grad_norm": 124.80747985839844,
"learning_rate": 8.036517236680748e-06,
"loss": 0.5991,
"step": 4860
},
{
"epoch": 0.623719262295082,
"grad_norm": 20.25383186340332,
"learning_rate": 8.00926556751601e-06,
"loss": 0.5449,
"step": 4870
},
{
"epoch": 0.625,
"grad_norm": 19.14507484436035,
"learning_rate": 7.982013898351275e-06,
"loss": 0.8269,
"step": 4880
},
{
"epoch": 0.626280737704918,
"grad_norm": 15.882426261901855,
"learning_rate": 7.954762229186538e-06,
"loss": 0.8284,
"step": 4890
},
{
"epoch": 0.6275614754098361,
"grad_norm": 22.384090423583984,
"learning_rate": 7.927510560021802e-06,
"loss": 0.6205,
"step": 4900
},
{
"epoch": 0.6288422131147541,
"grad_norm": 32.309017181396484,
"learning_rate": 7.900258890857066e-06,
"loss": 1.0006,
"step": 4910
},
{
"epoch": 0.6301229508196722,
"grad_norm": 0.9309699535369873,
"learning_rate": 7.873007221692329e-06,
"loss": 0.8793,
"step": 4920
},
{
"epoch": 0.6314036885245902,
"grad_norm": 18.254060745239258,
"learning_rate": 7.845755552527593e-06,
"loss": 0.5832,
"step": 4930
},
{
"epoch": 0.6326844262295082,
"grad_norm": 1.1032278537750244,
"learning_rate": 7.818503883362857e-06,
"loss": 0.6085,
"step": 4940
},
{
"epoch": 0.6339651639344263,
"grad_norm": 4.0901360511779785,
"learning_rate": 7.79125221419812e-06,
"loss": 0.7917,
"step": 4950
},
{
"epoch": 0.6352459016393442,
"grad_norm": 8.3672456741333,
"learning_rate": 7.764000545033384e-06,
"loss": 0.4282,
"step": 4960
},
{
"epoch": 0.6365266393442623,
"grad_norm": 25.113962173461914,
"learning_rate": 7.736748875868647e-06,
"loss": 0.719,
"step": 4970
},
{
"epoch": 0.6378073770491803,
"grad_norm": 16.38678741455078,
"learning_rate": 7.709497206703911e-06,
"loss": 0.8253,
"step": 4980
},
{
"epoch": 0.6390881147540983,
"grad_norm": 38.32978439331055,
"learning_rate": 7.682245537539174e-06,
"loss": 0.7423,
"step": 4990
},
{
"epoch": 0.6403688524590164,
"grad_norm": 36.88998794555664,
"learning_rate": 7.654993868374438e-06,
"loss": 0.6787,
"step": 5000
},
{
"epoch": 0.6416495901639344,
"grad_norm": 31.3937931060791,
"learning_rate": 7.627742199209703e-06,
"loss": 0.4184,
"step": 5010
},
{
"epoch": 0.6429303278688525,
"grad_norm": 43.30199432373047,
"learning_rate": 7.600490530044966e-06,
"loss": 0.7261,
"step": 5020
},
{
"epoch": 0.6442110655737705,
"grad_norm": 24.66848373413086,
"learning_rate": 7.5732388608802296e-06,
"loss": 0.782,
"step": 5030
},
{
"epoch": 0.6454918032786885,
"grad_norm": 0.7670093774795532,
"learning_rate": 7.545987191715494e-06,
"loss": 0.7826,
"step": 5040
},
{
"epoch": 0.6467725409836066,
"grad_norm": 28.53043556213379,
"learning_rate": 7.5187355225507565e-06,
"loss": 0.7738,
"step": 5050
},
{
"epoch": 0.6480532786885246,
"grad_norm": 39.68383026123047,
"learning_rate": 7.49148385338602e-06,
"loss": 0.6153,
"step": 5060
},
{
"epoch": 0.6493340163934426,
"grad_norm": 1.5401833057403564,
"learning_rate": 7.464232184221284e-06,
"loss": 0.4508,
"step": 5070
},
{
"epoch": 0.6506147540983607,
"grad_norm": 18.586135864257812,
"learning_rate": 7.436980515056548e-06,
"loss": 0.7714,
"step": 5080
},
{
"epoch": 0.6518954918032787,
"grad_norm": 4.915235996246338,
"learning_rate": 7.409728845891811e-06,
"loss": 0.7213,
"step": 5090
},
{
"epoch": 0.6531762295081968,
"grad_norm": 13.506136894226074,
"learning_rate": 7.3824771767270756e-06,
"loss": 0.8002,
"step": 5100
},
{
"epoch": 0.6544569672131147,
"grad_norm": 24.696321487426758,
"learning_rate": 7.355225507562339e-06,
"loss": 0.6098,
"step": 5110
},
{
"epoch": 0.6557377049180327,
"grad_norm": 1.202723503112793,
"learning_rate": 7.3279738383976025e-06,
"loss": 0.5947,
"step": 5120
},
{
"epoch": 0.6570184426229508,
"grad_norm": 27.57708168029785,
"learning_rate": 7.300722169232866e-06,
"loss": 0.7283,
"step": 5130
},
{
"epoch": 0.6582991803278688,
"grad_norm": 27.763059616088867,
"learning_rate": 7.2734705000681294e-06,
"loss": 0.5104,
"step": 5140
},
{
"epoch": 0.6595799180327869,
"grad_norm": 22.97685432434082,
"learning_rate": 7.246218830903393e-06,
"loss": 0.7658,
"step": 5150
},
{
"epoch": 0.6608606557377049,
"grad_norm": 20.47222137451172,
"learning_rate": 7.218967161738657e-06,
"loss": 0.3926,
"step": 5160
},
{
"epoch": 0.6621413934426229,
"grad_norm": 34.984249114990234,
"learning_rate": 7.191715492573921e-06,
"loss": 0.9199,
"step": 5170
},
{
"epoch": 0.663422131147541,
"grad_norm": 32.431888580322266,
"learning_rate": 7.164463823409184e-06,
"loss": 0.4709,
"step": 5180
},
{
"epoch": 0.664702868852459,
"grad_norm": 11.426986694335938,
"learning_rate": 7.1372121542444485e-06,
"loss": 0.8031,
"step": 5190
},
{
"epoch": 0.6659836065573771,
"grad_norm": 27.146059036254883,
"learning_rate": 7.109960485079712e-06,
"loss": 0.7132,
"step": 5200
},
{
"epoch": 0.6672643442622951,
"grad_norm": 11.636030197143555,
"learning_rate": 7.082708815914975e-06,
"loss": 0.5368,
"step": 5210
},
{
"epoch": 0.6685450819672131,
"grad_norm": 16.758148193359375,
"learning_rate": 7.055457146750239e-06,
"loss": 0.6143,
"step": 5220
},
{
"epoch": 0.6698258196721312,
"grad_norm": 0.33391350507736206,
"learning_rate": 7.028205477585502e-06,
"loss": 0.5793,
"step": 5230
},
{
"epoch": 0.6711065573770492,
"grad_norm": 25.285449981689453,
"learning_rate": 7.000953808420766e-06,
"loss": 0.64,
"step": 5240
},
{
"epoch": 0.6723872950819673,
"grad_norm": 1.447174072265625,
"learning_rate": 6.97370213925603e-06,
"loss": 0.8713,
"step": 5250
},
{
"epoch": 0.6736680327868853,
"grad_norm": 34.83108139038086,
"learning_rate": 6.946450470091294e-06,
"loss": 0.6408,
"step": 5260
},
{
"epoch": 0.6749487704918032,
"grad_norm": 13.1771821975708,
"learning_rate": 6.919198800926558e-06,
"loss": 0.6303,
"step": 5270
},
{
"epoch": 0.6762295081967213,
"grad_norm": 31.439207077026367,
"learning_rate": 6.8919471317618214e-06,
"loss": 0.6238,
"step": 5280
},
{
"epoch": 0.6775102459016393,
"grad_norm": 11.551750183105469,
"learning_rate": 6.864695462597084e-06,
"loss": 0.9247,
"step": 5290
},
{
"epoch": 0.6787909836065574,
"grad_norm": 18.42095947265625,
"learning_rate": 6.8374437934323475e-06,
"loss": 1.0127,
"step": 5300
},
{
"epoch": 0.6800717213114754,
"grad_norm": 0.4005849361419678,
"learning_rate": 6.810192124267612e-06,
"loss": 0.675,
"step": 5310
},
{
"epoch": 0.6813524590163934,
"grad_norm": 13.756119728088379,
"learning_rate": 6.782940455102875e-06,
"loss": 0.5458,
"step": 5320
},
{
"epoch": 0.6826331967213115,
"grad_norm": 15.997631072998047,
"learning_rate": 6.75568878593814e-06,
"loss": 0.4342,
"step": 5330
},
{
"epoch": 0.6839139344262295,
"grad_norm": 16.906126022338867,
"learning_rate": 6.728437116773403e-06,
"loss": 0.6665,
"step": 5340
},
{
"epoch": 0.6851946721311475,
"grad_norm": 12.170743942260742,
"learning_rate": 6.701185447608667e-06,
"loss": 0.9264,
"step": 5350
},
{
"epoch": 0.6864754098360656,
"grad_norm": 35.61259841918945,
"learning_rate": 6.673933778443931e-06,
"loss": 0.6735,
"step": 5360
},
{
"epoch": 0.6877561475409836,
"grad_norm": 13.542879104614258,
"learning_rate": 6.646682109279194e-06,
"loss": 0.8259,
"step": 5370
},
{
"epoch": 0.6890368852459017,
"grad_norm": 39.6423225402832,
"learning_rate": 6.619430440114457e-06,
"loss": 0.7956,
"step": 5380
},
{
"epoch": 0.6903176229508197,
"grad_norm": 30.907363891601562,
"learning_rate": 6.592178770949721e-06,
"loss": 0.7366,
"step": 5390
},
{
"epoch": 0.6915983606557377,
"grad_norm": 12.479640007019043,
"learning_rate": 6.564927101784985e-06,
"loss": 0.3273,
"step": 5400
},
{
"epoch": 0.6928790983606558,
"grad_norm": 19.15838623046875,
"learning_rate": 6.537675432620248e-06,
"loss": 0.4575,
"step": 5410
},
{
"epoch": 0.6941598360655737,
"grad_norm": 20.0745792388916,
"learning_rate": 6.510423763455513e-06,
"loss": 0.8541,
"step": 5420
},
{
"epoch": 0.6954405737704918,
"grad_norm": 30.12567901611328,
"learning_rate": 6.483172094290776e-06,
"loss": 0.3144,
"step": 5430
},
{
"epoch": 0.6967213114754098,
"grad_norm": 8.731266975402832,
"learning_rate": 6.4559204251260395e-06,
"loss": 0.4327,
"step": 5440
},
{
"epoch": 0.6980020491803278,
"grad_norm": 1.370941400527954,
"learning_rate": 6.428668755961304e-06,
"loss": 0.3519,
"step": 5450
},
{
"epoch": 0.6992827868852459,
"grad_norm": 28.71232795715332,
"learning_rate": 6.4014170867965665e-06,
"loss": 0.6712,
"step": 5460
},
{
"epoch": 0.7005635245901639,
"grad_norm": 20.623737335205078,
"learning_rate": 6.37416541763183e-06,
"loss": 0.6607,
"step": 5470
},
{
"epoch": 0.701844262295082,
"grad_norm": 7.713385581970215,
"learning_rate": 6.346913748467094e-06,
"loss": 0.5568,
"step": 5480
},
{
"epoch": 0.703125,
"grad_norm": 10.449071884155273,
"learning_rate": 6.319662079302358e-06,
"loss": 0.8054,
"step": 5490
},
{
"epoch": 0.704405737704918,
"grad_norm": 11.34548568725586,
"learning_rate": 6.292410410137621e-06,
"loss": 0.8166,
"step": 5500
},
{
"epoch": 0.7056864754098361,
"grad_norm": 2.661618947982788,
"learning_rate": 6.2651587409728855e-06,
"loss": 0.6567,
"step": 5510
},
{
"epoch": 0.7069672131147541,
"grad_norm": 4.278378486633301,
"learning_rate": 6.237907071808149e-06,
"loss": 0.7271,
"step": 5520
},
{
"epoch": 0.7082479508196722,
"grad_norm": 56.11579513549805,
"learning_rate": 6.210655402643413e-06,
"loss": 0.8394,
"step": 5530
},
{
"epoch": 0.7095286885245902,
"grad_norm": 25.923078536987305,
"learning_rate": 6.183403733478676e-06,
"loss": 0.7055,
"step": 5540
},
{
"epoch": 0.7108094262295082,
"grad_norm": 7.200451850891113,
"learning_rate": 6.156152064313939e-06,
"loss": 0.6715,
"step": 5550
},
{
"epoch": 0.7120901639344263,
"grad_norm": 25.070093154907227,
"learning_rate": 6.128900395149203e-06,
"loss": 0.6701,
"step": 5560
},
{
"epoch": 0.7133709016393442,
"grad_norm": 0.7995045781135559,
"learning_rate": 6.101648725984467e-06,
"loss": 0.7706,
"step": 5570
},
{
"epoch": 0.7146516393442623,
"grad_norm": 14.150104522705078,
"learning_rate": 6.074397056819731e-06,
"loss": 0.8404,
"step": 5580
},
{
"epoch": 0.7159323770491803,
"grad_norm": 21.669960021972656,
"learning_rate": 6.047145387654995e-06,
"loss": 0.5122,
"step": 5590
},
{
"epoch": 0.7172131147540983,
"grad_norm": 10.61308765411377,
"learning_rate": 6.0198937184902585e-06,
"loss": 0.7182,
"step": 5600
},
{
"epoch": 0.7184938524590164,
"grad_norm": 12.267438888549805,
"learning_rate": 5.992642049325522e-06,
"loss": 0.7212,
"step": 5610
},
{
"epoch": 0.7197745901639344,
"grad_norm": 12.50552749633789,
"learning_rate": 5.9653903801607846e-06,
"loss": 0.6373,
"step": 5620
},
{
"epoch": 0.7210553278688525,
"grad_norm": 3.3687191009521484,
"learning_rate": 5.938138710996049e-06,
"loss": 0.7845,
"step": 5630
},
{
"epoch": 0.7223360655737705,
"grad_norm": 4.029101848602295,
"learning_rate": 5.910887041831312e-06,
"loss": 0.6061,
"step": 5640
},
{
"epoch": 0.7236168032786885,
"grad_norm": 11.404590606689453,
"learning_rate": 5.883635372666576e-06,
"loss": 0.2602,
"step": 5650
},
{
"epoch": 0.7248975409836066,
"grad_norm": 14.377605438232422,
"learning_rate": 5.85638370350184e-06,
"loss": 0.6709,
"step": 5660
},
{
"epoch": 0.7261782786885246,
"grad_norm": 54.396888732910156,
"learning_rate": 5.829132034337104e-06,
"loss": 0.7768,
"step": 5670
},
{
"epoch": 0.7274590163934426,
"grad_norm": 11.300426483154297,
"learning_rate": 5.801880365172368e-06,
"loss": 0.6319,
"step": 5680
},
{
"epoch": 0.7287397540983607,
"grad_norm": 25.368356704711914,
"learning_rate": 5.774628696007631e-06,
"loss": 0.6695,
"step": 5690
},
{
"epoch": 0.7300204918032787,
"grad_norm": 8.80262279510498,
"learning_rate": 5.747377026842894e-06,
"loss": 0.8748,
"step": 5700
},
{
"epoch": 0.7313012295081968,
"grad_norm": 14.3671236038208,
"learning_rate": 5.7201253576781575e-06,
"loss": 0.7312,
"step": 5710
},
{
"epoch": 0.7325819672131147,
"grad_norm": 20.28556251525879,
"learning_rate": 5.692873688513422e-06,
"loss": 0.6096,
"step": 5720
},
{
"epoch": 0.7338627049180327,
"grad_norm": 22.88327980041504,
"learning_rate": 5.665622019348685e-06,
"loss": 1.0243,
"step": 5730
},
{
"epoch": 0.7351434426229508,
"grad_norm": 12.539216041564941,
"learning_rate": 5.63837035018395e-06,
"loss": 0.6279,
"step": 5740
},
{
"epoch": 0.7364241803278688,
"grad_norm": 37.97767639160156,
"learning_rate": 5.611118681019213e-06,
"loss": 0.7142,
"step": 5750
},
{
"epoch": 0.7377049180327869,
"grad_norm": 2.0420548915863037,
"learning_rate": 5.5838670118544766e-06,
"loss": 0.5773,
"step": 5760
},
{
"epoch": 0.7389856557377049,
"grad_norm": 12.780746459960938,
"learning_rate": 5.556615342689741e-06,
"loss": 0.6708,
"step": 5770
},
{
"epoch": 0.7402663934426229,
"grad_norm": 2.0761895179748535,
"learning_rate": 5.5293636735250035e-06,
"loss": 0.6491,
"step": 5780
},
{
"epoch": 0.741547131147541,
"grad_norm": 41.1733512878418,
"learning_rate": 5.502112004360267e-06,
"loss": 0.9564,
"step": 5790
},
{
"epoch": 0.742827868852459,
"grad_norm": 25.633703231811523,
"learning_rate": 5.474860335195531e-06,
"loss": 0.7985,
"step": 5800
},
{
"epoch": 0.7441086065573771,
"grad_norm": 9.461475372314453,
"learning_rate": 5.447608666030795e-06,
"loss": 0.5234,
"step": 5810
},
{
"epoch": 0.7453893442622951,
"grad_norm": 18.90468978881836,
"learning_rate": 5.420356996866058e-06,
"loss": 0.4353,
"step": 5820
},
{
"epoch": 0.7466700819672131,
"grad_norm": 8.587220191955566,
"learning_rate": 5.3931053277013226e-06,
"loss": 0.6629,
"step": 5830
},
{
"epoch": 0.7479508196721312,
"grad_norm": 15.917558670043945,
"learning_rate": 5.365853658536586e-06,
"loss": 0.7589,
"step": 5840
},
{
"epoch": 0.7492315573770492,
"grad_norm": 6.725412368774414,
"learning_rate": 5.3386019893718495e-06,
"loss": 0.5328,
"step": 5850
},
{
"epoch": 0.7505122950819673,
"grad_norm": 18.641759872436523,
"learning_rate": 5.311350320207113e-06,
"loss": 0.5993,
"step": 5860
},
{
"epoch": 0.7517930327868853,
"grad_norm": 30.297088623046875,
"learning_rate": 5.2840986510423764e-06,
"loss": 0.4351,
"step": 5870
},
{
"epoch": 0.7530737704918032,
"grad_norm": 22.469974517822266,
"learning_rate": 5.25684698187764e-06,
"loss": 0.5852,
"step": 5880
},
{
"epoch": 0.7543545081967213,
"grad_norm": 5.6571173667907715,
"learning_rate": 5.229595312712904e-06,
"loss": 0.6621,
"step": 5890
},
{
"epoch": 0.7556352459016393,
"grad_norm": 32.2354736328125,
"learning_rate": 5.202343643548168e-06,
"loss": 0.8106,
"step": 5900
},
{
"epoch": 0.7569159836065574,
"grad_norm": 15.729165077209473,
"learning_rate": 5.175091974383431e-06,
"loss": 0.519,
"step": 5910
},
{
"epoch": 0.7581967213114754,
"grad_norm": 20.02010726928711,
"learning_rate": 5.1478403052186955e-06,
"loss": 0.599,
"step": 5920
},
{
"epoch": 0.7594774590163934,
"grad_norm": 1.9774470329284668,
"learning_rate": 5.120588636053959e-06,
"loss": 0.3835,
"step": 5930
},
{
"epoch": 0.7607581967213115,
"grad_norm": 10.6248779296875,
"learning_rate": 5.093336966889222e-06,
"loss": 0.5819,
"step": 5940
},
{
"epoch": 0.7620389344262295,
"grad_norm": 8.844250679016113,
"learning_rate": 5.066085297724486e-06,
"loss": 0.524,
"step": 5950
},
{
"epoch": 0.7633196721311475,
"grad_norm": 24.882261276245117,
"learning_rate": 5.038833628559749e-06,
"loss": 0.5727,
"step": 5960
},
{
"epoch": 0.7646004098360656,
"grad_norm": 16.70749855041504,
"learning_rate": 5.011581959395013e-06,
"loss": 0.651,
"step": 5970
},
{
"epoch": 0.7658811475409836,
"grad_norm": 25.65505027770996,
"learning_rate": 4.984330290230277e-06,
"loss": 0.563,
"step": 5980
},
{
"epoch": 0.7671618852459017,
"grad_norm": 27.863927841186523,
"learning_rate": 4.957078621065541e-06,
"loss": 0.4771,
"step": 5990
},
{
"epoch": 0.7684426229508197,
"grad_norm": 0.7001621723175049,
"learning_rate": 4.929826951900804e-06,
"loss": 0.5382,
"step": 6000
},
{
"epoch": 0.7697233606557377,
"grad_norm": 16.65908432006836,
"learning_rate": 4.902575282736068e-06,
"loss": 0.829,
"step": 6010
},
{
"epoch": 0.7710040983606558,
"grad_norm": 6.999290943145752,
"learning_rate": 4.875323613571332e-06,
"loss": 0.3504,
"step": 6020
},
{
"epoch": 0.7722848360655737,
"grad_norm": 21.872570037841797,
"learning_rate": 4.848071944406595e-06,
"loss": 0.6681,
"step": 6030
},
{
"epoch": 0.7735655737704918,
"grad_norm": 12.923929214477539,
"learning_rate": 4.820820275241859e-06,
"loss": 0.7376,
"step": 6040
},
{
"epoch": 0.7748463114754098,
"grad_norm": 24.330562591552734,
"learning_rate": 4.793568606077122e-06,
"loss": 0.6243,
"step": 6050
},
{
"epoch": 0.7761270491803278,
"grad_norm": 9.132780075073242,
"learning_rate": 4.766316936912387e-06,
"loss": 0.9117,
"step": 6060
},
{
"epoch": 0.7774077868852459,
"grad_norm": 9.875121116638184,
"learning_rate": 4.73906526774765e-06,
"loss": 0.6056,
"step": 6070
},
{
"epoch": 0.7786885245901639,
"grad_norm": 14.28087329864502,
"learning_rate": 4.711813598582914e-06,
"loss": 0.8161,
"step": 6080
},
{
"epoch": 0.779969262295082,
"grad_norm": 4.369551658630371,
"learning_rate": 4.684561929418177e-06,
"loss": 0.5907,
"step": 6090
},
{
"epoch": 0.78125,
"grad_norm": 30.508066177368164,
"learning_rate": 4.6573102602534405e-06,
"loss": 0.5567,
"step": 6100
},
{
"epoch": 0.782530737704918,
"grad_norm": 24.87715721130371,
"learning_rate": 4.630058591088705e-06,
"loss": 0.6462,
"step": 6110
},
{
"epoch": 0.7838114754098361,
"grad_norm": 15.003620147705078,
"learning_rate": 4.602806921923968e-06,
"loss": 0.5864,
"step": 6120
},
{
"epoch": 0.7850922131147541,
"grad_norm": 21.42226219177246,
"learning_rate": 4.575555252759232e-06,
"loss": 0.7504,
"step": 6130
},
{
"epoch": 0.7863729508196722,
"grad_norm": 2.328996181488037,
"learning_rate": 4.548303583594495e-06,
"loss": 0.6912,
"step": 6140
},
{
"epoch": 0.7876536885245902,
"grad_norm": 10.392210960388184,
"learning_rate": 4.52105191442976e-06,
"loss": 0.8199,
"step": 6150
},
{
"epoch": 0.7889344262295082,
"grad_norm": 8.533187866210938,
"learning_rate": 4.493800245265023e-06,
"loss": 0.5175,
"step": 6160
},
{
"epoch": 0.7902151639344263,
"grad_norm": 11.740133285522461,
"learning_rate": 4.4665485761002865e-06,
"loss": 0.9367,
"step": 6170
},
{
"epoch": 0.7914959016393442,
"grad_norm": 26.58624267578125,
"learning_rate": 4.43929690693555e-06,
"loss": 0.6825,
"step": 6180
},
{
"epoch": 0.7927766393442623,
"grad_norm": 1.783715844154358,
"learning_rate": 4.412045237770814e-06,
"loss": 0.5531,
"step": 6190
},
{
"epoch": 0.7940573770491803,
"grad_norm": 10.153545379638672,
"learning_rate": 4.384793568606078e-06,
"loss": 0.7688,
"step": 6200
},
{
"epoch": 0.7953381147540983,
"grad_norm": 12.468855857849121,
"learning_rate": 4.357541899441341e-06,
"loss": 0.5524,
"step": 6210
},
{
"epoch": 0.7966188524590164,
"grad_norm": 3.6368038654327393,
"learning_rate": 4.330290230276605e-06,
"loss": 0.6093,
"step": 6220
},
{
"epoch": 0.7978995901639344,
"grad_norm": 12.435820579528809,
"learning_rate": 4.303038561111868e-06,
"loss": 0.6865,
"step": 6230
},
{
"epoch": 0.7991803278688525,
"grad_norm": 9.179264068603516,
"learning_rate": 4.2757868919471325e-06,
"loss": 0.6233,
"step": 6240
},
{
"epoch": 0.8004610655737705,
"grad_norm": 46.63306427001953,
"learning_rate": 4.248535222782396e-06,
"loss": 0.5358,
"step": 6250
},
{
"epoch": 0.8017418032786885,
"grad_norm": 7.405709266662598,
"learning_rate": 4.2212835536176595e-06,
"loss": 0.5872,
"step": 6260
},
{
"epoch": 0.8030225409836066,
"grad_norm": 20.083263397216797,
"learning_rate": 4.194031884452923e-06,
"loss": 0.4936,
"step": 6270
},
{
"epoch": 0.8043032786885246,
"grad_norm": 2.6786341667175293,
"learning_rate": 4.166780215288187e-06,
"loss": 0.6548,
"step": 6280
},
{
"epoch": 0.8055840163934426,
"grad_norm": 13.946334838867188,
"learning_rate": 4.13952854612345e-06,
"loss": 0.4651,
"step": 6290
},
{
"epoch": 0.8068647540983607,
"grad_norm": 39.37618637084961,
"learning_rate": 4.112276876958714e-06,
"loss": 0.7712,
"step": 6300
},
{
"epoch": 0.8081454918032787,
"grad_norm": 18.16588020324707,
"learning_rate": 4.085025207793978e-06,
"loss": 0.7139,
"step": 6310
},
{
"epoch": 0.8094262295081968,
"grad_norm": 12.700222969055176,
"learning_rate": 4.057773538629242e-06,
"loss": 0.5219,
"step": 6320
},
{
"epoch": 0.8107069672131147,
"grad_norm": 28.98236656188965,
"learning_rate": 4.030521869464505e-06,
"loss": 0.7143,
"step": 6330
},
{
"epoch": 0.8119877049180327,
"grad_norm": 24.590084075927734,
"learning_rate": 4.003270200299769e-06,
"loss": 0.46,
"step": 6340
},
{
"epoch": 0.8132684426229508,
"grad_norm": 24.325733184814453,
"learning_rate": 3.976018531135032e-06,
"loss": 0.7554,
"step": 6350
},
{
"epoch": 0.8145491803278688,
"grad_norm": 8.794258117675781,
"learning_rate": 3.948766861970296e-06,
"loss": 0.4404,
"step": 6360
},
{
"epoch": 0.8158299180327869,
"grad_norm": 0.7277682423591614,
"learning_rate": 3.921515192805559e-06,
"loss": 0.6449,
"step": 6370
},
{
"epoch": 0.8171106557377049,
"grad_norm": 22.101137161254883,
"learning_rate": 3.894263523640824e-06,
"loss": 0.7285,
"step": 6380
},
{
"epoch": 0.8183913934426229,
"grad_norm": 22.26101303100586,
"learning_rate": 3.867011854476087e-06,
"loss": 0.7699,
"step": 6390
},
{
"epoch": 0.819672131147541,
"grad_norm": 17.823871612548828,
"learning_rate": 3.839760185311351e-06,
"loss": 0.6389,
"step": 6400
},
{
"epoch": 0.820952868852459,
"grad_norm": 35.937286376953125,
"learning_rate": 3.812508516146614e-06,
"loss": 0.7776,
"step": 6410
},
{
"epoch": 0.8222336065573771,
"grad_norm": 1.8482409715652466,
"learning_rate": 3.785256846981878e-06,
"loss": 0.7117,
"step": 6420
},
{
"epoch": 0.8235143442622951,
"grad_norm": 1.5273475646972656,
"learning_rate": 3.758005177817142e-06,
"loss": 0.8126,
"step": 6430
},
{
"epoch": 0.8247950819672131,
"grad_norm": 17.53533935546875,
"learning_rate": 3.7307535086524054e-06,
"loss": 0.8421,
"step": 6440
},
{
"epoch": 0.8260758196721312,
"grad_norm": 9.50154972076416,
"learning_rate": 3.703501839487669e-06,
"loss": 0.5142,
"step": 6450
},
{
"epoch": 0.8273565573770492,
"grad_norm": 12.528085708618164,
"learning_rate": 3.6762501703229327e-06,
"loss": 0.7004,
"step": 6460
},
{
"epoch": 0.8286372950819673,
"grad_norm": 19.719446182250977,
"learning_rate": 3.648998501158196e-06,
"loss": 0.5631,
"step": 6470
},
{
"epoch": 0.8299180327868853,
"grad_norm": 21.097314834594727,
"learning_rate": 3.62174683199346e-06,
"loss": 0.6893,
"step": 6480
},
{
"epoch": 0.8311987704918032,
"grad_norm": 9.299731254577637,
"learning_rate": 3.594495162828723e-06,
"loss": 0.389,
"step": 6490
},
{
"epoch": 0.8324795081967213,
"grad_norm": 5.358484268188477,
"learning_rate": 3.567243493663987e-06,
"loss": 0.59,
"step": 6500
},
{
"epoch": 0.8337602459016393,
"grad_norm": 17.12688446044922,
"learning_rate": 3.539991824499251e-06,
"loss": 0.4049,
"step": 6510
},
{
"epoch": 0.8350409836065574,
"grad_norm": 22.643938064575195,
"learning_rate": 3.512740155334515e-06,
"loss": 0.5396,
"step": 6520
},
{
"epoch": 0.8363217213114754,
"grad_norm": 15.25439167022705,
"learning_rate": 3.485488486169778e-06,
"loss": 0.7493,
"step": 6530
},
{
"epoch": 0.8376024590163934,
"grad_norm": 0.7836318016052246,
"learning_rate": 3.4582368170050418e-06,
"loss": 0.6233,
"step": 6540
},
{
"epoch": 0.8388831967213115,
"grad_norm": 7.646884918212891,
"learning_rate": 3.4309851478403057e-06,
"loss": 0.6084,
"step": 6550
},
{
"epoch": 0.8401639344262295,
"grad_norm": 0.6499843001365662,
"learning_rate": 3.4037334786755696e-06,
"loss": 0.6373,
"step": 6560
},
{
"epoch": 0.8414446721311475,
"grad_norm": 4.813564777374268,
"learning_rate": 3.3764818095108326e-06,
"loss": 0.8403,
"step": 6570
},
{
"epoch": 0.8427254098360656,
"grad_norm": 20.393510818481445,
"learning_rate": 3.3492301403460965e-06,
"loss": 0.7535,
"step": 6580
},
{
"epoch": 0.8440061475409836,
"grad_norm": 15.966805458068848,
"learning_rate": 3.32197847118136e-06,
"loss": 0.8566,
"step": 6590
},
{
"epoch": 0.8452868852459017,
"grad_norm": 4.333749294281006,
"learning_rate": 3.294726802016624e-06,
"loss": 0.742,
"step": 6600
},
{
"epoch": 0.8465676229508197,
"grad_norm": 10.89919376373291,
"learning_rate": 3.2674751328518873e-06,
"loss": 0.5804,
"step": 6610
},
{
"epoch": 0.8478483606557377,
"grad_norm": 9.388509750366211,
"learning_rate": 3.240223463687151e-06,
"loss": 0.5165,
"step": 6620
},
{
"epoch": 0.8491290983606558,
"grad_norm": 4.184054374694824,
"learning_rate": 3.2129717945224147e-06,
"loss": 0.5183,
"step": 6630
},
{
"epoch": 0.8504098360655737,
"grad_norm": 7.253784656524658,
"learning_rate": 3.1857201253576786e-06,
"loss": 0.7198,
"step": 6640
},
{
"epoch": 0.8516905737704918,
"grad_norm": 11.86843490600586,
"learning_rate": 3.1584684561929417e-06,
"loss": 0.4899,
"step": 6650
},
{
"epoch": 0.8529713114754098,
"grad_norm": 10.385624885559082,
"learning_rate": 3.1312167870282056e-06,
"loss": 0.8167,
"step": 6660
},
{
"epoch": 0.8542520491803278,
"grad_norm": 21.208568572998047,
"learning_rate": 3.1039651178634695e-06,
"loss": 0.5819,
"step": 6670
},
{
"epoch": 0.8555327868852459,
"grad_norm": 0.44370558857917786,
"learning_rate": 3.0767134486987333e-06,
"loss": 0.7301,
"step": 6680
},
{
"epoch": 0.8568135245901639,
"grad_norm": 8.252354621887207,
"learning_rate": 3.0494617795339964e-06,
"loss": 0.488,
"step": 6690
},
{
"epoch": 0.858094262295082,
"grad_norm": 13.996326446533203,
"learning_rate": 3.0222101103692603e-06,
"loss": 0.7691,
"step": 6700
},
{
"epoch": 0.859375,
"grad_norm": 35.99543380737305,
"learning_rate": 2.994958441204524e-06,
"loss": 0.5266,
"step": 6710
},
{
"epoch": 0.860655737704918,
"grad_norm": 19.631608963012695,
"learning_rate": 2.9677067720397877e-06,
"loss": 0.9918,
"step": 6720
},
{
"epoch": 0.8619364754098361,
"grad_norm": 33.22177505493164,
"learning_rate": 2.940455102875051e-06,
"loss": 0.6044,
"step": 6730
},
{
"epoch": 0.8632172131147541,
"grad_norm": 20.986621856689453,
"learning_rate": 2.913203433710315e-06,
"loss": 0.3522,
"step": 6740
},
{
"epoch": 0.8644979508196722,
"grad_norm": 14.024015426635742,
"learning_rate": 2.8859517645455785e-06,
"loss": 0.6916,
"step": 6750
},
{
"epoch": 0.8657786885245902,
"grad_norm": 11.796330451965332,
"learning_rate": 2.8587000953808424e-06,
"loss": 0.704,
"step": 6760
},
{
"epoch": 0.8670594262295082,
"grad_norm": 20.83628273010254,
"learning_rate": 2.831448426216106e-06,
"loss": 0.8603,
"step": 6770
},
{
"epoch": 0.8683401639344263,
"grad_norm": 18.570674896240234,
"learning_rate": 2.8041967570513693e-06,
"loss": 0.6714,
"step": 6780
},
{
"epoch": 0.8696209016393442,
"grad_norm": 8.486098289489746,
"learning_rate": 2.7769450878866332e-06,
"loss": 0.7107,
"step": 6790
},
{
"epoch": 0.8709016393442623,
"grad_norm": 2.3732173442840576,
"learning_rate": 2.749693418721897e-06,
"loss": 0.7988,
"step": 6800
},
{
"epoch": 0.8721823770491803,
"grad_norm": 2.5915911197662354,
"learning_rate": 2.72244174955716e-06,
"loss": 0.6847,
"step": 6810
},
{
"epoch": 0.8734631147540983,
"grad_norm": 30.59233856201172,
"learning_rate": 2.695190080392424e-06,
"loss": 0.6228,
"step": 6820
},
{
"epoch": 0.8747438524590164,
"grad_norm": 9.502323150634766,
"learning_rate": 2.667938411227688e-06,
"loss": 0.5615,
"step": 6830
},
{
"epoch": 0.8760245901639344,
"grad_norm": 17.929569244384766,
"learning_rate": 2.640686742062952e-06,
"loss": 0.5991,
"step": 6840
},
{
"epoch": 0.8773053278688525,
"grad_norm": 14.03685474395752,
"learning_rate": 2.613435072898215e-06,
"loss": 0.7148,
"step": 6850
},
{
"epoch": 0.8785860655737705,
"grad_norm": 14.739727020263672,
"learning_rate": 2.586183403733479e-06,
"loss": 0.5939,
"step": 6860
},
{
"epoch": 0.8798668032786885,
"grad_norm": 13.458857536315918,
"learning_rate": 2.5589317345687427e-06,
"loss": 0.5892,
"step": 6870
},
{
"epoch": 0.8811475409836066,
"grad_norm": 13.960780143737793,
"learning_rate": 2.531680065404006e-06,
"loss": 0.5841,
"step": 6880
},
{
"epoch": 0.8824282786885246,
"grad_norm": 13.514850616455078,
"learning_rate": 2.5044283962392696e-06,
"loss": 0.5341,
"step": 6890
},
{
"epoch": 0.8837090163934426,
"grad_norm": 12.330262184143066,
"learning_rate": 2.4771767270745335e-06,
"loss": 0.8722,
"step": 6900
},
{
"epoch": 0.8849897540983607,
"grad_norm": 14.698627471923828,
"learning_rate": 2.449925057909797e-06,
"loss": 0.3832,
"step": 6910
},
{
"epoch": 0.8862704918032787,
"grad_norm": 25.05308723449707,
"learning_rate": 2.4226733887450605e-06,
"loss": 0.9005,
"step": 6920
},
{
"epoch": 0.8875512295081968,
"grad_norm": 16.247404098510742,
"learning_rate": 2.3954217195803244e-06,
"loss": 0.4334,
"step": 6930
},
{
"epoch": 0.8888319672131147,
"grad_norm": 24.826126098632812,
"learning_rate": 2.3681700504155883e-06,
"loss": 0.4239,
"step": 6940
},
{
"epoch": 0.8901127049180327,
"grad_norm": 30.12708282470703,
"learning_rate": 2.3409183812508517e-06,
"loss": 0.9281,
"step": 6950
},
{
"epoch": 0.8913934426229508,
"grad_norm": 33.377967834472656,
"learning_rate": 2.3136667120861156e-06,
"loss": 1.0247,
"step": 6960
},
{
"epoch": 0.8926741803278688,
"grad_norm": 9.090696334838867,
"learning_rate": 2.286415042921379e-06,
"loss": 0.6998,
"step": 6970
},
{
"epoch": 0.8939549180327869,
"grad_norm": 18.32761001586914,
"learning_rate": 2.259163373756643e-06,
"loss": 0.6415,
"step": 6980
},
{
"epoch": 0.8952356557377049,
"grad_norm": 3.1769232749938965,
"learning_rate": 2.2319117045919065e-06,
"loss": 0.5628,
"step": 6990
},
{
"epoch": 0.8965163934426229,
"grad_norm": 11.886983871459961,
"learning_rate": 2.2046600354271704e-06,
"loss": 0.9674,
"step": 7000
},
{
"epoch": 0.897797131147541,
"grad_norm": 2.503143072128296,
"learning_rate": 2.177408366262434e-06,
"loss": 0.5693,
"step": 7010
},
{
"epoch": 0.899077868852459,
"grad_norm": 29.00408935546875,
"learning_rate": 2.1501566970976973e-06,
"loss": 0.6684,
"step": 7020
},
{
"epoch": 0.9003586065573771,
"grad_norm": 14.518806457519531,
"learning_rate": 2.1229050279329612e-06,
"loss": 0.8301,
"step": 7030
},
{
"epoch": 0.9016393442622951,
"grad_norm": 46.252830505371094,
"learning_rate": 2.0956533587682247e-06,
"loss": 1.0172,
"step": 7040
},
{
"epoch": 0.9029200819672131,
"grad_norm": 19.148435592651367,
"learning_rate": 2.068401689603488e-06,
"loss": 0.7153,
"step": 7050
},
{
"epoch": 0.9042008196721312,
"grad_norm": 17.86318588256836,
"learning_rate": 2.041150020438752e-06,
"loss": 0.7907,
"step": 7060
},
{
"epoch": 0.9054815573770492,
"grad_norm": 14.341056823730469,
"learning_rate": 2.0138983512740155e-06,
"loss": 0.4611,
"step": 7070
},
{
"epoch": 0.9067622950819673,
"grad_norm": 8.442182540893555,
"learning_rate": 1.9866466821092794e-06,
"loss": 0.8596,
"step": 7080
},
{
"epoch": 0.9080430327868853,
"grad_norm": 15.53111743927002,
"learning_rate": 1.959395012944543e-06,
"loss": 0.5854,
"step": 7090
},
{
"epoch": 0.9093237704918032,
"grad_norm": 8.210785865783691,
"learning_rate": 1.932143343779807e-06,
"loss": 0.855,
"step": 7100
},
{
"epoch": 0.9106045081967213,
"grad_norm": 11.097797393798828,
"learning_rate": 1.9048916746150703e-06,
"loss": 0.7989,
"step": 7110
},
{
"epoch": 0.9118852459016393,
"grad_norm": 6.103325843811035,
"learning_rate": 1.8776400054503342e-06,
"loss": 0.4565,
"step": 7120
},
{
"epoch": 0.9131659836065574,
"grad_norm": 15.080409049987793,
"learning_rate": 1.8503883362855976e-06,
"loss": 0.4197,
"step": 7130
},
{
"epoch": 0.9144467213114754,
"grad_norm": 23.386219024658203,
"learning_rate": 1.8231366671208613e-06,
"loss": 0.6161,
"step": 7140
},
{
"epoch": 0.9157274590163934,
"grad_norm": 13.018634796142578,
"learning_rate": 1.795884997956125e-06,
"loss": 0.4554,
"step": 7150
},
{
"epoch": 0.9170081967213115,
"grad_norm": 9.674510955810547,
"learning_rate": 1.7686333287913887e-06,
"loss": 0.7063,
"step": 7160
},
{
"epoch": 0.9182889344262295,
"grad_norm": 13.369217872619629,
"learning_rate": 1.7413816596266522e-06,
"loss": 0.6227,
"step": 7170
},
{
"epoch": 0.9195696721311475,
"grad_norm": 19.81302833557129,
"learning_rate": 1.714129990461916e-06,
"loss": 0.5844,
"step": 7180
},
{
"epoch": 0.9208504098360656,
"grad_norm": 13.579237937927246,
"learning_rate": 1.6868783212971795e-06,
"loss": 0.632,
"step": 7190
},
{
"epoch": 0.9221311475409836,
"grad_norm": 9.165477752685547,
"learning_rate": 1.6596266521324434e-06,
"loss": 0.5509,
"step": 7200
},
{
"epoch": 0.9234118852459017,
"grad_norm": 18.232845306396484,
"learning_rate": 1.6323749829677069e-06,
"loss": 0.6403,
"step": 7210
},
{
"epoch": 0.9246926229508197,
"grad_norm": 18.56736946105957,
"learning_rate": 1.6051233138029706e-06,
"loss": 0.7943,
"step": 7220
},
{
"epoch": 0.9259733606557377,
"grad_norm": 8.743745803833008,
"learning_rate": 1.5778716446382343e-06,
"loss": 0.4279,
"step": 7230
},
{
"epoch": 0.9272540983606558,
"grad_norm": 2.6923177242279053,
"learning_rate": 1.550619975473498e-06,
"loss": 0.5608,
"step": 7240
},
{
"epoch": 0.9285348360655737,
"grad_norm": 29.790340423583984,
"learning_rate": 1.5233683063087614e-06,
"loss": 0.4361,
"step": 7250
},
{
"epoch": 0.9298155737704918,
"grad_norm": 1.7628939151763916,
"learning_rate": 1.4961166371440253e-06,
"loss": 0.6859,
"step": 7260
},
{
"epoch": 0.9310963114754098,
"grad_norm": 10.456538200378418,
"learning_rate": 1.4688649679792888e-06,
"loss": 0.7482,
"step": 7270
},
{
"epoch": 0.9323770491803278,
"grad_norm": 28.223440170288086,
"learning_rate": 1.4416132988145527e-06,
"loss": 0.675,
"step": 7280
},
{
"epoch": 0.9336577868852459,
"grad_norm": 6.400082111358643,
"learning_rate": 1.4143616296498161e-06,
"loss": 0.6709,
"step": 7290
},
{
"epoch": 0.9349385245901639,
"grad_norm": 16.48478889465332,
"learning_rate": 1.3871099604850798e-06,
"loss": 0.3667,
"step": 7300
},
{
"epoch": 0.936219262295082,
"grad_norm": 14.860025405883789,
"learning_rate": 1.3598582913203435e-06,
"loss": 0.7024,
"step": 7310
},
{
"epoch": 0.9375,
"grad_norm": 14.933452606201172,
"learning_rate": 1.3326066221556072e-06,
"loss": 0.6838,
"step": 7320
},
{
"epoch": 0.938780737704918,
"grad_norm": 23.65451431274414,
"learning_rate": 1.3053549529908707e-06,
"loss": 0.5882,
"step": 7330
},
{
"epoch": 0.9400614754098361,
"grad_norm": 23.98202133178711,
"learning_rate": 1.2781032838261346e-06,
"loss": 0.7442,
"step": 7340
},
{
"epoch": 0.9413422131147541,
"grad_norm": 38.25538635253906,
"learning_rate": 1.250851614661398e-06,
"loss": 0.6544,
"step": 7350
},
{
"epoch": 0.9426229508196722,
"grad_norm": 1.7557686567306519,
"learning_rate": 1.223599945496662e-06,
"loss": 0.3867,
"step": 7360
},
{
"epoch": 0.9439036885245902,
"grad_norm": 10.53632926940918,
"learning_rate": 1.1963482763319254e-06,
"loss": 0.9491,
"step": 7370
},
{
"epoch": 0.9451844262295082,
"grad_norm": 8.34455680847168,
"learning_rate": 1.169096607167189e-06,
"loss": 0.4885,
"step": 7380
},
{
"epoch": 0.9464651639344263,
"grad_norm": 3.460608720779419,
"learning_rate": 1.1418449380024528e-06,
"loss": 0.4286,
"step": 7390
},
{
"epoch": 0.9477459016393442,
"grad_norm": 20.152204513549805,
"learning_rate": 1.1145932688377165e-06,
"loss": 0.7888,
"step": 7400
},
{
"epoch": 0.9490266393442623,
"grad_norm": 12.72758960723877,
"learning_rate": 1.0873415996729801e-06,
"loss": 0.6784,
"step": 7410
},
{
"epoch": 0.9503073770491803,
"grad_norm": 13.164525985717773,
"learning_rate": 1.0600899305082438e-06,
"loss": 0.3325,
"step": 7420
},
{
"epoch": 0.9515881147540983,
"grad_norm": 15.550426483154297,
"learning_rate": 1.0328382613435075e-06,
"loss": 0.5533,
"step": 7430
},
{
"epoch": 0.9528688524590164,
"grad_norm": 4.542503356933594,
"learning_rate": 1.005586592178771e-06,
"loss": 0.5264,
"step": 7440
},
{
"epoch": 0.9541495901639344,
"grad_norm": 22.304424285888672,
"learning_rate": 9.783349230140347e-07,
"loss": 0.7576,
"step": 7450
},
{
"epoch": 0.9554303278688525,
"grad_norm": 24.396604537963867,
"learning_rate": 9.510832538492983e-07,
"loss": 0.6932,
"step": 7460
},
{
"epoch": 0.9567110655737705,
"grad_norm": 5.150862216949463,
"learning_rate": 9.23831584684562e-07,
"loss": 0.5392,
"step": 7470
},
{
"epoch": 0.9579918032786885,
"grad_norm": 6.6292829513549805,
"learning_rate": 8.965799155198257e-07,
"loss": 0.43,
"step": 7480
},
{
"epoch": 0.9592725409836066,
"grad_norm": 35.094058990478516,
"learning_rate": 8.693282463550894e-07,
"loss": 0.4879,
"step": 7490
},
{
"epoch": 0.9605532786885246,
"grad_norm": 31.886293411254883,
"learning_rate": 8.42076577190353e-07,
"loss": 0.4554,
"step": 7500
},
{
"epoch": 0.9618340163934426,
"grad_norm": 10.12392807006836,
"learning_rate": 8.148249080256167e-07,
"loss": 0.8466,
"step": 7510
},
{
"epoch": 0.9631147540983607,
"grad_norm": 23.29629898071289,
"learning_rate": 7.875732388608803e-07,
"loss": 0.6954,
"step": 7520
},
{
"epoch": 0.9643954918032787,
"grad_norm": 34.42799758911133,
"learning_rate": 7.60321569696144e-07,
"loss": 0.4265,
"step": 7530
},
{
"epoch": 0.9656762295081968,
"grad_norm": 20.460311889648438,
"learning_rate": 7.330699005314076e-07,
"loss": 0.542,
"step": 7540
},
{
"epoch": 0.9669569672131147,
"grad_norm": 1.3875937461853027,
"learning_rate": 7.058182313666713e-07,
"loss": 0.6803,
"step": 7550
},
{
"epoch": 0.9682377049180327,
"grad_norm": 20.104841232299805,
"learning_rate": 6.78566562201935e-07,
"loss": 0.8385,
"step": 7560
},
{
"epoch": 0.9695184426229508,
"grad_norm": 17.50690269470215,
"learning_rate": 6.513148930371987e-07,
"loss": 0.9916,
"step": 7570
},
{
"epoch": 0.9707991803278688,
"grad_norm": 72.95804595947266,
"learning_rate": 6.240632238724622e-07,
"loss": 0.8251,
"step": 7580
},
{
"epoch": 0.9720799180327869,
"grad_norm": 11.275779724121094,
"learning_rate": 5.968115547077259e-07,
"loss": 0.4506,
"step": 7590
},
{
"epoch": 0.9733606557377049,
"grad_norm": 20.942705154418945,
"learning_rate": 5.695598855429896e-07,
"loss": 0.6638,
"step": 7600
},
{
"epoch": 0.9746413934426229,
"grad_norm": 8.423453330993652,
"learning_rate": 5.423082163782532e-07,
"loss": 0.6953,
"step": 7610
},
{
"epoch": 0.975922131147541,
"grad_norm": 24.83681297302246,
"learning_rate": 5.150565472135169e-07,
"loss": 0.7618,
"step": 7620
},
{
"epoch": 0.977202868852459,
"grad_norm": 18.958438873291016,
"learning_rate": 4.878048780487805e-07,
"loss": 0.734,
"step": 7630
},
{
"epoch": 0.9784836065573771,
"grad_norm": 12.136439323425293,
"learning_rate": 4.605532088840442e-07,
"loss": 0.5637,
"step": 7640
},
{
"epoch": 0.9797643442622951,
"grad_norm": 7.522444725036621,
"learning_rate": 4.3330153971930786e-07,
"loss": 0.8323,
"step": 7650
},
{
"epoch": 0.9810450819672131,
"grad_norm": 34.33516311645508,
"learning_rate": 4.060498705545715e-07,
"loss": 0.591,
"step": 7660
},
{
"epoch": 0.9823258196721312,
"grad_norm": 6.395289421081543,
"learning_rate": 3.787982013898352e-07,
"loss": 0.5533,
"step": 7670
},
{
"epoch": 0.9836065573770492,
"grad_norm": 7.777110576629639,
"learning_rate": 3.515465322250988e-07,
"loss": 0.7885,
"step": 7680
},
{
"epoch": 0.9848872950819673,
"grad_norm": 18.54967498779297,
"learning_rate": 3.242948630603625e-07,
"loss": 0.5966,
"step": 7690
},
{
"epoch": 0.9861680327868853,
"grad_norm": 13.985085487365723,
"learning_rate": 2.970431938956261e-07,
"loss": 0.7105,
"step": 7700
},
{
"epoch": 0.9874487704918032,
"grad_norm": 37.31953811645508,
"learning_rate": 2.697915247308898e-07,
"loss": 0.8736,
"step": 7710
},
{
"epoch": 0.9887295081967213,
"grad_norm": 9.107115745544434,
"learning_rate": 2.4253985556615344e-07,
"loss": 0.699,
"step": 7720
},
{
"epoch": 0.9900102459016393,
"grad_norm": 14.522866249084473,
"learning_rate": 2.152881864014171e-07,
"loss": 0.7096,
"step": 7730
},
{
"epoch": 0.9912909836065574,
"grad_norm": 12.966835975646973,
"learning_rate": 1.8803651723668075e-07,
"loss": 0.6627,
"step": 7740
},
{
"epoch": 0.9925717213114754,
"grad_norm": 33.506622314453125,
"learning_rate": 1.607848480719444e-07,
"loss": 0.6537,
"step": 7750
},
{
"epoch": 0.9938524590163934,
"grad_norm": 14.853964805603027,
"learning_rate": 1.3353317890720807e-07,
"loss": 0.4942,
"step": 7760
},
{
"epoch": 0.9951331967213115,
"grad_norm": 5.332017421722412,
"learning_rate": 1.0628150974247172e-07,
"loss": 0.4523,
"step": 7770
},
{
"epoch": 0.9964139344262295,
"grad_norm": 24.917579650878906,
"learning_rate": 7.902984057773541e-08,
"loss": 0.706,
"step": 7780
},
{
"epoch": 0.9976946721311475,
"grad_norm": 21.65096664428711,
"learning_rate": 5.177817141299905e-08,
"loss": 0.4827,
"step": 7790
},
{
"epoch": 0.9989754098360656,
"grad_norm": 3.226344347000122,
"learning_rate": 2.452650224826271e-08,
"loss": 0.8295,
"step": 7800
}
],
"logging_steps": 10,
"max_steps": 7808,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8217558262480896.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}