CocoRoF's picture
Training in progress, step 13748, checkpoint
d20677c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.99692914763958,
"eval_steps": 500,
"global_step": 13748,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005091568680488154,
"grad_norm": 0.7564847469329834,
"learning_rate": 4.9999857936989376e-05,
"loss": 0.6957,
"step": 10
},
{
"epoch": 0.010183137360976308,
"grad_norm": 0.3457886576652527,
"learning_rate": 4.999971587397875e-05,
"loss": 0.6941,
"step": 20
},
{
"epoch": 0.015274706041464463,
"grad_norm": 0.2739073932170868,
"learning_rate": 4.999957381096812e-05,
"loss": 0.6921,
"step": 30
},
{
"epoch": 0.020366274721952616,
"grad_norm": 0.2826838195323944,
"learning_rate": 4.9999431747957495e-05,
"loss": 0.6932,
"step": 40
},
{
"epoch": 0.02545784340244077,
"grad_norm": 0.1553473025560379,
"learning_rate": 4.999928968494687e-05,
"loss": 0.6933,
"step": 50
},
{
"epoch": 0.030549412082928926,
"grad_norm": 0.17130891978740692,
"learning_rate": 4.9999147621936234e-05,
"loss": 0.6916,
"step": 60
},
{
"epoch": 0.03564098076341708,
"grad_norm": 0.1938512623310089,
"learning_rate": 4.999900555892561e-05,
"loss": 0.6919,
"step": 70
},
{
"epoch": 0.04073254944390523,
"grad_norm": 0.2490479201078415,
"learning_rate": 4.999886349591498e-05,
"loss": 0.6907,
"step": 80
},
{
"epoch": 0.04582411812439339,
"grad_norm": 0.20010128617286682,
"learning_rate": 4.9998721432904354e-05,
"loss": 0.691,
"step": 90
},
{
"epoch": 0.05091568680488154,
"grad_norm": 0.16262651979923248,
"learning_rate": 4.999857936989373e-05,
"loss": 0.6917,
"step": 100
},
{
"epoch": 0.056007255485369695,
"grad_norm": 0.21441149711608887,
"learning_rate": 4.999843730688309e-05,
"loss": 0.6872,
"step": 110
},
{
"epoch": 0.06109882416585785,
"grad_norm": 0.20518313348293304,
"learning_rate": 4.9998295243872466e-05,
"loss": 0.6899,
"step": 120
},
{
"epoch": 0.06619039284634601,
"grad_norm": 0.2965710759162903,
"learning_rate": 4.999815318086184e-05,
"loss": 0.6869,
"step": 130
},
{
"epoch": 0.07128196152683416,
"grad_norm": 0.269436776638031,
"learning_rate": 4.999801111785121e-05,
"loss": 0.6893,
"step": 140
},
{
"epoch": 0.07637353020732231,
"grad_norm": 0.2355806529521942,
"learning_rate": 4.9997869054840585e-05,
"loss": 0.688,
"step": 150
},
{
"epoch": 0.08146509888781046,
"grad_norm": 0.2859913408756256,
"learning_rate": 4.999772699182996e-05,
"loss": 0.6882,
"step": 160
},
{
"epoch": 0.08655666756829862,
"grad_norm": 0.22986085712909698,
"learning_rate": 4.999758492881933e-05,
"loss": 0.6892,
"step": 170
},
{
"epoch": 0.09164823624878678,
"grad_norm": 0.1777602881193161,
"learning_rate": 4.9997442865808705e-05,
"loss": 0.6891,
"step": 180
},
{
"epoch": 0.09673980492927493,
"grad_norm": 0.24697603285312653,
"learning_rate": 4.999730080279808e-05,
"loss": 0.6901,
"step": 190
},
{
"epoch": 0.10183137360976308,
"grad_norm": 0.24384862184524536,
"learning_rate": 4.9997158739787444e-05,
"loss": 0.6864,
"step": 200
},
{
"epoch": 0.10692294229025123,
"grad_norm": 0.22532877326011658,
"learning_rate": 4.999701667677682e-05,
"loss": 0.6925,
"step": 210
},
{
"epoch": 0.11201451097073939,
"grad_norm": 0.33894965052604675,
"learning_rate": 4.999687461376619e-05,
"loss": 0.69,
"step": 220
},
{
"epoch": 0.11710607965122755,
"grad_norm": 0.1438864767551422,
"learning_rate": 4.9996732550755563e-05,
"loss": 0.6905,
"step": 230
},
{
"epoch": 0.1221976483317157,
"grad_norm": 0.1356513947248459,
"learning_rate": 4.9996590487744937e-05,
"loss": 0.6895,
"step": 240
},
{
"epoch": 0.12728921701220386,
"grad_norm": 0.11220791190862656,
"learning_rate": 4.999644842473431e-05,
"loss": 0.6889,
"step": 250
},
{
"epoch": 0.13238078569269202,
"grad_norm": 0.18669378757476807,
"learning_rate": 4.999630636172368e-05,
"loss": 0.6877,
"step": 260
},
{
"epoch": 0.13747235437318017,
"grad_norm": 0.24448029696941376,
"learning_rate": 4.9996164298713056e-05,
"loss": 0.6821,
"step": 270
},
{
"epoch": 0.14256392305366833,
"grad_norm": 0.33900442719459534,
"learning_rate": 4.999602223570243e-05,
"loss": 0.6825,
"step": 280
},
{
"epoch": 0.14765549173415646,
"grad_norm": 0.20542432367801666,
"learning_rate": 4.99958801726918e-05,
"loss": 0.6835,
"step": 290
},
{
"epoch": 0.15274706041464461,
"grad_norm": 0.17819932103157043,
"learning_rate": 4.9995738109681175e-05,
"loss": 0.6875,
"step": 300
},
{
"epoch": 0.15783862909513277,
"grad_norm": 0.16522936522960663,
"learning_rate": 4.999559604667055e-05,
"loss": 0.6868,
"step": 310
},
{
"epoch": 0.16293019777562093,
"grad_norm": 0.1356010138988495,
"learning_rate": 4.9995453983659915e-05,
"loss": 0.6888,
"step": 320
},
{
"epoch": 0.16802176645610908,
"grad_norm": 0.20955336093902588,
"learning_rate": 4.999531192064929e-05,
"loss": 0.6827,
"step": 330
},
{
"epoch": 0.17311333513659724,
"grad_norm": 0.14871099591255188,
"learning_rate": 4.9995169857638654e-05,
"loss": 0.687,
"step": 340
},
{
"epoch": 0.1782049038170854,
"grad_norm": 0.1207965835928917,
"learning_rate": 4.999502779462803e-05,
"loss": 0.6871,
"step": 350
},
{
"epoch": 0.18329647249757355,
"grad_norm": 0.25459375977516174,
"learning_rate": 4.99948857316174e-05,
"loss": 0.6879,
"step": 360
},
{
"epoch": 0.1883880411780617,
"grad_norm": 0.12454749643802643,
"learning_rate": 4.9994743668606773e-05,
"loss": 0.689,
"step": 370
},
{
"epoch": 0.19347960985854987,
"grad_norm": 0.13212984800338745,
"learning_rate": 4.9994601605596147e-05,
"loss": 0.6827,
"step": 380
},
{
"epoch": 0.19857117853903802,
"grad_norm": 0.15493592619895935,
"learning_rate": 4.999445954258552e-05,
"loss": 0.6916,
"step": 390
},
{
"epoch": 0.20366274721952615,
"grad_norm": 0.1339859962463379,
"learning_rate": 4.999431747957489e-05,
"loss": 0.6862,
"step": 400
},
{
"epoch": 0.2087543159000143,
"grad_norm": 0.17246641218662262,
"learning_rate": 4.9994175416564266e-05,
"loss": 0.6915,
"step": 410
},
{
"epoch": 0.21384588458050247,
"grad_norm": 0.09907015413045883,
"learning_rate": 4.999403335355364e-05,
"loss": 0.6884,
"step": 420
},
{
"epoch": 0.21893745326099062,
"grad_norm": 0.13688722252845764,
"learning_rate": 4.999389129054301e-05,
"loss": 0.6894,
"step": 430
},
{
"epoch": 0.22402902194147878,
"grad_norm": 0.1572660207748413,
"learning_rate": 4.9993749227532385e-05,
"loss": 0.6857,
"step": 440
},
{
"epoch": 0.22912059062196694,
"grad_norm": 0.22315748035907745,
"learning_rate": 4.999360716452176e-05,
"loss": 0.6819,
"step": 450
},
{
"epoch": 0.2342121593024551,
"grad_norm": 0.10592522472143173,
"learning_rate": 4.9993465101511125e-05,
"loss": 0.6886,
"step": 460
},
{
"epoch": 0.23930372798294325,
"grad_norm": 0.10022767633199692,
"learning_rate": 4.99933230385005e-05,
"loss": 0.687,
"step": 470
},
{
"epoch": 0.2443952966634314,
"grad_norm": 0.12280535697937012,
"learning_rate": 4.999318097548987e-05,
"loss": 0.6862,
"step": 480
},
{
"epoch": 0.24948686534391956,
"grad_norm": 0.11044813692569733,
"learning_rate": 4.9993038912479244e-05,
"loss": 0.6889,
"step": 490
},
{
"epoch": 0.2545784340244077,
"grad_norm": 0.13826850056648254,
"learning_rate": 4.999289684946862e-05,
"loss": 0.6875,
"step": 500
},
{
"epoch": 0.2596700027048959,
"grad_norm": 0.10267098248004913,
"learning_rate": 4.999275478645799e-05,
"loss": 0.6877,
"step": 510
},
{
"epoch": 0.26476157138538403,
"grad_norm": 0.08445768803358078,
"learning_rate": 4.999261272344736e-05,
"loss": 0.6879,
"step": 520
},
{
"epoch": 0.2698531400658722,
"grad_norm": 0.1590685397386551,
"learning_rate": 4.999247066043673e-05,
"loss": 0.6872,
"step": 530
},
{
"epoch": 0.27494470874636034,
"grad_norm": 0.1537754386663437,
"learning_rate": 4.99923285974261e-05,
"loss": 0.6812,
"step": 540
},
{
"epoch": 0.2800362774268485,
"grad_norm": 0.12273769080638885,
"learning_rate": 4.9992186534415476e-05,
"loss": 0.6855,
"step": 550
},
{
"epoch": 0.28512784610733666,
"grad_norm": 0.19177380204200745,
"learning_rate": 4.999204447140485e-05,
"loss": 0.6857,
"step": 560
},
{
"epoch": 0.29021941478782476,
"grad_norm": 0.1194639578461647,
"learning_rate": 4.999190240839422e-05,
"loss": 0.6851,
"step": 570
},
{
"epoch": 0.2953109834683129,
"grad_norm": 0.12458167970180511,
"learning_rate": 4.9991760345383595e-05,
"loss": 0.6875,
"step": 580
},
{
"epoch": 0.30040255214880107,
"grad_norm": 0.15139921009540558,
"learning_rate": 4.999161828237297e-05,
"loss": 0.6843,
"step": 590
},
{
"epoch": 0.30549412082928923,
"grad_norm": 0.13201646506786346,
"learning_rate": 4.9991476219362335e-05,
"loss": 0.688,
"step": 600
},
{
"epoch": 0.3105856895097774,
"grad_norm": 0.10855768620967865,
"learning_rate": 4.999133415635171e-05,
"loss": 0.6859,
"step": 610
},
{
"epoch": 0.31567725819026554,
"grad_norm": 0.14113789796829224,
"learning_rate": 4.999119209334108e-05,
"loss": 0.6858,
"step": 620
},
{
"epoch": 0.3207688268707537,
"grad_norm": 0.19037926197052002,
"learning_rate": 4.9991050030330454e-05,
"loss": 0.6851,
"step": 630
},
{
"epoch": 0.32586039555124185,
"grad_norm": 0.18522581458091736,
"learning_rate": 4.999090796731983e-05,
"loss": 0.6854,
"step": 640
},
{
"epoch": 0.33095196423173,
"grad_norm": 0.24384431540966034,
"learning_rate": 4.99907659043092e-05,
"loss": 0.6852,
"step": 650
},
{
"epoch": 0.33604353291221817,
"grad_norm": 0.21666169166564941,
"learning_rate": 4.999062384129857e-05,
"loss": 0.6836,
"step": 660
},
{
"epoch": 0.3411351015927063,
"grad_norm": 0.1427813023328781,
"learning_rate": 4.9990481778287946e-05,
"loss": 0.6899,
"step": 670
},
{
"epoch": 0.3462266702731945,
"grad_norm": 0.09287853538990021,
"learning_rate": 4.999033971527732e-05,
"loss": 0.6861,
"step": 680
},
{
"epoch": 0.35131823895368264,
"grad_norm": 0.1490527093410492,
"learning_rate": 4.999019765226669e-05,
"loss": 0.6859,
"step": 690
},
{
"epoch": 0.3564098076341708,
"grad_norm": 0.0858352780342102,
"learning_rate": 4.9990055589256066e-05,
"loss": 0.6888,
"step": 700
},
{
"epoch": 0.36150137631465895,
"grad_norm": 0.15133963525295258,
"learning_rate": 4.998991352624544e-05,
"loss": 0.6837,
"step": 710
},
{
"epoch": 0.3665929449951471,
"grad_norm": 0.14562425017356873,
"learning_rate": 4.9989771463234805e-05,
"loss": 0.6862,
"step": 720
},
{
"epoch": 0.37168451367563526,
"grad_norm": 0.15240037441253662,
"learning_rate": 4.998962940022418e-05,
"loss": 0.6839,
"step": 730
},
{
"epoch": 0.3767760823561234,
"grad_norm": 0.1231294646859169,
"learning_rate": 4.9989487337213544e-05,
"loss": 0.6868,
"step": 740
},
{
"epoch": 0.3818676510366116,
"grad_norm": 0.14511612057685852,
"learning_rate": 4.998934527420292e-05,
"loss": 0.6816,
"step": 750
},
{
"epoch": 0.38695921971709973,
"grad_norm": 0.1693543940782547,
"learning_rate": 4.998920321119229e-05,
"loss": 0.6843,
"step": 760
},
{
"epoch": 0.3920507883975879,
"grad_norm": 0.09580985456705093,
"learning_rate": 4.9989061148181664e-05,
"loss": 0.6875,
"step": 770
},
{
"epoch": 0.39714235707807605,
"grad_norm": 0.1047905758023262,
"learning_rate": 4.998891908517104e-05,
"loss": 0.6859,
"step": 780
},
{
"epoch": 0.4022339257585642,
"grad_norm": 0.1090409904718399,
"learning_rate": 4.998877702216041e-05,
"loss": 0.6852,
"step": 790
},
{
"epoch": 0.4073254944390523,
"grad_norm": 0.12578189373016357,
"learning_rate": 4.998863495914978e-05,
"loss": 0.6867,
"step": 800
},
{
"epoch": 0.41241706311954046,
"grad_norm": 0.11900747567415237,
"learning_rate": 4.9988492896139156e-05,
"loss": 0.6814,
"step": 810
},
{
"epoch": 0.4175086318000286,
"grad_norm": 0.11401454359292984,
"learning_rate": 4.998835083312853e-05,
"loss": 0.6899,
"step": 820
},
{
"epoch": 0.4226002004805168,
"grad_norm": 0.1015952005982399,
"learning_rate": 4.99882087701179e-05,
"loss": 0.6865,
"step": 830
},
{
"epoch": 0.42769176916100493,
"grad_norm": 0.16676318645477295,
"learning_rate": 4.9988066707107276e-05,
"loss": 0.6891,
"step": 840
},
{
"epoch": 0.4327833378414931,
"grad_norm": 0.10982430726289749,
"learning_rate": 4.998792464409665e-05,
"loss": 0.6863,
"step": 850
},
{
"epoch": 0.43787490652198124,
"grad_norm": 0.13861846923828125,
"learning_rate": 4.9987782581086015e-05,
"loss": 0.6898,
"step": 860
},
{
"epoch": 0.4429664752024694,
"grad_norm": 0.09421814233064651,
"learning_rate": 4.998764051807539e-05,
"loss": 0.6861,
"step": 870
},
{
"epoch": 0.44805804388295756,
"grad_norm": 0.14085189998149872,
"learning_rate": 4.998749845506476e-05,
"loss": 0.683,
"step": 880
},
{
"epoch": 0.4531496125634457,
"grad_norm": 0.1671237349510193,
"learning_rate": 4.9987356392054134e-05,
"loss": 0.6835,
"step": 890
},
{
"epoch": 0.45824118124393387,
"grad_norm": 0.13570742309093475,
"learning_rate": 4.998721432904351e-05,
"loss": 0.685,
"step": 900
},
{
"epoch": 0.463332749924422,
"grad_norm": 0.10402018576860428,
"learning_rate": 4.998707226603288e-05,
"loss": 0.6872,
"step": 910
},
{
"epoch": 0.4684243186049102,
"grad_norm": 0.10226580500602722,
"learning_rate": 4.9986930203022254e-05,
"loss": 0.6871,
"step": 920
},
{
"epoch": 0.47351588728539834,
"grad_norm": 0.10132193565368652,
"learning_rate": 4.998678814001163e-05,
"loss": 0.6902,
"step": 930
},
{
"epoch": 0.4786074559658865,
"grad_norm": 0.11389295756816864,
"learning_rate": 4.9986646077001e-05,
"loss": 0.6818,
"step": 940
},
{
"epoch": 0.48369902464637465,
"grad_norm": 0.13445314764976501,
"learning_rate": 4.9986504013990366e-05,
"loss": 0.6882,
"step": 950
},
{
"epoch": 0.4887905933268628,
"grad_norm": 0.08756639063358307,
"learning_rate": 4.998636195097974e-05,
"loss": 0.6893,
"step": 960
},
{
"epoch": 0.49388216200735097,
"grad_norm": 0.08042973279953003,
"learning_rate": 4.998621988796911e-05,
"loss": 0.6891,
"step": 970
},
{
"epoch": 0.4989737306878391,
"grad_norm": 0.10082978755235672,
"learning_rate": 4.9986077824958485e-05,
"loss": 0.6903,
"step": 980
},
{
"epoch": 0.5040652993683272,
"grad_norm": 0.0936272069811821,
"learning_rate": 4.998593576194786e-05,
"loss": 0.6864,
"step": 990
},
{
"epoch": 0.5091568680488154,
"grad_norm": 0.12317179143428802,
"learning_rate": 4.9985793698937225e-05,
"loss": 0.6856,
"step": 1000
},
{
"epoch": 0.5142484367293035,
"grad_norm": 0.12630991637706757,
"learning_rate": 4.99856516359266e-05,
"loss": 0.6843,
"step": 1010
},
{
"epoch": 0.5193400054097917,
"grad_norm": 0.13727591931819916,
"learning_rate": 4.998550957291597e-05,
"loss": 0.6844,
"step": 1020
},
{
"epoch": 0.5244315740902799,
"grad_norm": 0.20466050505638123,
"learning_rate": 4.9985367509905344e-05,
"loss": 0.6826,
"step": 1030
},
{
"epoch": 0.5295231427707681,
"grad_norm": 0.18256479501724243,
"learning_rate": 4.998522544689472e-05,
"loss": 0.6779,
"step": 1040
},
{
"epoch": 0.5346147114512562,
"grad_norm": 0.20778831839561462,
"learning_rate": 4.998508338388409e-05,
"loss": 0.6853,
"step": 1050
},
{
"epoch": 0.5397062801317444,
"grad_norm": 0.08918287605047226,
"learning_rate": 4.9984941320873464e-05,
"loss": 0.6867,
"step": 1060
},
{
"epoch": 0.5447978488122325,
"grad_norm": 0.08476213365793228,
"learning_rate": 4.9984799257862837e-05,
"loss": 0.6843,
"step": 1070
},
{
"epoch": 0.5498894174927207,
"grad_norm": 0.11851644515991211,
"learning_rate": 4.998465719485221e-05,
"loss": 0.6831,
"step": 1080
},
{
"epoch": 0.5549809861732088,
"grad_norm": 0.18159732222557068,
"learning_rate": 4.998451513184158e-05,
"loss": 0.6827,
"step": 1090
},
{
"epoch": 0.560072554853697,
"grad_norm": 0.1184081956744194,
"learning_rate": 4.9984373068830956e-05,
"loss": 0.6835,
"step": 1100
},
{
"epoch": 0.5651641235341851,
"grad_norm": 0.13530392944812775,
"learning_rate": 4.998423100582032e-05,
"loss": 0.6832,
"step": 1110
},
{
"epoch": 0.5702556922146733,
"grad_norm": 0.17794091999530792,
"learning_rate": 4.9984088942809695e-05,
"loss": 0.6844,
"step": 1120
},
{
"epoch": 0.5753472608951614,
"grad_norm": 0.18658983707427979,
"learning_rate": 4.998394687979907e-05,
"loss": 0.682,
"step": 1130
},
{
"epoch": 0.5804388295756495,
"grad_norm": 0.11487103253602982,
"learning_rate": 4.998380481678844e-05,
"loss": 0.687,
"step": 1140
},
{
"epoch": 0.5855303982561377,
"grad_norm": 0.09985563158988953,
"learning_rate": 4.9983662753777815e-05,
"loss": 0.6855,
"step": 1150
},
{
"epoch": 0.5906219669366258,
"grad_norm": 0.1510723978281021,
"learning_rate": 4.998352069076718e-05,
"loss": 0.684,
"step": 1160
},
{
"epoch": 0.595713535617114,
"grad_norm": 0.15650640428066254,
"learning_rate": 4.9983378627756554e-05,
"loss": 0.6807,
"step": 1170
},
{
"epoch": 0.6008051042976021,
"grad_norm": 0.3100273311138153,
"learning_rate": 4.998323656474593e-05,
"loss": 0.6837,
"step": 1180
},
{
"epoch": 0.6058966729780904,
"grad_norm": 0.09822337329387665,
"learning_rate": 4.99830945017353e-05,
"loss": 0.6851,
"step": 1190
},
{
"epoch": 0.6109882416585785,
"grad_norm": 0.16111738979816437,
"learning_rate": 4.9982952438724673e-05,
"loss": 0.6827,
"step": 1200
},
{
"epoch": 0.6160798103390667,
"grad_norm": 0.1878943145275116,
"learning_rate": 4.9982810375714047e-05,
"loss": 0.6871,
"step": 1210
},
{
"epoch": 0.6211713790195548,
"grad_norm": 0.1281467080116272,
"learning_rate": 4.998266831270342e-05,
"loss": 0.6866,
"step": 1220
},
{
"epoch": 0.626262947700043,
"grad_norm": 0.1051391065120697,
"learning_rate": 4.998252624969279e-05,
"loss": 0.6869,
"step": 1230
},
{
"epoch": 0.6313545163805311,
"grad_norm": 0.138059601187706,
"learning_rate": 4.9982384186682166e-05,
"loss": 0.6825,
"step": 1240
},
{
"epoch": 0.6364460850610193,
"grad_norm": 0.10719313472509384,
"learning_rate": 4.998224212367153e-05,
"loss": 0.6837,
"step": 1250
},
{
"epoch": 0.6415376537415074,
"grad_norm": 0.09252595156431198,
"learning_rate": 4.9982100060660905e-05,
"loss": 0.689,
"step": 1260
},
{
"epoch": 0.6466292224219956,
"grad_norm": 0.12894387543201447,
"learning_rate": 4.998195799765028e-05,
"loss": 0.6833,
"step": 1270
},
{
"epoch": 0.6517207911024837,
"grad_norm": 0.10794473439455032,
"learning_rate": 4.998181593463965e-05,
"loss": 0.6866,
"step": 1280
},
{
"epoch": 0.6568123597829719,
"grad_norm": 0.11546550691127777,
"learning_rate": 4.9981673871629025e-05,
"loss": 0.6861,
"step": 1290
},
{
"epoch": 0.66190392846346,
"grad_norm": 0.10733726620674133,
"learning_rate": 4.99815318086184e-05,
"loss": 0.683,
"step": 1300
},
{
"epoch": 0.6669954971439482,
"grad_norm": 0.17388881742954254,
"learning_rate": 4.998138974560777e-05,
"loss": 0.686,
"step": 1310
},
{
"epoch": 0.6720870658244363,
"grad_norm": 0.15069304406642914,
"learning_rate": 4.9981247682597144e-05,
"loss": 0.6828,
"step": 1320
},
{
"epoch": 0.6771786345049245,
"grad_norm": 0.14276649057865143,
"learning_rate": 4.998110561958652e-05,
"loss": 0.6814,
"step": 1330
},
{
"epoch": 0.6822702031854126,
"grad_norm": 0.12937600910663605,
"learning_rate": 4.998096355657589e-05,
"loss": 0.6868,
"step": 1340
},
{
"epoch": 0.6873617718659009,
"grad_norm": 0.1466054916381836,
"learning_rate": 4.998082149356526e-05,
"loss": 0.6848,
"step": 1350
},
{
"epoch": 0.692453340546389,
"grad_norm": 0.14180545508861542,
"learning_rate": 4.9980679430554636e-05,
"loss": 0.6847,
"step": 1360
},
{
"epoch": 0.6975449092268771,
"grad_norm": 0.11979173868894577,
"learning_rate": 4.9980537367544e-05,
"loss": 0.6809,
"step": 1370
},
{
"epoch": 0.7026364779073653,
"grad_norm": 0.15614405274391174,
"learning_rate": 4.9980395304533376e-05,
"loss": 0.6802,
"step": 1380
},
{
"epoch": 0.7077280465878534,
"grad_norm": 0.16178403794765472,
"learning_rate": 4.998025324152274e-05,
"loss": 0.6766,
"step": 1390
},
{
"epoch": 0.7128196152683416,
"grad_norm": 0.11734528839588165,
"learning_rate": 4.9980111178512115e-05,
"loss": 0.6853,
"step": 1400
},
{
"epoch": 0.7179111839488297,
"grad_norm": 0.09437315165996552,
"learning_rate": 4.997996911550149e-05,
"loss": 0.6859,
"step": 1410
},
{
"epoch": 0.7230027526293179,
"grad_norm": 0.08119911700487137,
"learning_rate": 4.997982705249086e-05,
"loss": 0.6902,
"step": 1420
},
{
"epoch": 0.728094321309806,
"grad_norm": 0.14570364356040955,
"learning_rate": 4.9979684989480235e-05,
"loss": 0.6841,
"step": 1430
},
{
"epoch": 0.7331858899902942,
"grad_norm": 0.12333963066339493,
"learning_rate": 4.997954292646961e-05,
"loss": 0.6819,
"step": 1440
},
{
"epoch": 0.7382774586707823,
"grad_norm": 0.11946499347686768,
"learning_rate": 4.997940086345898e-05,
"loss": 0.6847,
"step": 1450
},
{
"epoch": 0.7433690273512705,
"grad_norm": 0.12417126446962357,
"learning_rate": 4.9979258800448354e-05,
"loss": 0.6826,
"step": 1460
},
{
"epoch": 0.7484605960317586,
"grad_norm": 0.11672031134366989,
"learning_rate": 4.997911673743773e-05,
"loss": 0.6831,
"step": 1470
},
{
"epoch": 0.7535521647122468,
"grad_norm": 0.1273321509361267,
"learning_rate": 4.99789746744271e-05,
"loss": 0.6828,
"step": 1480
},
{
"epoch": 0.7586437333927349,
"grad_norm": 0.1056080237030983,
"learning_rate": 4.997883261141647e-05,
"loss": 0.6868,
"step": 1490
},
{
"epoch": 0.7637353020732232,
"grad_norm": 0.12784817814826965,
"learning_rate": 4.9978690548405846e-05,
"loss": 0.6819,
"step": 1500
},
{
"epoch": 0.7688268707537113,
"grad_norm": 0.16047458350658417,
"learning_rate": 4.997854848539521e-05,
"loss": 0.6825,
"step": 1510
},
{
"epoch": 0.7739184394341995,
"grad_norm": 0.11385879665613174,
"learning_rate": 4.9978406422384586e-05,
"loss": 0.686,
"step": 1520
},
{
"epoch": 0.7790100081146876,
"grad_norm": 0.13264243304729462,
"learning_rate": 4.997826435937396e-05,
"loss": 0.6799,
"step": 1530
},
{
"epoch": 0.7841015767951758,
"grad_norm": 0.2524195611476898,
"learning_rate": 4.997812229636333e-05,
"loss": 0.6771,
"step": 1540
},
{
"epoch": 0.7891931454756639,
"grad_norm": 0.14071324467658997,
"learning_rate": 4.9977980233352705e-05,
"loss": 0.6833,
"step": 1550
},
{
"epoch": 0.7942847141561521,
"grad_norm": 0.12755858898162842,
"learning_rate": 4.997783817034208e-05,
"loss": 0.6831,
"step": 1560
},
{
"epoch": 0.7993762828366402,
"grad_norm": 0.17357097566127777,
"learning_rate": 4.997769610733145e-05,
"loss": 0.6829,
"step": 1570
},
{
"epoch": 0.8044678515171284,
"grad_norm": 0.13588126003742218,
"learning_rate": 4.997755404432082e-05,
"loss": 0.6896,
"step": 1580
},
{
"epoch": 0.8095594201976165,
"grad_norm": 0.0981392115354538,
"learning_rate": 4.997741198131019e-05,
"loss": 0.6859,
"step": 1590
},
{
"epoch": 0.8146509888781046,
"grad_norm": 0.13461001217365265,
"learning_rate": 4.9977269918299564e-05,
"loss": 0.6811,
"step": 1600
},
{
"epoch": 0.8197425575585928,
"grad_norm": 0.25011003017425537,
"learning_rate": 4.997712785528894e-05,
"loss": 0.6801,
"step": 1610
},
{
"epoch": 0.8248341262390809,
"grad_norm": 0.19415059685707092,
"learning_rate": 4.997698579227831e-05,
"loss": 0.68,
"step": 1620
},
{
"epoch": 0.8299256949195691,
"grad_norm": 0.1742919236421585,
"learning_rate": 4.997684372926768e-05,
"loss": 0.684,
"step": 1630
},
{
"epoch": 0.8350172636000572,
"grad_norm": 0.12717512249946594,
"learning_rate": 4.9976701666257056e-05,
"loss": 0.6864,
"step": 1640
},
{
"epoch": 0.8401088322805454,
"grad_norm": 0.09787630289793015,
"learning_rate": 4.997655960324642e-05,
"loss": 0.6865,
"step": 1650
},
{
"epoch": 0.8452004009610335,
"grad_norm": 0.11440135538578033,
"learning_rate": 4.9976417540235796e-05,
"loss": 0.6846,
"step": 1660
},
{
"epoch": 0.8502919696415218,
"grad_norm": 0.13598810136318207,
"learning_rate": 4.997627547722517e-05,
"loss": 0.6861,
"step": 1670
},
{
"epoch": 0.8553835383220099,
"grad_norm": 0.1623242348432541,
"learning_rate": 4.997613341421454e-05,
"loss": 0.6797,
"step": 1680
},
{
"epoch": 0.8604751070024981,
"grad_norm": 0.12565261125564575,
"learning_rate": 4.9975991351203915e-05,
"loss": 0.6847,
"step": 1690
},
{
"epoch": 0.8655666756829862,
"grad_norm": 0.11019585281610489,
"learning_rate": 4.997584928819329e-05,
"loss": 0.6802,
"step": 1700
},
{
"epoch": 0.8706582443634744,
"grad_norm": 0.10026270151138306,
"learning_rate": 4.997570722518266e-05,
"loss": 0.6863,
"step": 1710
},
{
"epoch": 0.8757498130439625,
"grad_norm": 0.10043281316757202,
"learning_rate": 4.9975565162172034e-05,
"loss": 0.6843,
"step": 1720
},
{
"epoch": 0.8808413817244507,
"grad_norm": 0.0944572165608406,
"learning_rate": 4.997542309916141e-05,
"loss": 0.684,
"step": 1730
},
{
"epoch": 0.8859329504049388,
"grad_norm": 0.12859639525413513,
"learning_rate": 4.997528103615078e-05,
"loss": 0.6856,
"step": 1740
},
{
"epoch": 0.891024519085427,
"grad_norm": 0.11585383117198944,
"learning_rate": 4.9975138973140154e-05,
"loss": 0.6807,
"step": 1750
},
{
"epoch": 0.8961160877659151,
"grad_norm": 0.13746441900730133,
"learning_rate": 4.997499691012953e-05,
"loss": 0.6846,
"step": 1760
},
{
"epoch": 0.9012076564464033,
"grad_norm": 0.09316791594028473,
"learning_rate": 4.997485484711889e-05,
"loss": 0.6848,
"step": 1770
},
{
"epoch": 0.9062992251268914,
"grad_norm": 0.07422750443220139,
"learning_rate": 4.9974712784108266e-05,
"loss": 0.688,
"step": 1780
},
{
"epoch": 0.9113907938073796,
"grad_norm": 0.08577447384595871,
"learning_rate": 4.997457072109763e-05,
"loss": 0.6856,
"step": 1790
},
{
"epoch": 0.9164823624878677,
"grad_norm": 0.09143663942813873,
"learning_rate": 4.9974428658087006e-05,
"loss": 0.6848,
"step": 1800
},
{
"epoch": 0.921573931168356,
"grad_norm": 0.10064688324928284,
"learning_rate": 4.997428659507638e-05,
"loss": 0.6889,
"step": 1810
},
{
"epoch": 0.926665499848844,
"grad_norm": 0.0921172946691513,
"learning_rate": 4.997414453206575e-05,
"loss": 0.6824,
"step": 1820
},
{
"epoch": 0.9317570685293322,
"grad_norm": 0.12253455817699432,
"learning_rate": 4.9974002469055125e-05,
"loss": 0.686,
"step": 1830
},
{
"epoch": 0.9368486372098204,
"grad_norm": 0.16046911478042603,
"learning_rate": 4.99738604060445e-05,
"loss": 0.6833,
"step": 1840
},
{
"epoch": 0.9419402058903085,
"grad_norm": 0.1947670727968216,
"learning_rate": 4.997371834303387e-05,
"loss": 0.676,
"step": 1850
},
{
"epoch": 0.9470317745707967,
"grad_norm": 0.17206092178821564,
"learning_rate": 4.9973576280023244e-05,
"loss": 0.6832,
"step": 1860
},
{
"epoch": 0.9521233432512848,
"grad_norm": 0.16195142269134521,
"learning_rate": 4.997343421701262e-05,
"loss": 0.681,
"step": 1870
},
{
"epoch": 0.957214911931773,
"grad_norm": 0.1436363011598587,
"learning_rate": 4.997329215400199e-05,
"loss": 0.6847,
"step": 1880
},
{
"epoch": 0.9623064806122611,
"grad_norm": 0.11514883488416672,
"learning_rate": 4.9973150090991364e-05,
"loss": 0.6823,
"step": 1890
},
{
"epoch": 0.9673980492927493,
"grad_norm": 0.11169356852769852,
"learning_rate": 4.997300802798074e-05,
"loss": 0.6859,
"step": 1900
},
{
"epoch": 0.9724896179732374,
"grad_norm": 0.09914061427116394,
"learning_rate": 4.99728659649701e-05,
"loss": 0.6848,
"step": 1910
},
{
"epoch": 0.9775811866537256,
"grad_norm": 0.10717228055000305,
"learning_rate": 4.9972723901959476e-05,
"loss": 0.6833,
"step": 1920
},
{
"epoch": 0.9826727553342137,
"grad_norm": 0.11209560185670853,
"learning_rate": 4.997258183894885e-05,
"loss": 0.6861,
"step": 1930
},
{
"epoch": 0.9877643240147019,
"grad_norm": 0.1293047070503235,
"learning_rate": 4.997243977593822e-05,
"loss": 0.6833,
"step": 1940
},
{
"epoch": 0.99285589269519,
"grad_norm": 0.13042615354061127,
"learning_rate": 4.9972297712927595e-05,
"loss": 0.6837,
"step": 1950
},
{
"epoch": 0.9979474613756782,
"grad_norm": 0.1377701610326767,
"learning_rate": 4.997215564991697e-05,
"loss": 0.6858,
"step": 1960
},
{
"epoch": 1.002545784340244,
"grad_norm": 0.13352081179618835,
"learning_rate": 4.997201358690634e-05,
"loss": 0.6173,
"step": 1970
},
{
"epoch": 1.0076373530207323,
"grad_norm": 0.12459533661603928,
"learning_rate": 4.9971871523895715e-05,
"loss": 0.6836,
"step": 1980
},
{
"epoch": 1.0127289217012203,
"grad_norm": 0.10235695540904999,
"learning_rate": 4.997172946088509e-05,
"loss": 0.6867,
"step": 1990
},
{
"epoch": 1.0178204903817085,
"grad_norm": 0.0960664227604866,
"learning_rate": 4.9971587397874454e-05,
"loss": 0.6847,
"step": 2000
},
{
"epoch": 1.0229120590621967,
"grad_norm": 0.12099937349557877,
"learning_rate": 4.997144533486383e-05,
"loss": 0.6828,
"step": 2010
},
{
"epoch": 1.028003627742685,
"grad_norm": 0.10949967801570892,
"learning_rate": 4.99713032718532e-05,
"loss": 0.6828,
"step": 2020
},
{
"epoch": 1.033095196423173,
"grad_norm": 0.09417010843753815,
"learning_rate": 4.9971161208842573e-05,
"loss": 0.6866,
"step": 2030
},
{
"epoch": 1.0381867651036611,
"grad_norm": 0.08539358526468277,
"learning_rate": 4.9971019145831947e-05,
"loss": 0.6869,
"step": 2040
},
{
"epoch": 1.0432783337841494,
"grad_norm": 0.10147163271903992,
"learning_rate": 4.997087708282131e-05,
"loss": 0.68,
"step": 2050
},
{
"epoch": 1.0483699024646376,
"grad_norm": 0.15451493859291077,
"learning_rate": 4.9970735019810686e-05,
"loss": 0.6826,
"step": 2060
},
{
"epoch": 1.0534614711451256,
"grad_norm": 0.09405049681663513,
"learning_rate": 4.997059295680006e-05,
"loss": 0.6868,
"step": 2070
},
{
"epoch": 1.0585530398256138,
"grad_norm": 0.12649065256118774,
"learning_rate": 4.997045089378943e-05,
"loss": 0.6845,
"step": 2080
},
{
"epoch": 1.063644608506102,
"grad_norm": 0.12368927896022797,
"learning_rate": 4.9970308830778805e-05,
"loss": 0.6804,
"step": 2090
},
{
"epoch": 1.0687361771865902,
"grad_norm": 0.15372063219547272,
"learning_rate": 4.997016676776818e-05,
"loss": 0.6841,
"step": 2100
},
{
"epoch": 1.0738277458670782,
"grad_norm": 0.14659467339515686,
"learning_rate": 4.997002470475755e-05,
"loss": 0.6808,
"step": 2110
},
{
"epoch": 1.0789193145475664,
"grad_norm": 0.15990343689918518,
"learning_rate": 4.9969882641746925e-05,
"loss": 0.6824,
"step": 2120
},
{
"epoch": 1.0840108832280546,
"grad_norm": 0.1342097967863083,
"learning_rate": 4.99697405787363e-05,
"loss": 0.6819,
"step": 2130
},
{
"epoch": 1.0891024519085426,
"grad_norm": 0.1691288948059082,
"learning_rate": 4.996959851572567e-05,
"loss": 0.6803,
"step": 2140
},
{
"epoch": 1.0941940205890308,
"grad_norm": 0.12277499586343765,
"learning_rate": 4.9969456452715044e-05,
"loss": 0.683,
"step": 2150
},
{
"epoch": 1.099285589269519,
"grad_norm": 0.19435428082942963,
"learning_rate": 4.996931438970441e-05,
"loss": 0.6824,
"step": 2160
},
{
"epoch": 1.1043771579500072,
"grad_norm": 0.0905819982290268,
"learning_rate": 4.996917232669378e-05,
"loss": 0.6894,
"step": 2170
},
{
"epoch": 1.1094687266304952,
"grad_norm": 0.07771383225917816,
"learning_rate": 4.9969030263683156e-05,
"loss": 0.6878,
"step": 2180
},
{
"epoch": 1.1145602953109834,
"grad_norm": 0.10115786641836166,
"learning_rate": 4.996888820067253e-05,
"loss": 0.6818,
"step": 2190
},
{
"epoch": 1.1196518639914717,
"grad_norm": 0.10046926885843277,
"learning_rate": 4.99687461376619e-05,
"loss": 0.6872,
"step": 2200
},
{
"epoch": 1.1247434326719599,
"grad_norm": 0.1584903746843338,
"learning_rate": 4.996860407465127e-05,
"loss": 0.6772,
"step": 2210
},
{
"epoch": 1.1298350013524479,
"grad_norm": 0.19328419864177704,
"learning_rate": 4.996846201164064e-05,
"loss": 0.6814,
"step": 2220
},
{
"epoch": 1.134926570032936,
"grad_norm": 0.12247782945632935,
"learning_rate": 4.9968319948630015e-05,
"loss": 0.6817,
"step": 2230
},
{
"epoch": 1.1400181387134243,
"grad_norm": 0.13911662995815277,
"learning_rate": 4.996817788561939e-05,
"loss": 0.678,
"step": 2240
},
{
"epoch": 1.1451097073939125,
"grad_norm": 0.16294950246810913,
"learning_rate": 4.996803582260876e-05,
"loss": 0.6825,
"step": 2250
},
{
"epoch": 1.1502012760744005,
"grad_norm": 0.15820349752902985,
"learning_rate": 4.9967893759598135e-05,
"loss": 0.6781,
"step": 2260
},
{
"epoch": 1.1552928447548887,
"grad_norm": 0.13467393815517426,
"learning_rate": 4.996775169658751e-05,
"loss": 0.6862,
"step": 2270
},
{
"epoch": 1.160384413435377,
"grad_norm": 0.11259343475103378,
"learning_rate": 4.996760963357688e-05,
"loss": 0.6809,
"step": 2280
},
{
"epoch": 1.165475982115865,
"grad_norm": 0.13340143859386444,
"learning_rate": 4.9967467570566254e-05,
"loss": 0.6805,
"step": 2290
},
{
"epoch": 1.170567550796353,
"grad_norm": 0.12365837395191193,
"learning_rate": 4.996732550755562e-05,
"loss": 0.6802,
"step": 2300
},
{
"epoch": 1.1756591194768413,
"grad_norm": 0.09431267529726028,
"learning_rate": 4.996718344454499e-05,
"loss": 0.6903,
"step": 2310
},
{
"epoch": 1.1807506881573295,
"grad_norm": 0.08034133911132812,
"learning_rate": 4.9967041381534366e-05,
"loss": 0.6837,
"step": 2320
},
{
"epoch": 1.1858422568378177,
"grad_norm": 0.07523014396429062,
"learning_rate": 4.996689931852374e-05,
"loss": 0.6836,
"step": 2330
},
{
"epoch": 1.1909338255183057,
"grad_norm": 0.1202087476849556,
"learning_rate": 4.996675725551311e-05,
"loss": 0.6771,
"step": 2340
},
{
"epoch": 1.196025394198794,
"grad_norm": 0.13852781057357788,
"learning_rate": 4.9966615192502486e-05,
"loss": 0.6813,
"step": 2350
},
{
"epoch": 1.2011169628792822,
"grad_norm": 0.1234976053237915,
"learning_rate": 4.996647312949186e-05,
"loss": 0.6799,
"step": 2360
},
{
"epoch": 1.2062085315597701,
"grad_norm": 0.1308223158121109,
"learning_rate": 4.996633106648123e-05,
"loss": 0.6839,
"step": 2370
},
{
"epoch": 1.2113001002402584,
"grad_norm": 0.11719993501901627,
"learning_rate": 4.9966189003470605e-05,
"loss": 0.6826,
"step": 2380
},
{
"epoch": 1.2163916689207466,
"grad_norm": 0.13690686225891113,
"learning_rate": 4.996604694045998e-05,
"loss": 0.6804,
"step": 2390
},
{
"epoch": 1.2214832376012348,
"grad_norm": 0.12480480223894119,
"learning_rate": 4.996590487744935e-05,
"loss": 0.6831,
"step": 2400
},
{
"epoch": 1.2265748062817228,
"grad_norm": 0.12938359379768372,
"learning_rate": 4.9965762814438724e-05,
"loss": 0.6821,
"step": 2410
},
{
"epoch": 1.231666374962211,
"grad_norm": 0.14791236817836761,
"learning_rate": 4.996562075142809e-05,
"loss": 0.6813,
"step": 2420
},
{
"epoch": 1.2367579436426992,
"grad_norm": 0.1450764387845993,
"learning_rate": 4.9965478688417464e-05,
"loss": 0.6835,
"step": 2430
},
{
"epoch": 1.2418495123231874,
"grad_norm": 0.12077004462480545,
"learning_rate": 4.996533662540683e-05,
"loss": 0.6843,
"step": 2440
},
{
"epoch": 1.2469410810036754,
"grad_norm": 0.0918821468949318,
"learning_rate": 4.99651945623962e-05,
"loss": 0.6836,
"step": 2450
},
{
"epoch": 1.2520326496841636,
"grad_norm": 0.09863133728504181,
"learning_rate": 4.9965052499385576e-05,
"loss": 0.683,
"step": 2460
},
{
"epoch": 1.2571242183646518,
"grad_norm": 0.10029463469982147,
"learning_rate": 4.996491043637495e-05,
"loss": 0.6839,
"step": 2470
},
{
"epoch": 1.26221578704514,
"grad_norm": 0.11962137371301651,
"learning_rate": 4.996476837336432e-05,
"loss": 0.677,
"step": 2480
},
{
"epoch": 1.267307355725628,
"grad_norm": 0.11363455653190613,
"learning_rate": 4.9964626310353696e-05,
"loss": 0.6843,
"step": 2490
},
{
"epoch": 1.2723989244061162,
"grad_norm": 0.08753272145986557,
"learning_rate": 4.996448424734307e-05,
"loss": 0.6857,
"step": 2500
},
{
"epoch": 1.2774904930866045,
"grad_norm": 0.10698520392179489,
"learning_rate": 4.996434218433244e-05,
"loss": 0.6855,
"step": 2510
},
{
"epoch": 1.2825820617670924,
"grad_norm": 0.09481139481067657,
"learning_rate": 4.9964200121321815e-05,
"loss": 0.683,
"step": 2520
},
{
"epoch": 1.2876736304475807,
"grad_norm": 0.15638168156147003,
"learning_rate": 4.996405805831119e-05,
"loss": 0.6842,
"step": 2530
},
{
"epoch": 1.2927651991280689,
"grad_norm": 0.1133870854973793,
"learning_rate": 4.996391599530056e-05,
"loss": 0.6831,
"step": 2540
},
{
"epoch": 1.297856767808557,
"grad_norm": 0.1086922213435173,
"learning_rate": 4.9963773932289934e-05,
"loss": 0.6832,
"step": 2550
},
{
"epoch": 1.3029483364890453,
"grad_norm": 0.11375133693218231,
"learning_rate": 4.99636318692793e-05,
"loss": 0.6871,
"step": 2560
},
{
"epoch": 1.3080399051695333,
"grad_norm": 0.11502251029014587,
"learning_rate": 4.9963489806268674e-05,
"loss": 0.6844,
"step": 2570
},
{
"epoch": 1.3131314738500215,
"grad_norm": 0.13333244621753693,
"learning_rate": 4.996334774325805e-05,
"loss": 0.6821,
"step": 2580
},
{
"epoch": 1.3182230425305097,
"grad_norm": 0.17771519720554352,
"learning_rate": 4.996320568024742e-05,
"loss": 0.6817,
"step": 2590
},
{
"epoch": 1.3233146112109977,
"grad_norm": 0.10856124758720398,
"learning_rate": 4.996306361723679e-05,
"loss": 0.6832,
"step": 2600
},
{
"epoch": 1.328406179891486,
"grad_norm": 0.13525483012199402,
"learning_rate": 4.9962921554226166e-05,
"loss": 0.6848,
"step": 2610
},
{
"epoch": 1.3334977485719741,
"grad_norm": 0.14420652389526367,
"learning_rate": 4.996277949121553e-05,
"loss": 0.6831,
"step": 2620
},
{
"epoch": 1.3385893172524623,
"grad_norm": 0.10660698264837265,
"learning_rate": 4.9962637428204906e-05,
"loss": 0.686,
"step": 2630
},
{
"epoch": 1.3436808859329505,
"grad_norm": 0.16599448025226593,
"learning_rate": 4.996249536519428e-05,
"loss": 0.6826,
"step": 2640
},
{
"epoch": 1.3487724546134385,
"grad_norm": 0.13518887758255005,
"learning_rate": 4.996235330218365e-05,
"loss": 0.6854,
"step": 2650
},
{
"epoch": 1.3538640232939267,
"grad_norm": 0.1113041415810585,
"learning_rate": 4.9962211239173025e-05,
"loss": 0.6832,
"step": 2660
},
{
"epoch": 1.3589555919744147,
"grad_norm": 0.13242138922214508,
"learning_rate": 4.99620691761624e-05,
"loss": 0.6814,
"step": 2670
},
{
"epoch": 1.364047160654903,
"grad_norm": 0.18434931337833405,
"learning_rate": 4.996192711315177e-05,
"loss": 0.6808,
"step": 2680
},
{
"epoch": 1.3691387293353912,
"grad_norm": 0.11528836935758591,
"learning_rate": 4.9961785050141144e-05,
"loss": 0.685,
"step": 2690
},
{
"epoch": 1.3742302980158794,
"grad_norm": 0.1295492947101593,
"learning_rate": 4.996164298713051e-05,
"loss": 0.6825,
"step": 2700
},
{
"epoch": 1.3793218666963676,
"grad_norm": 0.09657806158065796,
"learning_rate": 4.9961500924119884e-05,
"loss": 0.6825,
"step": 2710
},
{
"epoch": 1.3844134353768556,
"grad_norm": 0.08716735243797302,
"learning_rate": 4.996135886110926e-05,
"loss": 0.6872,
"step": 2720
},
{
"epoch": 1.3895050040573438,
"grad_norm": 0.0896734893321991,
"learning_rate": 4.996121679809863e-05,
"loss": 0.6817,
"step": 2730
},
{
"epoch": 1.394596572737832,
"grad_norm": 0.10860587656497955,
"learning_rate": 4.9961074735088e-05,
"loss": 0.6815,
"step": 2740
},
{
"epoch": 1.39968814141832,
"grad_norm": 0.1187656968832016,
"learning_rate": 4.9960932672077376e-05,
"loss": 0.686,
"step": 2750
},
{
"epoch": 1.4047797100988082,
"grad_norm": 0.11682062596082687,
"learning_rate": 4.996079060906675e-05,
"loss": 0.6799,
"step": 2760
},
{
"epoch": 1.4098712787792964,
"grad_norm": 0.14465422928333282,
"learning_rate": 4.996064854605612e-05,
"loss": 0.6823,
"step": 2770
},
{
"epoch": 1.4149628474597846,
"grad_norm": 0.13644230365753174,
"learning_rate": 4.9960506483045495e-05,
"loss": 0.6828,
"step": 2780
},
{
"epoch": 1.4200544161402728,
"grad_norm": 0.09552885591983795,
"learning_rate": 4.996036442003487e-05,
"loss": 0.6828,
"step": 2790
},
{
"epoch": 1.4251459848207608,
"grad_norm": 0.1287170648574829,
"learning_rate": 4.996022235702424e-05,
"loss": 0.6846,
"step": 2800
},
{
"epoch": 1.430237553501249,
"grad_norm": 0.11409243196249008,
"learning_rate": 4.9960080294013615e-05,
"loss": 0.6844,
"step": 2810
},
{
"epoch": 1.4353291221817372,
"grad_norm": 0.16463157534599304,
"learning_rate": 4.995993823100298e-05,
"loss": 0.6842,
"step": 2820
},
{
"epoch": 1.4404206908622252,
"grad_norm": 0.12962253391742706,
"learning_rate": 4.9959796167992354e-05,
"loss": 0.6831,
"step": 2830
},
{
"epoch": 1.4455122595427135,
"grad_norm": 0.1317017823457718,
"learning_rate": 4.995965410498172e-05,
"loss": 0.6825,
"step": 2840
},
{
"epoch": 1.4506038282232017,
"grad_norm": 0.1253054440021515,
"learning_rate": 4.9959512041971094e-05,
"loss": 0.6869,
"step": 2850
},
{
"epoch": 1.4556953969036899,
"grad_norm": 0.10968417674303055,
"learning_rate": 4.995936997896047e-05,
"loss": 0.6837,
"step": 2860
},
{
"epoch": 1.460786965584178,
"grad_norm": 0.15329721570014954,
"learning_rate": 4.995922791594984e-05,
"loss": 0.6856,
"step": 2870
},
{
"epoch": 1.465878534264666,
"grad_norm": 0.1338498741388321,
"learning_rate": 4.995908585293921e-05,
"loss": 0.6862,
"step": 2880
},
{
"epoch": 1.4709701029451543,
"grad_norm": 0.10569129139184952,
"learning_rate": 4.9958943789928586e-05,
"loss": 0.6834,
"step": 2890
},
{
"epoch": 1.4760616716256423,
"grad_norm": 0.14210055768489838,
"learning_rate": 4.995880172691796e-05,
"loss": 0.682,
"step": 2900
},
{
"epoch": 1.4811532403061305,
"grad_norm": 0.13887226581573486,
"learning_rate": 4.995865966390733e-05,
"loss": 0.6813,
"step": 2910
},
{
"epoch": 1.4862448089866187,
"grad_norm": 0.14252229034900665,
"learning_rate": 4.9958517600896705e-05,
"loss": 0.6819,
"step": 2920
},
{
"epoch": 1.491336377667107,
"grad_norm": 0.1889895647764206,
"learning_rate": 4.995837553788608e-05,
"loss": 0.6833,
"step": 2930
},
{
"epoch": 1.4964279463475951,
"grad_norm": 0.14179687201976776,
"learning_rate": 4.995823347487545e-05,
"loss": 0.6774,
"step": 2940
},
{
"epoch": 1.5015195150280833,
"grad_norm": 0.1529311090707779,
"learning_rate": 4.9958091411864825e-05,
"loss": 0.6839,
"step": 2950
},
{
"epoch": 1.5066110837085713,
"grad_norm": 0.11903861910104752,
"learning_rate": 4.995794934885419e-05,
"loss": 0.6828,
"step": 2960
},
{
"epoch": 1.5117026523890595,
"grad_norm": 0.11958228051662445,
"learning_rate": 4.9957807285843564e-05,
"loss": 0.6809,
"step": 2970
},
{
"epoch": 1.5167942210695475,
"grad_norm": 0.10305473953485489,
"learning_rate": 4.995766522283294e-05,
"loss": 0.6834,
"step": 2980
},
{
"epoch": 1.5218857897500357,
"grad_norm": 0.11478529125452042,
"learning_rate": 4.995752315982231e-05,
"loss": 0.6835,
"step": 2990
},
{
"epoch": 1.526977358430524,
"grad_norm": 0.13605500757694244,
"learning_rate": 4.995738109681168e-05,
"loss": 0.6804,
"step": 3000
},
{
"epoch": 1.5320689271110122,
"grad_norm": 0.12643195688724518,
"learning_rate": 4.9957239033801056e-05,
"loss": 0.6763,
"step": 3010
},
{
"epoch": 1.5371604957915004,
"grad_norm": 0.22794055938720703,
"learning_rate": 4.995709697079043e-05,
"loss": 0.6822,
"step": 3020
},
{
"epoch": 1.5422520644719884,
"grad_norm": 0.1283722072839737,
"learning_rate": 4.99569549077798e-05,
"loss": 0.6823,
"step": 3030
},
{
"epoch": 1.5473436331524766,
"grad_norm": 0.12796291708946228,
"learning_rate": 4.995681284476917e-05,
"loss": 0.6789,
"step": 3040
},
{
"epoch": 1.5524352018329646,
"grad_norm": 0.20063504576683044,
"learning_rate": 4.995667078175854e-05,
"loss": 0.6782,
"step": 3050
},
{
"epoch": 1.5575267705134528,
"grad_norm": 0.10560201853513718,
"learning_rate": 4.9956528718747915e-05,
"loss": 0.6899,
"step": 3060
},
{
"epoch": 1.562618339193941,
"grad_norm": 0.09931265562772751,
"learning_rate": 4.995638665573729e-05,
"loss": 0.6857,
"step": 3070
},
{
"epoch": 1.5677099078744292,
"grad_norm": 0.09285406023263931,
"learning_rate": 4.995624459272666e-05,
"loss": 0.6865,
"step": 3080
},
{
"epoch": 1.5728014765549174,
"grad_norm": 0.11098553240299225,
"learning_rate": 4.9956102529716035e-05,
"loss": 0.6837,
"step": 3090
},
{
"epoch": 1.5778930452354056,
"grad_norm": 0.12747269868850708,
"learning_rate": 4.99559604667054e-05,
"loss": 0.6805,
"step": 3100
},
{
"epoch": 1.5829846139158936,
"grad_norm": 0.19148799777030945,
"learning_rate": 4.9955818403694774e-05,
"loss": 0.6809,
"step": 3110
},
{
"epoch": 1.5880761825963818,
"grad_norm": 0.11333976686000824,
"learning_rate": 4.995567634068415e-05,
"loss": 0.6834,
"step": 3120
},
{
"epoch": 1.5931677512768698,
"grad_norm": 0.12725278735160828,
"learning_rate": 4.995553427767352e-05,
"loss": 0.682,
"step": 3130
},
{
"epoch": 1.598259319957358,
"grad_norm": 0.08800658583641052,
"learning_rate": 4.995539221466289e-05,
"loss": 0.6863,
"step": 3140
},
{
"epoch": 1.6033508886378462,
"grad_norm": 0.1162288561463356,
"learning_rate": 4.9955250151652266e-05,
"loss": 0.6816,
"step": 3150
},
{
"epoch": 1.6084424573183345,
"grad_norm": 0.11001812666654587,
"learning_rate": 4.995510808864164e-05,
"loss": 0.6834,
"step": 3160
},
{
"epoch": 1.6135340259988227,
"grad_norm": 0.17772439122200012,
"learning_rate": 4.995496602563101e-05,
"loss": 0.6839,
"step": 3170
},
{
"epoch": 1.6186255946793109,
"grad_norm": 0.14124180376529694,
"learning_rate": 4.9954823962620386e-05,
"loss": 0.6817,
"step": 3180
},
{
"epoch": 1.6237171633597989,
"grad_norm": 0.12131723016500473,
"learning_rate": 4.995468189960976e-05,
"loss": 0.68,
"step": 3190
},
{
"epoch": 1.628808732040287,
"grad_norm": 0.1277320384979248,
"learning_rate": 4.995453983659913e-05,
"loss": 0.6761,
"step": 3200
},
{
"epoch": 1.633900300720775,
"grad_norm": 0.11980846524238586,
"learning_rate": 4.99543977735885e-05,
"loss": 0.6845,
"step": 3210
},
{
"epoch": 1.6389918694012633,
"grad_norm": 0.18087904155254364,
"learning_rate": 4.995425571057787e-05,
"loss": 0.6815,
"step": 3220
},
{
"epoch": 1.6440834380817515,
"grad_norm": 0.1640498787164688,
"learning_rate": 4.9954113647567244e-05,
"loss": 0.6799,
"step": 3230
},
{
"epoch": 1.6491750067622397,
"grad_norm": 0.14339861273765564,
"learning_rate": 4.995397158455662e-05,
"loss": 0.684,
"step": 3240
},
{
"epoch": 1.654266575442728,
"grad_norm": 0.11472135037183762,
"learning_rate": 4.9953829521545984e-05,
"loss": 0.6825,
"step": 3250
},
{
"epoch": 1.659358144123216,
"grad_norm": 0.12307639420032501,
"learning_rate": 4.995368745853536e-05,
"loss": 0.6892,
"step": 3260
},
{
"epoch": 1.6644497128037041,
"grad_norm": 0.09782890975475311,
"learning_rate": 4.995354539552473e-05,
"loss": 0.6823,
"step": 3270
},
{
"epoch": 1.6695412814841921,
"grad_norm": 0.1154768094420433,
"learning_rate": 4.99534033325141e-05,
"loss": 0.6842,
"step": 3280
},
{
"epoch": 1.6746328501646803,
"grad_norm": 0.18311528861522675,
"learning_rate": 4.9953261269503476e-05,
"loss": 0.6833,
"step": 3290
},
{
"epoch": 1.6797244188451685,
"grad_norm": 0.11727318912744522,
"learning_rate": 4.995311920649285e-05,
"loss": 0.6819,
"step": 3300
},
{
"epoch": 1.6848159875256568,
"grad_norm": 0.12900975346565247,
"learning_rate": 4.995297714348222e-05,
"loss": 0.6849,
"step": 3310
},
{
"epoch": 1.689907556206145,
"grad_norm": 0.12244871258735657,
"learning_rate": 4.9952835080471596e-05,
"loss": 0.684,
"step": 3320
},
{
"epoch": 1.6949991248866332,
"grad_norm": 0.11466418206691742,
"learning_rate": 4.995269301746097e-05,
"loss": 0.6833,
"step": 3330
},
{
"epoch": 1.7000906935671212,
"grad_norm": 0.12230440229177475,
"learning_rate": 4.995255095445034e-05,
"loss": 0.6831,
"step": 3340
},
{
"epoch": 1.7051822622476094,
"grad_norm": 0.13069362938404083,
"learning_rate": 4.995240889143971e-05,
"loss": 0.6842,
"step": 3350
},
{
"epoch": 1.7102738309280974,
"grad_norm": 0.11203134804964066,
"learning_rate": 4.995226682842908e-05,
"loss": 0.6858,
"step": 3360
},
{
"epoch": 1.7153653996085856,
"grad_norm": 0.1311291605234146,
"learning_rate": 4.9952124765418454e-05,
"loss": 0.6838,
"step": 3370
},
{
"epoch": 1.7204569682890738,
"grad_norm": 0.1232665479183197,
"learning_rate": 4.995198270240783e-05,
"loss": 0.6841,
"step": 3380
},
{
"epoch": 1.725548536969562,
"grad_norm": 0.1445329189300537,
"learning_rate": 4.99518406393972e-05,
"loss": 0.6801,
"step": 3390
},
{
"epoch": 1.7306401056500502,
"grad_norm": 0.1403801292181015,
"learning_rate": 4.9951698576386574e-05,
"loss": 0.6863,
"step": 3400
},
{
"epoch": 1.7357316743305384,
"grad_norm": 0.09926485270261765,
"learning_rate": 4.995155651337595e-05,
"loss": 0.6839,
"step": 3410
},
{
"epoch": 1.7408232430110264,
"grad_norm": 0.10609301924705505,
"learning_rate": 4.995141445036532e-05,
"loss": 0.6851,
"step": 3420
},
{
"epoch": 1.7459148116915146,
"grad_norm": 0.1048160120844841,
"learning_rate": 4.995127238735469e-05,
"loss": 0.6875,
"step": 3430
},
{
"epoch": 1.7510063803720026,
"grad_norm": 0.10706604272127151,
"learning_rate": 4.9951130324344066e-05,
"loss": 0.684,
"step": 3440
},
{
"epoch": 1.7560979490524908,
"grad_norm": 0.1004481241106987,
"learning_rate": 4.995098826133344e-05,
"loss": 0.6847,
"step": 3450
},
{
"epoch": 1.761189517732979,
"grad_norm": 0.13026846945285797,
"learning_rate": 4.9950846198322806e-05,
"loss": 0.6822,
"step": 3460
},
{
"epoch": 1.7662810864134673,
"grad_norm": 0.20907576382160187,
"learning_rate": 4.995070413531218e-05,
"loss": 0.6808,
"step": 3470
},
{
"epoch": 1.7713726550939555,
"grad_norm": 0.14915932714939117,
"learning_rate": 4.995056207230155e-05,
"loss": 0.6803,
"step": 3480
},
{
"epoch": 1.7764642237744435,
"grad_norm": 0.12906627357006073,
"learning_rate": 4.995042000929092e-05,
"loss": 0.6868,
"step": 3490
},
{
"epoch": 1.7815557924549317,
"grad_norm": 0.10379557311534882,
"learning_rate": 4.995027794628029e-05,
"loss": 0.6853,
"step": 3500
},
{
"epoch": 1.7866473611354197,
"grad_norm": 0.10871709883213043,
"learning_rate": 4.9950135883269664e-05,
"loss": 0.6863,
"step": 3510
},
{
"epoch": 1.7917389298159079,
"grad_norm": 0.1513502597808838,
"learning_rate": 4.994999382025904e-05,
"loss": 0.6816,
"step": 3520
},
{
"epoch": 1.796830498496396,
"grad_norm": 0.13802939653396606,
"learning_rate": 4.994985175724841e-05,
"loss": 0.6838,
"step": 3530
},
{
"epoch": 1.8019220671768843,
"grad_norm": 0.13514472544193268,
"learning_rate": 4.9949709694237784e-05,
"loss": 0.6794,
"step": 3540
},
{
"epoch": 1.8070136358573725,
"grad_norm": 0.16484974324703217,
"learning_rate": 4.994956763122716e-05,
"loss": 0.682,
"step": 3550
},
{
"epoch": 1.8121052045378607,
"grad_norm": 0.11142993718385696,
"learning_rate": 4.994942556821653e-05,
"loss": 0.6816,
"step": 3560
},
{
"epoch": 1.8171967732183487,
"grad_norm": 0.14259152114391327,
"learning_rate": 4.99492835052059e-05,
"loss": 0.6781,
"step": 3570
},
{
"epoch": 1.822288341898837,
"grad_norm": 0.15921758115291595,
"learning_rate": 4.9949141442195276e-05,
"loss": 0.6855,
"step": 3580
},
{
"epoch": 1.827379910579325,
"grad_norm": 0.09428475797176361,
"learning_rate": 4.994899937918465e-05,
"loss": 0.6833,
"step": 3590
},
{
"epoch": 1.8324714792598131,
"grad_norm": 0.1155981793999672,
"learning_rate": 4.994885731617402e-05,
"loss": 0.6841,
"step": 3600
},
{
"epoch": 1.8375630479403013,
"grad_norm": 0.10845302045345306,
"learning_rate": 4.994871525316339e-05,
"loss": 0.6846,
"step": 3610
},
{
"epoch": 1.8426546166207896,
"grad_norm": 0.12848089635372162,
"learning_rate": 4.994857319015276e-05,
"loss": 0.682,
"step": 3620
},
{
"epoch": 1.8477461853012778,
"grad_norm": 0.10348972678184509,
"learning_rate": 4.9948431127142135e-05,
"loss": 0.6842,
"step": 3630
},
{
"epoch": 1.852837753981766,
"grad_norm": 0.13845866918563843,
"learning_rate": 4.994828906413151e-05,
"loss": 0.6812,
"step": 3640
},
{
"epoch": 1.857929322662254,
"grad_norm": 0.11816728860139847,
"learning_rate": 4.994814700112088e-05,
"loss": 0.6812,
"step": 3650
},
{
"epoch": 1.8630208913427422,
"grad_norm": 0.13902199268341064,
"learning_rate": 4.9948004938110254e-05,
"loss": 0.6871,
"step": 3660
},
{
"epoch": 1.8681124600232302,
"grad_norm": 0.12729224562644958,
"learning_rate": 4.994786287509962e-05,
"loss": 0.6802,
"step": 3670
},
{
"epoch": 1.8732040287037184,
"grad_norm": 0.14033198356628418,
"learning_rate": 4.9947720812088994e-05,
"loss": 0.6843,
"step": 3680
},
{
"epoch": 1.8782955973842066,
"grad_norm": 0.12836380302906036,
"learning_rate": 4.994757874907837e-05,
"loss": 0.6833,
"step": 3690
},
{
"epoch": 1.8833871660646948,
"grad_norm": 0.1290048211812973,
"learning_rate": 4.994743668606774e-05,
"loss": 0.6832,
"step": 3700
},
{
"epoch": 1.888478734745183,
"grad_norm": 0.1284429430961609,
"learning_rate": 4.994729462305711e-05,
"loss": 0.6843,
"step": 3710
},
{
"epoch": 1.893570303425671,
"grad_norm": 0.13112841546535492,
"learning_rate": 4.9947152560046486e-05,
"loss": 0.6811,
"step": 3720
},
{
"epoch": 1.8986618721061592,
"grad_norm": 0.14525501430034637,
"learning_rate": 4.994701049703586e-05,
"loss": 0.6793,
"step": 3730
},
{
"epoch": 1.9037534407866472,
"grad_norm": 0.1803501546382904,
"learning_rate": 4.994686843402523e-05,
"loss": 0.6791,
"step": 3740
},
{
"epoch": 1.9088450094671354,
"grad_norm": 0.1837460994720459,
"learning_rate": 4.99467263710146e-05,
"loss": 0.6771,
"step": 3750
},
{
"epoch": 1.9139365781476236,
"grad_norm": 0.12087200582027435,
"learning_rate": 4.994658430800397e-05,
"loss": 0.6836,
"step": 3760
},
{
"epoch": 1.9190281468281118,
"grad_norm": 0.1253005713224411,
"learning_rate": 4.9946442244993345e-05,
"loss": 0.6822,
"step": 3770
},
{
"epoch": 1.9241197155086,
"grad_norm": 0.11462333053350449,
"learning_rate": 4.994630018198272e-05,
"loss": 0.684,
"step": 3780
},
{
"epoch": 1.9292112841890883,
"grad_norm": 0.1458183377981186,
"learning_rate": 4.994615811897209e-05,
"loss": 0.6812,
"step": 3790
},
{
"epoch": 1.9343028528695763,
"grad_norm": 0.13514210283756256,
"learning_rate": 4.9946016055961464e-05,
"loss": 0.6883,
"step": 3800
},
{
"epoch": 1.9393944215500645,
"grad_norm": 0.10077164322137833,
"learning_rate": 4.994587399295084e-05,
"loss": 0.6841,
"step": 3810
},
{
"epoch": 1.9444859902305525,
"grad_norm": 0.1145828515291214,
"learning_rate": 4.994573192994021e-05,
"loss": 0.6794,
"step": 3820
},
{
"epoch": 1.9495775589110407,
"grad_norm": 0.12171609699726105,
"learning_rate": 4.994558986692958e-05,
"loss": 0.6801,
"step": 3830
},
{
"epoch": 1.9546691275915289,
"grad_norm": 0.1296948492527008,
"learning_rate": 4.9945447803918956e-05,
"loss": 0.6836,
"step": 3840
},
{
"epoch": 1.959760696272017,
"grad_norm": 0.13795003294944763,
"learning_rate": 4.994530574090833e-05,
"loss": 0.6814,
"step": 3850
},
{
"epoch": 1.9648522649525053,
"grad_norm": 0.10949226468801498,
"learning_rate": 4.99451636778977e-05,
"loss": 0.6851,
"step": 3860
},
{
"epoch": 1.9699438336329935,
"grad_norm": 0.09504687041044235,
"learning_rate": 4.994502161488707e-05,
"loss": 0.6847,
"step": 3870
},
{
"epoch": 1.9750354023134815,
"grad_norm": 0.12004721909761429,
"learning_rate": 4.994487955187644e-05,
"loss": 0.6853,
"step": 3880
},
{
"epoch": 1.9801269709939697,
"grad_norm": 0.15672442317008972,
"learning_rate": 4.994473748886581e-05,
"loss": 0.678,
"step": 3890
},
{
"epoch": 1.9852185396744577,
"grad_norm": 0.1672324538230896,
"learning_rate": 4.994459542585518e-05,
"loss": 0.6801,
"step": 3900
},
{
"epoch": 1.990310108354946,
"grad_norm": 0.13963304460048676,
"learning_rate": 4.9944453362844555e-05,
"loss": 0.6896,
"step": 3910
},
{
"epoch": 1.9954016770354341,
"grad_norm": 0.11426424980163574,
"learning_rate": 4.994431129983393e-05,
"loss": 0.6802,
"step": 3920
},
{
"epoch": 2.0,
"grad_norm": 0.01827167347073555,
"learning_rate": 4.99441692368233e-05,
"loss": 0.6174,
"step": 3930
},
{
"epoch": 2.005091568680488,
"grad_norm": 0.17911553382873535,
"learning_rate": 4.9944027173812674e-05,
"loss": 0.6826,
"step": 3940
},
{
"epoch": 2.0101831373609764,
"grad_norm": 0.11154532432556152,
"learning_rate": 4.994388511080205e-05,
"loss": 0.6843,
"step": 3950
},
{
"epoch": 2.0152747060414646,
"grad_norm": 0.09386030584573746,
"learning_rate": 4.994374304779142e-05,
"loss": 0.686,
"step": 3960
},
{
"epoch": 2.0203662747219524,
"grad_norm": 0.09608808904886246,
"learning_rate": 4.994360098478079e-05,
"loss": 0.6791,
"step": 3970
},
{
"epoch": 2.0254578434024406,
"grad_norm": 0.12537717819213867,
"learning_rate": 4.9943458921770166e-05,
"loss": 0.6842,
"step": 3980
},
{
"epoch": 2.030549412082929,
"grad_norm": 0.09800703823566437,
"learning_rate": 4.994331685875954e-05,
"loss": 0.6859,
"step": 3990
},
{
"epoch": 2.035640980763417,
"grad_norm": 0.07934601604938507,
"learning_rate": 4.994317479574891e-05,
"loss": 0.6846,
"step": 4000
},
{
"epoch": 2.0407325494439053,
"grad_norm": 0.10269072651863098,
"learning_rate": 4.994303273273828e-05,
"loss": 0.6852,
"step": 4010
},
{
"epoch": 2.0458241181243935,
"grad_norm": 0.09138213843107224,
"learning_rate": 4.994289066972765e-05,
"loss": 0.682,
"step": 4020
},
{
"epoch": 2.0509156868048817,
"grad_norm": 0.1062936782836914,
"learning_rate": 4.9942748606717025e-05,
"loss": 0.6863,
"step": 4030
},
{
"epoch": 2.05600725548537,
"grad_norm": 0.13446182012557983,
"learning_rate": 4.99426065437064e-05,
"loss": 0.68,
"step": 4040
},
{
"epoch": 2.0610988241658577,
"grad_norm": 0.1352904587984085,
"learning_rate": 4.994246448069577e-05,
"loss": 0.68,
"step": 4050
},
{
"epoch": 2.066190392846346,
"grad_norm": 0.14259877800941467,
"learning_rate": 4.9942322417685144e-05,
"loss": 0.684,
"step": 4060
},
{
"epoch": 2.071281961526834,
"grad_norm": 0.1194225326180458,
"learning_rate": 4.994218035467452e-05,
"loss": 0.6845,
"step": 4070
},
{
"epoch": 2.0763735302073223,
"grad_norm": 0.10988787561655045,
"learning_rate": 4.994203829166389e-05,
"loss": 0.6834,
"step": 4080
},
{
"epoch": 2.0814650988878105,
"grad_norm": 0.10374101996421814,
"learning_rate": 4.994189622865326e-05,
"loss": 0.6833,
"step": 4090
},
{
"epoch": 2.0865566675682987,
"grad_norm": 0.11888198554515839,
"learning_rate": 4.994175416564263e-05,
"loss": 0.683,
"step": 4100
},
{
"epoch": 2.091648236248787,
"grad_norm": 0.11808530986309052,
"learning_rate": 4.9941612102632e-05,
"loss": 0.6813,
"step": 4110
},
{
"epoch": 2.096739804929275,
"grad_norm": 0.12874440848827362,
"learning_rate": 4.9941470039621376e-05,
"loss": 0.6809,
"step": 4120
},
{
"epoch": 2.101831373609763,
"grad_norm": 0.1372908353805542,
"learning_rate": 4.994132797661075e-05,
"loss": 0.6793,
"step": 4130
},
{
"epoch": 2.106922942290251,
"grad_norm": 0.15299095213413239,
"learning_rate": 4.994118591360012e-05,
"loss": 0.6785,
"step": 4140
},
{
"epoch": 2.1120145109707393,
"grad_norm": 0.1464032679796219,
"learning_rate": 4.994104385058949e-05,
"loss": 0.6804,
"step": 4150
},
{
"epoch": 2.1171060796512275,
"grad_norm": 0.10995624214410782,
"learning_rate": 4.994090178757886e-05,
"loss": 0.6854,
"step": 4160
},
{
"epoch": 2.1221976483317158,
"grad_norm": 0.1125839501619339,
"learning_rate": 4.9940759724568235e-05,
"loss": 0.6802,
"step": 4170
},
{
"epoch": 2.127289217012204,
"grad_norm": 0.15469452738761902,
"learning_rate": 4.994061766155761e-05,
"loss": 0.6805,
"step": 4180
},
{
"epoch": 2.132380785692692,
"grad_norm": 0.15448547899723053,
"learning_rate": 4.994047559854698e-05,
"loss": 0.6826,
"step": 4190
},
{
"epoch": 2.1374723543731804,
"grad_norm": 0.12687282264232635,
"learning_rate": 4.9940333535536354e-05,
"loss": 0.681,
"step": 4200
},
{
"epoch": 2.142563923053668,
"grad_norm": 0.13330869376659393,
"learning_rate": 4.994019147252573e-05,
"loss": 0.6811,
"step": 4210
},
{
"epoch": 2.1476554917341564,
"grad_norm": 0.13483920693397522,
"learning_rate": 4.99400494095151e-05,
"loss": 0.6834,
"step": 4220
},
{
"epoch": 2.1527470604146446,
"grad_norm": 0.08532749861478806,
"learning_rate": 4.9939907346504474e-05,
"loss": 0.6867,
"step": 4230
},
{
"epoch": 2.157838629095133,
"grad_norm": 0.12028615176677704,
"learning_rate": 4.993976528349385e-05,
"loss": 0.6849,
"step": 4240
},
{
"epoch": 2.162930197775621,
"grad_norm": 0.10255931317806244,
"learning_rate": 4.993962322048322e-05,
"loss": 0.6816,
"step": 4250
},
{
"epoch": 2.1680217664561092,
"grad_norm": 0.16485556960105896,
"learning_rate": 4.9939481157472586e-05,
"loss": 0.6791,
"step": 4260
},
{
"epoch": 2.1731133351365974,
"grad_norm": 0.1411302089691162,
"learning_rate": 4.993933909446196e-05,
"loss": 0.6788,
"step": 4270
},
{
"epoch": 2.178204903817085,
"grad_norm": 0.18721655011177063,
"learning_rate": 4.993919703145133e-05,
"loss": 0.6854,
"step": 4280
},
{
"epoch": 2.1832964724975734,
"grad_norm": 0.0997004359960556,
"learning_rate": 4.9939054968440706e-05,
"loss": 0.6842,
"step": 4290
},
{
"epoch": 2.1883880411780616,
"grad_norm": 0.11703092604875565,
"learning_rate": 4.993891290543007e-05,
"loss": 0.68,
"step": 4300
},
{
"epoch": 2.19347960985855,
"grad_norm": 0.13729970157146454,
"learning_rate": 4.9938770842419445e-05,
"loss": 0.6832,
"step": 4310
},
{
"epoch": 2.198571178539038,
"grad_norm": 0.12172706425189972,
"learning_rate": 4.993862877940882e-05,
"loss": 0.6827,
"step": 4320
},
{
"epoch": 2.2036627472195263,
"grad_norm": 0.12669777870178223,
"learning_rate": 4.993848671639819e-05,
"loss": 0.6829,
"step": 4330
},
{
"epoch": 2.2087543159000145,
"grad_norm": 0.13186220824718475,
"learning_rate": 4.9938344653387564e-05,
"loss": 0.6824,
"step": 4340
},
{
"epoch": 2.2138458845805022,
"grad_norm": 0.13194870948791504,
"learning_rate": 4.993820259037694e-05,
"loss": 0.6803,
"step": 4350
},
{
"epoch": 2.2189374532609905,
"grad_norm": 0.14057835936546326,
"learning_rate": 4.993806052736631e-05,
"loss": 0.68,
"step": 4360
},
{
"epoch": 2.2240290219414787,
"grad_norm": 0.12043063342571259,
"learning_rate": 4.9937918464355684e-05,
"loss": 0.6849,
"step": 4370
},
{
"epoch": 2.229120590621967,
"grad_norm": 0.11859495937824249,
"learning_rate": 4.993777640134506e-05,
"loss": 0.6831,
"step": 4380
},
{
"epoch": 2.234212159302455,
"grad_norm": 0.12299305200576782,
"learning_rate": 4.993763433833443e-05,
"loss": 0.6803,
"step": 4390
},
{
"epoch": 2.2393037279829433,
"grad_norm": 0.12101336568593979,
"learning_rate": 4.9937492275323796e-05,
"loss": 0.6812,
"step": 4400
},
{
"epoch": 2.2443952966634315,
"grad_norm": 0.10430170595645905,
"learning_rate": 4.993735021231317e-05,
"loss": 0.6866,
"step": 4410
},
{
"epoch": 2.2494868653439197,
"grad_norm": 0.08973786234855652,
"learning_rate": 4.993720814930254e-05,
"loss": 0.6861,
"step": 4420
},
{
"epoch": 2.254578434024408,
"grad_norm": 0.09560893476009369,
"learning_rate": 4.9937066086291916e-05,
"loss": 0.6821,
"step": 4430
},
{
"epoch": 2.2596700027048957,
"grad_norm": 0.12744377553462982,
"learning_rate": 4.993692402328129e-05,
"loss": 0.6861,
"step": 4440
},
{
"epoch": 2.264761571385384,
"grad_norm": 0.09390248358249664,
"learning_rate": 4.993678196027066e-05,
"loss": 0.6837,
"step": 4450
},
{
"epoch": 2.269853140065872,
"grad_norm": 0.10652091354131699,
"learning_rate": 4.9936639897260035e-05,
"loss": 0.6821,
"step": 4460
},
{
"epoch": 2.2749447087463603,
"grad_norm": 0.14594070613384247,
"learning_rate": 4.993649783424941e-05,
"loss": 0.6831,
"step": 4470
},
{
"epoch": 2.2800362774268486,
"grad_norm": 0.11480095237493515,
"learning_rate": 4.993635577123878e-05,
"loss": 0.676,
"step": 4480
},
{
"epoch": 2.2851278461073368,
"grad_norm": 0.15268968045711517,
"learning_rate": 4.9936213708228154e-05,
"loss": 0.6802,
"step": 4490
},
{
"epoch": 2.290219414787825,
"grad_norm": 0.12245162576436996,
"learning_rate": 4.993607164521753e-05,
"loss": 0.6816,
"step": 4500
},
{
"epoch": 2.2953109834683127,
"grad_norm": 0.09671120345592499,
"learning_rate": 4.9935929582206894e-05,
"loss": 0.6808,
"step": 4510
},
{
"epoch": 2.300402552148801,
"grad_norm": 0.10151444375514984,
"learning_rate": 4.993578751919627e-05,
"loss": 0.6862,
"step": 4520
},
{
"epoch": 2.305494120829289,
"grad_norm": 0.10020536184310913,
"learning_rate": 4.993564545618564e-05,
"loss": 0.6809,
"step": 4530
},
{
"epoch": 2.3105856895097774,
"grad_norm": 0.20587410032749176,
"learning_rate": 4.9935503393175006e-05,
"loss": 0.6839,
"step": 4540
},
{
"epoch": 2.3156772581902656,
"grad_norm": 0.12846329808235168,
"learning_rate": 4.993536133016438e-05,
"loss": 0.6814,
"step": 4550
},
{
"epoch": 2.320768826870754,
"grad_norm": 0.12553255259990692,
"learning_rate": 4.993521926715375e-05,
"loss": 0.6828,
"step": 4560
},
{
"epoch": 2.325860395551242,
"grad_norm": 0.11741780489683151,
"learning_rate": 4.9935077204143125e-05,
"loss": 0.6855,
"step": 4570
},
{
"epoch": 2.33095196423173,
"grad_norm": 0.09674712270498276,
"learning_rate": 4.99349351411325e-05,
"loss": 0.6813,
"step": 4580
},
{
"epoch": 2.336043532912218,
"grad_norm": 0.11124306917190552,
"learning_rate": 4.993479307812187e-05,
"loss": 0.6763,
"step": 4590
},
{
"epoch": 2.341135101592706,
"grad_norm": 0.1364033818244934,
"learning_rate": 4.9934651015111245e-05,
"loss": 0.6798,
"step": 4600
},
{
"epoch": 2.3462266702731944,
"grad_norm": 0.14521688222885132,
"learning_rate": 4.993450895210062e-05,
"loss": 0.6824,
"step": 4610
},
{
"epoch": 2.3513182389536826,
"grad_norm": 0.10061439126729965,
"learning_rate": 4.993436688908999e-05,
"loss": 0.6841,
"step": 4620
},
{
"epoch": 2.356409807634171,
"grad_norm": 0.09391237050294876,
"learning_rate": 4.9934224826079364e-05,
"loss": 0.6827,
"step": 4630
},
{
"epoch": 2.361501376314659,
"grad_norm": 0.12690366804599762,
"learning_rate": 4.993408276306874e-05,
"loss": 0.6798,
"step": 4640
},
{
"epoch": 2.3665929449951473,
"grad_norm": 0.11659922450780869,
"learning_rate": 4.993394070005811e-05,
"loss": 0.6805,
"step": 4650
},
{
"epoch": 2.3716845136756355,
"grad_norm": 0.1206756979227066,
"learning_rate": 4.993379863704748e-05,
"loss": 0.6792,
"step": 4660
},
{
"epoch": 2.3767760823561233,
"grad_norm": 0.11938859522342682,
"learning_rate": 4.993365657403685e-05,
"loss": 0.6837,
"step": 4670
},
{
"epoch": 2.3818676510366115,
"grad_norm": 0.10022424161434174,
"learning_rate": 4.993351451102622e-05,
"loss": 0.6807,
"step": 4680
},
{
"epoch": 2.3869592197170997,
"grad_norm": 0.14838755130767822,
"learning_rate": 4.9933372448015596e-05,
"loss": 0.6795,
"step": 4690
},
{
"epoch": 2.392050788397588,
"grad_norm": 0.131904736161232,
"learning_rate": 4.993323038500497e-05,
"loss": 0.6855,
"step": 4700
},
{
"epoch": 2.397142357078076,
"grad_norm": 0.1132061704993248,
"learning_rate": 4.993308832199434e-05,
"loss": 0.6792,
"step": 4710
},
{
"epoch": 2.4022339257585643,
"grad_norm": 0.10466153919696808,
"learning_rate": 4.993294625898371e-05,
"loss": 0.6856,
"step": 4720
},
{
"epoch": 2.4073254944390525,
"grad_norm": 0.108913853764534,
"learning_rate": 4.993280419597308e-05,
"loss": 0.6787,
"step": 4730
},
{
"epoch": 2.4124170631195403,
"grad_norm": 0.12613457441329956,
"learning_rate": 4.9932662132962455e-05,
"loss": 0.6804,
"step": 4740
},
{
"epoch": 2.4175086318000285,
"grad_norm": 0.11993265151977539,
"learning_rate": 4.993252006995183e-05,
"loss": 0.6809,
"step": 4750
},
{
"epoch": 2.4226002004805167,
"grad_norm": 0.13760647177696228,
"learning_rate": 4.99323780069412e-05,
"loss": 0.6762,
"step": 4760
},
{
"epoch": 2.427691769161005,
"grad_norm": 0.15461039543151855,
"learning_rate": 4.9932235943930574e-05,
"loss": 0.6808,
"step": 4770
},
{
"epoch": 2.432783337841493,
"grad_norm": 0.11814858764410019,
"learning_rate": 4.993209388091995e-05,
"loss": 0.6807,
"step": 4780
},
{
"epoch": 2.4378749065219814,
"grad_norm": 0.12167418003082275,
"learning_rate": 4.993195181790932e-05,
"loss": 0.6838,
"step": 4790
},
{
"epoch": 2.4429664752024696,
"grad_norm": 0.13912709057331085,
"learning_rate": 4.9931809754898687e-05,
"loss": 0.6814,
"step": 4800
},
{
"epoch": 2.4480580438829573,
"grad_norm": 0.1079849898815155,
"learning_rate": 4.993166769188806e-05,
"loss": 0.6802,
"step": 4810
},
{
"epoch": 2.4531496125634455,
"grad_norm": 0.1483919620513916,
"learning_rate": 4.993152562887743e-05,
"loss": 0.6809,
"step": 4820
},
{
"epoch": 2.4582411812439338,
"grad_norm": 0.1411130726337433,
"learning_rate": 4.9931383565866806e-05,
"loss": 0.6819,
"step": 4830
},
{
"epoch": 2.463332749924422,
"grad_norm": 0.13872161507606506,
"learning_rate": 4.993124150285618e-05,
"loss": 0.6814,
"step": 4840
},
{
"epoch": 2.46842431860491,
"grad_norm": 0.13207204639911652,
"learning_rate": 4.993109943984555e-05,
"loss": 0.6819,
"step": 4850
},
{
"epoch": 2.4735158872853984,
"grad_norm": 0.13904866576194763,
"learning_rate": 4.9930957376834925e-05,
"loss": 0.6799,
"step": 4860
},
{
"epoch": 2.4786074559658866,
"grad_norm": 0.10088212043046951,
"learning_rate": 4.99308153138243e-05,
"loss": 0.6849,
"step": 4870
},
{
"epoch": 2.483699024646375,
"grad_norm": 0.15108828246593475,
"learning_rate": 4.993067325081367e-05,
"loss": 0.6824,
"step": 4880
},
{
"epoch": 2.488790593326863,
"grad_norm": 0.11093771457672119,
"learning_rate": 4.9930531187803044e-05,
"loss": 0.6848,
"step": 4890
},
{
"epoch": 2.493882162007351,
"grad_norm": 0.10378114134073257,
"learning_rate": 4.993038912479242e-05,
"loss": 0.6824,
"step": 4900
},
{
"epoch": 2.498973730687839,
"grad_norm": 0.1797563135623932,
"learning_rate": 4.993024706178179e-05,
"loss": 0.6805,
"step": 4910
},
{
"epoch": 2.5040652993683272,
"grad_norm": 0.13369685411453247,
"learning_rate": 4.993010499877116e-05,
"loss": 0.6798,
"step": 4920
},
{
"epoch": 2.5091568680488154,
"grad_norm": 0.11391709744930267,
"learning_rate": 4.992996293576053e-05,
"loss": 0.6769,
"step": 4930
},
{
"epoch": 2.5142484367293036,
"grad_norm": 0.15841761231422424,
"learning_rate": 4.9929820872749896e-05,
"loss": 0.6813,
"step": 4940
},
{
"epoch": 2.519340005409792,
"grad_norm": 0.1152459904551506,
"learning_rate": 4.992967880973927e-05,
"loss": 0.6825,
"step": 4950
},
{
"epoch": 2.52443157409028,
"grad_norm": 0.1523844301700592,
"learning_rate": 4.992953674672864e-05,
"loss": 0.6795,
"step": 4960
},
{
"epoch": 2.529523142770768,
"grad_norm": 0.15071742236614227,
"learning_rate": 4.9929394683718016e-05,
"loss": 0.6778,
"step": 4970
},
{
"epoch": 2.534614711451256,
"grad_norm": 0.0915883481502533,
"learning_rate": 4.992925262070739e-05,
"loss": 0.689,
"step": 4980
},
{
"epoch": 2.5397062801317443,
"grad_norm": 0.08719677478075027,
"learning_rate": 4.992911055769676e-05,
"loss": 0.6831,
"step": 4990
},
{
"epoch": 2.5447978488122325,
"grad_norm": 0.10521717369556427,
"learning_rate": 4.9928968494686135e-05,
"loss": 0.6838,
"step": 5000
},
{
"epoch": 2.5498894174927207,
"grad_norm": 0.14673079550266266,
"learning_rate": 4.992882643167551e-05,
"loss": 0.6777,
"step": 5010
},
{
"epoch": 2.554980986173209,
"grad_norm": 0.1252555549144745,
"learning_rate": 4.992868436866488e-05,
"loss": 0.6774,
"step": 5020
},
{
"epoch": 2.560072554853697,
"grad_norm": 0.17313307523727417,
"learning_rate": 4.9928542305654254e-05,
"loss": 0.6846,
"step": 5030
},
{
"epoch": 2.565164123534185,
"grad_norm": 0.12619802355766296,
"learning_rate": 4.992840024264363e-05,
"loss": 0.6827,
"step": 5040
},
{
"epoch": 2.5702556922146735,
"grad_norm": 0.11647044122219086,
"learning_rate": 4.9928258179633e-05,
"loss": 0.6779,
"step": 5050
},
{
"epoch": 2.5753472608951613,
"grad_norm": 0.11227191984653473,
"learning_rate": 4.992811611662237e-05,
"loss": 0.6767,
"step": 5060
},
{
"epoch": 2.5804388295756495,
"grad_norm": 0.12041344493627548,
"learning_rate": 4.992797405361174e-05,
"loss": 0.6784,
"step": 5070
},
{
"epoch": 2.5855303982561377,
"grad_norm": 0.14506416022777557,
"learning_rate": 4.992783199060111e-05,
"loss": 0.6798,
"step": 5080
},
{
"epoch": 2.590621966936626,
"grad_norm": 0.10675019025802612,
"learning_rate": 4.9927689927590486e-05,
"loss": 0.684,
"step": 5090
},
{
"epoch": 2.595713535617114,
"grad_norm": 0.09595705568790436,
"learning_rate": 4.992754786457986e-05,
"loss": 0.68,
"step": 5100
},
{
"epoch": 2.600805104297602,
"grad_norm": 0.12361190468072891,
"learning_rate": 4.992740580156923e-05,
"loss": 0.6813,
"step": 5110
},
{
"epoch": 2.6058966729780906,
"grad_norm": 0.14116083085536957,
"learning_rate": 4.9927263738558606e-05,
"loss": 0.6791,
"step": 5120
},
{
"epoch": 2.6109882416585783,
"grad_norm": 0.14521893858909607,
"learning_rate": 4.992712167554798e-05,
"loss": 0.6841,
"step": 5130
},
{
"epoch": 2.6160798103390666,
"grad_norm": 0.08931027352809906,
"learning_rate": 4.9926979612537345e-05,
"loss": 0.6839,
"step": 5140
},
{
"epoch": 2.6211713790195548,
"grad_norm": 0.15768922865390778,
"learning_rate": 4.992683754952672e-05,
"loss": 0.6837,
"step": 5150
},
{
"epoch": 2.626262947700043,
"grad_norm": 0.11857085675001144,
"learning_rate": 4.992669548651609e-05,
"loss": 0.6791,
"step": 5160
},
{
"epoch": 2.631354516380531,
"grad_norm": 0.12832790613174438,
"learning_rate": 4.9926553423505464e-05,
"loss": 0.6789,
"step": 5170
},
{
"epoch": 2.6364460850610194,
"grad_norm": 0.1246199905872345,
"learning_rate": 4.992641136049484e-05,
"loss": 0.6834,
"step": 5180
},
{
"epoch": 2.6415376537415076,
"grad_norm": 0.10562731325626373,
"learning_rate": 4.992626929748421e-05,
"loss": 0.681,
"step": 5190
},
{
"epoch": 2.6466292224219954,
"grad_norm": 0.1098145917057991,
"learning_rate": 4.992612723447358e-05,
"loss": 0.68,
"step": 5200
},
{
"epoch": 2.6517207911024836,
"grad_norm": 0.1007496640086174,
"learning_rate": 4.992598517146295e-05,
"loss": 0.6835,
"step": 5210
},
{
"epoch": 2.656812359782972,
"grad_norm": 0.16250421106815338,
"learning_rate": 4.992584310845232e-05,
"loss": 0.6781,
"step": 5220
},
{
"epoch": 2.66190392846346,
"grad_norm": 0.1358012706041336,
"learning_rate": 4.9925701045441696e-05,
"loss": 0.6838,
"step": 5230
},
{
"epoch": 2.6669954971439482,
"grad_norm": 0.10603620857000351,
"learning_rate": 4.992555898243107e-05,
"loss": 0.6847,
"step": 5240
},
{
"epoch": 2.6720870658244364,
"grad_norm": 0.12339074909687042,
"learning_rate": 4.992541691942044e-05,
"loss": 0.6809,
"step": 5250
},
{
"epoch": 2.6771786345049247,
"grad_norm": 0.13252249360084534,
"learning_rate": 4.9925274856409816e-05,
"loss": 0.6809,
"step": 5260
},
{
"epoch": 2.6822702031854124,
"grad_norm": 0.12156182527542114,
"learning_rate": 4.992513279339919e-05,
"loss": 0.6803,
"step": 5270
},
{
"epoch": 2.687361771865901,
"grad_norm": 0.1240081861615181,
"learning_rate": 4.992499073038856e-05,
"loss": 0.6828,
"step": 5280
},
{
"epoch": 2.692453340546389,
"grad_norm": 0.1085842102766037,
"learning_rate": 4.9924848667377935e-05,
"loss": 0.6809,
"step": 5290
},
{
"epoch": 2.697544909226877,
"grad_norm": 0.10199875384569168,
"learning_rate": 4.992470660436731e-05,
"loss": 0.6799,
"step": 5300
},
{
"epoch": 2.7026364779073653,
"grad_norm": 0.10421440750360489,
"learning_rate": 4.9924564541356674e-05,
"loss": 0.6774,
"step": 5310
},
{
"epoch": 2.7077280465878535,
"grad_norm": 0.11737542599439621,
"learning_rate": 4.992442247834605e-05,
"loss": 0.6866,
"step": 5320
},
{
"epoch": 2.7128196152683417,
"grad_norm": 0.1116197407245636,
"learning_rate": 4.992428041533542e-05,
"loss": 0.6826,
"step": 5330
},
{
"epoch": 2.7179111839488295,
"grad_norm": 0.07906144112348557,
"learning_rate": 4.9924138352324794e-05,
"loss": 0.6833,
"step": 5340
},
{
"epoch": 2.723002752629318,
"grad_norm": 0.09525004774332047,
"learning_rate": 4.992399628931416e-05,
"loss": 0.6846,
"step": 5350
},
{
"epoch": 2.728094321309806,
"grad_norm": 0.10529020428657532,
"learning_rate": 4.992385422630353e-05,
"loss": 0.6805,
"step": 5360
},
{
"epoch": 2.733185889990294,
"grad_norm": 0.1130564957857132,
"learning_rate": 4.9923712163292906e-05,
"loss": 0.6834,
"step": 5370
},
{
"epoch": 2.7382774586707823,
"grad_norm": 0.1169043555855751,
"learning_rate": 4.992357010028228e-05,
"loss": 0.6791,
"step": 5380
},
{
"epoch": 2.7433690273512705,
"grad_norm": 0.10529076308012009,
"learning_rate": 4.992342803727165e-05,
"loss": 0.6807,
"step": 5390
},
{
"epoch": 2.7484605960317587,
"grad_norm": 0.11143583059310913,
"learning_rate": 4.9923285974261025e-05,
"loss": 0.6809,
"step": 5400
},
{
"epoch": 2.753552164712247,
"grad_norm": 0.12018362432718277,
"learning_rate": 4.99231439112504e-05,
"loss": 0.6803,
"step": 5410
},
{
"epoch": 2.758643733392735,
"grad_norm": 0.10221763700246811,
"learning_rate": 4.992300184823977e-05,
"loss": 0.6844,
"step": 5420
},
{
"epoch": 2.763735302073223,
"grad_norm": 0.12819118797779083,
"learning_rate": 4.9922859785229145e-05,
"loss": 0.6802,
"step": 5430
},
{
"epoch": 2.768826870753711,
"grad_norm": 0.11218137294054031,
"learning_rate": 4.992271772221852e-05,
"loss": 0.6816,
"step": 5440
},
{
"epoch": 2.7739184394341994,
"grad_norm": 0.1787531077861786,
"learning_rate": 4.9922575659207884e-05,
"loss": 0.6815,
"step": 5450
},
{
"epoch": 2.7790100081146876,
"grad_norm": 0.10983338207006454,
"learning_rate": 4.992243359619726e-05,
"loss": 0.6777,
"step": 5460
},
{
"epoch": 2.784101576795176,
"grad_norm": 0.12096842378377914,
"learning_rate": 4.992229153318663e-05,
"loss": 0.6793,
"step": 5470
},
{
"epoch": 2.789193145475664,
"grad_norm": 0.1177634447813034,
"learning_rate": 4.9922149470176004e-05,
"loss": 0.6845,
"step": 5480
},
{
"epoch": 2.794284714156152,
"grad_norm": 0.09383808076381683,
"learning_rate": 4.992200740716538e-05,
"loss": 0.6816,
"step": 5490
},
{
"epoch": 2.79937628283664,
"grad_norm": 0.11048846691846848,
"learning_rate": 4.992186534415475e-05,
"loss": 0.6812,
"step": 5500
},
{
"epoch": 2.8044678515171286,
"grad_norm": 0.11928955465555191,
"learning_rate": 4.992172328114412e-05,
"loss": 0.6855,
"step": 5510
},
{
"epoch": 2.8095594201976164,
"grad_norm": 0.10312807559967041,
"learning_rate": 4.9921581218133496e-05,
"loss": 0.6834,
"step": 5520
},
{
"epoch": 2.8146509888781046,
"grad_norm": 0.14140763878822327,
"learning_rate": 4.992143915512287e-05,
"loss": 0.6826,
"step": 5530
},
{
"epoch": 2.819742557558593,
"grad_norm": 0.12414680421352386,
"learning_rate": 4.992129709211224e-05,
"loss": 0.6778,
"step": 5540
},
{
"epoch": 2.824834126239081,
"grad_norm": 0.18568123877048492,
"learning_rate": 4.9921155029101615e-05,
"loss": 0.6834,
"step": 5550
},
{
"epoch": 2.8299256949195692,
"grad_norm": 0.09774978458881378,
"learning_rate": 4.992101296609098e-05,
"loss": 0.6826,
"step": 5560
},
{
"epoch": 2.835017263600057,
"grad_norm": 0.11985506862401962,
"learning_rate": 4.9920870903080355e-05,
"loss": 0.6865,
"step": 5570
},
{
"epoch": 2.8401088322805457,
"grad_norm": 0.09641832858324051,
"learning_rate": 4.992072884006973e-05,
"loss": 0.6821,
"step": 5580
},
{
"epoch": 2.8452004009610334,
"grad_norm": 0.10907211899757385,
"learning_rate": 4.9920586777059094e-05,
"loss": 0.6847,
"step": 5590
},
{
"epoch": 2.8502919696415216,
"grad_norm": 0.11031023412942886,
"learning_rate": 4.992044471404847e-05,
"loss": 0.682,
"step": 5600
},
{
"epoch": 2.85538353832201,
"grad_norm": 0.12869343161582947,
"learning_rate": 4.992030265103784e-05,
"loss": 0.683,
"step": 5610
},
{
"epoch": 2.860475107002498,
"grad_norm": 0.114951953291893,
"learning_rate": 4.9920160588027213e-05,
"loss": 0.6801,
"step": 5620
},
{
"epoch": 2.8655666756829863,
"grad_norm": 0.12400404363870621,
"learning_rate": 4.9920018525016587e-05,
"loss": 0.685,
"step": 5630
},
{
"epoch": 2.8706582443634745,
"grad_norm": 0.11837892979383469,
"learning_rate": 4.991987646200596e-05,
"loss": 0.6826,
"step": 5640
},
{
"epoch": 2.8757498130439627,
"grad_norm": 0.16485707461833954,
"learning_rate": 4.991973439899533e-05,
"loss": 0.6798,
"step": 5650
},
{
"epoch": 2.8808413817244505,
"grad_norm": 0.1649584323167801,
"learning_rate": 4.9919592335984706e-05,
"loss": 0.6846,
"step": 5660
},
{
"epoch": 2.8859329504049387,
"grad_norm": 0.09823145717382431,
"learning_rate": 4.991945027297408e-05,
"loss": 0.6825,
"step": 5670
},
{
"epoch": 2.891024519085427,
"grad_norm": 0.10554816573858261,
"learning_rate": 4.991930820996345e-05,
"loss": 0.6833,
"step": 5680
},
{
"epoch": 2.896116087765915,
"grad_norm": 0.09985250979661942,
"learning_rate": 4.9919166146952825e-05,
"loss": 0.685,
"step": 5690
},
{
"epoch": 2.9012076564464033,
"grad_norm": 0.1473183035850525,
"learning_rate": 4.99190240839422e-05,
"loss": 0.6773,
"step": 5700
},
{
"epoch": 2.9062992251268915,
"grad_norm": 0.1321994662284851,
"learning_rate": 4.9918882020931565e-05,
"loss": 0.6851,
"step": 5710
},
{
"epoch": 2.9113907938073798,
"grad_norm": 0.11778974533081055,
"learning_rate": 4.991873995792094e-05,
"loss": 0.6777,
"step": 5720
},
{
"epoch": 2.9164823624878675,
"grad_norm": 0.12440946698188782,
"learning_rate": 4.991859789491031e-05,
"loss": 0.6866,
"step": 5730
},
{
"epoch": 2.921573931168356,
"grad_norm": 0.14024010300636292,
"learning_rate": 4.9918455831899684e-05,
"loss": 0.6786,
"step": 5740
},
{
"epoch": 2.926665499848844,
"grad_norm": 0.1383139193058014,
"learning_rate": 4.991831376888906e-05,
"loss": 0.6843,
"step": 5750
},
{
"epoch": 2.931757068529332,
"grad_norm": 0.16354554891586304,
"learning_rate": 4.991817170587843e-05,
"loss": 0.6788,
"step": 5760
},
{
"epoch": 2.9368486372098204,
"grad_norm": 0.12788814306259155,
"learning_rate": 4.9918029642867796e-05,
"loss": 0.6803,
"step": 5770
},
{
"epoch": 2.9419402058903086,
"grad_norm": 0.1226319745182991,
"learning_rate": 4.991788757985717e-05,
"loss": 0.6833,
"step": 5780
},
{
"epoch": 2.947031774570797,
"grad_norm": 0.12122051417827606,
"learning_rate": 4.991774551684654e-05,
"loss": 0.6827,
"step": 5790
},
{
"epoch": 2.9521233432512846,
"grad_norm": 0.12066159397363663,
"learning_rate": 4.9917603453835916e-05,
"loss": 0.6812,
"step": 5800
},
{
"epoch": 2.957214911931773,
"grad_norm": 0.12547747790813446,
"learning_rate": 4.991746139082529e-05,
"loss": 0.6801,
"step": 5810
},
{
"epoch": 2.962306480612261,
"grad_norm": 0.1140349805355072,
"learning_rate": 4.991731932781466e-05,
"loss": 0.6833,
"step": 5820
},
{
"epoch": 2.967398049292749,
"grad_norm": 0.14640016853809357,
"learning_rate": 4.9917177264804035e-05,
"loss": 0.6806,
"step": 5830
},
{
"epoch": 2.9724896179732374,
"grad_norm": 0.1226801946759224,
"learning_rate": 4.991703520179341e-05,
"loss": 0.6816,
"step": 5840
},
{
"epoch": 2.9775811866537256,
"grad_norm": 0.1280628740787506,
"learning_rate": 4.9916893138782775e-05,
"loss": 0.6814,
"step": 5850
},
{
"epoch": 2.982672755334214,
"grad_norm": 0.13127422332763672,
"learning_rate": 4.991675107577215e-05,
"loss": 0.683,
"step": 5860
},
{
"epoch": 2.987764324014702,
"grad_norm": 0.10727940499782562,
"learning_rate": 4.991660901276152e-05,
"loss": 0.6824,
"step": 5870
},
{
"epoch": 2.9928558926951903,
"grad_norm": 0.13203033804893494,
"learning_rate": 4.9916466949750894e-05,
"loss": 0.6825,
"step": 5880
},
{
"epoch": 2.997947461375678,
"grad_norm": 0.1205354556441307,
"learning_rate": 4.991632488674027e-05,
"loss": 0.6868,
"step": 5890
},
{
"epoch": 3.002545784340244,
"grad_norm": 0.1364830732345581,
"learning_rate": 4.991618282372964e-05,
"loss": 0.6114,
"step": 5900
},
{
"epoch": 3.007637353020732,
"grad_norm": 0.1269853115081787,
"learning_rate": 4.991604076071901e-05,
"loss": 0.6826,
"step": 5910
},
{
"epoch": 3.0127289217012203,
"grad_norm": 0.1348942220211029,
"learning_rate": 4.9915898697708386e-05,
"loss": 0.6837,
"step": 5920
},
{
"epoch": 3.0178204903817085,
"grad_norm": 0.13320055603981018,
"learning_rate": 4.991575663469776e-05,
"loss": 0.6775,
"step": 5930
},
{
"epoch": 3.0229120590621967,
"grad_norm": 0.11422030627727509,
"learning_rate": 4.991561457168713e-05,
"loss": 0.6801,
"step": 5940
},
{
"epoch": 3.028003627742685,
"grad_norm": 0.10496284067630768,
"learning_rate": 4.9915472508676506e-05,
"loss": 0.6809,
"step": 5950
},
{
"epoch": 3.033095196423173,
"grad_norm": 0.10586734861135483,
"learning_rate": 4.991533044566588e-05,
"loss": 0.6775,
"step": 5960
},
{
"epoch": 3.0381867651036614,
"grad_norm": 0.13202211260795593,
"learning_rate": 4.9915188382655245e-05,
"loss": 0.6777,
"step": 5970
},
{
"epoch": 3.043278333784149,
"grad_norm": 0.13048899173736572,
"learning_rate": 4.991504631964462e-05,
"loss": 0.6779,
"step": 5980
},
{
"epoch": 3.0483699024646373,
"grad_norm": 0.12446481734514236,
"learning_rate": 4.9914904256633984e-05,
"loss": 0.6763,
"step": 5990
},
{
"epoch": 3.0534614711451256,
"grad_norm": 0.10315615683794022,
"learning_rate": 4.991476219362336e-05,
"loss": 0.6877,
"step": 6000
},
{
"epoch": 3.0585530398256138,
"grad_norm": 0.11032961308956146,
"learning_rate": 4.991462013061273e-05,
"loss": 0.6812,
"step": 6010
},
{
"epoch": 3.063644608506102,
"grad_norm": 0.0968027114868164,
"learning_rate": 4.9914478067602104e-05,
"loss": 0.6818,
"step": 6020
},
{
"epoch": 3.06873617718659,
"grad_norm": 0.11660617589950562,
"learning_rate": 4.991433600459148e-05,
"loss": 0.6806,
"step": 6030
},
{
"epoch": 3.0738277458670784,
"grad_norm": 0.1213793009519577,
"learning_rate": 4.991419394158085e-05,
"loss": 0.6818,
"step": 6040
},
{
"epoch": 3.078919314547566,
"grad_norm": 0.1275392472743988,
"learning_rate": 4.991405187857022e-05,
"loss": 0.6802,
"step": 6050
},
{
"epoch": 3.0840108832280544,
"grad_norm": 0.1026177927851677,
"learning_rate": 4.9913909815559596e-05,
"loss": 0.6835,
"step": 6060
},
{
"epoch": 3.0891024519085426,
"grad_norm": 0.10983236879110336,
"learning_rate": 4.991376775254897e-05,
"loss": 0.6838,
"step": 6070
},
{
"epoch": 3.094194020589031,
"grad_norm": 0.11360979080200195,
"learning_rate": 4.991362568953834e-05,
"loss": 0.6806,
"step": 6080
},
{
"epoch": 3.099285589269519,
"grad_norm": 0.1488681137561798,
"learning_rate": 4.9913483626527716e-05,
"loss": 0.6797,
"step": 6090
},
{
"epoch": 3.1043771579500072,
"grad_norm": 0.13620369136333466,
"learning_rate": 4.991334156351709e-05,
"loss": 0.6871,
"step": 6100
},
{
"epoch": 3.1094687266304954,
"grad_norm": 0.12065689265727997,
"learning_rate": 4.9913199500506455e-05,
"loss": 0.6792,
"step": 6110
},
{
"epoch": 3.1145602953109837,
"grad_norm": 0.13917431235313416,
"learning_rate": 4.991305743749583e-05,
"loss": 0.676,
"step": 6120
},
{
"epoch": 3.1196518639914714,
"grad_norm": 0.1255902796983719,
"learning_rate": 4.99129153744852e-05,
"loss": 0.6842,
"step": 6130
},
{
"epoch": 3.1247434326719596,
"grad_norm": 0.11472214758396149,
"learning_rate": 4.9912773311474574e-05,
"loss": 0.6788,
"step": 6140
},
{
"epoch": 3.129835001352448,
"grad_norm": 0.12614910304546356,
"learning_rate": 4.991263124846395e-05,
"loss": 0.6802,
"step": 6150
},
{
"epoch": 3.134926570032936,
"grad_norm": 0.13529641926288605,
"learning_rate": 4.991248918545332e-05,
"loss": 0.6805,
"step": 6160
},
{
"epoch": 3.1400181387134243,
"grad_norm": 0.11604179441928864,
"learning_rate": 4.9912347122442694e-05,
"loss": 0.6813,
"step": 6170
},
{
"epoch": 3.1451097073939125,
"grad_norm": 0.12789122760295868,
"learning_rate": 4.991220505943207e-05,
"loss": 0.6845,
"step": 6180
},
{
"epoch": 3.1502012760744007,
"grad_norm": 0.12958049774169922,
"learning_rate": 4.991206299642143e-05,
"loss": 0.6804,
"step": 6190
},
{
"epoch": 3.1552928447548885,
"grad_norm": 0.10314188152551651,
"learning_rate": 4.9911920933410806e-05,
"loss": 0.6847,
"step": 6200
},
{
"epoch": 3.1603844134353767,
"grad_norm": 0.10737662017345428,
"learning_rate": 4.991177887040018e-05,
"loss": 0.6808,
"step": 6210
},
{
"epoch": 3.165475982115865,
"grad_norm": 0.15128542482852936,
"learning_rate": 4.991163680738955e-05,
"loss": 0.6789,
"step": 6220
},
{
"epoch": 3.170567550796353,
"grad_norm": 0.11941689997911453,
"learning_rate": 4.9911494744378925e-05,
"loss": 0.6778,
"step": 6230
},
{
"epoch": 3.1756591194768413,
"grad_norm": 0.15348762273788452,
"learning_rate": 4.99113526813683e-05,
"loss": 0.6799,
"step": 6240
},
{
"epoch": 3.1807506881573295,
"grad_norm": 0.11959049850702286,
"learning_rate": 4.9911210618357665e-05,
"loss": 0.6776,
"step": 6250
},
{
"epoch": 3.1858422568378177,
"grad_norm": 0.11588987708091736,
"learning_rate": 4.991106855534704e-05,
"loss": 0.688,
"step": 6260
},
{
"epoch": 3.190933825518306,
"grad_norm": 0.09905340522527695,
"learning_rate": 4.991092649233641e-05,
"loss": 0.6845,
"step": 6270
},
{
"epoch": 3.196025394198794,
"grad_norm": 0.11044521629810333,
"learning_rate": 4.9910784429325784e-05,
"loss": 0.6823,
"step": 6280
},
{
"epoch": 3.201116962879282,
"grad_norm": 0.10236191004514694,
"learning_rate": 4.991064236631516e-05,
"loss": 0.6824,
"step": 6290
},
{
"epoch": 3.20620853155977,
"grad_norm": 0.12017529457807541,
"learning_rate": 4.991050030330453e-05,
"loss": 0.682,
"step": 6300
},
{
"epoch": 3.2113001002402584,
"grad_norm": 0.14782628417015076,
"learning_rate": 4.9910358240293904e-05,
"loss": 0.6781,
"step": 6310
},
{
"epoch": 3.2163916689207466,
"grad_norm": 0.14653240144252777,
"learning_rate": 4.991021617728328e-05,
"loss": 0.6809,
"step": 6320
},
{
"epoch": 3.221483237601235,
"grad_norm": 0.12069736421108246,
"learning_rate": 4.991007411427265e-05,
"loss": 0.6811,
"step": 6330
},
{
"epoch": 3.226574806281723,
"grad_norm": 0.13772337138652802,
"learning_rate": 4.990993205126202e-05,
"loss": 0.6838,
"step": 6340
},
{
"epoch": 3.231666374962211,
"grad_norm": 0.10374171286821365,
"learning_rate": 4.9909789988251396e-05,
"loss": 0.6836,
"step": 6350
},
{
"epoch": 3.236757943642699,
"grad_norm": 0.11860493570566177,
"learning_rate": 4.990964792524076e-05,
"loss": 0.6778,
"step": 6360
},
{
"epoch": 3.241849512323187,
"grad_norm": 0.1429886519908905,
"learning_rate": 4.9909505862230135e-05,
"loss": 0.6775,
"step": 6370
},
{
"epoch": 3.2469410810036754,
"grad_norm": 0.1501941680908203,
"learning_rate": 4.990936379921951e-05,
"loss": 0.6804,
"step": 6380
},
{
"epoch": 3.2520326496841636,
"grad_norm": 0.12676025927066803,
"learning_rate": 4.990922173620888e-05,
"loss": 0.6802,
"step": 6390
},
{
"epoch": 3.257124218364652,
"grad_norm": 0.14346669614315033,
"learning_rate": 4.990907967319825e-05,
"loss": 0.6815,
"step": 6400
},
{
"epoch": 3.26221578704514,
"grad_norm": 0.11594365537166595,
"learning_rate": 4.990893761018762e-05,
"loss": 0.6783,
"step": 6410
},
{
"epoch": 3.2673073557256282,
"grad_norm": 0.12863503396511078,
"learning_rate": 4.9908795547176994e-05,
"loss": 0.68,
"step": 6420
},
{
"epoch": 3.272398924406116,
"grad_norm": 0.13634729385375977,
"learning_rate": 4.990865348416637e-05,
"loss": 0.6795,
"step": 6430
},
{
"epoch": 3.2774904930866042,
"grad_norm": 0.10696328431367874,
"learning_rate": 4.990851142115574e-05,
"loss": 0.6827,
"step": 6440
},
{
"epoch": 3.2825820617670924,
"grad_norm": 0.1048332154750824,
"learning_rate": 4.9908369358145113e-05,
"loss": 0.6812,
"step": 6450
},
{
"epoch": 3.2876736304475807,
"grad_norm": 0.09791410714387894,
"learning_rate": 4.9908227295134487e-05,
"loss": 0.6848,
"step": 6460
},
{
"epoch": 3.292765199128069,
"grad_norm": 0.13385730981826782,
"learning_rate": 4.990808523212386e-05,
"loss": 0.6826,
"step": 6470
},
{
"epoch": 3.297856767808557,
"grad_norm": 0.13646642863750458,
"learning_rate": 4.990794316911323e-05,
"loss": 0.6767,
"step": 6480
},
{
"epoch": 3.3029483364890453,
"grad_norm": 0.14173270761966705,
"learning_rate": 4.9907801106102606e-05,
"loss": 0.6838,
"step": 6490
},
{
"epoch": 3.3080399051695335,
"grad_norm": 0.14603695273399353,
"learning_rate": 4.990765904309197e-05,
"loss": 0.6766,
"step": 6500
},
{
"epoch": 3.3131314738500217,
"grad_norm": 0.138224795460701,
"learning_rate": 4.9907516980081345e-05,
"loss": 0.6797,
"step": 6510
},
{
"epoch": 3.3182230425305095,
"grad_norm": 0.11541623622179031,
"learning_rate": 4.990737491707072e-05,
"loss": 0.6814,
"step": 6520
},
{
"epoch": 3.3233146112109977,
"grad_norm": 0.1160949096083641,
"learning_rate": 4.990723285406009e-05,
"loss": 0.6842,
"step": 6530
},
{
"epoch": 3.328406179891486,
"grad_norm": 0.1572464108467102,
"learning_rate": 4.9907090791049465e-05,
"loss": 0.6783,
"step": 6540
},
{
"epoch": 3.333497748571974,
"grad_norm": 0.13026835024356842,
"learning_rate": 4.990694872803884e-05,
"loss": 0.681,
"step": 6550
},
{
"epoch": 3.3385893172524623,
"grad_norm": 0.11961708962917328,
"learning_rate": 4.990680666502821e-05,
"loss": 0.6807,
"step": 6560
},
{
"epoch": 3.3436808859329505,
"grad_norm": 0.11406982690095901,
"learning_rate": 4.9906664602017584e-05,
"loss": 0.6795,
"step": 6570
},
{
"epoch": 3.3487724546134388,
"grad_norm": 0.20744380354881287,
"learning_rate": 4.990652253900696e-05,
"loss": 0.6771,
"step": 6580
},
{
"epoch": 3.3538640232939265,
"grad_norm": 0.11253584921360016,
"learning_rate": 4.990638047599633e-05,
"loss": 0.6802,
"step": 6590
},
{
"epoch": 3.3589555919744147,
"grad_norm": 0.08123784512281418,
"learning_rate": 4.99062384129857e-05,
"loss": 0.6871,
"step": 6600
},
{
"epoch": 3.364047160654903,
"grad_norm": 0.10802698135375977,
"learning_rate": 4.990609634997507e-05,
"loss": 0.683,
"step": 6610
},
{
"epoch": 3.369138729335391,
"grad_norm": 0.11430787295103073,
"learning_rate": 4.990595428696444e-05,
"loss": 0.6821,
"step": 6620
},
{
"epoch": 3.3742302980158794,
"grad_norm": 0.09323684871196747,
"learning_rate": 4.9905812223953816e-05,
"loss": 0.6836,
"step": 6630
},
{
"epoch": 3.3793218666963676,
"grad_norm": 0.10404845327138901,
"learning_rate": 4.990567016094318e-05,
"loss": 0.6849,
"step": 6640
},
{
"epoch": 3.384413435376856,
"grad_norm": 0.1404566615819931,
"learning_rate": 4.9905528097932555e-05,
"loss": 0.6769,
"step": 6650
},
{
"epoch": 3.3895050040573436,
"grad_norm": 0.17702195048332214,
"learning_rate": 4.990538603492193e-05,
"loss": 0.6808,
"step": 6660
},
{
"epoch": 3.3945965727378318,
"grad_norm": 0.1227133646607399,
"learning_rate": 4.99052439719113e-05,
"loss": 0.6855,
"step": 6670
},
{
"epoch": 3.39968814141832,
"grad_norm": 0.0946226418018341,
"learning_rate": 4.9905101908900675e-05,
"loss": 0.6804,
"step": 6680
},
{
"epoch": 3.404779710098808,
"grad_norm": 0.11467920988798141,
"learning_rate": 4.990495984589005e-05,
"loss": 0.677,
"step": 6690
},
{
"epoch": 3.4098712787792964,
"grad_norm": 0.1885383576154709,
"learning_rate": 4.990481778287942e-05,
"loss": 0.6786,
"step": 6700
},
{
"epoch": 3.4149628474597846,
"grad_norm": 0.0994097888469696,
"learning_rate": 4.9904675719868794e-05,
"loss": 0.6908,
"step": 6710
},
{
"epoch": 3.420054416140273,
"grad_norm": 0.09989442676305771,
"learning_rate": 4.990453365685817e-05,
"loss": 0.6804,
"step": 6720
},
{
"epoch": 3.425145984820761,
"grad_norm": 0.12362310290336609,
"learning_rate": 4.990439159384754e-05,
"loss": 0.6817,
"step": 6730
},
{
"epoch": 3.4302375535012493,
"grad_norm": 0.13228794932365417,
"learning_rate": 4.990424953083691e-05,
"loss": 0.681,
"step": 6740
},
{
"epoch": 3.435329122181737,
"grad_norm": 0.11642909795045853,
"learning_rate": 4.9904107467826286e-05,
"loss": 0.6825,
"step": 6750
},
{
"epoch": 3.4404206908622252,
"grad_norm": 0.12548530101776123,
"learning_rate": 4.990396540481565e-05,
"loss": 0.6814,
"step": 6760
},
{
"epoch": 3.4455122595427135,
"grad_norm": 0.11513999849557877,
"learning_rate": 4.9903823341805026e-05,
"loss": 0.6792,
"step": 6770
},
{
"epoch": 3.4506038282232017,
"grad_norm": 0.12245498597621918,
"learning_rate": 4.99036812787944e-05,
"loss": 0.6771,
"step": 6780
},
{
"epoch": 3.45569539690369,
"grad_norm": 0.12722285091876984,
"learning_rate": 4.990353921578377e-05,
"loss": 0.679,
"step": 6790
},
{
"epoch": 3.460786965584178,
"grad_norm": 0.13212384283542633,
"learning_rate": 4.9903397152773145e-05,
"loss": 0.6818,
"step": 6800
},
{
"epoch": 3.4658785342646663,
"grad_norm": 0.11193917691707611,
"learning_rate": 4.990325508976252e-05,
"loss": 0.6822,
"step": 6810
},
{
"epoch": 3.470970102945154,
"grad_norm": 0.14051009714603424,
"learning_rate": 4.9903113026751884e-05,
"loss": 0.673,
"step": 6820
},
{
"epoch": 3.4760616716256423,
"grad_norm": 0.16787344217300415,
"learning_rate": 4.990297096374126e-05,
"loss": 0.6834,
"step": 6830
},
{
"epoch": 3.4811532403061305,
"grad_norm": 0.1313748061656952,
"learning_rate": 4.990282890073063e-05,
"loss": 0.6785,
"step": 6840
},
{
"epoch": 3.4862448089866187,
"grad_norm": 0.13282889127731323,
"learning_rate": 4.9902686837720004e-05,
"loss": 0.6791,
"step": 6850
},
{
"epoch": 3.491336377667107,
"grad_norm": 0.15743672847747803,
"learning_rate": 4.990254477470938e-05,
"loss": 0.685,
"step": 6860
},
{
"epoch": 3.496427946347595,
"grad_norm": 0.09886245429515839,
"learning_rate": 4.990240271169875e-05,
"loss": 0.6831,
"step": 6870
},
{
"epoch": 3.5015195150280833,
"grad_norm": 0.14891770482063293,
"learning_rate": 4.990226064868812e-05,
"loss": 0.681,
"step": 6880
},
{
"epoch": 3.506611083708571,
"grad_norm": 0.13956576585769653,
"learning_rate": 4.9902118585677496e-05,
"loss": 0.6806,
"step": 6890
},
{
"epoch": 3.5117026523890598,
"grad_norm": 0.1325678676366806,
"learning_rate": 4.990197652266686e-05,
"loss": 0.6809,
"step": 6900
},
{
"epoch": 3.5167942210695475,
"grad_norm": 0.13164210319519043,
"learning_rate": 4.9901834459656236e-05,
"loss": 0.6822,
"step": 6910
},
{
"epoch": 3.5218857897500357,
"grad_norm": 0.13481168448925018,
"learning_rate": 4.990169239664561e-05,
"loss": 0.673,
"step": 6920
},
{
"epoch": 3.526977358430524,
"grad_norm": 0.16314196586608887,
"learning_rate": 4.990155033363498e-05,
"loss": 0.6768,
"step": 6930
},
{
"epoch": 3.532068927111012,
"grad_norm": 0.1418369710445404,
"learning_rate": 4.9901408270624355e-05,
"loss": 0.6777,
"step": 6940
},
{
"epoch": 3.5371604957915004,
"grad_norm": 0.12762701511383057,
"learning_rate": 4.990126620761373e-05,
"loss": 0.6788,
"step": 6950
},
{
"epoch": 3.542252064471988,
"grad_norm": 0.10353351384401321,
"learning_rate": 4.99011241446031e-05,
"loss": 0.6858,
"step": 6960
},
{
"epoch": 3.547343633152477,
"grad_norm": 0.0953698605298996,
"learning_rate": 4.9900982081592474e-05,
"loss": 0.6783,
"step": 6970
},
{
"epoch": 3.5524352018329646,
"grad_norm": 0.10428538918495178,
"learning_rate": 4.990084001858185e-05,
"loss": 0.6844,
"step": 6980
},
{
"epoch": 3.557526770513453,
"grad_norm": 0.11740399152040482,
"learning_rate": 4.990069795557122e-05,
"loss": 0.6838,
"step": 6990
},
{
"epoch": 3.562618339193941,
"grad_norm": 0.12733303010463715,
"learning_rate": 4.9900555892560594e-05,
"loss": 0.6768,
"step": 7000
},
{
"epoch": 3.567709907874429,
"grad_norm": 0.16426721215248108,
"learning_rate": 4.990041382954997e-05,
"loss": 0.683,
"step": 7010
},
{
"epoch": 3.5728014765549174,
"grad_norm": 0.12947894632816315,
"learning_rate": 4.990027176653933e-05,
"loss": 0.6729,
"step": 7020
},
{
"epoch": 3.5778930452354056,
"grad_norm": 0.15960286557674408,
"learning_rate": 4.9900129703528706e-05,
"loss": 0.679,
"step": 7030
},
{
"epoch": 3.582984613915894,
"grad_norm": 0.12176317721605301,
"learning_rate": 4.989998764051807e-05,
"loss": 0.6832,
"step": 7040
},
{
"epoch": 3.5880761825963816,
"grad_norm": 0.12822549045085907,
"learning_rate": 4.9899845577507446e-05,
"loss": 0.6807,
"step": 7050
},
{
"epoch": 3.59316775127687,
"grad_norm": 0.09114730358123779,
"learning_rate": 4.989970351449682e-05,
"loss": 0.6837,
"step": 7060
},
{
"epoch": 3.598259319957358,
"grad_norm": 0.11248596012592316,
"learning_rate": 4.989956145148619e-05,
"loss": 0.6773,
"step": 7070
},
{
"epoch": 3.6033508886378462,
"grad_norm": 0.14381690323352814,
"learning_rate": 4.9899419388475565e-05,
"loss": 0.6763,
"step": 7080
},
{
"epoch": 3.6084424573183345,
"grad_norm": 0.1576450616121292,
"learning_rate": 4.989927732546494e-05,
"loss": 0.6796,
"step": 7090
},
{
"epoch": 3.6135340259988227,
"grad_norm": 0.12672173976898193,
"learning_rate": 4.989913526245431e-05,
"loss": 0.6773,
"step": 7100
},
{
"epoch": 3.618625594679311,
"grad_norm": 0.10089720040559769,
"learning_rate": 4.9898993199443684e-05,
"loss": 0.6835,
"step": 7110
},
{
"epoch": 3.6237171633597987,
"grad_norm": 0.10352669656276703,
"learning_rate": 4.989885113643306e-05,
"loss": 0.6804,
"step": 7120
},
{
"epoch": 3.6288087320402873,
"grad_norm": 0.12168221920728683,
"learning_rate": 4.989870907342243e-05,
"loss": 0.6775,
"step": 7130
},
{
"epoch": 3.633900300720775,
"grad_norm": 0.152724489569664,
"learning_rate": 4.9898567010411804e-05,
"loss": 0.6832,
"step": 7140
},
{
"epoch": 3.6389918694012633,
"grad_norm": 0.10124222189188004,
"learning_rate": 4.989842494740118e-05,
"loss": 0.6824,
"step": 7150
},
{
"epoch": 3.6440834380817515,
"grad_norm": 0.10840737819671631,
"learning_rate": 4.989828288439054e-05,
"loss": 0.6781,
"step": 7160
},
{
"epoch": 3.6491750067622397,
"grad_norm": 0.10668514668941498,
"learning_rate": 4.9898140821379916e-05,
"loss": 0.6857,
"step": 7170
},
{
"epoch": 3.654266575442728,
"grad_norm": 0.11429141461849213,
"learning_rate": 4.989799875836929e-05,
"loss": 0.6823,
"step": 7180
},
{
"epoch": 3.6593581441232157,
"grad_norm": 0.1012284979224205,
"learning_rate": 4.989785669535866e-05,
"loss": 0.678,
"step": 7190
},
{
"epoch": 3.6644497128037044,
"grad_norm": 0.15000002086162567,
"learning_rate": 4.9897714632348035e-05,
"loss": 0.6763,
"step": 7200
},
{
"epoch": 3.669541281484192,
"grad_norm": 0.15613609552383423,
"learning_rate": 4.989757256933741e-05,
"loss": 0.6837,
"step": 7210
},
{
"epoch": 3.6746328501646803,
"grad_norm": 0.13344906270503998,
"learning_rate": 4.989743050632678e-05,
"loss": 0.6841,
"step": 7220
},
{
"epoch": 3.6797244188451685,
"grad_norm": 0.12140567600727081,
"learning_rate": 4.9897288443316155e-05,
"loss": 0.6792,
"step": 7230
},
{
"epoch": 3.6848159875256568,
"grad_norm": 0.11317454278469086,
"learning_rate": 4.989714638030552e-05,
"loss": 0.6821,
"step": 7240
},
{
"epoch": 3.689907556206145,
"grad_norm": 0.1328129768371582,
"learning_rate": 4.9897004317294894e-05,
"loss": 0.6842,
"step": 7250
},
{
"epoch": 3.694999124886633,
"grad_norm": 0.1081654503941536,
"learning_rate": 4.989686225428427e-05,
"loss": 0.6796,
"step": 7260
},
{
"epoch": 3.7000906935671214,
"grad_norm": 0.09531684964895248,
"learning_rate": 4.989672019127364e-05,
"loss": 0.6833,
"step": 7270
},
{
"epoch": 3.705182262247609,
"grad_norm": 0.10997920483350754,
"learning_rate": 4.9896578128263013e-05,
"loss": 0.6795,
"step": 7280
},
{
"epoch": 3.7102738309280974,
"grad_norm": 0.15568581223487854,
"learning_rate": 4.9896436065252387e-05,
"loss": 0.6804,
"step": 7290
},
{
"epoch": 3.7153653996085856,
"grad_norm": 0.130909726023674,
"learning_rate": 4.989629400224175e-05,
"loss": 0.6814,
"step": 7300
},
{
"epoch": 3.720456968289074,
"grad_norm": 0.13917888700962067,
"learning_rate": 4.9896151939231126e-05,
"loss": 0.6793,
"step": 7310
},
{
"epoch": 3.725548536969562,
"grad_norm": 0.12968967854976654,
"learning_rate": 4.98960098762205e-05,
"loss": 0.6819,
"step": 7320
},
{
"epoch": 3.73064010565005,
"grad_norm": 0.12175523489713669,
"learning_rate": 4.989586781320987e-05,
"loss": 0.6785,
"step": 7330
},
{
"epoch": 3.7357316743305384,
"grad_norm": 0.12431439012289047,
"learning_rate": 4.9895725750199245e-05,
"loss": 0.6785,
"step": 7340
},
{
"epoch": 3.740823243011026,
"grad_norm": 0.1398157924413681,
"learning_rate": 4.989558368718862e-05,
"loss": 0.6779,
"step": 7350
},
{
"epoch": 3.745914811691515,
"grad_norm": 0.11357001215219498,
"learning_rate": 4.989544162417799e-05,
"loss": 0.685,
"step": 7360
},
{
"epoch": 3.7510063803720026,
"grad_norm": 0.16288457810878754,
"learning_rate": 4.9895299561167365e-05,
"loss": 0.6811,
"step": 7370
},
{
"epoch": 3.756097949052491,
"grad_norm": 0.11568481475114822,
"learning_rate": 4.989515749815674e-05,
"loss": 0.6796,
"step": 7380
},
{
"epoch": 3.761189517732979,
"grad_norm": 0.15195196866989136,
"learning_rate": 4.989501543514611e-05,
"loss": 0.6777,
"step": 7390
},
{
"epoch": 3.7662810864134673,
"grad_norm": 0.12881244719028473,
"learning_rate": 4.9894873372135484e-05,
"loss": 0.6775,
"step": 7400
},
{
"epoch": 3.7713726550939555,
"grad_norm": 0.1401291787624359,
"learning_rate": 4.989473130912485e-05,
"loss": 0.6834,
"step": 7410
},
{
"epoch": 3.7764642237744432,
"grad_norm": 0.12248072773218155,
"learning_rate": 4.9894589246114223e-05,
"loss": 0.6792,
"step": 7420
},
{
"epoch": 3.781555792454932,
"grad_norm": 0.11089824140071869,
"learning_rate": 4.9894447183103596e-05,
"loss": 0.6819,
"step": 7430
},
{
"epoch": 3.7866473611354197,
"grad_norm": 0.09657898545265198,
"learning_rate": 4.989430512009297e-05,
"loss": 0.6841,
"step": 7440
},
{
"epoch": 3.791738929815908,
"grad_norm": 0.12385948747396469,
"learning_rate": 4.9894163057082336e-05,
"loss": 0.6795,
"step": 7450
},
{
"epoch": 3.796830498496396,
"grad_norm": 0.10562111437320709,
"learning_rate": 4.989402099407171e-05,
"loss": 0.6783,
"step": 7460
},
{
"epoch": 3.8019220671768843,
"grad_norm": 0.11349403858184814,
"learning_rate": 4.989387893106108e-05,
"loss": 0.6807,
"step": 7470
},
{
"epoch": 3.8070136358573725,
"grad_norm": 0.11444567143917084,
"learning_rate": 4.9893736868050455e-05,
"loss": 0.6791,
"step": 7480
},
{
"epoch": 3.8121052045378607,
"grad_norm": 0.1610439121723175,
"learning_rate": 4.989359480503983e-05,
"loss": 0.6812,
"step": 7490
},
{
"epoch": 3.817196773218349,
"grad_norm": 0.1214766800403595,
"learning_rate": 4.98934527420292e-05,
"loss": 0.6817,
"step": 7500
},
{
"epoch": 3.8222883418988367,
"grad_norm": 0.12765400111675262,
"learning_rate": 4.9893310679018575e-05,
"loss": 0.6787,
"step": 7510
},
{
"epoch": 3.827379910579325,
"grad_norm": 0.10731592029333115,
"learning_rate": 4.989316861600795e-05,
"loss": 0.683,
"step": 7520
},
{
"epoch": 3.832471479259813,
"grad_norm": 0.12986642122268677,
"learning_rate": 4.989302655299732e-05,
"loss": 0.6766,
"step": 7530
},
{
"epoch": 3.8375630479403013,
"grad_norm": 0.12156540900468826,
"learning_rate": 4.9892884489986694e-05,
"loss": 0.6834,
"step": 7540
},
{
"epoch": 3.8426546166207896,
"grad_norm": 0.10650958865880966,
"learning_rate": 4.989274242697606e-05,
"loss": 0.6821,
"step": 7550
},
{
"epoch": 3.8477461853012778,
"grad_norm": 0.09265447407960892,
"learning_rate": 4.989260036396543e-05,
"loss": 0.6807,
"step": 7560
},
{
"epoch": 3.852837753981766,
"grad_norm": 0.13007622957229614,
"learning_rate": 4.9892458300954806e-05,
"loss": 0.681,
"step": 7570
},
{
"epoch": 3.8579293226622537,
"grad_norm": 0.1033967137336731,
"learning_rate": 4.989231623794418e-05,
"loss": 0.686,
"step": 7580
},
{
"epoch": 3.8630208913427424,
"grad_norm": 0.10867638140916824,
"learning_rate": 4.989217417493355e-05,
"loss": 0.6796,
"step": 7590
},
{
"epoch": 3.86811246002323,
"grad_norm": 0.105263352394104,
"learning_rate": 4.9892032111922926e-05,
"loss": 0.6824,
"step": 7600
},
{
"epoch": 3.8732040287037184,
"grad_norm": 0.12403067946434021,
"learning_rate": 4.98918900489123e-05,
"loss": 0.6793,
"step": 7610
},
{
"epoch": 3.8782955973842066,
"grad_norm": 0.09988098591566086,
"learning_rate": 4.989174798590167e-05,
"loss": 0.6842,
"step": 7620
},
{
"epoch": 3.883387166064695,
"grad_norm": 0.13452745974063873,
"learning_rate": 4.9891605922891045e-05,
"loss": 0.6811,
"step": 7630
},
{
"epoch": 3.888478734745183,
"grad_norm": 0.10854171961545944,
"learning_rate": 4.989146385988042e-05,
"loss": 0.6827,
"step": 7640
},
{
"epoch": 3.893570303425671,
"grad_norm": 0.10819829255342484,
"learning_rate": 4.9891321796869784e-05,
"loss": 0.6796,
"step": 7650
},
{
"epoch": 3.8986618721061594,
"grad_norm": 0.17421726882457733,
"learning_rate": 4.989117973385916e-05,
"loss": 0.6808,
"step": 7660
},
{
"epoch": 3.903753440786647,
"grad_norm": 0.13020376861095428,
"learning_rate": 4.989103767084853e-05,
"loss": 0.6796,
"step": 7670
},
{
"epoch": 3.9088450094671354,
"grad_norm": 0.10870732367038727,
"learning_rate": 4.9890895607837904e-05,
"loss": 0.6867,
"step": 7680
},
{
"epoch": 3.9139365781476236,
"grad_norm": 0.10249564051628113,
"learning_rate": 4.989075354482727e-05,
"loss": 0.6847,
"step": 7690
},
{
"epoch": 3.919028146828112,
"grad_norm": 0.09583424031734467,
"learning_rate": 4.989061148181664e-05,
"loss": 0.6837,
"step": 7700
},
{
"epoch": 3.9241197155086,
"grad_norm": 0.10090246796607971,
"learning_rate": 4.9890469418806016e-05,
"loss": 0.6814,
"step": 7710
},
{
"epoch": 3.9292112841890883,
"grad_norm": 0.1201721727848053,
"learning_rate": 4.989032735579539e-05,
"loss": 0.6843,
"step": 7720
},
{
"epoch": 3.9343028528695765,
"grad_norm": 0.11703382432460785,
"learning_rate": 4.989018529278476e-05,
"loss": 0.6784,
"step": 7730
},
{
"epoch": 3.9393944215500643,
"grad_norm": 0.1226707398891449,
"learning_rate": 4.9890043229774136e-05,
"loss": 0.6847,
"step": 7740
},
{
"epoch": 3.9444859902305525,
"grad_norm": 0.09304598718881607,
"learning_rate": 4.988990116676351e-05,
"loss": 0.6812,
"step": 7750
},
{
"epoch": 3.9495775589110407,
"grad_norm": 0.10586468130350113,
"learning_rate": 4.988975910375288e-05,
"loss": 0.679,
"step": 7760
},
{
"epoch": 3.954669127591529,
"grad_norm": 0.10969860106706619,
"learning_rate": 4.9889617040742255e-05,
"loss": 0.6826,
"step": 7770
},
{
"epoch": 3.959760696272017,
"grad_norm": 0.1249874085187912,
"learning_rate": 4.988947497773163e-05,
"loss": 0.6767,
"step": 7780
},
{
"epoch": 3.9648522649525053,
"grad_norm": 0.16480816900730133,
"learning_rate": 4.9889332914721e-05,
"loss": 0.6796,
"step": 7790
},
{
"epoch": 3.9699438336329935,
"grad_norm": 0.2025347650051117,
"learning_rate": 4.9889190851710374e-05,
"loss": 0.678,
"step": 7800
},
{
"epoch": 3.9750354023134813,
"grad_norm": 0.1153530701994896,
"learning_rate": 4.988904878869974e-05,
"loss": 0.6812,
"step": 7810
},
{
"epoch": 3.98012697099397,
"grad_norm": 0.12336631864309311,
"learning_rate": 4.9888906725689114e-05,
"loss": 0.6781,
"step": 7820
},
{
"epoch": 3.9852185396744577,
"grad_norm": 0.1417071670293808,
"learning_rate": 4.988876466267849e-05,
"loss": 0.6796,
"step": 7830
},
{
"epoch": 3.990310108354946,
"grad_norm": 0.12677961587905884,
"learning_rate": 4.988862259966786e-05,
"loss": 0.6819,
"step": 7840
},
{
"epoch": 3.995401677035434,
"grad_norm": 0.1134430319070816,
"learning_rate": 4.988848053665723e-05,
"loss": 0.6802,
"step": 7850
},
{
"epoch": 4.0,
"grad_norm": 0.021812934428453445,
"learning_rate": 4.98883384736466e-05,
"loss": 0.6148,
"step": 7860
},
{
"epoch": 4.005091568680488,
"grad_norm": 0.11325574666261673,
"learning_rate": 4.988819641063597e-05,
"loss": 0.6823,
"step": 7870
},
{
"epoch": 4.010183137360976,
"grad_norm": 0.12439537793397903,
"learning_rate": 4.9888054347625346e-05,
"loss": 0.6808,
"step": 7880
},
{
"epoch": 4.015274706041464,
"grad_norm": 0.11274933069944382,
"learning_rate": 4.988791228461472e-05,
"loss": 0.6828,
"step": 7890
},
{
"epoch": 4.020366274721953,
"grad_norm": 0.10643935203552246,
"learning_rate": 4.988777022160409e-05,
"loss": 0.6833,
"step": 7900
},
{
"epoch": 4.025457843402441,
"grad_norm": 0.0944155901670456,
"learning_rate": 4.9887628158593465e-05,
"loss": 0.6842,
"step": 7910
},
{
"epoch": 4.030549412082929,
"grad_norm": 0.12772725522518158,
"learning_rate": 4.988748609558284e-05,
"loss": 0.6764,
"step": 7920
},
{
"epoch": 4.035640980763417,
"grad_norm": 0.19370485842227936,
"learning_rate": 4.988734403257221e-05,
"loss": 0.6845,
"step": 7930
},
{
"epoch": 4.040732549443905,
"grad_norm": 0.13512100279331207,
"learning_rate": 4.9887201969561584e-05,
"loss": 0.6745,
"step": 7940
},
{
"epoch": 4.0458241181243935,
"grad_norm": 0.13933135569095612,
"learning_rate": 4.988705990655095e-05,
"loss": 0.6842,
"step": 7950
},
{
"epoch": 4.050915686804881,
"grad_norm": 0.13375182449817657,
"learning_rate": 4.9886917843540324e-05,
"loss": 0.6815,
"step": 7960
},
{
"epoch": 4.05600725548537,
"grad_norm": 0.11060313135385513,
"learning_rate": 4.98867757805297e-05,
"loss": 0.6798,
"step": 7970
},
{
"epoch": 4.061098824165858,
"grad_norm": 0.14003530144691467,
"learning_rate": 4.988663371751907e-05,
"loss": 0.6787,
"step": 7980
},
{
"epoch": 4.066190392846346,
"grad_norm": 0.10484720021486282,
"learning_rate": 4.988649165450844e-05,
"loss": 0.6818,
"step": 7990
},
{
"epoch": 4.071281961526834,
"grad_norm": 0.11415210366249084,
"learning_rate": 4.9886349591497816e-05,
"loss": 0.6804,
"step": 8000
},
{
"epoch": 4.076373530207323,
"grad_norm": 0.1279604583978653,
"learning_rate": 4.988620752848719e-05,
"loss": 0.6793,
"step": 8010
},
{
"epoch": 4.0814650988878105,
"grad_norm": 0.12138471007347107,
"learning_rate": 4.988606546547656e-05,
"loss": 0.6814,
"step": 8020
},
{
"epoch": 4.086556667568298,
"grad_norm": 0.13427557051181793,
"learning_rate": 4.9885923402465935e-05,
"loss": 0.6752,
"step": 8030
},
{
"epoch": 4.091648236248787,
"grad_norm": 0.14821045100688934,
"learning_rate": 4.988578133945531e-05,
"loss": 0.6775,
"step": 8040
},
{
"epoch": 4.096739804929275,
"grad_norm": 0.13484236598014832,
"learning_rate": 4.988563927644468e-05,
"loss": 0.6846,
"step": 8050
},
{
"epoch": 4.101831373609763,
"grad_norm": 0.07954470813274384,
"learning_rate": 4.9885497213434055e-05,
"loss": 0.684,
"step": 8060
},
{
"epoch": 4.106922942290251,
"grad_norm": 0.10616060346364975,
"learning_rate": 4.988535515042342e-05,
"loss": 0.6822,
"step": 8070
},
{
"epoch": 4.11201451097074,
"grad_norm": 0.10499216616153717,
"learning_rate": 4.9885213087412794e-05,
"loss": 0.6798,
"step": 8080
},
{
"epoch": 4.1171060796512275,
"grad_norm": 0.12274570018053055,
"learning_rate": 4.988507102440216e-05,
"loss": 0.6799,
"step": 8090
},
{
"epoch": 4.122197648331715,
"grad_norm": 0.11465749889612198,
"learning_rate": 4.9884928961391534e-05,
"loss": 0.6817,
"step": 8100
},
{
"epoch": 4.127289217012204,
"grad_norm": 0.09962257742881775,
"learning_rate": 4.988478689838091e-05,
"loss": 0.6844,
"step": 8110
},
{
"epoch": 4.132380785692692,
"grad_norm": 0.1151047945022583,
"learning_rate": 4.988464483537028e-05,
"loss": 0.6787,
"step": 8120
},
{
"epoch": 4.13747235437318,
"grad_norm": 0.1360507756471634,
"learning_rate": 4.988450277235965e-05,
"loss": 0.6791,
"step": 8130
},
{
"epoch": 4.142563923053668,
"grad_norm": 0.16751664876937866,
"learning_rate": 4.9884360709349026e-05,
"loss": 0.6738,
"step": 8140
},
{
"epoch": 4.147655491734157,
"grad_norm": 0.18576379120349884,
"learning_rate": 4.98842186463384e-05,
"loss": 0.678,
"step": 8150
},
{
"epoch": 4.152747060414645,
"grad_norm": 0.12279310077428818,
"learning_rate": 4.988407658332777e-05,
"loss": 0.6786,
"step": 8160
},
{
"epoch": 4.157838629095132,
"grad_norm": 0.14428728818893433,
"learning_rate": 4.9883934520317145e-05,
"loss": 0.6756,
"step": 8170
},
{
"epoch": 4.162930197775621,
"grad_norm": 0.1211373507976532,
"learning_rate": 4.988379245730652e-05,
"loss": 0.6775,
"step": 8180
},
{
"epoch": 4.168021766456109,
"grad_norm": 0.13393299281597137,
"learning_rate": 4.988365039429589e-05,
"loss": 0.6769,
"step": 8190
},
{
"epoch": 4.173113335136597,
"grad_norm": 0.12077504396438599,
"learning_rate": 4.9883508331285265e-05,
"loss": 0.6829,
"step": 8200
},
{
"epoch": 4.178204903817085,
"grad_norm": 0.10940321534872055,
"learning_rate": 4.988336626827463e-05,
"loss": 0.6809,
"step": 8210
},
{
"epoch": 4.183296472497574,
"grad_norm": 0.09884709119796753,
"learning_rate": 4.9883224205264004e-05,
"loss": 0.6813,
"step": 8220
},
{
"epoch": 4.188388041178062,
"grad_norm": 0.10086120665073395,
"learning_rate": 4.988308214225338e-05,
"loss": 0.6809,
"step": 8230
},
{
"epoch": 4.19347960985855,
"grad_norm": 0.11668648570775986,
"learning_rate": 4.988294007924275e-05,
"loss": 0.6798,
"step": 8240
},
{
"epoch": 4.198571178539038,
"grad_norm": 0.12528111040592194,
"learning_rate": 4.9882798016232123e-05,
"loss": 0.6765,
"step": 8250
},
{
"epoch": 4.203662747219526,
"grad_norm": 0.11714299023151398,
"learning_rate": 4.9882655953221497e-05,
"loss": 0.6744,
"step": 8260
},
{
"epoch": 4.2087543159000145,
"grad_norm": 0.11050295829772949,
"learning_rate": 4.988251389021087e-05,
"loss": 0.6874,
"step": 8270
},
{
"epoch": 4.213845884580502,
"grad_norm": 0.09499291330575943,
"learning_rate": 4.9882371827200236e-05,
"loss": 0.6817,
"step": 8280
},
{
"epoch": 4.218937453260991,
"grad_norm": 0.09335146099328995,
"learning_rate": 4.988222976418961e-05,
"loss": 0.6818,
"step": 8290
},
{
"epoch": 4.224029021941479,
"grad_norm": 0.1219559907913208,
"learning_rate": 4.988208770117898e-05,
"loss": 0.681,
"step": 8300
},
{
"epoch": 4.229120590621967,
"grad_norm": 0.14629492163658142,
"learning_rate": 4.9881945638168355e-05,
"loss": 0.6822,
"step": 8310
},
{
"epoch": 4.234212159302455,
"grad_norm": 0.13365550339221954,
"learning_rate": 4.988180357515773e-05,
"loss": 0.6805,
"step": 8320
},
{
"epoch": 4.239303727982943,
"grad_norm": 0.141509547829628,
"learning_rate": 4.98816615121471e-05,
"loss": 0.6783,
"step": 8330
},
{
"epoch": 4.2443952966634315,
"grad_norm": 0.13036063313484192,
"learning_rate": 4.9881519449136475e-05,
"loss": 0.6756,
"step": 8340
},
{
"epoch": 4.249486865343919,
"grad_norm": 0.11939451843500137,
"learning_rate": 4.988137738612584e-05,
"loss": 0.6826,
"step": 8350
},
{
"epoch": 4.254578434024408,
"grad_norm": 0.12008455395698547,
"learning_rate": 4.9881235323115214e-05,
"loss": 0.681,
"step": 8360
},
{
"epoch": 4.259670002704896,
"grad_norm": 0.11019112914800644,
"learning_rate": 4.988109326010459e-05,
"loss": 0.6856,
"step": 8370
},
{
"epoch": 4.264761571385384,
"grad_norm": 0.10078281164169312,
"learning_rate": 4.988095119709396e-05,
"loss": 0.6792,
"step": 8380
},
{
"epoch": 4.269853140065872,
"grad_norm": 0.1294504553079605,
"learning_rate": 4.988080913408333e-05,
"loss": 0.6825,
"step": 8390
},
{
"epoch": 4.274944708746361,
"grad_norm": 0.1074661836028099,
"learning_rate": 4.9880667071072706e-05,
"loss": 0.6799,
"step": 8400
},
{
"epoch": 4.280036277426849,
"grad_norm": 0.11285123229026794,
"learning_rate": 4.988052500806208e-05,
"loss": 0.6831,
"step": 8410
},
{
"epoch": 4.285127846107336,
"grad_norm": 0.12429996579885483,
"learning_rate": 4.988038294505145e-05,
"loss": 0.6793,
"step": 8420
},
{
"epoch": 4.290219414787825,
"grad_norm": 0.10803820192813873,
"learning_rate": 4.9880240882040826e-05,
"loss": 0.6787,
"step": 8430
},
{
"epoch": 4.295310983468313,
"grad_norm": 0.12693211436271667,
"learning_rate": 4.98800988190302e-05,
"loss": 0.6729,
"step": 8440
},
{
"epoch": 4.300402552148801,
"grad_norm": 0.11828629672527313,
"learning_rate": 4.987995675601957e-05,
"loss": 0.6829,
"step": 8450
},
{
"epoch": 4.305494120829289,
"grad_norm": 0.11893879622220993,
"learning_rate": 4.987981469300894e-05,
"loss": 0.681,
"step": 8460
},
{
"epoch": 4.310585689509777,
"grad_norm": 0.12228237092494965,
"learning_rate": 4.987967262999831e-05,
"loss": 0.6793,
"step": 8470
},
{
"epoch": 4.315677258190266,
"grad_norm": 0.11881165206432343,
"learning_rate": 4.9879530566987684e-05,
"loss": 0.681,
"step": 8480
},
{
"epoch": 4.320768826870753,
"grad_norm": 0.09753947705030441,
"learning_rate": 4.987938850397705e-05,
"loss": 0.6812,
"step": 8490
},
{
"epoch": 4.325860395551242,
"grad_norm": 0.10875561088323593,
"learning_rate": 4.9879246440966424e-05,
"loss": 0.6764,
"step": 8500
},
{
"epoch": 4.33095196423173,
"grad_norm": 0.1029878631234169,
"learning_rate": 4.98791043779558e-05,
"loss": 0.6793,
"step": 8510
},
{
"epoch": 4.3360435329122184,
"grad_norm": 0.11321298032999039,
"learning_rate": 4.987896231494517e-05,
"loss": 0.6805,
"step": 8520
},
{
"epoch": 4.341135101592706,
"grad_norm": 0.12302636355161667,
"learning_rate": 4.987882025193454e-05,
"loss": 0.6841,
"step": 8530
},
{
"epoch": 4.346226670273195,
"grad_norm": 0.0927717313170433,
"learning_rate": 4.9878678188923916e-05,
"loss": 0.6848,
"step": 8540
},
{
"epoch": 4.351318238953683,
"grad_norm": 0.1418168693780899,
"learning_rate": 4.987853612591329e-05,
"loss": 0.6764,
"step": 8550
},
{
"epoch": 4.35640980763417,
"grad_norm": 0.12036493420600891,
"learning_rate": 4.987839406290266e-05,
"loss": 0.6783,
"step": 8560
},
{
"epoch": 4.361501376314659,
"grad_norm": 0.14609991014003754,
"learning_rate": 4.9878251999892036e-05,
"loss": 0.676,
"step": 8570
},
{
"epoch": 4.366592944995147,
"grad_norm": 0.1448822170495987,
"learning_rate": 4.987810993688141e-05,
"loss": 0.6803,
"step": 8580
},
{
"epoch": 4.3716845136756355,
"grad_norm": 0.14650079607963562,
"learning_rate": 4.987796787387078e-05,
"loss": 0.6795,
"step": 8590
},
{
"epoch": 4.376776082356123,
"grad_norm": 0.10146970301866531,
"learning_rate": 4.987782581086015e-05,
"loss": 0.6826,
"step": 8600
},
{
"epoch": 4.381867651036612,
"grad_norm": 0.10098574310541153,
"learning_rate": 4.987768374784952e-05,
"loss": 0.6814,
"step": 8610
},
{
"epoch": 4.3869592197171,
"grad_norm": 0.12981392443180084,
"learning_rate": 4.9877541684838894e-05,
"loss": 0.6774,
"step": 8620
},
{
"epoch": 4.392050788397588,
"grad_norm": 0.1231103241443634,
"learning_rate": 4.987739962182827e-05,
"loss": 0.6751,
"step": 8630
},
{
"epoch": 4.397142357078076,
"grad_norm": 0.17549310624599457,
"learning_rate": 4.987725755881764e-05,
"loss": 0.6773,
"step": 8640
},
{
"epoch": 4.402233925758564,
"grad_norm": 0.1261102259159088,
"learning_rate": 4.9877115495807014e-05,
"loss": 0.6778,
"step": 8650
},
{
"epoch": 4.4073254944390525,
"grad_norm": 0.12228421121835709,
"learning_rate": 4.987697343279639e-05,
"loss": 0.6815,
"step": 8660
},
{
"epoch": 4.41241706311954,
"grad_norm": 0.08992882072925568,
"learning_rate": 4.987683136978576e-05,
"loss": 0.6829,
"step": 8670
},
{
"epoch": 4.417508631800029,
"grad_norm": 0.10478372871875763,
"learning_rate": 4.987668930677513e-05,
"loss": 0.6758,
"step": 8680
},
{
"epoch": 4.422600200480517,
"grad_norm": 0.1255083978176117,
"learning_rate": 4.9876547243764506e-05,
"loss": 0.6741,
"step": 8690
},
{
"epoch": 4.4276917691610045,
"grad_norm": 0.13139568269252777,
"learning_rate": 4.987640518075387e-05,
"loss": 0.6803,
"step": 8700
},
{
"epoch": 4.432783337841493,
"grad_norm": 0.1472860723733902,
"learning_rate": 4.9876263117743246e-05,
"loss": 0.6759,
"step": 8710
},
{
"epoch": 4.437874906521981,
"grad_norm": 0.16318807005882263,
"learning_rate": 4.987612105473262e-05,
"loss": 0.6868,
"step": 8720
},
{
"epoch": 4.44296647520247,
"grad_norm": 0.1145109310746193,
"learning_rate": 4.987597899172199e-05,
"loss": 0.6788,
"step": 8730
},
{
"epoch": 4.448058043882957,
"grad_norm": 0.09544923901557922,
"learning_rate": 4.987583692871136e-05,
"loss": 0.682,
"step": 8740
},
{
"epoch": 4.453149612563446,
"grad_norm": 0.10780615359544754,
"learning_rate": 4.987569486570073e-05,
"loss": 0.6781,
"step": 8750
},
{
"epoch": 4.458241181243934,
"grad_norm": 0.14260242879390717,
"learning_rate": 4.9875552802690104e-05,
"loss": 0.6782,
"step": 8760
},
{
"epoch": 4.463332749924422,
"grad_norm": 0.13693778216838837,
"learning_rate": 4.987541073967948e-05,
"loss": 0.6771,
"step": 8770
},
{
"epoch": 4.46842431860491,
"grad_norm": 0.10794325917959213,
"learning_rate": 4.987526867666885e-05,
"loss": 0.6789,
"step": 8780
},
{
"epoch": 4.473515887285398,
"grad_norm": 0.11324315518140793,
"learning_rate": 4.9875126613658224e-05,
"loss": 0.684,
"step": 8790
},
{
"epoch": 4.478607455965887,
"grad_norm": 0.10087355971336365,
"learning_rate": 4.98749845506476e-05,
"loss": 0.6819,
"step": 8800
},
{
"epoch": 4.483699024646374,
"grad_norm": 0.09752973914146423,
"learning_rate": 4.987484248763697e-05,
"loss": 0.6819,
"step": 8810
},
{
"epoch": 4.488790593326863,
"grad_norm": 0.12462896853685379,
"learning_rate": 4.987470042462634e-05,
"loss": 0.6844,
"step": 8820
},
{
"epoch": 4.493882162007351,
"grad_norm": 0.12875770032405853,
"learning_rate": 4.9874558361615716e-05,
"loss": 0.6784,
"step": 8830
},
{
"epoch": 4.4989737306878395,
"grad_norm": 0.11722705513238907,
"learning_rate": 4.987441629860509e-05,
"loss": 0.6797,
"step": 8840
},
{
"epoch": 4.504065299368327,
"grad_norm": 0.16931360960006714,
"learning_rate": 4.987427423559446e-05,
"loss": 0.6766,
"step": 8850
},
{
"epoch": 4.509156868048816,
"grad_norm": 0.13619418442249298,
"learning_rate": 4.987413217258383e-05,
"loss": 0.6774,
"step": 8860
},
{
"epoch": 4.514248436729304,
"grad_norm": 0.19465768337249756,
"learning_rate": 4.98739901095732e-05,
"loss": 0.6832,
"step": 8870
},
{
"epoch": 4.519340005409791,
"grad_norm": 0.11889132857322693,
"learning_rate": 4.9873848046562575e-05,
"loss": 0.6848,
"step": 8880
},
{
"epoch": 4.52443157409028,
"grad_norm": 0.10783824324607849,
"learning_rate": 4.987370598355195e-05,
"loss": 0.6793,
"step": 8890
},
{
"epoch": 4.529523142770768,
"grad_norm": 0.11385292559862137,
"learning_rate": 4.987356392054132e-05,
"loss": 0.6754,
"step": 8900
},
{
"epoch": 4.5346147114512565,
"grad_norm": 0.13017722964286804,
"learning_rate": 4.987342185753069e-05,
"loss": 0.6778,
"step": 8910
},
{
"epoch": 4.539706280131744,
"grad_norm": 0.13603904843330383,
"learning_rate": 4.987327979452006e-05,
"loss": 0.6758,
"step": 8920
},
{
"epoch": 4.544797848812232,
"grad_norm": 0.15172545611858368,
"learning_rate": 4.9873137731509434e-05,
"loss": 0.677,
"step": 8930
},
{
"epoch": 4.549889417492721,
"grad_norm": 0.13269858062267303,
"learning_rate": 4.987299566849881e-05,
"loss": 0.6823,
"step": 8940
},
{
"epoch": 4.5549809861732085,
"grad_norm": 0.14247867465019226,
"learning_rate": 4.987285360548818e-05,
"loss": 0.6803,
"step": 8950
},
{
"epoch": 4.560072554853697,
"grad_norm": 0.1458357870578766,
"learning_rate": 4.987271154247755e-05,
"loss": 0.6755,
"step": 8960
},
{
"epoch": 4.565164123534185,
"grad_norm": 0.1240466758608818,
"learning_rate": 4.9872569479466926e-05,
"loss": 0.6793,
"step": 8970
},
{
"epoch": 4.5702556922146735,
"grad_norm": 0.14014077186584473,
"learning_rate": 4.98724274164563e-05,
"loss": 0.6812,
"step": 8980
},
{
"epoch": 4.575347260895161,
"grad_norm": 0.1574947088956833,
"learning_rate": 4.987228535344567e-05,
"loss": 0.6752,
"step": 8990
},
{
"epoch": 4.58043882957565,
"grad_norm": 0.12997229397296906,
"learning_rate": 4.987214329043504e-05,
"loss": 0.6853,
"step": 9000
},
{
"epoch": 4.585530398256138,
"grad_norm": 0.11148348450660706,
"learning_rate": 4.987200122742441e-05,
"loss": 0.6782,
"step": 9010
},
{
"epoch": 4.5906219669366255,
"grad_norm": 0.13387084007263184,
"learning_rate": 4.9871859164413785e-05,
"loss": 0.6798,
"step": 9020
},
{
"epoch": 4.595713535617114,
"grad_norm": 0.16059359908103943,
"learning_rate": 4.987171710140316e-05,
"loss": 0.6815,
"step": 9030
},
{
"epoch": 4.600805104297602,
"grad_norm": 0.15377014875411987,
"learning_rate": 4.987157503839253e-05,
"loss": 0.6813,
"step": 9040
},
{
"epoch": 4.605896672978091,
"grad_norm": 0.13581454753875732,
"learning_rate": 4.9871432975381904e-05,
"loss": 0.6776,
"step": 9050
},
{
"epoch": 4.610988241658578,
"grad_norm": 0.11781629174947739,
"learning_rate": 4.987129091237128e-05,
"loss": 0.6778,
"step": 9060
},
{
"epoch": 4.616079810339067,
"grad_norm": 0.15693874657154083,
"learning_rate": 4.987114884936065e-05,
"loss": 0.6785,
"step": 9070
},
{
"epoch": 4.621171379019555,
"grad_norm": 0.1455591917037964,
"learning_rate": 4.9871006786350023e-05,
"loss": 0.683,
"step": 9080
},
{
"epoch": 4.626262947700043,
"grad_norm": 0.10115326195955276,
"learning_rate": 4.9870864723339397e-05,
"loss": 0.6816,
"step": 9090
},
{
"epoch": 4.631354516380531,
"grad_norm": 0.10945667326450348,
"learning_rate": 4.987072266032877e-05,
"loss": 0.6853,
"step": 9100
},
{
"epoch": 4.636446085061019,
"grad_norm": 0.11783566325902939,
"learning_rate": 4.987058059731814e-05,
"loss": 0.6825,
"step": 9110
},
{
"epoch": 4.641537653741508,
"grad_norm": 0.1183709055185318,
"learning_rate": 4.987043853430751e-05,
"loss": 0.6794,
"step": 9120
},
{
"epoch": 4.646629222421995,
"grad_norm": 0.17861825227737427,
"learning_rate": 4.987029647129688e-05,
"loss": 0.6812,
"step": 9130
},
{
"epoch": 4.651720791102484,
"grad_norm": 0.1105700135231018,
"learning_rate": 4.987015440828625e-05,
"loss": 0.6853,
"step": 9140
},
{
"epoch": 4.656812359782972,
"grad_norm": 0.13059043884277344,
"learning_rate": 4.987001234527562e-05,
"loss": 0.6825,
"step": 9150
},
{
"epoch": 4.66190392846346,
"grad_norm": 0.10306143015623093,
"learning_rate": 4.9869870282264995e-05,
"loss": 0.6825,
"step": 9160
},
{
"epoch": 4.666995497143948,
"grad_norm": 0.1366746723651886,
"learning_rate": 4.986972821925437e-05,
"loss": 0.6769,
"step": 9170
},
{
"epoch": 4.672087065824436,
"grad_norm": 0.15557105839252472,
"learning_rate": 4.986958615624374e-05,
"loss": 0.6811,
"step": 9180
},
{
"epoch": 4.677178634504925,
"grad_norm": 0.1473141759634018,
"learning_rate": 4.9869444093233114e-05,
"loss": 0.6843,
"step": 9190
},
{
"epoch": 4.682270203185412,
"grad_norm": 0.16388468444347382,
"learning_rate": 4.986930203022249e-05,
"loss": 0.6751,
"step": 9200
},
{
"epoch": 4.687361771865901,
"grad_norm": 0.15377168357372284,
"learning_rate": 4.986915996721186e-05,
"loss": 0.68,
"step": 9210
},
{
"epoch": 4.692453340546389,
"grad_norm": 0.14194439351558685,
"learning_rate": 4.986901790420123e-05,
"loss": 0.6762,
"step": 9220
},
{
"epoch": 4.6975449092268775,
"grad_norm": 0.1327824741601944,
"learning_rate": 4.9868875841190606e-05,
"loss": 0.6837,
"step": 9230
},
{
"epoch": 4.702636477907365,
"grad_norm": 0.13738127052783966,
"learning_rate": 4.986873377817998e-05,
"loss": 0.6785,
"step": 9240
},
{
"epoch": 4.707728046587853,
"grad_norm": 0.17268739640712738,
"learning_rate": 4.986859171516935e-05,
"loss": 0.6769,
"step": 9250
},
{
"epoch": 4.712819615268342,
"grad_norm": 0.14373987913131714,
"learning_rate": 4.986844965215872e-05,
"loss": 0.6806,
"step": 9260
},
{
"epoch": 4.7179111839488295,
"grad_norm": 0.11402563005685806,
"learning_rate": 4.986830758914809e-05,
"loss": 0.681,
"step": 9270
},
{
"epoch": 4.723002752629318,
"grad_norm": 0.12297854572534561,
"learning_rate": 4.9868165526137465e-05,
"loss": 0.6814,
"step": 9280
},
{
"epoch": 4.728094321309806,
"grad_norm": 0.10925690084695816,
"learning_rate": 4.986802346312684e-05,
"loss": 0.6835,
"step": 9290
},
{
"epoch": 4.7331858899902945,
"grad_norm": 0.1584441214799881,
"learning_rate": 4.986788140011621e-05,
"loss": 0.6795,
"step": 9300
},
{
"epoch": 4.738277458670782,
"grad_norm": 0.1546424776315689,
"learning_rate": 4.9867739337105585e-05,
"loss": 0.6804,
"step": 9310
},
{
"epoch": 4.743369027351271,
"grad_norm": 0.10821778327226639,
"learning_rate": 4.986759727409496e-05,
"loss": 0.6837,
"step": 9320
},
{
"epoch": 4.748460596031759,
"grad_norm": 0.13283872604370117,
"learning_rate": 4.9867455211084324e-05,
"loss": 0.6806,
"step": 9330
},
{
"epoch": 4.7535521647122465,
"grad_norm": 0.14704841375350952,
"learning_rate": 4.98673131480737e-05,
"loss": 0.68,
"step": 9340
},
{
"epoch": 4.758643733392735,
"grad_norm": 0.13948886096477509,
"learning_rate": 4.986717108506307e-05,
"loss": 0.6737,
"step": 9350
},
{
"epoch": 4.763735302073223,
"grad_norm": 0.1441805213689804,
"learning_rate": 4.986702902205244e-05,
"loss": 0.6791,
"step": 9360
},
{
"epoch": 4.768826870753712,
"grad_norm": 0.15041285753250122,
"learning_rate": 4.9866886959041816e-05,
"loss": 0.6772,
"step": 9370
},
{
"epoch": 4.773918439434199,
"grad_norm": 0.1656763106584549,
"learning_rate": 4.986674489603119e-05,
"loss": 0.6844,
"step": 9380
},
{
"epoch": 4.779010008114687,
"grad_norm": 0.1404283046722412,
"learning_rate": 4.986660283302056e-05,
"loss": 0.6837,
"step": 9390
},
{
"epoch": 4.784101576795176,
"grad_norm": 0.1178780272603035,
"learning_rate": 4.986646077000993e-05,
"loss": 0.6822,
"step": 9400
},
{
"epoch": 4.7891931454756635,
"grad_norm": 0.11357172578573227,
"learning_rate": 4.98663187069993e-05,
"loss": 0.6811,
"step": 9410
},
{
"epoch": 4.794284714156152,
"grad_norm": 0.12318674474954605,
"learning_rate": 4.9866176643988675e-05,
"loss": 0.6798,
"step": 9420
},
{
"epoch": 4.79937628283664,
"grad_norm": 0.09487531334161758,
"learning_rate": 4.986603458097805e-05,
"loss": 0.6825,
"step": 9430
},
{
"epoch": 4.804467851517129,
"grad_norm": 0.09417689591646194,
"learning_rate": 4.986589251796742e-05,
"loss": 0.6828,
"step": 9440
},
{
"epoch": 4.809559420197616,
"grad_norm": 0.10734029114246368,
"learning_rate": 4.9865750454956794e-05,
"loss": 0.6821,
"step": 9450
},
{
"epoch": 4.814650988878105,
"grad_norm": 0.10005868971347809,
"learning_rate": 4.986560839194617e-05,
"loss": 0.687,
"step": 9460
},
{
"epoch": 4.819742557558593,
"grad_norm": 0.11884880065917969,
"learning_rate": 4.986546632893554e-05,
"loss": 0.679,
"step": 9470
},
{
"epoch": 4.824834126239081,
"grad_norm": 0.10700765252113342,
"learning_rate": 4.9865324265924914e-05,
"loss": 0.679,
"step": 9480
},
{
"epoch": 4.829925694919569,
"grad_norm": 0.1253756880760193,
"learning_rate": 4.986518220291429e-05,
"loss": 0.6824,
"step": 9490
},
{
"epoch": 4.835017263600057,
"grad_norm": 0.13005779683589935,
"learning_rate": 4.986504013990366e-05,
"loss": 0.6773,
"step": 9500
},
{
"epoch": 4.840108832280546,
"grad_norm": 0.1245838925242424,
"learning_rate": 4.9864898076893026e-05,
"loss": 0.6778,
"step": 9510
},
{
"epoch": 4.845200400961033,
"grad_norm": 0.13099046051502228,
"learning_rate": 4.98647560138824e-05,
"loss": 0.6819,
"step": 9520
},
{
"epoch": 4.850291969641522,
"grad_norm": 0.10995706915855408,
"learning_rate": 4.986461395087177e-05,
"loss": 0.6806,
"step": 9530
},
{
"epoch": 4.85538353832201,
"grad_norm": 0.10981863737106323,
"learning_rate": 4.986447188786114e-05,
"loss": 0.6768,
"step": 9540
},
{
"epoch": 4.8604751070024985,
"grad_norm": 0.10785161703824997,
"learning_rate": 4.986432982485051e-05,
"loss": 0.6815,
"step": 9550
},
{
"epoch": 4.865566675682986,
"grad_norm": 0.11493176966905594,
"learning_rate": 4.9864187761839885e-05,
"loss": 0.6803,
"step": 9560
},
{
"epoch": 4.870658244363474,
"grad_norm": 0.13624422252178192,
"learning_rate": 4.986404569882926e-05,
"loss": 0.679,
"step": 9570
},
{
"epoch": 4.875749813043963,
"grad_norm": 0.12251431494951248,
"learning_rate": 4.986390363581863e-05,
"loss": 0.68,
"step": 9580
},
{
"epoch": 4.8808413817244505,
"grad_norm": 0.15482662618160248,
"learning_rate": 4.9863761572808004e-05,
"loss": 0.6827,
"step": 9590
},
{
"epoch": 4.885932950404939,
"grad_norm": 0.08389197289943695,
"learning_rate": 4.986361950979738e-05,
"loss": 0.6831,
"step": 9600
},
{
"epoch": 4.891024519085427,
"grad_norm": 0.1233370378613472,
"learning_rate": 4.986347744678675e-05,
"loss": 0.6811,
"step": 9610
},
{
"epoch": 4.896116087765915,
"grad_norm": 0.11783581227064133,
"learning_rate": 4.9863335383776124e-05,
"loss": 0.6854,
"step": 9620
},
{
"epoch": 4.901207656446403,
"grad_norm": 0.10777773708105087,
"learning_rate": 4.98631933207655e-05,
"loss": 0.6787,
"step": 9630
},
{
"epoch": 4.906299225126891,
"grad_norm": 0.14652119576931,
"learning_rate": 4.986305125775487e-05,
"loss": 0.6797,
"step": 9640
},
{
"epoch": 4.91139079380738,
"grad_norm": 0.11962393671274185,
"learning_rate": 4.9862909194744236e-05,
"loss": 0.6832,
"step": 9650
},
{
"epoch": 4.9164823624878675,
"grad_norm": 0.11764557659626007,
"learning_rate": 4.986276713173361e-05,
"loss": 0.677,
"step": 9660
},
{
"epoch": 4.921573931168356,
"grad_norm": 0.13469521701335907,
"learning_rate": 4.986262506872298e-05,
"loss": 0.6759,
"step": 9670
},
{
"epoch": 4.926665499848844,
"grad_norm": 0.11636529117822647,
"learning_rate": 4.9862483005712356e-05,
"loss": 0.6789,
"step": 9680
},
{
"epoch": 4.931757068529333,
"grad_norm": 0.15902294218540192,
"learning_rate": 4.986234094270173e-05,
"loss": 0.6758,
"step": 9690
},
{
"epoch": 4.93684863720982,
"grad_norm": 0.13991579413414001,
"learning_rate": 4.98621988796911e-05,
"loss": 0.6839,
"step": 9700
},
{
"epoch": 4.941940205890308,
"grad_norm": 0.12394755333662033,
"learning_rate": 4.9862056816680475e-05,
"loss": 0.6823,
"step": 9710
},
{
"epoch": 4.947031774570797,
"grad_norm": 0.11160258948802948,
"learning_rate": 4.986191475366985e-05,
"loss": 0.6772,
"step": 9720
},
{
"epoch": 4.952123343251285,
"grad_norm": 0.11390865594148636,
"learning_rate": 4.986177269065922e-05,
"loss": 0.6777,
"step": 9730
},
{
"epoch": 4.957214911931773,
"grad_norm": 0.14337550103664398,
"learning_rate": 4.9861630627648594e-05,
"loss": 0.676,
"step": 9740
},
{
"epoch": 4.962306480612261,
"grad_norm": 0.1478574424982071,
"learning_rate": 4.986148856463796e-05,
"loss": 0.6804,
"step": 9750
},
{
"epoch": 4.96739804929275,
"grad_norm": 0.09173934161663055,
"learning_rate": 4.9861346501627334e-05,
"loss": 0.6834,
"step": 9760
},
{
"epoch": 4.972489617973237,
"grad_norm": 0.10893456637859344,
"learning_rate": 4.986120443861671e-05,
"loss": 0.6796,
"step": 9770
},
{
"epoch": 4.977581186653726,
"grad_norm": 0.10967724025249481,
"learning_rate": 4.986106237560608e-05,
"loss": 0.6804,
"step": 9780
},
{
"epoch": 4.982672755334214,
"grad_norm": 0.11746654659509659,
"learning_rate": 4.9860920312595446e-05,
"loss": 0.6807,
"step": 9790
},
{
"epoch": 4.987764324014702,
"grad_norm": 0.10084499418735504,
"learning_rate": 4.986077824958482e-05,
"loss": 0.6779,
"step": 9800
},
{
"epoch": 4.99285589269519,
"grad_norm": 0.16148197650909424,
"learning_rate": 4.986063618657419e-05,
"loss": 0.6766,
"step": 9810
},
{
"epoch": 4.997947461375678,
"grad_norm": 0.12952958047389984,
"learning_rate": 4.9860494123563565e-05,
"loss": 0.676,
"step": 9820
},
{
"epoch": 5.002545784340244,
"grad_norm": 0.16547605395317078,
"learning_rate": 4.986035206055294e-05,
"loss": 0.6137,
"step": 9830
},
{
"epoch": 5.0076373530207325,
"grad_norm": 0.1671449840068817,
"learning_rate": 4.986020999754231e-05,
"loss": 0.6774,
"step": 9840
},
{
"epoch": 5.01272892170122,
"grad_norm": 0.13992175459861755,
"learning_rate": 4.9860067934531685e-05,
"loss": 0.6819,
"step": 9850
},
{
"epoch": 5.017820490381709,
"grad_norm": 0.08816186338663101,
"learning_rate": 4.985992587152106e-05,
"loss": 0.6819,
"step": 9860
},
{
"epoch": 5.022912059062197,
"grad_norm": 0.08476711064577103,
"learning_rate": 4.985978380851043e-05,
"loss": 0.6817,
"step": 9870
},
{
"epoch": 5.0280036277426845,
"grad_norm": 0.09989239275455475,
"learning_rate": 4.9859641745499804e-05,
"loss": 0.683,
"step": 9880
},
{
"epoch": 5.033095196423173,
"grad_norm": 0.09048530459403992,
"learning_rate": 4.985949968248918e-05,
"loss": 0.681,
"step": 9890
},
{
"epoch": 5.038186765103661,
"grad_norm": 0.11307314783334732,
"learning_rate": 4.985935761947855e-05,
"loss": 0.6785,
"step": 9900
},
{
"epoch": 5.04327833378415,
"grad_norm": 0.12317655235528946,
"learning_rate": 4.985921555646792e-05,
"loss": 0.681,
"step": 9910
},
{
"epoch": 5.048369902464637,
"grad_norm": 0.11963162571191788,
"learning_rate": 4.985907349345729e-05,
"loss": 0.6774,
"step": 9920
},
{
"epoch": 5.053461471145126,
"grad_norm": 0.11438319087028503,
"learning_rate": 4.985893143044666e-05,
"loss": 0.68,
"step": 9930
},
{
"epoch": 5.058553039825614,
"grad_norm": 0.13765713572502136,
"learning_rate": 4.9858789367436036e-05,
"loss": 0.6766,
"step": 9940
},
{
"epoch": 5.0636446085061015,
"grad_norm": 0.12760768830776215,
"learning_rate": 4.985864730442541e-05,
"loss": 0.6763,
"step": 9950
},
{
"epoch": 5.06873617718659,
"grad_norm": 0.14188893139362335,
"learning_rate": 4.9858505241414775e-05,
"loss": 0.6815,
"step": 9960
},
{
"epoch": 5.073827745867078,
"grad_norm": 0.177343487739563,
"learning_rate": 4.985836317840415e-05,
"loss": 0.6765,
"step": 9970
},
{
"epoch": 5.078919314547567,
"grad_norm": 0.15826770663261414,
"learning_rate": 4.985822111539352e-05,
"loss": 0.6819,
"step": 9980
},
{
"epoch": 5.084010883228054,
"grad_norm": 0.1431620568037033,
"learning_rate": 4.9858079052382895e-05,
"loss": 0.6809,
"step": 9990
},
{
"epoch": 5.089102451908543,
"grad_norm": 0.13952907919883728,
"learning_rate": 4.985793698937227e-05,
"loss": 0.6803,
"step": 10000
},
{
"epoch": 5.094194020589031,
"grad_norm": 0.11862120032310486,
"learning_rate": 4.985779492636164e-05,
"loss": 0.6774,
"step": 10010
},
{
"epoch": 5.099285589269519,
"grad_norm": 0.15467384457588196,
"learning_rate": 4.9857652863351014e-05,
"loss": 0.6777,
"step": 10020
},
{
"epoch": 5.104377157950007,
"grad_norm": 0.12163079530000687,
"learning_rate": 4.985751080034039e-05,
"loss": 0.6801,
"step": 10030
},
{
"epoch": 5.109468726630495,
"grad_norm": 0.1349727064371109,
"learning_rate": 4.985736873732976e-05,
"loss": 0.679,
"step": 10040
},
{
"epoch": 5.114560295310984,
"grad_norm": 0.12950022518634796,
"learning_rate": 4.9857226674319127e-05,
"loss": 0.6799,
"step": 10050
},
{
"epoch": 5.119651863991471,
"grad_norm": 0.12536922097206116,
"learning_rate": 4.98570846113085e-05,
"loss": 0.6805,
"step": 10060
},
{
"epoch": 5.12474343267196,
"grad_norm": 0.08876863867044449,
"learning_rate": 4.985694254829787e-05,
"loss": 0.6838,
"step": 10070
},
{
"epoch": 5.129835001352448,
"grad_norm": 0.13812567293643951,
"learning_rate": 4.9856800485287246e-05,
"loss": 0.6795,
"step": 10080
},
{
"epoch": 5.1349265700329365,
"grad_norm": 0.11330072581768036,
"learning_rate": 4.985665842227662e-05,
"loss": 0.6775,
"step": 10090
},
{
"epoch": 5.140018138713424,
"grad_norm": 0.12768009305000305,
"learning_rate": 4.985651635926599e-05,
"loss": 0.6758,
"step": 10100
},
{
"epoch": 5.145109707393912,
"grad_norm": 0.15295925736427307,
"learning_rate": 4.9856374296255365e-05,
"loss": 0.6885,
"step": 10110
},
{
"epoch": 5.150201276074401,
"grad_norm": 0.08242222666740417,
"learning_rate": 4.985623223324474e-05,
"loss": 0.6826,
"step": 10120
},
{
"epoch": 5.1552928447548885,
"grad_norm": 0.0866493284702301,
"learning_rate": 4.985609017023411e-05,
"loss": 0.6823,
"step": 10130
},
{
"epoch": 5.160384413435377,
"grad_norm": 0.1157221645116806,
"learning_rate": 4.9855948107223485e-05,
"loss": 0.6764,
"step": 10140
},
{
"epoch": 5.165475982115865,
"grad_norm": 0.1414877027273178,
"learning_rate": 4.985580604421286e-05,
"loss": 0.6749,
"step": 10150
},
{
"epoch": 5.1705675507963536,
"grad_norm": 0.13449379801750183,
"learning_rate": 4.985566398120223e-05,
"loss": 0.6806,
"step": 10160
},
{
"epoch": 5.175659119476841,
"grad_norm": 0.13108868896961212,
"learning_rate": 4.98555219181916e-05,
"loss": 0.6806,
"step": 10170
},
{
"epoch": 5.180750688157329,
"grad_norm": 0.12748171389102936,
"learning_rate": 4.985537985518097e-05,
"loss": 0.6763,
"step": 10180
},
{
"epoch": 5.185842256837818,
"grad_norm": 0.10387007147073746,
"learning_rate": 4.9855237792170336e-05,
"loss": 0.6872,
"step": 10190
},
{
"epoch": 5.1909338255183055,
"grad_norm": 0.09480390697717667,
"learning_rate": 4.985509572915971e-05,
"loss": 0.6822,
"step": 10200
},
{
"epoch": 5.196025394198794,
"grad_norm": 0.11437319219112396,
"learning_rate": 4.985495366614908e-05,
"loss": 0.6792,
"step": 10210
},
{
"epoch": 5.201116962879282,
"grad_norm": 0.12557561695575714,
"learning_rate": 4.9854811603138456e-05,
"loss": 0.682,
"step": 10220
},
{
"epoch": 5.206208531559771,
"grad_norm": 0.1291828453540802,
"learning_rate": 4.985466954012783e-05,
"loss": 0.6848,
"step": 10230
},
{
"epoch": 5.211300100240258,
"grad_norm": 0.12377645820379257,
"learning_rate": 4.98545274771172e-05,
"loss": 0.6789,
"step": 10240
},
{
"epoch": 5.216391668920746,
"grad_norm": 0.12247670441865921,
"learning_rate": 4.9854385414106575e-05,
"loss": 0.681,
"step": 10250
},
{
"epoch": 5.221483237601235,
"grad_norm": 0.10693535208702087,
"learning_rate": 4.985424335109595e-05,
"loss": 0.687,
"step": 10260
},
{
"epoch": 5.2265748062817226,
"grad_norm": 0.11651374399662018,
"learning_rate": 4.985410128808532e-05,
"loss": 0.6775,
"step": 10270
},
{
"epoch": 5.231666374962211,
"grad_norm": 0.1369701623916626,
"learning_rate": 4.9853959225074694e-05,
"loss": 0.6767,
"step": 10280
},
{
"epoch": 5.236757943642699,
"grad_norm": 0.13671474158763885,
"learning_rate": 4.985381716206407e-05,
"loss": 0.6821,
"step": 10290
},
{
"epoch": 5.241849512323188,
"grad_norm": 0.11949580907821655,
"learning_rate": 4.985367509905344e-05,
"loss": 0.6807,
"step": 10300
},
{
"epoch": 5.246941081003675,
"grad_norm": 0.11703040450811386,
"learning_rate": 4.985353303604281e-05,
"loss": 0.678,
"step": 10310
},
{
"epoch": 5.252032649684164,
"grad_norm": 0.11209936439990997,
"learning_rate": 4.985339097303218e-05,
"loss": 0.6773,
"step": 10320
},
{
"epoch": 5.257124218364652,
"grad_norm": 0.13346509635448456,
"learning_rate": 4.985324891002155e-05,
"loss": 0.6857,
"step": 10330
},
{
"epoch": 5.26221578704514,
"grad_norm": 0.12218772619962692,
"learning_rate": 4.9853106847010926e-05,
"loss": 0.681,
"step": 10340
},
{
"epoch": 5.267307355725628,
"grad_norm": 0.1169796735048294,
"learning_rate": 4.98529647840003e-05,
"loss": 0.6767,
"step": 10350
},
{
"epoch": 5.272398924406116,
"grad_norm": 0.14005398750305176,
"learning_rate": 4.985282272098967e-05,
"loss": 0.674,
"step": 10360
},
{
"epoch": 5.277490493086605,
"grad_norm": 0.1299133449792862,
"learning_rate": 4.9852680657979046e-05,
"loss": 0.6779,
"step": 10370
},
{
"epoch": 5.282582061767092,
"grad_norm": 0.13446015119552612,
"learning_rate": 4.985253859496841e-05,
"loss": 0.6781,
"step": 10380
},
{
"epoch": 5.287673630447581,
"grad_norm": 0.14030112326145172,
"learning_rate": 4.9852396531957785e-05,
"loss": 0.6782,
"step": 10390
},
{
"epoch": 5.292765199128069,
"grad_norm": 0.12442600727081299,
"learning_rate": 4.985225446894716e-05,
"loss": 0.6841,
"step": 10400
},
{
"epoch": 5.297856767808557,
"grad_norm": 0.11391379684209824,
"learning_rate": 4.985211240593653e-05,
"loss": 0.6834,
"step": 10410
},
{
"epoch": 5.302948336489045,
"grad_norm": 0.11152996867895126,
"learning_rate": 4.9851970342925904e-05,
"loss": 0.6816,
"step": 10420
},
{
"epoch": 5.308039905169533,
"grad_norm": 0.13936050236225128,
"learning_rate": 4.985182827991528e-05,
"loss": 0.6831,
"step": 10430
},
{
"epoch": 5.313131473850022,
"grad_norm": 0.11654047667980194,
"learning_rate": 4.985168621690465e-05,
"loss": 0.6803,
"step": 10440
},
{
"epoch": 5.3182230425305095,
"grad_norm": 0.11251688003540039,
"learning_rate": 4.985154415389402e-05,
"loss": 0.6815,
"step": 10450
},
{
"epoch": 5.323314611210998,
"grad_norm": 0.09920088946819305,
"learning_rate": 4.985140209088339e-05,
"loss": 0.6789,
"step": 10460
},
{
"epoch": 5.328406179891486,
"grad_norm": 0.18474489450454712,
"learning_rate": 4.985126002787276e-05,
"loss": 0.6777,
"step": 10470
},
{
"epoch": 5.333497748571974,
"grad_norm": 0.12075336277484894,
"learning_rate": 4.9851117964862136e-05,
"loss": 0.6828,
"step": 10480
},
{
"epoch": 5.338589317252462,
"grad_norm": 0.1428055316209793,
"learning_rate": 4.985097590185151e-05,
"loss": 0.6765,
"step": 10490
},
{
"epoch": 5.34368088593295,
"grad_norm": 0.1289169192314148,
"learning_rate": 4.985083383884088e-05,
"loss": 0.6825,
"step": 10500
},
{
"epoch": 5.348772454613439,
"grad_norm": 0.10693208128213882,
"learning_rate": 4.9850691775830256e-05,
"loss": 0.6814,
"step": 10510
},
{
"epoch": 5.3538640232939265,
"grad_norm": 0.11116955429315567,
"learning_rate": 4.985054971281963e-05,
"loss": 0.6805,
"step": 10520
},
{
"epoch": 5.358955591974415,
"grad_norm": 0.11630560457706451,
"learning_rate": 4.9850407649809e-05,
"loss": 0.6779,
"step": 10530
},
{
"epoch": 5.364047160654903,
"grad_norm": 0.13117016851902008,
"learning_rate": 4.9850265586798375e-05,
"loss": 0.6749,
"step": 10540
},
{
"epoch": 5.369138729335392,
"grad_norm": 0.14777855575084686,
"learning_rate": 4.985012352378775e-05,
"loss": 0.6788,
"step": 10550
},
{
"epoch": 5.374230298015879,
"grad_norm": 0.1084110215306282,
"learning_rate": 4.9849981460777114e-05,
"loss": 0.6843,
"step": 10560
},
{
"epoch": 5.379321866696367,
"grad_norm": 0.10926970094442368,
"learning_rate": 4.984983939776649e-05,
"loss": 0.6807,
"step": 10570
},
{
"epoch": 5.384413435376856,
"grad_norm": 0.10273724794387817,
"learning_rate": 4.984969733475586e-05,
"loss": 0.6819,
"step": 10580
},
{
"epoch": 5.389505004057344,
"grad_norm": 0.12061687558889389,
"learning_rate": 4.984955527174523e-05,
"loss": 0.6791,
"step": 10590
},
{
"epoch": 5.394596572737832,
"grad_norm": 0.11515804380178452,
"learning_rate": 4.98494132087346e-05,
"loss": 0.6798,
"step": 10600
},
{
"epoch": 5.39968814141832,
"grad_norm": 0.11288391053676605,
"learning_rate": 4.984927114572397e-05,
"loss": 0.681,
"step": 10610
},
{
"epoch": 5.404779710098809,
"grad_norm": 0.12682178616523743,
"learning_rate": 4.9849129082713346e-05,
"loss": 0.6778,
"step": 10620
},
{
"epoch": 5.409871278779296,
"grad_norm": 0.12649093568325043,
"learning_rate": 4.984898701970272e-05,
"loss": 0.6767,
"step": 10630
},
{
"epoch": 5.414962847459784,
"grad_norm": 0.1650230884552002,
"learning_rate": 4.984884495669209e-05,
"loss": 0.6772,
"step": 10640
},
{
"epoch": 5.420054416140273,
"grad_norm": 0.11968445032835007,
"learning_rate": 4.9848702893681465e-05,
"loss": 0.6791,
"step": 10650
},
{
"epoch": 5.425145984820761,
"grad_norm": 0.10566221922636032,
"learning_rate": 4.984856083067084e-05,
"loss": 0.6769,
"step": 10660
},
{
"epoch": 5.430237553501249,
"grad_norm": 0.09944125264883041,
"learning_rate": 4.984841876766021e-05,
"loss": 0.6789,
"step": 10670
},
{
"epoch": 5.435329122181737,
"grad_norm": 0.12134432047605515,
"learning_rate": 4.9848276704649585e-05,
"loss": 0.6741,
"step": 10680
},
{
"epoch": 5.440420690862226,
"grad_norm": 0.1576509177684784,
"learning_rate": 4.984813464163896e-05,
"loss": 0.6818,
"step": 10690
},
{
"epoch": 5.4455122595427135,
"grad_norm": 0.13000087440013885,
"learning_rate": 4.9847992578628324e-05,
"loss": 0.6719,
"step": 10700
},
{
"epoch": 5.450603828223201,
"grad_norm": 0.12142984569072723,
"learning_rate": 4.98478505156177e-05,
"loss": 0.6825,
"step": 10710
},
{
"epoch": 5.45569539690369,
"grad_norm": 0.1100669875741005,
"learning_rate": 4.984770845260707e-05,
"loss": 0.6759,
"step": 10720
},
{
"epoch": 5.460786965584178,
"grad_norm": 0.1101478561758995,
"learning_rate": 4.9847566389596444e-05,
"loss": 0.685,
"step": 10730
},
{
"epoch": 5.465878534264666,
"grad_norm": 0.1224004253745079,
"learning_rate": 4.984742432658582e-05,
"loss": 0.6763,
"step": 10740
},
{
"epoch": 5.470970102945154,
"grad_norm": 0.14111606776714325,
"learning_rate": 4.984728226357519e-05,
"loss": 0.6777,
"step": 10750
},
{
"epoch": 5.476061671625643,
"grad_norm": 0.10880038887262344,
"learning_rate": 4.984714020056456e-05,
"loss": 0.6834,
"step": 10760
},
{
"epoch": 5.4811532403061305,
"grad_norm": 0.1258549839258194,
"learning_rate": 4.9846998137553936e-05,
"loss": 0.6828,
"step": 10770
},
{
"epoch": 5.486244808986619,
"grad_norm": 0.10077346116304398,
"learning_rate": 4.984685607454331e-05,
"loss": 0.6797,
"step": 10780
},
{
"epoch": 5.491336377667107,
"grad_norm": 0.14082978665828705,
"learning_rate": 4.984671401153268e-05,
"loss": 0.6773,
"step": 10790
},
{
"epoch": 5.496427946347595,
"grad_norm": 0.12051651626825333,
"learning_rate": 4.984657194852205e-05,
"loss": 0.6774,
"step": 10800
},
{
"epoch": 5.501519515028083,
"grad_norm": 0.15081602334976196,
"learning_rate": 4.984642988551142e-05,
"loss": 0.6866,
"step": 10810
},
{
"epoch": 5.506611083708571,
"grad_norm": 0.09743819385766983,
"learning_rate": 4.9846287822500795e-05,
"loss": 0.6804,
"step": 10820
},
{
"epoch": 5.51170265238906,
"grad_norm": 0.09400393813848495,
"learning_rate": 4.984614575949017e-05,
"loss": 0.6815,
"step": 10830
},
{
"epoch": 5.5167942210695475,
"grad_norm": 0.13835515081882477,
"learning_rate": 4.9846003696479534e-05,
"loss": 0.6866,
"step": 10840
},
{
"epoch": 5.521885789750036,
"grad_norm": 0.11208510398864746,
"learning_rate": 4.984586163346891e-05,
"loss": 0.6805,
"step": 10850
},
{
"epoch": 5.526977358430524,
"grad_norm": 0.11167927086353302,
"learning_rate": 4.984571957045828e-05,
"loss": 0.6799,
"step": 10860
},
{
"epoch": 5.532068927111012,
"grad_norm": 0.12590061128139496,
"learning_rate": 4.9845577507447653e-05,
"loss": 0.676,
"step": 10870
},
{
"epoch": 5.5371604957915,
"grad_norm": 0.15050916373729706,
"learning_rate": 4.9845435444437027e-05,
"loss": 0.6712,
"step": 10880
},
{
"epoch": 5.542252064471988,
"grad_norm": 0.14142751693725586,
"learning_rate": 4.98452933814264e-05,
"loss": 0.676,
"step": 10890
},
{
"epoch": 5.547343633152477,
"grad_norm": 0.24029377102851868,
"learning_rate": 4.984515131841577e-05,
"loss": 0.683,
"step": 10900
},
{
"epoch": 5.552435201832965,
"grad_norm": 0.11458209902048111,
"learning_rate": 4.9845009255405146e-05,
"loss": 0.6795,
"step": 10910
},
{
"epoch": 5.557526770513453,
"grad_norm": 0.10509049147367477,
"learning_rate": 4.984486719239452e-05,
"loss": 0.6832,
"step": 10920
},
{
"epoch": 5.562618339193941,
"grad_norm": 0.1304958164691925,
"learning_rate": 4.984472512938389e-05,
"loss": 0.6814,
"step": 10930
},
{
"epoch": 5.567709907874429,
"grad_norm": 0.11066732555627823,
"learning_rate": 4.9844583066373265e-05,
"loss": 0.6734,
"step": 10940
},
{
"epoch": 5.572801476554917,
"grad_norm": 0.14044025540351868,
"learning_rate": 4.984444100336264e-05,
"loss": 0.6851,
"step": 10950
},
{
"epoch": 5.577893045235405,
"grad_norm": 0.09776227921247482,
"learning_rate": 4.9844298940352005e-05,
"loss": 0.6797,
"step": 10960
},
{
"epoch": 5.582984613915894,
"grad_norm": 0.08972660452127457,
"learning_rate": 4.984415687734138e-05,
"loss": 0.6803,
"step": 10970
},
{
"epoch": 5.588076182596382,
"grad_norm": 0.11810458451509476,
"learning_rate": 4.984401481433075e-05,
"loss": 0.6802,
"step": 10980
},
{
"epoch": 5.59316775127687,
"grad_norm": 0.11004742234945297,
"learning_rate": 4.9843872751320124e-05,
"loss": 0.6795,
"step": 10990
},
{
"epoch": 5.598259319957358,
"grad_norm": 0.10075508058071136,
"learning_rate": 4.98437306883095e-05,
"loss": 0.682,
"step": 11000
},
{
"epoch": 5.603350888637847,
"grad_norm": 0.10835061222314835,
"learning_rate": 4.9843588625298863e-05,
"loss": 0.6829,
"step": 11010
},
{
"epoch": 5.6084424573183345,
"grad_norm": 0.1209336370229721,
"learning_rate": 4.9843446562288236e-05,
"loss": 0.6808,
"step": 11020
},
{
"epoch": 5.613534025998822,
"grad_norm": 0.12438962608575821,
"learning_rate": 4.984330449927761e-05,
"loss": 0.6768,
"step": 11030
},
{
"epoch": 5.618625594679311,
"grad_norm": 0.1364268809556961,
"learning_rate": 4.984316243626698e-05,
"loss": 0.6781,
"step": 11040
},
{
"epoch": 5.623717163359799,
"grad_norm": 0.11569849401712418,
"learning_rate": 4.9843020373256356e-05,
"loss": 0.6825,
"step": 11050
},
{
"epoch": 5.628808732040287,
"grad_norm": 0.10072596371173859,
"learning_rate": 4.984287831024573e-05,
"loss": 0.6764,
"step": 11060
},
{
"epoch": 5.633900300720775,
"grad_norm": 0.15180449187755585,
"learning_rate": 4.98427362472351e-05,
"loss": 0.6782,
"step": 11070
},
{
"epoch": 5.638991869401263,
"grad_norm": 0.14204277098178864,
"learning_rate": 4.9842594184224475e-05,
"loss": 0.6806,
"step": 11080
},
{
"epoch": 5.6440834380817515,
"grad_norm": 0.12409929186105728,
"learning_rate": 4.984245212121385e-05,
"loss": 0.6806,
"step": 11090
},
{
"epoch": 5.649175006762239,
"grad_norm": 0.1692194640636444,
"learning_rate": 4.9842310058203215e-05,
"loss": 0.6723,
"step": 11100
},
{
"epoch": 5.654266575442728,
"grad_norm": 0.2566402852535248,
"learning_rate": 4.984216799519259e-05,
"loss": 0.6845,
"step": 11110
},
{
"epoch": 5.659358144123216,
"grad_norm": 0.13745322823524475,
"learning_rate": 4.984202593218196e-05,
"loss": 0.6748,
"step": 11120
},
{
"epoch": 5.664449712803704,
"grad_norm": 0.16598811745643616,
"learning_rate": 4.9841883869171334e-05,
"loss": 0.6798,
"step": 11130
},
{
"epoch": 5.669541281484192,
"grad_norm": 0.13570183515548706,
"learning_rate": 4.984174180616071e-05,
"loss": 0.6797,
"step": 11140
},
{
"epoch": 5.674632850164681,
"grad_norm": 0.17549622058868408,
"learning_rate": 4.984159974315008e-05,
"loss": 0.6773,
"step": 11150
},
{
"epoch": 5.6797244188451685,
"grad_norm": 0.15479332208633423,
"learning_rate": 4.984145768013945e-05,
"loss": 0.6795,
"step": 11160
},
{
"epoch": 5.684815987525656,
"grad_norm": 0.1562296450138092,
"learning_rate": 4.9841315617128826e-05,
"loss": 0.6803,
"step": 11170
},
{
"epoch": 5.689907556206145,
"grad_norm": 0.13014480471611023,
"learning_rate": 4.98411735541182e-05,
"loss": 0.6793,
"step": 11180
},
{
"epoch": 5.694999124886633,
"grad_norm": 0.1577223241329193,
"learning_rate": 4.984103149110757e-05,
"loss": 0.6845,
"step": 11190
},
{
"epoch": 5.700090693567121,
"grad_norm": 0.14906632900238037,
"learning_rate": 4.9840889428096946e-05,
"loss": 0.6771,
"step": 11200
},
{
"epoch": 5.705182262247609,
"grad_norm": 0.15042632818222046,
"learning_rate": 4.984074736508632e-05,
"loss": 0.6737,
"step": 11210
},
{
"epoch": 5.710273830928098,
"grad_norm": 0.1530093252658844,
"learning_rate": 4.9840605302075685e-05,
"loss": 0.6804,
"step": 11220
},
{
"epoch": 5.715365399608586,
"grad_norm": 0.18300846219062805,
"learning_rate": 4.984046323906506e-05,
"loss": 0.6752,
"step": 11230
},
{
"epoch": 5.720456968289074,
"grad_norm": 0.14398545026779175,
"learning_rate": 4.9840321176054424e-05,
"loss": 0.6793,
"step": 11240
},
{
"epoch": 5.725548536969562,
"grad_norm": 0.12745435535907745,
"learning_rate": 4.98401791130438e-05,
"loss": 0.6765,
"step": 11250
},
{
"epoch": 5.73064010565005,
"grad_norm": 0.15162277221679688,
"learning_rate": 4.984003705003317e-05,
"loss": 0.6744,
"step": 11260
},
{
"epoch": 5.735731674330538,
"grad_norm": 0.12970998883247375,
"learning_rate": 4.9839894987022544e-05,
"loss": 0.6818,
"step": 11270
},
{
"epoch": 5.740823243011026,
"grad_norm": 0.1195228323340416,
"learning_rate": 4.983975292401192e-05,
"loss": 0.6749,
"step": 11280
},
{
"epoch": 5.745914811691515,
"grad_norm": 0.14821238815784454,
"learning_rate": 4.983961086100129e-05,
"loss": 0.6759,
"step": 11290
},
{
"epoch": 5.751006380372003,
"grad_norm": 0.18345175683498383,
"learning_rate": 4.983946879799066e-05,
"loss": 0.6736,
"step": 11300
},
{
"epoch": 5.75609794905249,
"grad_norm": 0.14165613055229187,
"learning_rate": 4.9839326734980036e-05,
"loss": 0.6777,
"step": 11310
},
{
"epoch": 5.761189517732979,
"grad_norm": 0.16045770049095154,
"learning_rate": 4.983918467196941e-05,
"loss": 0.678,
"step": 11320
},
{
"epoch": 5.766281086413467,
"grad_norm": 0.1490974873304367,
"learning_rate": 4.983904260895878e-05,
"loss": 0.68,
"step": 11330
},
{
"epoch": 5.7713726550939555,
"grad_norm": 0.11064887046813965,
"learning_rate": 4.9838900545948156e-05,
"loss": 0.6832,
"step": 11340
},
{
"epoch": 5.776464223774443,
"grad_norm": 0.11848734319210052,
"learning_rate": 4.983875848293753e-05,
"loss": 0.6792,
"step": 11350
},
{
"epoch": 5.781555792454932,
"grad_norm": 0.1246313750743866,
"learning_rate": 4.9838616419926895e-05,
"loss": 0.6794,
"step": 11360
},
{
"epoch": 5.78664736113542,
"grad_norm": 0.17359575629234314,
"learning_rate": 4.983847435691627e-05,
"loss": 0.6762,
"step": 11370
},
{
"epoch": 5.791738929815908,
"grad_norm": 0.16471154987812042,
"learning_rate": 4.983833229390564e-05,
"loss": 0.6742,
"step": 11380
},
{
"epoch": 5.796830498496396,
"grad_norm": 0.1479930430650711,
"learning_rate": 4.9838190230895014e-05,
"loss": 0.678,
"step": 11390
},
{
"epoch": 5.801922067176884,
"grad_norm": 0.11385341733694077,
"learning_rate": 4.983804816788439e-05,
"loss": 0.6791,
"step": 11400
},
{
"epoch": 5.8070136358573725,
"grad_norm": 0.13574256002902985,
"learning_rate": 4.983790610487376e-05,
"loss": 0.6795,
"step": 11410
},
{
"epoch": 5.81210520453786,
"grad_norm": 0.1701575517654419,
"learning_rate": 4.9837764041863134e-05,
"loss": 0.6791,
"step": 11420
},
{
"epoch": 5.817196773218349,
"grad_norm": 0.11972179263830185,
"learning_rate": 4.98376219788525e-05,
"loss": 0.6802,
"step": 11430
},
{
"epoch": 5.822288341898837,
"grad_norm": 0.15830230712890625,
"learning_rate": 4.983747991584187e-05,
"loss": 0.6761,
"step": 11440
},
{
"epoch": 5.827379910579325,
"grad_norm": 0.16592001914978027,
"learning_rate": 4.9837337852831246e-05,
"loss": 0.6768,
"step": 11450
},
{
"epoch": 5.832471479259813,
"grad_norm": 0.21496979892253876,
"learning_rate": 4.983719578982062e-05,
"loss": 0.6783,
"step": 11460
},
{
"epoch": 5.837563047940302,
"grad_norm": 0.14850680530071259,
"learning_rate": 4.983705372680999e-05,
"loss": 0.6781,
"step": 11470
},
{
"epoch": 5.8426546166207896,
"grad_norm": 0.12256158143281937,
"learning_rate": 4.9836911663799365e-05,
"loss": 0.6776,
"step": 11480
},
{
"epoch": 5.847746185301277,
"grad_norm": 0.14311592280864716,
"learning_rate": 4.983676960078874e-05,
"loss": 0.6717,
"step": 11490
},
{
"epoch": 5.852837753981766,
"grad_norm": 0.1648699939250946,
"learning_rate": 4.9836627537778105e-05,
"loss": 0.6779,
"step": 11500
},
{
"epoch": 5.857929322662254,
"grad_norm": 0.13590501248836517,
"learning_rate": 4.983648547476748e-05,
"loss": 0.6824,
"step": 11510
},
{
"epoch": 5.863020891342742,
"grad_norm": 0.13972793519496918,
"learning_rate": 4.983634341175685e-05,
"loss": 0.679,
"step": 11520
},
{
"epoch": 5.86811246002323,
"grad_norm": 0.11360618472099304,
"learning_rate": 4.9836201348746224e-05,
"loss": 0.6746,
"step": 11530
},
{
"epoch": 5.873204028703718,
"grad_norm": 0.14063167572021484,
"learning_rate": 4.98360592857356e-05,
"loss": 0.6818,
"step": 11540
},
{
"epoch": 5.878295597384207,
"grad_norm": 0.12393573671579361,
"learning_rate": 4.983591722272497e-05,
"loss": 0.6771,
"step": 11550
},
{
"epoch": 5.883387166064694,
"grad_norm": 0.12383928149938583,
"learning_rate": 4.9835775159714344e-05,
"loss": 0.6807,
"step": 11560
},
{
"epoch": 5.888478734745183,
"grad_norm": 0.11464569717645645,
"learning_rate": 4.983563309670372e-05,
"loss": 0.6823,
"step": 11570
},
{
"epoch": 5.893570303425671,
"grad_norm": 0.15896569192409515,
"learning_rate": 4.983549103369309e-05,
"loss": 0.678,
"step": 11580
},
{
"epoch": 5.898661872106159,
"grad_norm": 0.11153749376535416,
"learning_rate": 4.983534897068246e-05,
"loss": 0.6799,
"step": 11590
},
{
"epoch": 5.903753440786647,
"grad_norm": 0.13557817041873932,
"learning_rate": 4.9835206907671836e-05,
"loss": 0.678,
"step": 11600
},
{
"epoch": 5.908845009467136,
"grad_norm": 0.12681804597377777,
"learning_rate": 4.98350648446612e-05,
"loss": 0.6853,
"step": 11610
},
{
"epoch": 5.913936578147624,
"grad_norm": 0.11007581651210785,
"learning_rate": 4.9834922781650575e-05,
"loss": 0.6799,
"step": 11620
},
{
"epoch": 5.919028146828111,
"grad_norm": 0.14073921740055084,
"learning_rate": 4.983478071863995e-05,
"loss": 0.6809,
"step": 11630
},
{
"epoch": 5.9241197155086,
"grad_norm": 0.17294389009475708,
"learning_rate": 4.9834638655629315e-05,
"loss": 0.677,
"step": 11640
},
{
"epoch": 5.929211284189088,
"grad_norm": 0.11901852488517761,
"learning_rate": 4.983449659261869e-05,
"loss": 0.6814,
"step": 11650
},
{
"epoch": 5.9343028528695765,
"grad_norm": 0.1563209444284439,
"learning_rate": 4.983435452960806e-05,
"loss": 0.6803,
"step": 11660
},
{
"epoch": 5.939394421550064,
"grad_norm": 0.1763051152229309,
"learning_rate": 4.9834212466597434e-05,
"loss": 0.6713,
"step": 11670
},
{
"epoch": 5.944485990230553,
"grad_norm": 0.1412787139415741,
"learning_rate": 4.983407040358681e-05,
"loss": 0.6791,
"step": 11680
},
{
"epoch": 5.949577558911041,
"grad_norm": 0.13946793973445892,
"learning_rate": 4.983392834057618e-05,
"loss": 0.674,
"step": 11690
},
{
"epoch": 5.954669127591529,
"grad_norm": 0.1848699301481247,
"learning_rate": 4.9833786277565553e-05,
"loss": 0.6785,
"step": 11700
},
{
"epoch": 5.959760696272017,
"grad_norm": 0.14714594185352325,
"learning_rate": 4.9833644214554927e-05,
"loss": 0.6764,
"step": 11710
},
{
"epoch": 5.964852264952505,
"grad_norm": 0.14410807192325592,
"learning_rate": 4.98335021515443e-05,
"loss": 0.6755,
"step": 11720
},
{
"epoch": 5.9699438336329935,
"grad_norm": 0.11196265369653702,
"learning_rate": 4.983336008853367e-05,
"loss": 0.6801,
"step": 11730
},
{
"epoch": 5.975035402313481,
"grad_norm": 0.14931631088256836,
"learning_rate": 4.9833218025523046e-05,
"loss": 0.6761,
"step": 11740
},
{
"epoch": 5.98012697099397,
"grad_norm": 0.1235998123884201,
"learning_rate": 4.983307596251241e-05,
"loss": 0.6816,
"step": 11750
},
{
"epoch": 5.985218539674458,
"grad_norm": 0.14235694706439972,
"learning_rate": 4.9832933899501785e-05,
"loss": 0.6784,
"step": 11760
},
{
"epoch": 5.9903101083549455,
"grad_norm": 0.11291839182376862,
"learning_rate": 4.983279183649116e-05,
"loss": 0.6857,
"step": 11770
},
{
"epoch": 5.995401677035434,
"grad_norm": 0.12273520231246948,
"learning_rate": 4.983264977348053e-05,
"loss": 0.6801,
"step": 11780
},
{
"epoch": 6.0,
"grad_norm": 0.025783156976103783,
"learning_rate": 4.9832507710469905e-05,
"loss": 0.6142,
"step": 11790
},
{
"epoch": 6.005091568680488,
"grad_norm": 0.1227310448884964,
"learning_rate": 4.983236564745928e-05,
"loss": 0.679,
"step": 11800
},
{
"epoch": 6.010183137360976,
"grad_norm": 0.14122678339481354,
"learning_rate": 4.983222358444865e-05,
"loss": 0.677,
"step": 11810
},
{
"epoch": 6.015274706041464,
"grad_norm": 0.14405541121959686,
"learning_rate": 4.9832081521438024e-05,
"loss": 0.6799,
"step": 11820
},
{
"epoch": 6.020366274721953,
"grad_norm": 0.18694424629211426,
"learning_rate": 4.98319394584274e-05,
"loss": 0.675,
"step": 11830
},
{
"epoch": 6.025457843402441,
"grad_norm": 0.1961718052625656,
"learning_rate": 4.983179739541677e-05,
"loss": 0.6819,
"step": 11840
},
{
"epoch": 6.030549412082929,
"grad_norm": 0.1102224811911583,
"learning_rate": 4.9831655332406137e-05,
"loss": 0.682,
"step": 11850
},
{
"epoch": 6.035640980763417,
"grad_norm": 0.1295260190963745,
"learning_rate": 4.983151326939551e-05,
"loss": 0.6794,
"step": 11860
},
{
"epoch": 6.040732549443905,
"grad_norm": 0.12580661475658417,
"learning_rate": 4.983137120638488e-05,
"loss": 0.6791,
"step": 11870
},
{
"epoch": 6.0458241181243935,
"grad_norm": 0.1288338154554367,
"learning_rate": 4.9831229143374256e-05,
"loss": 0.6805,
"step": 11880
},
{
"epoch": 6.050915686804881,
"grad_norm": 0.1211671456694603,
"learning_rate": 4.983108708036362e-05,
"loss": 0.6764,
"step": 11890
},
{
"epoch": 6.05600725548537,
"grad_norm": 0.15219536423683167,
"learning_rate": 4.9830945017352995e-05,
"loss": 0.6806,
"step": 11900
},
{
"epoch": 6.061098824165858,
"grad_norm": 0.12759484350681305,
"learning_rate": 4.983080295434237e-05,
"loss": 0.676,
"step": 11910
},
{
"epoch": 6.066190392846346,
"grad_norm": 0.1949695497751236,
"learning_rate": 4.983066089133174e-05,
"loss": 0.6832,
"step": 11920
},
{
"epoch": 6.071281961526834,
"grad_norm": 0.11879277229309082,
"learning_rate": 4.9830518828321115e-05,
"loss": 0.6781,
"step": 11930
},
{
"epoch": 6.076373530207323,
"grad_norm": 0.12636293470859528,
"learning_rate": 4.983037676531049e-05,
"loss": 0.6774,
"step": 11940
},
{
"epoch": 6.0814650988878105,
"grad_norm": 0.13675157725811005,
"learning_rate": 4.983023470229986e-05,
"loss": 0.6789,
"step": 11950
},
{
"epoch": 6.086556667568298,
"grad_norm": 0.13322140276432037,
"learning_rate": 4.9830092639289234e-05,
"loss": 0.6805,
"step": 11960
},
{
"epoch": 6.091648236248787,
"grad_norm": 0.1352871060371399,
"learning_rate": 4.982995057627861e-05,
"loss": 0.6808,
"step": 11970
},
{
"epoch": 6.096739804929275,
"grad_norm": 0.14976170659065247,
"learning_rate": 4.982980851326798e-05,
"loss": 0.6775,
"step": 11980
},
{
"epoch": 6.101831373609763,
"grad_norm": 0.1250462532043457,
"learning_rate": 4.982966645025735e-05,
"loss": 0.6782,
"step": 11990
},
{
"epoch": 6.106922942290251,
"grad_norm": 0.16815803945064545,
"learning_rate": 4.9829524387246726e-05,
"loss": 0.6721,
"step": 12000
},
{
"epoch": 6.11201451097074,
"grad_norm": 0.18195395171642303,
"learning_rate": 4.982938232423609e-05,
"loss": 0.6806,
"step": 12010
},
{
"epoch": 6.1171060796512275,
"grad_norm": 0.15061675012111664,
"learning_rate": 4.9829240261225466e-05,
"loss": 0.6732,
"step": 12020
},
{
"epoch": 6.122197648331715,
"grad_norm": 0.14526985585689545,
"learning_rate": 4.982909819821484e-05,
"loss": 0.6788,
"step": 12030
},
{
"epoch": 6.127289217012204,
"grad_norm": 0.1469496637582779,
"learning_rate": 4.982895613520421e-05,
"loss": 0.6779,
"step": 12040
},
{
"epoch": 6.132380785692692,
"grad_norm": 0.18443866074085236,
"learning_rate": 4.9828814072193585e-05,
"loss": 0.6767,
"step": 12050
},
{
"epoch": 6.13747235437318,
"grad_norm": 0.11885727196931839,
"learning_rate": 4.982867200918295e-05,
"loss": 0.6764,
"step": 12060
},
{
"epoch": 6.142563923053668,
"grad_norm": 0.1266055554151535,
"learning_rate": 4.9828529946172325e-05,
"loss": 0.6837,
"step": 12070
},
{
"epoch": 6.147655491734157,
"grad_norm": 0.11415141075849533,
"learning_rate": 4.98283878831617e-05,
"loss": 0.6837,
"step": 12080
},
{
"epoch": 6.152747060414645,
"grad_norm": 0.09705322235822678,
"learning_rate": 4.982824582015107e-05,
"loss": 0.6815,
"step": 12090
},
{
"epoch": 6.157838629095132,
"grad_norm": 0.12555427849292755,
"learning_rate": 4.9828103757140444e-05,
"loss": 0.6804,
"step": 12100
},
{
"epoch": 6.162930197775621,
"grad_norm": 0.11063813418149948,
"learning_rate": 4.982796169412982e-05,
"loss": 0.6815,
"step": 12110
},
{
"epoch": 6.168021766456109,
"grad_norm": 0.1428930014371872,
"learning_rate": 4.982781963111919e-05,
"loss": 0.6781,
"step": 12120
},
{
"epoch": 6.173113335136597,
"grad_norm": 0.13896307349205017,
"learning_rate": 4.982767756810856e-05,
"loss": 0.6763,
"step": 12130
},
{
"epoch": 6.178204903817085,
"grad_norm": 0.12032928317785263,
"learning_rate": 4.9827535505097936e-05,
"loss": 0.6803,
"step": 12140
},
{
"epoch": 6.183296472497574,
"grad_norm": 0.11562150716781616,
"learning_rate": 4.98273934420873e-05,
"loss": 0.6766,
"step": 12150
},
{
"epoch": 6.188388041178062,
"grad_norm": 0.1040254682302475,
"learning_rate": 4.9827251379076676e-05,
"loss": 0.6823,
"step": 12160
},
{
"epoch": 6.19347960985855,
"grad_norm": 0.1031600683927536,
"learning_rate": 4.982710931606605e-05,
"loss": 0.6757,
"step": 12170
},
{
"epoch": 6.198571178539038,
"grad_norm": 0.11150684952735901,
"learning_rate": 4.982696725305542e-05,
"loss": 0.6781,
"step": 12180
},
{
"epoch": 6.203662747219526,
"grad_norm": 0.15506963431835175,
"learning_rate": 4.9826825190044795e-05,
"loss": 0.672,
"step": 12190
},
{
"epoch": 6.2087543159000145,
"grad_norm": 0.13985055685043335,
"learning_rate": 4.982668312703417e-05,
"loss": 0.6793,
"step": 12200
},
{
"epoch": 6.213845884580502,
"grad_norm": 0.11352770030498505,
"learning_rate": 4.982654106402354e-05,
"loss": 0.6824,
"step": 12210
},
{
"epoch": 6.218937453260991,
"grad_norm": 0.11052574217319489,
"learning_rate": 4.9826399001012914e-05,
"loss": 0.6791,
"step": 12220
},
{
"epoch": 6.224029021941479,
"grad_norm": 0.12992137670516968,
"learning_rate": 4.982625693800229e-05,
"loss": 0.6793,
"step": 12230
},
{
"epoch": 6.229120590621967,
"grad_norm": 0.1408848613500595,
"learning_rate": 4.982611487499166e-05,
"loss": 0.6791,
"step": 12240
},
{
"epoch": 6.234212159302455,
"grad_norm": 0.18795296549797058,
"learning_rate": 4.9825972811981034e-05,
"loss": 0.6802,
"step": 12250
},
{
"epoch": 6.239303727982943,
"grad_norm": 0.12889884412288666,
"learning_rate": 4.982583074897041e-05,
"loss": 0.6878,
"step": 12260
},
{
"epoch": 6.2443952966634315,
"grad_norm": 0.1431640386581421,
"learning_rate": 4.982568868595977e-05,
"loss": 0.6775,
"step": 12270
},
{
"epoch": 6.249486865343919,
"grad_norm": 0.11410534381866455,
"learning_rate": 4.9825546622949146e-05,
"loss": 0.6798,
"step": 12280
},
{
"epoch": 6.254578434024408,
"grad_norm": 0.14347901940345764,
"learning_rate": 4.982540455993851e-05,
"loss": 0.6764,
"step": 12290
},
{
"epoch": 6.259670002704896,
"grad_norm": 0.14148719608783722,
"learning_rate": 4.9825262496927886e-05,
"loss": 0.6778,
"step": 12300
},
{
"epoch": 6.264761571385384,
"grad_norm": 0.13571056723594666,
"learning_rate": 4.982512043391726e-05,
"loss": 0.6822,
"step": 12310
},
{
"epoch": 6.269853140065872,
"grad_norm": 0.13416819274425507,
"learning_rate": 4.982497837090663e-05,
"loss": 0.6764,
"step": 12320
},
{
"epoch": 6.274944708746361,
"grad_norm": 0.12467856705188751,
"learning_rate": 4.9824836307896005e-05,
"loss": 0.681,
"step": 12330
},
{
"epoch": 6.280036277426849,
"grad_norm": 0.11934306472539902,
"learning_rate": 4.982469424488538e-05,
"loss": 0.6808,
"step": 12340
},
{
"epoch": 6.285127846107336,
"grad_norm": 0.12335172295570374,
"learning_rate": 4.982455218187475e-05,
"loss": 0.6795,
"step": 12350
},
{
"epoch": 6.290219414787825,
"grad_norm": 0.12900583446025848,
"learning_rate": 4.9824410118864124e-05,
"loss": 0.6736,
"step": 12360
},
{
"epoch": 6.295310983468313,
"grad_norm": 0.11381091177463531,
"learning_rate": 4.98242680558535e-05,
"loss": 0.6838,
"step": 12370
},
{
"epoch": 6.300402552148801,
"grad_norm": 0.11505099385976791,
"learning_rate": 4.982412599284287e-05,
"loss": 0.6772,
"step": 12380
},
{
"epoch": 6.305494120829289,
"grad_norm": 0.11616339534521103,
"learning_rate": 4.9823983929832244e-05,
"loss": 0.6788,
"step": 12390
},
{
"epoch": 6.310585689509777,
"grad_norm": 0.1088867336511612,
"learning_rate": 4.982384186682162e-05,
"loss": 0.6777,
"step": 12400
},
{
"epoch": 6.315677258190266,
"grad_norm": 0.11975440382957458,
"learning_rate": 4.982369980381098e-05,
"loss": 0.6854,
"step": 12410
},
{
"epoch": 6.320768826870753,
"grad_norm": 0.11531190574169159,
"learning_rate": 4.9823557740800356e-05,
"loss": 0.6786,
"step": 12420
},
{
"epoch": 6.325860395551242,
"grad_norm": 0.117821604013443,
"learning_rate": 4.982341567778973e-05,
"loss": 0.6814,
"step": 12430
},
{
"epoch": 6.33095196423173,
"grad_norm": 0.13663433492183685,
"learning_rate": 4.98232736147791e-05,
"loss": 0.6754,
"step": 12440
},
{
"epoch": 6.3360435329122184,
"grad_norm": 0.14458602666854858,
"learning_rate": 4.9823131551768475e-05,
"loss": 0.6829,
"step": 12450
},
{
"epoch": 6.341135101592706,
"grad_norm": 0.12459100037813187,
"learning_rate": 4.982298948875785e-05,
"loss": 0.6803,
"step": 12460
},
{
"epoch": 6.346226670273195,
"grad_norm": 0.11213183403015137,
"learning_rate": 4.9822847425747215e-05,
"loss": 0.6776,
"step": 12470
},
{
"epoch": 6.351318238953683,
"grad_norm": 0.12166488170623779,
"learning_rate": 4.982270536273659e-05,
"loss": 0.6817,
"step": 12480
},
{
"epoch": 6.35640980763417,
"grad_norm": 0.11691765487194061,
"learning_rate": 4.982256329972596e-05,
"loss": 0.6829,
"step": 12490
},
{
"epoch": 6.361501376314659,
"grad_norm": 0.1120506301522255,
"learning_rate": 4.9822421236715334e-05,
"loss": 0.6791,
"step": 12500
},
{
"epoch": 6.366592944995147,
"grad_norm": 0.12437008321285248,
"learning_rate": 4.982227917370471e-05,
"loss": 0.6751,
"step": 12510
},
{
"epoch": 6.3716845136756355,
"grad_norm": 0.15133772790431976,
"learning_rate": 4.982213711069408e-05,
"loss": 0.6785,
"step": 12520
},
{
"epoch": 6.376776082356123,
"grad_norm": 0.14470815658569336,
"learning_rate": 4.9821995047683453e-05,
"loss": 0.6805,
"step": 12530
},
{
"epoch": 6.381867651036612,
"grad_norm": 0.1352653056383133,
"learning_rate": 4.9821852984672827e-05,
"loss": 0.6799,
"step": 12540
},
{
"epoch": 6.3869592197171,
"grad_norm": 0.12650400400161743,
"learning_rate": 4.982171092166219e-05,
"loss": 0.6788,
"step": 12550
},
{
"epoch": 6.392050788397588,
"grad_norm": 0.12057118117809296,
"learning_rate": 4.9821568858651566e-05,
"loss": 0.6811,
"step": 12560
},
{
"epoch": 6.397142357078076,
"grad_norm": 0.16348209977149963,
"learning_rate": 4.982142679564094e-05,
"loss": 0.6799,
"step": 12570
},
{
"epoch": 6.402233925758564,
"grad_norm": 0.18208801746368408,
"learning_rate": 4.982128473263031e-05,
"loss": 0.6738,
"step": 12580
},
{
"epoch": 6.4073254944390525,
"grad_norm": 0.1399811953306198,
"learning_rate": 4.9821142669619685e-05,
"loss": 0.6762,
"step": 12590
},
{
"epoch": 6.41241706311954,
"grad_norm": 0.11085145175457001,
"learning_rate": 4.982100060660906e-05,
"loss": 0.6914,
"step": 12600
},
{
"epoch": 6.417508631800029,
"grad_norm": 0.10344066470861435,
"learning_rate": 4.982085854359843e-05,
"loss": 0.6809,
"step": 12610
},
{
"epoch": 6.422600200480517,
"grad_norm": 0.13643105328083038,
"learning_rate": 4.9820716480587805e-05,
"loss": 0.6752,
"step": 12620
},
{
"epoch": 6.4276917691610045,
"grad_norm": 0.12111321091651917,
"learning_rate": 4.982057441757718e-05,
"loss": 0.6786,
"step": 12630
},
{
"epoch": 6.432783337841493,
"grad_norm": 0.1612890660762787,
"learning_rate": 4.982043235456655e-05,
"loss": 0.6789,
"step": 12640
},
{
"epoch": 6.437874906521981,
"grad_norm": 0.15844057500362396,
"learning_rate": 4.9820290291555924e-05,
"loss": 0.6826,
"step": 12650
},
{
"epoch": 6.44296647520247,
"grad_norm": 0.128059983253479,
"learning_rate": 4.982014822854529e-05,
"loss": 0.6776,
"step": 12660
},
{
"epoch": 6.448058043882957,
"grad_norm": 0.13311228156089783,
"learning_rate": 4.9820006165534663e-05,
"loss": 0.6793,
"step": 12670
},
{
"epoch": 6.453149612563446,
"grad_norm": 0.15546241402626038,
"learning_rate": 4.9819864102524037e-05,
"loss": 0.6753,
"step": 12680
},
{
"epoch": 6.458241181243934,
"grad_norm": 0.1458451747894287,
"learning_rate": 4.98197220395134e-05,
"loss": 0.6817,
"step": 12690
},
{
"epoch": 6.463332749924422,
"grad_norm": 0.12202929705381393,
"learning_rate": 4.9819579976502776e-05,
"loss": 0.6801,
"step": 12700
},
{
"epoch": 6.46842431860491,
"grad_norm": 0.137448251247406,
"learning_rate": 4.981943791349215e-05,
"loss": 0.6779,
"step": 12710
},
{
"epoch": 6.473515887285398,
"grad_norm": 0.12428711354732513,
"learning_rate": 4.981929585048152e-05,
"loss": 0.6814,
"step": 12720
},
{
"epoch": 6.478607455965887,
"grad_norm": 0.15364359319210052,
"learning_rate": 4.9819153787470895e-05,
"loss": 0.6719,
"step": 12730
},
{
"epoch": 6.483699024646374,
"grad_norm": 0.1646897941827774,
"learning_rate": 4.981901172446027e-05,
"loss": 0.6787,
"step": 12740
},
{
"epoch": 6.488790593326863,
"grad_norm": 0.18058307468891144,
"learning_rate": 4.981886966144964e-05,
"loss": 0.6797,
"step": 12750
},
{
"epoch": 6.493882162007351,
"grad_norm": 0.13395850360393524,
"learning_rate": 4.9818727598439015e-05,
"loss": 0.6776,
"step": 12760
},
{
"epoch": 6.4989737306878395,
"grad_norm": 0.15397368371486664,
"learning_rate": 4.981858553542839e-05,
"loss": 0.6698,
"step": 12770
},
{
"epoch": 6.504065299368327,
"grad_norm": 0.16110943257808685,
"learning_rate": 4.981844347241776e-05,
"loss": 0.6849,
"step": 12780
},
{
"epoch": 6.509156868048816,
"grad_norm": 0.18386079370975494,
"learning_rate": 4.9818301409407134e-05,
"loss": 0.6813,
"step": 12790
},
{
"epoch": 6.514248436729304,
"grad_norm": 0.11144635081291199,
"learning_rate": 4.98181593463965e-05,
"loss": 0.6746,
"step": 12800
},
{
"epoch": 6.519340005409791,
"grad_norm": 0.1547509729862213,
"learning_rate": 4.981801728338587e-05,
"loss": 0.6775,
"step": 12810
},
{
"epoch": 6.52443157409028,
"grad_norm": 0.12533412873744965,
"learning_rate": 4.9817875220375246e-05,
"loss": 0.6723,
"step": 12820
},
{
"epoch": 6.529523142770768,
"grad_norm": 0.13594309985637665,
"learning_rate": 4.981773315736462e-05,
"loss": 0.6815,
"step": 12830
},
{
"epoch": 6.5346147114512565,
"grad_norm": 0.16000863909721375,
"learning_rate": 4.981759109435399e-05,
"loss": 0.6845,
"step": 12840
},
{
"epoch": 6.539706280131744,
"grad_norm": 0.12660828232765198,
"learning_rate": 4.9817449031343366e-05,
"loss": 0.6776,
"step": 12850
},
{
"epoch": 6.544797848812232,
"grad_norm": 0.13099251687526703,
"learning_rate": 4.981730696833274e-05,
"loss": 0.6761,
"step": 12860
},
{
"epoch": 6.549889417492721,
"grad_norm": 0.13618282973766327,
"learning_rate": 4.981716490532211e-05,
"loss": 0.6777,
"step": 12870
},
{
"epoch": 6.5549809861732085,
"grad_norm": 0.128812775015831,
"learning_rate": 4.9817022842311485e-05,
"loss": 0.687,
"step": 12880
},
{
"epoch": 6.560072554853697,
"grad_norm": 0.10990247130393982,
"learning_rate": 4.981688077930085e-05,
"loss": 0.6792,
"step": 12890
},
{
"epoch": 6.565164123534185,
"grad_norm": 0.13022927939891815,
"learning_rate": 4.9816738716290225e-05,
"loss": 0.6785,
"step": 12900
},
{
"epoch": 6.5702556922146735,
"grad_norm": 0.14299486577510834,
"learning_rate": 4.98165966532796e-05,
"loss": 0.6819,
"step": 12910
},
{
"epoch": 6.575347260895161,
"grad_norm": 0.13400639593601227,
"learning_rate": 4.981645459026897e-05,
"loss": 0.6815,
"step": 12920
},
{
"epoch": 6.58043882957565,
"grad_norm": 0.0999205932021141,
"learning_rate": 4.9816312527258344e-05,
"loss": 0.6788,
"step": 12930
},
{
"epoch": 6.585530398256138,
"grad_norm": 0.11330140382051468,
"learning_rate": 4.981617046424771e-05,
"loss": 0.6805,
"step": 12940
},
{
"epoch": 6.5906219669366255,
"grad_norm": 0.18674777448177338,
"learning_rate": 4.981602840123708e-05,
"loss": 0.6778,
"step": 12950
},
{
"epoch": 6.595713535617114,
"grad_norm": 0.15032435953617096,
"learning_rate": 4.9815886338226456e-05,
"loss": 0.6825,
"step": 12960
},
{
"epoch": 6.600805104297602,
"grad_norm": 0.1333203762769699,
"learning_rate": 4.981574427521583e-05,
"loss": 0.6795,
"step": 12970
},
{
"epoch": 6.605896672978091,
"grad_norm": 0.16465353965759277,
"learning_rate": 4.98156022122052e-05,
"loss": 0.6706,
"step": 12980
},
{
"epoch": 6.610988241658578,
"grad_norm": 0.15451110899448395,
"learning_rate": 4.9815460149194576e-05,
"loss": 0.6757,
"step": 12990
},
{
"epoch": 6.616079810339067,
"grad_norm": 0.15208947658538818,
"learning_rate": 4.981531808618395e-05,
"loss": 0.6818,
"step": 13000
},
{
"epoch": 6.621171379019555,
"grad_norm": 0.13289377093315125,
"learning_rate": 4.981517602317332e-05,
"loss": 0.6811,
"step": 13010
},
{
"epoch": 6.626262947700043,
"grad_norm": 0.18308168649673462,
"learning_rate": 4.9815033960162695e-05,
"loss": 0.678,
"step": 13020
},
{
"epoch": 6.631354516380531,
"grad_norm": 0.12425180524587631,
"learning_rate": 4.981489189715207e-05,
"loss": 0.6816,
"step": 13030
},
{
"epoch": 6.636446085061019,
"grad_norm": 0.13754673302173615,
"learning_rate": 4.981474983414144e-05,
"loss": 0.6773,
"step": 13040
},
{
"epoch": 6.641537653741508,
"grad_norm": 0.15316608548164368,
"learning_rate": 4.9814607771130814e-05,
"loss": 0.6765,
"step": 13050
},
{
"epoch": 6.646629222421995,
"grad_norm": 0.136078342795372,
"learning_rate": 4.981446570812018e-05,
"loss": 0.6767,
"step": 13060
},
{
"epoch": 6.651720791102484,
"grad_norm": 0.12898576259613037,
"learning_rate": 4.9814323645109554e-05,
"loss": 0.6786,
"step": 13070
},
{
"epoch": 6.656812359782972,
"grad_norm": 0.11854422837495804,
"learning_rate": 4.981418158209893e-05,
"loss": 0.6806,
"step": 13080
},
{
"epoch": 6.66190392846346,
"grad_norm": 0.1517888456583023,
"learning_rate": 4.98140395190883e-05,
"loss": 0.6829,
"step": 13090
},
{
"epoch": 6.666995497143948,
"grad_norm": 0.1091533899307251,
"learning_rate": 4.9813897456077666e-05,
"loss": 0.6774,
"step": 13100
},
{
"epoch": 6.672087065824436,
"grad_norm": 0.13526228070259094,
"learning_rate": 4.981375539306704e-05,
"loss": 0.6747,
"step": 13110
},
{
"epoch": 6.677178634504925,
"grad_norm": 0.144491046667099,
"learning_rate": 4.981361333005641e-05,
"loss": 0.6787,
"step": 13120
},
{
"epoch": 6.682270203185412,
"grad_norm": 0.16958777606487274,
"learning_rate": 4.9813471267045786e-05,
"loss": 0.6744,
"step": 13130
},
{
"epoch": 6.687361771865901,
"grad_norm": 0.14115367829799652,
"learning_rate": 4.981332920403516e-05,
"loss": 0.6791,
"step": 13140
},
{
"epoch": 6.692453340546389,
"grad_norm": 0.11081673204898834,
"learning_rate": 4.981318714102453e-05,
"loss": 0.6795,
"step": 13150
},
{
"epoch": 6.6975449092268775,
"grad_norm": 0.14843027293682098,
"learning_rate": 4.9813045078013905e-05,
"loss": 0.6807,
"step": 13160
},
{
"epoch": 6.702636477907365,
"grad_norm": 0.12543180584907532,
"learning_rate": 4.981290301500328e-05,
"loss": 0.6778,
"step": 13170
},
{
"epoch": 6.707728046587853,
"grad_norm": 0.13169404864311218,
"learning_rate": 4.981276095199265e-05,
"loss": 0.675,
"step": 13180
},
{
"epoch": 6.712819615268342,
"grad_norm": 0.15343239903450012,
"learning_rate": 4.9812618888982024e-05,
"loss": 0.6819,
"step": 13190
},
{
"epoch": 6.7179111839488295,
"grad_norm": 0.13029424846172333,
"learning_rate": 4.981247682597139e-05,
"loss": 0.6778,
"step": 13200
},
{
"epoch": 6.723002752629318,
"grad_norm": 0.11084284633398056,
"learning_rate": 4.9812334762960764e-05,
"loss": 0.6824,
"step": 13210
},
{
"epoch": 6.728094321309806,
"grad_norm": 0.11253423988819122,
"learning_rate": 4.981219269995014e-05,
"loss": 0.6798,
"step": 13220
},
{
"epoch": 6.7331858899902945,
"grad_norm": 0.1311793029308319,
"learning_rate": 4.981205063693951e-05,
"loss": 0.6814,
"step": 13230
},
{
"epoch": 6.738277458670782,
"grad_norm": 0.12919209897518158,
"learning_rate": 4.981190857392888e-05,
"loss": 0.6768,
"step": 13240
},
{
"epoch": 6.743369027351271,
"grad_norm": 0.12355062365531921,
"learning_rate": 4.9811766510918256e-05,
"loss": 0.6799,
"step": 13250
},
{
"epoch": 6.748460596031759,
"grad_norm": 0.1338970810174942,
"learning_rate": 4.981162444790763e-05,
"loss": 0.6771,
"step": 13260
},
{
"epoch": 6.7535521647122465,
"grad_norm": 0.14117179811000824,
"learning_rate": 4.9811482384897e-05,
"loss": 0.6799,
"step": 13270
},
{
"epoch": 6.758643733392735,
"grad_norm": 0.1848529875278473,
"learning_rate": 4.9811340321886375e-05,
"loss": 0.6755,
"step": 13280
},
{
"epoch": 6.763735302073223,
"grad_norm": 0.1720336526632309,
"learning_rate": 4.981119825887575e-05,
"loss": 0.67,
"step": 13290
},
{
"epoch": 6.768826870753712,
"grad_norm": 0.1607787162065506,
"learning_rate": 4.981105619586512e-05,
"loss": 0.6827,
"step": 13300
},
{
"epoch": 6.773918439434199,
"grad_norm": 0.14998158812522888,
"learning_rate": 4.981091413285449e-05,
"loss": 0.6759,
"step": 13310
},
{
"epoch": 6.779010008114687,
"grad_norm": 0.11763730645179749,
"learning_rate": 4.981077206984386e-05,
"loss": 0.6747,
"step": 13320
},
{
"epoch": 6.784101576795176,
"grad_norm": 0.12859204411506653,
"learning_rate": 4.9810630006833234e-05,
"loss": 0.6785,
"step": 13330
},
{
"epoch": 6.7891931454756635,
"grad_norm": 0.12227821350097656,
"learning_rate": 4.98104879438226e-05,
"loss": 0.6794,
"step": 13340
},
{
"epoch": 6.794284714156152,
"grad_norm": 0.11308576911687851,
"learning_rate": 4.9810345880811974e-05,
"loss": 0.6777,
"step": 13350
},
{
"epoch": 6.79937628283664,
"grad_norm": 0.12252433598041534,
"learning_rate": 4.981020381780135e-05,
"loss": 0.6778,
"step": 13360
},
{
"epoch": 6.804467851517129,
"grad_norm": 0.11951456218957901,
"learning_rate": 4.981006175479072e-05,
"loss": 0.6778,
"step": 13370
},
{
"epoch": 6.809559420197616,
"grad_norm": 0.13758736848831177,
"learning_rate": 4.980991969178009e-05,
"loss": 0.6757,
"step": 13380
},
{
"epoch": 6.814650988878105,
"grad_norm": 0.15930655598640442,
"learning_rate": 4.9809777628769466e-05,
"loss": 0.675,
"step": 13390
},
{
"epoch": 6.819742557558593,
"grad_norm": 0.16790159046649933,
"learning_rate": 4.980963556575884e-05,
"loss": 0.6685,
"step": 13400
},
{
"epoch": 6.824834126239081,
"grad_norm": 0.1681044101715088,
"learning_rate": 4.980949350274821e-05,
"loss": 0.683,
"step": 13410
},
{
"epoch": 6.829925694919569,
"grad_norm": 0.1336173415184021,
"learning_rate": 4.9809351439737585e-05,
"loss": 0.6746,
"step": 13420
},
{
"epoch": 6.835017263600057,
"grad_norm": 0.11793011426925659,
"learning_rate": 4.980920937672696e-05,
"loss": 0.6789,
"step": 13430
},
{
"epoch": 6.840108832280546,
"grad_norm": 0.14056985080242157,
"learning_rate": 4.980906731371633e-05,
"loss": 0.6797,
"step": 13440
},
{
"epoch": 6.845200400961033,
"grad_norm": 0.11312086880207062,
"learning_rate": 4.9808925250705705e-05,
"loss": 0.6777,
"step": 13450
},
{
"epoch": 6.850291969641522,
"grad_norm": 0.14550986886024475,
"learning_rate": 4.980878318769507e-05,
"loss": 0.6792,
"step": 13460
},
{
"epoch": 6.85538353832201,
"grad_norm": 0.13276565074920654,
"learning_rate": 4.9808641124684444e-05,
"loss": 0.6797,
"step": 13470
},
{
"epoch": 6.8604751070024985,
"grad_norm": 0.1404767632484436,
"learning_rate": 4.980849906167382e-05,
"loss": 0.6767,
"step": 13480
},
{
"epoch": 6.865566675682986,
"grad_norm": 0.11344119906425476,
"learning_rate": 4.980835699866319e-05,
"loss": 0.6779,
"step": 13490
},
{
"epoch": 6.870658244363474,
"grad_norm": 0.18248707056045532,
"learning_rate": 4.9808214935652563e-05,
"loss": 0.6819,
"step": 13500
},
{
"epoch": 6.875749813043963,
"grad_norm": 0.13696008920669556,
"learning_rate": 4.9808072872641937e-05,
"loss": 0.6789,
"step": 13510
},
{
"epoch": 6.8808413817244505,
"grad_norm": 0.1089053824543953,
"learning_rate": 4.98079308096313e-05,
"loss": 0.6833,
"step": 13520
},
{
"epoch": 6.885932950404939,
"grad_norm": 0.13730046153068542,
"learning_rate": 4.9807788746620676e-05,
"loss": 0.685,
"step": 13530
},
{
"epoch": 6.891024519085427,
"grad_norm": 0.11708593368530273,
"learning_rate": 4.980764668361005e-05,
"loss": 0.6797,
"step": 13540
},
{
"epoch": 6.896116087765915,
"grad_norm": 0.14479976892471313,
"learning_rate": 4.980750462059942e-05,
"loss": 0.6779,
"step": 13550
},
{
"epoch": 6.901207656446403,
"grad_norm": 0.13402192294597626,
"learning_rate": 4.9807362557588795e-05,
"loss": 0.6775,
"step": 13560
},
{
"epoch": 6.906299225126891,
"grad_norm": 0.1378648430109024,
"learning_rate": 4.980722049457817e-05,
"loss": 0.6799,
"step": 13570
},
{
"epoch": 6.91139079380738,
"grad_norm": 0.1424325555562973,
"learning_rate": 4.980707843156754e-05,
"loss": 0.6777,
"step": 13580
},
{
"epoch": 6.9164823624878675,
"grad_norm": 0.12795968353748322,
"learning_rate": 4.9806936368556915e-05,
"loss": 0.6756,
"step": 13590
},
{
"epoch": 6.921573931168356,
"grad_norm": 0.16961532831192017,
"learning_rate": 4.980679430554628e-05,
"loss": 0.6762,
"step": 13600
},
{
"epoch": 6.926665499848844,
"grad_norm": 0.16084560751914978,
"learning_rate": 4.9806652242535654e-05,
"loss": 0.6783,
"step": 13610
},
{
"epoch": 6.931757068529333,
"grad_norm": 0.1510113775730133,
"learning_rate": 4.980651017952503e-05,
"loss": 0.676,
"step": 13620
},
{
"epoch": 6.93684863720982,
"grad_norm": 0.1436864286661148,
"learning_rate": 4.98063681165144e-05,
"loss": 0.6769,
"step": 13630
},
{
"epoch": 6.941940205890308,
"grad_norm": 0.14651361107826233,
"learning_rate": 4.980622605350377e-05,
"loss": 0.6786,
"step": 13640
},
{
"epoch": 6.947031774570797,
"grad_norm": 0.12080514430999756,
"learning_rate": 4.9806083990493146e-05,
"loss": 0.6719,
"step": 13650
},
{
"epoch": 6.952123343251285,
"grad_norm": 0.18036852777004242,
"learning_rate": 4.980594192748252e-05,
"loss": 0.6776,
"step": 13660
},
{
"epoch": 6.957214911931773,
"grad_norm": 0.15538708865642548,
"learning_rate": 4.980579986447189e-05,
"loss": 0.677,
"step": 13670
},
{
"epoch": 6.962306480612261,
"grad_norm": 0.14524763822555542,
"learning_rate": 4.9805657801461266e-05,
"loss": 0.6725,
"step": 13680
},
{
"epoch": 6.96739804929275,
"grad_norm": 0.13171471655368805,
"learning_rate": 4.980551573845064e-05,
"loss": 0.6814,
"step": 13690
},
{
"epoch": 6.972489617973237,
"grad_norm": 0.14730645716190338,
"learning_rate": 4.980537367544001e-05,
"loss": 0.6828,
"step": 13700
},
{
"epoch": 6.977581186653726,
"grad_norm": 0.1142466589808464,
"learning_rate": 4.980523161242938e-05,
"loss": 0.677,
"step": 13710
},
{
"epoch": 6.982672755334214,
"grad_norm": 0.11980883777141571,
"learning_rate": 4.980508954941875e-05,
"loss": 0.6847,
"step": 13720
},
{
"epoch": 6.987764324014702,
"grad_norm": 0.10882198065519333,
"learning_rate": 4.9804947486408125e-05,
"loss": 0.6749,
"step": 13730
},
{
"epoch": 6.99285589269519,
"grad_norm": 0.1418180912733078,
"learning_rate": 4.980480542339749e-05,
"loss": 0.675,
"step": 13740
}
],
"logging_steps": 10,
"max_steps": 13748,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}