{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.99692914763958, "eval_steps": 500, "global_step": 13748, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005091568680488154, "grad_norm": 0.7564847469329834, "learning_rate": 4.9999857936989376e-05, "loss": 0.6957, "step": 10 }, { "epoch": 0.010183137360976308, "grad_norm": 0.3457886576652527, "learning_rate": 4.999971587397875e-05, "loss": 0.6941, "step": 20 }, { "epoch": 0.015274706041464463, "grad_norm": 0.2739073932170868, "learning_rate": 4.999957381096812e-05, "loss": 0.6921, "step": 30 }, { "epoch": 0.020366274721952616, "grad_norm": 0.2826838195323944, "learning_rate": 4.9999431747957495e-05, "loss": 0.6932, "step": 40 }, { "epoch": 0.02545784340244077, "grad_norm": 0.1553473025560379, "learning_rate": 4.999928968494687e-05, "loss": 0.6933, "step": 50 }, { "epoch": 0.030549412082928926, "grad_norm": 0.17130891978740692, "learning_rate": 4.9999147621936234e-05, "loss": 0.6916, "step": 60 }, { "epoch": 0.03564098076341708, "grad_norm": 0.1938512623310089, "learning_rate": 4.999900555892561e-05, "loss": 0.6919, "step": 70 }, { "epoch": 0.04073254944390523, "grad_norm": 0.2490479201078415, "learning_rate": 4.999886349591498e-05, "loss": 0.6907, "step": 80 }, { "epoch": 0.04582411812439339, "grad_norm": 0.20010128617286682, "learning_rate": 4.9998721432904354e-05, "loss": 0.691, "step": 90 }, { "epoch": 0.05091568680488154, "grad_norm": 0.16262651979923248, "learning_rate": 4.999857936989373e-05, "loss": 0.6917, "step": 100 }, { "epoch": 0.056007255485369695, "grad_norm": 0.21441149711608887, "learning_rate": 4.999843730688309e-05, "loss": 0.6872, "step": 110 }, { "epoch": 0.06109882416585785, "grad_norm": 0.20518313348293304, "learning_rate": 4.9998295243872466e-05, "loss": 0.6899, "step": 120 }, { "epoch": 0.06619039284634601, "grad_norm": 0.2965710759162903, "learning_rate": 4.999815318086184e-05, "loss": 0.6869, "step": 130 }, { "epoch": 0.07128196152683416, "grad_norm": 0.269436776638031, "learning_rate": 4.999801111785121e-05, "loss": 0.6893, "step": 140 }, { "epoch": 0.07637353020732231, "grad_norm": 0.2355806529521942, "learning_rate": 4.9997869054840585e-05, "loss": 0.688, "step": 150 }, { "epoch": 0.08146509888781046, "grad_norm": 0.2859913408756256, "learning_rate": 4.999772699182996e-05, "loss": 0.6882, "step": 160 }, { "epoch": 0.08655666756829862, "grad_norm": 0.22986085712909698, "learning_rate": 4.999758492881933e-05, "loss": 0.6892, "step": 170 }, { "epoch": 0.09164823624878678, "grad_norm": 0.1777602881193161, "learning_rate": 4.9997442865808705e-05, "loss": 0.6891, "step": 180 }, { "epoch": 0.09673980492927493, "grad_norm": 0.24697603285312653, "learning_rate": 4.999730080279808e-05, "loss": 0.6901, "step": 190 }, { "epoch": 0.10183137360976308, "grad_norm": 0.24384862184524536, "learning_rate": 4.9997158739787444e-05, "loss": 0.6864, "step": 200 }, { "epoch": 0.10692294229025123, "grad_norm": 0.22532877326011658, "learning_rate": 4.999701667677682e-05, "loss": 0.6925, "step": 210 }, { "epoch": 0.11201451097073939, "grad_norm": 0.33894965052604675, "learning_rate": 4.999687461376619e-05, "loss": 0.69, "step": 220 }, { "epoch": 0.11710607965122755, "grad_norm": 0.1438864767551422, "learning_rate": 4.9996732550755563e-05, "loss": 0.6905, "step": 230 }, { "epoch": 0.1221976483317157, "grad_norm": 0.1356513947248459, "learning_rate": 4.9996590487744937e-05, "loss": 0.6895, "step": 240 }, { "epoch": 0.12728921701220386, "grad_norm": 0.11220791190862656, "learning_rate": 4.999644842473431e-05, "loss": 0.6889, "step": 250 }, { "epoch": 0.13238078569269202, "grad_norm": 0.18669378757476807, "learning_rate": 4.999630636172368e-05, "loss": 0.6877, "step": 260 }, { "epoch": 0.13747235437318017, "grad_norm": 0.24448029696941376, "learning_rate": 4.9996164298713056e-05, "loss": 0.6821, "step": 270 }, { "epoch": 0.14256392305366833, "grad_norm": 0.33900442719459534, "learning_rate": 4.999602223570243e-05, "loss": 0.6825, "step": 280 }, { "epoch": 0.14765549173415646, "grad_norm": 0.20542432367801666, "learning_rate": 4.99958801726918e-05, "loss": 0.6835, "step": 290 }, { "epoch": 0.15274706041464461, "grad_norm": 0.17819932103157043, "learning_rate": 4.9995738109681175e-05, "loss": 0.6875, "step": 300 }, { "epoch": 0.15783862909513277, "grad_norm": 0.16522936522960663, "learning_rate": 4.999559604667055e-05, "loss": 0.6868, "step": 310 }, { "epoch": 0.16293019777562093, "grad_norm": 0.1356010138988495, "learning_rate": 4.9995453983659915e-05, "loss": 0.6888, "step": 320 }, { "epoch": 0.16802176645610908, "grad_norm": 0.20955336093902588, "learning_rate": 4.999531192064929e-05, "loss": 0.6827, "step": 330 }, { "epoch": 0.17311333513659724, "grad_norm": 0.14871099591255188, "learning_rate": 4.9995169857638654e-05, "loss": 0.687, "step": 340 }, { "epoch": 0.1782049038170854, "grad_norm": 0.1207965835928917, "learning_rate": 4.999502779462803e-05, "loss": 0.6871, "step": 350 }, { "epoch": 0.18329647249757355, "grad_norm": 0.25459375977516174, "learning_rate": 4.99948857316174e-05, "loss": 0.6879, "step": 360 }, { "epoch": 0.1883880411780617, "grad_norm": 0.12454749643802643, "learning_rate": 4.9994743668606773e-05, "loss": 0.689, "step": 370 }, { "epoch": 0.19347960985854987, "grad_norm": 0.13212984800338745, "learning_rate": 4.9994601605596147e-05, "loss": 0.6827, "step": 380 }, { "epoch": 0.19857117853903802, "grad_norm": 0.15493592619895935, "learning_rate": 4.999445954258552e-05, "loss": 0.6916, "step": 390 }, { "epoch": 0.20366274721952615, "grad_norm": 0.1339859962463379, "learning_rate": 4.999431747957489e-05, "loss": 0.6862, "step": 400 }, { "epoch": 0.2087543159000143, "grad_norm": 0.17246641218662262, "learning_rate": 4.9994175416564266e-05, "loss": 0.6915, "step": 410 }, { "epoch": 0.21384588458050247, "grad_norm": 0.09907015413045883, "learning_rate": 4.999403335355364e-05, "loss": 0.6884, "step": 420 }, { "epoch": 0.21893745326099062, "grad_norm": 0.13688722252845764, "learning_rate": 4.999389129054301e-05, "loss": 0.6894, "step": 430 }, { "epoch": 0.22402902194147878, "grad_norm": 0.1572660207748413, "learning_rate": 4.9993749227532385e-05, "loss": 0.6857, "step": 440 }, { "epoch": 0.22912059062196694, "grad_norm": 0.22315748035907745, "learning_rate": 4.999360716452176e-05, "loss": 0.6819, "step": 450 }, { "epoch": 0.2342121593024551, "grad_norm": 0.10592522472143173, "learning_rate": 4.9993465101511125e-05, "loss": 0.6886, "step": 460 }, { "epoch": 0.23930372798294325, "grad_norm": 0.10022767633199692, "learning_rate": 4.99933230385005e-05, "loss": 0.687, "step": 470 }, { "epoch": 0.2443952966634314, "grad_norm": 0.12280535697937012, "learning_rate": 4.999318097548987e-05, "loss": 0.6862, "step": 480 }, { "epoch": 0.24948686534391956, "grad_norm": 0.11044813692569733, "learning_rate": 4.9993038912479244e-05, "loss": 0.6889, "step": 490 }, { "epoch": 0.2545784340244077, "grad_norm": 0.13826850056648254, "learning_rate": 4.999289684946862e-05, "loss": 0.6875, "step": 500 }, { "epoch": 0.2596700027048959, "grad_norm": 0.10267098248004913, "learning_rate": 4.999275478645799e-05, "loss": 0.6877, "step": 510 }, { "epoch": 0.26476157138538403, "grad_norm": 0.08445768803358078, "learning_rate": 4.999261272344736e-05, "loss": 0.6879, "step": 520 }, { "epoch": 0.2698531400658722, "grad_norm": 0.1590685397386551, "learning_rate": 4.999247066043673e-05, "loss": 0.6872, "step": 530 }, { "epoch": 0.27494470874636034, "grad_norm": 0.1537754386663437, "learning_rate": 4.99923285974261e-05, "loss": 0.6812, "step": 540 }, { "epoch": 0.2800362774268485, "grad_norm": 0.12273769080638885, "learning_rate": 4.9992186534415476e-05, "loss": 0.6855, "step": 550 }, { "epoch": 0.28512784610733666, "grad_norm": 0.19177380204200745, "learning_rate": 4.999204447140485e-05, "loss": 0.6857, "step": 560 }, { "epoch": 0.29021941478782476, "grad_norm": 0.1194639578461647, "learning_rate": 4.999190240839422e-05, "loss": 0.6851, "step": 570 }, { "epoch": 0.2953109834683129, "grad_norm": 0.12458167970180511, "learning_rate": 4.9991760345383595e-05, "loss": 0.6875, "step": 580 }, { "epoch": 0.30040255214880107, "grad_norm": 0.15139921009540558, "learning_rate": 4.999161828237297e-05, "loss": 0.6843, "step": 590 }, { "epoch": 0.30549412082928923, "grad_norm": 0.13201646506786346, "learning_rate": 4.9991476219362335e-05, "loss": 0.688, "step": 600 }, { "epoch": 0.3105856895097774, "grad_norm": 0.10855768620967865, "learning_rate": 4.999133415635171e-05, "loss": 0.6859, "step": 610 }, { "epoch": 0.31567725819026554, "grad_norm": 0.14113789796829224, "learning_rate": 4.999119209334108e-05, "loss": 0.6858, "step": 620 }, { "epoch": 0.3207688268707537, "grad_norm": 0.19037926197052002, "learning_rate": 4.9991050030330454e-05, "loss": 0.6851, "step": 630 }, { "epoch": 0.32586039555124185, "grad_norm": 0.18522581458091736, "learning_rate": 4.999090796731983e-05, "loss": 0.6854, "step": 640 }, { "epoch": 0.33095196423173, "grad_norm": 0.24384431540966034, "learning_rate": 4.99907659043092e-05, "loss": 0.6852, "step": 650 }, { "epoch": 0.33604353291221817, "grad_norm": 0.21666169166564941, "learning_rate": 4.999062384129857e-05, "loss": 0.6836, "step": 660 }, { "epoch": 0.3411351015927063, "grad_norm": 0.1427813023328781, "learning_rate": 4.9990481778287946e-05, "loss": 0.6899, "step": 670 }, { "epoch": 0.3462266702731945, "grad_norm": 0.09287853538990021, "learning_rate": 4.999033971527732e-05, "loss": 0.6861, "step": 680 }, { "epoch": 0.35131823895368264, "grad_norm": 0.1490527093410492, "learning_rate": 4.999019765226669e-05, "loss": 0.6859, "step": 690 }, { "epoch": 0.3564098076341708, "grad_norm": 0.0858352780342102, "learning_rate": 4.9990055589256066e-05, "loss": 0.6888, "step": 700 }, { "epoch": 0.36150137631465895, "grad_norm": 0.15133963525295258, "learning_rate": 4.998991352624544e-05, "loss": 0.6837, "step": 710 }, { "epoch": 0.3665929449951471, "grad_norm": 0.14562425017356873, "learning_rate": 4.9989771463234805e-05, "loss": 0.6862, "step": 720 }, { "epoch": 0.37168451367563526, "grad_norm": 0.15240037441253662, "learning_rate": 4.998962940022418e-05, "loss": 0.6839, "step": 730 }, { "epoch": 0.3767760823561234, "grad_norm": 0.1231294646859169, "learning_rate": 4.9989487337213544e-05, "loss": 0.6868, "step": 740 }, { "epoch": 0.3818676510366116, "grad_norm": 0.14511612057685852, "learning_rate": 4.998934527420292e-05, "loss": 0.6816, "step": 750 }, { "epoch": 0.38695921971709973, "grad_norm": 0.1693543940782547, "learning_rate": 4.998920321119229e-05, "loss": 0.6843, "step": 760 }, { "epoch": 0.3920507883975879, "grad_norm": 0.09580985456705093, "learning_rate": 4.9989061148181664e-05, "loss": 0.6875, "step": 770 }, { "epoch": 0.39714235707807605, "grad_norm": 0.1047905758023262, "learning_rate": 4.998891908517104e-05, "loss": 0.6859, "step": 780 }, { "epoch": 0.4022339257585642, "grad_norm": 0.1090409904718399, "learning_rate": 4.998877702216041e-05, "loss": 0.6852, "step": 790 }, { "epoch": 0.4073254944390523, "grad_norm": 0.12578189373016357, "learning_rate": 4.998863495914978e-05, "loss": 0.6867, "step": 800 }, { "epoch": 0.41241706311954046, "grad_norm": 0.11900747567415237, "learning_rate": 4.9988492896139156e-05, "loss": 0.6814, "step": 810 }, { "epoch": 0.4175086318000286, "grad_norm": 0.11401454359292984, "learning_rate": 4.998835083312853e-05, "loss": 0.6899, "step": 820 }, { "epoch": 0.4226002004805168, "grad_norm": 0.1015952005982399, "learning_rate": 4.99882087701179e-05, "loss": 0.6865, "step": 830 }, { "epoch": 0.42769176916100493, "grad_norm": 0.16676318645477295, "learning_rate": 4.9988066707107276e-05, "loss": 0.6891, "step": 840 }, { "epoch": 0.4327833378414931, "grad_norm": 0.10982430726289749, "learning_rate": 4.998792464409665e-05, "loss": 0.6863, "step": 850 }, { "epoch": 0.43787490652198124, "grad_norm": 0.13861846923828125, "learning_rate": 4.9987782581086015e-05, "loss": 0.6898, "step": 860 }, { "epoch": 0.4429664752024694, "grad_norm": 0.09421814233064651, "learning_rate": 4.998764051807539e-05, "loss": 0.6861, "step": 870 }, { "epoch": 0.44805804388295756, "grad_norm": 0.14085189998149872, "learning_rate": 4.998749845506476e-05, "loss": 0.683, "step": 880 }, { "epoch": 0.4531496125634457, "grad_norm": 0.1671237349510193, "learning_rate": 4.9987356392054134e-05, "loss": 0.6835, "step": 890 }, { "epoch": 0.45824118124393387, "grad_norm": 0.13570742309093475, "learning_rate": 4.998721432904351e-05, "loss": 0.685, "step": 900 }, { "epoch": 0.463332749924422, "grad_norm": 0.10402018576860428, "learning_rate": 4.998707226603288e-05, "loss": 0.6872, "step": 910 }, { "epoch": 0.4684243186049102, "grad_norm": 0.10226580500602722, "learning_rate": 4.9986930203022254e-05, "loss": 0.6871, "step": 920 }, { "epoch": 0.47351588728539834, "grad_norm": 0.10132193565368652, "learning_rate": 4.998678814001163e-05, "loss": 0.6902, "step": 930 }, { "epoch": 0.4786074559658865, "grad_norm": 0.11389295756816864, "learning_rate": 4.9986646077001e-05, "loss": 0.6818, "step": 940 }, { "epoch": 0.48369902464637465, "grad_norm": 0.13445314764976501, "learning_rate": 4.9986504013990366e-05, "loss": 0.6882, "step": 950 }, { "epoch": 0.4887905933268628, "grad_norm": 0.08756639063358307, "learning_rate": 4.998636195097974e-05, "loss": 0.6893, "step": 960 }, { "epoch": 0.49388216200735097, "grad_norm": 0.08042973279953003, "learning_rate": 4.998621988796911e-05, "loss": 0.6891, "step": 970 }, { "epoch": 0.4989737306878391, "grad_norm": 0.10082978755235672, "learning_rate": 4.9986077824958485e-05, "loss": 0.6903, "step": 980 }, { "epoch": 0.5040652993683272, "grad_norm": 0.0936272069811821, "learning_rate": 4.998593576194786e-05, "loss": 0.6864, "step": 990 }, { "epoch": 0.5091568680488154, "grad_norm": 0.12317179143428802, "learning_rate": 4.9985793698937225e-05, "loss": 0.6856, "step": 1000 }, { "epoch": 0.5142484367293035, "grad_norm": 0.12630991637706757, "learning_rate": 4.99856516359266e-05, "loss": 0.6843, "step": 1010 }, { "epoch": 0.5193400054097917, "grad_norm": 0.13727591931819916, "learning_rate": 4.998550957291597e-05, "loss": 0.6844, "step": 1020 }, { "epoch": 0.5244315740902799, "grad_norm": 0.20466050505638123, "learning_rate": 4.9985367509905344e-05, "loss": 0.6826, "step": 1030 }, { "epoch": 0.5295231427707681, "grad_norm": 0.18256479501724243, "learning_rate": 4.998522544689472e-05, "loss": 0.6779, "step": 1040 }, { "epoch": 0.5346147114512562, "grad_norm": 0.20778831839561462, "learning_rate": 4.998508338388409e-05, "loss": 0.6853, "step": 1050 }, { "epoch": 0.5397062801317444, "grad_norm": 0.08918287605047226, "learning_rate": 4.9984941320873464e-05, "loss": 0.6867, "step": 1060 }, { "epoch": 0.5447978488122325, "grad_norm": 0.08476213365793228, "learning_rate": 4.9984799257862837e-05, "loss": 0.6843, "step": 1070 }, { "epoch": 0.5498894174927207, "grad_norm": 0.11851644515991211, "learning_rate": 4.998465719485221e-05, "loss": 0.6831, "step": 1080 }, { "epoch": 0.5549809861732088, "grad_norm": 0.18159732222557068, "learning_rate": 4.998451513184158e-05, "loss": 0.6827, "step": 1090 }, { "epoch": 0.560072554853697, "grad_norm": 0.1184081956744194, "learning_rate": 4.9984373068830956e-05, "loss": 0.6835, "step": 1100 }, { "epoch": 0.5651641235341851, "grad_norm": 0.13530392944812775, "learning_rate": 4.998423100582032e-05, "loss": 0.6832, "step": 1110 }, { "epoch": 0.5702556922146733, "grad_norm": 0.17794091999530792, "learning_rate": 4.9984088942809695e-05, "loss": 0.6844, "step": 1120 }, { "epoch": 0.5753472608951614, "grad_norm": 0.18658983707427979, "learning_rate": 4.998394687979907e-05, "loss": 0.682, "step": 1130 }, { "epoch": 0.5804388295756495, "grad_norm": 0.11487103253602982, "learning_rate": 4.998380481678844e-05, "loss": 0.687, "step": 1140 }, { "epoch": 0.5855303982561377, "grad_norm": 0.09985563158988953, "learning_rate": 4.9983662753777815e-05, "loss": 0.6855, "step": 1150 }, { "epoch": 0.5906219669366258, "grad_norm": 0.1510723978281021, "learning_rate": 4.998352069076718e-05, "loss": 0.684, "step": 1160 }, { "epoch": 0.595713535617114, "grad_norm": 0.15650640428066254, "learning_rate": 4.9983378627756554e-05, "loss": 0.6807, "step": 1170 }, { "epoch": 0.6008051042976021, "grad_norm": 0.3100273311138153, "learning_rate": 4.998323656474593e-05, "loss": 0.6837, "step": 1180 }, { "epoch": 0.6058966729780904, "grad_norm": 0.09822337329387665, "learning_rate": 4.99830945017353e-05, "loss": 0.6851, "step": 1190 }, { "epoch": 0.6109882416585785, "grad_norm": 0.16111738979816437, "learning_rate": 4.9982952438724673e-05, "loss": 0.6827, "step": 1200 }, { "epoch": 0.6160798103390667, "grad_norm": 0.1878943145275116, "learning_rate": 4.9982810375714047e-05, "loss": 0.6871, "step": 1210 }, { "epoch": 0.6211713790195548, "grad_norm": 0.1281467080116272, "learning_rate": 4.998266831270342e-05, "loss": 0.6866, "step": 1220 }, { "epoch": 0.626262947700043, "grad_norm": 0.1051391065120697, "learning_rate": 4.998252624969279e-05, "loss": 0.6869, "step": 1230 }, { "epoch": 0.6313545163805311, "grad_norm": 0.138059601187706, "learning_rate": 4.9982384186682166e-05, "loss": 0.6825, "step": 1240 }, { "epoch": 0.6364460850610193, "grad_norm": 0.10719313472509384, "learning_rate": 4.998224212367153e-05, "loss": 0.6837, "step": 1250 }, { "epoch": 0.6415376537415074, "grad_norm": 0.09252595156431198, "learning_rate": 4.9982100060660905e-05, "loss": 0.689, "step": 1260 }, { "epoch": 0.6466292224219956, "grad_norm": 0.12894387543201447, "learning_rate": 4.998195799765028e-05, "loss": 0.6833, "step": 1270 }, { "epoch": 0.6517207911024837, "grad_norm": 0.10794473439455032, "learning_rate": 4.998181593463965e-05, "loss": 0.6866, "step": 1280 }, { "epoch": 0.6568123597829719, "grad_norm": 0.11546550691127777, "learning_rate": 4.9981673871629025e-05, "loss": 0.6861, "step": 1290 }, { "epoch": 0.66190392846346, "grad_norm": 0.10733726620674133, "learning_rate": 4.99815318086184e-05, "loss": 0.683, "step": 1300 }, { "epoch": 0.6669954971439482, "grad_norm": 0.17388881742954254, "learning_rate": 4.998138974560777e-05, "loss": 0.686, "step": 1310 }, { "epoch": 0.6720870658244363, "grad_norm": 0.15069304406642914, "learning_rate": 4.9981247682597144e-05, "loss": 0.6828, "step": 1320 }, { "epoch": 0.6771786345049245, "grad_norm": 0.14276649057865143, "learning_rate": 4.998110561958652e-05, "loss": 0.6814, "step": 1330 }, { "epoch": 0.6822702031854126, "grad_norm": 0.12937600910663605, "learning_rate": 4.998096355657589e-05, "loss": 0.6868, "step": 1340 }, { "epoch": 0.6873617718659009, "grad_norm": 0.1466054916381836, "learning_rate": 4.998082149356526e-05, "loss": 0.6848, "step": 1350 }, { "epoch": 0.692453340546389, "grad_norm": 0.14180545508861542, "learning_rate": 4.9980679430554636e-05, "loss": 0.6847, "step": 1360 }, { "epoch": 0.6975449092268771, "grad_norm": 0.11979173868894577, "learning_rate": 4.9980537367544e-05, "loss": 0.6809, "step": 1370 }, { "epoch": 0.7026364779073653, "grad_norm": 0.15614405274391174, "learning_rate": 4.9980395304533376e-05, "loss": 0.6802, "step": 1380 }, { "epoch": 0.7077280465878534, "grad_norm": 0.16178403794765472, "learning_rate": 4.998025324152274e-05, "loss": 0.6766, "step": 1390 }, { "epoch": 0.7128196152683416, "grad_norm": 0.11734528839588165, "learning_rate": 4.9980111178512115e-05, "loss": 0.6853, "step": 1400 }, { "epoch": 0.7179111839488297, "grad_norm": 0.09437315165996552, "learning_rate": 4.997996911550149e-05, "loss": 0.6859, "step": 1410 }, { "epoch": 0.7230027526293179, "grad_norm": 0.08119911700487137, "learning_rate": 4.997982705249086e-05, "loss": 0.6902, "step": 1420 }, { "epoch": 0.728094321309806, "grad_norm": 0.14570364356040955, "learning_rate": 4.9979684989480235e-05, "loss": 0.6841, "step": 1430 }, { "epoch": 0.7331858899902942, "grad_norm": 0.12333963066339493, "learning_rate": 4.997954292646961e-05, "loss": 0.6819, "step": 1440 }, { "epoch": 0.7382774586707823, "grad_norm": 0.11946499347686768, "learning_rate": 4.997940086345898e-05, "loss": 0.6847, "step": 1450 }, { "epoch": 0.7433690273512705, "grad_norm": 0.12417126446962357, "learning_rate": 4.9979258800448354e-05, "loss": 0.6826, "step": 1460 }, { "epoch": 0.7484605960317586, "grad_norm": 0.11672031134366989, "learning_rate": 4.997911673743773e-05, "loss": 0.6831, "step": 1470 }, { "epoch": 0.7535521647122468, "grad_norm": 0.1273321509361267, "learning_rate": 4.99789746744271e-05, "loss": 0.6828, "step": 1480 }, { "epoch": 0.7586437333927349, "grad_norm": 0.1056080237030983, "learning_rate": 4.997883261141647e-05, "loss": 0.6868, "step": 1490 }, { "epoch": 0.7637353020732232, "grad_norm": 0.12784817814826965, "learning_rate": 4.9978690548405846e-05, "loss": 0.6819, "step": 1500 }, { "epoch": 0.7688268707537113, "grad_norm": 0.16047458350658417, "learning_rate": 4.997854848539521e-05, "loss": 0.6825, "step": 1510 }, { "epoch": 0.7739184394341995, "grad_norm": 0.11385879665613174, "learning_rate": 4.9978406422384586e-05, "loss": 0.686, "step": 1520 }, { "epoch": 0.7790100081146876, "grad_norm": 0.13264243304729462, "learning_rate": 4.997826435937396e-05, "loss": 0.6799, "step": 1530 }, { "epoch": 0.7841015767951758, "grad_norm": 0.2524195611476898, "learning_rate": 4.997812229636333e-05, "loss": 0.6771, "step": 1540 }, { "epoch": 0.7891931454756639, "grad_norm": 0.14071324467658997, "learning_rate": 4.9977980233352705e-05, "loss": 0.6833, "step": 1550 }, { "epoch": 0.7942847141561521, "grad_norm": 0.12755858898162842, "learning_rate": 4.997783817034208e-05, "loss": 0.6831, "step": 1560 }, { "epoch": 0.7993762828366402, "grad_norm": 0.17357097566127777, "learning_rate": 4.997769610733145e-05, "loss": 0.6829, "step": 1570 }, { "epoch": 0.8044678515171284, "grad_norm": 0.13588126003742218, "learning_rate": 4.997755404432082e-05, "loss": 0.6896, "step": 1580 }, { "epoch": 0.8095594201976165, "grad_norm": 0.0981392115354538, "learning_rate": 4.997741198131019e-05, "loss": 0.6859, "step": 1590 }, { "epoch": 0.8146509888781046, "grad_norm": 0.13461001217365265, "learning_rate": 4.9977269918299564e-05, "loss": 0.6811, "step": 1600 }, { "epoch": 0.8197425575585928, "grad_norm": 0.25011003017425537, "learning_rate": 4.997712785528894e-05, "loss": 0.6801, "step": 1610 }, { "epoch": 0.8248341262390809, "grad_norm": 0.19415059685707092, "learning_rate": 4.997698579227831e-05, "loss": 0.68, "step": 1620 }, { "epoch": 0.8299256949195691, "grad_norm": 0.1742919236421585, "learning_rate": 4.997684372926768e-05, "loss": 0.684, "step": 1630 }, { "epoch": 0.8350172636000572, "grad_norm": 0.12717512249946594, "learning_rate": 4.9976701666257056e-05, "loss": 0.6864, "step": 1640 }, { "epoch": 0.8401088322805454, "grad_norm": 0.09787630289793015, "learning_rate": 4.997655960324642e-05, "loss": 0.6865, "step": 1650 }, { "epoch": 0.8452004009610335, "grad_norm": 0.11440135538578033, "learning_rate": 4.9976417540235796e-05, "loss": 0.6846, "step": 1660 }, { "epoch": 0.8502919696415218, "grad_norm": 0.13598810136318207, "learning_rate": 4.997627547722517e-05, "loss": 0.6861, "step": 1670 }, { "epoch": 0.8553835383220099, "grad_norm": 0.1623242348432541, "learning_rate": 4.997613341421454e-05, "loss": 0.6797, "step": 1680 }, { "epoch": 0.8604751070024981, "grad_norm": 0.12565261125564575, "learning_rate": 4.9975991351203915e-05, "loss": 0.6847, "step": 1690 }, { "epoch": 0.8655666756829862, "grad_norm": 0.11019585281610489, "learning_rate": 4.997584928819329e-05, "loss": 0.6802, "step": 1700 }, { "epoch": 0.8706582443634744, "grad_norm": 0.10026270151138306, "learning_rate": 4.997570722518266e-05, "loss": 0.6863, "step": 1710 }, { "epoch": 0.8757498130439625, "grad_norm": 0.10043281316757202, "learning_rate": 4.9975565162172034e-05, "loss": 0.6843, "step": 1720 }, { "epoch": 0.8808413817244507, "grad_norm": 0.0944572165608406, "learning_rate": 4.997542309916141e-05, "loss": 0.684, "step": 1730 }, { "epoch": 0.8859329504049388, "grad_norm": 0.12859639525413513, "learning_rate": 4.997528103615078e-05, "loss": 0.6856, "step": 1740 }, { "epoch": 0.891024519085427, "grad_norm": 0.11585383117198944, "learning_rate": 4.9975138973140154e-05, "loss": 0.6807, "step": 1750 }, { "epoch": 0.8961160877659151, "grad_norm": 0.13746441900730133, "learning_rate": 4.997499691012953e-05, "loss": 0.6846, "step": 1760 }, { "epoch": 0.9012076564464033, "grad_norm": 0.09316791594028473, "learning_rate": 4.997485484711889e-05, "loss": 0.6848, "step": 1770 }, { "epoch": 0.9062992251268914, "grad_norm": 0.07422750443220139, "learning_rate": 4.9974712784108266e-05, "loss": 0.688, "step": 1780 }, { "epoch": 0.9113907938073796, "grad_norm": 0.08577447384595871, "learning_rate": 4.997457072109763e-05, "loss": 0.6856, "step": 1790 }, { "epoch": 0.9164823624878677, "grad_norm": 0.09143663942813873, "learning_rate": 4.9974428658087006e-05, "loss": 0.6848, "step": 1800 }, { "epoch": 0.921573931168356, "grad_norm": 0.10064688324928284, "learning_rate": 4.997428659507638e-05, "loss": 0.6889, "step": 1810 }, { "epoch": 0.926665499848844, "grad_norm": 0.0921172946691513, "learning_rate": 4.997414453206575e-05, "loss": 0.6824, "step": 1820 }, { "epoch": 0.9317570685293322, "grad_norm": 0.12253455817699432, "learning_rate": 4.9974002469055125e-05, "loss": 0.686, "step": 1830 }, { "epoch": 0.9368486372098204, "grad_norm": 0.16046911478042603, "learning_rate": 4.99738604060445e-05, "loss": 0.6833, "step": 1840 }, { "epoch": 0.9419402058903085, "grad_norm": 0.1947670727968216, "learning_rate": 4.997371834303387e-05, "loss": 0.676, "step": 1850 }, { "epoch": 0.9470317745707967, "grad_norm": 0.17206092178821564, "learning_rate": 4.9973576280023244e-05, "loss": 0.6832, "step": 1860 }, { "epoch": 0.9521233432512848, "grad_norm": 0.16195142269134521, "learning_rate": 4.997343421701262e-05, "loss": 0.681, "step": 1870 }, { "epoch": 0.957214911931773, "grad_norm": 0.1436363011598587, "learning_rate": 4.997329215400199e-05, "loss": 0.6847, "step": 1880 }, { "epoch": 0.9623064806122611, "grad_norm": 0.11514883488416672, "learning_rate": 4.9973150090991364e-05, "loss": 0.6823, "step": 1890 }, { "epoch": 0.9673980492927493, "grad_norm": 0.11169356852769852, "learning_rate": 4.997300802798074e-05, "loss": 0.6859, "step": 1900 }, { "epoch": 0.9724896179732374, "grad_norm": 0.09914061427116394, "learning_rate": 4.99728659649701e-05, "loss": 0.6848, "step": 1910 }, { "epoch": 0.9775811866537256, "grad_norm": 0.10717228055000305, "learning_rate": 4.9972723901959476e-05, "loss": 0.6833, "step": 1920 }, { "epoch": 0.9826727553342137, "grad_norm": 0.11209560185670853, "learning_rate": 4.997258183894885e-05, "loss": 0.6861, "step": 1930 }, { "epoch": 0.9877643240147019, "grad_norm": 0.1293047070503235, "learning_rate": 4.997243977593822e-05, "loss": 0.6833, "step": 1940 }, { "epoch": 0.99285589269519, "grad_norm": 0.13042615354061127, "learning_rate": 4.9972297712927595e-05, "loss": 0.6837, "step": 1950 }, { "epoch": 0.9979474613756782, "grad_norm": 0.1377701610326767, "learning_rate": 4.997215564991697e-05, "loss": 0.6858, "step": 1960 }, { "epoch": 1.002545784340244, "grad_norm": 0.13352081179618835, "learning_rate": 4.997201358690634e-05, "loss": 0.6173, "step": 1970 }, { "epoch": 1.0076373530207323, "grad_norm": 0.12459533661603928, "learning_rate": 4.9971871523895715e-05, "loss": 0.6836, "step": 1980 }, { "epoch": 1.0127289217012203, "grad_norm": 0.10235695540904999, "learning_rate": 4.997172946088509e-05, "loss": 0.6867, "step": 1990 }, { "epoch": 1.0178204903817085, "grad_norm": 0.0960664227604866, "learning_rate": 4.9971587397874454e-05, "loss": 0.6847, "step": 2000 }, { "epoch": 1.0229120590621967, "grad_norm": 0.12099937349557877, "learning_rate": 4.997144533486383e-05, "loss": 0.6828, "step": 2010 }, { "epoch": 1.028003627742685, "grad_norm": 0.10949967801570892, "learning_rate": 4.99713032718532e-05, "loss": 0.6828, "step": 2020 }, { "epoch": 1.033095196423173, "grad_norm": 0.09417010843753815, "learning_rate": 4.9971161208842573e-05, "loss": 0.6866, "step": 2030 }, { "epoch": 1.0381867651036611, "grad_norm": 0.08539358526468277, "learning_rate": 4.9971019145831947e-05, "loss": 0.6869, "step": 2040 }, { "epoch": 1.0432783337841494, "grad_norm": 0.10147163271903992, "learning_rate": 4.997087708282131e-05, "loss": 0.68, "step": 2050 }, { "epoch": 1.0483699024646376, "grad_norm": 0.15451493859291077, "learning_rate": 4.9970735019810686e-05, "loss": 0.6826, "step": 2060 }, { "epoch": 1.0534614711451256, "grad_norm": 0.09405049681663513, "learning_rate": 4.997059295680006e-05, "loss": 0.6868, "step": 2070 }, { "epoch": 1.0585530398256138, "grad_norm": 0.12649065256118774, "learning_rate": 4.997045089378943e-05, "loss": 0.6845, "step": 2080 }, { "epoch": 1.063644608506102, "grad_norm": 0.12368927896022797, "learning_rate": 4.9970308830778805e-05, "loss": 0.6804, "step": 2090 }, { "epoch": 1.0687361771865902, "grad_norm": 0.15372063219547272, "learning_rate": 4.997016676776818e-05, "loss": 0.6841, "step": 2100 }, { "epoch": 1.0738277458670782, "grad_norm": 0.14659467339515686, "learning_rate": 4.997002470475755e-05, "loss": 0.6808, "step": 2110 }, { "epoch": 1.0789193145475664, "grad_norm": 0.15990343689918518, "learning_rate": 4.9969882641746925e-05, "loss": 0.6824, "step": 2120 }, { "epoch": 1.0840108832280546, "grad_norm": 0.1342097967863083, "learning_rate": 4.99697405787363e-05, "loss": 0.6819, "step": 2130 }, { "epoch": 1.0891024519085426, "grad_norm": 0.1691288948059082, "learning_rate": 4.996959851572567e-05, "loss": 0.6803, "step": 2140 }, { "epoch": 1.0941940205890308, "grad_norm": 0.12277499586343765, "learning_rate": 4.9969456452715044e-05, "loss": 0.683, "step": 2150 }, { "epoch": 1.099285589269519, "grad_norm": 0.19435428082942963, "learning_rate": 4.996931438970441e-05, "loss": 0.6824, "step": 2160 }, { "epoch": 1.1043771579500072, "grad_norm": 0.0905819982290268, "learning_rate": 4.996917232669378e-05, "loss": 0.6894, "step": 2170 }, { "epoch": 1.1094687266304952, "grad_norm": 0.07771383225917816, "learning_rate": 4.9969030263683156e-05, "loss": 0.6878, "step": 2180 }, { "epoch": 1.1145602953109834, "grad_norm": 0.10115786641836166, "learning_rate": 4.996888820067253e-05, "loss": 0.6818, "step": 2190 }, { "epoch": 1.1196518639914717, "grad_norm": 0.10046926885843277, "learning_rate": 4.99687461376619e-05, "loss": 0.6872, "step": 2200 }, { "epoch": 1.1247434326719599, "grad_norm": 0.1584903746843338, "learning_rate": 4.996860407465127e-05, "loss": 0.6772, "step": 2210 }, { "epoch": 1.1298350013524479, "grad_norm": 0.19328419864177704, "learning_rate": 4.996846201164064e-05, "loss": 0.6814, "step": 2220 }, { "epoch": 1.134926570032936, "grad_norm": 0.12247782945632935, "learning_rate": 4.9968319948630015e-05, "loss": 0.6817, "step": 2230 }, { "epoch": 1.1400181387134243, "grad_norm": 0.13911662995815277, "learning_rate": 4.996817788561939e-05, "loss": 0.678, "step": 2240 }, { "epoch": 1.1451097073939125, "grad_norm": 0.16294950246810913, "learning_rate": 4.996803582260876e-05, "loss": 0.6825, "step": 2250 }, { "epoch": 1.1502012760744005, "grad_norm": 0.15820349752902985, "learning_rate": 4.9967893759598135e-05, "loss": 0.6781, "step": 2260 }, { "epoch": 1.1552928447548887, "grad_norm": 0.13467393815517426, "learning_rate": 4.996775169658751e-05, "loss": 0.6862, "step": 2270 }, { "epoch": 1.160384413435377, "grad_norm": 0.11259343475103378, "learning_rate": 4.996760963357688e-05, "loss": 0.6809, "step": 2280 }, { "epoch": 1.165475982115865, "grad_norm": 0.13340143859386444, "learning_rate": 4.9967467570566254e-05, "loss": 0.6805, "step": 2290 }, { "epoch": 1.170567550796353, "grad_norm": 0.12365837395191193, "learning_rate": 4.996732550755562e-05, "loss": 0.6802, "step": 2300 }, { "epoch": 1.1756591194768413, "grad_norm": 0.09431267529726028, "learning_rate": 4.996718344454499e-05, "loss": 0.6903, "step": 2310 }, { "epoch": 1.1807506881573295, "grad_norm": 0.08034133911132812, "learning_rate": 4.9967041381534366e-05, "loss": 0.6837, "step": 2320 }, { "epoch": 1.1858422568378177, "grad_norm": 0.07523014396429062, "learning_rate": 4.996689931852374e-05, "loss": 0.6836, "step": 2330 }, { "epoch": 1.1909338255183057, "grad_norm": 0.1202087476849556, "learning_rate": 4.996675725551311e-05, "loss": 0.6771, "step": 2340 }, { "epoch": 1.196025394198794, "grad_norm": 0.13852781057357788, "learning_rate": 4.9966615192502486e-05, "loss": 0.6813, "step": 2350 }, { "epoch": 1.2011169628792822, "grad_norm": 0.1234976053237915, "learning_rate": 4.996647312949186e-05, "loss": 0.6799, "step": 2360 }, { "epoch": 1.2062085315597701, "grad_norm": 0.1308223158121109, "learning_rate": 4.996633106648123e-05, "loss": 0.6839, "step": 2370 }, { "epoch": 1.2113001002402584, "grad_norm": 0.11719993501901627, "learning_rate": 4.9966189003470605e-05, "loss": 0.6826, "step": 2380 }, { "epoch": 1.2163916689207466, "grad_norm": 0.13690686225891113, "learning_rate": 4.996604694045998e-05, "loss": 0.6804, "step": 2390 }, { "epoch": 1.2214832376012348, "grad_norm": 0.12480480223894119, "learning_rate": 4.996590487744935e-05, "loss": 0.6831, "step": 2400 }, { "epoch": 1.2265748062817228, "grad_norm": 0.12938359379768372, "learning_rate": 4.9965762814438724e-05, "loss": 0.6821, "step": 2410 }, { "epoch": 1.231666374962211, "grad_norm": 0.14791236817836761, "learning_rate": 4.996562075142809e-05, "loss": 0.6813, "step": 2420 }, { "epoch": 1.2367579436426992, "grad_norm": 0.1450764387845993, "learning_rate": 4.9965478688417464e-05, "loss": 0.6835, "step": 2430 }, { "epoch": 1.2418495123231874, "grad_norm": 0.12077004462480545, "learning_rate": 4.996533662540683e-05, "loss": 0.6843, "step": 2440 }, { "epoch": 1.2469410810036754, "grad_norm": 0.0918821468949318, "learning_rate": 4.99651945623962e-05, "loss": 0.6836, "step": 2450 }, { "epoch": 1.2520326496841636, "grad_norm": 0.09863133728504181, "learning_rate": 4.9965052499385576e-05, "loss": 0.683, "step": 2460 }, { "epoch": 1.2571242183646518, "grad_norm": 0.10029463469982147, "learning_rate": 4.996491043637495e-05, "loss": 0.6839, "step": 2470 }, { "epoch": 1.26221578704514, "grad_norm": 0.11962137371301651, "learning_rate": 4.996476837336432e-05, "loss": 0.677, "step": 2480 }, { "epoch": 1.267307355725628, "grad_norm": 0.11363455653190613, "learning_rate": 4.9964626310353696e-05, "loss": 0.6843, "step": 2490 }, { "epoch": 1.2723989244061162, "grad_norm": 0.08753272145986557, "learning_rate": 4.996448424734307e-05, "loss": 0.6857, "step": 2500 }, { "epoch": 1.2774904930866045, "grad_norm": 0.10698520392179489, "learning_rate": 4.996434218433244e-05, "loss": 0.6855, "step": 2510 }, { "epoch": 1.2825820617670924, "grad_norm": 0.09481139481067657, "learning_rate": 4.9964200121321815e-05, "loss": 0.683, "step": 2520 }, { "epoch": 1.2876736304475807, "grad_norm": 0.15638168156147003, "learning_rate": 4.996405805831119e-05, "loss": 0.6842, "step": 2530 }, { "epoch": 1.2927651991280689, "grad_norm": 0.1133870854973793, "learning_rate": 4.996391599530056e-05, "loss": 0.6831, "step": 2540 }, { "epoch": 1.297856767808557, "grad_norm": 0.1086922213435173, "learning_rate": 4.9963773932289934e-05, "loss": 0.6832, "step": 2550 }, { "epoch": 1.3029483364890453, "grad_norm": 0.11375133693218231, "learning_rate": 4.99636318692793e-05, "loss": 0.6871, "step": 2560 }, { "epoch": 1.3080399051695333, "grad_norm": 0.11502251029014587, "learning_rate": 4.9963489806268674e-05, "loss": 0.6844, "step": 2570 }, { "epoch": 1.3131314738500215, "grad_norm": 0.13333244621753693, "learning_rate": 4.996334774325805e-05, "loss": 0.6821, "step": 2580 }, { "epoch": 1.3182230425305097, "grad_norm": 0.17771519720554352, "learning_rate": 4.996320568024742e-05, "loss": 0.6817, "step": 2590 }, { "epoch": 1.3233146112109977, "grad_norm": 0.10856124758720398, "learning_rate": 4.996306361723679e-05, "loss": 0.6832, "step": 2600 }, { "epoch": 1.328406179891486, "grad_norm": 0.13525483012199402, "learning_rate": 4.9962921554226166e-05, "loss": 0.6848, "step": 2610 }, { "epoch": 1.3334977485719741, "grad_norm": 0.14420652389526367, "learning_rate": 4.996277949121553e-05, "loss": 0.6831, "step": 2620 }, { "epoch": 1.3385893172524623, "grad_norm": 0.10660698264837265, "learning_rate": 4.9962637428204906e-05, "loss": 0.686, "step": 2630 }, { "epoch": 1.3436808859329505, "grad_norm": 0.16599448025226593, "learning_rate": 4.996249536519428e-05, "loss": 0.6826, "step": 2640 }, { "epoch": 1.3487724546134385, "grad_norm": 0.13518887758255005, "learning_rate": 4.996235330218365e-05, "loss": 0.6854, "step": 2650 }, { "epoch": 1.3538640232939267, "grad_norm": 0.1113041415810585, "learning_rate": 4.9962211239173025e-05, "loss": 0.6832, "step": 2660 }, { "epoch": 1.3589555919744147, "grad_norm": 0.13242138922214508, "learning_rate": 4.99620691761624e-05, "loss": 0.6814, "step": 2670 }, { "epoch": 1.364047160654903, "grad_norm": 0.18434931337833405, "learning_rate": 4.996192711315177e-05, "loss": 0.6808, "step": 2680 }, { "epoch": 1.3691387293353912, "grad_norm": 0.11528836935758591, "learning_rate": 4.9961785050141144e-05, "loss": 0.685, "step": 2690 }, { "epoch": 1.3742302980158794, "grad_norm": 0.1295492947101593, "learning_rate": 4.996164298713051e-05, "loss": 0.6825, "step": 2700 }, { "epoch": 1.3793218666963676, "grad_norm": 0.09657806158065796, "learning_rate": 4.9961500924119884e-05, "loss": 0.6825, "step": 2710 }, { "epoch": 1.3844134353768556, "grad_norm": 0.08716735243797302, "learning_rate": 4.996135886110926e-05, "loss": 0.6872, "step": 2720 }, { "epoch": 1.3895050040573438, "grad_norm": 0.0896734893321991, "learning_rate": 4.996121679809863e-05, "loss": 0.6817, "step": 2730 }, { "epoch": 1.394596572737832, "grad_norm": 0.10860587656497955, "learning_rate": 4.9961074735088e-05, "loss": 0.6815, "step": 2740 }, { "epoch": 1.39968814141832, "grad_norm": 0.1187656968832016, "learning_rate": 4.9960932672077376e-05, "loss": 0.686, "step": 2750 }, { "epoch": 1.4047797100988082, "grad_norm": 0.11682062596082687, "learning_rate": 4.996079060906675e-05, "loss": 0.6799, "step": 2760 }, { "epoch": 1.4098712787792964, "grad_norm": 0.14465422928333282, "learning_rate": 4.996064854605612e-05, "loss": 0.6823, "step": 2770 }, { "epoch": 1.4149628474597846, "grad_norm": 0.13644230365753174, "learning_rate": 4.9960506483045495e-05, "loss": 0.6828, "step": 2780 }, { "epoch": 1.4200544161402728, "grad_norm": 0.09552885591983795, "learning_rate": 4.996036442003487e-05, "loss": 0.6828, "step": 2790 }, { "epoch": 1.4251459848207608, "grad_norm": 0.1287170648574829, "learning_rate": 4.996022235702424e-05, "loss": 0.6846, "step": 2800 }, { "epoch": 1.430237553501249, "grad_norm": 0.11409243196249008, "learning_rate": 4.9960080294013615e-05, "loss": 0.6844, "step": 2810 }, { "epoch": 1.4353291221817372, "grad_norm": 0.16463157534599304, "learning_rate": 4.995993823100298e-05, "loss": 0.6842, "step": 2820 }, { "epoch": 1.4404206908622252, "grad_norm": 0.12962253391742706, "learning_rate": 4.9959796167992354e-05, "loss": 0.6831, "step": 2830 }, { "epoch": 1.4455122595427135, "grad_norm": 0.1317017823457718, "learning_rate": 4.995965410498172e-05, "loss": 0.6825, "step": 2840 }, { "epoch": 1.4506038282232017, "grad_norm": 0.1253054440021515, "learning_rate": 4.9959512041971094e-05, "loss": 0.6869, "step": 2850 }, { "epoch": 1.4556953969036899, "grad_norm": 0.10968417674303055, "learning_rate": 4.995936997896047e-05, "loss": 0.6837, "step": 2860 }, { "epoch": 1.460786965584178, "grad_norm": 0.15329721570014954, "learning_rate": 4.995922791594984e-05, "loss": 0.6856, "step": 2870 }, { "epoch": 1.465878534264666, "grad_norm": 0.1338498741388321, "learning_rate": 4.995908585293921e-05, "loss": 0.6862, "step": 2880 }, { "epoch": 1.4709701029451543, "grad_norm": 0.10569129139184952, "learning_rate": 4.9958943789928586e-05, "loss": 0.6834, "step": 2890 }, { "epoch": 1.4760616716256423, "grad_norm": 0.14210055768489838, "learning_rate": 4.995880172691796e-05, "loss": 0.682, "step": 2900 }, { "epoch": 1.4811532403061305, "grad_norm": 0.13887226581573486, "learning_rate": 4.995865966390733e-05, "loss": 0.6813, "step": 2910 }, { "epoch": 1.4862448089866187, "grad_norm": 0.14252229034900665, "learning_rate": 4.9958517600896705e-05, "loss": 0.6819, "step": 2920 }, { "epoch": 1.491336377667107, "grad_norm": 0.1889895647764206, "learning_rate": 4.995837553788608e-05, "loss": 0.6833, "step": 2930 }, { "epoch": 1.4964279463475951, "grad_norm": 0.14179687201976776, "learning_rate": 4.995823347487545e-05, "loss": 0.6774, "step": 2940 }, { "epoch": 1.5015195150280833, "grad_norm": 0.1529311090707779, "learning_rate": 4.9958091411864825e-05, "loss": 0.6839, "step": 2950 }, { "epoch": 1.5066110837085713, "grad_norm": 0.11903861910104752, "learning_rate": 4.995794934885419e-05, "loss": 0.6828, "step": 2960 }, { "epoch": 1.5117026523890595, "grad_norm": 0.11958228051662445, "learning_rate": 4.9957807285843564e-05, "loss": 0.6809, "step": 2970 }, { "epoch": 1.5167942210695475, "grad_norm": 0.10305473953485489, "learning_rate": 4.995766522283294e-05, "loss": 0.6834, "step": 2980 }, { "epoch": 1.5218857897500357, "grad_norm": 0.11478529125452042, "learning_rate": 4.995752315982231e-05, "loss": 0.6835, "step": 2990 }, { "epoch": 1.526977358430524, "grad_norm": 0.13605500757694244, "learning_rate": 4.995738109681168e-05, "loss": 0.6804, "step": 3000 }, { "epoch": 1.5320689271110122, "grad_norm": 0.12643195688724518, "learning_rate": 4.9957239033801056e-05, "loss": 0.6763, "step": 3010 }, { "epoch": 1.5371604957915004, "grad_norm": 0.22794055938720703, "learning_rate": 4.995709697079043e-05, "loss": 0.6822, "step": 3020 }, { "epoch": 1.5422520644719884, "grad_norm": 0.1283722072839737, "learning_rate": 4.99569549077798e-05, "loss": 0.6823, "step": 3030 }, { "epoch": 1.5473436331524766, "grad_norm": 0.12796291708946228, "learning_rate": 4.995681284476917e-05, "loss": 0.6789, "step": 3040 }, { "epoch": 1.5524352018329646, "grad_norm": 0.20063504576683044, "learning_rate": 4.995667078175854e-05, "loss": 0.6782, "step": 3050 }, { "epoch": 1.5575267705134528, "grad_norm": 0.10560201853513718, "learning_rate": 4.9956528718747915e-05, "loss": 0.6899, "step": 3060 }, { "epoch": 1.562618339193941, "grad_norm": 0.09931265562772751, "learning_rate": 4.995638665573729e-05, "loss": 0.6857, "step": 3070 }, { "epoch": 1.5677099078744292, "grad_norm": 0.09285406023263931, "learning_rate": 4.995624459272666e-05, "loss": 0.6865, "step": 3080 }, { "epoch": 1.5728014765549174, "grad_norm": 0.11098553240299225, "learning_rate": 4.9956102529716035e-05, "loss": 0.6837, "step": 3090 }, { "epoch": 1.5778930452354056, "grad_norm": 0.12747269868850708, "learning_rate": 4.99559604667054e-05, "loss": 0.6805, "step": 3100 }, { "epoch": 1.5829846139158936, "grad_norm": 0.19148799777030945, "learning_rate": 4.9955818403694774e-05, "loss": 0.6809, "step": 3110 }, { "epoch": 1.5880761825963818, "grad_norm": 0.11333976686000824, "learning_rate": 4.995567634068415e-05, "loss": 0.6834, "step": 3120 }, { "epoch": 1.5931677512768698, "grad_norm": 0.12725278735160828, "learning_rate": 4.995553427767352e-05, "loss": 0.682, "step": 3130 }, { "epoch": 1.598259319957358, "grad_norm": 0.08800658583641052, "learning_rate": 4.995539221466289e-05, "loss": 0.6863, "step": 3140 }, { "epoch": 1.6033508886378462, "grad_norm": 0.1162288561463356, "learning_rate": 4.9955250151652266e-05, "loss": 0.6816, "step": 3150 }, { "epoch": 1.6084424573183345, "grad_norm": 0.11001812666654587, "learning_rate": 4.995510808864164e-05, "loss": 0.6834, "step": 3160 }, { "epoch": 1.6135340259988227, "grad_norm": 0.17772439122200012, "learning_rate": 4.995496602563101e-05, "loss": 0.6839, "step": 3170 }, { "epoch": 1.6186255946793109, "grad_norm": 0.14124180376529694, "learning_rate": 4.9954823962620386e-05, "loss": 0.6817, "step": 3180 }, { "epoch": 1.6237171633597989, "grad_norm": 0.12131723016500473, "learning_rate": 4.995468189960976e-05, "loss": 0.68, "step": 3190 }, { "epoch": 1.628808732040287, "grad_norm": 0.1277320384979248, "learning_rate": 4.995453983659913e-05, "loss": 0.6761, "step": 3200 }, { "epoch": 1.633900300720775, "grad_norm": 0.11980846524238586, "learning_rate": 4.99543977735885e-05, "loss": 0.6845, "step": 3210 }, { "epoch": 1.6389918694012633, "grad_norm": 0.18087904155254364, "learning_rate": 4.995425571057787e-05, "loss": 0.6815, "step": 3220 }, { "epoch": 1.6440834380817515, "grad_norm": 0.1640498787164688, "learning_rate": 4.9954113647567244e-05, "loss": 0.6799, "step": 3230 }, { "epoch": 1.6491750067622397, "grad_norm": 0.14339861273765564, "learning_rate": 4.995397158455662e-05, "loss": 0.684, "step": 3240 }, { "epoch": 1.654266575442728, "grad_norm": 0.11472135037183762, "learning_rate": 4.9953829521545984e-05, "loss": 0.6825, "step": 3250 }, { "epoch": 1.659358144123216, "grad_norm": 0.12307639420032501, "learning_rate": 4.995368745853536e-05, "loss": 0.6892, "step": 3260 }, { "epoch": 1.6644497128037041, "grad_norm": 0.09782890975475311, "learning_rate": 4.995354539552473e-05, "loss": 0.6823, "step": 3270 }, { "epoch": 1.6695412814841921, "grad_norm": 0.1154768094420433, "learning_rate": 4.99534033325141e-05, "loss": 0.6842, "step": 3280 }, { "epoch": 1.6746328501646803, "grad_norm": 0.18311528861522675, "learning_rate": 4.9953261269503476e-05, "loss": 0.6833, "step": 3290 }, { "epoch": 1.6797244188451685, "grad_norm": 0.11727318912744522, "learning_rate": 4.995311920649285e-05, "loss": 0.6819, "step": 3300 }, { "epoch": 1.6848159875256568, "grad_norm": 0.12900975346565247, "learning_rate": 4.995297714348222e-05, "loss": 0.6849, "step": 3310 }, { "epoch": 1.689907556206145, "grad_norm": 0.12244871258735657, "learning_rate": 4.9952835080471596e-05, "loss": 0.684, "step": 3320 }, { "epoch": 1.6949991248866332, "grad_norm": 0.11466418206691742, "learning_rate": 4.995269301746097e-05, "loss": 0.6833, "step": 3330 }, { "epoch": 1.7000906935671212, "grad_norm": 0.12230440229177475, "learning_rate": 4.995255095445034e-05, "loss": 0.6831, "step": 3340 }, { "epoch": 1.7051822622476094, "grad_norm": 0.13069362938404083, "learning_rate": 4.995240889143971e-05, "loss": 0.6842, "step": 3350 }, { "epoch": 1.7102738309280974, "grad_norm": 0.11203134804964066, "learning_rate": 4.995226682842908e-05, "loss": 0.6858, "step": 3360 }, { "epoch": 1.7153653996085856, "grad_norm": 0.1311291605234146, "learning_rate": 4.9952124765418454e-05, "loss": 0.6838, "step": 3370 }, { "epoch": 1.7204569682890738, "grad_norm": 0.1232665479183197, "learning_rate": 4.995198270240783e-05, "loss": 0.6841, "step": 3380 }, { "epoch": 1.725548536969562, "grad_norm": 0.1445329189300537, "learning_rate": 4.99518406393972e-05, "loss": 0.6801, "step": 3390 }, { "epoch": 1.7306401056500502, "grad_norm": 0.1403801292181015, "learning_rate": 4.9951698576386574e-05, "loss": 0.6863, "step": 3400 }, { "epoch": 1.7357316743305384, "grad_norm": 0.09926485270261765, "learning_rate": 4.995155651337595e-05, "loss": 0.6839, "step": 3410 }, { "epoch": 1.7408232430110264, "grad_norm": 0.10609301924705505, "learning_rate": 4.995141445036532e-05, "loss": 0.6851, "step": 3420 }, { "epoch": 1.7459148116915146, "grad_norm": 0.1048160120844841, "learning_rate": 4.995127238735469e-05, "loss": 0.6875, "step": 3430 }, { "epoch": 1.7510063803720026, "grad_norm": 0.10706604272127151, "learning_rate": 4.9951130324344066e-05, "loss": 0.684, "step": 3440 }, { "epoch": 1.7560979490524908, "grad_norm": 0.1004481241106987, "learning_rate": 4.995098826133344e-05, "loss": 0.6847, "step": 3450 }, { "epoch": 1.761189517732979, "grad_norm": 0.13026846945285797, "learning_rate": 4.9950846198322806e-05, "loss": 0.6822, "step": 3460 }, { "epoch": 1.7662810864134673, "grad_norm": 0.20907576382160187, "learning_rate": 4.995070413531218e-05, "loss": 0.6808, "step": 3470 }, { "epoch": 1.7713726550939555, "grad_norm": 0.14915932714939117, "learning_rate": 4.995056207230155e-05, "loss": 0.6803, "step": 3480 }, { "epoch": 1.7764642237744435, "grad_norm": 0.12906627357006073, "learning_rate": 4.995042000929092e-05, "loss": 0.6868, "step": 3490 }, { "epoch": 1.7815557924549317, "grad_norm": 0.10379557311534882, "learning_rate": 4.995027794628029e-05, "loss": 0.6853, "step": 3500 }, { "epoch": 1.7866473611354197, "grad_norm": 0.10871709883213043, "learning_rate": 4.9950135883269664e-05, "loss": 0.6863, "step": 3510 }, { "epoch": 1.7917389298159079, "grad_norm": 0.1513502597808838, "learning_rate": 4.994999382025904e-05, "loss": 0.6816, "step": 3520 }, { "epoch": 1.796830498496396, "grad_norm": 0.13802939653396606, "learning_rate": 4.994985175724841e-05, "loss": 0.6838, "step": 3530 }, { "epoch": 1.8019220671768843, "grad_norm": 0.13514472544193268, "learning_rate": 4.9949709694237784e-05, "loss": 0.6794, "step": 3540 }, { "epoch": 1.8070136358573725, "grad_norm": 0.16484974324703217, "learning_rate": 4.994956763122716e-05, "loss": 0.682, "step": 3550 }, { "epoch": 1.8121052045378607, "grad_norm": 0.11142993718385696, "learning_rate": 4.994942556821653e-05, "loss": 0.6816, "step": 3560 }, { "epoch": 1.8171967732183487, "grad_norm": 0.14259152114391327, "learning_rate": 4.99492835052059e-05, "loss": 0.6781, "step": 3570 }, { "epoch": 1.822288341898837, "grad_norm": 0.15921758115291595, "learning_rate": 4.9949141442195276e-05, "loss": 0.6855, "step": 3580 }, { "epoch": 1.827379910579325, "grad_norm": 0.09428475797176361, "learning_rate": 4.994899937918465e-05, "loss": 0.6833, "step": 3590 }, { "epoch": 1.8324714792598131, "grad_norm": 0.1155981793999672, "learning_rate": 4.994885731617402e-05, "loss": 0.6841, "step": 3600 }, { "epoch": 1.8375630479403013, "grad_norm": 0.10845302045345306, "learning_rate": 4.994871525316339e-05, "loss": 0.6846, "step": 3610 }, { "epoch": 1.8426546166207896, "grad_norm": 0.12848089635372162, "learning_rate": 4.994857319015276e-05, "loss": 0.682, "step": 3620 }, { "epoch": 1.8477461853012778, "grad_norm": 0.10348972678184509, "learning_rate": 4.9948431127142135e-05, "loss": 0.6842, "step": 3630 }, { "epoch": 1.852837753981766, "grad_norm": 0.13845866918563843, "learning_rate": 4.994828906413151e-05, "loss": 0.6812, "step": 3640 }, { "epoch": 1.857929322662254, "grad_norm": 0.11816728860139847, "learning_rate": 4.994814700112088e-05, "loss": 0.6812, "step": 3650 }, { "epoch": 1.8630208913427422, "grad_norm": 0.13902199268341064, "learning_rate": 4.9948004938110254e-05, "loss": 0.6871, "step": 3660 }, { "epoch": 1.8681124600232302, "grad_norm": 0.12729224562644958, "learning_rate": 4.994786287509962e-05, "loss": 0.6802, "step": 3670 }, { "epoch": 1.8732040287037184, "grad_norm": 0.14033198356628418, "learning_rate": 4.9947720812088994e-05, "loss": 0.6843, "step": 3680 }, { "epoch": 1.8782955973842066, "grad_norm": 0.12836380302906036, "learning_rate": 4.994757874907837e-05, "loss": 0.6833, "step": 3690 }, { "epoch": 1.8833871660646948, "grad_norm": 0.1290048211812973, "learning_rate": 4.994743668606774e-05, "loss": 0.6832, "step": 3700 }, { "epoch": 1.888478734745183, "grad_norm": 0.1284429430961609, "learning_rate": 4.994729462305711e-05, "loss": 0.6843, "step": 3710 }, { "epoch": 1.893570303425671, "grad_norm": 0.13112841546535492, "learning_rate": 4.9947152560046486e-05, "loss": 0.6811, "step": 3720 }, { "epoch": 1.8986618721061592, "grad_norm": 0.14525501430034637, "learning_rate": 4.994701049703586e-05, "loss": 0.6793, "step": 3730 }, { "epoch": 1.9037534407866472, "grad_norm": 0.1803501546382904, "learning_rate": 4.994686843402523e-05, "loss": 0.6791, "step": 3740 }, { "epoch": 1.9088450094671354, "grad_norm": 0.1837460994720459, "learning_rate": 4.99467263710146e-05, "loss": 0.6771, "step": 3750 }, { "epoch": 1.9139365781476236, "grad_norm": 0.12087200582027435, "learning_rate": 4.994658430800397e-05, "loss": 0.6836, "step": 3760 }, { "epoch": 1.9190281468281118, "grad_norm": 0.1253005713224411, "learning_rate": 4.9946442244993345e-05, "loss": 0.6822, "step": 3770 }, { "epoch": 1.9241197155086, "grad_norm": 0.11462333053350449, "learning_rate": 4.994630018198272e-05, "loss": 0.684, "step": 3780 }, { "epoch": 1.9292112841890883, "grad_norm": 0.1458183377981186, "learning_rate": 4.994615811897209e-05, "loss": 0.6812, "step": 3790 }, { "epoch": 1.9343028528695763, "grad_norm": 0.13514210283756256, "learning_rate": 4.9946016055961464e-05, "loss": 0.6883, "step": 3800 }, { "epoch": 1.9393944215500645, "grad_norm": 0.10077164322137833, "learning_rate": 4.994587399295084e-05, "loss": 0.6841, "step": 3810 }, { "epoch": 1.9444859902305525, "grad_norm": 0.1145828515291214, "learning_rate": 4.994573192994021e-05, "loss": 0.6794, "step": 3820 }, { "epoch": 1.9495775589110407, "grad_norm": 0.12171609699726105, "learning_rate": 4.994558986692958e-05, "loss": 0.6801, "step": 3830 }, { "epoch": 1.9546691275915289, "grad_norm": 0.1296948492527008, "learning_rate": 4.9945447803918956e-05, "loss": 0.6836, "step": 3840 }, { "epoch": 1.959760696272017, "grad_norm": 0.13795003294944763, "learning_rate": 4.994530574090833e-05, "loss": 0.6814, "step": 3850 }, { "epoch": 1.9648522649525053, "grad_norm": 0.10949226468801498, "learning_rate": 4.99451636778977e-05, "loss": 0.6851, "step": 3860 }, { "epoch": 1.9699438336329935, "grad_norm": 0.09504687041044235, "learning_rate": 4.994502161488707e-05, "loss": 0.6847, "step": 3870 }, { "epoch": 1.9750354023134815, "grad_norm": 0.12004721909761429, "learning_rate": 4.994487955187644e-05, "loss": 0.6853, "step": 3880 }, { "epoch": 1.9801269709939697, "grad_norm": 0.15672442317008972, "learning_rate": 4.994473748886581e-05, "loss": 0.678, "step": 3890 }, { "epoch": 1.9852185396744577, "grad_norm": 0.1672324538230896, "learning_rate": 4.994459542585518e-05, "loss": 0.6801, "step": 3900 }, { "epoch": 1.990310108354946, "grad_norm": 0.13963304460048676, "learning_rate": 4.9944453362844555e-05, "loss": 0.6896, "step": 3910 }, { "epoch": 1.9954016770354341, "grad_norm": 0.11426424980163574, "learning_rate": 4.994431129983393e-05, "loss": 0.6802, "step": 3920 }, { "epoch": 2.0, "grad_norm": 0.01827167347073555, "learning_rate": 4.99441692368233e-05, "loss": 0.6174, "step": 3930 }, { "epoch": 2.005091568680488, "grad_norm": 0.17911553382873535, "learning_rate": 4.9944027173812674e-05, "loss": 0.6826, "step": 3940 }, { "epoch": 2.0101831373609764, "grad_norm": 0.11154532432556152, "learning_rate": 4.994388511080205e-05, "loss": 0.6843, "step": 3950 }, { "epoch": 2.0152747060414646, "grad_norm": 0.09386030584573746, "learning_rate": 4.994374304779142e-05, "loss": 0.686, "step": 3960 }, { "epoch": 2.0203662747219524, "grad_norm": 0.09608808904886246, "learning_rate": 4.994360098478079e-05, "loss": 0.6791, "step": 3970 }, { "epoch": 2.0254578434024406, "grad_norm": 0.12537717819213867, "learning_rate": 4.9943458921770166e-05, "loss": 0.6842, "step": 3980 }, { "epoch": 2.030549412082929, "grad_norm": 0.09800703823566437, "learning_rate": 4.994331685875954e-05, "loss": 0.6859, "step": 3990 }, { "epoch": 2.035640980763417, "grad_norm": 0.07934601604938507, "learning_rate": 4.994317479574891e-05, "loss": 0.6846, "step": 4000 }, { "epoch": 2.0407325494439053, "grad_norm": 0.10269072651863098, "learning_rate": 4.994303273273828e-05, "loss": 0.6852, "step": 4010 }, { "epoch": 2.0458241181243935, "grad_norm": 0.09138213843107224, "learning_rate": 4.994289066972765e-05, "loss": 0.682, "step": 4020 }, { "epoch": 2.0509156868048817, "grad_norm": 0.1062936782836914, "learning_rate": 4.9942748606717025e-05, "loss": 0.6863, "step": 4030 }, { "epoch": 2.05600725548537, "grad_norm": 0.13446182012557983, "learning_rate": 4.99426065437064e-05, "loss": 0.68, "step": 4040 }, { "epoch": 2.0610988241658577, "grad_norm": 0.1352904587984085, "learning_rate": 4.994246448069577e-05, "loss": 0.68, "step": 4050 }, { "epoch": 2.066190392846346, "grad_norm": 0.14259877800941467, "learning_rate": 4.9942322417685144e-05, "loss": 0.684, "step": 4060 }, { "epoch": 2.071281961526834, "grad_norm": 0.1194225326180458, "learning_rate": 4.994218035467452e-05, "loss": 0.6845, "step": 4070 }, { "epoch": 2.0763735302073223, "grad_norm": 0.10988787561655045, "learning_rate": 4.994203829166389e-05, "loss": 0.6834, "step": 4080 }, { "epoch": 2.0814650988878105, "grad_norm": 0.10374101996421814, "learning_rate": 4.994189622865326e-05, "loss": 0.6833, "step": 4090 }, { "epoch": 2.0865566675682987, "grad_norm": 0.11888198554515839, "learning_rate": 4.994175416564263e-05, "loss": 0.683, "step": 4100 }, { "epoch": 2.091648236248787, "grad_norm": 0.11808530986309052, "learning_rate": 4.9941612102632e-05, "loss": 0.6813, "step": 4110 }, { "epoch": 2.096739804929275, "grad_norm": 0.12874440848827362, "learning_rate": 4.9941470039621376e-05, "loss": 0.6809, "step": 4120 }, { "epoch": 2.101831373609763, "grad_norm": 0.1372908353805542, "learning_rate": 4.994132797661075e-05, "loss": 0.6793, "step": 4130 }, { "epoch": 2.106922942290251, "grad_norm": 0.15299095213413239, "learning_rate": 4.994118591360012e-05, "loss": 0.6785, "step": 4140 }, { "epoch": 2.1120145109707393, "grad_norm": 0.1464032679796219, "learning_rate": 4.994104385058949e-05, "loss": 0.6804, "step": 4150 }, { "epoch": 2.1171060796512275, "grad_norm": 0.10995624214410782, "learning_rate": 4.994090178757886e-05, "loss": 0.6854, "step": 4160 }, { "epoch": 2.1221976483317158, "grad_norm": 0.1125839501619339, "learning_rate": 4.9940759724568235e-05, "loss": 0.6802, "step": 4170 }, { "epoch": 2.127289217012204, "grad_norm": 0.15469452738761902, "learning_rate": 4.994061766155761e-05, "loss": 0.6805, "step": 4180 }, { "epoch": 2.132380785692692, "grad_norm": 0.15448547899723053, "learning_rate": 4.994047559854698e-05, "loss": 0.6826, "step": 4190 }, { "epoch": 2.1374723543731804, "grad_norm": 0.12687282264232635, "learning_rate": 4.9940333535536354e-05, "loss": 0.681, "step": 4200 }, { "epoch": 2.142563923053668, "grad_norm": 0.13330869376659393, "learning_rate": 4.994019147252573e-05, "loss": 0.6811, "step": 4210 }, { "epoch": 2.1476554917341564, "grad_norm": 0.13483920693397522, "learning_rate": 4.99400494095151e-05, "loss": 0.6834, "step": 4220 }, { "epoch": 2.1527470604146446, "grad_norm": 0.08532749861478806, "learning_rate": 4.9939907346504474e-05, "loss": 0.6867, "step": 4230 }, { "epoch": 2.157838629095133, "grad_norm": 0.12028615176677704, "learning_rate": 4.993976528349385e-05, "loss": 0.6849, "step": 4240 }, { "epoch": 2.162930197775621, "grad_norm": 0.10255931317806244, "learning_rate": 4.993962322048322e-05, "loss": 0.6816, "step": 4250 }, { "epoch": 2.1680217664561092, "grad_norm": 0.16485556960105896, "learning_rate": 4.9939481157472586e-05, "loss": 0.6791, "step": 4260 }, { "epoch": 2.1731133351365974, "grad_norm": 0.1411302089691162, "learning_rate": 4.993933909446196e-05, "loss": 0.6788, "step": 4270 }, { "epoch": 2.178204903817085, "grad_norm": 0.18721655011177063, "learning_rate": 4.993919703145133e-05, "loss": 0.6854, "step": 4280 }, { "epoch": 2.1832964724975734, "grad_norm": 0.0997004359960556, "learning_rate": 4.9939054968440706e-05, "loss": 0.6842, "step": 4290 }, { "epoch": 2.1883880411780616, "grad_norm": 0.11703092604875565, "learning_rate": 4.993891290543007e-05, "loss": 0.68, "step": 4300 }, { "epoch": 2.19347960985855, "grad_norm": 0.13729970157146454, "learning_rate": 4.9938770842419445e-05, "loss": 0.6832, "step": 4310 }, { "epoch": 2.198571178539038, "grad_norm": 0.12172706425189972, "learning_rate": 4.993862877940882e-05, "loss": 0.6827, "step": 4320 }, { "epoch": 2.2036627472195263, "grad_norm": 0.12669777870178223, "learning_rate": 4.993848671639819e-05, "loss": 0.6829, "step": 4330 }, { "epoch": 2.2087543159000145, "grad_norm": 0.13186220824718475, "learning_rate": 4.9938344653387564e-05, "loss": 0.6824, "step": 4340 }, { "epoch": 2.2138458845805022, "grad_norm": 0.13194870948791504, "learning_rate": 4.993820259037694e-05, "loss": 0.6803, "step": 4350 }, { "epoch": 2.2189374532609905, "grad_norm": 0.14057835936546326, "learning_rate": 4.993806052736631e-05, "loss": 0.68, "step": 4360 }, { "epoch": 2.2240290219414787, "grad_norm": 0.12043063342571259, "learning_rate": 4.9937918464355684e-05, "loss": 0.6849, "step": 4370 }, { "epoch": 2.229120590621967, "grad_norm": 0.11859495937824249, "learning_rate": 4.993777640134506e-05, "loss": 0.6831, "step": 4380 }, { "epoch": 2.234212159302455, "grad_norm": 0.12299305200576782, "learning_rate": 4.993763433833443e-05, "loss": 0.6803, "step": 4390 }, { "epoch": 2.2393037279829433, "grad_norm": 0.12101336568593979, "learning_rate": 4.9937492275323796e-05, "loss": 0.6812, "step": 4400 }, { "epoch": 2.2443952966634315, "grad_norm": 0.10430170595645905, "learning_rate": 4.993735021231317e-05, "loss": 0.6866, "step": 4410 }, { "epoch": 2.2494868653439197, "grad_norm": 0.08973786234855652, "learning_rate": 4.993720814930254e-05, "loss": 0.6861, "step": 4420 }, { "epoch": 2.254578434024408, "grad_norm": 0.09560893476009369, "learning_rate": 4.9937066086291916e-05, "loss": 0.6821, "step": 4430 }, { "epoch": 2.2596700027048957, "grad_norm": 0.12744377553462982, "learning_rate": 4.993692402328129e-05, "loss": 0.6861, "step": 4440 }, { "epoch": 2.264761571385384, "grad_norm": 0.09390248358249664, "learning_rate": 4.993678196027066e-05, "loss": 0.6837, "step": 4450 }, { "epoch": 2.269853140065872, "grad_norm": 0.10652091354131699, "learning_rate": 4.9936639897260035e-05, "loss": 0.6821, "step": 4460 }, { "epoch": 2.2749447087463603, "grad_norm": 0.14594070613384247, "learning_rate": 4.993649783424941e-05, "loss": 0.6831, "step": 4470 }, { "epoch": 2.2800362774268486, "grad_norm": 0.11480095237493515, "learning_rate": 4.993635577123878e-05, "loss": 0.676, "step": 4480 }, { "epoch": 2.2851278461073368, "grad_norm": 0.15268968045711517, "learning_rate": 4.9936213708228154e-05, "loss": 0.6802, "step": 4490 }, { "epoch": 2.290219414787825, "grad_norm": 0.12245162576436996, "learning_rate": 4.993607164521753e-05, "loss": 0.6816, "step": 4500 }, { "epoch": 2.2953109834683127, "grad_norm": 0.09671120345592499, "learning_rate": 4.9935929582206894e-05, "loss": 0.6808, "step": 4510 }, { "epoch": 2.300402552148801, "grad_norm": 0.10151444375514984, "learning_rate": 4.993578751919627e-05, "loss": 0.6862, "step": 4520 }, { "epoch": 2.305494120829289, "grad_norm": 0.10020536184310913, "learning_rate": 4.993564545618564e-05, "loss": 0.6809, "step": 4530 }, { "epoch": 2.3105856895097774, "grad_norm": 0.20587410032749176, "learning_rate": 4.9935503393175006e-05, "loss": 0.6839, "step": 4540 }, { "epoch": 2.3156772581902656, "grad_norm": 0.12846329808235168, "learning_rate": 4.993536133016438e-05, "loss": 0.6814, "step": 4550 }, { "epoch": 2.320768826870754, "grad_norm": 0.12553255259990692, "learning_rate": 4.993521926715375e-05, "loss": 0.6828, "step": 4560 }, { "epoch": 2.325860395551242, "grad_norm": 0.11741780489683151, "learning_rate": 4.9935077204143125e-05, "loss": 0.6855, "step": 4570 }, { "epoch": 2.33095196423173, "grad_norm": 0.09674712270498276, "learning_rate": 4.99349351411325e-05, "loss": 0.6813, "step": 4580 }, { "epoch": 2.336043532912218, "grad_norm": 0.11124306917190552, "learning_rate": 4.993479307812187e-05, "loss": 0.6763, "step": 4590 }, { "epoch": 2.341135101592706, "grad_norm": 0.1364033818244934, "learning_rate": 4.9934651015111245e-05, "loss": 0.6798, "step": 4600 }, { "epoch": 2.3462266702731944, "grad_norm": 0.14521688222885132, "learning_rate": 4.993450895210062e-05, "loss": 0.6824, "step": 4610 }, { "epoch": 2.3513182389536826, "grad_norm": 0.10061439126729965, "learning_rate": 4.993436688908999e-05, "loss": 0.6841, "step": 4620 }, { "epoch": 2.356409807634171, "grad_norm": 0.09391237050294876, "learning_rate": 4.9934224826079364e-05, "loss": 0.6827, "step": 4630 }, { "epoch": 2.361501376314659, "grad_norm": 0.12690366804599762, "learning_rate": 4.993408276306874e-05, "loss": 0.6798, "step": 4640 }, { "epoch": 2.3665929449951473, "grad_norm": 0.11659922450780869, "learning_rate": 4.993394070005811e-05, "loss": 0.6805, "step": 4650 }, { "epoch": 2.3716845136756355, "grad_norm": 0.1206756979227066, "learning_rate": 4.993379863704748e-05, "loss": 0.6792, "step": 4660 }, { "epoch": 2.3767760823561233, "grad_norm": 0.11938859522342682, "learning_rate": 4.993365657403685e-05, "loss": 0.6837, "step": 4670 }, { "epoch": 2.3818676510366115, "grad_norm": 0.10022424161434174, "learning_rate": 4.993351451102622e-05, "loss": 0.6807, "step": 4680 }, { "epoch": 2.3869592197170997, "grad_norm": 0.14838755130767822, "learning_rate": 4.9933372448015596e-05, "loss": 0.6795, "step": 4690 }, { "epoch": 2.392050788397588, "grad_norm": 0.131904736161232, "learning_rate": 4.993323038500497e-05, "loss": 0.6855, "step": 4700 }, { "epoch": 2.397142357078076, "grad_norm": 0.1132061704993248, "learning_rate": 4.993308832199434e-05, "loss": 0.6792, "step": 4710 }, { "epoch": 2.4022339257585643, "grad_norm": 0.10466153919696808, "learning_rate": 4.993294625898371e-05, "loss": 0.6856, "step": 4720 }, { "epoch": 2.4073254944390525, "grad_norm": 0.108913853764534, "learning_rate": 4.993280419597308e-05, "loss": 0.6787, "step": 4730 }, { "epoch": 2.4124170631195403, "grad_norm": 0.12613457441329956, "learning_rate": 4.9932662132962455e-05, "loss": 0.6804, "step": 4740 }, { "epoch": 2.4175086318000285, "grad_norm": 0.11993265151977539, "learning_rate": 4.993252006995183e-05, "loss": 0.6809, "step": 4750 }, { "epoch": 2.4226002004805167, "grad_norm": 0.13760647177696228, "learning_rate": 4.99323780069412e-05, "loss": 0.6762, "step": 4760 }, { "epoch": 2.427691769161005, "grad_norm": 0.15461039543151855, "learning_rate": 4.9932235943930574e-05, "loss": 0.6808, "step": 4770 }, { "epoch": 2.432783337841493, "grad_norm": 0.11814858764410019, "learning_rate": 4.993209388091995e-05, "loss": 0.6807, "step": 4780 }, { "epoch": 2.4378749065219814, "grad_norm": 0.12167418003082275, "learning_rate": 4.993195181790932e-05, "loss": 0.6838, "step": 4790 }, { "epoch": 2.4429664752024696, "grad_norm": 0.13912709057331085, "learning_rate": 4.9931809754898687e-05, "loss": 0.6814, "step": 4800 }, { "epoch": 2.4480580438829573, "grad_norm": 0.1079849898815155, "learning_rate": 4.993166769188806e-05, "loss": 0.6802, "step": 4810 }, { "epoch": 2.4531496125634455, "grad_norm": 0.1483919620513916, "learning_rate": 4.993152562887743e-05, "loss": 0.6809, "step": 4820 }, { "epoch": 2.4582411812439338, "grad_norm": 0.1411130726337433, "learning_rate": 4.9931383565866806e-05, "loss": 0.6819, "step": 4830 }, { "epoch": 2.463332749924422, "grad_norm": 0.13872161507606506, "learning_rate": 4.993124150285618e-05, "loss": 0.6814, "step": 4840 }, { "epoch": 2.46842431860491, "grad_norm": 0.13207204639911652, "learning_rate": 4.993109943984555e-05, "loss": 0.6819, "step": 4850 }, { "epoch": 2.4735158872853984, "grad_norm": 0.13904866576194763, "learning_rate": 4.9930957376834925e-05, "loss": 0.6799, "step": 4860 }, { "epoch": 2.4786074559658866, "grad_norm": 0.10088212043046951, "learning_rate": 4.99308153138243e-05, "loss": 0.6849, "step": 4870 }, { "epoch": 2.483699024646375, "grad_norm": 0.15108828246593475, "learning_rate": 4.993067325081367e-05, "loss": 0.6824, "step": 4880 }, { "epoch": 2.488790593326863, "grad_norm": 0.11093771457672119, "learning_rate": 4.9930531187803044e-05, "loss": 0.6848, "step": 4890 }, { "epoch": 2.493882162007351, "grad_norm": 0.10378114134073257, "learning_rate": 4.993038912479242e-05, "loss": 0.6824, "step": 4900 }, { "epoch": 2.498973730687839, "grad_norm": 0.1797563135623932, "learning_rate": 4.993024706178179e-05, "loss": 0.6805, "step": 4910 }, { "epoch": 2.5040652993683272, "grad_norm": 0.13369685411453247, "learning_rate": 4.993010499877116e-05, "loss": 0.6798, "step": 4920 }, { "epoch": 2.5091568680488154, "grad_norm": 0.11391709744930267, "learning_rate": 4.992996293576053e-05, "loss": 0.6769, "step": 4930 }, { "epoch": 2.5142484367293036, "grad_norm": 0.15841761231422424, "learning_rate": 4.9929820872749896e-05, "loss": 0.6813, "step": 4940 }, { "epoch": 2.519340005409792, "grad_norm": 0.1152459904551506, "learning_rate": 4.992967880973927e-05, "loss": 0.6825, "step": 4950 }, { "epoch": 2.52443157409028, "grad_norm": 0.1523844301700592, "learning_rate": 4.992953674672864e-05, "loss": 0.6795, "step": 4960 }, { "epoch": 2.529523142770768, "grad_norm": 0.15071742236614227, "learning_rate": 4.9929394683718016e-05, "loss": 0.6778, "step": 4970 }, { "epoch": 2.534614711451256, "grad_norm": 0.0915883481502533, "learning_rate": 4.992925262070739e-05, "loss": 0.689, "step": 4980 }, { "epoch": 2.5397062801317443, "grad_norm": 0.08719677478075027, "learning_rate": 4.992911055769676e-05, "loss": 0.6831, "step": 4990 }, { "epoch": 2.5447978488122325, "grad_norm": 0.10521717369556427, "learning_rate": 4.9928968494686135e-05, "loss": 0.6838, "step": 5000 }, { "epoch": 2.5498894174927207, "grad_norm": 0.14673079550266266, "learning_rate": 4.992882643167551e-05, "loss": 0.6777, "step": 5010 }, { "epoch": 2.554980986173209, "grad_norm": 0.1252555549144745, "learning_rate": 4.992868436866488e-05, "loss": 0.6774, "step": 5020 }, { "epoch": 2.560072554853697, "grad_norm": 0.17313307523727417, "learning_rate": 4.9928542305654254e-05, "loss": 0.6846, "step": 5030 }, { "epoch": 2.565164123534185, "grad_norm": 0.12619802355766296, "learning_rate": 4.992840024264363e-05, "loss": 0.6827, "step": 5040 }, { "epoch": 2.5702556922146735, "grad_norm": 0.11647044122219086, "learning_rate": 4.9928258179633e-05, "loss": 0.6779, "step": 5050 }, { "epoch": 2.5753472608951613, "grad_norm": 0.11227191984653473, "learning_rate": 4.992811611662237e-05, "loss": 0.6767, "step": 5060 }, { "epoch": 2.5804388295756495, "grad_norm": 0.12041344493627548, "learning_rate": 4.992797405361174e-05, "loss": 0.6784, "step": 5070 }, { "epoch": 2.5855303982561377, "grad_norm": 0.14506416022777557, "learning_rate": 4.992783199060111e-05, "loss": 0.6798, "step": 5080 }, { "epoch": 2.590621966936626, "grad_norm": 0.10675019025802612, "learning_rate": 4.9927689927590486e-05, "loss": 0.684, "step": 5090 }, { "epoch": 2.595713535617114, "grad_norm": 0.09595705568790436, "learning_rate": 4.992754786457986e-05, "loss": 0.68, "step": 5100 }, { "epoch": 2.600805104297602, "grad_norm": 0.12361190468072891, "learning_rate": 4.992740580156923e-05, "loss": 0.6813, "step": 5110 }, { "epoch": 2.6058966729780906, "grad_norm": 0.14116083085536957, "learning_rate": 4.9927263738558606e-05, "loss": 0.6791, "step": 5120 }, { "epoch": 2.6109882416585783, "grad_norm": 0.14521893858909607, "learning_rate": 4.992712167554798e-05, "loss": 0.6841, "step": 5130 }, { "epoch": 2.6160798103390666, "grad_norm": 0.08931027352809906, "learning_rate": 4.9926979612537345e-05, "loss": 0.6839, "step": 5140 }, { "epoch": 2.6211713790195548, "grad_norm": 0.15768922865390778, "learning_rate": 4.992683754952672e-05, "loss": 0.6837, "step": 5150 }, { "epoch": 2.626262947700043, "grad_norm": 0.11857085675001144, "learning_rate": 4.992669548651609e-05, "loss": 0.6791, "step": 5160 }, { "epoch": 2.631354516380531, "grad_norm": 0.12832790613174438, "learning_rate": 4.9926553423505464e-05, "loss": 0.6789, "step": 5170 }, { "epoch": 2.6364460850610194, "grad_norm": 0.1246199905872345, "learning_rate": 4.992641136049484e-05, "loss": 0.6834, "step": 5180 }, { "epoch": 2.6415376537415076, "grad_norm": 0.10562731325626373, "learning_rate": 4.992626929748421e-05, "loss": 0.681, "step": 5190 }, { "epoch": 2.6466292224219954, "grad_norm": 0.1098145917057991, "learning_rate": 4.992612723447358e-05, "loss": 0.68, "step": 5200 }, { "epoch": 2.6517207911024836, "grad_norm": 0.1007496640086174, "learning_rate": 4.992598517146295e-05, "loss": 0.6835, "step": 5210 }, { "epoch": 2.656812359782972, "grad_norm": 0.16250421106815338, "learning_rate": 4.992584310845232e-05, "loss": 0.6781, "step": 5220 }, { "epoch": 2.66190392846346, "grad_norm": 0.1358012706041336, "learning_rate": 4.9925701045441696e-05, "loss": 0.6838, "step": 5230 }, { "epoch": 2.6669954971439482, "grad_norm": 0.10603620857000351, "learning_rate": 4.992555898243107e-05, "loss": 0.6847, "step": 5240 }, { "epoch": 2.6720870658244364, "grad_norm": 0.12339074909687042, "learning_rate": 4.992541691942044e-05, "loss": 0.6809, "step": 5250 }, { "epoch": 2.6771786345049247, "grad_norm": 0.13252249360084534, "learning_rate": 4.9925274856409816e-05, "loss": 0.6809, "step": 5260 }, { "epoch": 2.6822702031854124, "grad_norm": 0.12156182527542114, "learning_rate": 4.992513279339919e-05, "loss": 0.6803, "step": 5270 }, { "epoch": 2.687361771865901, "grad_norm": 0.1240081861615181, "learning_rate": 4.992499073038856e-05, "loss": 0.6828, "step": 5280 }, { "epoch": 2.692453340546389, "grad_norm": 0.1085842102766037, "learning_rate": 4.9924848667377935e-05, "loss": 0.6809, "step": 5290 }, { "epoch": 2.697544909226877, "grad_norm": 0.10199875384569168, "learning_rate": 4.992470660436731e-05, "loss": 0.6799, "step": 5300 }, { "epoch": 2.7026364779073653, "grad_norm": 0.10421440750360489, "learning_rate": 4.9924564541356674e-05, "loss": 0.6774, "step": 5310 }, { "epoch": 2.7077280465878535, "grad_norm": 0.11737542599439621, "learning_rate": 4.992442247834605e-05, "loss": 0.6866, "step": 5320 }, { "epoch": 2.7128196152683417, "grad_norm": 0.1116197407245636, "learning_rate": 4.992428041533542e-05, "loss": 0.6826, "step": 5330 }, { "epoch": 2.7179111839488295, "grad_norm": 0.07906144112348557, "learning_rate": 4.9924138352324794e-05, "loss": 0.6833, "step": 5340 }, { "epoch": 2.723002752629318, "grad_norm": 0.09525004774332047, "learning_rate": 4.992399628931416e-05, "loss": 0.6846, "step": 5350 }, { "epoch": 2.728094321309806, "grad_norm": 0.10529020428657532, "learning_rate": 4.992385422630353e-05, "loss": 0.6805, "step": 5360 }, { "epoch": 2.733185889990294, "grad_norm": 0.1130564957857132, "learning_rate": 4.9923712163292906e-05, "loss": 0.6834, "step": 5370 }, { "epoch": 2.7382774586707823, "grad_norm": 0.1169043555855751, "learning_rate": 4.992357010028228e-05, "loss": 0.6791, "step": 5380 }, { "epoch": 2.7433690273512705, "grad_norm": 0.10529076308012009, "learning_rate": 4.992342803727165e-05, "loss": 0.6807, "step": 5390 }, { "epoch": 2.7484605960317587, "grad_norm": 0.11143583059310913, "learning_rate": 4.9923285974261025e-05, "loss": 0.6809, "step": 5400 }, { "epoch": 2.753552164712247, "grad_norm": 0.12018362432718277, "learning_rate": 4.99231439112504e-05, "loss": 0.6803, "step": 5410 }, { "epoch": 2.758643733392735, "grad_norm": 0.10221763700246811, "learning_rate": 4.992300184823977e-05, "loss": 0.6844, "step": 5420 }, { "epoch": 2.763735302073223, "grad_norm": 0.12819118797779083, "learning_rate": 4.9922859785229145e-05, "loss": 0.6802, "step": 5430 }, { "epoch": 2.768826870753711, "grad_norm": 0.11218137294054031, "learning_rate": 4.992271772221852e-05, "loss": 0.6816, "step": 5440 }, { "epoch": 2.7739184394341994, "grad_norm": 0.1787531077861786, "learning_rate": 4.9922575659207884e-05, "loss": 0.6815, "step": 5450 }, { "epoch": 2.7790100081146876, "grad_norm": 0.10983338207006454, "learning_rate": 4.992243359619726e-05, "loss": 0.6777, "step": 5460 }, { "epoch": 2.784101576795176, "grad_norm": 0.12096842378377914, "learning_rate": 4.992229153318663e-05, "loss": 0.6793, "step": 5470 }, { "epoch": 2.789193145475664, "grad_norm": 0.1177634447813034, "learning_rate": 4.9922149470176004e-05, "loss": 0.6845, "step": 5480 }, { "epoch": 2.794284714156152, "grad_norm": 0.09383808076381683, "learning_rate": 4.992200740716538e-05, "loss": 0.6816, "step": 5490 }, { "epoch": 2.79937628283664, "grad_norm": 0.11048846691846848, "learning_rate": 4.992186534415475e-05, "loss": 0.6812, "step": 5500 }, { "epoch": 2.8044678515171286, "grad_norm": 0.11928955465555191, "learning_rate": 4.992172328114412e-05, "loss": 0.6855, "step": 5510 }, { "epoch": 2.8095594201976164, "grad_norm": 0.10312807559967041, "learning_rate": 4.9921581218133496e-05, "loss": 0.6834, "step": 5520 }, { "epoch": 2.8146509888781046, "grad_norm": 0.14140763878822327, "learning_rate": 4.992143915512287e-05, "loss": 0.6826, "step": 5530 }, { "epoch": 2.819742557558593, "grad_norm": 0.12414680421352386, "learning_rate": 4.992129709211224e-05, "loss": 0.6778, "step": 5540 }, { "epoch": 2.824834126239081, "grad_norm": 0.18568123877048492, "learning_rate": 4.9921155029101615e-05, "loss": 0.6834, "step": 5550 }, { "epoch": 2.8299256949195692, "grad_norm": 0.09774978458881378, "learning_rate": 4.992101296609098e-05, "loss": 0.6826, "step": 5560 }, { "epoch": 2.835017263600057, "grad_norm": 0.11985506862401962, "learning_rate": 4.9920870903080355e-05, "loss": 0.6865, "step": 5570 }, { "epoch": 2.8401088322805457, "grad_norm": 0.09641832858324051, "learning_rate": 4.992072884006973e-05, "loss": 0.6821, "step": 5580 }, { "epoch": 2.8452004009610334, "grad_norm": 0.10907211899757385, "learning_rate": 4.9920586777059094e-05, "loss": 0.6847, "step": 5590 }, { "epoch": 2.8502919696415216, "grad_norm": 0.11031023412942886, "learning_rate": 4.992044471404847e-05, "loss": 0.682, "step": 5600 }, { "epoch": 2.85538353832201, "grad_norm": 0.12869343161582947, "learning_rate": 4.992030265103784e-05, "loss": 0.683, "step": 5610 }, { "epoch": 2.860475107002498, "grad_norm": 0.114951953291893, "learning_rate": 4.9920160588027213e-05, "loss": 0.6801, "step": 5620 }, { "epoch": 2.8655666756829863, "grad_norm": 0.12400404363870621, "learning_rate": 4.9920018525016587e-05, "loss": 0.685, "step": 5630 }, { "epoch": 2.8706582443634745, "grad_norm": 0.11837892979383469, "learning_rate": 4.991987646200596e-05, "loss": 0.6826, "step": 5640 }, { "epoch": 2.8757498130439627, "grad_norm": 0.16485707461833954, "learning_rate": 4.991973439899533e-05, "loss": 0.6798, "step": 5650 }, { "epoch": 2.8808413817244505, "grad_norm": 0.1649584323167801, "learning_rate": 4.9919592335984706e-05, "loss": 0.6846, "step": 5660 }, { "epoch": 2.8859329504049387, "grad_norm": 0.09823145717382431, "learning_rate": 4.991945027297408e-05, "loss": 0.6825, "step": 5670 }, { "epoch": 2.891024519085427, "grad_norm": 0.10554816573858261, "learning_rate": 4.991930820996345e-05, "loss": 0.6833, "step": 5680 }, { "epoch": 2.896116087765915, "grad_norm": 0.09985250979661942, "learning_rate": 4.9919166146952825e-05, "loss": 0.685, "step": 5690 }, { "epoch": 2.9012076564464033, "grad_norm": 0.1473183035850525, "learning_rate": 4.99190240839422e-05, "loss": 0.6773, "step": 5700 }, { "epoch": 2.9062992251268915, "grad_norm": 0.1321994662284851, "learning_rate": 4.9918882020931565e-05, "loss": 0.6851, "step": 5710 }, { "epoch": 2.9113907938073798, "grad_norm": 0.11778974533081055, "learning_rate": 4.991873995792094e-05, "loss": 0.6777, "step": 5720 }, { "epoch": 2.9164823624878675, "grad_norm": 0.12440946698188782, "learning_rate": 4.991859789491031e-05, "loss": 0.6866, "step": 5730 }, { "epoch": 2.921573931168356, "grad_norm": 0.14024010300636292, "learning_rate": 4.9918455831899684e-05, "loss": 0.6786, "step": 5740 }, { "epoch": 2.926665499848844, "grad_norm": 0.1383139193058014, "learning_rate": 4.991831376888906e-05, "loss": 0.6843, "step": 5750 }, { "epoch": 2.931757068529332, "grad_norm": 0.16354554891586304, "learning_rate": 4.991817170587843e-05, "loss": 0.6788, "step": 5760 }, { "epoch": 2.9368486372098204, "grad_norm": 0.12788814306259155, "learning_rate": 4.9918029642867796e-05, "loss": 0.6803, "step": 5770 }, { "epoch": 2.9419402058903086, "grad_norm": 0.1226319745182991, "learning_rate": 4.991788757985717e-05, "loss": 0.6833, "step": 5780 }, { "epoch": 2.947031774570797, "grad_norm": 0.12122051417827606, "learning_rate": 4.991774551684654e-05, "loss": 0.6827, "step": 5790 }, { "epoch": 2.9521233432512846, "grad_norm": 0.12066159397363663, "learning_rate": 4.9917603453835916e-05, "loss": 0.6812, "step": 5800 }, { "epoch": 2.957214911931773, "grad_norm": 0.12547747790813446, "learning_rate": 4.991746139082529e-05, "loss": 0.6801, "step": 5810 }, { "epoch": 2.962306480612261, "grad_norm": 0.1140349805355072, "learning_rate": 4.991731932781466e-05, "loss": 0.6833, "step": 5820 }, { "epoch": 2.967398049292749, "grad_norm": 0.14640016853809357, "learning_rate": 4.9917177264804035e-05, "loss": 0.6806, "step": 5830 }, { "epoch": 2.9724896179732374, "grad_norm": 0.1226801946759224, "learning_rate": 4.991703520179341e-05, "loss": 0.6816, "step": 5840 }, { "epoch": 2.9775811866537256, "grad_norm": 0.1280628740787506, "learning_rate": 4.9916893138782775e-05, "loss": 0.6814, "step": 5850 }, { "epoch": 2.982672755334214, "grad_norm": 0.13127422332763672, "learning_rate": 4.991675107577215e-05, "loss": 0.683, "step": 5860 }, { "epoch": 2.987764324014702, "grad_norm": 0.10727940499782562, "learning_rate": 4.991660901276152e-05, "loss": 0.6824, "step": 5870 }, { "epoch": 2.9928558926951903, "grad_norm": 0.13203033804893494, "learning_rate": 4.9916466949750894e-05, "loss": 0.6825, "step": 5880 }, { "epoch": 2.997947461375678, "grad_norm": 0.1205354556441307, "learning_rate": 4.991632488674027e-05, "loss": 0.6868, "step": 5890 }, { "epoch": 3.002545784340244, "grad_norm": 0.1364830732345581, "learning_rate": 4.991618282372964e-05, "loss": 0.6114, "step": 5900 }, { "epoch": 3.007637353020732, "grad_norm": 0.1269853115081787, "learning_rate": 4.991604076071901e-05, "loss": 0.6826, "step": 5910 }, { "epoch": 3.0127289217012203, "grad_norm": 0.1348942220211029, "learning_rate": 4.9915898697708386e-05, "loss": 0.6837, "step": 5920 }, { "epoch": 3.0178204903817085, "grad_norm": 0.13320055603981018, "learning_rate": 4.991575663469776e-05, "loss": 0.6775, "step": 5930 }, { "epoch": 3.0229120590621967, "grad_norm": 0.11422030627727509, "learning_rate": 4.991561457168713e-05, "loss": 0.6801, "step": 5940 }, { "epoch": 3.028003627742685, "grad_norm": 0.10496284067630768, "learning_rate": 4.9915472508676506e-05, "loss": 0.6809, "step": 5950 }, { "epoch": 3.033095196423173, "grad_norm": 0.10586734861135483, "learning_rate": 4.991533044566588e-05, "loss": 0.6775, "step": 5960 }, { "epoch": 3.0381867651036614, "grad_norm": 0.13202211260795593, "learning_rate": 4.9915188382655245e-05, "loss": 0.6777, "step": 5970 }, { "epoch": 3.043278333784149, "grad_norm": 0.13048899173736572, "learning_rate": 4.991504631964462e-05, "loss": 0.6779, "step": 5980 }, { "epoch": 3.0483699024646373, "grad_norm": 0.12446481734514236, "learning_rate": 4.9914904256633984e-05, "loss": 0.6763, "step": 5990 }, { "epoch": 3.0534614711451256, "grad_norm": 0.10315615683794022, "learning_rate": 4.991476219362336e-05, "loss": 0.6877, "step": 6000 }, { "epoch": 3.0585530398256138, "grad_norm": 0.11032961308956146, "learning_rate": 4.991462013061273e-05, "loss": 0.6812, "step": 6010 }, { "epoch": 3.063644608506102, "grad_norm": 0.0968027114868164, "learning_rate": 4.9914478067602104e-05, "loss": 0.6818, "step": 6020 }, { "epoch": 3.06873617718659, "grad_norm": 0.11660617589950562, "learning_rate": 4.991433600459148e-05, "loss": 0.6806, "step": 6030 }, { "epoch": 3.0738277458670784, "grad_norm": 0.1213793009519577, "learning_rate": 4.991419394158085e-05, "loss": 0.6818, "step": 6040 }, { "epoch": 3.078919314547566, "grad_norm": 0.1275392472743988, "learning_rate": 4.991405187857022e-05, "loss": 0.6802, "step": 6050 }, { "epoch": 3.0840108832280544, "grad_norm": 0.1026177927851677, "learning_rate": 4.9913909815559596e-05, "loss": 0.6835, "step": 6060 }, { "epoch": 3.0891024519085426, "grad_norm": 0.10983236879110336, "learning_rate": 4.991376775254897e-05, "loss": 0.6838, "step": 6070 }, { "epoch": 3.094194020589031, "grad_norm": 0.11360979080200195, "learning_rate": 4.991362568953834e-05, "loss": 0.6806, "step": 6080 }, { "epoch": 3.099285589269519, "grad_norm": 0.1488681137561798, "learning_rate": 4.9913483626527716e-05, "loss": 0.6797, "step": 6090 }, { "epoch": 3.1043771579500072, "grad_norm": 0.13620369136333466, "learning_rate": 4.991334156351709e-05, "loss": 0.6871, "step": 6100 }, { "epoch": 3.1094687266304954, "grad_norm": 0.12065689265727997, "learning_rate": 4.9913199500506455e-05, "loss": 0.6792, "step": 6110 }, { "epoch": 3.1145602953109837, "grad_norm": 0.13917431235313416, "learning_rate": 4.991305743749583e-05, "loss": 0.676, "step": 6120 }, { "epoch": 3.1196518639914714, "grad_norm": 0.1255902796983719, "learning_rate": 4.99129153744852e-05, "loss": 0.6842, "step": 6130 }, { "epoch": 3.1247434326719596, "grad_norm": 0.11472214758396149, "learning_rate": 4.9912773311474574e-05, "loss": 0.6788, "step": 6140 }, { "epoch": 3.129835001352448, "grad_norm": 0.12614910304546356, "learning_rate": 4.991263124846395e-05, "loss": 0.6802, "step": 6150 }, { "epoch": 3.134926570032936, "grad_norm": 0.13529641926288605, "learning_rate": 4.991248918545332e-05, "loss": 0.6805, "step": 6160 }, { "epoch": 3.1400181387134243, "grad_norm": 0.11604179441928864, "learning_rate": 4.9912347122442694e-05, "loss": 0.6813, "step": 6170 }, { "epoch": 3.1451097073939125, "grad_norm": 0.12789122760295868, "learning_rate": 4.991220505943207e-05, "loss": 0.6845, "step": 6180 }, { "epoch": 3.1502012760744007, "grad_norm": 0.12958049774169922, "learning_rate": 4.991206299642143e-05, "loss": 0.6804, "step": 6190 }, { "epoch": 3.1552928447548885, "grad_norm": 0.10314188152551651, "learning_rate": 4.9911920933410806e-05, "loss": 0.6847, "step": 6200 }, { "epoch": 3.1603844134353767, "grad_norm": 0.10737662017345428, "learning_rate": 4.991177887040018e-05, "loss": 0.6808, "step": 6210 }, { "epoch": 3.165475982115865, "grad_norm": 0.15128542482852936, "learning_rate": 4.991163680738955e-05, "loss": 0.6789, "step": 6220 }, { "epoch": 3.170567550796353, "grad_norm": 0.11941689997911453, "learning_rate": 4.9911494744378925e-05, "loss": 0.6778, "step": 6230 }, { "epoch": 3.1756591194768413, "grad_norm": 0.15348762273788452, "learning_rate": 4.99113526813683e-05, "loss": 0.6799, "step": 6240 }, { "epoch": 3.1807506881573295, "grad_norm": 0.11959049850702286, "learning_rate": 4.9911210618357665e-05, "loss": 0.6776, "step": 6250 }, { "epoch": 3.1858422568378177, "grad_norm": 0.11588987708091736, "learning_rate": 4.991106855534704e-05, "loss": 0.688, "step": 6260 }, { "epoch": 3.190933825518306, "grad_norm": 0.09905340522527695, "learning_rate": 4.991092649233641e-05, "loss": 0.6845, "step": 6270 }, { "epoch": 3.196025394198794, "grad_norm": 0.11044521629810333, "learning_rate": 4.9910784429325784e-05, "loss": 0.6823, "step": 6280 }, { "epoch": 3.201116962879282, "grad_norm": 0.10236191004514694, "learning_rate": 4.991064236631516e-05, "loss": 0.6824, "step": 6290 }, { "epoch": 3.20620853155977, "grad_norm": 0.12017529457807541, "learning_rate": 4.991050030330453e-05, "loss": 0.682, "step": 6300 }, { "epoch": 3.2113001002402584, "grad_norm": 0.14782628417015076, "learning_rate": 4.9910358240293904e-05, "loss": 0.6781, "step": 6310 }, { "epoch": 3.2163916689207466, "grad_norm": 0.14653240144252777, "learning_rate": 4.991021617728328e-05, "loss": 0.6809, "step": 6320 }, { "epoch": 3.221483237601235, "grad_norm": 0.12069736421108246, "learning_rate": 4.991007411427265e-05, "loss": 0.6811, "step": 6330 }, { "epoch": 3.226574806281723, "grad_norm": 0.13772337138652802, "learning_rate": 4.990993205126202e-05, "loss": 0.6838, "step": 6340 }, { "epoch": 3.231666374962211, "grad_norm": 0.10374171286821365, "learning_rate": 4.9909789988251396e-05, "loss": 0.6836, "step": 6350 }, { "epoch": 3.236757943642699, "grad_norm": 0.11860493570566177, "learning_rate": 4.990964792524076e-05, "loss": 0.6778, "step": 6360 }, { "epoch": 3.241849512323187, "grad_norm": 0.1429886519908905, "learning_rate": 4.9909505862230135e-05, "loss": 0.6775, "step": 6370 }, { "epoch": 3.2469410810036754, "grad_norm": 0.1501941680908203, "learning_rate": 4.990936379921951e-05, "loss": 0.6804, "step": 6380 }, { "epoch": 3.2520326496841636, "grad_norm": 0.12676025927066803, "learning_rate": 4.990922173620888e-05, "loss": 0.6802, "step": 6390 }, { "epoch": 3.257124218364652, "grad_norm": 0.14346669614315033, "learning_rate": 4.990907967319825e-05, "loss": 0.6815, "step": 6400 }, { "epoch": 3.26221578704514, "grad_norm": 0.11594365537166595, "learning_rate": 4.990893761018762e-05, "loss": 0.6783, "step": 6410 }, { "epoch": 3.2673073557256282, "grad_norm": 0.12863503396511078, "learning_rate": 4.9908795547176994e-05, "loss": 0.68, "step": 6420 }, { "epoch": 3.272398924406116, "grad_norm": 0.13634729385375977, "learning_rate": 4.990865348416637e-05, "loss": 0.6795, "step": 6430 }, { "epoch": 3.2774904930866042, "grad_norm": 0.10696328431367874, "learning_rate": 4.990851142115574e-05, "loss": 0.6827, "step": 6440 }, { "epoch": 3.2825820617670924, "grad_norm": 0.1048332154750824, "learning_rate": 4.9908369358145113e-05, "loss": 0.6812, "step": 6450 }, { "epoch": 3.2876736304475807, "grad_norm": 0.09791410714387894, "learning_rate": 4.9908227295134487e-05, "loss": 0.6848, "step": 6460 }, { "epoch": 3.292765199128069, "grad_norm": 0.13385730981826782, "learning_rate": 4.990808523212386e-05, "loss": 0.6826, "step": 6470 }, { "epoch": 3.297856767808557, "grad_norm": 0.13646642863750458, "learning_rate": 4.990794316911323e-05, "loss": 0.6767, "step": 6480 }, { "epoch": 3.3029483364890453, "grad_norm": 0.14173270761966705, "learning_rate": 4.9907801106102606e-05, "loss": 0.6838, "step": 6490 }, { "epoch": 3.3080399051695335, "grad_norm": 0.14603695273399353, "learning_rate": 4.990765904309197e-05, "loss": 0.6766, "step": 6500 }, { "epoch": 3.3131314738500217, "grad_norm": 0.138224795460701, "learning_rate": 4.9907516980081345e-05, "loss": 0.6797, "step": 6510 }, { "epoch": 3.3182230425305095, "grad_norm": 0.11541623622179031, "learning_rate": 4.990737491707072e-05, "loss": 0.6814, "step": 6520 }, { "epoch": 3.3233146112109977, "grad_norm": 0.1160949096083641, "learning_rate": 4.990723285406009e-05, "loss": 0.6842, "step": 6530 }, { "epoch": 3.328406179891486, "grad_norm": 0.1572464108467102, "learning_rate": 4.9907090791049465e-05, "loss": 0.6783, "step": 6540 }, { "epoch": 3.333497748571974, "grad_norm": 0.13026835024356842, "learning_rate": 4.990694872803884e-05, "loss": 0.681, "step": 6550 }, { "epoch": 3.3385893172524623, "grad_norm": 0.11961708962917328, "learning_rate": 4.990680666502821e-05, "loss": 0.6807, "step": 6560 }, { "epoch": 3.3436808859329505, "grad_norm": 0.11406982690095901, "learning_rate": 4.9906664602017584e-05, "loss": 0.6795, "step": 6570 }, { "epoch": 3.3487724546134388, "grad_norm": 0.20744380354881287, "learning_rate": 4.990652253900696e-05, "loss": 0.6771, "step": 6580 }, { "epoch": 3.3538640232939265, "grad_norm": 0.11253584921360016, "learning_rate": 4.990638047599633e-05, "loss": 0.6802, "step": 6590 }, { "epoch": 3.3589555919744147, "grad_norm": 0.08123784512281418, "learning_rate": 4.99062384129857e-05, "loss": 0.6871, "step": 6600 }, { "epoch": 3.364047160654903, "grad_norm": 0.10802698135375977, "learning_rate": 4.990609634997507e-05, "loss": 0.683, "step": 6610 }, { "epoch": 3.369138729335391, "grad_norm": 0.11430787295103073, "learning_rate": 4.990595428696444e-05, "loss": 0.6821, "step": 6620 }, { "epoch": 3.3742302980158794, "grad_norm": 0.09323684871196747, "learning_rate": 4.9905812223953816e-05, "loss": 0.6836, "step": 6630 }, { "epoch": 3.3793218666963676, "grad_norm": 0.10404845327138901, "learning_rate": 4.990567016094318e-05, "loss": 0.6849, "step": 6640 }, { "epoch": 3.384413435376856, "grad_norm": 0.1404566615819931, "learning_rate": 4.9905528097932555e-05, "loss": 0.6769, "step": 6650 }, { "epoch": 3.3895050040573436, "grad_norm": 0.17702195048332214, "learning_rate": 4.990538603492193e-05, "loss": 0.6808, "step": 6660 }, { "epoch": 3.3945965727378318, "grad_norm": 0.1227133646607399, "learning_rate": 4.99052439719113e-05, "loss": 0.6855, "step": 6670 }, { "epoch": 3.39968814141832, "grad_norm": 0.0946226418018341, "learning_rate": 4.9905101908900675e-05, "loss": 0.6804, "step": 6680 }, { "epoch": 3.404779710098808, "grad_norm": 0.11467920988798141, "learning_rate": 4.990495984589005e-05, "loss": 0.677, "step": 6690 }, { "epoch": 3.4098712787792964, "grad_norm": 0.1885383576154709, "learning_rate": 4.990481778287942e-05, "loss": 0.6786, "step": 6700 }, { "epoch": 3.4149628474597846, "grad_norm": 0.0994097888469696, "learning_rate": 4.9904675719868794e-05, "loss": 0.6908, "step": 6710 }, { "epoch": 3.420054416140273, "grad_norm": 0.09989442676305771, "learning_rate": 4.990453365685817e-05, "loss": 0.6804, "step": 6720 }, { "epoch": 3.425145984820761, "grad_norm": 0.12362310290336609, "learning_rate": 4.990439159384754e-05, "loss": 0.6817, "step": 6730 }, { "epoch": 3.4302375535012493, "grad_norm": 0.13228794932365417, "learning_rate": 4.990424953083691e-05, "loss": 0.681, "step": 6740 }, { "epoch": 3.435329122181737, "grad_norm": 0.11642909795045853, "learning_rate": 4.9904107467826286e-05, "loss": 0.6825, "step": 6750 }, { "epoch": 3.4404206908622252, "grad_norm": 0.12548530101776123, "learning_rate": 4.990396540481565e-05, "loss": 0.6814, "step": 6760 }, { "epoch": 3.4455122595427135, "grad_norm": 0.11513999849557877, "learning_rate": 4.9903823341805026e-05, "loss": 0.6792, "step": 6770 }, { "epoch": 3.4506038282232017, "grad_norm": 0.12245498597621918, "learning_rate": 4.99036812787944e-05, "loss": 0.6771, "step": 6780 }, { "epoch": 3.45569539690369, "grad_norm": 0.12722285091876984, "learning_rate": 4.990353921578377e-05, "loss": 0.679, "step": 6790 }, { "epoch": 3.460786965584178, "grad_norm": 0.13212384283542633, "learning_rate": 4.9903397152773145e-05, "loss": 0.6818, "step": 6800 }, { "epoch": 3.4658785342646663, "grad_norm": 0.11193917691707611, "learning_rate": 4.990325508976252e-05, "loss": 0.6822, "step": 6810 }, { "epoch": 3.470970102945154, "grad_norm": 0.14051009714603424, "learning_rate": 4.9903113026751884e-05, "loss": 0.673, "step": 6820 }, { "epoch": 3.4760616716256423, "grad_norm": 0.16787344217300415, "learning_rate": 4.990297096374126e-05, "loss": 0.6834, "step": 6830 }, { "epoch": 3.4811532403061305, "grad_norm": 0.1313748061656952, "learning_rate": 4.990282890073063e-05, "loss": 0.6785, "step": 6840 }, { "epoch": 3.4862448089866187, "grad_norm": 0.13282889127731323, "learning_rate": 4.9902686837720004e-05, "loss": 0.6791, "step": 6850 }, { "epoch": 3.491336377667107, "grad_norm": 0.15743672847747803, "learning_rate": 4.990254477470938e-05, "loss": 0.685, "step": 6860 }, { "epoch": 3.496427946347595, "grad_norm": 0.09886245429515839, "learning_rate": 4.990240271169875e-05, "loss": 0.6831, "step": 6870 }, { "epoch": 3.5015195150280833, "grad_norm": 0.14891770482063293, "learning_rate": 4.990226064868812e-05, "loss": 0.681, "step": 6880 }, { "epoch": 3.506611083708571, "grad_norm": 0.13956576585769653, "learning_rate": 4.9902118585677496e-05, "loss": 0.6806, "step": 6890 }, { "epoch": 3.5117026523890598, "grad_norm": 0.1325678676366806, "learning_rate": 4.990197652266686e-05, "loss": 0.6809, "step": 6900 }, { "epoch": 3.5167942210695475, "grad_norm": 0.13164210319519043, "learning_rate": 4.9901834459656236e-05, "loss": 0.6822, "step": 6910 }, { "epoch": 3.5218857897500357, "grad_norm": 0.13481168448925018, "learning_rate": 4.990169239664561e-05, "loss": 0.673, "step": 6920 }, { "epoch": 3.526977358430524, "grad_norm": 0.16314196586608887, "learning_rate": 4.990155033363498e-05, "loss": 0.6768, "step": 6930 }, { "epoch": 3.532068927111012, "grad_norm": 0.1418369710445404, "learning_rate": 4.9901408270624355e-05, "loss": 0.6777, "step": 6940 }, { "epoch": 3.5371604957915004, "grad_norm": 0.12762701511383057, "learning_rate": 4.990126620761373e-05, "loss": 0.6788, "step": 6950 }, { "epoch": 3.542252064471988, "grad_norm": 0.10353351384401321, "learning_rate": 4.99011241446031e-05, "loss": 0.6858, "step": 6960 }, { "epoch": 3.547343633152477, "grad_norm": 0.0953698605298996, "learning_rate": 4.9900982081592474e-05, "loss": 0.6783, "step": 6970 }, { "epoch": 3.5524352018329646, "grad_norm": 0.10428538918495178, "learning_rate": 4.990084001858185e-05, "loss": 0.6844, "step": 6980 }, { "epoch": 3.557526770513453, "grad_norm": 0.11740399152040482, "learning_rate": 4.990069795557122e-05, "loss": 0.6838, "step": 6990 }, { "epoch": 3.562618339193941, "grad_norm": 0.12733303010463715, "learning_rate": 4.9900555892560594e-05, "loss": 0.6768, "step": 7000 }, { "epoch": 3.567709907874429, "grad_norm": 0.16426721215248108, "learning_rate": 4.990041382954997e-05, "loss": 0.683, "step": 7010 }, { "epoch": 3.5728014765549174, "grad_norm": 0.12947894632816315, "learning_rate": 4.990027176653933e-05, "loss": 0.6729, "step": 7020 }, { "epoch": 3.5778930452354056, "grad_norm": 0.15960286557674408, "learning_rate": 4.9900129703528706e-05, "loss": 0.679, "step": 7030 }, { "epoch": 3.582984613915894, "grad_norm": 0.12176317721605301, "learning_rate": 4.989998764051807e-05, "loss": 0.6832, "step": 7040 }, { "epoch": 3.5880761825963816, "grad_norm": 0.12822549045085907, "learning_rate": 4.9899845577507446e-05, "loss": 0.6807, "step": 7050 }, { "epoch": 3.59316775127687, "grad_norm": 0.09114730358123779, "learning_rate": 4.989970351449682e-05, "loss": 0.6837, "step": 7060 }, { "epoch": 3.598259319957358, "grad_norm": 0.11248596012592316, "learning_rate": 4.989956145148619e-05, "loss": 0.6773, "step": 7070 }, { "epoch": 3.6033508886378462, "grad_norm": 0.14381690323352814, "learning_rate": 4.9899419388475565e-05, "loss": 0.6763, "step": 7080 }, { "epoch": 3.6084424573183345, "grad_norm": 0.1576450616121292, "learning_rate": 4.989927732546494e-05, "loss": 0.6796, "step": 7090 }, { "epoch": 3.6135340259988227, "grad_norm": 0.12672173976898193, "learning_rate": 4.989913526245431e-05, "loss": 0.6773, "step": 7100 }, { "epoch": 3.618625594679311, "grad_norm": 0.10089720040559769, "learning_rate": 4.9898993199443684e-05, "loss": 0.6835, "step": 7110 }, { "epoch": 3.6237171633597987, "grad_norm": 0.10352669656276703, "learning_rate": 4.989885113643306e-05, "loss": 0.6804, "step": 7120 }, { "epoch": 3.6288087320402873, "grad_norm": 0.12168221920728683, "learning_rate": 4.989870907342243e-05, "loss": 0.6775, "step": 7130 }, { "epoch": 3.633900300720775, "grad_norm": 0.152724489569664, "learning_rate": 4.9898567010411804e-05, "loss": 0.6832, "step": 7140 }, { "epoch": 3.6389918694012633, "grad_norm": 0.10124222189188004, "learning_rate": 4.989842494740118e-05, "loss": 0.6824, "step": 7150 }, { "epoch": 3.6440834380817515, "grad_norm": 0.10840737819671631, "learning_rate": 4.989828288439054e-05, "loss": 0.6781, "step": 7160 }, { "epoch": 3.6491750067622397, "grad_norm": 0.10668514668941498, "learning_rate": 4.9898140821379916e-05, "loss": 0.6857, "step": 7170 }, { "epoch": 3.654266575442728, "grad_norm": 0.11429141461849213, "learning_rate": 4.989799875836929e-05, "loss": 0.6823, "step": 7180 }, { "epoch": 3.6593581441232157, "grad_norm": 0.1012284979224205, "learning_rate": 4.989785669535866e-05, "loss": 0.678, "step": 7190 }, { "epoch": 3.6644497128037044, "grad_norm": 0.15000002086162567, "learning_rate": 4.9897714632348035e-05, "loss": 0.6763, "step": 7200 }, { "epoch": 3.669541281484192, "grad_norm": 0.15613609552383423, "learning_rate": 4.989757256933741e-05, "loss": 0.6837, "step": 7210 }, { "epoch": 3.6746328501646803, "grad_norm": 0.13344906270503998, "learning_rate": 4.989743050632678e-05, "loss": 0.6841, "step": 7220 }, { "epoch": 3.6797244188451685, "grad_norm": 0.12140567600727081, "learning_rate": 4.9897288443316155e-05, "loss": 0.6792, "step": 7230 }, { "epoch": 3.6848159875256568, "grad_norm": 0.11317454278469086, "learning_rate": 4.989714638030552e-05, "loss": 0.6821, "step": 7240 }, { "epoch": 3.689907556206145, "grad_norm": 0.1328129768371582, "learning_rate": 4.9897004317294894e-05, "loss": 0.6842, "step": 7250 }, { "epoch": 3.694999124886633, "grad_norm": 0.1081654503941536, "learning_rate": 4.989686225428427e-05, "loss": 0.6796, "step": 7260 }, { "epoch": 3.7000906935671214, "grad_norm": 0.09531684964895248, "learning_rate": 4.989672019127364e-05, "loss": 0.6833, "step": 7270 }, { "epoch": 3.705182262247609, "grad_norm": 0.10997920483350754, "learning_rate": 4.9896578128263013e-05, "loss": 0.6795, "step": 7280 }, { "epoch": 3.7102738309280974, "grad_norm": 0.15568581223487854, "learning_rate": 4.9896436065252387e-05, "loss": 0.6804, "step": 7290 }, { "epoch": 3.7153653996085856, "grad_norm": 0.130909726023674, "learning_rate": 4.989629400224175e-05, "loss": 0.6814, "step": 7300 }, { "epoch": 3.720456968289074, "grad_norm": 0.13917888700962067, "learning_rate": 4.9896151939231126e-05, "loss": 0.6793, "step": 7310 }, { "epoch": 3.725548536969562, "grad_norm": 0.12968967854976654, "learning_rate": 4.98960098762205e-05, "loss": 0.6819, "step": 7320 }, { "epoch": 3.73064010565005, "grad_norm": 0.12175523489713669, "learning_rate": 4.989586781320987e-05, "loss": 0.6785, "step": 7330 }, { "epoch": 3.7357316743305384, "grad_norm": 0.12431439012289047, "learning_rate": 4.9895725750199245e-05, "loss": 0.6785, "step": 7340 }, { "epoch": 3.740823243011026, "grad_norm": 0.1398157924413681, "learning_rate": 4.989558368718862e-05, "loss": 0.6779, "step": 7350 }, { "epoch": 3.745914811691515, "grad_norm": 0.11357001215219498, "learning_rate": 4.989544162417799e-05, "loss": 0.685, "step": 7360 }, { "epoch": 3.7510063803720026, "grad_norm": 0.16288457810878754, "learning_rate": 4.9895299561167365e-05, "loss": 0.6811, "step": 7370 }, { "epoch": 3.756097949052491, "grad_norm": 0.11568481475114822, "learning_rate": 4.989515749815674e-05, "loss": 0.6796, "step": 7380 }, { "epoch": 3.761189517732979, "grad_norm": 0.15195196866989136, "learning_rate": 4.989501543514611e-05, "loss": 0.6777, "step": 7390 }, { "epoch": 3.7662810864134673, "grad_norm": 0.12881244719028473, "learning_rate": 4.9894873372135484e-05, "loss": 0.6775, "step": 7400 }, { "epoch": 3.7713726550939555, "grad_norm": 0.1401291787624359, "learning_rate": 4.989473130912485e-05, "loss": 0.6834, "step": 7410 }, { "epoch": 3.7764642237744432, "grad_norm": 0.12248072773218155, "learning_rate": 4.9894589246114223e-05, "loss": 0.6792, "step": 7420 }, { "epoch": 3.781555792454932, "grad_norm": 0.11089824140071869, "learning_rate": 4.9894447183103596e-05, "loss": 0.6819, "step": 7430 }, { "epoch": 3.7866473611354197, "grad_norm": 0.09657898545265198, "learning_rate": 4.989430512009297e-05, "loss": 0.6841, "step": 7440 }, { "epoch": 3.791738929815908, "grad_norm": 0.12385948747396469, "learning_rate": 4.9894163057082336e-05, "loss": 0.6795, "step": 7450 }, { "epoch": 3.796830498496396, "grad_norm": 0.10562111437320709, "learning_rate": 4.989402099407171e-05, "loss": 0.6783, "step": 7460 }, { "epoch": 3.8019220671768843, "grad_norm": 0.11349403858184814, "learning_rate": 4.989387893106108e-05, "loss": 0.6807, "step": 7470 }, { "epoch": 3.8070136358573725, "grad_norm": 0.11444567143917084, "learning_rate": 4.9893736868050455e-05, "loss": 0.6791, "step": 7480 }, { "epoch": 3.8121052045378607, "grad_norm": 0.1610439121723175, "learning_rate": 4.989359480503983e-05, "loss": 0.6812, "step": 7490 }, { "epoch": 3.817196773218349, "grad_norm": 0.1214766800403595, "learning_rate": 4.98934527420292e-05, "loss": 0.6817, "step": 7500 }, { "epoch": 3.8222883418988367, "grad_norm": 0.12765400111675262, "learning_rate": 4.9893310679018575e-05, "loss": 0.6787, "step": 7510 }, { "epoch": 3.827379910579325, "grad_norm": 0.10731592029333115, "learning_rate": 4.989316861600795e-05, "loss": 0.683, "step": 7520 }, { "epoch": 3.832471479259813, "grad_norm": 0.12986642122268677, "learning_rate": 4.989302655299732e-05, "loss": 0.6766, "step": 7530 }, { "epoch": 3.8375630479403013, "grad_norm": 0.12156540900468826, "learning_rate": 4.9892884489986694e-05, "loss": 0.6834, "step": 7540 }, { "epoch": 3.8426546166207896, "grad_norm": 0.10650958865880966, "learning_rate": 4.989274242697606e-05, "loss": 0.6821, "step": 7550 }, { "epoch": 3.8477461853012778, "grad_norm": 0.09265447407960892, "learning_rate": 4.989260036396543e-05, "loss": 0.6807, "step": 7560 }, { "epoch": 3.852837753981766, "grad_norm": 0.13007622957229614, "learning_rate": 4.9892458300954806e-05, "loss": 0.681, "step": 7570 }, { "epoch": 3.8579293226622537, "grad_norm": 0.1033967137336731, "learning_rate": 4.989231623794418e-05, "loss": 0.686, "step": 7580 }, { "epoch": 3.8630208913427424, "grad_norm": 0.10867638140916824, "learning_rate": 4.989217417493355e-05, "loss": 0.6796, "step": 7590 }, { "epoch": 3.86811246002323, "grad_norm": 0.105263352394104, "learning_rate": 4.9892032111922926e-05, "loss": 0.6824, "step": 7600 }, { "epoch": 3.8732040287037184, "grad_norm": 0.12403067946434021, "learning_rate": 4.98918900489123e-05, "loss": 0.6793, "step": 7610 }, { "epoch": 3.8782955973842066, "grad_norm": 0.09988098591566086, "learning_rate": 4.989174798590167e-05, "loss": 0.6842, "step": 7620 }, { "epoch": 3.883387166064695, "grad_norm": 0.13452745974063873, "learning_rate": 4.9891605922891045e-05, "loss": 0.6811, "step": 7630 }, { "epoch": 3.888478734745183, "grad_norm": 0.10854171961545944, "learning_rate": 4.989146385988042e-05, "loss": 0.6827, "step": 7640 }, { "epoch": 3.893570303425671, "grad_norm": 0.10819829255342484, "learning_rate": 4.9891321796869784e-05, "loss": 0.6796, "step": 7650 }, { "epoch": 3.8986618721061594, "grad_norm": 0.17421726882457733, "learning_rate": 4.989117973385916e-05, "loss": 0.6808, "step": 7660 }, { "epoch": 3.903753440786647, "grad_norm": 0.13020376861095428, "learning_rate": 4.989103767084853e-05, "loss": 0.6796, "step": 7670 }, { "epoch": 3.9088450094671354, "grad_norm": 0.10870732367038727, "learning_rate": 4.9890895607837904e-05, "loss": 0.6867, "step": 7680 }, { "epoch": 3.9139365781476236, "grad_norm": 0.10249564051628113, "learning_rate": 4.989075354482727e-05, "loss": 0.6847, "step": 7690 }, { "epoch": 3.919028146828112, "grad_norm": 0.09583424031734467, "learning_rate": 4.989061148181664e-05, "loss": 0.6837, "step": 7700 }, { "epoch": 3.9241197155086, "grad_norm": 0.10090246796607971, "learning_rate": 4.9890469418806016e-05, "loss": 0.6814, "step": 7710 }, { "epoch": 3.9292112841890883, "grad_norm": 0.1201721727848053, "learning_rate": 4.989032735579539e-05, "loss": 0.6843, "step": 7720 }, { "epoch": 3.9343028528695765, "grad_norm": 0.11703382432460785, "learning_rate": 4.989018529278476e-05, "loss": 0.6784, "step": 7730 }, { "epoch": 3.9393944215500643, "grad_norm": 0.1226707398891449, "learning_rate": 4.9890043229774136e-05, "loss": 0.6847, "step": 7740 }, { "epoch": 3.9444859902305525, "grad_norm": 0.09304598718881607, "learning_rate": 4.988990116676351e-05, "loss": 0.6812, "step": 7750 }, { "epoch": 3.9495775589110407, "grad_norm": 0.10586468130350113, "learning_rate": 4.988975910375288e-05, "loss": 0.679, "step": 7760 }, { "epoch": 3.954669127591529, "grad_norm": 0.10969860106706619, "learning_rate": 4.9889617040742255e-05, "loss": 0.6826, "step": 7770 }, { "epoch": 3.959760696272017, "grad_norm": 0.1249874085187912, "learning_rate": 4.988947497773163e-05, "loss": 0.6767, "step": 7780 }, { "epoch": 3.9648522649525053, "grad_norm": 0.16480816900730133, "learning_rate": 4.9889332914721e-05, "loss": 0.6796, "step": 7790 }, { "epoch": 3.9699438336329935, "grad_norm": 0.2025347650051117, "learning_rate": 4.9889190851710374e-05, "loss": 0.678, "step": 7800 }, { "epoch": 3.9750354023134813, "grad_norm": 0.1153530701994896, "learning_rate": 4.988904878869974e-05, "loss": 0.6812, "step": 7810 }, { "epoch": 3.98012697099397, "grad_norm": 0.12336631864309311, "learning_rate": 4.9888906725689114e-05, "loss": 0.6781, "step": 7820 }, { "epoch": 3.9852185396744577, "grad_norm": 0.1417071670293808, "learning_rate": 4.988876466267849e-05, "loss": 0.6796, "step": 7830 }, { "epoch": 3.990310108354946, "grad_norm": 0.12677961587905884, "learning_rate": 4.988862259966786e-05, "loss": 0.6819, "step": 7840 }, { "epoch": 3.995401677035434, "grad_norm": 0.1134430319070816, "learning_rate": 4.988848053665723e-05, "loss": 0.6802, "step": 7850 }, { "epoch": 4.0, "grad_norm": 0.021812934428453445, "learning_rate": 4.98883384736466e-05, "loss": 0.6148, "step": 7860 }, { "epoch": 4.005091568680488, "grad_norm": 0.11325574666261673, "learning_rate": 4.988819641063597e-05, "loss": 0.6823, "step": 7870 }, { "epoch": 4.010183137360976, "grad_norm": 0.12439537793397903, "learning_rate": 4.9888054347625346e-05, "loss": 0.6808, "step": 7880 }, { "epoch": 4.015274706041464, "grad_norm": 0.11274933069944382, "learning_rate": 4.988791228461472e-05, "loss": 0.6828, "step": 7890 }, { "epoch": 4.020366274721953, "grad_norm": 0.10643935203552246, "learning_rate": 4.988777022160409e-05, "loss": 0.6833, "step": 7900 }, { "epoch": 4.025457843402441, "grad_norm": 0.0944155901670456, "learning_rate": 4.9887628158593465e-05, "loss": 0.6842, "step": 7910 }, { "epoch": 4.030549412082929, "grad_norm": 0.12772725522518158, "learning_rate": 4.988748609558284e-05, "loss": 0.6764, "step": 7920 }, { "epoch": 4.035640980763417, "grad_norm": 0.19370485842227936, "learning_rate": 4.988734403257221e-05, "loss": 0.6845, "step": 7930 }, { "epoch": 4.040732549443905, "grad_norm": 0.13512100279331207, "learning_rate": 4.9887201969561584e-05, "loss": 0.6745, "step": 7940 }, { "epoch": 4.0458241181243935, "grad_norm": 0.13933135569095612, "learning_rate": 4.988705990655095e-05, "loss": 0.6842, "step": 7950 }, { "epoch": 4.050915686804881, "grad_norm": 0.13375182449817657, "learning_rate": 4.9886917843540324e-05, "loss": 0.6815, "step": 7960 }, { "epoch": 4.05600725548537, "grad_norm": 0.11060313135385513, "learning_rate": 4.98867757805297e-05, "loss": 0.6798, "step": 7970 }, { "epoch": 4.061098824165858, "grad_norm": 0.14003530144691467, "learning_rate": 4.988663371751907e-05, "loss": 0.6787, "step": 7980 }, { "epoch": 4.066190392846346, "grad_norm": 0.10484720021486282, "learning_rate": 4.988649165450844e-05, "loss": 0.6818, "step": 7990 }, { "epoch": 4.071281961526834, "grad_norm": 0.11415210366249084, "learning_rate": 4.9886349591497816e-05, "loss": 0.6804, "step": 8000 }, { "epoch": 4.076373530207323, "grad_norm": 0.1279604583978653, "learning_rate": 4.988620752848719e-05, "loss": 0.6793, "step": 8010 }, { "epoch": 4.0814650988878105, "grad_norm": 0.12138471007347107, "learning_rate": 4.988606546547656e-05, "loss": 0.6814, "step": 8020 }, { "epoch": 4.086556667568298, "grad_norm": 0.13427557051181793, "learning_rate": 4.9885923402465935e-05, "loss": 0.6752, "step": 8030 }, { "epoch": 4.091648236248787, "grad_norm": 0.14821045100688934, "learning_rate": 4.988578133945531e-05, "loss": 0.6775, "step": 8040 }, { "epoch": 4.096739804929275, "grad_norm": 0.13484236598014832, "learning_rate": 4.988563927644468e-05, "loss": 0.6846, "step": 8050 }, { "epoch": 4.101831373609763, "grad_norm": 0.07954470813274384, "learning_rate": 4.9885497213434055e-05, "loss": 0.684, "step": 8060 }, { "epoch": 4.106922942290251, "grad_norm": 0.10616060346364975, "learning_rate": 4.988535515042342e-05, "loss": 0.6822, "step": 8070 }, { "epoch": 4.11201451097074, "grad_norm": 0.10499216616153717, "learning_rate": 4.9885213087412794e-05, "loss": 0.6798, "step": 8080 }, { "epoch": 4.1171060796512275, "grad_norm": 0.12274570018053055, "learning_rate": 4.988507102440216e-05, "loss": 0.6799, "step": 8090 }, { "epoch": 4.122197648331715, "grad_norm": 0.11465749889612198, "learning_rate": 4.9884928961391534e-05, "loss": 0.6817, "step": 8100 }, { "epoch": 4.127289217012204, "grad_norm": 0.09962257742881775, "learning_rate": 4.988478689838091e-05, "loss": 0.6844, "step": 8110 }, { "epoch": 4.132380785692692, "grad_norm": 0.1151047945022583, "learning_rate": 4.988464483537028e-05, "loss": 0.6787, "step": 8120 }, { "epoch": 4.13747235437318, "grad_norm": 0.1360507756471634, "learning_rate": 4.988450277235965e-05, "loss": 0.6791, "step": 8130 }, { "epoch": 4.142563923053668, "grad_norm": 0.16751664876937866, "learning_rate": 4.9884360709349026e-05, "loss": 0.6738, "step": 8140 }, { "epoch": 4.147655491734157, "grad_norm": 0.18576379120349884, "learning_rate": 4.98842186463384e-05, "loss": 0.678, "step": 8150 }, { "epoch": 4.152747060414645, "grad_norm": 0.12279310077428818, "learning_rate": 4.988407658332777e-05, "loss": 0.6786, "step": 8160 }, { "epoch": 4.157838629095132, "grad_norm": 0.14428728818893433, "learning_rate": 4.9883934520317145e-05, "loss": 0.6756, "step": 8170 }, { "epoch": 4.162930197775621, "grad_norm": 0.1211373507976532, "learning_rate": 4.988379245730652e-05, "loss": 0.6775, "step": 8180 }, { "epoch": 4.168021766456109, "grad_norm": 0.13393299281597137, "learning_rate": 4.988365039429589e-05, "loss": 0.6769, "step": 8190 }, { "epoch": 4.173113335136597, "grad_norm": 0.12077504396438599, "learning_rate": 4.9883508331285265e-05, "loss": 0.6829, "step": 8200 }, { "epoch": 4.178204903817085, "grad_norm": 0.10940321534872055, "learning_rate": 4.988336626827463e-05, "loss": 0.6809, "step": 8210 }, { "epoch": 4.183296472497574, "grad_norm": 0.09884709119796753, "learning_rate": 4.9883224205264004e-05, "loss": 0.6813, "step": 8220 }, { "epoch": 4.188388041178062, "grad_norm": 0.10086120665073395, "learning_rate": 4.988308214225338e-05, "loss": 0.6809, "step": 8230 }, { "epoch": 4.19347960985855, "grad_norm": 0.11668648570775986, "learning_rate": 4.988294007924275e-05, "loss": 0.6798, "step": 8240 }, { "epoch": 4.198571178539038, "grad_norm": 0.12528111040592194, "learning_rate": 4.9882798016232123e-05, "loss": 0.6765, "step": 8250 }, { "epoch": 4.203662747219526, "grad_norm": 0.11714299023151398, "learning_rate": 4.9882655953221497e-05, "loss": 0.6744, "step": 8260 }, { "epoch": 4.2087543159000145, "grad_norm": 0.11050295829772949, "learning_rate": 4.988251389021087e-05, "loss": 0.6874, "step": 8270 }, { "epoch": 4.213845884580502, "grad_norm": 0.09499291330575943, "learning_rate": 4.9882371827200236e-05, "loss": 0.6817, "step": 8280 }, { "epoch": 4.218937453260991, "grad_norm": 0.09335146099328995, "learning_rate": 4.988222976418961e-05, "loss": 0.6818, "step": 8290 }, { "epoch": 4.224029021941479, "grad_norm": 0.1219559907913208, "learning_rate": 4.988208770117898e-05, "loss": 0.681, "step": 8300 }, { "epoch": 4.229120590621967, "grad_norm": 0.14629492163658142, "learning_rate": 4.9881945638168355e-05, "loss": 0.6822, "step": 8310 }, { "epoch": 4.234212159302455, "grad_norm": 0.13365550339221954, "learning_rate": 4.988180357515773e-05, "loss": 0.6805, "step": 8320 }, { "epoch": 4.239303727982943, "grad_norm": 0.141509547829628, "learning_rate": 4.98816615121471e-05, "loss": 0.6783, "step": 8330 }, { "epoch": 4.2443952966634315, "grad_norm": 0.13036063313484192, "learning_rate": 4.9881519449136475e-05, "loss": 0.6756, "step": 8340 }, { "epoch": 4.249486865343919, "grad_norm": 0.11939451843500137, "learning_rate": 4.988137738612584e-05, "loss": 0.6826, "step": 8350 }, { "epoch": 4.254578434024408, "grad_norm": 0.12008455395698547, "learning_rate": 4.9881235323115214e-05, "loss": 0.681, "step": 8360 }, { "epoch": 4.259670002704896, "grad_norm": 0.11019112914800644, "learning_rate": 4.988109326010459e-05, "loss": 0.6856, "step": 8370 }, { "epoch": 4.264761571385384, "grad_norm": 0.10078281164169312, "learning_rate": 4.988095119709396e-05, "loss": 0.6792, "step": 8380 }, { "epoch": 4.269853140065872, "grad_norm": 0.1294504553079605, "learning_rate": 4.988080913408333e-05, "loss": 0.6825, "step": 8390 }, { "epoch": 4.274944708746361, "grad_norm": 0.1074661836028099, "learning_rate": 4.9880667071072706e-05, "loss": 0.6799, "step": 8400 }, { "epoch": 4.280036277426849, "grad_norm": 0.11285123229026794, "learning_rate": 4.988052500806208e-05, "loss": 0.6831, "step": 8410 }, { "epoch": 4.285127846107336, "grad_norm": 0.12429996579885483, "learning_rate": 4.988038294505145e-05, "loss": 0.6793, "step": 8420 }, { "epoch": 4.290219414787825, "grad_norm": 0.10803820192813873, "learning_rate": 4.9880240882040826e-05, "loss": 0.6787, "step": 8430 }, { "epoch": 4.295310983468313, "grad_norm": 0.12693211436271667, "learning_rate": 4.98800988190302e-05, "loss": 0.6729, "step": 8440 }, { "epoch": 4.300402552148801, "grad_norm": 0.11828629672527313, "learning_rate": 4.987995675601957e-05, "loss": 0.6829, "step": 8450 }, { "epoch": 4.305494120829289, "grad_norm": 0.11893879622220993, "learning_rate": 4.987981469300894e-05, "loss": 0.681, "step": 8460 }, { "epoch": 4.310585689509777, "grad_norm": 0.12228237092494965, "learning_rate": 4.987967262999831e-05, "loss": 0.6793, "step": 8470 }, { "epoch": 4.315677258190266, "grad_norm": 0.11881165206432343, "learning_rate": 4.9879530566987684e-05, "loss": 0.681, "step": 8480 }, { "epoch": 4.320768826870753, "grad_norm": 0.09753947705030441, "learning_rate": 4.987938850397705e-05, "loss": 0.6812, "step": 8490 }, { "epoch": 4.325860395551242, "grad_norm": 0.10875561088323593, "learning_rate": 4.9879246440966424e-05, "loss": 0.6764, "step": 8500 }, { "epoch": 4.33095196423173, "grad_norm": 0.1029878631234169, "learning_rate": 4.98791043779558e-05, "loss": 0.6793, "step": 8510 }, { "epoch": 4.3360435329122184, "grad_norm": 0.11321298032999039, "learning_rate": 4.987896231494517e-05, "loss": 0.6805, "step": 8520 }, { "epoch": 4.341135101592706, "grad_norm": 0.12302636355161667, "learning_rate": 4.987882025193454e-05, "loss": 0.6841, "step": 8530 }, { "epoch": 4.346226670273195, "grad_norm": 0.0927717313170433, "learning_rate": 4.9878678188923916e-05, "loss": 0.6848, "step": 8540 }, { "epoch": 4.351318238953683, "grad_norm": 0.1418168693780899, "learning_rate": 4.987853612591329e-05, "loss": 0.6764, "step": 8550 }, { "epoch": 4.35640980763417, "grad_norm": 0.12036493420600891, "learning_rate": 4.987839406290266e-05, "loss": 0.6783, "step": 8560 }, { "epoch": 4.361501376314659, "grad_norm": 0.14609991014003754, "learning_rate": 4.9878251999892036e-05, "loss": 0.676, "step": 8570 }, { "epoch": 4.366592944995147, "grad_norm": 0.1448822170495987, "learning_rate": 4.987810993688141e-05, "loss": 0.6803, "step": 8580 }, { "epoch": 4.3716845136756355, "grad_norm": 0.14650079607963562, "learning_rate": 4.987796787387078e-05, "loss": 0.6795, "step": 8590 }, { "epoch": 4.376776082356123, "grad_norm": 0.10146970301866531, "learning_rate": 4.987782581086015e-05, "loss": 0.6826, "step": 8600 }, { "epoch": 4.381867651036612, "grad_norm": 0.10098574310541153, "learning_rate": 4.987768374784952e-05, "loss": 0.6814, "step": 8610 }, { "epoch": 4.3869592197171, "grad_norm": 0.12981392443180084, "learning_rate": 4.9877541684838894e-05, "loss": 0.6774, "step": 8620 }, { "epoch": 4.392050788397588, "grad_norm": 0.1231103241443634, "learning_rate": 4.987739962182827e-05, "loss": 0.6751, "step": 8630 }, { "epoch": 4.397142357078076, "grad_norm": 0.17549310624599457, "learning_rate": 4.987725755881764e-05, "loss": 0.6773, "step": 8640 }, { "epoch": 4.402233925758564, "grad_norm": 0.1261102259159088, "learning_rate": 4.9877115495807014e-05, "loss": 0.6778, "step": 8650 }, { "epoch": 4.4073254944390525, "grad_norm": 0.12228421121835709, "learning_rate": 4.987697343279639e-05, "loss": 0.6815, "step": 8660 }, { "epoch": 4.41241706311954, "grad_norm": 0.08992882072925568, "learning_rate": 4.987683136978576e-05, "loss": 0.6829, "step": 8670 }, { "epoch": 4.417508631800029, "grad_norm": 0.10478372871875763, "learning_rate": 4.987668930677513e-05, "loss": 0.6758, "step": 8680 }, { "epoch": 4.422600200480517, "grad_norm": 0.1255083978176117, "learning_rate": 4.9876547243764506e-05, "loss": 0.6741, "step": 8690 }, { "epoch": 4.4276917691610045, "grad_norm": 0.13139568269252777, "learning_rate": 4.987640518075387e-05, "loss": 0.6803, "step": 8700 }, { "epoch": 4.432783337841493, "grad_norm": 0.1472860723733902, "learning_rate": 4.9876263117743246e-05, "loss": 0.6759, "step": 8710 }, { "epoch": 4.437874906521981, "grad_norm": 0.16318807005882263, "learning_rate": 4.987612105473262e-05, "loss": 0.6868, "step": 8720 }, { "epoch": 4.44296647520247, "grad_norm": 0.1145109310746193, "learning_rate": 4.987597899172199e-05, "loss": 0.6788, "step": 8730 }, { "epoch": 4.448058043882957, "grad_norm": 0.09544923901557922, "learning_rate": 4.987583692871136e-05, "loss": 0.682, "step": 8740 }, { "epoch": 4.453149612563446, "grad_norm": 0.10780615359544754, "learning_rate": 4.987569486570073e-05, "loss": 0.6781, "step": 8750 }, { "epoch": 4.458241181243934, "grad_norm": 0.14260242879390717, "learning_rate": 4.9875552802690104e-05, "loss": 0.6782, "step": 8760 }, { "epoch": 4.463332749924422, "grad_norm": 0.13693778216838837, "learning_rate": 4.987541073967948e-05, "loss": 0.6771, "step": 8770 }, { "epoch": 4.46842431860491, "grad_norm": 0.10794325917959213, "learning_rate": 4.987526867666885e-05, "loss": 0.6789, "step": 8780 }, { "epoch": 4.473515887285398, "grad_norm": 0.11324315518140793, "learning_rate": 4.9875126613658224e-05, "loss": 0.684, "step": 8790 }, { "epoch": 4.478607455965887, "grad_norm": 0.10087355971336365, "learning_rate": 4.98749845506476e-05, "loss": 0.6819, "step": 8800 }, { "epoch": 4.483699024646374, "grad_norm": 0.09752973914146423, "learning_rate": 4.987484248763697e-05, "loss": 0.6819, "step": 8810 }, { "epoch": 4.488790593326863, "grad_norm": 0.12462896853685379, "learning_rate": 4.987470042462634e-05, "loss": 0.6844, "step": 8820 }, { "epoch": 4.493882162007351, "grad_norm": 0.12875770032405853, "learning_rate": 4.9874558361615716e-05, "loss": 0.6784, "step": 8830 }, { "epoch": 4.4989737306878395, "grad_norm": 0.11722705513238907, "learning_rate": 4.987441629860509e-05, "loss": 0.6797, "step": 8840 }, { "epoch": 4.504065299368327, "grad_norm": 0.16931360960006714, "learning_rate": 4.987427423559446e-05, "loss": 0.6766, "step": 8850 }, { "epoch": 4.509156868048816, "grad_norm": 0.13619418442249298, "learning_rate": 4.987413217258383e-05, "loss": 0.6774, "step": 8860 }, { "epoch": 4.514248436729304, "grad_norm": 0.19465768337249756, "learning_rate": 4.98739901095732e-05, "loss": 0.6832, "step": 8870 }, { "epoch": 4.519340005409791, "grad_norm": 0.11889132857322693, "learning_rate": 4.9873848046562575e-05, "loss": 0.6848, "step": 8880 }, { "epoch": 4.52443157409028, "grad_norm": 0.10783824324607849, "learning_rate": 4.987370598355195e-05, "loss": 0.6793, "step": 8890 }, { "epoch": 4.529523142770768, "grad_norm": 0.11385292559862137, "learning_rate": 4.987356392054132e-05, "loss": 0.6754, "step": 8900 }, { "epoch": 4.5346147114512565, "grad_norm": 0.13017722964286804, "learning_rate": 4.987342185753069e-05, "loss": 0.6778, "step": 8910 }, { "epoch": 4.539706280131744, "grad_norm": 0.13603904843330383, "learning_rate": 4.987327979452006e-05, "loss": 0.6758, "step": 8920 }, { "epoch": 4.544797848812232, "grad_norm": 0.15172545611858368, "learning_rate": 4.9873137731509434e-05, "loss": 0.677, "step": 8930 }, { "epoch": 4.549889417492721, "grad_norm": 0.13269858062267303, "learning_rate": 4.987299566849881e-05, "loss": 0.6823, "step": 8940 }, { "epoch": 4.5549809861732085, "grad_norm": 0.14247867465019226, "learning_rate": 4.987285360548818e-05, "loss": 0.6803, "step": 8950 }, { "epoch": 4.560072554853697, "grad_norm": 0.1458357870578766, "learning_rate": 4.987271154247755e-05, "loss": 0.6755, "step": 8960 }, { "epoch": 4.565164123534185, "grad_norm": 0.1240466758608818, "learning_rate": 4.9872569479466926e-05, "loss": 0.6793, "step": 8970 }, { "epoch": 4.5702556922146735, "grad_norm": 0.14014077186584473, "learning_rate": 4.98724274164563e-05, "loss": 0.6812, "step": 8980 }, { "epoch": 4.575347260895161, "grad_norm": 0.1574947088956833, "learning_rate": 4.987228535344567e-05, "loss": 0.6752, "step": 8990 }, { "epoch": 4.58043882957565, "grad_norm": 0.12997229397296906, "learning_rate": 4.987214329043504e-05, "loss": 0.6853, "step": 9000 }, { "epoch": 4.585530398256138, "grad_norm": 0.11148348450660706, "learning_rate": 4.987200122742441e-05, "loss": 0.6782, "step": 9010 }, { "epoch": 4.5906219669366255, "grad_norm": 0.13387084007263184, "learning_rate": 4.9871859164413785e-05, "loss": 0.6798, "step": 9020 }, { "epoch": 4.595713535617114, "grad_norm": 0.16059359908103943, "learning_rate": 4.987171710140316e-05, "loss": 0.6815, "step": 9030 }, { "epoch": 4.600805104297602, "grad_norm": 0.15377014875411987, "learning_rate": 4.987157503839253e-05, "loss": 0.6813, "step": 9040 }, { "epoch": 4.605896672978091, "grad_norm": 0.13581454753875732, "learning_rate": 4.9871432975381904e-05, "loss": 0.6776, "step": 9050 }, { "epoch": 4.610988241658578, "grad_norm": 0.11781629174947739, "learning_rate": 4.987129091237128e-05, "loss": 0.6778, "step": 9060 }, { "epoch": 4.616079810339067, "grad_norm": 0.15693874657154083, "learning_rate": 4.987114884936065e-05, "loss": 0.6785, "step": 9070 }, { "epoch": 4.621171379019555, "grad_norm": 0.1455591917037964, "learning_rate": 4.9871006786350023e-05, "loss": 0.683, "step": 9080 }, { "epoch": 4.626262947700043, "grad_norm": 0.10115326195955276, "learning_rate": 4.9870864723339397e-05, "loss": 0.6816, "step": 9090 }, { "epoch": 4.631354516380531, "grad_norm": 0.10945667326450348, "learning_rate": 4.987072266032877e-05, "loss": 0.6853, "step": 9100 }, { "epoch": 4.636446085061019, "grad_norm": 0.11783566325902939, "learning_rate": 4.987058059731814e-05, "loss": 0.6825, "step": 9110 }, { "epoch": 4.641537653741508, "grad_norm": 0.1183709055185318, "learning_rate": 4.987043853430751e-05, "loss": 0.6794, "step": 9120 }, { "epoch": 4.646629222421995, "grad_norm": 0.17861825227737427, "learning_rate": 4.987029647129688e-05, "loss": 0.6812, "step": 9130 }, { "epoch": 4.651720791102484, "grad_norm": 0.1105700135231018, "learning_rate": 4.987015440828625e-05, "loss": 0.6853, "step": 9140 }, { "epoch": 4.656812359782972, "grad_norm": 0.13059043884277344, "learning_rate": 4.987001234527562e-05, "loss": 0.6825, "step": 9150 }, { "epoch": 4.66190392846346, "grad_norm": 0.10306143015623093, "learning_rate": 4.9869870282264995e-05, "loss": 0.6825, "step": 9160 }, { "epoch": 4.666995497143948, "grad_norm": 0.1366746723651886, "learning_rate": 4.986972821925437e-05, "loss": 0.6769, "step": 9170 }, { "epoch": 4.672087065824436, "grad_norm": 0.15557105839252472, "learning_rate": 4.986958615624374e-05, "loss": 0.6811, "step": 9180 }, { "epoch": 4.677178634504925, "grad_norm": 0.1473141759634018, "learning_rate": 4.9869444093233114e-05, "loss": 0.6843, "step": 9190 }, { "epoch": 4.682270203185412, "grad_norm": 0.16388468444347382, "learning_rate": 4.986930203022249e-05, "loss": 0.6751, "step": 9200 }, { "epoch": 4.687361771865901, "grad_norm": 0.15377168357372284, "learning_rate": 4.986915996721186e-05, "loss": 0.68, "step": 9210 }, { "epoch": 4.692453340546389, "grad_norm": 0.14194439351558685, "learning_rate": 4.986901790420123e-05, "loss": 0.6762, "step": 9220 }, { "epoch": 4.6975449092268775, "grad_norm": 0.1327824741601944, "learning_rate": 4.9868875841190606e-05, "loss": 0.6837, "step": 9230 }, { "epoch": 4.702636477907365, "grad_norm": 0.13738127052783966, "learning_rate": 4.986873377817998e-05, "loss": 0.6785, "step": 9240 }, { "epoch": 4.707728046587853, "grad_norm": 0.17268739640712738, "learning_rate": 4.986859171516935e-05, "loss": 0.6769, "step": 9250 }, { "epoch": 4.712819615268342, "grad_norm": 0.14373987913131714, "learning_rate": 4.986844965215872e-05, "loss": 0.6806, "step": 9260 }, { "epoch": 4.7179111839488295, "grad_norm": 0.11402563005685806, "learning_rate": 4.986830758914809e-05, "loss": 0.681, "step": 9270 }, { "epoch": 4.723002752629318, "grad_norm": 0.12297854572534561, "learning_rate": 4.9868165526137465e-05, "loss": 0.6814, "step": 9280 }, { "epoch": 4.728094321309806, "grad_norm": 0.10925690084695816, "learning_rate": 4.986802346312684e-05, "loss": 0.6835, "step": 9290 }, { "epoch": 4.7331858899902945, "grad_norm": 0.1584441214799881, "learning_rate": 4.986788140011621e-05, "loss": 0.6795, "step": 9300 }, { "epoch": 4.738277458670782, "grad_norm": 0.1546424776315689, "learning_rate": 4.9867739337105585e-05, "loss": 0.6804, "step": 9310 }, { "epoch": 4.743369027351271, "grad_norm": 0.10821778327226639, "learning_rate": 4.986759727409496e-05, "loss": 0.6837, "step": 9320 }, { "epoch": 4.748460596031759, "grad_norm": 0.13283872604370117, "learning_rate": 4.9867455211084324e-05, "loss": 0.6806, "step": 9330 }, { "epoch": 4.7535521647122465, "grad_norm": 0.14704841375350952, "learning_rate": 4.98673131480737e-05, "loss": 0.68, "step": 9340 }, { "epoch": 4.758643733392735, "grad_norm": 0.13948886096477509, "learning_rate": 4.986717108506307e-05, "loss": 0.6737, "step": 9350 }, { "epoch": 4.763735302073223, "grad_norm": 0.1441805213689804, "learning_rate": 4.986702902205244e-05, "loss": 0.6791, "step": 9360 }, { "epoch": 4.768826870753712, "grad_norm": 0.15041285753250122, "learning_rate": 4.9866886959041816e-05, "loss": 0.6772, "step": 9370 }, { "epoch": 4.773918439434199, "grad_norm": 0.1656763106584549, "learning_rate": 4.986674489603119e-05, "loss": 0.6844, "step": 9380 }, { "epoch": 4.779010008114687, "grad_norm": 0.1404283046722412, "learning_rate": 4.986660283302056e-05, "loss": 0.6837, "step": 9390 }, { "epoch": 4.784101576795176, "grad_norm": 0.1178780272603035, "learning_rate": 4.986646077000993e-05, "loss": 0.6822, "step": 9400 }, { "epoch": 4.7891931454756635, "grad_norm": 0.11357172578573227, "learning_rate": 4.98663187069993e-05, "loss": 0.6811, "step": 9410 }, { "epoch": 4.794284714156152, "grad_norm": 0.12318674474954605, "learning_rate": 4.9866176643988675e-05, "loss": 0.6798, "step": 9420 }, { "epoch": 4.79937628283664, "grad_norm": 0.09487531334161758, "learning_rate": 4.986603458097805e-05, "loss": 0.6825, "step": 9430 }, { "epoch": 4.804467851517129, "grad_norm": 0.09417689591646194, "learning_rate": 4.986589251796742e-05, "loss": 0.6828, "step": 9440 }, { "epoch": 4.809559420197616, "grad_norm": 0.10734029114246368, "learning_rate": 4.9865750454956794e-05, "loss": 0.6821, "step": 9450 }, { "epoch": 4.814650988878105, "grad_norm": 0.10005868971347809, "learning_rate": 4.986560839194617e-05, "loss": 0.687, "step": 9460 }, { "epoch": 4.819742557558593, "grad_norm": 0.11884880065917969, "learning_rate": 4.986546632893554e-05, "loss": 0.679, "step": 9470 }, { "epoch": 4.824834126239081, "grad_norm": 0.10700765252113342, "learning_rate": 4.9865324265924914e-05, "loss": 0.679, "step": 9480 }, { "epoch": 4.829925694919569, "grad_norm": 0.1253756880760193, "learning_rate": 4.986518220291429e-05, "loss": 0.6824, "step": 9490 }, { "epoch": 4.835017263600057, "grad_norm": 0.13005779683589935, "learning_rate": 4.986504013990366e-05, "loss": 0.6773, "step": 9500 }, { "epoch": 4.840108832280546, "grad_norm": 0.1245838925242424, "learning_rate": 4.9864898076893026e-05, "loss": 0.6778, "step": 9510 }, { "epoch": 4.845200400961033, "grad_norm": 0.13099046051502228, "learning_rate": 4.98647560138824e-05, "loss": 0.6819, "step": 9520 }, { "epoch": 4.850291969641522, "grad_norm": 0.10995706915855408, "learning_rate": 4.986461395087177e-05, "loss": 0.6806, "step": 9530 }, { "epoch": 4.85538353832201, "grad_norm": 0.10981863737106323, "learning_rate": 4.986447188786114e-05, "loss": 0.6768, "step": 9540 }, { "epoch": 4.8604751070024985, "grad_norm": 0.10785161703824997, "learning_rate": 4.986432982485051e-05, "loss": 0.6815, "step": 9550 }, { "epoch": 4.865566675682986, "grad_norm": 0.11493176966905594, "learning_rate": 4.9864187761839885e-05, "loss": 0.6803, "step": 9560 }, { "epoch": 4.870658244363474, "grad_norm": 0.13624422252178192, "learning_rate": 4.986404569882926e-05, "loss": 0.679, "step": 9570 }, { "epoch": 4.875749813043963, "grad_norm": 0.12251431494951248, "learning_rate": 4.986390363581863e-05, "loss": 0.68, "step": 9580 }, { "epoch": 4.8808413817244505, "grad_norm": 0.15482662618160248, "learning_rate": 4.9863761572808004e-05, "loss": 0.6827, "step": 9590 }, { "epoch": 4.885932950404939, "grad_norm": 0.08389197289943695, "learning_rate": 4.986361950979738e-05, "loss": 0.6831, "step": 9600 }, { "epoch": 4.891024519085427, "grad_norm": 0.1233370378613472, "learning_rate": 4.986347744678675e-05, "loss": 0.6811, "step": 9610 }, { "epoch": 4.896116087765915, "grad_norm": 0.11783581227064133, "learning_rate": 4.9863335383776124e-05, "loss": 0.6854, "step": 9620 }, { "epoch": 4.901207656446403, "grad_norm": 0.10777773708105087, "learning_rate": 4.98631933207655e-05, "loss": 0.6787, "step": 9630 }, { "epoch": 4.906299225126891, "grad_norm": 0.14652119576931, "learning_rate": 4.986305125775487e-05, "loss": 0.6797, "step": 9640 }, { "epoch": 4.91139079380738, "grad_norm": 0.11962393671274185, "learning_rate": 4.9862909194744236e-05, "loss": 0.6832, "step": 9650 }, { "epoch": 4.9164823624878675, "grad_norm": 0.11764557659626007, "learning_rate": 4.986276713173361e-05, "loss": 0.677, "step": 9660 }, { "epoch": 4.921573931168356, "grad_norm": 0.13469521701335907, "learning_rate": 4.986262506872298e-05, "loss": 0.6759, "step": 9670 }, { "epoch": 4.926665499848844, "grad_norm": 0.11636529117822647, "learning_rate": 4.9862483005712356e-05, "loss": 0.6789, "step": 9680 }, { "epoch": 4.931757068529333, "grad_norm": 0.15902294218540192, "learning_rate": 4.986234094270173e-05, "loss": 0.6758, "step": 9690 }, { "epoch": 4.93684863720982, "grad_norm": 0.13991579413414001, "learning_rate": 4.98621988796911e-05, "loss": 0.6839, "step": 9700 }, { "epoch": 4.941940205890308, "grad_norm": 0.12394755333662033, "learning_rate": 4.9862056816680475e-05, "loss": 0.6823, "step": 9710 }, { "epoch": 4.947031774570797, "grad_norm": 0.11160258948802948, "learning_rate": 4.986191475366985e-05, "loss": 0.6772, "step": 9720 }, { "epoch": 4.952123343251285, "grad_norm": 0.11390865594148636, "learning_rate": 4.986177269065922e-05, "loss": 0.6777, "step": 9730 }, { "epoch": 4.957214911931773, "grad_norm": 0.14337550103664398, "learning_rate": 4.9861630627648594e-05, "loss": 0.676, "step": 9740 }, { "epoch": 4.962306480612261, "grad_norm": 0.1478574424982071, "learning_rate": 4.986148856463796e-05, "loss": 0.6804, "step": 9750 }, { "epoch": 4.96739804929275, "grad_norm": 0.09173934161663055, "learning_rate": 4.9861346501627334e-05, "loss": 0.6834, "step": 9760 }, { "epoch": 4.972489617973237, "grad_norm": 0.10893456637859344, "learning_rate": 4.986120443861671e-05, "loss": 0.6796, "step": 9770 }, { "epoch": 4.977581186653726, "grad_norm": 0.10967724025249481, "learning_rate": 4.986106237560608e-05, "loss": 0.6804, "step": 9780 }, { "epoch": 4.982672755334214, "grad_norm": 0.11746654659509659, "learning_rate": 4.9860920312595446e-05, "loss": 0.6807, "step": 9790 }, { "epoch": 4.987764324014702, "grad_norm": 0.10084499418735504, "learning_rate": 4.986077824958482e-05, "loss": 0.6779, "step": 9800 }, { "epoch": 4.99285589269519, "grad_norm": 0.16148197650909424, "learning_rate": 4.986063618657419e-05, "loss": 0.6766, "step": 9810 }, { "epoch": 4.997947461375678, "grad_norm": 0.12952958047389984, "learning_rate": 4.9860494123563565e-05, "loss": 0.676, "step": 9820 }, { "epoch": 5.002545784340244, "grad_norm": 0.16547605395317078, "learning_rate": 4.986035206055294e-05, "loss": 0.6137, "step": 9830 }, { "epoch": 5.0076373530207325, "grad_norm": 0.1671449840068817, "learning_rate": 4.986020999754231e-05, "loss": 0.6774, "step": 9840 }, { "epoch": 5.01272892170122, "grad_norm": 0.13992175459861755, "learning_rate": 4.9860067934531685e-05, "loss": 0.6819, "step": 9850 }, { "epoch": 5.017820490381709, "grad_norm": 0.08816186338663101, "learning_rate": 4.985992587152106e-05, "loss": 0.6819, "step": 9860 }, { "epoch": 5.022912059062197, "grad_norm": 0.08476711064577103, "learning_rate": 4.985978380851043e-05, "loss": 0.6817, "step": 9870 }, { "epoch": 5.0280036277426845, "grad_norm": 0.09989239275455475, "learning_rate": 4.9859641745499804e-05, "loss": 0.683, "step": 9880 }, { "epoch": 5.033095196423173, "grad_norm": 0.09048530459403992, "learning_rate": 4.985949968248918e-05, "loss": 0.681, "step": 9890 }, { "epoch": 5.038186765103661, "grad_norm": 0.11307314783334732, "learning_rate": 4.985935761947855e-05, "loss": 0.6785, "step": 9900 }, { "epoch": 5.04327833378415, "grad_norm": 0.12317655235528946, "learning_rate": 4.985921555646792e-05, "loss": 0.681, "step": 9910 }, { "epoch": 5.048369902464637, "grad_norm": 0.11963162571191788, "learning_rate": 4.985907349345729e-05, "loss": 0.6774, "step": 9920 }, { "epoch": 5.053461471145126, "grad_norm": 0.11438319087028503, "learning_rate": 4.985893143044666e-05, "loss": 0.68, "step": 9930 }, { "epoch": 5.058553039825614, "grad_norm": 0.13765713572502136, "learning_rate": 4.9858789367436036e-05, "loss": 0.6766, "step": 9940 }, { "epoch": 5.0636446085061015, "grad_norm": 0.12760768830776215, "learning_rate": 4.985864730442541e-05, "loss": 0.6763, "step": 9950 }, { "epoch": 5.06873617718659, "grad_norm": 0.14188893139362335, "learning_rate": 4.9858505241414775e-05, "loss": 0.6815, "step": 9960 }, { "epoch": 5.073827745867078, "grad_norm": 0.177343487739563, "learning_rate": 4.985836317840415e-05, "loss": 0.6765, "step": 9970 }, { "epoch": 5.078919314547567, "grad_norm": 0.15826770663261414, "learning_rate": 4.985822111539352e-05, "loss": 0.6819, "step": 9980 }, { "epoch": 5.084010883228054, "grad_norm": 0.1431620568037033, "learning_rate": 4.9858079052382895e-05, "loss": 0.6809, "step": 9990 }, { "epoch": 5.089102451908543, "grad_norm": 0.13952907919883728, "learning_rate": 4.985793698937227e-05, "loss": 0.6803, "step": 10000 }, { "epoch": 5.094194020589031, "grad_norm": 0.11862120032310486, "learning_rate": 4.985779492636164e-05, "loss": 0.6774, "step": 10010 }, { "epoch": 5.099285589269519, "grad_norm": 0.15467384457588196, "learning_rate": 4.9857652863351014e-05, "loss": 0.6777, "step": 10020 }, { "epoch": 5.104377157950007, "grad_norm": 0.12163079530000687, "learning_rate": 4.985751080034039e-05, "loss": 0.6801, "step": 10030 }, { "epoch": 5.109468726630495, "grad_norm": 0.1349727064371109, "learning_rate": 4.985736873732976e-05, "loss": 0.679, "step": 10040 }, { "epoch": 5.114560295310984, "grad_norm": 0.12950022518634796, "learning_rate": 4.9857226674319127e-05, "loss": 0.6799, "step": 10050 }, { "epoch": 5.119651863991471, "grad_norm": 0.12536922097206116, "learning_rate": 4.98570846113085e-05, "loss": 0.6805, "step": 10060 }, { "epoch": 5.12474343267196, "grad_norm": 0.08876863867044449, "learning_rate": 4.985694254829787e-05, "loss": 0.6838, "step": 10070 }, { "epoch": 5.129835001352448, "grad_norm": 0.13812567293643951, "learning_rate": 4.9856800485287246e-05, "loss": 0.6795, "step": 10080 }, { "epoch": 5.1349265700329365, "grad_norm": 0.11330072581768036, "learning_rate": 4.985665842227662e-05, "loss": 0.6775, "step": 10090 }, { "epoch": 5.140018138713424, "grad_norm": 0.12768009305000305, "learning_rate": 4.985651635926599e-05, "loss": 0.6758, "step": 10100 }, { "epoch": 5.145109707393912, "grad_norm": 0.15295925736427307, "learning_rate": 4.9856374296255365e-05, "loss": 0.6885, "step": 10110 }, { "epoch": 5.150201276074401, "grad_norm": 0.08242222666740417, "learning_rate": 4.985623223324474e-05, "loss": 0.6826, "step": 10120 }, { "epoch": 5.1552928447548885, "grad_norm": 0.0866493284702301, "learning_rate": 4.985609017023411e-05, "loss": 0.6823, "step": 10130 }, { "epoch": 5.160384413435377, "grad_norm": 0.1157221645116806, "learning_rate": 4.9855948107223485e-05, "loss": 0.6764, "step": 10140 }, { "epoch": 5.165475982115865, "grad_norm": 0.1414877027273178, "learning_rate": 4.985580604421286e-05, "loss": 0.6749, "step": 10150 }, { "epoch": 5.1705675507963536, "grad_norm": 0.13449379801750183, "learning_rate": 4.985566398120223e-05, "loss": 0.6806, "step": 10160 }, { "epoch": 5.175659119476841, "grad_norm": 0.13108868896961212, "learning_rate": 4.98555219181916e-05, "loss": 0.6806, "step": 10170 }, { "epoch": 5.180750688157329, "grad_norm": 0.12748171389102936, "learning_rate": 4.985537985518097e-05, "loss": 0.6763, "step": 10180 }, { "epoch": 5.185842256837818, "grad_norm": 0.10387007147073746, "learning_rate": 4.9855237792170336e-05, "loss": 0.6872, "step": 10190 }, { "epoch": 5.1909338255183055, "grad_norm": 0.09480390697717667, "learning_rate": 4.985509572915971e-05, "loss": 0.6822, "step": 10200 }, { "epoch": 5.196025394198794, "grad_norm": 0.11437319219112396, "learning_rate": 4.985495366614908e-05, "loss": 0.6792, "step": 10210 }, { "epoch": 5.201116962879282, "grad_norm": 0.12557561695575714, "learning_rate": 4.9854811603138456e-05, "loss": 0.682, "step": 10220 }, { "epoch": 5.206208531559771, "grad_norm": 0.1291828453540802, "learning_rate": 4.985466954012783e-05, "loss": 0.6848, "step": 10230 }, { "epoch": 5.211300100240258, "grad_norm": 0.12377645820379257, "learning_rate": 4.98545274771172e-05, "loss": 0.6789, "step": 10240 }, { "epoch": 5.216391668920746, "grad_norm": 0.12247670441865921, "learning_rate": 4.9854385414106575e-05, "loss": 0.681, "step": 10250 }, { "epoch": 5.221483237601235, "grad_norm": 0.10693535208702087, "learning_rate": 4.985424335109595e-05, "loss": 0.687, "step": 10260 }, { "epoch": 5.2265748062817226, "grad_norm": 0.11651374399662018, "learning_rate": 4.985410128808532e-05, "loss": 0.6775, "step": 10270 }, { "epoch": 5.231666374962211, "grad_norm": 0.1369701623916626, "learning_rate": 4.9853959225074694e-05, "loss": 0.6767, "step": 10280 }, { "epoch": 5.236757943642699, "grad_norm": 0.13671474158763885, "learning_rate": 4.985381716206407e-05, "loss": 0.6821, "step": 10290 }, { "epoch": 5.241849512323188, "grad_norm": 0.11949580907821655, "learning_rate": 4.985367509905344e-05, "loss": 0.6807, "step": 10300 }, { "epoch": 5.246941081003675, "grad_norm": 0.11703040450811386, "learning_rate": 4.985353303604281e-05, "loss": 0.678, "step": 10310 }, { "epoch": 5.252032649684164, "grad_norm": 0.11209936439990997, "learning_rate": 4.985339097303218e-05, "loss": 0.6773, "step": 10320 }, { "epoch": 5.257124218364652, "grad_norm": 0.13346509635448456, "learning_rate": 4.985324891002155e-05, "loss": 0.6857, "step": 10330 }, { "epoch": 5.26221578704514, "grad_norm": 0.12218772619962692, "learning_rate": 4.9853106847010926e-05, "loss": 0.681, "step": 10340 }, { "epoch": 5.267307355725628, "grad_norm": 0.1169796735048294, "learning_rate": 4.98529647840003e-05, "loss": 0.6767, "step": 10350 }, { "epoch": 5.272398924406116, "grad_norm": 0.14005398750305176, "learning_rate": 4.985282272098967e-05, "loss": 0.674, "step": 10360 }, { "epoch": 5.277490493086605, "grad_norm": 0.1299133449792862, "learning_rate": 4.9852680657979046e-05, "loss": 0.6779, "step": 10370 }, { "epoch": 5.282582061767092, "grad_norm": 0.13446015119552612, "learning_rate": 4.985253859496841e-05, "loss": 0.6781, "step": 10380 }, { "epoch": 5.287673630447581, "grad_norm": 0.14030112326145172, "learning_rate": 4.9852396531957785e-05, "loss": 0.6782, "step": 10390 }, { "epoch": 5.292765199128069, "grad_norm": 0.12442600727081299, "learning_rate": 4.985225446894716e-05, "loss": 0.6841, "step": 10400 }, { "epoch": 5.297856767808557, "grad_norm": 0.11391379684209824, "learning_rate": 4.985211240593653e-05, "loss": 0.6834, "step": 10410 }, { "epoch": 5.302948336489045, "grad_norm": 0.11152996867895126, "learning_rate": 4.9851970342925904e-05, "loss": 0.6816, "step": 10420 }, { "epoch": 5.308039905169533, "grad_norm": 0.13936050236225128, "learning_rate": 4.985182827991528e-05, "loss": 0.6831, "step": 10430 }, { "epoch": 5.313131473850022, "grad_norm": 0.11654047667980194, "learning_rate": 4.985168621690465e-05, "loss": 0.6803, "step": 10440 }, { "epoch": 5.3182230425305095, "grad_norm": 0.11251688003540039, "learning_rate": 4.985154415389402e-05, "loss": 0.6815, "step": 10450 }, { "epoch": 5.323314611210998, "grad_norm": 0.09920088946819305, "learning_rate": 4.985140209088339e-05, "loss": 0.6789, "step": 10460 }, { "epoch": 5.328406179891486, "grad_norm": 0.18474489450454712, "learning_rate": 4.985126002787276e-05, "loss": 0.6777, "step": 10470 }, { "epoch": 5.333497748571974, "grad_norm": 0.12075336277484894, "learning_rate": 4.9851117964862136e-05, "loss": 0.6828, "step": 10480 }, { "epoch": 5.338589317252462, "grad_norm": 0.1428055316209793, "learning_rate": 4.985097590185151e-05, "loss": 0.6765, "step": 10490 }, { "epoch": 5.34368088593295, "grad_norm": 0.1289169192314148, "learning_rate": 4.985083383884088e-05, "loss": 0.6825, "step": 10500 }, { "epoch": 5.348772454613439, "grad_norm": 0.10693208128213882, "learning_rate": 4.9850691775830256e-05, "loss": 0.6814, "step": 10510 }, { "epoch": 5.3538640232939265, "grad_norm": 0.11116955429315567, "learning_rate": 4.985054971281963e-05, "loss": 0.6805, "step": 10520 }, { "epoch": 5.358955591974415, "grad_norm": 0.11630560457706451, "learning_rate": 4.9850407649809e-05, "loss": 0.6779, "step": 10530 }, { "epoch": 5.364047160654903, "grad_norm": 0.13117016851902008, "learning_rate": 4.9850265586798375e-05, "loss": 0.6749, "step": 10540 }, { "epoch": 5.369138729335392, "grad_norm": 0.14777855575084686, "learning_rate": 4.985012352378775e-05, "loss": 0.6788, "step": 10550 }, { "epoch": 5.374230298015879, "grad_norm": 0.1084110215306282, "learning_rate": 4.9849981460777114e-05, "loss": 0.6843, "step": 10560 }, { "epoch": 5.379321866696367, "grad_norm": 0.10926970094442368, "learning_rate": 4.984983939776649e-05, "loss": 0.6807, "step": 10570 }, { "epoch": 5.384413435376856, "grad_norm": 0.10273724794387817, "learning_rate": 4.984969733475586e-05, "loss": 0.6819, "step": 10580 }, { "epoch": 5.389505004057344, "grad_norm": 0.12061687558889389, "learning_rate": 4.984955527174523e-05, "loss": 0.6791, "step": 10590 }, { "epoch": 5.394596572737832, "grad_norm": 0.11515804380178452, "learning_rate": 4.98494132087346e-05, "loss": 0.6798, "step": 10600 }, { "epoch": 5.39968814141832, "grad_norm": 0.11288391053676605, "learning_rate": 4.984927114572397e-05, "loss": 0.681, "step": 10610 }, { "epoch": 5.404779710098809, "grad_norm": 0.12682178616523743, "learning_rate": 4.9849129082713346e-05, "loss": 0.6778, "step": 10620 }, { "epoch": 5.409871278779296, "grad_norm": 0.12649093568325043, "learning_rate": 4.984898701970272e-05, "loss": 0.6767, "step": 10630 }, { "epoch": 5.414962847459784, "grad_norm": 0.1650230884552002, "learning_rate": 4.984884495669209e-05, "loss": 0.6772, "step": 10640 }, { "epoch": 5.420054416140273, "grad_norm": 0.11968445032835007, "learning_rate": 4.9848702893681465e-05, "loss": 0.6791, "step": 10650 }, { "epoch": 5.425145984820761, "grad_norm": 0.10566221922636032, "learning_rate": 4.984856083067084e-05, "loss": 0.6769, "step": 10660 }, { "epoch": 5.430237553501249, "grad_norm": 0.09944125264883041, "learning_rate": 4.984841876766021e-05, "loss": 0.6789, "step": 10670 }, { "epoch": 5.435329122181737, "grad_norm": 0.12134432047605515, "learning_rate": 4.9848276704649585e-05, "loss": 0.6741, "step": 10680 }, { "epoch": 5.440420690862226, "grad_norm": 0.1576509177684784, "learning_rate": 4.984813464163896e-05, "loss": 0.6818, "step": 10690 }, { "epoch": 5.4455122595427135, "grad_norm": 0.13000087440013885, "learning_rate": 4.9847992578628324e-05, "loss": 0.6719, "step": 10700 }, { "epoch": 5.450603828223201, "grad_norm": 0.12142984569072723, "learning_rate": 4.98478505156177e-05, "loss": 0.6825, "step": 10710 }, { "epoch": 5.45569539690369, "grad_norm": 0.1100669875741005, "learning_rate": 4.984770845260707e-05, "loss": 0.6759, "step": 10720 }, { "epoch": 5.460786965584178, "grad_norm": 0.1101478561758995, "learning_rate": 4.9847566389596444e-05, "loss": 0.685, "step": 10730 }, { "epoch": 5.465878534264666, "grad_norm": 0.1224004253745079, "learning_rate": 4.984742432658582e-05, "loss": 0.6763, "step": 10740 }, { "epoch": 5.470970102945154, "grad_norm": 0.14111606776714325, "learning_rate": 4.984728226357519e-05, "loss": 0.6777, "step": 10750 }, { "epoch": 5.476061671625643, "grad_norm": 0.10880038887262344, "learning_rate": 4.984714020056456e-05, "loss": 0.6834, "step": 10760 }, { "epoch": 5.4811532403061305, "grad_norm": 0.1258549839258194, "learning_rate": 4.9846998137553936e-05, "loss": 0.6828, "step": 10770 }, { "epoch": 5.486244808986619, "grad_norm": 0.10077346116304398, "learning_rate": 4.984685607454331e-05, "loss": 0.6797, "step": 10780 }, { "epoch": 5.491336377667107, "grad_norm": 0.14082978665828705, "learning_rate": 4.984671401153268e-05, "loss": 0.6773, "step": 10790 }, { "epoch": 5.496427946347595, "grad_norm": 0.12051651626825333, "learning_rate": 4.984657194852205e-05, "loss": 0.6774, "step": 10800 }, { "epoch": 5.501519515028083, "grad_norm": 0.15081602334976196, "learning_rate": 4.984642988551142e-05, "loss": 0.6866, "step": 10810 }, { "epoch": 5.506611083708571, "grad_norm": 0.09743819385766983, "learning_rate": 4.9846287822500795e-05, "loss": 0.6804, "step": 10820 }, { "epoch": 5.51170265238906, "grad_norm": 0.09400393813848495, "learning_rate": 4.984614575949017e-05, "loss": 0.6815, "step": 10830 }, { "epoch": 5.5167942210695475, "grad_norm": 0.13835515081882477, "learning_rate": 4.9846003696479534e-05, "loss": 0.6866, "step": 10840 }, { "epoch": 5.521885789750036, "grad_norm": 0.11208510398864746, "learning_rate": 4.984586163346891e-05, "loss": 0.6805, "step": 10850 }, { "epoch": 5.526977358430524, "grad_norm": 0.11167927086353302, "learning_rate": 4.984571957045828e-05, "loss": 0.6799, "step": 10860 }, { "epoch": 5.532068927111012, "grad_norm": 0.12590061128139496, "learning_rate": 4.9845577507447653e-05, "loss": 0.676, "step": 10870 }, { "epoch": 5.5371604957915, "grad_norm": 0.15050916373729706, "learning_rate": 4.9845435444437027e-05, "loss": 0.6712, "step": 10880 }, { "epoch": 5.542252064471988, "grad_norm": 0.14142751693725586, "learning_rate": 4.98452933814264e-05, "loss": 0.676, "step": 10890 }, { "epoch": 5.547343633152477, "grad_norm": 0.24029377102851868, "learning_rate": 4.984515131841577e-05, "loss": 0.683, "step": 10900 }, { "epoch": 5.552435201832965, "grad_norm": 0.11458209902048111, "learning_rate": 4.9845009255405146e-05, "loss": 0.6795, "step": 10910 }, { "epoch": 5.557526770513453, "grad_norm": 0.10509049147367477, "learning_rate": 4.984486719239452e-05, "loss": 0.6832, "step": 10920 }, { "epoch": 5.562618339193941, "grad_norm": 0.1304958164691925, "learning_rate": 4.984472512938389e-05, "loss": 0.6814, "step": 10930 }, { "epoch": 5.567709907874429, "grad_norm": 0.11066732555627823, "learning_rate": 4.9844583066373265e-05, "loss": 0.6734, "step": 10940 }, { "epoch": 5.572801476554917, "grad_norm": 0.14044025540351868, "learning_rate": 4.984444100336264e-05, "loss": 0.6851, "step": 10950 }, { "epoch": 5.577893045235405, "grad_norm": 0.09776227921247482, "learning_rate": 4.9844298940352005e-05, "loss": 0.6797, "step": 10960 }, { "epoch": 5.582984613915894, "grad_norm": 0.08972660452127457, "learning_rate": 4.984415687734138e-05, "loss": 0.6803, "step": 10970 }, { "epoch": 5.588076182596382, "grad_norm": 0.11810458451509476, "learning_rate": 4.984401481433075e-05, "loss": 0.6802, "step": 10980 }, { "epoch": 5.59316775127687, "grad_norm": 0.11004742234945297, "learning_rate": 4.9843872751320124e-05, "loss": 0.6795, "step": 10990 }, { "epoch": 5.598259319957358, "grad_norm": 0.10075508058071136, "learning_rate": 4.98437306883095e-05, "loss": 0.682, "step": 11000 }, { "epoch": 5.603350888637847, "grad_norm": 0.10835061222314835, "learning_rate": 4.9843588625298863e-05, "loss": 0.6829, "step": 11010 }, { "epoch": 5.6084424573183345, "grad_norm": 0.1209336370229721, "learning_rate": 4.9843446562288236e-05, "loss": 0.6808, "step": 11020 }, { "epoch": 5.613534025998822, "grad_norm": 0.12438962608575821, "learning_rate": 4.984330449927761e-05, "loss": 0.6768, "step": 11030 }, { "epoch": 5.618625594679311, "grad_norm": 0.1364268809556961, "learning_rate": 4.984316243626698e-05, "loss": 0.6781, "step": 11040 }, { "epoch": 5.623717163359799, "grad_norm": 0.11569849401712418, "learning_rate": 4.9843020373256356e-05, "loss": 0.6825, "step": 11050 }, { "epoch": 5.628808732040287, "grad_norm": 0.10072596371173859, "learning_rate": 4.984287831024573e-05, "loss": 0.6764, "step": 11060 }, { "epoch": 5.633900300720775, "grad_norm": 0.15180449187755585, "learning_rate": 4.98427362472351e-05, "loss": 0.6782, "step": 11070 }, { "epoch": 5.638991869401263, "grad_norm": 0.14204277098178864, "learning_rate": 4.9842594184224475e-05, "loss": 0.6806, "step": 11080 }, { "epoch": 5.6440834380817515, "grad_norm": 0.12409929186105728, "learning_rate": 4.984245212121385e-05, "loss": 0.6806, "step": 11090 }, { "epoch": 5.649175006762239, "grad_norm": 0.1692194640636444, "learning_rate": 4.9842310058203215e-05, "loss": 0.6723, "step": 11100 }, { "epoch": 5.654266575442728, "grad_norm": 0.2566402852535248, "learning_rate": 4.984216799519259e-05, "loss": 0.6845, "step": 11110 }, { "epoch": 5.659358144123216, "grad_norm": 0.13745322823524475, "learning_rate": 4.984202593218196e-05, "loss": 0.6748, "step": 11120 }, { "epoch": 5.664449712803704, "grad_norm": 0.16598811745643616, "learning_rate": 4.9841883869171334e-05, "loss": 0.6798, "step": 11130 }, { "epoch": 5.669541281484192, "grad_norm": 0.13570183515548706, "learning_rate": 4.984174180616071e-05, "loss": 0.6797, "step": 11140 }, { "epoch": 5.674632850164681, "grad_norm": 0.17549622058868408, "learning_rate": 4.984159974315008e-05, "loss": 0.6773, "step": 11150 }, { "epoch": 5.6797244188451685, "grad_norm": 0.15479332208633423, "learning_rate": 4.984145768013945e-05, "loss": 0.6795, "step": 11160 }, { "epoch": 5.684815987525656, "grad_norm": 0.1562296450138092, "learning_rate": 4.9841315617128826e-05, "loss": 0.6803, "step": 11170 }, { "epoch": 5.689907556206145, "grad_norm": 0.13014480471611023, "learning_rate": 4.98411735541182e-05, "loss": 0.6793, "step": 11180 }, { "epoch": 5.694999124886633, "grad_norm": 0.1577223241329193, "learning_rate": 4.984103149110757e-05, "loss": 0.6845, "step": 11190 }, { "epoch": 5.700090693567121, "grad_norm": 0.14906632900238037, "learning_rate": 4.9840889428096946e-05, "loss": 0.6771, "step": 11200 }, { "epoch": 5.705182262247609, "grad_norm": 0.15042632818222046, "learning_rate": 4.984074736508632e-05, "loss": 0.6737, "step": 11210 }, { "epoch": 5.710273830928098, "grad_norm": 0.1530093252658844, "learning_rate": 4.9840605302075685e-05, "loss": 0.6804, "step": 11220 }, { "epoch": 5.715365399608586, "grad_norm": 0.18300846219062805, "learning_rate": 4.984046323906506e-05, "loss": 0.6752, "step": 11230 }, { "epoch": 5.720456968289074, "grad_norm": 0.14398545026779175, "learning_rate": 4.9840321176054424e-05, "loss": 0.6793, "step": 11240 }, { "epoch": 5.725548536969562, "grad_norm": 0.12745435535907745, "learning_rate": 4.98401791130438e-05, "loss": 0.6765, "step": 11250 }, { "epoch": 5.73064010565005, "grad_norm": 0.15162277221679688, "learning_rate": 4.984003705003317e-05, "loss": 0.6744, "step": 11260 }, { "epoch": 5.735731674330538, "grad_norm": 0.12970998883247375, "learning_rate": 4.9839894987022544e-05, "loss": 0.6818, "step": 11270 }, { "epoch": 5.740823243011026, "grad_norm": 0.1195228323340416, "learning_rate": 4.983975292401192e-05, "loss": 0.6749, "step": 11280 }, { "epoch": 5.745914811691515, "grad_norm": 0.14821238815784454, "learning_rate": 4.983961086100129e-05, "loss": 0.6759, "step": 11290 }, { "epoch": 5.751006380372003, "grad_norm": 0.18345175683498383, "learning_rate": 4.983946879799066e-05, "loss": 0.6736, "step": 11300 }, { "epoch": 5.75609794905249, "grad_norm": 0.14165613055229187, "learning_rate": 4.9839326734980036e-05, "loss": 0.6777, "step": 11310 }, { "epoch": 5.761189517732979, "grad_norm": 0.16045770049095154, "learning_rate": 4.983918467196941e-05, "loss": 0.678, "step": 11320 }, { "epoch": 5.766281086413467, "grad_norm": 0.1490974873304367, "learning_rate": 4.983904260895878e-05, "loss": 0.68, "step": 11330 }, { "epoch": 5.7713726550939555, "grad_norm": 0.11064887046813965, "learning_rate": 4.9838900545948156e-05, "loss": 0.6832, "step": 11340 }, { "epoch": 5.776464223774443, "grad_norm": 0.11848734319210052, "learning_rate": 4.983875848293753e-05, "loss": 0.6792, "step": 11350 }, { "epoch": 5.781555792454932, "grad_norm": 0.1246313750743866, "learning_rate": 4.9838616419926895e-05, "loss": 0.6794, "step": 11360 }, { "epoch": 5.78664736113542, "grad_norm": 0.17359575629234314, "learning_rate": 4.983847435691627e-05, "loss": 0.6762, "step": 11370 }, { "epoch": 5.791738929815908, "grad_norm": 0.16471154987812042, "learning_rate": 4.983833229390564e-05, "loss": 0.6742, "step": 11380 }, { "epoch": 5.796830498496396, "grad_norm": 0.1479930430650711, "learning_rate": 4.9838190230895014e-05, "loss": 0.678, "step": 11390 }, { "epoch": 5.801922067176884, "grad_norm": 0.11385341733694077, "learning_rate": 4.983804816788439e-05, "loss": 0.6791, "step": 11400 }, { "epoch": 5.8070136358573725, "grad_norm": 0.13574256002902985, "learning_rate": 4.983790610487376e-05, "loss": 0.6795, "step": 11410 }, { "epoch": 5.81210520453786, "grad_norm": 0.1701575517654419, "learning_rate": 4.9837764041863134e-05, "loss": 0.6791, "step": 11420 }, { "epoch": 5.817196773218349, "grad_norm": 0.11972179263830185, "learning_rate": 4.98376219788525e-05, "loss": 0.6802, "step": 11430 }, { "epoch": 5.822288341898837, "grad_norm": 0.15830230712890625, "learning_rate": 4.983747991584187e-05, "loss": 0.6761, "step": 11440 }, { "epoch": 5.827379910579325, "grad_norm": 0.16592001914978027, "learning_rate": 4.9837337852831246e-05, "loss": 0.6768, "step": 11450 }, { "epoch": 5.832471479259813, "grad_norm": 0.21496979892253876, "learning_rate": 4.983719578982062e-05, "loss": 0.6783, "step": 11460 }, { "epoch": 5.837563047940302, "grad_norm": 0.14850680530071259, "learning_rate": 4.983705372680999e-05, "loss": 0.6781, "step": 11470 }, { "epoch": 5.8426546166207896, "grad_norm": 0.12256158143281937, "learning_rate": 4.9836911663799365e-05, "loss": 0.6776, "step": 11480 }, { "epoch": 5.847746185301277, "grad_norm": 0.14311592280864716, "learning_rate": 4.983676960078874e-05, "loss": 0.6717, "step": 11490 }, { "epoch": 5.852837753981766, "grad_norm": 0.1648699939250946, "learning_rate": 4.9836627537778105e-05, "loss": 0.6779, "step": 11500 }, { "epoch": 5.857929322662254, "grad_norm": 0.13590501248836517, "learning_rate": 4.983648547476748e-05, "loss": 0.6824, "step": 11510 }, { "epoch": 5.863020891342742, "grad_norm": 0.13972793519496918, "learning_rate": 4.983634341175685e-05, "loss": 0.679, "step": 11520 }, { "epoch": 5.86811246002323, "grad_norm": 0.11360618472099304, "learning_rate": 4.9836201348746224e-05, "loss": 0.6746, "step": 11530 }, { "epoch": 5.873204028703718, "grad_norm": 0.14063167572021484, "learning_rate": 4.98360592857356e-05, "loss": 0.6818, "step": 11540 }, { "epoch": 5.878295597384207, "grad_norm": 0.12393573671579361, "learning_rate": 4.983591722272497e-05, "loss": 0.6771, "step": 11550 }, { "epoch": 5.883387166064694, "grad_norm": 0.12383928149938583, "learning_rate": 4.9835775159714344e-05, "loss": 0.6807, "step": 11560 }, { "epoch": 5.888478734745183, "grad_norm": 0.11464569717645645, "learning_rate": 4.983563309670372e-05, "loss": 0.6823, "step": 11570 }, { "epoch": 5.893570303425671, "grad_norm": 0.15896569192409515, "learning_rate": 4.983549103369309e-05, "loss": 0.678, "step": 11580 }, { "epoch": 5.898661872106159, "grad_norm": 0.11153749376535416, "learning_rate": 4.983534897068246e-05, "loss": 0.6799, "step": 11590 }, { "epoch": 5.903753440786647, "grad_norm": 0.13557817041873932, "learning_rate": 4.9835206907671836e-05, "loss": 0.678, "step": 11600 }, { "epoch": 5.908845009467136, "grad_norm": 0.12681804597377777, "learning_rate": 4.98350648446612e-05, "loss": 0.6853, "step": 11610 }, { "epoch": 5.913936578147624, "grad_norm": 0.11007581651210785, "learning_rate": 4.9834922781650575e-05, "loss": 0.6799, "step": 11620 }, { "epoch": 5.919028146828111, "grad_norm": 0.14073921740055084, "learning_rate": 4.983478071863995e-05, "loss": 0.6809, "step": 11630 }, { "epoch": 5.9241197155086, "grad_norm": 0.17294389009475708, "learning_rate": 4.9834638655629315e-05, "loss": 0.677, "step": 11640 }, { "epoch": 5.929211284189088, "grad_norm": 0.11901852488517761, "learning_rate": 4.983449659261869e-05, "loss": 0.6814, "step": 11650 }, { "epoch": 5.9343028528695765, "grad_norm": 0.1563209444284439, "learning_rate": 4.983435452960806e-05, "loss": 0.6803, "step": 11660 }, { "epoch": 5.939394421550064, "grad_norm": 0.1763051152229309, "learning_rate": 4.9834212466597434e-05, "loss": 0.6713, "step": 11670 }, { "epoch": 5.944485990230553, "grad_norm": 0.1412787139415741, "learning_rate": 4.983407040358681e-05, "loss": 0.6791, "step": 11680 }, { "epoch": 5.949577558911041, "grad_norm": 0.13946793973445892, "learning_rate": 4.983392834057618e-05, "loss": 0.674, "step": 11690 }, { "epoch": 5.954669127591529, "grad_norm": 0.1848699301481247, "learning_rate": 4.9833786277565553e-05, "loss": 0.6785, "step": 11700 }, { "epoch": 5.959760696272017, "grad_norm": 0.14714594185352325, "learning_rate": 4.9833644214554927e-05, "loss": 0.6764, "step": 11710 }, { "epoch": 5.964852264952505, "grad_norm": 0.14410807192325592, "learning_rate": 4.98335021515443e-05, "loss": 0.6755, "step": 11720 }, { "epoch": 5.9699438336329935, "grad_norm": 0.11196265369653702, "learning_rate": 4.983336008853367e-05, "loss": 0.6801, "step": 11730 }, { "epoch": 5.975035402313481, "grad_norm": 0.14931631088256836, "learning_rate": 4.9833218025523046e-05, "loss": 0.6761, "step": 11740 }, { "epoch": 5.98012697099397, "grad_norm": 0.1235998123884201, "learning_rate": 4.983307596251241e-05, "loss": 0.6816, "step": 11750 }, { "epoch": 5.985218539674458, "grad_norm": 0.14235694706439972, "learning_rate": 4.9832933899501785e-05, "loss": 0.6784, "step": 11760 }, { "epoch": 5.9903101083549455, "grad_norm": 0.11291839182376862, "learning_rate": 4.983279183649116e-05, "loss": 0.6857, "step": 11770 }, { "epoch": 5.995401677035434, "grad_norm": 0.12273520231246948, "learning_rate": 4.983264977348053e-05, "loss": 0.6801, "step": 11780 }, { "epoch": 6.0, "grad_norm": 0.025783156976103783, "learning_rate": 4.9832507710469905e-05, "loss": 0.6142, "step": 11790 }, { "epoch": 6.005091568680488, "grad_norm": 0.1227310448884964, "learning_rate": 4.983236564745928e-05, "loss": 0.679, "step": 11800 }, { "epoch": 6.010183137360976, "grad_norm": 0.14122678339481354, "learning_rate": 4.983222358444865e-05, "loss": 0.677, "step": 11810 }, { "epoch": 6.015274706041464, "grad_norm": 0.14405541121959686, "learning_rate": 4.9832081521438024e-05, "loss": 0.6799, "step": 11820 }, { "epoch": 6.020366274721953, "grad_norm": 0.18694424629211426, "learning_rate": 4.98319394584274e-05, "loss": 0.675, "step": 11830 }, { "epoch": 6.025457843402441, "grad_norm": 0.1961718052625656, "learning_rate": 4.983179739541677e-05, "loss": 0.6819, "step": 11840 }, { "epoch": 6.030549412082929, "grad_norm": 0.1102224811911583, "learning_rate": 4.9831655332406137e-05, "loss": 0.682, "step": 11850 }, { "epoch": 6.035640980763417, "grad_norm": 0.1295260190963745, "learning_rate": 4.983151326939551e-05, "loss": 0.6794, "step": 11860 }, { "epoch": 6.040732549443905, "grad_norm": 0.12580661475658417, "learning_rate": 4.983137120638488e-05, "loss": 0.6791, "step": 11870 }, { "epoch": 6.0458241181243935, "grad_norm": 0.1288338154554367, "learning_rate": 4.9831229143374256e-05, "loss": 0.6805, "step": 11880 }, { "epoch": 6.050915686804881, "grad_norm": 0.1211671456694603, "learning_rate": 4.983108708036362e-05, "loss": 0.6764, "step": 11890 }, { "epoch": 6.05600725548537, "grad_norm": 0.15219536423683167, "learning_rate": 4.9830945017352995e-05, "loss": 0.6806, "step": 11900 }, { "epoch": 6.061098824165858, "grad_norm": 0.12759484350681305, "learning_rate": 4.983080295434237e-05, "loss": 0.676, "step": 11910 }, { "epoch": 6.066190392846346, "grad_norm": 0.1949695497751236, "learning_rate": 4.983066089133174e-05, "loss": 0.6832, "step": 11920 }, { "epoch": 6.071281961526834, "grad_norm": 0.11879277229309082, "learning_rate": 4.9830518828321115e-05, "loss": 0.6781, "step": 11930 }, { "epoch": 6.076373530207323, "grad_norm": 0.12636293470859528, "learning_rate": 4.983037676531049e-05, "loss": 0.6774, "step": 11940 }, { "epoch": 6.0814650988878105, "grad_norm": 0.13675157725811005, "learning_rate": 4.983023470229986e-05, "loss": 0.6789, "step": 11950 }, { "epoch": 6.086556667568298, "grad_norm": 0.13322140276432037, "learning_rate": 4.9830092639289234e-05, "loss": 0.6805, "step": 11960 }, { "epoch": 6.091648236248787, "grad_norm": 0.1352871060371399, "learning_rate": 4.982995057627861e-05, "loss": 0.6808, "step": 11970 }, { "epoch": 6.096739804929275, "grad_norm": 0.14976170659065247, "learning_rate": 4.982980851326798e-05, "loss": 0.6775, "step": 11980 }, { "epoch": 6.101831373609763, "grad_norm": 0.1250462532043457, "learning_rate": 4.982966645025735e-05, "loss": 0.6782, "step": 11990 }, { "epoch": 6.106922942290251, "grad_norm": 0.16815803945064545, "learning_rate": 4.9829524387246726e-05, "loss": 0.6721, "step": 12000 }, { "epoch": 6.11201451097074, "grad_norm": 0.18195395171642303, "learning_rate": 4.982938232423609e-05, "loss": 0.6806, "step": 12010 }, { "epoch": 6.1171060796512275, "grad_norm": 0.15061675012111664, "learning_rate": 4.9829240261225466e-05, "loss": 0.6732, "step": 12020 }, { "epoch": 6.122197648331715, "grad_norm": 0.14526985585689545, "learning_rate": 4.982909819821484e-05, "loss": 0.6788, "step": 12030 }, { "epoch": 6.127289217012204, "grad_norm": 0.1469496637582779, "learning_rate": 4.982895613520421e-05, "loss": 0.6779, "step": 12040 }, { "epoch": 6.132380785692692, "grad_norm": 0.18443866074085236, "learning_rate": 4.9828814072193585e-05, "loss": 0.6767, "step": 12050 }, { "epoch": 6.13747235437318, "grad_norm": 0.11885727196931839, "learning_rate": 4.982867200918295e-05, "loss": 0.6764, "step": 12060 }, { "epoch": 6.142563923053668, "grad_norm": 0.1266055554151535, "learning_rate": 4.9828529946172325e-05, "loss": 0.6837, "step": 12070 }, { "epoch": 6.147655491734157, "grad_norm": 0.11415141075849533, "learning_rate": 4.98283878831617e-05, "loss": 0.6837, "step": 12080 }, { "epoch": 6.152747060414645, "grad_norm": 0.09705322235822678, "learning_rate": 4.982824582015107e-05, "loss": 0.6815, "step": 12090 }, { "epoch": 6.157838629095132, "grad_norm": 0.12555427849292755, "learning_rate": 4.9828103757140444e-05, "loss": 0.6804, "step": 12100 }, { "epoch": 6.162930197775621, "grad_norm": 0.11063813418149948, "learning_rate": 4.982796169412982e-05, "loss": 0.6815, "step": 12110 }, { "epoch": 6.168021766456109, "grad_norm": 0.1428930014371872, "learning_rate": 4.982781963111919e-05, "loss": 0.6781, "step": 12120 }, { "epoch": 6.173113335136597, "grad_norm": 0.13896307349205017, "learning_rate": 4.982767756810856e-05, "loss": 0.6763, "step": 12130 }, { "epoch": 6.178204903817085, "grad_norm": 0.12032928317785263, "learning_rate": 4.9827535505097936e-05, "loss": 0.6803, "step": 12140 }, { "epoch": 6.183296472497574, "grad_norm": 0.11562150716781616, "learning_rate": 4.98273934420873e-05, "loss": 0.6766, "step": 12150 }, { "epoch": 6.188388041178062, "grad_norm": 0.1040254682302475, "learning_rate": 4.9827251379076676e-05, "loss": 0.6823, "step": 12160 }, { "epoch": 6.19347960985855, "grad_norm": 0.1031600683927536, "learning_rate": 4.982710931606605e-05, "loss": 0.6757, "step": 12170 }, { "epoch": 6.198571178539038, "grad_norm": 0.11150684952735901, "learning_rate": 4.982696725305542e-05, "loss": 0.6781, "step": 12180 }, { "epoch": 6.203662747219526, "grad_norm": 0.15506963431835175, "learning_rate": 4.9826825190044795e-05, "loss": 0.672, "step": 12190 }, { "epoch": 6.2087543159000145, "grad_norm": 0.13985055685043335, "learning_rate": 4.982668312703417e-05, "loss": 0.6793, "step": 12200 }, { "epoch": 6.213845884580502, "grad_norm": 0.11352770030498505, "learning_rate": 4.982654106402354e-05, "loss": 0.6824, "step": 12210 }, { "epoch": 6.218937453260991, "grad_norm": 0.11052574217319489, "learning_rate": 4.9826399001012914e-05, "loss": 0.6791, "step": 12220 }, { "epoch": 6.224029021941479, "grad_norm": 0.12992137670516968, "learning_rate": 4.982625693800229e-05, "loss": 0.6793, "step": 12230 }, { "epoch": 6.229120590621967, "grad_norm": 0.1408848613500595, "learning_rate": 4.982611487499166e-05, "loss": 0.6791, "step": 12240 }, { "epoch": 6.234212159302455, "grad_norm": 0.18795296549797058, "learning_rate": 4.9825972811981034e-05, "loss": 0.6802, "step": 12250 }, { "epoch": 6.239303727982943, "grad_norm": 0.12889884412288666, "learning_rate": 4.982583074897041e-05, "loss": 0.6878, "step": 12260 }, { "epoch": 6.2443952966634315, "grad_norm": 0.1431640386581421, "learning_rate": 4.982568868595977e-05, "loss": 0.6775, "step": 12270 }, { "epoch": 6.249486865343919, "grad_norm": 0.11410534381866455, "learning_rate": 4.9825546622949146e-05, "loss": 0.6798, "step": 12280 }, { "epoch": 6.254578434024408, "grad_norm": 0.14347901940345764, "learning_rate": 4.982540455993851e-05, "loss": 0.6764, "step": 12290 }, { "epoch": 6.259670002704896, "grad_norm": 0.14148719608783722, "learning_rate": 4.9825262496927886e-05, "loss": 0.6778, "step": 12300 }, { "epoch": 6.264761571385384, "grad_norm": 0.13571056723594666, "learning_rate": 4.982512043391726e-05, "loss": 0.6822, "step": 12310 }, { "epoch": 6.269853140065872, "grad_norm": 0.13416819274425507, "learning_rate": 4.982497837090663e-05, "loss": 0.6764, "step": 12320 }, { "epoch": 6.274944708746361, "grad_norm": 0.12467856705188751, "learning_rate": 4.9824836307896005e-05, "loss": 0.681, "step": 12330 }, { "epoch": 6.280036277426849, "grad_norm": 0.11934306472539902, "learning_rate": 4.982469424488538e-05, "loss": 0.6808, "step": 12340 }, { "epoch": 6.285127846107336, "grad_norm": 0.12335172295570374, "learning_rate": 4.982455218187475e-05, "loss": 0.6795, "step": 12350 }, { "epoch": 6.290219414787825, "grad_norm": 0.12900583446025848, "learning_rate": 4.9824410118864124e-05, "loss": 0.6736, "step": 12360 }, { "epoch": 6.295310983468313, "grad_norm": 0.11381091177463531, "learning_rate": 4.98242680558535e-05, "loss": 0.6838, "step": 12370 }, { "epoch": 6.300402552148801, "grad_norm": 0.11505099385976791, "learning_rate": 4.982412599284287e-05, "loss": 0.6772, "step": 12380 }, { "epoch": 6.305494120829289, "grad_norm": 0.11616339534521103, "learning_rate": 4.9823983929832244e-05, "loss": 0.6788, "step": 12390 }, { "epoch": 6.310585689509777, "grad_norm": 0.1088867336511612, "learning_rate": 4.982384186682162e-05, "loss": 0.6777, "step": 12400 }, { "epoch": 6.315677258190266, "grad_norm": 0.11975440382957458, "learning_rate": 4.982369980381098e-05, "loss": 0.6854, "step": 12410 }, { "epoch": 6.320768826870753, "grad_norm": 0.11531190574169159, "learning_rate": 4.9823557740800356e-05, "loss": 0.6786, "step": 12420 }, { "epoch": 6.325860395551242, "grad_norm": 0.117821604013443, "learning_rate": 4.982341567778973e-05, "loss": 0.6814, "step": 12430 }, { "epoch": 6.33095196423173, "grad_norm": 0.13663433492183685, "learning_rate": 4.98232736147791e-05, "loss": 0.6754, "step": 12440 }, { "epoch": 6.3360435329122184, "grad_norm": 0.14458602666854858, "learning_rate": 4.9823131551768475e-05, "loss": 0.6829, "step": 12450 }, { "epoch": 6.341135101592706, "grad_norm": 0.12459100037813187, "learning_rate": 4.982298948875785e-05, "loss": 0.6803, "step": 12460 }, { "epoch": 6.346226670273195, "grad_norm": 0.11213183403015137, "learning_rate": 4.9822847425747215e-05, "loss": 0.6776, "step": 12470 }, { "epoch": 6.351318238953683, "grad_norm": 0.12166488170623779, "learning_rate": 4.982270536273659e-05, "loss": 0.6817, "step": 12480 }, { "epoch": 6.35640980763417, "grad_norm": 0.11691765487194061, "learning_rate": 4.982256329972596e-05, "loss": 0.6829, "step": 12490 }, { "epoch": 6.361501376314659, "grad_norm": 0.1120506301522255, "learning_rate": 4.9822421236715334e-05, "loss": 0.6791, "step": 12500 }, { "epoch": 6.366592944995147, "grad_norm": 0.12437008321285248, "learning_rate": 4.982227917370471e-05, "loss": 0.6751, "step": 12510 }, { "epoch": 6.3716845136756355, "grad_norm": 0.15133772790431976, "learning_rate": 4.982213711069408e-05, "loss": 0.6785, "step": 12520 }, { "epoch": 6.376776082356123, "grad_norm": 0.14470815658569336, "learning_rate": 4.9821995047683453e-05, "loss": 0.6805, "step": 12530 }, { "epoch": 6.381867651036612, "grad_norm": 0.1352653056383133, "learning_rate": 4.9821852984672827e-05, "loss": 0.6799, "step": 12540 }, { "epoch": 6.3869592197171, "grad_norm": 0.12650400400161743, "learning_rate": 4.982171092166219e-05, "loss": 0.6788, "step": 12550 }, { "epoch": 6.392050788397588, "grad_norm": 0.12057118117809296, "learning_rate": 4.9821568858651566e-05, "loss": 0.6811, "step": 12560 }, { "epoch": 6.397142357078076, "grad_norm": 0.16348209977149963, "learning_rate": 4.982142679564094e-05, "loss": 0.6799, "step": 12570 }, { "epoch": 6.402233925758564, "grad_norm": 0.18208801746368408, "learning_rate": 4.982128473263031e-05, "loss": 0.6738, "step": 12580 }, { "epoch": 6.4073254944390525, "grad_norm": 0.1399811953306198, "learning_rate": 4.9821142669619685e-05, "loss": 0.6762, "step": 12590 }, { "epoch": 6.41241706311954, "grad_norm": 0.11085145175457001, "learning_rate": 4.982100060660906e-05, "loss": 0.6914, "step": 12600 }, { "epoch": 6.417508631800029, "grad_norm": 0.10344066470861435, "learning_rate": 4.982085854359843e-05, "loss": 0.6809, "step": 12610 }, { "epoch": 6.422600200480517, "grad_norm": 0.13643105328083038, "learning_rate": 4.9820716480587805e-05, "loss": 0.6752, "step": 12620 }, { "epoch": 6.4276917691610045, "grad_norm": 0.12111321091651917, "learning_rate": 4.982057441757718e-05, "loss": 0.6786, "step": 12630 }, { "epoch": 6.432783337841493, "grad_norm": 0.1612890660762787, "learning_rate": 4.982043235456655e-05, "loss": 0.6789, "step": 12640 }, { "epoch": 6.437874906521981, "grad_norm": 0.15844057500362396, "learning_rate": 4.9820290291555924e-05, "loss": 0.6826, "step": 12650 }, { "epoch": 6.44296647520247, "grad_norm": 0.128059983253479, "learning_rate": 4.982014822854529e-05, "loss": 0.6776, "step": 12660 }, { "epoch": 6.448058043882957, "grad_norm": 0.13311228156089783, "learning_rate": 4.9820006165534663e-05, "loss": 0.6793, "step": 12670 }, { "epoch": 6.453149612563446, "grad_norm": 0.15546241402626038, "learning_rate": 4.9819864102524037e-05, "loss": 0.6753, "step": 12680 }, { "epoch": 6.458241181243934, "grad_norm": 0.1458451747894287, "learning_rate": 4.98197220395134e-05, "loss": 0.6817, "step": 12690 }, { "epoch": 6.463332749924422, "grad_norm": 0.12202929705381393, "learning_rate": 4.9819579976502776e-05, "loss": 0.6801, "step": 12700 }, { "epoch": 6.46842431860491, "grad_norm": 0.137448251247406, "learning_rate": 4.981943791349215e-05, "loss": 0.6779, "step": 12710 }, { "epoch": 6.473515887285398, "grad_norm": 0.12428711354732513, "learning_rate": 4.981929585048152e-05, "loss": 0.6814, "step": 12720 }, { "epoch": 6.478607455965887, "grad_norm": 0.15364359319210052, "learning_rate": 4.9819153787470895e-05, "loss": 0.6719, "step": 12730 }, { "epoch": 6.483699024646374, "grad_norm": 0.1646897941827774, "learning_rate": 4.981901172446027e-05, "loss": 0.6787, "step": 12740 }, { "epoch": 6.488790593326863, "grad_norm": 0.18058307468891144, "learning_rate": 4.981886966144964e-05, "loss": 0.6797, "step": 12750 }, { "epoch": 6.493882162007351, "grad_norm": 0.13395850360393524, "learning_rate": 4.9818727598439015e-05, "loss": 0.6776, "step": 12760 }, { "epoch": 6.4989737306878395, "grad_norm": 0.15397368371486664, "learning_rate": 4.981858553542839e-05, "loss": 0.6698, "step": 12770 }, { "epoch": 6.504065299368327, "grad_norm": 0.16110943257808685, "learning_rate": 4.981844347241776e-05, "loss": 0.6849, "step": 12780 }, { "epoch": 6.509156868048816, "grad_norm": 0.18386079370975494, "learning_rate": 4.9818301409407134e-05, "loss": 0.6813, "step": 12790 }, { "epoch": 6.514248436729304, "grad_norm": 0.11144635081291199, "learning_rate": 4.98181593463965e-05, "loss": 0.6746, "step": 12800 }, { "epoch": 6.519340005409791, "grad_norm": 0.1547509729862213, "learning_rate": 4.981801728338587e-05, "loss": 0.6775, "step": 12810 }, { "epoch": 6.52443157409028, "grad_norm": 0.12533412873744965, "learning_rate": 4.9817875220375246e-05, "loss": 0.6723, "step": 12820 }, { "epoch": 6.529523142770768, "grad_norm": 0.13594309985637665, "learning_rate": 4.981773315736462e-05, "loss": 0.6815, "step": 12830 }, { "epoch": 6.5346147114512565, "grad_norm": 0.16000863909721375, "learning_rate": 4.981759109435399e-05, "loss": 0.6845, "step": 12840 }, { "epoch": 6.539706280131744, "grad_norm": 0.12660828232765198, "learning_rate": 4.9817449031343366e-05, "loss": 0.6776, "step": 12850 }, { "epoch": 6.544797848812232, "grad_norm": 0.13099251687526703, "learning_rate": 4.981730696833274e-05, "loss": 0.6761, "step": 12860 }, { "epoch": 6.549889417492721, "grad_norm": 0.13618282973766327, "learning_rate": 4.981716490532211e-05, "loss": 0.6777, "step": 12870 }, { "epoch": 6.5549809861732085, "grad_norm": 0.128812775015831, "learning_rate": 4.9817022842311485e-05, "loss": 0.687, "step": 12880 }, { "epoch": 6.560072554853697, "grad_norm": 0.10990247130393982, "learning_rate": 4.981688077930085e-05, "loss": 0.6792, "step": 12890 }, { "epoch": 6.565164123534185, "grad_norm": 0.13022927939891815, "learning_rate": 4.9816738716290225e-05, "loss": 0.6785, "step": 12900 }, { "epoch": 6.5702556922146735, "grad_norm": 0.14299486577510834, "learning_rate": 4.98165966532796e-05, "loss": 0.6819, "step": 12910 }, { "epoch": 6.575347260895161, "grad_norm": 0.13400639593601227, "learning_rate": 4.981645459026897e-05, "loss": 0.6815, "step": 12920 }, { "epoch": 6.58043882957565, "grad_norm": 0.0999205932021141, "learning_rate": 4.9816312527258344e-05, "loss": 0.6788, "step": 12930 }, { "epoch": 6.585530398256138, "grad_norm": 0.11330140382051468, "learning_rate": 4.981617046424771e-05, "loss": 0.6805, "step": 12940 }, { "epoch": 6.5906219669366255, "grad_norm": 0.18674777448177338, "learning_rate": 4.981602840123708e-05, "loss": 0.6778, "step": 12950 }, { "epoch": 6.595713535617114, "grad_norm": 0.15032435953617096, "learning_rate": 4.9815886338226456e-05, "loss": 0.6825, "step": 12960 }, { "epoch": 6.600805104297602, "grad_norm": 0.1333203762769699, "learning_rate": 4.981574427521583e-05, "loss": 0.6795, "step": 12970 }, { "epoch": 6.605896672978091, "grad_norm": 0.16465353965759277, "learning_rate": 4.98156022122052e-05, "loss": 0.6706, "step": 12980 }, { "epoch": 6.610988241658578, "grad_norm": 0.15451110899448395, "learning_rate": 4.9815460149194576e-05, "loss": 0.6757, "step": 12990 }, { "epoch": 6.616079810339067, "grad_norm": 0.15208947658538818, "learning_rate": 4.981531808618395e-05, "loss": 0.6818, "step": 13000 }, { "epoch": 6.621171379019555, "grad_norm": 0.13289377093315125, "learning_rate": 4.981517602317332e-05, "loss": 0.6811, "step": 13010 }, { "epoch": 6.626262947700043, "grad_norm": 0.18308168649673462, "learning_rate": 4.9815033960162695e-05, "loss": 0.678, "step": 13020 }, { "epoch": 6.631354516380531, "grad_norm": 0.12425180524587631, "learning_rate": 4.981489189715207e-05, "loss": 0.6816, "step": 13030 }, { "epoch": 6.636446085061019, "grad_norm": 0.13754673302173615, "learning_rate": 4.981474983414144e-05, "loss": 0.6773, "step": 13040 }, { "epoch": 6.641537653741508, "grad_norm": 0.15316608548164368, "learning_rate": 4.9814607771130814e-05, "loss": 0.6765, "step": 13050 }, { "epoch": 6.646629222421995, "grad_norm": 0.136078342795372, "learning_rate": 4.981446570812018e-05, "loss": 0.6767, "step": 13060 }, { "epoch": 6.651720791102484, "grad_norm": 0.12898576259613037, "learning_rate": 4.9814323645109554e-05, "loss": 0.6786, "step": 13070 }, { "epoch": 6.656812359782972, "grad_norm": 0.11854422837495804, "learning_rate": 4.981418158209893e-05, "loss": 0.6806, "step": 13080 }, { "epoch": 6.66190392846346, "grad_norm": 0.1517888456583023, "learning_rate": 4.98140395190883e-05, "loss": 0.6829, "step": 13090 }, { "epoch": 6.666995497143948, "grad_norm": 0.1091533899307251, "learning_rate": 4.9813897456077666e-05, "loss": 0.6774, "step": 13100 }, { "epoch": 6.672087065824436, "grad_norm": 0.13526228070259094, "learning_rate": 4.981375539306704e-05, "loss": 0.6747, "step": 13110 }, { "epoch": 6.677178634504925, "grad_norm": 0.144491046667099, "learning_rate": 4.981361333005641e-05, "loss": 0.6787, "step": 13120 }, { "epoch": 6.682270203185412, "grad_norm": 0.16958777606487274, "learning_rate": 4.9813471267045786e-05, "loss": 0.6744, "step": 13130 }, { "epoch": 6.687361771865901, "grad_norm": 0.14115367829799652, "learning_rate": 4.981332920403516e-05, "loss": 0.6791, "step": 13140 }, { "epoch": 6.692453340546389, "grad_norm": 0.11081673204898834, "learning_rate": 4.981318714102453e-05, "loss": 0.6795, "step": 13150 }, { "epoch": 6.6975449092268775, "grad_norm": 0.14843027293682098, "learning_rate": 4.9813045078013905e-05, "loss": 0.6807, "step": 13160 }, { "epoch": 6.702636477907365, "grad_norm": 0.12543180584907532, "learning_rate": 4.981290301500328e-05, "loss": 0.6778, "step": 13170 }, { "epoch": 6.707728046587853, "grad_norm": 0.13169404864311218, "learning_rate": 4.981276095199265e-05, "loss": 0.675, "step": 13180 }, { "epoch": 6.712819615268342, "grad_norm": 0.15343239903450012, "learning_rate": 4.9812618888982024e-05, "loss": 0.6819, "step": 13190 }, { "epoch": 6.7179111839488295, "grad_norm": 0.13029424846172333, "learning_rate": 4.981247682597139e-05, "loss": 0.6778, "step": 13200 }, { "epoch": 6.723002752629318, "grad_norm": 0.11084284633398056, "learning_rate": 4.9812334762960764e-05, "loss": 0.6824, "step": 13210 }, { "epoch": 6.728094321309806, "grad_norm": 0.11253423988819122, "learning_rate": 4.981219269995014e-05, "loss": 0.6798, "step": 13220 }, { "epoch": 6.7331858899902945, "grad_norm": 0.1311793029308319, "learning_rate": 4.981205063693951e-05, "loss": 0.6814, "step": 13230 }, { "epoch": 6.738277458670782, "grad_norm": 0.12919209897518158, "learning_rate": 4.981190857392888e-05, "loss": 0.6768, "step": 13240 }, { "epoch": 6.743369027351271, "grad_norm": 0.12355062365531921, "learning_rate": 4.9811766510918256e-05, "loss": 0.6799, "step": 13250 }, { "epoch": 6.748460596031759, "grad_norm": 0.1338970810174942, "learning_rate": 4.981162444790763e-05, "loss": 0.6771, "step": 13260 }, { "epoch": 6.7535521647122465, "grad_norm": 0.14117179811000824, "learning_rate": 4.9811482384897e-05, "loss": 0.6799, "step": 13270 }, { "epoch": 6.758643733392735, "grad_norm": 0.1848529875278473, "learning_rate": 4.9811340321886375e-05, "loss": 0.6755, "step": 13280 }, { "epoch": 6.763735302073223, "grad_norm": 0.1720336526632309, "learning_rate": 4.981119825887575e-05, "loss": 0.67, "step": 13290 }, { "epoch": 6.768826870753712, "grad_norm": 0.1607787162065506, "learning_rate": 4.981105619586512e-05, "loss": 0.6827, "step": 13300 }, { "epoch": 6.773918439434199, "grad_norm": 0.14998158812522888, "learning_rate": 4.981091413285449e-05, "loss": 0.6759, "step": 13310 }, { "epoch": 6.779010008114687, "grad_norm": 0.11763730645179749, "learning_rate": 4.981077206984386e-05, "loss": 0.6747, "step": 13320 }, { "epoch": 6.784101576795176, "grad_norm": 0.12859204411506653, "learning_rate": 4.9810630006833234e-05, "loss": 0.6785, "step": 13330 }, { "epoch": 6.7891931454756635, "grad_norm": 0.12227821350097656, "learning_rate": 4.98104879438226e-05, "loss": 0.6794, "step": 13340 }, { "epoch": 6.794284714156152, "grad_norm": 0.11308576911687851, "learning_rate": 4.9810345880811974e-05, "loss": 0.6777, "step": 13350 }, { "epoch": 6.79937628283664, "grad_norm": 0.12252433598041534, "learning_rate": 4.981020381780135e-05, "loss": 0.6778, "step": 13360 }, { "epoch": 6.804467851517129, "grad_norm": 0.11951456218957901, "learning_rate": 4.981006175479072e-05, "loss": 0.6778, "step": 13370 }, { "epoch": 6.809559420197616, "grad_norm": 0.13758736848831177, "learning_rate": 4.980991969178009e-05, "loss": 0.6757, "step": 13380 }, { "epoch": 6.814650988878105, "grad_norm": 0.15930655598640442, "learning_rate": 4.9809777628769466e-05, "loss": 0.675, "step": 13390 }, { "epoch": 6.819742557558593, "grad_norm": 0.16790159046649933, "learning_rate": 4.980963556575884e-05, "loss": 0.6685, "step": 13400 }, { "epoch": 6.824834126239081, "grad_norm": 0.1681044101715088, "learning_rate": 4.980949350274821e-05, "loss": 0.683, "step": 13410 }, { "epoch": 6.829925694919569, "grad_norm": 0.1336173415184021, "learning_rate": 4.9809351439737585e-05, "loss": 0.6746, "step": 13420 }, { "epoch": 6.835017263600057, "grad_norm": 0.11793011426925659, "learning_rate": 4.980920937672696e-05, "loss": 0.6789, "step": 13430 }, { "epoch": 6.840108832280546, "grad_norm": 0.14056985080242157, "learning_rate": 4.980906731371633e-05, "loss": 0.6797, "step": 13440 }, { "epoch": 6.845200400961033, "grad_norm": 0.11312086880207062, "learning_rate": 4.9808925250705705e-05, "loss": 0.6777, "step": 13450 }, { "epoch": 6.850291969641522, "grad_norm": 0.14550986886024475, "learning_rate": 4.980878318769507e-05, "loss": 0.6792, "step": 13460 }, { "epoch": 6.85538353832201, "grad_norm": 0.13276565074920654, "learning_rate": 4.9808641124684444e-05, "loss": 0.6797, "step": 13470 }, { "epoch": 6.8604751070024985, "grad_norm": 0.1404767632484436, "learning_rate": 4.980849906167382e-05, "loss": 0.6767, "step": 13480 }, { "epoch": 6.865566675682986, "grad_norm": 0.11344119906425476, "learning_rate": 4.980835699866319e-05, "loss": 0.6779, "step": 13490 }, { "epoch": 6.870658244363474, "grad_norm": 0.18248707056045532, "learning_rate": 4.9808214935652563e-05, "loss": 0.6819, "step": 13500 }, { "epoch": 6.875749813043963, "grad_norm": 0.13696008920669556, "learning_rate": 4.9808072872641937e-05, "loss": 0.6789, "step": 13510 }, { "epoch": 6.8808413817244505, "grad_norm": 0.1089053824543953, "learning_rate": 4.98079308096313e-05, "loss": 0.6833, "step": 13520 }, { "epoch": 6.885932950404939, "grad_norm": 0.13730046153068542, "learning_rate": 4.9807788746620676e-05, "loss": 0.685, "step": 13530 }, { "epoch": 6.891024519085427, "grad_norm": 0.11708593368530273, "learning_rate": 4.980764668361005e-05, "loss": 0.6797, "step": 13540 }, { "epoch": 6.896116087765915, "grad_norm": 0.14479976892471313, "learning_rate": 4.980750462059942e-05, "loss": 0.6779, "step": 13550 }, { "epoch": 6.901207656446403, "grad_norm": 0.13402192294597626, "learning_rate": 4.9807362557588795e-05, "loss": 0.6775, "step": 13560 }, { "epoch": 6.906299225126891, "grad_norm": 0.1378648430109024, "learning_rate": 4.980722049457817e-05, "loss": 0.6799, "step": 13570 }, { "epoch": 6.91139079380738, "grad_norm": 0.1424325555562973, "learning_rate": 4.980707843156754e-05, "loss": 0.6777, "step": 13580 }, { "epoch": 6.9164823624878675, "grad_norm": 0.12795968353748322, "learning_rate": 4.9806936368556915e-05, "loss": 0.6756, "step": 13590 }, { "epoch": 6.921573931168356, "grad_norm": 0.16961532831192017, "learning_rate": 4.980679430554628e-05, "loss": 0.6762, "step": 13600 }, { "epoch": 6.926665499848844, "grad_norm": 0.16084560751914978, "learning_rate": 4.9806652242535654e-05, "loss": 0.6783, "step": 13610 }, { "epoch": 6.931757068529333, "grad_norm": 0.1510113775730133, "learning_rate": 4.980651017952503e-05, "loss": 0.676, "step": 13620 }, { "epoch": 6.93684863720982, "grad_norm": 0.1436864286661148, "learning_rate": 4.98063681165144e-05, "loss": 0.6769, "step": 13630 }, { "epoch": 6.941940205890308, "grad_norm": 0.14651361107826233, "learning_rate": 4.980622605350377e-05, "loss": 0.6786, "step": 13640 }, { "epoch": 6.947031774570797, "grad_norm": 0.12080514430999756, "learning_rate": 4.9806083990493146e-05, "loss": 0.6719, "step": 13650 }, { "epoch": 6.952123343251285, "grad_norm": 0.18036852777004242, "learning_rate": 4.980594192748252e-05, "loss": 0.6776, "step": 13660 }, { "epoch": 6.957214911931773, "grad_norm": 0.15538708865642548, "learning_rate": 4.980579986447189e-05, "loss": 0.677, "step": 13670 }, { "epoch": 6.962306480612261, "grad_norm": 0.14524763822555542, "learning_rate": 4.9805657801461266e-05, "loss": 0.6725, "step": 13680 }, { "epoch": 6.96739804929275, "grad_norm": 0.13171471655368805, "learning_rate": 4.980551573845064e-05, "loss": 0.6814, "step": 13690 }, { "epoch": 6.972489617973237, "grad_norm": 0.14730645716190338, "learning_rate": 4.980537367544001e-05, "loss": 0.6828, "step": 13700 }, { "epoch": 6.977581186653726, "grad_norm": 0.1142466589808464, "learning_rate": 4.980523161242938e-05, "loss": 0.677, "step": 13710 }, { "epoch": 6.982672755334214, "grad_norm": 0.11980883777141571, "learning_rate": 4.980508954941875e-05, "loss": 0.6847, "step": 13720 }, { "epoch": 6.987764324014702, "grad_norm": 0.10882198065519333, "learning_rate": 4.9804947486408125e-05, "loss": 0.6749, "step": 13730 }, { "epoch": 6.99285589269519, "grad_norm": 0.1418180912733078, "learning_rate": 4.980480542339749e-05, "loss": 0.675, "step": 13740 } ], "logging_steps": 10, "max_steps": 13748, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }