diff --git "a/checkpoint-42462/trainer_state.json" "b/checkpoint-42462/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-42462/trainer_state.json" @@ -0,0 +1,29756 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 42462, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00023550468654326222, + "grad_norm": 3.2309679985046387, + "learning_rate": 0.00019995760915642223, + "loss": 3.1368, + "step": 10 + }, + { + "epoch": 0.00047100937308652445, + "grad_norm": 2.1174814701080322, + "learning_rate": 0.00019991050821911358, + "loss": 2.6966, + "step": 20 + }, + { + "epoch": 0.0007065140596297867, + "grad_norm": 2.325835704803467, + "learning_rate": 0.00019986340728180493, + "loss": 2.4996, + "step": 30 + }, + { + "epoch": 0.0009420187461730489, + "grad_norm": 1.9865907430648804, + "learning_rate": 0.00019981630634449625, + "loss": 2.7315, + "step": 40 + }, + { + "epoch": 0.0011775234327163111, + "grad_norm": 1.900598406791687, + "learning_rate": 0.00019976920540718763, + "loss": 2.6127, + "step": 50 + }, + { + "epoch": 0.0014130281192595733, + "grad_norm": 1.5695827007293701, + "learning_rate": 0.00019972210446987895, + "loss": 2.5491, + "step": 60 + }, + { + "epoch": 0.0016485328058028356, + "grad_norm": 2.3850433826446533, + "learning_rate": 0.0001996750035325703, + "loss": 2.5412, + "step": 70 + }, + { + "epoch": 0.0018840374923460978, + "grad_norm": 2.0210142135620117, + "learning_rate": 0.00019962790259526165, + "loss": 2.485, + "step": 80 + }, + { + "epoch": 0.00211954217888936, + "grad_norm": 1.617475152015686, + "learning_rate": 0.000199580801657953, + "loss": 2.5976, + "step": 90 + }, + { + "epoch": 0.0023550468654326222, + "grad_norm": 1.6759183406829834, + "learning_rate": 0.00019953370072064435, + "loss": 2.3563, + "step": 100 + }, + { + "epoch": 0.0025905515519758842, + "grad_norm": 1.495890498161316, + "learning_rate": 0.0001994865997833357, + "loss": 2.6193, + "step": 110 + }, + { + "epoch": 0.0028260562385191467, + "grad_norm": 2.131957769393921, + "learning_rate": 0.00019943949884602705, + "loss": 2.5216, + "step": 120 + }, + { + "epoch": 0.0030615609250624087, + "grad_norm": 1.818166732788086, + "learning_rate": 0.0001993923979087184, + "loss": 2.4632, + "step": 130 + }, + { + "epoch": 0.003297065611605671, + "grad_norm": 1.854960560798645, + "learning_rate": 0.00019934529697140973, + "loss": 2.4897, + "step": 140 + }, + { + "epoch": 0.003532570298148933, + "grad_norm": 1.624773383140564, + "learning_rate": 0.0001992981960341011, + "loss": 2.614, + "step": 150 + }, + { + "epoch": 0.0037680749846921956, + "grad_norm": 1.9114423990249634, + "learning_rate": 0.00019925109509679243, + "loss": 2.1351, + "step": 160 + }, + { + "epoch": 0.004003579671235458, + "grad_norm": 1.608393669128418, + "learning_rate": 0.00019920399415948378, + "loss": 2.5292, + "step": 170 + }, + { + "epoch": 0.00423908435777872, + "grad_norm": 1.6805859804153442, + "learning_rate": 0.00019915689322217513, + "loss": 2.5797, + "step": 180 + }, + { + "epoch": 0.004474589044321982, + "grad_norm": 1.54912269115448, + "learning_rate": 0.00019910979228486648, + "loss": 2.6075, + "step": 190 + }, + { + "epoch": 0.0047100937308652445, + "grad_norm": 1.4087010622024536, + "learning_rate": 0.00019906269134755783, + "loss": 2.3585, + "step": 200 + }, + { + "epoch": 0.004945598417408506, + "grad_norm": 1.7299461364746094, + "learning_rate": 0.00019901559041024918, + "loss": 2.3914, + "step": 210 + }, + { + "epoch": 0.0051811031039517685, + "grad_norm": 1.4757139682769775, + "learning_rate": 0.0001989684894729405, + "loss": 2.3994, + "step": 220 + }, + { + "epoch": 0.005416607790495031, + "grad_norm": 1.5432946681976318, + "learning_rate": 0.00019892138853563188, + "loss": 2.5407, + "step": 230 + }, + { + "epoch": 0.005652112477038293, + "grad_norm": 1.5481261014938354, + "learning_rate": 0.0001988742875983232, + "loss": 2.4017, + "step": 240 + }, + { + "epoch": 0.005887617163581555, + "grad_norm": 1.5139665603637695, + "learning_rate": 0.00019882718666101455, + "loss": 2.2548, + "step": 250 + }, + { + "epoch": 0.006123121850124817, + "grad_norm": 1.4921780824661255, + "learning_rate": 0.00019878008572370593, + "loss": 2.293, + "step": 260 + }, + { + "epoch": 0.00635862653666808, + "grad_norm": 1.9517822265625, + "learning_rate": 0.00019873298478639725, + "loss": 2.3931, + "step": 270 + }, + { + "epoch": 0.006594131223211342, + "grad_norm": 1.6674144268035889, + "learning_rate": 0.00019868588384908863, + "loss": 2.6797, + "step": 280 + }, + { + "epoch": 0.006829635909754604, + "grad_norm": 1.8776614665985107, + "learning_rate": 0.00019863878291177995, + "loss": 2.3755, + "step": 290 + }, + { + "epoch": 0.007065140596297866, + "grad_norm": 1.704877257347107, + "learning_rate": 0.0001985916819744713, + "loss": 2.577, + "step": 300 + }, + { + "epoch": 0.007300645282841129, + "grad_norm": 1.7447631359100342, + "learning_rate": 0.00019854458103716265, + "loss": 2.6707, + "step": 310 + }, + { + "epoch": 0.007536149969384391, + "grad_norm": 1.7486447095870972, + "learning_rate": 0.000198497480099854, + "loss": 2.4179, + "step": 320 + }, + { + "epoch": 0.007771654655927653, + "grad_norm": 1.5924261808395386, + "learning_rate": 0.00019845037916254535, + "loss": 2.3293, + "step": 330 + }, + { + "epoch": 0.008007159342470916, + "grad_norm": 2.1535685062408447, + "learning_rate": 0.0001984032782252367, + "loss": 2.4561, + "step": 340 + }, + { + "epoch": 0.008242664029014177, + "grad_norm": 1.8906278610229492, + "learning_rate": 0.00019835617728792803, + "loss": 2.5716, + "step": 350 + }, + { + "epoch": 0.00847816871555744, + "grad_norm": 2.237260580062866, + "learning_rate": 0.0001983090763506194, + "loss": 2.4573, + "step": 360 + }, + { + "epoch": 0.008713673402100702, + "grad_norm": 1.506896734237671, + "learning_rate": 0.00019826197541331073, + "loss": 2.4239, + "step": 370 + }, + { + "epoch": 0.008949178088643964, + "grad_norm": 1.8521124124526978, + "learning_rate": 0.00019821487447600208, + "loss": 2.518, + "step": 380 + }, + { + "epoch": 0.009184682775187226, + "grad_norm": 1.684022307395935, + "learning_rate": 0.00019816777353869343, + "loss": 2.2849, + "step": 390 + }, + { + "epoch": 0.009420187461730489, + "grad_norm": 1.6935802698135376, + "learning_rate": 0.00019812067260138478, + "loss": 2.5127, + "step": 400 + }, + { + "epoch": 0.009655692148273751, + "grad_norm": 2.0532476902008057, + "learning_rate": 0.00019807357166407613, + "loss": 2.2883, + "step": 410 + }, + { + "epoch": 0.009891196834817012, + "grad_norm": 1.579057216644287, + "learning_rate": 0.00019802647072676748, + "loss": 2.3682, + "step": 420 + }, + { + "epoch": 0.010126701521360275, + "grad_norm": 1.471557855606079, + "learning_rate": 0.0001979793697894588, + "loss": 2.4559, + "step": 430 + }, + { + "epoch": 0.010362206207903537, + "grad_norm": 1.4747744798660278, + "learning_rate": 0.00019793226885215018, + "loss": 2.3266, + "step": 440 + }, + { + "epoch": 0.0105977108944468, + "grad_norm": 1.7954132556915283, + "learning_rate": 0.0001978851679148415, + "loss": 2.4466, + "step": 450 + }, + { + "epoch": 0.010833215580990062, + "grad_norm": 1.8787152767181396, + "learning_rate": 0.00019783806697753285, + "loss": 2.2744, + "step": 460 + }, + { + "epoch": 0.011068720267533324, + "grad_norm": 2.769911527633667, + "learning_rate": 0.0001977909660402242, + "loss": 2.4138, + "step": 470 + }, + { + "epoch": 0.011304224954076587, + "grad_norm": 1.7480900287628174, + "learning_rate": 0.00019774386510291555, + "loss": 2.3967, + "step": 480 + }, + { + "epoch": 0.01153972964061985, + "grad_norm": 1.5646064281463623, + "learning_rate": 0.0001976967641656069, + "loss": 2.3035, + "step": 490 + }, + { + "epoch": 0.01177523432716311, + "grad_norm": 2.1122241020202637, + "learning_rate": 0.00019764966322829825, + "loss": 2.4634, + "step": 500 + }, + { + "epoch": 0.012010739013706372, + "grad_norm": 1.5615967512130737, + "learning_rate": 0.00019760256229098958, + "loss": 2.4727, + "step": 510 + }, + { + "epoch": 0.012246243700249635, + "grad_norm": 1.6492537260055542, + "learning_rate": 0.00019755546135368095, + "loss": 2.309, + "step": 520 + }, + { + "epoch": 0.012481748386792897, + "grad_norm": 1.8484909534454346, + "learning_rate": 0.00019750836041637228, + "loss": 2.5138, + "step": 530 + }, + { + "epoch": 0.01271725307333616, + "grad_norm": 1.2929155826568604, + "learning_rate": 0.00019746125947906365, + "loss": 2.1399, + "step": 540 + }, + { + "epoch": 0.012952757759879422, + "grad_norm": 2.6417770385742188, + "learning_rate": 0.000197414158541755, + "loss": 2.5062, + "step": 550 + }, + { + "epoch": 0.013188262446422684, + "grad_norm": 1.4375731945037842, + "learning_rate": 0.00019736705760444633, + "loss": 2.218, + "step": 560 + }, + { + "epoch": 0.013423767132965945, + "grad_norm": 1.6705974340438843, + "learning_rate": 0.0001973199566671377, + "loss": 2.1678, + "step": 570 + }, + { + "epoch": 0.013659271819509208, + "grad_norm": 1.7592695951461792, + "learning_rate": 0.00019727285572982903, + "loss": 2.1822, + "step": 580 + }, + { + "epoch": 0.01389477650605247, + "grad_norm": 1.7886682748794556, + "learning_rate": 0.00019722575479252038, + "loss": 2.3795, + "step": 590 + }, + { + "epoch": 0.014130281192595733, + "grad_norm": 1.9300228357315063, + "learning_rate": 0.00019717865385521173, + "loss": 2.3488, + "step": 600 + }, + { + "epoch": 0.014365785879138995, + "grad_norm": 1.589822769165039, + "learning_rate": 0.00019713155291790308, + "loss": 2.4307, + "step": 610 + }, + { + "epoch": 0.014601290565682257, + "grad_norm": 1.503128170967102, + "learning_rate": 0.00019708445198059443, + "loss": 2.277, + "step": 620 + }, + { + "epoch": 0.01483679525222552, + "grad_norm": 1.7925318479537964, + "learning_rate": 0.00019703735104328578, + "loss": 2.3734, + "step": 630 + }, + { + "epoch": 0.015072299938768782, + "grad_norm": 1.6709870100021362, + "learning_rate": 0.0001969902501059771, + "loss": 2.1709, + "step": 640 + }, + { + "epoch": 0.015307804625312043, + "grad_norm": 1.6741284132003784, + "learning_rate": 0.00019694314916866848, + "loss": 2.2209, + "step": 650 + }, + { + "epoch": 0.015543309311855305, + "grad_norm": 1.6825313568115234, + "learning_rate": 0.0001968960482313598, + "loss": 2.255, + "step": 660 + }, + { + "epoch": 0.015778813998398568, + "grad_norm": 1.544534683227539, + "learning_rate": 0.00019684894729405115, + "loss": 2.3872, + "step": 670 + }, + { + "epoch": 0.016014318684941832, + "grad_norm": 2.33375883102417, + "learning_rate": 0.0001968018463567425, + "loss": 2.4629, + "step": 680 + }, + { + "epoch": 0.016249823371485093, + "grad_norm": 1.4496179819107056, + "learning_rate": 0.00019675474541943385, + "loss": 2.3638, + "step": 690 + }, + { + "epoch": 0.016485328058028353, + "grad_norm": 1.7347285747528076, + "learning_rate": 0.0001967076444821252, + "loss": 2.4611, + "step": 700 + }, + { + "epoch": 0.016720832744571618, + "grad_norm": 1.5036282539367676, + "learning_rate": 0.00019666054354481655, + "loss": 2.3117, + "step": 710 + }, + { + "epoch": 0.01695633743111488, + "grad_norm": 2.0738086700439453, + "learning_rate": 0.0001966134426075079, + "loss": 2.3374, + "step": 720 + }, + { + "epoch": 0.017191842117658142, + "grad_norm": 2.5572900772094727, + "learning_rate": 0.00019656634167019926, + "loss": 2.3246, + "step": 730 + }, + { + "epoch": 0.017427346804201403, + "grad_norm": 1.6567786931991577, + "learning_rate": 0.00019651924073289058, + "loss": 2.4771, + "step": 740 + }, + { + "epoch": 0.017662851490744667, + "grad_norm": 2.6085476875305176, + "learning_rate": 0.00019647213979558196, + "loss": 2.3132, + "step": 750 + }, + { + "epoch": 0.017898356177287928, + "grad_norm": 1.939334511756897, + "learning_rate": 0.00019642503885827328, + "loss": 2.4239, + "step": 760 + }, + { + "epoch": 0.01813386086383119, + "grad_norm": 1.840398907661438, + "learning_rate": 0.00019637793792096463, + "loss": 2.3915, + "step": 770 + }, + { + "epoch": 0.018369365550374453, + "grad_norm": 1.8545604944229126, + "learning_rate": 0.00019633083698365598, + "loss": 2.3342, + "step": 780 + }, + { + "epoch": 0.018604870236917714, + "grad_norm": 1.8592214584350586, + "learning_rate": 0.00019628373604634733, + "loss": 2.4354, + "step": 790 + }, + { + "epoch": 0.018840374923460978, + "grad_norm": 1.6031461954116821, + "learning_rate": 0.00019623663510903868, + "loss": 2.3723, + "step": 800 + }, + { + "epoch": 0.01907587961000424, + "grad_norm": 2.243943214416504, + "learning_rate": 0.00019618953417173003, + "loss": 2.5432, + "step": 810 + }, + { + "epoch": 0.019311384296547503, + "grad_norm": 1.997883677482605, + "learning_rate": 0.00019614243323442135, + "loss": 2.2411, + "step": 820 + }, + { + "epoch": 0.019546888983090763, + "grad_norm": 1.7321577072143555, + "learning_rate": 0.00019609533229711273, + "loss": 2.2699, + "step": 830 + }, + { + "epoch": 0.019782393669634024, + "grad_norm": 1.544219732284546, + "learning_rate": 0.00019604823135980408, + "loss": 2.265, + "step": 840 + }, + { + "epoch": 0.02001789835617729, + "grad_norm": 1.6504725217819214, + "learning_rate": 0.0001960011304224954, + "loss": 2.4386, + "step": 850 + }, + { + "epoch": 0.02025340304272055, + "grad_norm": 2.053715705871582, + "learning_rate": 0.00019595402948518678, + "loss": 2.2794, + "step": 860 + }, + { + "epoch": 0.020488907729263813, + "grad_norm": 1.469041347503662, + "learning_rate": 0.0001959069285478781, + "loss": 2.4016, + "step": 870 + }, + { + "epoch": 0.020724412415807074, + "grad_norm": 1.7059731483459473, + "learning_rate": 0.00019585982761056948, + "loss": 2.3508, + "step": 880 + }, + { + "epoch": 0.020959917102350338, + "grad_norm": 1.9896105527877808, + "learning_rate": 0.0001958127266732608, + "loss": 2.3903, + "step": 890 + }, + { + "epoch": 0.0211954217888936, + "grad_norm": 2.041901111602783, + "learning_rate": 0.00019576562573595216, + "loss": 2.4199, + "step": 900 + }, + { + "epoch": 0.02143092647543686, + "grad_norm": 1.4538580179214478, + "learning_rate": 0.0001957185247986435, + "loss": 2.2481, + "step": 910 + }, + { + "epoch": 0.021666431161980124, + "grad_norm": 2.297203302383423, + "learning_rate": 0.00019567142386133486, + "loss": 2.3375, + "step": 920 + }, + { + "epoch": 0.021901935848523384, + "grad_norm": 2.172999143600464, + "learning_rate": 0.0001956243229240262, + "loss": 2.223, + "step": 930 + }, + { + "epoch": 0.02213744053506665, + "grad_norm": 1.5507631301879883, + "learning_rate": 0.00019557722198671756, + "loss": 2.2435, + "step": 940 + }, + { + "epoch": 0.02237294522160991, + "grad_norm": 2.088479518890381, + "learning_rate": 0.00019553012104940888, + "loss": 2.1769, + "step": 950 + }, + { + "epoch": 0.022608449908153173, + "grad_norm": 1.3809598684310913, + "learning_rate": 0.00019548302011210026, + "loss": 2.3431, + "step": 960 + }, + { + "epoch": 0.022843954594696434, + "grad_norm": 1.5656312704086304, + "learning_rate": 0.00019543591917479158, + "loss": 2.2484, + "step": 970 + }, + { + "epoch": 0.0230794592812397, + "grad_norm": 1.9035502672195435, + "learning_rate": 0.00019538881823748293, + "loss": 2.1907, + "step": 980 + }, + { + "epoch": 0.02331496396778296, + "grad_norm": 2.3373143672943115, + "learning_rate": 0.00019534171730017428, + "loss": 2.2871, + "step": 990 + }, + { + "epoch": 0.02355046865432622, + "grad_norm": 1.9408587217330933, + "learning_rate": 0.00019529461636286563, + "loss": 1.9912, + "step": 1000 + }, + { + "epoch": 0.023785973340869484, + "grad_norm": 2.7549495697021484, + "learning_rate": 0.00019524751542555698, + "loss": 2.4807, + "step": 1010 + }, + { + "epoch": 0.024021478027412745, + "grad_norm": 1.6193737983703613, + "learning_rate": 0.00019520041448824833, + "loss": 2.2551, + "step": 1020 + }, + { + "epoch": 0.02425698271395601, + "grad_norm": 2.092519760131836, + "learning_rate": 0.00019515331355093965, + "loss": 2.0965, + "step": 1030 + }, + { + "epoch": 0.02449248740049927, + "grad_norm": 2.3660218715667725, + "learning_rate": 0.00019510621261363103, + "loss": 2.3436, + "step": 1040 + }, + { + "epoch": 0.024727992087042534, + "grad_norm": 2.076950788497925, + "learning_rate": 0.00019505911167632235, + "loss": 2.0685, + "step": 1050 + }, + { + "epoch": 0.024963496773585794, + "grad_norm": 1.881016731262207, + "learning_rate": 0.0001950120107390137, + "loss": 2.524, + "step": 1060 + }, + { + "epoch": 0.025199001460129055, + "grad_norm": 1.6038298606872559, + "learning_rate": 0.00019496490980170506, + "loss": 2.3772, + "step": 1070 + }, + { + "epoch": 0.02543450614667232, + "grad_norm": 1.5274938344955444, + "learning_rate": 0.0001949178088643964, + "loss": 2.3456, + "step": 1080 + }, + { + "epoch": 0.02567001083321558, + "grad_norm": 1.5426788330078125, + "learning_rate": 0.00019487070792708776, + "loss": 2.4562, + "step": 1090 + }, + { + "epoch": 0.025905515519758844, + "grad_norm": 1.9040606021881104, + "learning_rate": 0.0001948236069897791, + "loss": 2.2722, + "step": 1100 + }, + { + "epoch": 0.026141020206302105, + "grad_norm": 2.3787999153137207, + "learning_rate": 0.00019477650605247046, + "loss": 2.4295, + "step": 1110 + }, + { + "epoch": 0.02637652489284537, + "grad_norm": 1.6062979698181152, + "learning_rate": 0.0001947294051151618, + "loss": 2.3269, + "step": 1120 + }, + { + "epoch": 0.02661202957938863, + "grad_norm": 1.6376535892486572, + "learning_rate": 0.00019468230417785316, + "loss": 2.3202, + "step": 1130 + }, + { + "epoch": 0.02684753426593189, + "grad_norm": 1.78897225856781, + "learning_rate": 0.0001946352032405445, + "loss": 2.4023, + "step": 1140 + }, + { + "epoch": 0.027083038952475155, + "grad_norm": 1.784119963645935, + "learning_rate": 0.00019458810230323586, + "loss": 2.177, + "step": 1150 + }, + { + "epoch": 0.027318543639018415, + "grad_norm": 2.4228804111480713, + "learning_rate": 0.00019454100136592718, + "loss": 2.4059, + "step": 1160 + }, + { + "epoch": 0.02755404832556168, + "grad_norm": 2.120675802230835, + "learning_rate": 0.00019449390042861856, + "loss": 2.2732, + "step": 1170 + }, + { + "epoch": 0.02778955301210494, + "grad_norm": 1.370073676109314, + "learning_rate": 0.00019444679949130988, + "loss": 2.2269, + "step": 1180 + }, + { + "epoch": 0.028025057698648204, + "grad_norm": 1.6620279550552368, + "learning_rate": 0.00019439969855400123, + "loss": 2.2124, + "step": 1190 + }, + { + "epoch": 0.028260562385191465, + "grad_norm": 2.0860698223114014, + "learning_rate": 0.00019435259761669258, + "loss": 2.1872, + "step": 1200 + }, + { + "epoch": 0.02849606707173473, + "grad_norm": 1.944350004196167, + "learning_rate": 0.00019430549667938393, + "loss": 2.3289, + "step": 1210 + }, + { + "epoch": 0.02873157175827799, + "grad_norm": 1.8327244520187378, + "learning_rate": 0.00019425839574207528, + "loss": 2.2747, + "step": 1220 + }, + { + "epoch": 0.02896707644482125, + "grad_norm": 1.52091383934021, + "learning_rate": 0.00019421129480476663, + "loss": 2.3557, + "step": 1230 + }, + { + "epoch": 0.029202581131364515, + "grad_norm": 1.7364777326583862, + "learning_rate": 0.00019416419386745796, + "loss": 2.2268, + "step": 1240 + }, + { + "epoch": 0.029438085817907775, + "grad_norm": 1.7765967845916748, + "learning_rate": 0.00019411709293014933, + "loss": 2.1311, + "step": 1250 + }, + { + "epoch": 0.02967359050445104, + "grad_norm": 2.070098876953125, + "learning_rate": 0.00019406999199284066, + "loss": 2.2586, + "step": 1260 + }, + { + "epoch": 0.0299090951909943, + "grad_norm": 1.7408677339553833, + "learning_rate": 0.000194022891055532, + "loss": 2.2579, + "step": 1270 + }, + { + "epoch": 0.030144599877537565, + "grad_norm": 1.849951982498169, + "learning_rate": 0.00019397579011822336, + "loss": 2.3557, + "step": 1280 + }, + { + "epoch": 0.030380104564080825, + "grad_norm": 1.4832192659378052, + "learning_rate": 0.0001939286891809147, + "loss": 2.0548, + "step": 1290 + }, + { + "epoch": 0.030615609250624086, + "grad_norm": 2.5369913578033447, + "learning_rate": 0.00019388158824360606, + "loss": 2.2137, + "step": 1300 + }, + { + "epoch": 0.03085111393716735, + "grad_norm": 1.648578405380249, + "learning_rate": 0.0001938344873062974, + "loss": 2.4065, + "step": 1310 + }, + { + "epoch": 0.03108661862371061, + "grad_norm": 1.6784148216247559, + "learning_rate": 0.00019378738636898876, + "loss": 2.1999, + "step": 1320 + }, + { + "epoch": 0.03132212331025387, + "grad_norm": 2.132889747619629, + "learning_rate": 0.0001937402854316801, + "loss": 2.3155, + "step": 1330 + }, + { + "epoch": 0.031557627996797136, + "grad_norm": 1.9378546476364136, + "learning_rate": 0.00019369318449437143, + "loss": 2.2407, + "step": 1340 + }, + { + "epoch": 0.0317931326833404, + "grad_norm": 4.045490741729736, + "learning_rate": 0.0001936460835570628, + "loss": 2.2026, + "step": 1350 + }, + { + "epoch": 0.032028637369883664, + "grad_norm": 1.7972216606140137, + "learning_rate": 0.00019359898261975413, + "loss": 2.3747, + "step": 1360 + }, + { + "epoch": 0.03226414205642692, + "grad_norm": 1.7621222734451294, + "learning_rate": 0.00019355188168244548, + "loss": 2.1652, + "step": 1370 + }, + { + "epoch": 0.032499646742970185, + "grad_norm": 2.10644268989563, + "learning_rate": 0.00019350478074513683, + "loss": 2.4651, + "step": 1380 + }, + { + "epoch": 0.03273515142951345, + "grad_norm": 1.6137688159942627, + "learning_rate": 0.00019345767980782818, + "loss": 2.1893, + "step": 1390 + }, + { + "epoch": 0.03297065611605671, + "grad_norm": 2.142392873764038, + "learning_rate": 0.00019341057887051953, + "loss": 2.3385, + "step": 1400 + }, + { + "epoch": 0.03320616080259997, + "grad_norm": 2.0509915351867676, + "learning_rate": 0.00019336347793321088, + "loss": 2.2966, + "step": 1410 + }, + { + "epoch": 0.033441665489143235, + "grad_norm": 1.7054039239883423, + "learning_rate": 0.00019331637699590223, + "loss": 2.2535, + "step": 1420 + }, + { + "epoch": 0.0336771701756865, + "grad_norm": 1.9987514019012451, + "learning_rate": 0.00019326927605859358, + "loss": 2.2129, + "step": 1430 + }, + { + "epoch": 0.03391267486222976, + "grad_norm": 1.558950424194336, + "learning_rate": 0.00019322217512128493, + "loss": 2.3284, + "step": 1440 + }, + { + "epoch": 0.03414817954877302, + "grad_norm": 2.1738619804382324, + "learning_rate": 0.00019317507418397626, + "loss": 2.2122, + "step": 1450 + }, + { + "epoch": 0.034383684235316285, + "grad_norm": 2.3210151195526123, + "learning_rate": 0.00019312797324666763, + "loss": 2.1787, + "step": 1460 + }, + { + "epoch": 0.03461918892185954, + "grad_norm": 2.7615976333618164, + "learning_rate": 0.00019308087230935896, + "loss": 2.1883, + "step": 1470 + }, + { + "epoch": 0.034854693608402806, + "grad_norm": 1.751855731010437, + "learning_rate": 0.00019303377137205033, + "loss": 2.2706, + "step": 1480 + }, + { + "epoch": 0.03509019829494607, + "grad_norm": 1.46992027759552, + "learning_rate": 0.00019298667043474166, + "loss": 2.1759, + "step": 1490 + }, + { + "epoch": 0.035325702981489335, + "grad_norm": 1.7911906242370605, + "learning_rate": 0.000192939569497433, + "loss": 2.0725, + "step": 1500 + }, + { + "epoch": 0.03556120766803259, + "grad_norm": 1.5120116472244263, + "learning_rate": 0.00019289246856012436, + "loss": 2.3206, + "step": 1510 + }, + { + "epoch": 0.035796712354575856, + "grad_norm": 1.7404943704605103, + "learning_rate": 0.0001928453676228157, + "loss": 2.1382, + "step": 1520 + }, + { + "epoch": 0.03603221704111912, + "grad_norm": 1.7643208503723145, + "learning_rate": 0.00019279826668550706, + "loss": 2.0674, + "step": 1530 + }, + { + "epoch": 0.03626772172766238, + "grad_norm": 2.0491485595703125, + "learning_rate": 0.0001927511657481984, + "loss": 2.378, + "step": 1540 + }, + { + "epoch": 0.03650322641420564, + "grad_norm": 1.71403169631958, + "learning_rate": 0.00019270406481088973, + "loss": 2.3117, + "step": 1550 + }, + { + "epoch": 0.036738731100748906, + "grad_norm": 1.729164719581604, + "learning_rate": 0.0001926569638735811, + "loss": 2.2056, + "step": 1560 + }, + { + "epoch": 0.03697423578729217, + "grad_norm": 2.1834709644317627, + "learning_rate": 0.00019260986293627243, + "loss": 2.265, + "step": 1570 + }, + { + "epoch": 0.03720974047383543, + "grad_norm": 1.5792131423950195, + "learning_rate": 0.00019256276199896378, + "loss": 2.2569, + "step": 1580 + }, + { + "epoch": 0.03744524516037869, + "grad_norm": 2.3097856044769287, + "learning_rate": 0.00019251566106165513, + "loss": 2.2394, + "step": 1590 + }, + { + "epoch": 0.037680749846921956, + "grad_norm": 1.9641480445861816, + "learning_rate": 0.00019246856012434648, + "loss": 2.3833, + "step": 1600 + }, + { + "epoch": 0.03791625453346521, + "grad_norm": 2.1574530601501465, + "learning_rate": 0.00019242145918703783, + "loss": 2.1989, + "step": 1610 + }, + { + "epoch": 0.03815175922000848, + "grad_norm": 2.2117180824279785, + "learning_rate": 0.00019237435824972918, + "loss": 2.286, + "step": 1620 + }, + { + "epoch": 0.03838726390655174, + "grad_norm": 1.692057728767395, + "learning_rate": 0.0001923272573124205, + "loss": 2.3328, + "step": 1630 + }, + { + "epoch": 0.038622768593095005, + "grad_norm": 2.3093721866607666, + "learning_rate": 0.00019228015637511188, + "loss": 2.284, + "step": 1640 + }, + { + "epoch": 0.03885827327963826, + "grad_norm": 1.6981611251831055, + "learning_rate": 0.0001922330554378032, + "loss": 2.1517, + "step": 1650 + }, + { + "epoch": 0.03909377796618153, + "grad_norm": 2.144850730895996, + "learning_rate": 0.00019218595450049456, + "loss": 2.3897, + "step": 1660 + }, + { + "epoch": 0.03932928265272479, + "grad_norm": 1.5663037300109863, + "learning_rate": 0.00019213885356318593, + "loss": 2.2038, + "step": 1670 + }, + { + "epoch": 0.03956478733926805, + "grad_norm": 2.053711175918579, + "learning_rate": 0.00019209175262587726, + "loss": 2.1427, + "step": 1680 + }, + { + "epoch": 0.03980029202581131, + "grad_norm": 1.7935670614242554, + "learning_rate": 0.00019204465168856863, + "loss": 2.3839, + "step": 1690 + }, + { + "epoch": 0.04003579671235458, + "grad_norm": 2.1188805103302, + "learning_rate": 0.00019199755075125996, + "loss": 2.2879, + "step": 1700 + }, + { + "epoch": 0.04027130139889784, + "grad_norm": 1.8197453022003174, + "learning_rate": 0.0001919504498139513, + "loss": 2.361, + "step": 1710 + }, + { + "epoch": 0.0405068060854411, + "grad_norm": 1.9226861000061035, + "learning_rate": 0.00019190334887664266, + "loss": 2.3839, + "step": 1720 + }, + { + "epoch": 0.04074231077198436, + "grad_norm": 1.8571581840515137, + "learning_rate": 0.000191856247939334, + "loss": 2.2876, + "step": 1730 + }, + { + "epoch": 0.040977815458527626, + "grad_norm": 1.9353306293487549, + "learning_rate": 0.00019180914700202536, + "loss": 2.1383, + "step": 1740 + }, + { + "epoch": 0.041213320145070884, + "grad_norm": 2.7373709678649902, + "learning_rate": 0.0001917620460647167, + "loss": 2.1032, + "step": 1750 + }, + { + "epoch": 0.04144882483161415, + "grad_norm": 1.6866846084594727, + "learning_rate": 0.00019171494512740803, + "loss": 2.3656, + "step": 1760 + }, + { + "epoch": 0.04168432951815741, + "grad_norm": 1.7577126026153564, + "learning_rate": 0.0001916678441900994, + "loss": 2.2786, + "step": 1770 + }, + { + "epoch": 0.041919834204700676, + "grad_norm": 2.247462272644043, + "learning_rate": 0.00019162074325279073, + "loss": 2.252, + "step": 1780 + }, + { + "epoch": 0.04215533889124393, + "grad_norm": 1.7799142599105835, + "learning_rate": 0.00019157364231548208, + "loss": 2.2314, + "step": 1790 + }, + { + "epoch": 0.0423908435777872, + "grad_norm": 2.1439526081085205, + "learning_rate": 0.00019152654137817343, + "loss": 2.4094, + "step": 1800 + }, + { + "epoch": 0.04262634826433046, + "grad_norm": 2.395840644836426, + "learning_rate": 0.00019147944044086478, + "loss": 2.1436, + "step": 1810 + }, + { + "epoch": 0.04286185295087372, + "grad_norm": 1.6308691501617432, + "learning_rate": 0.00019143233950355613, + "loss": 2.2567, + "step": 1820 + }, + { + "epoch": 0.04309735763741698, + "grad_norm": 2.162234306335449, + "learning_rate": 0.00019138523856624748, + "loss": 2.3629, + "step": 1830 + }, + { + "epoch": 0.04333286232396025, + "grad_norm": 2.5992627143859863, + "learning_rate": 0.0001913381376289388, + "loss": 2.192, + "step": 1840 + }, + { + "epoch": 0.04356836701050351, + "grad_norm": 1.7631303071975708, + "learning_rate": 0.00019129103669163018, + "loss": 2.2224, + "step": 1850 + }, + { + "epoch": 0.04380387169704677, + "grad_norm": 1.678625464439392, + "learning_rate": 0.0001912439357543215, + "loss": 2.2322, + "step": 1860 + }, + { + "epoch": 0.04403937638359003, + "grad_norm": 2.3097527027130127, + "learning_rate": 0.00019119683481701286, + "loss": 2.3262, + "step": 1870 + }, + { + "epoch": 0.0442748810701333, + "grad_norm": 1.3386963605880737, + "learning_rate": 0.0001911497338797042, + "loss": 2.1346, + "step": 1880 + }, + { + "epoch": 0.04451038575667656, + "grad_norm": 2.4035725593566895, + "learning_rate": 0.00019110263294239556, + "loss": 2.2836, + "step": 1890 + }, + { + "epoch": 0.04474589044321982, + "grad_norm": 1.9946550130844116, + "learning_rate": 0.0001910555320050869, + "loss": 2.2045, + "step": 1900 + }, + { + "epoch": 0.04498139512976308, + "grad_norm": 1.757352590560913, + "learning_rate": 0.00019100843106777826, + "loss": 1.9896, + "step": 1910 + }, + { + "epoch": 0.04521689981630635, + "grad_norm": 1.8178822994232178, + "learning_rate": 0.0001909613301304696, + "loss": 2.2243, + "step": 1920 + }, + { + "epoch": 0.045452404502849604, + "grad_norm": 2.1335227489471436, + "learning_rate": 0.00019091422919316096, + "loss": 2.4226, + "step": 1930 + }, + { + "epoch": 0.04568790918939287, + "grad_norm": 2.0507328510284424, + "learning_rate": 0.00019086712825585228, + "loss": 2.3667, + "step": 1940 + }, + { + "epoch": 0.04592341387593613, + "grad_norm": 2.5613133907318115, + "learning_rate": 0.00019082002731854366, + "loss": 2.4204, + "step": 1950 + }, + { + "epoch": 0.0461589185624794, + "grad_norm": 2.1812310218811035, + "learning_rate": 0.000190772926381235, + "loss": 2.1989, + "step": 1960 + }, + { + "epoch": 0.046394423249022654, + "grad_norm": 2.316821336746216, + "learning_rate": 0.00019072582544392633, + "loss": 2.4335, + "step": 1970 + }, + { + "epoch": 0.04662992793556592, + "grad_norm": 2.272609233856201, + "learning_rate": 0.0001906787245066177, + "loss": 2.2962, + "step": 1980 + }, + { + "epoch": 0.04686543262210918, + "grad_norm": 2.0263876914978027, + "learning_rate": 0.00019063162356930903, + "loss": 2.2607, + "step": 1990 + }, + { + "epoch": 0.04710093730865244, + "grad_norm": 2.9174461364746094, + "learning_rate": 0.00019058452263200038, + "loss": 2.2248, + "step": 2000 + }, + { + "epoch": 0.047336441995195704, + "grad_norm": 2.3031280040740967, + "learning_rate": 0.00019053742169469173, + "loss": 2.3064, + "step": 2010 + }, + { + "epoch": 0.04757194668173897, + "grad_norm": 1.9068163633346558, + "learning_rate": 0.00019049032075738308, + "loss": 2.1707, + "step": 2020 + }, + { + "epoch": 0.04780745136828223, + "grad_norm": 1.6343876123428345, + "learning_rate": 0.00019044321982007443, + "loss": 2.1399, + "step": 2030 + }, + { + "epoch": 0.04804295605482549, + "grad_norm": 2.0931334495544434, + "learning_rate": 0.00019039611888276578, + "loss": 2.1365, + "step": 2040 + }, + { + "epoch": 0.04827846074136875, + "grad_norm": 2.3466007709503174, + "learning_rate": 0.0001903490179454571, + "loss": 2.2243, + "step": 2050 + }, + { + "epoch": 0.04851396542791202, + "grad_norm": 1.7455843687057495, + "learning_rate": 0.00019030191700814849, + "loss": 2.2342, + "step": 2060 + }, + { + "epoch": 0.048749470114455275, + "grad_norm": 2.6780898571014404, + "learning_rate": 0.0001902548160708398, + "loss": 2.3248, + "step": 2070 + }, + { + "epoch": 0.04898497480099854, + "grad_norm": 2.0855660438537598, + "learning_rate": 0.00019020771513353119, + "loss": 2.3911, + "step": 2080 + }, + { + "epoch": 0.0492204794875418, + "grad_norm": 1.9923450946807861, + "learning_rate": 0.0001901606141962225, + "loss": 2.2859, + "step": 2090 + }, + { + "epoch": 0.04945598417408507, + "grad_norm": 1.83405601978302, + "learning_rate": 0.00019011351325891386, + "loss": 2.0618, + "step": 2100 + }, + { + "epoch": 0.049691488860628324, + "grad_norm": 2.5639121532440186, + "learning_rate": 0.0001900664123216052, + "loss": 2.3821, + "step": 2110 + }, + { + "epoch": 0.04992699354717159, + "grad_norm": 1.992323875427246, + "learning_rate": 0.00019001931138429656, + "loss": 2.2083, + "step": 2120 + }, + { + "epoch": 0.05016249823371485, + "grad_norm": 2.1780548095703125, + "learning_rate": 0.0001899722104469879, + "loss": 2.2257, + "step": 2130 + }, + { + "epoch": 0.05039800292025811, + "grad_norm": 2.254702091217041, + "learning_rate": 0.00018992510950967926, + "loss": 2.175, + "step": 2140 + }, + { + "epoch": 0.050633507606801374, + "grad_norm": 2.233217477798462, + "learning_rate": 0.00018987800857237058, + "loss": 2.2972, + "step": 2150 + }, + { + "epoch": 0.05086901229334464, + "grad_norm": 2.3001439571380615, + "learning_rate": 0.00018983090763506196, + "loss": 2.3244, + "step": 2160 + }, + { + "epoch": 0.0511045169798879, + "grad_norm": 2.1131844520568848, + "learning_rate": 0.00018978851679148415, + "loss": 2.4339, + "step": 2170 + }, + { + "epoch": 0.05134002166643116, + "grad_norm": 1.8972138166427612, + "learning_rate": 0.0001897414158541755, + "loss": 2.1409, + "step": 2180 + }, + { + "epoch": 0.051575526352974424, + "grad_norm": 2.831122875213623, + "learning_rate": 0.00018969431491686685, + "loss": 2.4679, + "step": 2190 + }, + { + "epoch": 0.05181103103951769, + "grad_norm": 1.9056183099746704, + "learning_rate": 0.0001896472139795582, + "loss": 2.2394, + "step": 2200 + }, + { + "epoch": 0.052046535726060945, + "grad_norm": 2.218238115310669, + "learning_rate": 0.00018960011304224955, + "loss": 2.0855, + "step": 2210 + }, + { + "epoch": 0.05228204041260421, + "grad_norm": 1.6970261335372925, + "learning_rate": 0.0001895530121049409, + "loss": 2.3911, + "step": 2220 + }, + { + "epoch": 0.052517545099147474, + "grad_norm": 1.9589953422546387, + "learning_rate": 0.00018950591116763225, + "loss": 2.2206, + "step": 2230 + }, + { + "epoch": 0.05275304978569074, + "grad_norm": 1.9307024478912354, + "learning_rate": 0.0001894588102303236, + "loss": 2.3854, + "step": 2240 + }, + { + "epoch": 0.052988554472233995, + "grad_norm": 1.5521360635757446, + "learning_rate": 0.00018941170929301495, + "loss": 2.3013, + "step": 2250 + }, + { + "epoch": 0.05322405915877726, + "grad_norm": 2.1857752799987793, + "learning_rate": 0.00018936460835570628, + "loss": 2.3844, + "step": 2260 + }, + { + "epoch": 0.053459563845320524, + "grad_norm": 2.2541661262512207, + "learning_rate": 0.00018931750741839765, + "loss": 2.1811, + "step": 2270 + }, + { + "epoch": 0.05369506853186378, + "grad_norm": 2.868990421295166, + "learning_rate": 0.00018927040648108898, + "loss": 2.391, + "step": 2280 + }, + { + "epoch": 0.053930573218407045, + "grad_norm": 1.7669481039047241, + "learning_rate": 0.00018922330554378033, + "loss": 2.2318, + "step": 2290 + }, + { + "epoch": 0.05416607790495031, + "grad_norm": 2.0685200691223145, + "learning_rate": 0.00018917620460647168, + "loss": 2.2587, + "step": 2300 + }, + { + "epoch": 0.05440158259149357, + "grad_norm": 2.7001943588256836, + "learning_rate": 0.00018912910366916303, + "loss": 2.2861, + "step": 2310 + }, + { + "epoch": 0.05463708727803683, + "grad_norm": 1.7368396520614624, + "learning_rate": 0.00018908200273185438, + "loss": 2.119, + "step": 2320 + }, + { + "epoch": 0.054872591964580095, + "grad_norm": 2.3278701305389404, + "learning_rate": 0.00018903490179454573, + "loss": 2.1874, + "step": 2330 + }, + { + "epoch": 0.05510809665112336, + "grad_norm": 2.0269453525543213, + "learning_rate": 0.00018898780085723705, + "loss": 2.3772, + "step": 2340 + }, + { + "epoch": 0.055343601337666616, + "grad_norm": 1.5783114433288574, + "learning_rate": 0.00018894069991992843, + "loss": 2.1799, + "step": 2350 + }, + { + "epoch": 0.05557910602420988, + "grad_norm": 2.1277246475219727, + "learning_rate": 0.00018889359898261975, + "loss": 2.1011, + "step": 2360 + }, + { + "epoch": 0.055814610710753144, + "grad_norm": 1.865946650505066, + "learning_rate": 0.0001888464980453111, + "loss": 2.2497, + "step": 2370 + }, + { + "epoch": 0.05605011539729641, + "grad_norm": 2.0735392570495605, + "learning_rate": 0.00018879939710800245, + "loss": 2.2633, + "step": 2380 + }, + { + "epoch": 0.056285620083839666, + "grad_norm": 1.9640772342681885, + "learning_rate": 0.0001887522961706938, + "loss": 2.2787, + "step": 2390 + }, + { + "epoch": 0.05652112477038293, + "grad_norm": 1.4566375017166138, + "learning_rate": 0.00018870519523338515, + "loss": 2.1823, + "step": 2400 + }, + { + "epoch": 0.056756629456926194, + "grad_norm": 1.715774416923523, + "learning_rate": 0.0001886580942960765, + "loss": 2.1143, + "step": 2410 + }, + { + "epoch": 0.05699213414346946, + "grad_norm": 4.055193901062012, + "learning_rate": 0.00018861099335876785, + "loss": 2.4697, + "step": 2420 + }, + { + "epoch": 0.057227638830012716, + "grad_norm": 2.1839892864227295, + "learning_rate": 0.0001885638924214592, + "loss": 2.0825, + "step": 2430 + }, + { + "epoch": 0.05746314351655598, + "grad_norm": 1.8288459777832031, + "learning_rate": 0.00018851679148415053, + "loss": 2.1559, + "step": 2440 + }, + { + "epoch": 0.057698648203099244, + "grad_norm": 1.7593369483947754, + "learning_rate": 0.0001884696905468419, + "loss": 2.1335, + "step": 2450 + }, + { + "epoch": 0.0579341528896425, + "grad_norm": 1.703242540359497, + "learning_rate": 0.00018842258960953323, + "loss": 2.2544, + "step": 2460 + }, + { + "epoch": 0.058169657576185765, + "grad_norm": 2.0734493732452393, + "learning_rate": 0.00018837548867222458, + "loss": 2.2457, + "step": 2470 + }, + { + "epoch": 0.05840516226272903, + "grad_norm": 1.746780514717102, + "learning_rate": 0.00018832838773491593, + "loss": 2.3449, + "step": 2480 + }, + { + "epoch": 0.058640666949272294, + "grad_norm": 2.1612048149108887, + "learning_rate": 0.00018828128679760728, + "loss": 2.37, + "step": 2490 + }, + { + "epoch": 0.05887617163581555, + "grad_norm": 1.9451191425323486, + "learning_rate": 0.00018823418586029863, + "loss": 2.0534, + "step": 2500 + }, + { + "epoch": 0.059111676322358815, + "grad_norm": 1.6598166227340698, + "learning_rate": 0.00018818708492298998, + "loss": 2.2115, + "step": 2510 + }, + { + "epoch": 0.05934718100890208, + "grad_norm": 1.8540189266204834, + "learning_rate": 0.00018813998398568133, + "loss": 2.2075, + "step": 2520 + }, + { + "epoch": 0.05958268569544534, + "grad_norm": 1.914438009262085, + "learning_rate": 0.00018809288304837268, + "loss": 2.4852, + "step": 2530 + }, + { + "epoch": 0.0598181903819886, + "grad_norm": 1.941332459449768, + "learning_rate": 0.00018804578211106403, + "loss": 2.2285, + "step": 2540 + }, + { + "epoch": 0.060053695068531865, + "grad_norm": 1.4906508922576904, + "learning_rate": 0.00018799868117375535, + "loss": 2.2549, + "step": 2550 + }, + { + "epoch": 0.06028919975507513, + "grad_norm": 2.5642271041870117, + "learning_rate": 0.00018795158023644673, + "loss": 2.3277, + "step": 2560 + }, + { + "epoch": 0.060524704441618386, + "grad_norm": 2.510061264038086, + "learning_rate": 0.00018790447929913805, + "loss": 2.2822, + "step": 2570 + }, + { + "epoch": 0.06076020912816165, + "grad_norm": 1.7272487878799438, + "learning_rate": 0.00018785737836182943, + "loss": 2.1391, + "step": 2580 + }, + { + "epoch": 0.060995713814704915, + "grad_norm": 1.9994730949401855, + "learning_rate": 0.00018781027742452075, + "loss": 2.2, + "step": 2590 + }, + { + "epoch": 0.06123121850124817, + "grad_norm": 1.5051331520080566, + "learning_rate": 0.0001877631764872121, + "loss": 2.3043, + "step": 2600 + }, + { + "epoch": 0.061466723187791436, + "grad_norm": 1.6909672021865845, + "learning_rate": 0.00018771607554990345, + "loss": 2.0515, + "step": 2610 + }, + { + "epoch": 0.0617022278743347, + "grad_norm": 1.6561975479125977, + "learning_rate": 0.0001876689746125948, + "loss": 2.3871, + "step": 2620 + }, + { + "epoch": 0.061937732560877964, + "grad_norm": 1.6686687469482422, + "learning_rate": 0.00018762187367528615, + "loss": 2.2119, + "step": 2630 + }, + { + "epoch": 0.06217323724742122, + "grad_norm": 1.7708022594451904, + "learning_rate": 0.0001875747727379775, + "loss": 2.3067, + "step": 2640 + }, + { + "epoch": 0.062408741933964486, + "grad_norm": 1.873261570930481, + "learning_rate": 0.00018752767180066883, + "loss": 2.3496, + "step": 2650 + }, + { + "epoch": 0.06264424662050774, + "grad_norm": 1.7290661334991455, + "learning_rate": 0.0001874805708633602, + "loss": 2.2128, + "step": 2660 + }, + { + "epoch": 0.06287975130705101, + "grad_norm": 2.2264304161071777, + "learning_rate": 0.00018743346992605153, + "loss": 2.1921, + "step": 2670 + }, + { + "epoch": 0.06311525599359427, + "grad_norm": 1.793036699295044, + "learning_rate": 0.00018738636898874288, + "loss": 2.352, + "step": 2680 + }, + { + "epoch": 0.06335076068013754, + "grad_norm": 2.034780502319336, + "learning_rate": 0.00018733926805143423, + "loss": 2.3553, + "step": 2690 + }, + { + "epoch": 0.0635862653666808, + "grad_norm": 1.685520887374878, + "learning_rate": 0.00018729216711412558, + "loss": 2.1674, + "step": 2700 + }, + { + "epoch": 0.06382177005322406, + "grad_norm": 2.387843370437622, + "learning_rate": 0.00018724506617681693, + "loss": 2.2459, + "step": 2710 + }, + { + "epoch": 0.06405727473976733, + "grad_norm": 2.0628840923309326, + "learning_rate": 0.00018719796523950828, + "loss": 2.3385, + "step": 2720 + }, + { + "epoch": 0.06429277942631058, + "grad_norm": 1.780592679977417, + "learning_rate": 0.0001871508643021996, + "loss": 2.0707, + "step": 2730 + }, + { + "epoch": 0.06452828411285384, + "grad_norm": 1.9920777082443237, + "learning_rate": 0.00018710376336489098, + "loss": 2.175, + "step": 2740 + }, + { + "epoch": 0.0647637887993971, + "grad_norm": 1.7009214162826538, + "learning_rate": 0.0001870566624275823, + "loss": 2.0857, + "step": 2750 + }, + { + "epoch": 0.06499929348594037, + "grad_norm": 2.2230193614959717, + "learning_rate": 0.00018700956149027365, + "loss": 2.4301, + "step": 2760 + }, + { + "epoch": 0.06523479817248364, + "grad_norm": 1.7082926034927368, + "learning_rate": 0.000186962460552965, + "loss": 2.2562, + "step": 2770 + }, + { + "epoch": 0.0654703028590269, + "grad_norm": 1.9396960735321045, + "learning_rate": 0.00018691535961565635, + "loss": 2.3725, + "step": 2780 + }, + { + "epoch": 0.06570580754557016, + "grad_norm": 1.9210023880004883, + "learning_rate": 0.00018686825867834773, + "loss": 2.197, + "step": 2790 + }, + { + "epoch": 0.06594131223211341, + "grad_norm": 1.7751202583312988, + "learning_rate": 0.00018682115774103905, + "loss": 2.4285, + "step": 2800 + }, + { + "epoch": 0.06617681691865668, + "grad_norm": 2.46057391166687, + "learning_rate": 0.0001867740568037304, + "loss": 2.0906, + "step": 2810 + }, + { + "epoch": 0.06641232160519994, + "grad_norm": 1.982324481010437, + "learning_rate": 0.00018672695586642175, + "loss": 2.3326, + "step": 2820 + }, + { + "epoch": 0.0666478262917432, + "grad_norm": 2.0937414169311523, + "learning_rate": 0.0001866798549291131, + "loss": 2.1654, + "step": 2830 + }, + { + "epoch": 0.06688333097828647, + "grad_norm": 1.8799653053283691, + "learning_rate": 0.00018663275399180446, + "loss": 2.178, + "step": 2840 + }, + { + "epoch": 0.06711883566482973, + "grad_norm": 2.4298360347747803, + "learning_rate": 0.0001865856530544958, + "loss": 2.3419, + "step": 2850 + }, + { + "epoch": 0.067354340351373, + "grad_norm": 1.9179294109344482, + "learning_rate": 0.00018653855211718713, + "loss": 2.3138, + "step": 2860 + }, + { + "epoch": 0.06758984503791625, + "grad_norm": 1.727042317390442, + "learning_rate": 0.0001864914511798785, + "loss": 2.1584, + "step": 2870 + }, + { + "epoch": 0.06782534972445951, + "grad_norm": 5.538887023925781, + "learning_rate": 0.00018644435024256983, + "loss": 2.1706, + "step": 2880 + }, + { + "epoch": 0.06806085441100278, + "grad_norm": 1.6711513996124268, + "learning_rate": 0.00018639724930526118, + "loss": 2.1269, + "step": 2890 + }, + { + "epoch": 0.06829635909754604, + "grad_norm": 3.3184592723846436, + "learning_rate": 0.00018635014836795253, + "loss": 2.201, + "step": 2900 + }, + { + "epoch": 0.0685318637840893, + "grad_norm": 2.0365543365478516, + "learning_rate": 0.00018630304743064388, + "loss": 2.203, + "step": 2910 + }, + { + "epoch": 0.06876736847063257, + "grad_norm": 2.007431745529175, + "learning_rate": 0.00018625594649333523, + "loss": 2.1961, + "step": 2920 + }, + { + "epoch": 0.06900287315717583, + "grad_norm": 5.275245189666748, + "learning_rate": 0.00018620884555602658, + "loss": 1.9558, + "step": 2930 + }, + { + "epoch": 0.06923837784371908, + "grad_norm": 1.7514394521713257, + "learning_rate": 0.0001861617446187179, + "loss": 2.2311, + "step": 2940 + }, + { + "epoch": 0.06947388253026235, + "grad_norm": 1.685663104057312, + "learning_rate": 0.00018611464368140928, + "loss": 2.4731, + "step": 2950 + }, + { + "epoch": 0.06970938721680561, + "grad_norm": 1.8519153594970703, + "learning_rate": 0.0001860675427441006, + "loss": 2.3555, + "step": 2960 + }, + { + "epoch": 0.06994489190334888, + "grad_norm": 1.6579869985580444, + "learning_rate": 0.00018602515190052282, + "loss": 2.1312, + "step": 2970 + }, + { + "epoch": 0.07018039658989214, + "grad_norm": 1.7049615383148193, + "learning_rate": 0.00018597805096321417, + "loss": 2.1154, + "step": 2980 + }, + { + "epoch": 0.0704159012764354, + "grad_norm": 2.0084738731384277, + "learning_rate": 0.00018593095002590552, + "loss": 2.1103, + "step": 2990 + }, + { + "epoch": 0.07065140596297867, + "grad_norm": 2.9388327598571777, + "learning_rate": 0.00018588384908859687, + "loss": 2.2574, + "step": 3000 + }, + { + "epoch": 0.07088691064952192, + "grad_norm": 1.6756362915039062, + "learning_rate": 0.00018583674815128822, + "loss": 2.1279, + "step": 3010 + }, + { + "epoch": 0.07112241533606518, + "grad_norm": 2.137333393096924, + "learning_rate": 0.00018578964721397955, + "loss": 2.3244, + "step": 3020 + }, + { + "epoch": 0.07135792002260845, + "grad_norm": 1.6805435419082642, + "learning_rate": 0.00018574254627667092, + "loss": 2.3783, + "step": 3030 + }, + { + "epoch": 0.07159342470915171, + "grad_norm": 1.4257123470306396, + "learning_rate": 0.00018569544533936227, + "loss": 2.1998, + "step": 3040 + }, + { + "epoch": 0.07182892939569498, + "grad_norm": 2.6023623943328857, + "learning_rate": 0.0001856483444020536, + "loss": 2.2162, + "step": 3050 + }, + { + "epoch": 0.07206443408223824, + "grad_norm": 1.765554666519165, + "learning_rate": 0.00018560124346474497, + "loss": 2.1655, + "step": 3060 + }, + { + "epoch": 0.0722999387687815, + "grad_norm": 1.852608561515808, + "learning_rate": 0.0001855541425274363, + "loss": 2.1649, + "step": 3070 + }, + { + "epoch": 0.07253544345532476, + "grad_norm": 1.706581950187683, + "learning_rate": 0.00018550704159012767, + "loss": 2.1517, + "step": 3080 + }, + { + "epoch": 0.07277094814186802, + "grad_norm": 2.3597655296325684, + "learning_rate": 0.000185459940652819, + "loss": 2.0139, + "step": 3090 + }, + { + "epoch": 0.07300645282841128, + "grad_norm": 2.2678277492523193, + "learning_rate": 0.00018541283971551035, + "loss": 2.0053, + "step": 3100 + }, + { + "epoch": 0.07324195751495455, + "grad_norm": 2.10081148147583, + "learning_rate": 0.0001853657387782017, + "loss": 2.2485, + "step": 3110 + }, + { + "epoch": 0.07347746220149781, + "grad_norm": 1.7612814903259277, + "learning_rate": 0.00018531863784089305, + "loss": 2.2997, + "step": 3120 + }, + { + "epoch": 0.07371296688804108, + "grad_norm": 1.8904122114181519, + "learning_rate": 0.0001852715369035844, + "loss": 2.1279, + "step": 3130 + }, + { + "epoch": 0.07394847157458434, + "grad_norm": 1.769583821296692, + "learning_rate": 0.00018522443596627575, + "loss": 2.2626, + "step": 3140 + }, + { + "epoch": 0.07418397626112759, + "grad_norm": 2.3224611282348633, + "learning_rate": 0.00018517733502896707, + "loss": 2.3991, + "step": 3150 + }, + { + "epoch": 0.07441948094767085, + "grad_norm": 1.9851993322372437, + "learning_rate": 0.00018513023409165845, + "loss": 2.1701, + "step": 3160 + }, + { + "epoch": 0.07465498563421412, + "grad_norm": 2.010765790939331, + "learning_rate": 0.00018508313315434977, + "loss": 2.2649, + "step": 3170 + }, + { + "epoch": 0.07489049032075738, + "grad_norm": 1.9443213939666748, + "learning_rate": 0.00018503603221704112, + "loss": 2.2097, + "step": 3180 + }, + { + "epoch": 0.07512599500730065, + "grad_norm": 2.350292682647705, + "learning_rate": 0.00018498893127973247, + "loss": 2.2244, + "step": 3190 + }, + { + "epoch": 0.07536149969384391, + "grad_norm": 1.83067786693573, + "learning_rate": 0.00018494183034242382, + "loss": 2.0924, + "step": 3200 + }, + { + "epoch": 0.07559700438038718, + "grad_norm": 2.046450614929199, + "learning_rate": 0.00018489472940511517, + "loss": 2.214, + "step": 3210 + }, + { + "epoch": 0.07583250906693043, + "grad_norm": 2.111867666244507, + "learning_rate": 0.00018484762846780652, + "loss": 2.5115, + "step": 3220 + }, + { + "epoch": 0.07606801375347369, + "grad_norm": 1.992246150970459, + "learning_rate": 0.00018480052753049785, + "loss": 2.1997, + "step": 3230 + }, + { + "epoch": 0.07630351844001695, + "grad_norm": 2.763298511505127, + "learning_rate": 0.00018475342659318922, + "loss": 2.299, + "step": 3240 + }, + { + "epoch": 0.07653902312656022, + "grad_norm": 1.8757209777832031, + "learning_rate": 0.00018470632565588055, + "loss": 2.323, + "step": 3250 + }, + { + "epoch": 0.07677452781310348, + "grad_norm": 1.779782772064209, + "learning_rate": 0.0001846592247185719, + "loss": 2.0727, + "step": 3260 + }, + { + "epoch": 0.07701003249964675, + "grad_norm": 2.201812267303467, + "learning_rate": 0.00018461212378126325, + "loss": 2.4871, + "step": 3270 + }, + { + "epoch": 0.07724553718619001, + "grad_norm": 2.0720343589782715, + "learning_rate": 0.0001845650228439546, + "loss": 2.1938, + "step": 3280 + }, + { + "epoch": 0.07748104187273326, + "grad_norm": 1.743800163269043, + "learning_rate": 0.00018451792190664595, + "loss": 2.2414, + "step": 3290 + }, + { + "epoch": 0.07771654655927653, + "grad_norm": 1.9348535537719727, + "learning_rate": 0.0001844708209693373, + "loss": 2.0799, + "step": 3300 + }, + { + "epoch": 0.07795205124581979, + "grad_norm": 2.4140963554382324, + "learning_rate": 0.00018442372003202862, + "loss": 2.1721, + "step": 3310 + }, + { + "epoch": 0.07818755593236305, + "grad_norm": 1.6908040046691895, + "learning_rate": 0.00018437661909472, + "loss": 2.0753, + "step": 3320 + }, + { + "epoch": 0.07842306061890632, + "grad_norm": 2.733171224594116, + "learning_rate": 0.00018432951815741135, + "loss": 2.1139, + "step": 3330 + }, + { + "epoch": 0.07865856530544958, + "grad_norm": 2.480011463165283, + "learning_rate": 0.0001842824172201027, + "loss": 2.2136, + "step": 3340 + }, + { + "epoch": 0.07889406999199285, + "grad_norm": 2.223186731338501, + "learning_rate": 0.00018423531628279405, + "loss": 1.8241, + "step": 3350 + }, + { + "epoch": 0.0791295746785361, + "grad_norm": 2.359806537628174, + "learning_rate": 0.00018418821534548537, + "loss": 2.2171, + "step": 3360 + }, + { + "epoch": 0.07936507936507936, + "grad_norm": 1.6184040307998657, + "learning_rate": 0.00018414111440817675, + "loss": 2.3467, + "step": 3370 + }, + { + "epoch": 0.07960058405162262, + "grad_norm": 1.9266674518585205, + "learning_rate": 0.00018409401347086807, + "loss": 2.092, + "step": 3380 + }, + { + "epoch": 0.07983608873816589, + "grad_norm": 3.338719129562378, + "learning_rate": 0.00018404691253355942, + "loss": 2.2806, + "step": 3390 + }, + { + "epoch": 0.08007159342470915, + "grad_norm": 2.559915781021118, + "learning_rate": 0.00018399981159625077, + "loss": 2.346, + "step": 3400 + }, + { + "epoch": 0.08030709811125242, + "grad_norm": 2.17266845703125, + "learning_rate": 0.00018395271065894212, + "loss": 2.3376, + "step": 3410 + }, + { + "epoch": 0.08054260279779568, + "grad_norm": 1.8615611791610718, + "learning_rate": 0.00018390560972163347, + "loss": 2.204, + "step": 3420 + }, + { + "epoch": 0.08077810748433893, + "grad_norm": 1.8173818588256836, + "learning_rate": 0.00018385850878432482, + "loss": 2.1612, + "step": 3430 + }, + { + "epoch": 0.0810136121708822, + "grad_norm": 2.4427809715270996, + "learning_rate": 0.00018381140784701615, + "loss": 2.379, + "step": 3440 + }, + { + "epoch": 0.08124911685742546, + "grad_norm": 1.7036974430084229, + "learning_rate": 0.00018376430690970753, + "loss": 2.3024, + "step": 3450 + }, + { + "epoch": 0.08148462154396872, + "grad_norm": 2.6452062129974365, + "learning_rate": 0.00018371720597239885, + "loss": 2.0425, + "step": 3460 + }, + { + "epoch": 0.08172012623051199, + "grad_norm": 1.7686418294906616, + "learning_rate": 0.0001836701050350902, + "loss": 2.1656, + "step": 3470 + }, + { + "epoch": 0.08195563091705525, + "grad_norm": 1.9566192626953125, + "learning_rate": 0.00018362300409778155, + "loss": 2.3603, + "step": 3480 + }, + { + "epoch": 0.08219113560359852, + "grad_norm": 1.8868483304977417, + "learning_rate": 0.0001835759031604729, + "loss": 2.2435, + "step": 3490 + }, + { + "epoch": 0.08242664029014177, + "grad_norm": 2.2280383110046387, + "learning_rate": 0.00018352880222316425, + "loss": 2.2404, + "step": 3500 + }, + { + "epoch": 0.08266214497668503, + "grad_norm": 2.1244733333587646, + "learning_rate": 0.0001834817012858556, + "loss": 2.0227, + "step": 3510 + }, + { + "epoch": 0.0828976496632283, + "grad_norm": 1.9479572772979736, + "learning_rate": 0.00018343460034854695, + "loss": 2.0774, + "step": 3520 + }, + { + "epoch": 0.08313315434977156, + "grad_norm": 1.8838822841644287, + "learning_rate": 0.0001833874994112383, + "loss": 2.3049, + "step": 3530 + }, + { + "epoch": 0.08336865903631482, + "grad_norm": 2.044419288635254, + "learning_rate": 0.00018334039847392962, + "loss": 2.0416, + "step": 3540 + }, + { + "epoch": 0.08360416372285809, + "grad_norm": 2.2245490550994873, + "learning_rate": 0.000183293297536621, + "loss": 2.13, + "step": 3550 + }, + { + "epoch": 0.08383966840940135, + "grad_norm": 2.318081855773926, + "learning_rate": 0.00018324619659931232, + "loss": 2.1373, + "step": 3560 + }, + { + "epoch": 0.0840751730959446, + "grad_norm": 1.8568158149719238, + "learning_rate": 0.00018319909566200367, + "loss": 2.2176, + "step": 3570 + }, + { + "epoch": 0.08431067778248787, + "grad_norm": 2.3757684230804443, + "learning_rate": 0.00018315199472469502, + "loss": 2.299, + "step": 3580 + }, + { + "epoch": 0.08454618246903113, + "grad_norm": 3.102747917175293, + "learning_rate": 0.00018310489378738637, + "loss": 2.1751, + "step": 3590 + }, + { + "epoch": 0.0847816871555744, + "grad_norm": 2.0704846382141113, + "learning_rate": 0.00018305779285007772, + "loss": 2.2471, + "step": 3600 + }, + { + "epoch": 0.08501719184211766, + "grad_norm": 1.9258019924163818, + "learning_rate": 0.00018301069191276908, + "loss": 2.1349, + "step": 3610 + }, + { + "epoch": 0.08525269652866092, + "grad_norm": 2.328375816345215, + "learning_rate": 0.00018296359097546043, + "loss": 2.3323, + "step": 3620 + }, + { + "epoch": 0.08548820121520419, + "grad_norm": 1.6533223390579224, + "learning_rate": 0.00018291649003815178, + "loss": 2.3557, + "step": 3630 + }, + { + "epoch": 0.08572370590174744, + "grad_norm": 1.7346618175506592, + "learning_rate": 0.00018286938910084313, + "loss": 2.1023, + "step": 3640 + }, + { + "epoch": 0.0859592105882907, + "grad_norm": 2.2735652923583984, + "learning_rate": 0.00018282228816353445, + "loss": 2.2031, + "step": 3650 + }, + { + "epoch": 0.08619471527483397, + "grad_norm": 1.8555744886398315, + "learning_rate": 0.00018277518722622583, + "loss": 2.2743, + "step": 3660 + }, + { + "epoch": 0.08643021996137723, + "grad_norm": 1.919329285621643, + "learning_rate": 0.00018272808628891715, + "loss": 2.2808, + "step": 3670 + }, + { + "epoch": 0.0866657246479205, + "grad_norm": 2.246380090713501, + "learning_rate": 0.00018268098535160853, + "loss": 2.3911, + "step": 3680 + }, + { + "epoch": 0.08690122933446376, + "grad_norm": 1.6249585151672363, + "learning_rate": 0.00018263388441429985, + "loss": 2.0467, + "step": 3690 + }, + { + "epoch": 0.08713673402100702, + "grad_norm": 1.8340191841125488, + "learning_rate": 0.0001825867834769912, + "loss": 2.188, + "step": 3700 + }, + { + "epoch": 0.08737223870755029, + "grad_norm": 2.3175435066223145, + "learning_rate": 0.00018253968253968255, + "loss": 2.2183, + "step": 3710 + }, + { + "epoch": 0.08760774339409354, + "grad_norm": 2.404165267944336, + "learning_rate": 0.0001824925816023739, + "loss": 2.0468, + "step": 3720 + }, + { + "epoch": 0.0878432480806368, + "grad_norm": 2.1523420810699463, + "learning_rate": 0.00018244548066506525, + "loss": 2.1458, + "step": 3730 + }, + { + "epoch": 0.08807875276718007, + "grad_norm": 2.125816583633423, + "learning_rate": 0.0001823983797277566, + "loss": 2.3276, + "step": 3740 + }, + { + "epoch": 0.08831425745372333, + "grad_norm": 2.124753713607788, + "learning_rate": 0.00018235127879044792, + "loss": 2.222, + "step": 3750 + }, + { + "epoch": 0.0885497621402666, + "grad_norm": 2.3880615234375, + "learning_rate": 0.0001823041778531393, + "loss": 2.1973, + "step": 3760 + }, + { + "epoch": 0.08878526682680986, + "grad_norm": 1.9938665628433228, + "learning_rate": 0.00018225707691583062, + "loss": 2.0841, + "step": 3770 + }, + { + "epoch": 0.08902077151335312, + "grad_norm": 2.1049611568450928, + "learning_rate": 0.00018220997597852197, + "loss": 2.364, + "step": 3780 + }, + { + "epoch": 0.08925627619989637, + "grad_norm": 2.110180139541626, + "learning_rate": 0.00018216287504121333, + "loss": 2.346, + "step": 3790 + }, + { + "epoch": 0.08949178088643964, + "grad_norm": 1.598199725151062, + "learning_rate": 0.00018211577410390468, + "loss": 2.0503, + "step": 3800 + }, + { + "epoch": 0.0897272855729829, + "grad_norm": 2.412278175354004, + "learning_rate": 0.00018206867316659603, + "loss": 2.2061, + "step": 3810 + }, + { + "epoch": 0.08996279025952617, + "grad_norm": 2.158580780029297, + "learning_rate": 0.00018202157222928738, + "loss": 2.1454, + "step": 3820 + }, + { + "epoch": 0.09019829494606943, + "grad_norm": 1.819800853729248, + "learning_rate": 0.0001819744712919787, + "loss": 2.4598, + "step": 3830 + }, + { + "epoch": 0.0904337996326127, + "grad_norm": 1.7876063585281372, + "learning_rate": 0.00018192737035467008, + "loss": 2.1973, + "step": 3840 + }, + { + "epoch": 0.09066930431915596, + "grad_norm": 2.018787384033203, + "learning_rate": 0.0001818802694173614, + "loss": 2.209, + "step": 3850 + }, + { + "epoch": 0.09090480900569921, + "grad_norm": 1.9427393674850464, + "learning_rate": 0.00018183316848005275, + "loss": 2.064, + "step": 3860 + }, + { + "epoch": 0.09114031369224247, + "grad_norm": 2.2805685997009277, + "learning_rate": 0.0001817860675427441, + "loss": 2.2278, + "step": 3870 + }, + { + "epoch": 0.09137581837878574, + "grad_norm": 1.9191609621047974, + "learning_rate": 0.00018173896660543545, + "loss": 2.1242, + "step": 3880 + }, + { + "epoch": 0.091611323065329, + "grad_norm": 2.0127153396606445, + "learning_rate": 0.00018169186566812683, + "loss": 2.1634, + "step": 3890 + }, + { + "epoch": 0.09184682775187226, + "grad_norm": 1.8756777048110962, + "learning_rate": 0.00018164476473081815, + "loss": 2.0746, + "step": 3900 + }, + { + "epoch": 0.09208233243841553, + "grad_norm": 1.7346595525741577, + "learning_rate": 0.0001815976637935095, + "loss": 2.1975, + "step": 3910 + }, + { + "epoch": 0.0923178371249588, + "grad_norm": 2.1969356536865234, + "learning_rate": 0.00018155056285620085, + "loss": 2.1953, + "step": 3920 + }, + { + "epoch": 0.09255334181150204, + "grad_norm": 2.3179337978363037, + "learning_rate": 0.0001815034619188922, + "loss": 2.2919, + "step": 3930 + }, + { + "epoch": 0.09278884649804531, + "grad_norm": 2.3929030895233154, + "learning_rate": 0.00018145636098158355, + "loss": 2.184, + "step": 3940 + }, + { + "epoch": 0.09302435118458857, + "grad_norm": 2.515260934829712, + "learning_rate": 0.0001814092600442749, + "loss": 2.2885, + "step": 3950 + }, + { + "epoch": 0.09325985587113184, + "grad_norm": 2.0467185974121094, + "learning_rate": 0.00018136215910696623, + "loss": 2.2213, + "step": 3960 + }, + { + "epoch": 0.0934953605576751, + "grad_norm": 1.7174040079116821, + "learning_rate": 0.0001813150581696576, + "loss": 2.0542, + "step": 3970 + }, + { + "epoch": 0.09373086524421836, + "grad_norm": 2.1928911209106445, + "learning_rate": 0.00018126795723234893, + "loss": 2.0091, + "step": 3980 + }, + { + "epoch": 0.09396636993076163, + "grad_norm": 2.9600062370300293, + "learning_rate": 0.00018122085629504028, + "loss": 2.3568, + "step": 3990 + }, + { + "epoch": 0.09420187461730488, + "grad_norm": 1.843605637550354, + "learning_rate": 0.00018117375535773163, + "loss": 2.1123, + "step": 4000 + }, + { + "epoch": 0.09443737930384814, + "grad_norm": 1.823076844215393, + "learning_rate": 0.00018112665442042298, + "loss": 2.5867, + "step": 4010 + }, + { + "epoch": 0.09467288399039141, + "grad_norm": 2.1632487773895264, + "learning_rate": 0.00018107955348311433, + "loss": 2.1294, + "step": 4020 + }, + { + "epoch": 0.09490838867693467, + "grad_norm": 1.726159930229187, + "learning_rate": 0.00018103245254580568, + "loss": 2.2689, + "step": 4030 + }, + { + "epoch": 0.09514389336347794, + "grad_norm": 2.210451364517212, + "learning_rate": 0.000180985351608497, + "loss": 2.044, + "step": 4040 + }, + { + "epoch": 0.0953793980500212, + "grad_norm": 2.0674283504486084, + "learning_rate": 0.00018093825067118838, + "loss": 2.2215, + "step": 4050 + }, + { + "epoch": 0.09561490273656446, + "grad_norm": 2.5523428916931152, + "learning_rate": 0.0001808911497338797, + "loss": 1.9213, + "step": 4060 + }, + { + "epoch": 0.09585040742310771, + "grad_norm": 2.207009792327881, + "learning_rate": 0.00018084404879657105, + "loss": 2.4428, + "step": 4070 + }, + { + "epoch": 0.09608591210965098, + "grad_norm": 1.6344585418701172, + "learning_rate": 0.0001807969478592624, + "loss": 2.0769, + "step": 4080 + }, + { + "epoch": 0.09632141679619424, + "grad_norm": 1.8036699295043945, + "learning_rate": 0.00018074984692195375, + "loss": 2.3486, + "step": 4090 + }, + { + "epoch": 0.0965569214827375, + "grad_norm": 1.708190679550171, + "learning_rate": 0.0001807027459846451, + "loss": 1.9418, + "step": 4100 + }, + { + "epoch": 0.09679242616928077, + "grad_norm": 1.9009705781936646, + "learning_rate": 0.00018065564504733645, + "loss": 2.2658, + "step": 4110 + }, + { + "epoch": 0.09702793085582403, + "grad_norm": 1.9692081212997437, + "learning_rate": 0.0001806085441100278, + "loss": 1.9867, + "step": 4120 + }, + { + "epoch": 0.0972634355423673, + "grad_norm": 1.6455093622207642, + "learning_rate": 0.00018056144317271915, + "loss": 2.2132, + "step": 4130 + }, + { + "epoch": 0.09749894022891055, + "grad_norm": 1.9711624383926392, + "learning_rate": 0.00018051434223541048, + "loss": 2.3335, + "step": 4140 + }, + { + "epoch": 0.09773444491545381, + "grad_norm": 2.410085678100586, + "learning_rate": 0.00018046724129810185, + "loss": 2.1966, + "step": 4150 + }, + { + "epoch": 0.09796994960199708, + "grad_norm": 1.9884003400802612, + "learning_rate": 0.0001804201403607932, + "loss": 2.2239, + "step": 4160 + }, + { + "epoch": 0.09820545428854034, + "grad_norm": 1.7518419027328491, + "learning_rate": 0.00018037303942348453, + "loss": 2.2178, + "step": 4170 + }, + { + "epoch": 0.0984409589750836, + "grad_norm": 2.0874931812286377, + "learning_rate": 0.0001803259384861759, + "loss": 2.0643, + "step": 4180 + }, + { + "epoch": 0.09867646366162687, + "grad_norm": 2.409583568572998, + "learning_rate": 0.00018027883754886723, + "loss": 2.2727, + "step": 4190 + }, + { + "epoch": 0.09891196834817013, + "grad_norm": 2.025693655014038, + "learning_rate": 0.00018023173661155858, + "loss": 2.0157, + "step": 4200 + }, + { + "epoch": 0.09914747303471338, + "grad_norm": 1.8936916589736938, + "learning_rate": 0.00018018463567424993, + "loss": 2.0757, + "step": 4210 + }, + { + "epoch": 0.09938297772125665, + "grad_norm": 1.8962174654006958, + "learning_rate": 0.00018013753473694128, + "loss": 2.1957, + "step": 4220 + }, + { + "epoch": 0.09961848240779991, + "grad_norm": 2.055262565612793, + "learning_rate": 0.00018009043379963263, + "loss": 2.1943, + "step": 4230 + }, + { + "epoch": 0.09985398709434318, + "grad_norm": 2.352290153503418, + "learning_rate": 0.00018004333286232398, + "loss": 2.3158, + "step": 4240 + }, + { + "epoch": 0.10008949178088644, + "grad_norm": 1.8247345685958862, + "learning_rate": 0.0001799962319250153, + "loss": 2.1219, + "step": 4250 + }, + { + "epoch": 0.1003249964674297, + "grad_norm": 2.0612590312957764, + "learning_rate": 0.00017994913098770668, + "loss": 2.1914, + "step": 4260 + }, + { + "epoch": 0.10056050115397297, + "grad_norm": 2.3253748416900635, + "learning_rate": 0.000179902030050398, + "loss": 2.2833, + "step": 4270 + }, + { + "epoch": 0.10079600584051622, + "grad_norm": 2.5230417251586914, + "learning_rate": 0.00017985492911308938, + "loss": 2.2253, + "step": 4280 + }, + { + "epoch": 0.10103151052705948, + "grad_norm": 2.022261142730713, + "learning_rate": 0.0001798078281757807, + "loss": 2.0184, + "step": 4290 + }, + { + "epoch": 0.10126701521360275, + "grad_norm": 3.5165741443634033, + "learning_rate": 0.00017976072723847205, + "loss": 2.2235, + "step": 4300 + }, + { + "epoch": 0.10150251990014601, + "grad_norm": 2.481456756591797, + "learning_rate": 0.0001797136263011634, + "loss": 2.3874, + "step": 4310 + }, + { + "epoch": 0.10173802458668928, + "grad_norm": 1.780819058418274, + "learning_rate": 0.00017966652536385475, + "loss": 2.2076, + "step": 4320 + }, + { + "epoch": 0.10197352927323254, + "grad_norm": 1.6852662563323975, + "learning_rate": 0.0001796194244265461, + "loss": 2.1545, + "step": 4330 + }, + { + "epoch": 0.1022090339597758, + "grad_norm": 2.13392972946167, + "learning_rate": 0.00017957232348923745, + "loss": 2.2223, + "step": 4340 + }, + { + "epoch": 0.10244453864631906, + "grad_norm": 1.8714747428894043, + "learning_rate": 0.00017952522255192878, + "loss": 2.0202, + "step": 4350 + }, + { + "epoch": 0.10268004333286232, + "grad_norm": 1.8120633363723755, + "learning_rate": 0.00017947812161462015, + "loss": 2.2459, + "step": 4360 + }, + { + "epoch": 0.10291554801940558, + "grad_norm": 2.3447976112365723, + "learning_rate": 0.00017943102067731148, + "loss": 2.1867, + "step": 4370 + }, + { + "epoch": 0.10315105270594885, + "grad_norm": 2.4707727432250977, + "learning_rate": 0.00017938391974000283, + "loss": 2.3336, + "step": 4380 + }, + { + "epoch": 0.10338655739249211, + "grad_norm": 1.8624794483184814, + "learning_rate": 0.00017933681880269418, + "loss": 2.2378, + "step": 4390 + }, + { + "epoch": 0.10362206207903538, + "grad_norm": 1.687395691871643, + "learning_rate": 0.00017928971786538553, + "loss": 2.1106, + "step": 4400 + }, + { + "epoch": 0.10385756676557864, + "grad_norm": 1.6846327781677246, + "learning_rate": 0.00017924261692807688, + "loss": 2.0948, + "step": 4410 + }, + { + "epoch": 0.10409307145212189, + "grad_norm": 1.7584487199783325, + "learning_rate": 0.00017919551599076823, + "loss": 2.2761, + "step": 4420 + }, + { + "epoch": 0.10432857613866516, + "grad_norm": 2.1012890338897705, + "learning_rate": 0.00017914841505345955, + "loss": 2.3109, + "step": 4430 + }, + { + "epoch": 0.10456408082520842, + "grad_norm": 1.6275670528411865, + "learning_rate": 0.00017910131411615093, + "loss": 2.1511, + "step": 4440 + }, + { + "epoch": 0.10479958551175168, + "grad_norm": 1.744971513748169, + "learning_rate": 0.00017905421317884228, + "loss": 2.3064, + "step": 4450 + }, + { + "epoch": 0.10503509019829495, + "grad_norm": 1.8816249370574951, + "learning_rate": 0.0001790071122415336, + "loss": 2.4038, + "step": 4460 + }, + { + "epoch": 0.10527059488483821, + "grad_norm": 2.0134501457214355, + "learning_rate": 0.00017896001130422498, + "loss": 2.3979, + "step": 4470 + }, + { + "epoch": 0.10550609957138148, + "grad_norm": 2.5849740505218506, + "learning_rate": 0.0001789129103669163, + "loss": 2.4298, + "step": 4480 + }, + { + "epoch": 0.10574160425792473, + "grad_norm": 1.6863932609558105, + "learning_rate": 0.00017886580942960768, + "loss": 2.0796, + "step": 4490 + }, + { + "epoch": 0.10597710894446799, + "grad_norm": 2.486143112182617, + "learning_rate": 0.000178818708492299, + "loss": 2.1871, + "step": 4500 + }, + { + "epoch": 0.10621261363101125, + "grad_norm": 1.7740905284881592, + "learning_rate": 0.00017877160755499035, + "loss": 1.9941, + "step": 4510 + }, + { + "epoch": 0.10644811831755452, + "grad_norm": 2.0685887336730957, + "learning_rate": 0.0001787245066176817, + "loss": 2.2425, + "step": 4520 + }, + { + "epoch": 0.10668362300409778, + "grad_norm": 1.8682607412338257, + "learning_rate": 0.00017867740568037305, + "loss": 2.4737, + "step": 4530 + }, + { + "epoch": 0.10691912769064105, + "grad_norm": 3.012972116470337, + "learning_rate": 0.0001786303047430644, + "loss": 2.2102, + "step": 4540 + }, + { + "epoch": 0.10715463237718431, + "grad_norm": 2.1660654544830322, + "learning_rate": 0.00017858320380575575, + "loss": 1.9413, + "step": 4550 + }, + { + "epoch": 0.10739013706372756, + "grad_norm": 2.0424931049346924, + "learning_rate": 0.00017853610286844708, + "loss": 2.1046, + "step": 4560 + }, + { + "epoch": 0.10762564175027083, + "grad_norm": 2.7950327396392822, + "learning_rate": 0.00017848900193113845, + "loss": 2.0865, + "step": 4570 + }, + { + "epoch": 0.10786114643681409, + "grad_norm": 1.6264240741729736, + "learning_rate": 0.00017844190099382978, + "loss": 2.1237, + "step": 4580 + }, + { + "epoch": 0.10809665112335735, + "grad_norm": 1.7138210535049438, + "learning_rate": 0.00017839480005652113, + "loss": 2.2916, + "step": 4590 + }, + { + "epoch": 0.10833215580990062, + "grad_norm": 1.89190673828125, + "learning_rate": 0.00017834769911921248, + "loss": 2.3419, + "step": 4600 + }, + { + "epoch": 0.10856766049644388, + "grad_norm": 1.813306212425232, + "learning_rate": 0.00017830059818190383, + "loss": 2.3194, + "step": 4610 + }, + { + "epoch": 0.10880316518298715, + "grad_norm": 1.9797568321228027, + "learning_rate": 0.00017825349724459518, + "loss": 2.242, + "step": 4620 + }, + { + "epoch": 0.1090386698695304, + "grad_norm": 1.423639178276062, + "learning_rate": 0.00017820639630728653, + "loss": 2.1681, + "step": 4630 + }, + { + "epoch": 0.10927417455607366, + "grad_norm": 2.0920000076293945, + "learning_rate": 0.00017815929536997785, + "loss": 2.155, + "step": 4640 + }, + { + "epoch": 0.10950967924261693, + "grad_norm": 2.133775234222412, + "learning_rate": 0.00017811219443266923, + "loss": 2.1228, + "step": 4650 + }, + { + "epoch": 0.10974518392916019, + "grad_norm": 1.449283242225647, + "learning_rate": 0.00017806509349536055, + "loss": 2.2264, + "step": 4660 + }, + { + "epoch": 0.10998068861570345, + "grad_norm": 2.10311222076416, + "learning_rate": 0.0001780179925580519, + "loss": 2.1021, + "step": 4670 + }, + { + "epoch": 0.11021619330224672, + "grad_norm": 1.8965257406234741, + "learning_rate": 0.00017797089162074325, + "loss": 2.0347, + "step": 4680 + }, + { + "epoch": 0.11045169798878998, + "grad_norm": 1.989089846611023, + "learning_rate": 0.0001779237906834346, + "loss": 2.3597, + "step": 4690 + }, + { + "epoch": 0.11068720267533323, + "grad_norm": 2.173595666885376, + "learning_rate": 0.00017787668974612595, + "loss": 2.3751, + "step": 4700 + }, + { + "epoch": 0.1109227073618765, + "grad_norm": 2.043771743774414, + "learning_rate": 0.0001778295888088173, + "loss": 2.1705, + "step": 4710 + }, + { + "epoch": 0.11115821204841976, + "grad_norm": 2.4140965938568115, + "learning_rate": 0.00017778248787150865, + "loss": 2.2574, + "step": 4720 + }, + { + "epoch": 0.11139371673496302, + "grad_norm": 1.8277655839920044, + "learning_rate": 0.0001777353869342, + "loss": 2.1179, + "step": 4730 + }, + { + "epoch": 0.11162922142150629, + "grad_norm": 1.941874623298645, + "learning_rate": 0.00017768828599689135, + "loss": 2.0465, + "step": 4740 + }, + { + "epoch": 0.11186472610804955, + "grad_norm": 1.754234790802002, + "learning_rate": 0.0001776411850595827, + "loss": 2.2535, + "step": 4750 + }, + { + "epoch": 0.11210023079459282, + "grad_norm": 1.6321264505386353, + "learning_rate": 0.00017759408412227405, + "loss": 2.041, + "step": 4760 + }, + { + "epoch": 0.11233573548113608, + "grad_norm": 2.81117582321167, + "learning_rate": 0.00017754698318496538, + "loss": 2.2673, + "step": 4770 + }, + { + "epoch": 0.11257124016767933, + "grad_norm": 1.9449315071105957, + "learning_rate": 0.00017749988224765676, + "loss": 2.1277, + "step": 4780 + }, + { + "epoch": 0.1128067448542226, + "grad_norm": 1.9723138809204102, + "learning_rate": 0.00017745278131034808, + "loss": 2.141, + "step": 4790 + }, + { + "epoch": 0.11304224954076586, + "grad_norm": 2.3215932846069336, + "learning_rate": 0.00017740568037303943, + "loss": 2.1263, + "step": 4800 + }, + { + "epoch": 0.11327775422730912, + "grad_norm": 2.240413188934326, + "learning_rate": 0.00017735857943573078, + "loss": 2.1786, + "step": 4810 + }, + { + "epoch": 0.11351325891385239, + "grad_norm": 2.177074432373047, + "learning_rate": 0.00017731147849842213, + "loss": 2.1403, + "step": 4820 + }, + { + "epoch": 0.11374876360039565, + "grad_norm": 1.7291984558105469, + "learning_rate": 0.00017726437756111348, + "loss": 2.2137, + "step": 4830 + }, + { + "epoch": 0.11398426828693892, + "grad_norm": 1.975380539894104, + "learning_rate": 0.00017721727662380483, + "loss": 2.2869, + "step": 4840 + }, + { + "epoch": 0.11421977297348217, + "grad_norm": 2.778428554534912, + "learning_rate": 0.00017717017568649615, + "loss": 2.151, + "step": 4850 + }, + { + "epoch": 0.11445527766002543, + "grad_norm": 2.3732733726501465, + "learning_rate": 0.00017712307474918753, + "loss": 2.1637, + "step": 4860 + }, + { + "epoch": 0.1146907823465687, + "grad_norm": 3.130542516708374, + "learning_rate": 0.00017707597381187885, + "loss": 2.2621, + "step": 4870 + }, + { + "epoch": 0.11492628703311196, + "grad_norm": 2.3386194705963135, + "learning_rate": 0.00017702887287457023, + "loss": 2.2108, + "step": 4880 + }, + { + "epoch": 0.11516179171965522, + "grad_norm": 1.9925601482391357, + "learning_rate": 0.00017698177193726155, + "loss": 2.2729, + "step": 4890 + }, + { + "epoch": 0.11539729640619849, + "grad_norm": 2.1639785766601562, + "learning_rate": 0.0001769346709999529, + "loss": 2.2079, + "step": 4900 + }, + { + "epoch": 0.11563280109274175, + "grad_norm": 2.6207916736602783, + "learning_rate": 0.00017688757006264425, + "loss": 2.1154, + "step": 4910 + }, + { + "epoch": 0.115868305779285, + "grad_norm": 2.0274581909179688, + "learning_rate": 0.0001768404691253356, + "loss": 2.2918, + "step": 4920 + }, + { + "epoch": 0.11610381046582827, + "grad_norm": 2.088710069656372, + "learning_rate": 0.00017679336818802695, + "loss": 2.2056, + "step": 4930 + }, + { + "epoch": 0.11633931515237153, + "grad_norm": 2.373544931411743, + "learning_rate": 0.0001767462672507183, + "loss": 2.1674, + "step": 4940 + }, + { + "epoch": 0.1165748198389148, + "grad_norm": 2.527215003967285, + "learning_rate": 0.00017669916631340963, + "loss": 1.9731, + "step": 4950 + }, + { + "epoch": 0.11681032452545806, + "grad_norm": 2.6474502086639404, + "learning_rate": 0.000176652065376101, + "loss": 2.0882, + "step": 4960 + }, + { + "epoch": 0.11704582921200132, + "grad_norm": 2.4764654636383057, + "learning_rate": 0.00017660496443879233, + "loss": 2.3867, + "step": 4970 + }, + { + "epoch": 0.11728133389854459, + "grad_norm": 1.8438410758972168, + "learning_rate": 0.00017655786350148368, + "loss": 2.3935, + "step": 4980 + }, + { + "epoch": 0.11751683858508784, + "grad_norm": 2.0502359867095947, + "learning_rate": 0.00017651076256417503, + "loss": 2.1566, + "step": 4990 + }, + { + "epoch": 0.1177523432716311, + "grad_norm": 2.705681562423706, + "learning_rate": 0.00017646366162686638, + "loss": 2.2442, + "step": 5000 + }, + { + "epoch": 0.11798784795817437, + "grad_norm": 1.7821354866027832, + "learning_rate": 0.00017641656068955773, + "loss": 2.095, + "step": 5010 + }, + { + "epoch": 0.11822335264471763, + "grad_norm": 2.171154022216797, + "learning_rate": 0.00017636945975224908, + "loss": 2.1128, + "step": 5020 + }, + { + "epoch": 0.1184588573312609, + "grad_norm": 1.8381446599960327, + "learning_rate": 0.00017632235881494043, + "loss": 2.2702, + "step": 5030 + }, + { + "epoch": 0.11869436201780416, + "grad_norm": 2.394317865371704, + "learning_rate": 0.00017627525787763178, + "loss": 2.2467, + "step": 5040 + }, + { + "epoch": 0.11892986670434742, + "grad_norm": 2.100402593612671, + "learning_rate": 0.00017622815694032313, + "loss": 2.1857, + "step": 5050 + }, + { + "epoch": 0.11916537139089067, + "grad_norm": 1.8596925735473633, + "learning_rate": 0.00017618105600301445, + "loss": 2.0607, + "step": 5060 + }, + { + "epoch": 0.11940087607743394, + "grad_norm": 2.523073196411133, + "learning_rate": 0.00017613395506570583, + "loss": 2.2643, + "step": 5070 + }, + { + "epoch": 0.1196363807639772, + "grad_norm": 1.6431152820587158, + "learning_rate": 0.00017608685412839715, + "loss": 2.3096, + "step": 5080 + }, + { + "epoch": 0.11987188545052047, + "grad_norm": 1.8515095710754395, + "learning_rate": 0.00017603975319108853, + "loss": 2.2366, + "step": 5090 + }, + { + "epoch": 0.12010739013706373, + "grad_norm": 2.4143874645233154, + "learning_rate": 0.00017599265225377985, + "loss": 2.2266, + "step": 5100 + }, + { + "epoch": 0.120342894823607, + "grad_norm": 2.4558234214782715, + "learning_rate": 0.0001759455513164712, + "loss": 2.3298, + "step": 5110 + }, + { + "epoch": 0.12057839951015026, + "grad_norm": 1.5276613235473633, + "learning_rate": 0.00017589845037916256, + "loss": 2.1767, + "step": 5120 + }, + { + "epoch": 0.12081390419669351, + "grad_norm": 2.6570372581481934, + "learning_rate": 0.0001758513494418539, + "loss": 2.3225, + "step": 5130 + }, + { + "epoch": 0.12104940888323677, + "grad_norm": 2.54841685295105, + "learning_rate": 0.00017580424850454526, + "loss": 2.3588, + "step": 5140 + }, + { + "epoch": 0.12128491356978004, + "grad_norm": 1.772416114807129, + "learning_rate": 0.0001757571475672366, + "loss": 2.3906, + "step": 5150 + }, + { + "epoch": 0.1215204182563233, + "grad_norm": 1.865113615989685, + "learning_rate": 0.00017571004662992793, + "loss": 2.2392, + "step": 5160 + }, + { + "epoch": 0.12175592294286657, + "grad_norm": 2.2344613075256348, + "learning_rate": 0.0001756629456926193, + "loss": 2.127, + "step": 5170 + }, + { + "epoch": 0.12199142762940983, + "grad_norm": 2.441580057144165, + "learning_rate": 0.00017561584475531063, + "loss": 2.2126, + "step": 5180 + }, + { + "epoch": 0.1222269323159531, + "grad_norm": 1.9913771152496338, + "learning_rate": 0.00017556874381800198, + "loss": 2.22, + "step": 5190 + }, + { + "epoch": 0.12246243700249634, + "grad_norm": 2.1991429328918457, + "learning_rate": 0.00017552164288069333, + "loss": 2.2766, + "step": 5200 + }, + { + "epoch": 0.12269794168903961, + "grad_norm": 2.120680809020996, + "learning_rate": 0.00017547454194338468, + "loss": 2.2078, + "step": 5210 + }, + { + "epoch": 0.12293344637558287, + "grad_norm": 1.788309931755066, + "learning_rate": 0.00017542744100607603, + "loss": 2.2587, + "step": 5220 + }, + { + "epoch": 0.12316895106212614, + "grad_norm": 2.2423155307769775, + "learning_rate": 0.00017538034006876738, + "loss": 2.2241, + "step": 5230 + }, + { + "epoch": 0.1234044557486694, + "grad_norm": 1.9412622451782227, + "learning_rate": 0.0001753332391314587, + "loss": 2.1715, + "step": 5240 + }, + { + "epoch": 0.12363996043521266, + "grad_norm": 2.0444014072418213, + "learning_rate": 0.00017528613819415008, + "loss": 2.1813, + "step": 5250 + }, + { + "epoch": 0.12387546512175593, + "grad_norm": 1.9657073020935059, + "learning_rate": 0.0001752390372568414, + "loss": 2.3767, + "step": 5260 + }, + { + "epoch": 0.12411096980829918, + "grad_norm": 2.156477928161621, + "learning_rate": 0.00017519193631953275, + "loss": 1.8883, + "step": 5270 + }, + { + "epoch": 0.12434647449484244, + "grad_norm": 2.735069513320923, + "learning_rate": 0.0001751448353822241, + "loss": 2.2131, + "step": 5280 + }, + { + "epoch": 0.12458197918138571, + "grad_norm": 1.98786199092865, + "learning_rate": 0.00017509773444491546, + "loss": 2.0419, + "step": 5290 + }, + { + "epoch": 0.12481748386792897, + "grad_norm": 2.619561195373535, + "learning_rate": 0.00017505063350760683, + "loss": 2.0047, + "step": 5300 + }, + { + "epoch": 0.12505298855447222, + "grad_norm": 2.2264404296875, + "learning_rate": 0.00017500353257029816, + "loss": 2.1834, + "step": 5310 + }, + { + "epoch": 0.1252884932410155, + "grad_norm": 2.1874210834503174, + "learning_rate": 0.0001749564316329895, + "loss": 2.1197, + "step": 5320 + }, + { + "epoch": 0.12552399792755875, + "grad_norm": 2.7099130153656006, + "learning_rate": 0.00017490933069568086, + "loss": 1.9282, + "step": 5330 + }, + { + "epoch": 0.12575950261410201, + "grad_norm": 2.1280713081359863, + "learning_rate": 0.0001748622297583722, + "loss": 2.0907, + "step": 5340 + }, + { + "epoch": 0.12599500730064528, + "grad_norm": 2.1866750717163086, + "learning_rate": 0.00017481512882106356, + "loss": 1.8259, + "step": 5350 + }, + { + "epoch": 0.12623051198718854, + "grad_norm": 1.9257513284683228, + "learning_rate": 0.0001747680278837549, + "loss": 2.1086, + "step": 5360 + }, + { + "epoch": 0.1264660166737318, + "grad_norm": 1.8964312076568604, + "learning_rate": 0.00017472092694644623, + "loss": 2.0995, + "step": 5370 + }, + { + "epoch": 0.12670152136027507, + "grad_norm": 2.228306770324707, + "learning_rate": 0.0001746738260091376, + "loss": 2.0373, + "step": 5380 + }, + { + "epoch": 0.12693702604681834, + "grad_norm": 2.723195791244507, + "learning_rate": 0.00017462672507182893, + "loss": 2.0848, + "step": 5390 + }, + { + "epoch": 0.1271725307333616, + "grad_norm": 2.1091506481170654, + "learning_rate": 0.00017457962413452028, + "loss": 2.3576, + "step": 5400 + }, + { + "epoch": 0.12740803541990486, + "grad_norm": 2.01425838470459, + "learning_rate": 0.00017453252319721163, + "loss": 2.1692, + "step": 5410 + }, + { + "epoch": 0.12764354010644813, + "grad_norm": 2.0158488750457764, + "learning_rate": 0.00017448542225990298, + "loss": 2.1845, + "step": 5420 + }, + { + "epoch": 0.1278790447929914, + "grad_norm": 1.796004056930542, + "learning_rate": 0.00017443832132259433, + "loss": 2.1808, + "step": 5430 + }, + { + "epoch": 0.12811454947953466, + "grad_norm": 2.268319845199585, + "learning_rate": 0.00017439122038528568, + "loss": 2.1978, + "step": 5440 + }, + { + "epoch": 0.1283500541660779, + "grad_norm": 2.573340654373169, + "learning_rate": 0.000174344119447977, + "loss": 2.2889, + "step": 5450 + }, + { + "epoch": 0.12858555885262116, + "grad_norm": 2.2684388160705566, + "learning_rate": 0.00017429701851066838, + "loss": 2.0348, + "step": 5460 + }, + { + "epoch": 0.12882106353916442, + "grad_norm": 2.271415948867798, + "learning_rate": 0.0001742499175733597, + "loss": 2.3534, + "step": 5470 + }, + { + "epoch": 0.12905656822570769, + "grad_norm": 1.900991439819336, + "learning_rate": 0.00017420281663605108, + "loss": 2.1018, + "step": 5480 + }, + { + "epoch": 0.12929207291225095, + "grad_norm": 2.262239456176758, + "learning_rate": 0.0001741557156987424, + "loss": 2.2768, + "step": 5490 + }, + { + "epoch": 0.1295275775987942, + "grad_norm": 2.041602849960327, + "learning_rate": 0.00017410861476143376, + "loss": 2.3032, + "step": 5500 + }, + { + "epoch": 0.12976308228533748, + "grad_norm": 1.4872256517410278, + "learning_rate": 0.0001740615138241251, + "loss": 2.3222, + "step": 5510 + }, + { + "epoch": 0.12999858697188074, + "grad_norm": 1.8539739847183228, + "learning_rate": 0.00017401441288681646, + "loss": 2.132, + "step": 5520 + }, + { + "epoch": 0.130234091658424, + "grad_norm": 2.243079900741577, + "learning_rate": 0.0001739673119495078, + "loss": 2.1824, + "step": 5530 + }, + { + "epoch": 0.13046959634496727, + "grad_norm": 2.0111191272735596, + "learning_rate": 0.00017392021101219916, + "loss": 2.1091, + "step": 5540 + }, + { + "epoch": 0.13070510103151053, + "grad_norm": 2.0866692066192627, + "learning_rate": 0.00017387311007489048, + "loss": 2.2156, + "step": 5550 + }, + { + "epoch": 0.1309406057180538, + "grad_norm": 1.6180267333984375, + "learning_rate": 0.00017382600913758186, + "loss": 2.2403, + "step": 5560 + }, + { + "epoch": 0.13117611040459706, + "grad_norm": 1.926936388015747, + "learning_rate": 0.00017378361829400408, + "loss": 2.268, + "step": 5570 + }, + { + "epoch": 0.13141161509114033, + "grad_norm": 1.732120156288147, + "learning_rate": 0.0001737365173566954, + "loss": 1.943, + "step": 5580 + }, + { + "epoch": 0.13164711977768356, + "grad_norm": 2.2308902740478516, + "learning_rate": 0.00017368941641938678, + "loss": 2.3538, + "step": 5590 + }, + { + "epoch": 0.13188262446422683, + "grad_norm": 2.0219004154205322, + "learning_rate": 0.0001736423154820781, + "loss": 2.1096, + "step": 5600 + }, + { + "epoch": 0.1321181291507701, + "grad_norm": 2.223156213760376, + "learning_rate": 0.00017359521454476945, + "loss": 2.138, + "step": 5610 + }, + { + "epoch": 0.13235363383731336, + "grad_norm": 2.6512413024902344, + "learning_rate": 0.0001735481136074608, + "loss": 1.9548, + "step": 5620 + }, + { + "epoch": 0.13258913852385662, + "grad_norm": 1.974619746208191, + "learning_rate": 0.00017350101267015215, + "loss": 2.1642, + "step": 5630 + }, + { + "epoch": 0.13282464321039988, + "grad_norm": 2.279487133026123, + "learning_rate": 0.0001734539117328435, + "loss": 2.1445, + "step": 5640 + }, + { + "epoch": 0.13306014789694315, + "grad_norm": 1.9778087139129639, + "learning_rate": 0.00017340681079553485, + "loss": 2.2169, + "step": 5650 + }, + { + "epoch": 0.1332956525834864, + "grad_norm": 1.9893101453781128, + "learning_rate": 0.00017335970985822617, + "loss": 2.2353, + "step": 5660 + }, + { + "epoch": 0.13353115727002968, + "grad_norm": 2.7806546688079834, + "learning_rate": 0.00017331260892091755, + "loss": 2.1214, + "step": 5670 + }, + { + "epoch": 0.13376666195657294, + "grad_norm": 2.2366230487823486, + "learning_rate": 0.00017326550798360887, + "loss": 2.1004, + "step": 5680 + }, + { + "epoch": 0.1340021666431162, + "grad_norm": 3.1434006690979004, + "learning_rate": 0.00017321840704630022, + "loss": 2.3682, + "step": 5690 + }, + { + "epoch": 0.13423767132965947, + "grad_norm": 1.862724781036377, + "learning_rate": 0.00017317130610899157, + "loss": 2.1701, + "step": 5700 + }, + { + "epoch": 0.13447317601620273, + "grad_norm": 2.2287230491638184, + "learning_rate": 0.00017312420517168292, + "loss": 2.2376, + "step": 5710 + }, + { + "epoch": 0.134708680702746, + "grad_norm": 2.0166916847229004, + "learning_rate": 0.00017307710423437428, + "loss": 2.2481, + "step": 5720 + }, + { + "epoch": 0.13494418538928923, + "grad_norm": 2.4379286766052246, + "learning_rate": 0.00017303000329706563, + "loss": 2.2884, + "step": 5730 + }, + { + "epoch": 0.1351796900758325, + "grad_norm": 1.923784613609314, + "learning_rate": 0.00017298290235975695, + "loss": 2.13, + "step": 5740 + }, + { + "epoch": 0.13541519476237576, + "grad_norm": 2.3278648853302, + "learning_rate": 0.00017293580142244833, + "loss": 2.4975, + "step": 5750 + }, + { + "epoch": 0.13565069944891903, + "grad_norm": 2.186098098754883, + "learning_rate": 0.00017288870048513965, + "loss": 2.4386, + "step": 5760 + }, + { + "epoch": 0.1358862041354623, + "grad_norm": 1.8894046545028687, + "learning_rate": 0.000172841599547831, + "loss": 2.1392, + "step": 5770 + }, + { + "epoch": 0.13612170882200555, + "grad_norm": 2.2432827949523926, + "learning_rate": 0.00017279449861052235, + "loss": 2.2693, + "step": 5780 + }, + { + "epoch": 0.13635721350854882, + "grad_norm": 1.860554575920105, + "learning_rate": 0.0001727473976732137, + "loss": 2.0159, + "step": 5790 + }, + { + "epoch": 0.13659271819509208, + "grad_norm": 1.6823490858078003, + "learning_rate": 0.00017270029673590505, + "loss": 2.2196, + "step": 5800 + }, + { + "epoch": 0.13682822288163535, + "grad_norm": 1.9365490674972534, + "learning_rate": 0.0001726531957985964, + "loss": 2.2016, + "step": 5810 + }, + { + "epoch": 0.1370637275681786, + "grad_norm": 2.2856075763702393, + "learning_rate": 0.00017260609486128775, + "loss": 2.2524, + "step": 5820 + }, + { + "epoch": 0.13729923225472188, + "grad_norm": 2.667853832244873, + "learning_rate": 0.0001725589939239791, + "loss": 1.9745, + "step": 5830 + }, + { + "epoch": 0.13753473694126514, + "grad_norm": 1.9969861507415771, + "learning_rate": 0.00017251189298667045, + "loss": 2.1179, + "step": 5840 + }, + { + "epoch": 0.1377702416278084, + "grad_norm": 1.866898536682129, + "learning_rate": 0.0001724647920493618, + "loss": 2.3985, + "step": 5850 + }, + { + "epoch": 0.13800574631435167, + "grad_norm": 2.4066002368927, + "learning_rate": 0.00017241769111205315, + "loss": 2.2191, + "step": 5860 + }, + { + "epoch": 0.1382412510008949, + "grad_norm": 2.1669511795043945, + "learning_rate": 0.00017237059017474447, + "loss": 2.07, + "step": 5870 + }, + { + "epoch": 0.13847675568743817, + "grad_norm": 1.9920525550842285, + "learning_rate": 0.00017232348923743585, + "loss": 2.1086, + "step": 5880 + }, + { + "epoch": 0.13871226037398143, + "grad_norm": 2.622621536254883, + "learning_rate": 0.00017227638830012717, + "loss": 2.1982, + "step": 5890 + }, + { + "epoch": 0.1389477650605247, + "grad_norm": 2.285757064819336, + "learning_rate": 0.00017222928736281853, + "loss": 2.2294, + "step": 5900 + }, + { + "epoch": 0.13918326974706796, + "grad_norm": 2.428218126296997, + "learning_rate": 0.00017218218642550988, + "loss": 2.3078, + "step": 5910 + }, + { + "epoch": 0.13941877443361123, + "grad_norm": 1.8226295709609985, + "learning_rate": 0.00017213508548820123, + "loss": 2.1509, + "step": 5920 + }, + { + "epoch": 0.1396542791201545, + "grad_norm": 2.660916328430176, + "learning_rate": 0.00017208798455089258, + "loss": 2.0719, + "step": 5930 + }, + { + "epoch": 0.13988978380669775, + "grad_norm": 1.9851837158203125, + "learning_rate": 0.00017204088361358393, + "loss": 2.0906, + "step": 5940 + }, + { + "epoch": 0.14012528849324102, + "grad_norm": 2.3256642818450928, + "learning_rate": 0.00017199378267627525, + "loss": 2.0444, + "step": 5950 + }, + { + "epoch": 0.14036079317978428, + "grad_norm": 1.7920613288879395, + "learning_rate": 0.00017194668173896663, + "loss": 2.1493, + "step": 5960 + }, + { + "epoch": 0.14059629786632755, + "grad_norm": 1.760733723640442, + "learning_rate": 0.00017189958080165795, + "loss": 2.3474, + "step": 5970 + }, + { + "epoch": 0.1408318025528708, + "grad_norm": 2.066558599472046, + "learning_rate": 0.00017185247986434933, + "loss": 2.1801, + "step": 5980 + }, + { + "epoch": 0.14106730723941407, + "grad_norm": 1.9565317630767822, + "learning_rate": 0.00017180537892704065, + "loss": 2.3232, + "step": 5990 + }, + { + "epoch": 0.14130281192595734, + "grad_norm": 2.2794132232666016, + "learning_rate": 0.000171758277989732, + "loss": 2.178, + "step": 6000 + }, + { + "epoch": 0.14153831661250058, + "grad_norm": 1.8281110525131226, + "learning_rate": 0.00017171117705242335, + "loss": 2.1067, + "step": 6010 + }, + { + "epoch": 0.14177382129904384, + "grad_norm": 1.9072505235671997, + "learning_rate": 0.0001716640761151147, + "loss": 1.9927, + "step": 6020 + }, + { + "epoch": 0.1420093259855871, + "grad_norm": 2.288691997528076, + "learning_rate": 0.00017161697517780605, + "loss": 2.0077, + "step": 6030 + }, + { + "epoch": 0.14224483067213037, + "grad_norm": 2.0994765758514404, + "learning_rate": 0.0001715698742404974, + "loss": 2.2116, + "step": 6040 + }, + { + "epoch": 0.14248033535867363, + "grad_norm": 1.807565689086914, + "learning_rate": 0.00017152277330318872, + "loss": 2.3081, + "step": 6050 + }, + { + "epoch": 0.1427158400452169, + "grad_norm": 1.8691102266311646, + "learning_rate": 0.0001714756723658801, + "loss": 2.1234, + "step": 6060 + }, + { + "epoch": 0.14295134473176016, + "grad_norm": 2.168907403945923, + "learning_rate": 0.00017142857142857143, + "loss": 2.1899, + "step": 6070 + }, + { + "epoch": 0.14318684941830342, + "grad_norm": 2.2471699714660645, + "learning_rate": 0.00017138147049126278, + "loss": 2.3476, + "step": 6080 + }, + { + "epoch": 0.1434223541048467, + "grad_norm": 1.8426363468170166, + "learning_rate": 0.00017133436955395413, + "loss": 2.0467, + "step": 6090 + }, + { + "epoch": 0.14365785879138995, + "grad_norm": 1.6236611604690552, + "learning_rate": 0.00017128726861664548, + "loss": 2.0523, + "step": 6100 + }, + { + "epoch": 0.14389336347793322, + "grad_norm": 1.660072922706604, + "learning_rate": 0.00017124016767933683, + "loss": 2.2243, + "step": 6110 + }, + { + "epoch": 0.14412886816447648, + "grad_norm": 2.4543228149414062, + "learning_rate": 0.00017119306674202818, + "loss": 2.1948, + "step": 6120 + }, + { + "epoch": 0.14436437285101975, + "grad_norm": 1.9436759948730469, + "learning_rate": 0.00017114596580471953, + "loss": 2.0913, + "step": 6130 + }, + { + "epoch": 0.144599877537563, + "grad_norm": 2.1361687183380127, + "learning_rate": 0.00017109886486741088, + "loss": 2.2386, + "step": 6140 + }, + { + "epoch": 0.14483538222410625, + "grad_norm": 3.258145332336426, + "learning_rate": 0.00017105176393010223, + "loss": 2.1049, + "step": 6150 + }, + { + "epoch": 0.1450708869106495, + "grad_norm": 2.3219218254089355, + "learning_rate": 0.00017100466299279355, + "loss": 2.2702, + "step": 6160 + }, + { + "epoch": 0.14530639159719277, + "grad_norm": 2.53751277923584, + "learning_rate": 0.00017095756205548493, + "loss": 2.1055, + "step": 6170 + }, + { + "epoch": 0.14554189628373604, + "grad_norm": 1.9522231817245483, + "learning_rate": 0.00017091046111817625, + "loss": 2.1196, + "step": 6180 + }, + { + "epoch": 0.1457774009702793, + "grad_norm": 1.9058953523635864, + "learning_rate": 0.00017086336018086763, + "loss": 2.1288, + "step": 6190 + }, + { + "epoch": 0.14601290565682257, + "grad_norm": 2.3197526931762695, + "learning_rate": 0.00017081625924355895, + "loss": 2.0571, + "step": 6200 + }, + { + "epoch": 0.14624841034336583, + "grad_norm": 2.9913275241851807, + "learning_rate": 0.0001707691583062503, + "loss": 2.042, + "step": 6210 + }, + { + "epoch": 0.1464839150299091, + "grad_norm": 3.6737120151519775, + "learning_rate": 0.00017072205736894165, + "loss": 2.09, + "step": 6220 + }, + { + "epoch": 0.14671941971645236, + "grad_norm": 2.2069551944732666, + "learning_rate": 0.000170674956431633, + "loss": 2.1867, + "step": 6230 + }, + { + "epoch": 0.14695492440299562, + "grad_norm": 2.2664968967437744, + "learning_rate": 0.00017062785549432435, + "loss": 2.2181, + "step": 6240 + }, + { + "epoch": 0.1471904290895389, + "grad_norm": 1.966537356376648, + "learning_rate": 0.0001705807545570157, + "loss": 2.0683, + "step": 6250 + }, + { + "epoch": 0.14742593377608215, + "grad_norm": 1.8831692934036255, + "learning_rate": 0.00017053365361970703, + "loss": 2.0964, + "step": 6260 + }, + { + "epoch": 0.14766143846262542, + "grad_norm": 2.0618419647216797, + "learning_rate": 0.0001704865526823984, + "loss": 2.3007, + "step": 6270 + }, + { + "epoch": 0.14789694314916868, + "grad_norm": 2.163808584213257, + "learning_rate": 0.00017043945174508973, + "loss": 2.1509, + "step": 6280 + }, + { + "epoch": 0.14813244783571194, + "grad_norm": 2.259477138519287, + "learning_rate": 0.00017039235080778108, + "loss": 2.2863, + "step": 6290 + }, + { + "epoch": 0.14836795252225518, + "grad_norm": 1.4202922582626343, + "learning_rate": 0.00017034524987047243, + "loss": 1.9795, + "step": 6300 + }, + { + "epoch": 0.14860345720879845, + "grad_norm": 2.2261738777160645, + "learning_rate": 0.00017029814893316378, + "loss": 2.0105, + "step": 6310 + }, + { + "epoch": 0.1488389618953417, + "grad_norm": 2.284642457962036, + "learning_rate": 0.00017025104799585513, + "loss": 2.1962, + "step": 6320 + }, + { + "epoch": 0.14907446658188497, + "grad_norm": 1.6194367408752441, + "learning_rate": 0.00017020394705854648, + "loss": 2.0346, + "step": 6330 + }, + { + "epoch": 0.14930997126842824, + "grad_norm": 2.0232205390930176, + "learning_rate": 0.0001701568461212378, + "loss": 2.2708, + "step": 6340 + }, + { + "epoch": 0.1495454759549715, + "grad_norm": 2.325477361679077, + "learning_rate": 0.00017010974518392918, + "loss": 2.1813, + "step": 6350 + }, + { + "epoch": 0.14978098064151477, + "grad_norm": 2.309537172317505, + "learning_rate": 0.0001700626442466205, + "loss": 2.1677, + "step": 6360 + }, + { + "epoch": 0.15001648532805803, + "grad_norm": 2.201800584793091, + "learning_rate": 0.00017001554330931185, + "loss": 2.1772, + "step": 6370 + }, + { + "epoch": 0.1502519900146013, + "grad_norm": 1.840621829032898, + "learning_rate": 0.0001699684423720032, + "loss": 2.1918, + "step": 6380 + }, + { + "epoch": 0.15048749470114456, + "grad_norm": 2.601182222366333, + "learning_rate": 0.00016992134143469455, + "loss": 2.1371, + "step": 6390 + }, + { + "epoch": 0.15072299938768782, + "grad_norm": 1.4714235067367554, + "learning_rate": 0.00016987424049738593, + "loss": 2.0602, + "step": 6400 + }, + { + "epoch": 0.1509585040742311, + "grad_norm": 2.3133628368377686, + "learning_rate": 0.00016982713956007725, + "loss": 2.3063, + "step": 6410 + }, + { + "epoch": 0.15119400876077435, + "grad_norm": 1.9627376794815063, + "learning_rate": 0.0001697800386227686, + "loss": 1.9214, + "step": 6420 + }, + { + "epoch": 0.15142951344731762, + "grad_norm": 2.3527040481567383, + "learning_rate": 0.00016973293768545995, + "loss": 2.1646, + "step": 6430 + }, + { + "epoch": 0.15166501813386085, + "grad_norm": 2.2275426387786865, + "learning_rate": 0.0001696858367481513, + "loss": 2.1793, + "step": 6440 + }, + { + "epoch": 0.15190052282040412, + "grad_norm": 2.1824982166290283, + "learning_rate": 0.00016963873581084265, + "loss": 2.2669, + "step": 6450 + }, + { + "epoch": 0.15213602750694738, + "grad_norm": 2.512240171432495, + "learning_rate": 0.000169591634873534, + "loss": 2.1601, + "step": 6460 + }, + { + "epoch": 0.15237153219349064, + "grad_norm": 2.1347885131835938, + "learning_rate": 0.00016954453393622533, + "loss": 2.1572, + "step": 6470 + }, + { + "epoch": 0.1526070368800339, + "grad_norm": 1.7296582460403442, + "learning_rate": 0.0001694974329989167, + "loss": 2.2337, + "step": 6480 + }, + { + "epoch": 0.15284254156657717, + "grad_norm": 2.2184410095214844, + "learning_rate": 0.00016945033206160803, + "loss": 2.1873, + "step": 6490 + }, + { + "epoch": 0.15307804625312044, + "grad_norm": 1.787880778312683, + "learning_rate": 0.00016940323112429938, + "loss": 2.0755, + "step": 6500 + }, + { + "epoch": 0.1533135509396637, + "grad_norm": 1.9352377653121948, + "learning_rate": 0.00016935613018699073, + "loss": 2.2166, + "step": 6510 + }, + { + "epoch": 0.15354905562620697, + "grad_norm": 2.27699875831604, + "learning_rate": 0.00016930902924968208, + "loss": 2.2189, + "step": 6520 + }, + { + "epoch": 0.15378456031275023, + "grad_norm": 1.4749678373336792, + "learning_rate": 0.00016926192831237343, + "loss": 1.9943, + "step": 6530 + }, + { + "epoch": 0.1540200649992935, + "grad_norm": 2.7707648277282715, + "learning_rate": 0.00016921482737506478, + "loss": 2.3423, + "step": 6540 + }, + { + "epoch": 0.15425556968583676, + "grad_norm": 2.5478742122650146, + "learning_rate": 0.0001691677264377561, + "loss": 2.1751, + "step": 6550 + }, + { + "epoch": 0.15449107437238002, + "grad_norm": 2.3068339824676514, + "learning_rate": 0.00016912062550044748, + "loss": 2.2444, + "step": 6560 + }, + { + "epoch": 0.15472657905892329, + "grad_norm": 1.8547791242599487, + "learning_rate": 0.0001690735245631388, + "loss": 2.436, + "step": 6570 + }, + { + "epoch": 0.15496208374546652, + "grad_norm": 2.1598117351531982, + "learning_rate": 0.00016902642362583018, + "loss": 2.0398, + "step": 6580 + }, + { + "epoch": 0.1551975884320098, + "grad_norm": 2.000115156173706, + "learning_rate": 0.0001689793226885215, + "loss": 1.9789, + "step": 6590 + }, + { + "epoch": 0.15543309311855305, + "grad_norm": 2.3992674350738525, + "learning_rate": 0.00016893222175121285, + "loss": 2.2092, + "step": 6600 + }, + { + "epoch": 0.15566859780509631, + "grad_norm": 2.2532436847686768, + "learning_rate": 0.0001688851208139042, + "loss": 2.0844, + "step": 6610 + }, + { + "epoch": 0.15590410249163958, + "grad_norm": 1.8847017288208008, + "learning_rate": 0.00016883801987659555, + "loss": 2.0509, + "step": 6620 + }, + { + "epoch": 0.15613960717818284, + "grad_norm": 2.4338490962982178, + "learning_rate": 0.0001687909189392869, + "loss": 2.229, + "step": 6630 + }, + { + "epoch": 0.1563751118647261, + "grad_norm": 1.6963534355163574, + "learning_rate": 0.00016874381800197825, + "loss": 2.3512, + "step": 6640 + }, + { + "epoch": 0.15661061655126937, + "grad_norm": 2.159989356994629, + "learning_rate": 0.00016869671706466958, + "loss": 2.1962, + "step": 6650 + }, + { + "epoch": 0.15684612123781264, + "grad_norm": 2.163346290588379, + "learning_rate": 0.00016864961612736095, + "loss": 2.2327, + "step": 6660 + }, + { + "epoch": 0.1570816259243559, + "grad_norm": 3.266700029373169, + "learning_rate": 0.00016860251519005228, + "loss": 2.0636, + "step": 6670 + }, + { + "epoch": 0.15731713061089916, + "grad_norm": 2.0337531566619873, + "learning_rate": 0.00016855541425274363, + "loss": 2.2385, + "step": 6680 + }, + { + "epoch": 0.15755263529744243, + "grad_norm": 2.7667040824890137, + "learning_rate": 0.000168508313315435, + "loss": 2.1956, + "step": 6690 + }, + { + "epoch": 0.1577881399839857, + "grad_norm": 2.1070220470428467, + "learning_rate": 0.00016846121237812633, + "loss": 2.1798, + "step": 6700 + }, + { + "epoch": 0.15802364467052896, + "grad_norm": 1.6910346746444702, + "learning_rate": 0.00016841411144081768, + "loss": 2.1594, + "step": 6710 + }, + { + "epoch": 0.1582591493570722, + "grad_norm": 2.147885799407959, + "learning_rate": 0.00016836701050350903, + "loss": 2.2545, + "step": 6720 + }, + { + "epoch": 0.15849465404361546, + "grad_norm": 2.54560923576355, + "learning_rate": 0.00016831990956620038, + "loss": 2.0461, + "step": 6730 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 2.0715444087982178, + "learning_rate": 0.00016827280862889173, + "loss": 2.3273, + "step": 6740 + }, + { + "epoch": 0.15896566341670199, + "grad_norm": 2.0344667434692383, + "learning_rate": 0.00016822570769158308, + "loss": 2.2495, + "step": 6750 + }, + { + "epoch": 0.15920116810324525, + "grad_norm": 2.010301113128662, + "learning_rate": 0.0001681786067542744, + "loss": 2.2644, + "step": 6760 + }, + { + "epoch": 0.1594366727897885, + "grad_norm": 2.423848867416382, + "learning_rate": 0.00016813150581696578, + "loss": 1.9635, + "step": 6770 + }, + { + "epoch": 0.15967217747633178, + "grad_norm": 1.817819595336914, + "learning_rate": 0.0001680844048796571, + "loss": 2.1507, + "step": 6780 + }, + { + "epoch": 0.15990768216287504, + "grad_norm": 2.4440290927886963, + "learning_rate": 0.00016803730394234848, + "loss": 2.641, + "step": 6790 + }, + { + "epoch": 0.1601431868494183, + "grad_norm": 1.7758880853652954, + "learning_rate": 0.0001679902030050398, + "loss": 2.0812, + "step": 6800 + }, + { + "epoch": 0.16037869153596157, + "grad_norm": 2.6482930183410645, + "learning_rate": 0.00016794310206773115, + "loss": 2.0167, + "step": 6810 + }, + { + "epoch": 0.16061419622250483, + "grad_norm": 2.0901808738708496, + "learning_rate": 0.0001678960011304225, + "loss": 1.9824, + "step": 6820 + }, + { + "epoch": 0.1608497009090481, + "grad_norm": 2.458341121673584, + "learning_rate": 0.00016784890019311385, + "loss": 2.0419, + "step": 6830 + }, + { + "epoch": 0.16108520559559136, + "grad_norm": 1.7757121324539185, + "learning_rate": 0.0001678017992558052, + "loss": 2.1314, + "step": 6840 + }, + { + "epoch": 0.16132071028213463, + "grad_norm": 2.2329952716827393, + "learning_rate": 0.00016775469831849655, + "loss": 2.0286, + "step": 6850 + }, + { + "epoch": 0.16155621496867786, + "grad_norm": 2.1423516273498535, + "learning_rate": 0.00016770759738118788, + "loss": 2.21, + "step": 6860 + }, + { + "epoch": 0.16179171965522113, + "grad_norm": 1.869727611541748, + "learning_rate": 0.00016766049644387925, + "loss": 2.0268, + "step": 6870 + }, + { + "epoch": 0.1620272243417644, + "grad_norm": 2.514774799346924, + "learning_rate": 0.00016761339550657058, + "loss": 2.164, + "step": 6880 + }, + { + "epoch": 0.16226272902830766, + "grad_norm": 1.9911197423934937, + "learning_rate": 0.00016756629456926193, + "loss": 2.0608, + "step": 6890 + }, + { + "epoch": 0.16249823371485092, + "grad_norm": 1.9480197429656982, + "learning_rate": 0.00016751919363195328, + "loss": 2.1584, + "step": 6900 + }, + { + "epoch": 0.16273373840139418, + "grad_norm": 3.6419789791107178, + "learning_rate": 0.00016747209269464463, + "loss": 2.0359, + "step": 6910 + }, + { + "epoch": 0.16296924308793745, + "grad_norm": 1.879197597503662, + "learning_rate": 0.00016742499175733598, + "loss": 1.9777, + "step": 6920 + }, + { + "epoch": 0.1632047477744807, + "grad_norm": 2.217308759689331, + "learning_rate": 0.00016737789082002733, + "loss": 2.3454, + "step": 6930 + }, + { + "epoch": 0.16344025246102398, + "grad_norm": 2.226469039916992, + "learning_rate": 0.00016733078988271865, + "loss": 2.1253, + "step": 6940 + }, + { + "epoch": 0.16367575714756724, + "grad_norm": 2.5227324962615967, + "learning_rate": 0.00016728368894541003, + "loss": 2.0713, + "step": 6950 + }, + { + "epoch": 0.1639112618341105, + "grad_norm": 1.8012442588806152, + "learning_rate": 0.00016723658800810138, + "loss": 1.9799, + "step": 6960 + }, + { + "epoch": 0.16414676652065377, + "grad_norm": 3.06243634223938, + "learning_rate": 0.0001671894870707927, + "loss": 2.3178, + "step": 6970 + }, + { + "epoch": 0.16438227120719703, + "grad_norm": 1.4060784578323364, + "learning_rate": 0.00016714238613348408, + "loss": 2.149, + "step": 6980 + }, + { + "epoch": 0.1646177758937403, + "grad_norm": 1.7966259717941284, + "learning_rate": 0.0001670952851961754, + "loss": 2.2267, + "step": 6990 + }, + { + "epoch": 0.16485328058028353, + "grad_norm": 1.7830440998077393, + "learning_rate": 0.00016704818425886678, + "loss": 2.1592, + "step": 7000 + }, + { + "epoch": 0.1650887852668268, + "grad_norm": 1.8834739923477173, + "learning_rate": 0.0001670010833215581, + "loss": 2.0868, + "step": 7010 + }, + { + "epoch": 0.16532428995337006, + "grad_norm": 2.2466180324554443, + "learning_rate": 0.00016695398238424945, + "loss": 2.1263, + "step": 7020 + }, + { + "epoch": 0.16555979463991333, + "grad_norm": 2.24786639213562, + "learning_rate": 0.0001669068814469408, + "loss": 2.0783, + "step": 7030 + }, + { + "epoch": 0.1657952993264566, + "grad_norm": 2.1359407901763916, + "learning_rate": 0.00016685978050963215, + "loss": 2.2205, + "step": 7040 + }, + { + "epoch": 0.16603080401299986, + "grad_norm": 1.937888741493225, + "learning_rate": 0.0001668126795723235, + "loss": 2.191, + "step": 7050 + }, + { + "epoch": 0.16626630869954312, + "grad_norm": 2.25390625, + "learning_rate": 0.00016676557863501486, + "loss": 2.0718, + "step": 7060 + }, + { + "epoch": 0.16650181338608638, + "grad_norm": 1.8093719482421875, + "learning_rate": 0.00016671847769770618, + "loss": 2.133, + "step": 7070 + }, + { + "epoch": 0.16673731807262965, + "grad_norm": 2.38726806640625, + "learning_rate": 0.00016667137676039756, + "loss": 2.2872, + "step": 7080 + }, + { + "epoch": 0.1669728227591729, + "grad_norm": 1.934637188911438, + "learning_rate": 0.00016662427582308888, + "loss": 2.3178, + "step": 7090 + }, + { + "epoch": 0.16720832744571618, + "grad_norm": 2.438570976257324, + "learning_rate": 0.00016657717488578023, + "loss": 1.9654, + "step": 7100 + }, + { + "epoch": 0.16744383213225944, + "grad_norm": 2.067674398422241, + "learning_rate": 0.00016653007394847158, + "loss": 2.2299, + "step": 7110 + }, + { + "epoch": 0.1676793368188027, + "grad_norm": 2.227522373199463, + "learning_rate": 0.00016648297301116293, + "loss": 2.2692, + "step": 7120 + }, + { + "epoch": 0.16791484150534597, + "grad_norm": 4.050379753112793, + "learning_rate": 0.00016643587207385428, + "loss": 2.2243, + "step": 7130 + }, + { + "epoch": 0.1681503461918892, + "grad_norm": 2.220202922821045, + "learning_rate": 0.00016638877113654563, + "loss": 2.2231, + "step": 7140 + }, + { + "epoch": 0.16838585087843247, + "grad_norm": 2.7050681114196777, + "learning_rate": 0.00016634167019923695, + "loss": 2.1957, + "step": 7150 + }, + { + "epoch": 0.16862135556497573, + "grad_norm": 2.3848469257354736, + "learning_rate": 0.00016629456926192833, + "loss": 2.0928, + "step": 7160 + }, + { + "epoch": 0.168856860251519, + "grad_norm": 2.007478713989258, + "learning_rate": 0.00016624746832461965, + "loss": 2.1006, + "step": 7170 + }, + { + "epoch": 0.16909236493806226, + "grad_norm": 2.2802834510803223, + "learning_rate": 0.00016620036738731103, + "loss": 2.2195, + "step": 7180 + }, + { + "epoch": 0.16932786962460553, + "grad_norm": 2.2520174980163574, + "learning_rate": 0.00016615326645000235, + "loss": 2.169, + "step": 7190 + }, + { + "epoch": 0.1695633743111488, + "grad_norm": 2.1334917545318604, + "learning_rate": 0.0001661061655126937, + "loss": 2.3443, + "step": 7200 + }, + { + "epoch": 0.16979887899769205, + "grad_norm": 2.5005075931549072, + "learning_rate": 0.00016605906457538505, + "loss": 2.1891, + "step": 7210 + }, + { + "epoch": 0.17003438368423532, + "grad_norm": 2.019739866256714, + "learning_rate": 0.0001660119636380764, + "loss": 2.1287, + "step": 7220 + }, + { + "epoch": 0.17026988837077858, + "grad_norm": 1.7916017770767212, + "learning_rate": 0.00016596486270076776, + "loss": 2.1305, + "step": 7230 + }, + { + "epoch": 0.17050539305732185, + "grad_norm": 1.9927102327346802, + "learning_rate": 0.0001659177617634591, + "loss": 2.0284, + "step": 7240 + }, + { + "epoch": 0.1707408977438651, + "grad_norm": 2.6178619861602783, + "learning_rate": 0.00016587066082615046, + "loss": 2.2635, + "step": 7250 + }, + { + "epoch": 0.17097640243040838, + "grad_norm": 2.3916609287261963, + "learning_rate": 0.0001658235598888418, + "loss": 2.243, + "step": 7260 + }, + { + "epoch": 0.17121190711695164, + "grad_norm": 2.027822494506836, + "learning_rate": 0.00016577645895153316, + "loss": 2.1836, + "step": 7270 + }, + { + "epoch": 0.17144741180349488, + "grad_norm": 2.018761157989502, + "learning_rate": 0.00016572935801422448, + "loss": 2.1472, + "step": 7280 + }, + { + "epoch": 0.17168291649003814, + "grad_norm": 1.9133548736572266, + "learning_rate": 0.00016568225707691586, + "loss": 2.1242, + "step": 7290 + }, + { + "epoch": 0.1719184211765814, + "grad_norm": 4.398839473724365, + "learning_rate": 0.00016563515613960718, + "loss": 2.3578, + "step": 7300 + }, + { + "epoch": 0.17215392586312467, + "grad_norm": 1.720661997795105, + "learning_rate": 0.00016558805520229853, + "loss": 2.2148, + "step": 7310 + }, + { + "epoch": 0.17238943054966793, + "grad_norm": 2.4706451892852783, + "learning_rate": 0.00016554095426498988, + "loss": 2.0974, + "step": 7320 + }, + { + "epoch": 0.1726249352362112, + "grad_norm": 2.2956461906433105, + "learning_rate": 0.00016549385332768123, + "loss": 2.1426, + "step": 7330 + }, + { + "epoch": 0.17286043992275446, + "grad_norm": 2.160515069961548, + "learning_rate": 0.00016544675239037258, + "loss": 2.1641, + "step": 7340 + }, + { + "epoch": 0.17309594460929772, + "grad_norm": 2.1589348316192627, + "learning_rate": 0.00016539965145306393, + "loss": 2.2921, + "step": 7350 + }, + { + "epoch": 0.173331449295841, + "grad_norm": 2.136258125305176, + "learning_rate": 0.00016535255051575525, + "loss": 2.2571, + "step": 7360 + }, + { + "epoch": 0.17356695398238425, + "grad_norm": 3.0886502265930176, + "learning_rate": 0.00016530544957844663, + "loss": 2.3636, + "step": 7370 + }, + { + "epoch": 0.17380245866892752, + "grad_norm": 1.8283052444458008, + "learning_rate": 0.00016525834864113795, + "loss": 2.2302, + "step": 7380 + }, + { + "epoch": 0.17403796335547078, + "grad_norm": 2.7342529296875, + "learning_rate": 0.00016521124770382933, + "loss": 2.32, + "step": 7390 + }, + { + "epoch": 0.17427346804201405, + "grad_norm": 2.8123838901519775, + "learning_rate": 0.00016516414676652066, + "loss": 2.1852, + "step": 7400 + }, + { + "epoch": 0.1745089727285573, + "grad_norm": 1.7874890565872192, + "learning_rate": 0.000165117045829212, + "loss": 2.1981, + "step": 7410 + }, + { + "epoch": 0.17474447741510057, + "grad_norm": 2.6356823444366455, + "learning_rate": 0.00016506994489190336, + "loss": 2.4149, + "step": 7420 + }, + { + "epoch": 0.1749799821016438, + "grad_norm": 2.738905906677246, + "learning_rate": 0.00016502755404832557, + "loss": 2.0254, + "step": 7430 + }, + { + "epoch": 0.17521548678818707, + "grad_norm": 1.7247503995895386, + "learning_rate": 0.0001649804531110169, + "loss": 2.2169, + "step": 7440 + }, + { + "epoch": 0.17545099147473034, + "grad_norm": 2.0531630516052246, + "learning_rate": 0.00016493335217370827, + "loss": 2.2372, + "step": 7450 + }, + { + "epoch": 0.1756864961612736, + "grad_norm": 2.0215067863464355, + "learning_rate": 0.0001648862512363996, + "loss": 1.8726, + "step": 7460 + }, + { + "epoch": 0.17592200084781687, + "grad_norm": 2.5239405632019043, + "learning_rate": 0.00016483915029909095, + "loss": 1.8519, + "step": 7470 + }, + { + "epoch": 0.17615750553436013, + "grad_norm": 2.1495165824890137, + "learning_rate": 0.0001647920493617823, + "loss": 2.1613, + "step": 7480 + }, + { + "epoch": 0.1763930102209034, + "grad_norm": 1.7801376581192017, + "learning_rate": 0.00016474494842447365, + "loss": 2.0669, + "step": 7490 + }, + { + "epoch": 0.17662851490744666, + "grad_norm": 2.3613669872283936, + "learning_rate": 0.00016469784748716503, + "loss": 2.1535, + "step": 7500 + }, + { + "epoch": 0.17686401959398992, + "grad_norm": 1.718843936920166, + "learning_rate": 0.00016465074654985635, + "loss": 2.0689, + "step": 7510 + }, + { + "epoch": 0.1770995242805332, + "grad_norm": 1.8729385137557983, + "learning_rate": 0.0001646036456125477, + "loss": 2.0943, + "step": 7520 + }, + { + "epoch": 0.17733502896707645, + "grad_norm": 2.1967837810516357, + "learning_rate": 0.00016455654467523905, + "loss": 1.9327, + "step": 7530 + }, + { + "epoch": 0.17757053365361972, + "grad_norm": 2.0140233039855957, + "learning_rate": 0.0001645094437379304, + "loss": 2.1792, + "step": 7540 + }, + { + "epoch": 0.17780603834016298, + "grad_norm": 1.8183659315109253, + "learning_rate": 0.00016446234280062175, + "loss": 2.1262, + "step": 7550 + }, + { + "epoch": 0.17804154302670624, + "grad_norm": 2.026036500930786, + "learning_rate": 0.0001644152418633131, + "loss": 1.9423, + "step": 7560 + }, + { + "epoch": 0.17827704771324948, + "grad_norm": 3.2327709197998047, + "learning_rate": 0.00016436814092600442, + "loss": 2.1003, + "step": 7570 + }, + { + "epoch": 0.17851255239979275, + "grad_norm": 2.2476227283477783, + "learning_rate": 0.0001643210399886958, + "loss": 2.1822, + "step": 7580 + }, + { + "epoch": 0.178748057086336, + "grad_norm": 2.49985933303833, + "learning_rate": 0.00016427393905138712, + "loss": 2.1069, + "step": 7590 + }, + { + "epoch": 0.17898356177287927, + "grad_norm": 1.9173133373260498, + "learning_rate": 0.00016422683811407847, + "loss": 2.2727, + "step": 7600 + }, + { + "epoch": 0.17921906645942254, + "grad_norm": 2.5073535442352295, + "learning_rate": 0.00016417973717676982, + "loss": 2.1479, + "step": 7610 + }, + { + "epoch": 0.1794545711459658, + "grad_norm": 1.6549034118652344, + "learning_rate": 0.00016413263623946117, + "loss": 2.1405, + "step": 7620 + }, + { + "epoch": 0.17969007583250907, + "grad_norm": 2.1289401054382324, + "learning_rate": 0.00016408553530215252, + "loss": 2.1708, + "step": 7630 + }, + { + "epoch": 0.17992558051905233, + "grad_norm": 2.2387378215789795, + "learning_rate": 0.00016403843436484387, + "loss": 2.0243, + "step": 7640 + }, + { + "epoch": 0.1801610852055956, + "grad_norm": 1.905182123184204, + "learning_rate": 0.0001639913334275352, + "loss": 2.2722, + "step": 7650 + }, + { + "epoch": 0.18039658989213886, + "grad_norm": 2.26442289352417, + "learning_rate": 0.00016394423249022658, + "loss": 2.1395, + "step": 7660 + }, + { + "epoch": 0.18063209457868212, + "grad_norm": 2.4635813236236572, + "learning_rate": 0.0001638971315529179, + "loss": 2.0492, + "step": 7670 + }, + { + "epoch": 0.1808675992652254, + "grad_norm": 2.116964101791382, + "learning_rate": 0.00016385003061560928, + "loss": 2.1899, + "step": 7680 + }, + { + "epoch": 0.18110310395176865, + "grad_norm": 2.431391477584839, + "learning_rate": 0.0001638029296783006, + "loss": 2.0025, + "step": 7690 + }, + { + "epoch": 0.18133860863831192, + "grad_norm": 2.341212749481201, + "learning_rate": 0.00016375582874099195, + "loss": 2.1425, + "step": 7700 + }, + { + "epoch": 0.18157411332485515, + "grad_norm": 1.8115952014923096, + "learning_rate": 0.0001637087278036833, + "loss": 2.038, + "step": 7710 + }, + { + "epoch": 0.18180961801139842, + "grad_norm": 1.9707318544387817, + "learning_rate": 0.00016366162686637465, + "loss": 2.1576, + "step": 7720 + }, + { + "epoch": 0.18204512269794168, + "grad_norm": 2.6205427646636963, + "learning_rate": 0.000163614525929066, + "loss": 2.2071, + "step": 7730 + }, + { + "epoch": 0.18228062738448494, + "grad_norm": 2.613771677017212, + "learning_rate": 0.00016356742499175735, + "loss": 2.2306, + "step": 7740 + }, + { + "epoch": 0.1825161320710282, + "grad_norm": 1.9211950302124023, + "learning_rate": 0.00016352032405444867, + "loss": 2.1495, + "step": 7750 + }, + { + "epoch": 0.18275163675757147, + "grad_norm": 2.385100841522217, + "learning_rate": 0.00016347322311714005, + "loss": 2.1676, + "step": 7760 + }, + { + "epoch": 0.18298714144411474, + "grad_norm": 2.1002583503723145, + "learning_rate": 0.00016342612217983137, + "loss": 2.2365, + "step": 7770 + }, + { + "epoch": 0.183222646130658, + "grad_norm": 2.102348566055298, + "learning_rate": 0.00016337902124252272, + "loss": 2.1476, + "step": 7780 + }, + { + "epoch": 0.18345815081720127, + "grad_norm": 2.3299968242645264, + "learning_rate": 0.0001633319203052141, + "loss": 2.1953, + "step": 7790 + }, + { + "epoch": 0.18369365550374453, + "grad_norm": 1.9959839582443237, + "learning_rate": 0.00016328481936790542, + "loss": 2.0821, + "step": 7800 + }, + { + "epoch": 0.1839291601902878, + "grad_norm": 2.4026405811309814, + "learning_rate": 0.00016323771843059677, + "loss": 2.1747, + "step": 7810 + }, + { + "epoch": 0.18416466487683106, + "grad_norm": 2.223411798477173, + "learning_rate": 0.00016319061749328812, + "loss": 2.0871, + "step": 7820 + }, + { + "epoch": 0.18440016956337432, + "grad_norm": 2.1610796451568604, + "learning_rate": 0.00016314351655597948, + "loss": 2.1532, + "step": 7830 + }, + { + "epoch": 0.1846356742499176, + "grad_norm": 2.226919174194336, + "learning_rate": 0.00016309641561867083, + "loss": 2.2085, + "step": 7840 + }, + { + "epoch": 0.18487117893646082, + "grad_norm": 2.2085070610046387, + "learning_rate": 0.00016304931468136218, + "loss": 2.2106, + "step": 7850 + }, + { + "epoch": 0.1851066836230041, + "grad_norm": 2.343294858932495, + "learning_rate": 0.0001630022137440535, + "loss": 2.128, + "step": 7860 + }, + { + "epoch": 0.18534218830954735, + "grad_norm": 1.8624409437179565, + "learning_rate": 0.00016295511280674488, + "loss": 2.1655, + "step": 7870 + }, + { + "epoch": 0.18557769299609062, + "grad_norm": 2.0223777294158936, + "learning_rate": 0.0001629080118694362, + "loss": 2.1523, + "step": 7880 + }, + { + "epoch": 0.18581319768263388, + "grad_norm": 2.1489696502685547, + "learning_rate": 0.00016286091093212758, + "loss": 2.1188, + "step": 7890 + }, + { + "epoch": 0.18604870236917714, + "grad_norm": 2.2076354026794434, + "learning_rate": 0.0001628138099948189, + "loss": 1.9737, + "step": 7900 + }, + { + "epoch": 0.1862842070557204, + "grad_norm": 2.566239595413208, + "learning_rate": 0.00016276670905751025, + "loss": 2.2254, + "step": 7910 + }, + { + "epoch": 0.18651971174226367, + "grad_norm": 2.0568361282348633, + "learning_rate": 0.0001627196081202016, + "loss": 2.1747, + "step": 7920 + }, + { + "epoch": 0.18675521642880694, + "grad_norm": 2.4507033824920654, + "learning_rate": 0.00016267250718289295, + "loss": 2.3182, + "step": 7930 + }, + { + "epoch": 0.1869907211153502, + "grad_norm": 2.1586856842041016, + "learning_rate": 0.0001626254062455843, + "loss": 2.1965, + "step": 7940 + }, + { + "epoch": 0.18722622580189346, + "grad_norm": 1.9273881912231445, + "learning_rate": 0.00016257830530827565, + "loss": 2.2813, + "step": 7950 + }, + { + "epoch": 0.18746173048843673, + "grad_norm": 1.7580167055130005, + "learning_rate": 0.00016253120437096697, + "loss": 2.1578, + "step": 7960 + }, + { + "epoch": 0.18769723517498, + "grad_norm": 2.0554792881011963, + "learning_rate": 0.00016248410343365835, + "loss": 2.0389, + "step": 7970 + }, + { + "epoch": 0.18793273986152326, + "grad_norm": 1.872406005859375, + "learning_rate": 0.00016243700249634967, + "loss": 2.1407, + "step": 7980 + }, + { + "epoch": 0.1881682445480665, + "grad_norm": 2.1764724254608154, + "learning_rate": 0.00016238990155904102, + "loss": 2.135, + "step": 7990 + }, + { + "epoch": 0.18840374923460976, + "grad_norm": 1.8471111059188843, + "learning_rate": 0.00016234280062173237, + "loss": 2.1204, + "step": 8000 + }, + { + "epoch": 0.18863925392115302, + "grad_norm": 2.7460036277770996, + "learning_rate": 0.00016229569968442373, + "loss": 2.1958, + "step": 8010 + }, + { + "epoch": 0.18887475860769629, + "grad_norm": 1.8986886739730835, + "learning_rate": 0.00016224859874711508, + "loss": 2.2771, + "step": 8020 + }, + { + "epoch": 0.18911026329423955, + "grad_norm": 2.3261477947235107, + "learning_rate": 0.00016220149780980643, + "loss": 2.2911, + "step": 8030 + }, + { + "epoch": 0.18934576798078281, + "grad_norm": 3.4282009601593018, + "learning_rate": 0.00016215439687249775, + "loss": 2.1043, + "step": 8040 + }, + { + "epoch": 0.18958127266732608, + "grad_norm": 1.8405457735061646, + "learning_rate": 0.00016210729593518913, + "loss": 2.211, + "step": 8050 + }, + { + "epoch": 0.18981677735386934, + "grad_norm": 2.0625762939453125, + "learning_rate": 0.00016206019499788045, + "loss": 2.0193, + "step": 8060 + }, + { + "epoch": 0.1900522820404126, + "grad_norm": 1.9741464853286743, + "learning_rate": 0.0001620130940605718, + "loss": 2.1448, + "step": 8070 + }, + { + "epoch": 0.19028778672695587, + "grad_norm": 2.270921468734741, + "learning_rate": 0.00016196599312326318, + "loss": 2.3037, + "step": 8080 + }, + { + "epoch": 0.19052329141349914, + "grad_norm": 1.8715896606445312, + "learning_rate": 0.0001619188921859545, + "loss": 2.027, + "step": 8090 + }, + { + "epoch": 0.1907587961000424, + "grad_norm": 2.515016794204712, + "learning_rate": 0.00016187179124864588, + "loss": 1.9652, + "step": 8100 + }, + { + "epoch": 0.19099430078658566, + "grad_norm": 1.9395673274993896, + "learning_rate": 0.0001618246903113372, + "loss": 2.2969, + "step": 8110 + }, + { + "epoch": 0.19122980547312893, + "grad_norm": 2.1764769554138184, + "learning_rate": 0.00016177758937402855, + "loss": 2.1232, + "step": 8120 + }, + { + "epoch": 0.19146531015967216, + "grad_norm": 2.594571113586426, + "learning_rate": 0.0001617304884367199, + "loss": 2.1088, + "step": 8130 + }, + { + "epoch": 0.19170081484621543, + "grad_norm": 2.394315242767334, + "learning_rate": 0.00016168338749941125, + "loss": 2.0626, + "step": 8140 + }, + { + "epoch": 0.1919363195327587, + "grad_norm": 1.8806692361831665, + "learning_rate": 0.0001616362865621026, + "loss": 2.0899, + "step": 8150 + }, + { + "epoch": 0.19217182421930196, + "grad_norm": 2.2161996364593506, + "learning_rate": 0.00016158918562479395, + "loss": 2.2571, + "step": 8160 + }, + { + "epoch": 0.19240732890584522, + "grad_norm": 3.4953277111053467, + "learning_rate": 0.00016154208468748527, + "loss": 2.0105, + "step": 8170 + }, + { + "epoch": 0.19264283359238848, + "grad_norm": 2.8051047325134277, + "learning_rate": 0.00016149498375017665, + "loss": 2.1364, + "step": 8180 + }, + { + "epoch": 0.19287833827893175, + "grad_norm": 2.5728232860565186, + "learning_rate": 0.00016144788281286798, + "loss": 2.0509, + "step": 8190 + }, + { + "epoch": 0.193113842965475, + "grad_norm": 2.346428394317627, + "learning_rate": 0.00016140078187555933, + "loss": 2.2152, + "step": 8200 + }, + { + "epoch": 0.19334934765201828, + "grad_norm": 1.9895967245101929, + "learning_rate": 0.00016135368093825068, + "loss": 2.1382, + "step": 8210 + }, + { + "epoch": 0.19358485233856154, + "grad_norm": 2.0366828441619873, + "learning_rate": 0.00016130658000094203, + "loss": 2.1697, + "step": 8220 + }, + { + "epoch": 0.1938203570251048, + "grad_norm": 2.5013720989227295, + "learning_rate": 0.00016125947906363338, + "loss": 1.9348, + "step": 8230 + }, + { + "epoch": 0.19405586171164807, + "grad_norm": 2.338641881942749, + "learning_rate": 0.00016121237812632473, + "loss": 2.3867, + "step": 8240 + }, + { + "epoch": 0.19429136639819133, + "grad_norm": 2.367546319961548, + "learning_rate": 0.00016116527718901605, + "loss": 2.1588, + "step": 8250 + }, + { + "epoch": 0.1945268710847346, + "grad_norm": 2.366485118865967, + "learning_rate": 0.00016111817625170743, + "loss": 2.3467, + "step": 8260 + }, + { + "epoch": 0.19476237577127783, + "grad_norm": 1.8073631525039673, + "learning_rate": 0.00016107107531439875, + "loss": 2.0071, + "step": 8270 + }, + { + "epoch": 0.1949978804578211, + "grad_norm": 1.8550753593444824, + "learning_rate": 0.00016102397437709013, + "loss": 2.1192, + "step": 8280 + }, + { + "epoch": 0.19523338514436436, + "grad_norm": 2.834643602371216, + "learning_rate": 0.00016097687343978145, + "loss": 2.1614, + "step": 8290 + }, + { + "epoch": 0.19546888983090763, + "grad_norm": 1.961897850036621, + "learning_rate": 0.0001609297725024728, + "loss": 2.1747, + "step": 8300 + }, + { + "epoch": 0.1957043945174509, + "grad_norm": 2.972184658050537, + "learning_rate": 0.00016088267156516415, + "loss": 2.1692, + "step": 8310 + }, + { + "epoch": 0.19593989920399416, + "grad_norm": 2.2215375900268555, + "learning_rate": 0.0001608355706278555, + "loss": 2.1288, + "step": 8320 + }, + { + "epoch": 0.19617540389053742, + "grad_norm": 1.6455684900283813, + "learning_rate": 0.00016078846969054685, + "loss": 2.0821, + "step": 8330 + }, + { + "epoch": 0.19641090857708068, + "grad_norm": 3.122081995010376, + "learning_rate": 0.0001607413687532382, + "loss": 2.2687, + "step": 8340 + }, + { + "epoch": 0.19664641326362395, + "grad_norm": 2.8040692806243896, + "learning_rate": 0.00016069426781592955, + "loss": 2.3317, + "step": 8350 + }, + { + "epoch": 0.1968819179501672, + "grad_norm": 2.200100898742676, + "learning_rate": 0.0001606471668786209, + "loss": 2.1458, + "step": 8360 + }, + { + "epoch": 0.19711742263671048, + "grad_norm": 1.9488954544067383, + "learning_rate": 0.00016060006594131225, + "loss": 1.9935, + "step": 8370 + }, + { + "epoch": 0.19735292732325374, + "grad_norm": 2.2211225032806396, + "learning_rate": 0.00016055296500400358, + "loss": 2.3222, + "step": 8380 + }, + { + "epoch": 0.197588432009797, + "grad_norm": 2.1487669944763184, + "learning_rate": 0.00016050586406669495, + "loss": 2.1131, + "step": 8390 + }, + { + "epoch": 0.19782393669634027, + "grad_norm": 2.1569080352783203, + "learning_rate": 0.00016045876312938628, + "loss": 2.184, + "step": 8400 + }, + { + "epoch": 0.19805944138288353, + "grad_norm": 1.7691841125488281, + "learning_rate": 0.00016041166219207763, + "loss": 2.2211, + "step": 8410 + }, + { + "epoch": 0.19829494606942677, + "grad_norm": 2.2615177631378174, + "learning_rate": 0.00016036456125476898, + "loss": 2.171, + "step": 8420 + }, + { + "epoch": 0.19853045075597003, + "grad_norm": 1.9963940382003784, + "learning_rate": 0.00016031746031746033, + "loss": 2.0384, + "step": 8430 + }, + { + "epoch": 0.1987659554425133, + "grad_norm": 2.1423399448394775, + "learning_rate": 0.00016027035938015168, + "loss": 2.1735, + "step": 8440 + }, + { + "epoch": 0.19900146012905656, + "grad_norm": 2.056474447250366, + "learning_rate": 0.00016022325844284303, + "loss": 2.2922, + "step": 8450 + }, + { + "epoch": 0.19923696481559983, + "grad_norm": 1.6666935682296753, + "learning_rate": 0.00016017615750553435, + "loss": 2.1775, + "step": 8460 + }, + { + "epoch": 0.1994724695021431, + "grad_norm": 2.2236125469207764, + "learning_rate": 0.00016012905656822573, + "loss": 1.9681, + "step": 8470 + }, + { + "epoch": 0.19970797418868635, + "grad_norm": 2.01289701461792, + "learning_rate": 0.00016008195563091705, + "loss": 1.9393, + "step": 8480 + }, + { + "epoch": 0.19994347887522962, + "grad_norm": 2.205111503601074, + "learning_rate": 0.00016003485469360843, + "loss": 1.9858, + "step": 8490 + }, + { + "epoch": 0.20017898356177288, + "grad_norm": 3.152418375015259, + "learning_rate": 0.00015998775375629975, + "loss": 2.241, + "step": 8500 + }, + { + "epoch": 0.20041448824831615, + "grad_norm": 2.2211573123931885, + "learning_rate": 0.0001599406528189911, + "loss": 2.0439, + "step": 8510 + }, + { + "epoch": 0.2006499929348594, + "grad_norm": 1.9449644088745117, + "learning_rate": 0.00015989355188168245, + "loss": 2.0677, + "step": 8520 + }, + { + "epoch": 0.20088549762140268, + "grad_norm": 3.1177256107330322, + "learning_rate": 0.0001598464509443738, + "loss": 2.1057, + "step": 8530 + }, + { + "epoch": 0.20112100230794594, + "grad_norm": 3.570312023162842, + "learning_rate": 0.00015979935000706515, + "loss": 2.0658, + "step": 8540 + }, + { + "epoch": 0.2013565069944892, + "grad_norm": 1.7642455101013184, + "learning_rate": 0.0001597522490697565, + "loss": 2.2294, + "step": 8550 + }, + { + "epoch": 0.20159201168103244, + "grad_norm": 1.7832077741622925, + "learning_rate": 0.00015970514813244783, + "loss": 2.2795, + "step": 8560 + }, + { + "epoch": 0.2018275163675757, + "grad_norm": 2.377683639526367, + "learning_rate": 0.0001596580471951392, + "loss": 2.0018, + "step": 8570 + }, + { + "epoch": 0.20206302105411897, + "grad_norm": 2.7530715465545654, + "learning_rate": 0.00015961094625783053, + "loss": 2.2075, + "step": 8580 + }, + { + "epoch": 0.20229852574066223, + "grad_norm": 2.924802303314209, + "learning_rate": 0.00015956384532052188, + "loss": 2.2061, + "step": 8590 + }, + { + "epoch": 0.2025340304272055, + "grad_norm": 2.2290501594543457, + "learning_rate": 0.00015951674438321323, + "loss": 1.9976, + "step": 8600 + }, + { + "epoch": 0.20276953511374876, + "grad_norm": 2.3307456970214844, + "learning_rate": 0.00015946964344590458, + "loss": 2.1666, + "step": 8610 + }, + { + "epoch": 0.20300503980029203, + "grad_norm": 2.272770643234253, + "learning_rate": 0.00015942254250859593, + "loss": 2.1245, + "step": 8620 + }, + { + "epoch": 0.2032405444868353, + "grad_norm": 2.5355868339538574, + "learning_rate": 0.00015937544157128728, + "loss": 2.2273, + "step": 8630 + }, + { + "epoch": 0.20347604917337855, + "grad_norm": 1.9374130964279175, + "learning_rate": 0.00015932834063397863, + "loss": 1.9849, + "step": 8640 + }, + { + "epoch": 0.20371155385992182, + "grad_norm": 2.0715346336364746, + "learning_rate": 0.00015928123969666998, + "loss": 1.9397, + "step": 8650 + }, + { + "epoch": 0.20394705854646508, + "grad_norm": 2.3634729385375977, + "learning_rate": 0.00015923413875936133, + "loss": 2.1056, + "step": 8660 + }, + { + "epoch": 0.20418256323300835, + "grad_norm": 2.003783702850342, + "learning_rate": 0.00015918703782205265, + "loss": 1.8101, + "step": 8670 + }, + { + "epoch": 0.2044180679195516, + "grad_norm": 1.9855163097381592, + "learning_rate": 0.00015913993688474403, + "loss": 1.8513, + "step": 8680 + }, + { + "epoch": 0.20465357260609487, + "grad_norm": 2.3215885162353516, + "learning_rate": 0.00015909283594743535, + "loss": 2.2075, + "step": 8690 + }, + { + "epoch": 0.2048890772926381, + "grad_norm": 2.110724687576294, + "learning_rate": 0.00015904573501012673, + "loss": 2.0604, + "step": 8700 + }, + { + "epoch": 0.20512458197918138, + "grad_norm": 2.33134388923645, + "learning_rate": 0.00015899863407281805, + "loss": 2.1537, + "step": 8710 + }, + { + "epoch": 0.20536008666572464, + "grad_norm": 2.0391716957092285, + "learning_rate": 0.0001589515331355094, + "loss": 1.9752, + "step": 8720 + }, + { + "epoch": 0.2055955913522679, + "grad_norm": 2.1670148372650146, + "learning_rate": 0.00015890443219820075, + "loss": 2.1016, + "step": 8730 + }, + { + "epoch": 0.20583109603881117, + "grad_norm": 2.2209887504577637, + "learning_rate": 0.0001588573312608921, + "loss": 2.1446, + "step": 8740 + }, + { + "epoch": 0.20606660072535443, + "grad_norm": 2.245386838912964, + "learning_rate": 0.00015881023032358345, + "loss": 1.9711, + "step": 8750 + }, + { + "epoch": 0.2063021054118977, + "grad_norm": 2.2087981700897217, + "learning_rate": 0.0001587631293862748, + "loss": 2.157, + "step": 8760 + }, + { + "epoch": 0.20653761009844096, + "grad_norm": 1.9615005254745483, + "learning_rate": 0.00015871602844896613, + "loss": 2.0809, + "step": 8770 + }, + { + "epoch": 0.20677311478498422, + "grad_norm": 2.1941707134246826, + "learning_rate": 0.0001586689275116575, + "loss": 2.0107, + "step": 8780 + }, + { + "epoch": 0.2070086194715275, + "grad_norm": 1.8866039514541626, + "learning_rate": 0.00015862182657434883, + "loss": 2.0821, + "step": 8790 + }, + { + "epoch": 0.20724412415807075, + "grad_norm": 2.4562177658081055, + "learning_rate": 0.00015857472563704018, + "loss": 2.2769, + "step": 8800 + }, + { + "epoch": 0.20747962884461402, + "grad_norm": 1.8872411251068115, + "learning_rate": 0.00015852762469973153, + "loss": 2.307, + "step": 8810 + }, + { + "epoch": 0.20771513353115728, + "grad_norm": 1.9673904180526733, + "learning_rate": 0.00015848052376242288, + "loss": 2.242, + "step": 8820 + }, + { + "epoch": 0.20795063821770055, + "grad_norm": 2.0321238040924072, + "learning_rate": 0.00015843342282511423, + "loss": 2.0609, + "step": 8830 + }, + { + "epoch": 0.20818614290424378, + "grad_norm": 2.2742855548858643, + "learning_rate": 0.00015838632188780558, + "loss": 2.3143, + "step": 8840 + }, + { + "epoch": 0.20842164759078705, + "grad_norm": 2.290383815765381, + "learning_rate": 0.0001583392209504969, + "loss": 2.2528, + "step": 8850 + }, + { + "epoch": 0.2086571522773303, + "grad_norm": 2.4600648880004883, + "learning_rate": 0.00015829212001318828, + "loss": 2.3136, + "step": 8860 + }, + { + "epoch": 0.20889265696387357, + "grad_norm": 2.9521021842956543, + "learning_rate": 0.0001582450190758796, + "loss": 2.3804, + "step": 8870 + }, + { + "epoch": 0.20912816165041684, + "grad_norm": 2.5371804237365723, + "learning_rate": 0.00015819791813857098, + "loss": 2.008, + "step": 8880 + }, + { + "epoch": 0.2093636663369601, + "grad_norm": 1.9548083543777466, + "learning_rate": 0.0001581508172012623, + "loss": 2.2387, + "step": 8890 + }, + { + "epoch": 0.20959917102350337, + "grad_norm": 2.33406925201416, + "learning_rate": 0.00015810371626395365, + "loss": 1.9671, + "step": 8900 + }, + { + "epoch": 0.20983467571004663, + "grad_norm": 2.2755494117736816, + "learning_rate": 0.000158056615326645, + "loss": 2.1114, + "step": 8910 + }, + { + "epoch": 0.2100701803965899, + "grad_norm": 2.0145130157470703, + "learning_rate": 0.00015800951438933635, + "loss": 2.1826, + "step": 8920 + }, + { + "epoch": 0.21030568508313316, + "grad_norm": 2.4278318881988525, + "learning_rate": 0.0001579624134520277, + "loss": 2.2274, + "step": 8930 + }, + { + "epoch": 0.21054118976967642, + "grad_norm": 2.743621826171875, + "learning_rate": 0.00015791531251471905, + "loss": 2.03, + "step": 8940 + }, + { + "epoch": 0.2107766944562197, + "grad_norm": 2.0403552055358887, + "learning_rate": 0.0001578682115774104, + "loss": 1.9389, + "step": 8950 + }, + { + "epoch": 0.21101219914276295, + "grad_norm": 1.9980547428131104, + "learning_rate": 0.00015782111064010175, + "loss": 2.207, + "step": 8960 + }, + { + "epoch": 0.21124770382930622, + "grad_norm": 2.2395660877227783, + "learning_rate": 0.0001577740097027931, + "loss": 2.0992, + "step": 8970 + }, + { + "epoch": 0.21148320851584945, + "grad_norm": 2.130779266357422, + "learning_rate": 0.00015772690876548443, + "loss": 2.1155, + "step": 8980 + }, + { + "epoch": 0.21171871320239272, + "grad_norm": 2.452732801437378, + "learning_rate": 0.0001576798078281758, + "loss": 2.1982, + "step": 8990 + }, + { + "epoch": 0.21195421788893598, + "grad_norm": 1.9832158088684082, + "learning_rate": 0.00015763270689086713, + "loss": 2.0777, + "step": 9000 + }, + { + "epoch": 0.21218972257547924, + "grad_norm": 2.609037160873413, + "learning_rate": 0.00015758560595355848, + "loss": 2.2989, + "step": 9010 + }, + { + "epoch": 0.2124252272620225, + "grad_norm": 2.5825107097625732, + "learning_rate": 0.00015753850501624983, + "loss": 2.1168, + "step": 9020 + }, + { + "epoch": 0.21266073194856577, + "grad_norm": 2.1648106575012207, + "learning_rate": 0.00015749140407894118, + "loss": 2.1504, + "step": 9030 + }, + { + "epoch": 0.21289623663510904, + "grad_norm": 2.132033586502075, + "learning_rate": 0.00015744430314163253, + "loss": 2.2418, + "step": 9040 + }, + { + "epoch": 0.2131317413216523, + "grad_norm": 2.439497470855713, + "learning_rate": 0.00015739720220432388, + "loss": 2.2041, + "step": 9050 + }, + { + "epoch": 0.21336724600819557, + "grad_norm": 2.2189526557922363, + "learning_rate": 0.0001573501012670152, + "loss": 2.1985, + "step": 9060 + }, + { + "epoch": 0.21360275069473883, + "grad_norm": 2.7584781646728516, + "learning_rate": 0.00015730300032970658, + "loss": 2.2038, + "step": 9070 + }, + { + "epoch": 0.2138382553812821, + "grad_norm": 2.2705740928649902, + "learning_rate": 0.0001572558993923979, + "loss": 2.1067, + "step": 9080 + }, + { + "epoch": 0.21407376006782536, + "grad_norm": 3.012639045715332, + "learning_rate": 0.00015720879845508928, + "loss": 2.3559, + "step": 9090 + }, + { + "epoch": 0.21430926475436862, + "grad_norm": 3.75484299659729, + "learning_rate": 0.0001571616975177806, + "loss": 2.1761, + "step": 9100 + }, + { + "epoch": 0.2145447694409119, + "grad_norm": 2.1686441898345947, + "learning_rate": 0.00015711459658047195, + "loss": 2.1525, + "step": 9110 + }, + { + "epoch": 0.21478027412745512, + "grad_norm": 2.0703907012939453, + "learning_rate": 0.0001570674956431633, + "loss": 2.3426, + "step": 9120 + }, + { + "epoch": 0.2150157788139984, + "grad_norm": 2.633394956588745, + "learning_rate": 0.00015702039470585465, + "loss": 2.101, + "step": 9130 + }, + { + "epoch": 0.21525128350054165, + "grad_norm": 1.6758605241775513, + "learning_rate": 0.000156973293768546, + "loss": 2.422, + "step": 9140 + }, + { + "epoch": 0.21548678818708492, + "grad_norm": 2.618814468383789, + "learning_rate": 0.00015692619283123735, + "loss": 2.2528, + "step": 9150 + }, + { + "epoch": 0.21572229287362818, + "grad_norm": 1.8236844539642334, + "learning_rate": 0.00015687909189392868, + "loss": 2.075, + "step": 9160 + }, + { + "epoch": 0.21595779756017144, + "grad_norm": 2.2664949893951416, + "learning_rate": 0.00015683199095662006, + "loss": 1.9622, + "step": 9170 + }, + { + "epoch": 0.2161933022467147, + "grad_norm": 2.9308884143829346, + "learning_rate": 0.00015678489001931138, + "loss": 2.0478, + "step": 9180 + }, + { + "epoch": 0.21642880693325797, + "grad_norm": 2.6162636280059814, + "learning_rate": 0.00015673778908200273, + "loss": 2.2457, + "step": 9190 + }, + { + "epoch": 0.21666431161980124, + "grad_norm": 2.512709617614746, + "learning_rate": 0.0001566906881446941, + "loss": 2.1442, + "step": 9200 + }, + { + "epoch": 0.2168998163063445, + "grad_norm": 1.8639780282974243, + "learning_rate": 0.00015664358720738543, + "loss": 2.0677, + "step": 9210 + }, + { + "epoch": 0.21713532099288776, + "grad_norm": 2.1955885887145996, + "learning_rate": 0.00015659648627007678, + "loss": 2.3215, + "step": 9220 + }, + { + "epoch": 0.21737082567943103, + "grad_norm": 2.1745858192443848, + "learning_rate": 0.00015654938533276813, + "loss": 2.2667, + "step": 9230 + }, + { + "epoch": 0.2176063303659743, + "grad_norm": 2.6070613861083984, + "learning_rate": 0.00015650228439545948, + "loss": 2.2632, + "step": 9240 + }, + { + "epoch": 0.21784183505251756, + "grad_norm": 3.0261740684509277, + "learning_rate": 0.00015645518345815083, + "loss": 2.0795, + "step": 9250 + }, + { + "epoch": 0.2180773397390608, + "grad_norm": 2.801222324371338, + "learning_rate": 0.00015640808252084218, + "loss": 2.2413, + "step": 9260 + }, + { + "epoch": 0.21831284442560406, + "grad_norm": 3.1285176277160645, + "learning_rate": 0.0001563609815835335, + "loss": 2.2592, + "step": 9270 + }, + { + "epoch": 0.21854834911214732, + "grad_norm": 2.106144666671753, + "learning_rate": 0.00015631388064622488, + "loss": 2.1891, + "step": 9280 + }, + { + "epoch": 0.2187838537986906, + "grad_norm": 1.9326168298721313, + "learning_rate": 0.0001562667797089162, + "loss": 2.0664, + "step": 9290 + }, + { + "epoch": 0.21901935848523385, + "grad_norm": 1.709635853767395, + "learning_rate": 0.00015621967877160758, + "loss": 2.1744, + "step": 9300 + }, + { + "epoch": 0.21925486317177711, + "grad_norm": 2.1247639656066895, + "learning_rate": 0.0001561725778342989, + "loss": 2.1967, + "step": 9310 + }, + { + "epoch": 0.21949036785832038, + "grad_norm": 2.582542657852173, + "learning_rate": 0.00015612547689699025, + "loss": 2.2411, + "step": 9320 + }, + { + "epoch": 0.21972587254486364, + "grad_norm": 2.743621349334717, + "learning_rate": 0.0001560783759596816, + "loss": 2.3035, + "step": 9330 + }, + { + "epoch": 0.2199613772314069, + "grad_norm": 2.67402720451355, + "learning_rate": 0.00015603127502237296, + "loss": 2.2365, + "step": 9340 + }, + { + "epoch": 0.22019688191795017, + "grad_norm": 2.4903061389923096, + "learning_rate": 0.0001559841740850643, + "loss": 2.0761, + "step": 9350 + }, + { + "epoch": 0.22043238660449344, + "grad_norm": 2.3421287536621094, + "learning_rate": 0.00015593707314775566, + "loss": 2.1829, + "step": 9360 + }, + { + "epoch": 0.2206678912910367, + "grad_norm": 2.735867977142334, + "learning_rate": 0.00015588997221044698, + "loss": 2.2231, + "step": 9370 + }, + { + "epoch": 0.22090339597757996, + "grad_norm": 1.992004156112671, + "learning_rate": 0.00015584287127313836, + "loss": 2.2848, + "step": 9380 + }, + { + "epoch": 0.22113890066412323, + "grad_norm": 2.300267457962036, + "learning_rate": 0.00015579577033582968, + "loss": 2.1919, + "step": 9390 + }, + { + "epoch": 0.22137440535066646, + "grad_norm": 1.9108951091766357, + "learning_rate": 0.00015574866939852103, + "loss": 2.0605, + "step": 9400 + }, + { + "epoch": 0.22160991003720973, + "grad_norm": 2.468258857727051, + "learning_rate": 0.00015570156846121238, + "loss": 2.2323, + "step": 9410 + }, + { + "epoch": 0.221845414723753, + "grad_norm": 2.3677616119384766, + "learning_rate": 0.00015565446752390373, + "loss": 2.0305, + "step": 9420 + }, + { + "epoch": 0.22208091941029626, + "grad_norm": 2.333322286605835, + "learning_rate": 0.00015560736658659508, + "loss": 2.3494, + "step": 9430 + }, + { + "epoch": 0.22231642409683952, + "grad_norm": 2.4163379669189453, + "learning_rate": 0.00015556026564928643, + "loss": 2.2143, + "step": 9440 + }, + { + "epoch": 0.22255192878338279, + "grad_norm": 2.2694649696350098, + "learning_rate": 0.00015551316471197775, + "loss": 2.1862, + "step": 9450 + }, + { + "epoch": 0.22278743346992605, + "grad_norm": 1.8725945949554443, + "learning_rate": 0.00015546606377466913, + "loss": 2.1275, + "step": 9460 + }, + { + "epoch": 0.2230229381564693, + "grad_norm": 2.3583409786224365, + "learning_rate": 0.00015541896283736045, + "loss": 2.2181, + "step": 9470 + }, + { + "epoch": 0.22325844284301258, + "grad_norm": 1.9677271842956543, + "learning_rate": 0.00015537186190005183, + "loss": 2.1267, + "step": 9480 + }, + { + "epoch": 0.22349394752955584, + "grad_norm": 1.9935779571533203, + "learning_rate": 0.00015532476096274318, + "loss": 2.3531, + "step": 9490 + }, + { + "epoch": 0.2237294522160991, + "grad_norm": 3.433013439178467, + "learning_rate": 0.0001552776600254345, + "loss": 2.1142, + "step": 9500 + }, + { + "epoch": 0.22396495690264237, + "grad_norm": 2.3903839588165283, + "learning_rate": 0.00015523055908812588, + "loss": 2.0263, + "step": 9510 + }, + { + "epoch": 0.22420046158918563, + "grad_norm": 2.319390296936035, + "learning_rate": 0.0001551834581508172, + "loss": 2.2263, + "step": 9520 + }, + { + "epoch": 0.2244359662757289, + "grad_norm": 1.99445641040802, + "learning_rate": 0.00015513635721350856, + "loss": 1.9634, + "step": 9530 + }, + { + "epoch": 0.22467147096227216, + "grad_norm": 2.1183362007141113, + "learning_rate": 0.0001550892562761999, + "loss": 2.1869, + "step": 9540 + }, + { + "epoch": 0.2249069756488154, + "grad_norm": 2.5018179416656494, + "learning_rate": 0.00015504215533889126, + "loss": 2.0954, + "step": 9550 + }, + { + "epoch": 0.22514248033535866, + "grad_norm": 2.222480297088623, + "learning_rate": 0.0001549950544015826, + "loss": 2.0068, + "step": 9560 + }, + { + "epoch": 0.22537798502190193, + "grad_norm": 2.6220204830169678, + "learning_rate": 0.00015494795346427396, + "loss": 2.19, + "step": 9570 + }, + { + "epoch": 0.2256134897084452, + "grad_norm": 2.385401964187622, + "learning_rate": 0.00015490085252696528, + "loss": 2.1428, + "step": 9580 + }, + { + "epoch": 0.22584899439498846, + "grad_norm": 2.0146987438201904, + "learning_rate": 0.00015485375158965666, + "loss": 2.1736, + "step": 9590 + }, + { + "epoch": 0.22608449908153172, + "grad_norm": 2.207779884338379, + "learning_rate": 0.00015480665065234798, + "loss": 2.0437, + "step": 9600 + }, + { + "epoch": 0.22632000376807498, + "grad_norm": 2.220184803009033, + "learning_rate": 0.00015475954971503933, + "loss": 2.0954, + "step": 9610 + }, + { + "epoch": 0.22655550845461825, + "grad_norm": 1.9735349416732788, + "learning_rate": 0.00015471244877773068, + "loss": 2.0907, + "step": 9620 + }, + { + "epoch": 0.2267910131411615, + "grad_norm": 2.214447259902954, + "learning_rate": 0.00015466534784042203, + "loss": 2.0783, + "step": 9630 + }, + { + "epoch": 0.22702651782770478, + "grad_norm": 2.104156255722046, + "learning_rate": 0.00015461824690311338, + "loss": 2.3976, + "step": 9640 + }, + { + "epoch": 0.22726202251424804, + "grad_norm": 2.283184289932251, + "learning_rate": 0.00015457114596580473, + "loss": 2.1239, + "step": 9650 + }, + { + "epoch": 0.2274975272007913, + "grad_norm": 1.9349390268325806, + "learning_rate": 0.00015452404502849605, + "loss": 2.2178, + "step": 9660 + }, + { + "epoch": 0.22773303188733457, + "grad_norm": 2.752654552459717, + "learning_rate": 0.00015447694409118743, + "loss": 2.2175, + "step": 9670 + }, + { + "epoch": 0.22796853657387783, + "grad_norm": 1.937559723854065, + "learning_rate": 0.00015442984315387876, + "loss": 2.112, + "step": 9680 + }, + { + "epoch": 0.22820404126042107, + "grad_norm": 2.8238909244537354, + "learning_rate": 0.00015438274221657013, + "loss": 2.0023, + "step": 9690 + }, + { + "epoch": 0.22843954594696433, + "grad_norm": 2.258000373840332, + "learning_rate": 0.00015433564127926146, + "loss": 1.946, + "step": 9700 + }, + { + "epoch": 0.2286750506335076, + "grad_norm": 3.562777519226074, + "learning_rate": 0.0001542885403419528, + "loss": 2.0746, + "step": 9710 + }, + { + "epoch": 0.22891055532005086, + "grad_norm": 2.3150315284729004, + "learning_rate": 0.00015424143940464416, + "loss": 2.1203, + "step": 9720 + }, + { + "epoch": 0.22914606000659413, + "grad_norm": 3.4082276821136475, + "learning_rate": 0.0001541943384673355, + "loss": 2.3191, + "step": 9730 + }, + { + "epoch": 0.2293815646931374, + "grad_norm": 2.0063388347625732, + "learning_rate": 0.00015414723753002686, + "loss": 2.1138, + "step": 9740 + }, + { + "epoch": 0.22961706937968066, + "grad_norm": 1.8203511238098145, + "learning_rate": 0.0001541001365927182, + "loss": 2.0288, + "step": 9750 + }, + { + "epoch": 0.22985257406622392, + "grad_norm": 2.3845832347869873, + "learning_rate": 0.00015405303565540956, + "loss": 2.1712, + "step": 9760 + }, + { + "epoch": 0.23008807875276718, + "grad_norm": 1.6893136501312256, + "learning_rate": 0.0001540059347181009, + "loss": 2.215, + "step": 9770 + }, + { + "epoch": 0.23032358343931045, + "grad_norm": 2.3593993186950684, + "learning_rate": 0.00015395883378079226, + "loss": 2.081, + "step": 9780 + }, + { + "epoch": 0.2305590881258537, + "grad_norm": 2.105539321899414, + "learning_rate": 0.00015391173284348358, + "loss": 2.1697, + "step": 9790 + }, + { + "epoch": 0.23079459281239698, + "grad_norm": 1.891050100326538, + "learning_rate": 0.00015386463190617496, + "loss": 1.9764, + "step": 9800 + }, + { + "epoch": 0.23103009749894024, + "grad_norm": 4.2726263999938965, + "learning_rate": 0.00015381753096886628, + "loss": 2.3325, + "step": 9810 + }, + { + "epoch": 0.2312656021854835, + "grad_norm": 1.6610772609710693, + "learning_rate": 0.00015377043003155763, + "loss": 2.3659, + "step": 9820 + }, + { + "epoch": 0.23150110687202674, + "grad_norm": 2.202704906463623, + "learning_rate": 0.00015372332909424898, + "loss": 2.1231, + "step": 9830 + }, + { + "epoch": 0.23173661155857, + "grad_norm": 1.6619096994400024, + "learning_rate": 0.00015367622815694033, + "loss": 2.0609, + "step": 9840 + }, + { + "epoch": 0.23197211624511327, + "grad_norm": 2.1274075508117676, + "learning_rate": 0.00015362912721963168, + "loss": 2.1627, + "step": 9850 + }, + { + "epoch": 0.23220762093165653, + "grad_norm": 2.303713321685791, + "learning_rate": 0.00015358202628232303, + "loss": 2.168, + "step": 9860 + }, + { + "epoch": 0.2324431256181998, + "grad_norm": 1.953169822692871, + "learning_rate": 0.00015353492534501436, + "loss": 2.0576, + "step": 9870 + }, + { + "epoch": 0.23267863030474306, + "grad_norm": 2.4101409912109375, + "learning_rate": 0.00015348782440770573, + "loss": 2.094, + "step": 9880 + }, + { + "epoch": 0.23291413499128633, + "grad_norm": 2.0362141132354736, + "learning_rate": 0.00015344072347039706, + "loss": 2.162, + "step": 9890 + }, + { + "epoch": 0.2331496396778296, + "grad_norm": 1.7066439390182495, + "learning_rate": 0.00015339362253308843, + "loss": 2.0753, + "step": 9900 + }, + { + "epoch": 0.23338514436437285, + "grad_norm": 2.0691280364990234, + "learning_rate": 0.00015334652159577976, + "loss": 2.0622, + "step": 9910 + }, + { + "epoch": 0.23362064905091612, + "grad_norm": 2.5985186100006104, + "learning_rate": 0.0001532994206584711, + "loss": 2.1825, + "step": 9920 + }, + { + "epoch": 0.23385615373745938, + "grad_norm": 1.8621069192886353, + "learning_rate": 0.00015325231972116246, + "loss": 2.3242, + "step": 9930 + }, + { + "epoch": 0.23409165842400265, + "grad_norm": 1.9276299476623535, + "learning_rate": 0.0001532052187838538, + "loss": 1.991, + "step": 9940 + }, + { + "epoch": 0.2343271631105459, + "grad_norm": 2.965895414352417, + "learning_rate": 0.00015315811784654516, + "loss": 2.0144, + "step": 9950 + }, + { + "epoch": 0.23456266779708917, + "grad_norm": 2.115572214126587, + "learning_rate": 0.0001531110169092365, + "loss": 2.3183, + "step": 9960 + }, + { + "epoch": 0.2347981724836324, + "grad_norm": 2.177835702896118, + "learning_rate": 0.00015306391597192783, + "loss": 1.9841, + "step": 9970 + }, + { + "epoch": 0.23503367717017568, + "grad_norm": 1.9404809474945068, + "learning_rate": 0.0001530168150346192, + "loss": 2.1265, + "step": 9980 + }, + { + "epoch": 0.23526918185671894, + "grad_norm": 2.3881819248199463, + "learning_rate": 0.00015296971409731053, + "loss": 2.2287, + "step": 9990 + }, + { + "epoch": 0.2355046865432622, + "grad_norm": 2.5311830043792725, + "learning_rate": 0.00015292261316000188, + "loss": 2.1156, + "step": 10000 + }, + { + "epoch": 0.23574019122980547, + "grad_norm": 1.957363247871399, + "learning_rate": 0.00015287551222269323, + "loss": 2.0976, + "step": 10010 + }, + { + "epoch": 0.23597569591634873, + "grad_norm": 2.139197587966919, + "learning_rate": 0.00015282841128538458, + "loss": 2.1844, + "step": 10020 + }, + { + "epoch": 0.236211200602892, + "grad_norm": 2.341768741607666, + "learning_rate": 0.00015278131034807593, + "loss": 2.3208, + "step": 10030 + }, + { + "epoch": 0.23644670528943526, + "grad_norm": 2.2145462036132812, + "learning_rate": 0.00015273420941076728, + "loss": 2.225, + "step": 10040 + }, + { + "epoch": 0.23668220997597852, + "grad_norm": 2.0473461151123047, + "learning_rate": 0.00015268710847345863, + "loss": 2.3011, + "step": 10050 + }, + { + "epoch": 0.2369177146625218, + "grad_norm": 2.0674173831939697, + "learning_rate": 0.00015264000753614998, + "loss": 2.0881, + "step": 10060 + }, + { + "epoch": 0.23715321934906505, + "grad_norm": 2.2629573345184326, + "learning_rate": 0.00015259290659884133, + "loss": 2.2819, + "step": 10070 + }, + { + "epoch": 0.23738872403560832, + "grad_norm": 2.272273063659668, + "learning_rate": 0.00015254580566153268, + "loss": 2.0528, + "step": 10080 + }, + { + "epoch": 0.23762422872215158, + "grad_norm": 2.418661117553711, + "learning_rate": 0.00015249870472422403, + "loss": 2.1313, + "step": 10090 + }, + { + "epoch": 0.23785973340869485, + "grad_norm": 2.5303127765655518, + "learning_rate": 0.00015245160378691536, + "loss": 2.2907, + "step": 10100 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 2.464144229888916, + "learning_rate": 0.00015240450284960673, + "loss": 1.9191, + "step": 10110 + }, + { + "epoch": 0.23833074278178135, + "grad_norm": 1.8093903064727783, + "learning_rate": 0.00015235740191229806, + "loss": 2.0152, + "step": 10120 + }, + { + "epoch": 0.2385662474683246, + "grad_norm": 1.9383940696716309, + "learning_rate": 0.0001523103009749894, + "loss": 2.1197, + "step": 10130 + }, + { + "epoch": 0.23880175215486787, + "grad_norm": 2.671933650970459, + "learning_rate": 0.00015226320003768076, + "loss": 1.9837, + "step": 10140 + }, + { + "epoch": 0.23903725684141114, + "grad_norm": 2.083766222000122, + "learning_rate": 0.0001522160991003721, + "loss": 2.0238, + "step": 10150 + }, + { + "epoch": 0.2392727615279544, + "grad_norm": 2.2363474369049072, + "learning_rate": 0.00015216899816306346, + "loss": 2.1516, + "step": 10160 + }, + { + "epoch": 0.23950826621449767, + "grad_norm": 2.276669979095459, + "learning_rate": 0.0001521218972257548, + "loss": 1.9849, + "step": 10170 + }, + { + "epoch": 0.23974377090104093, + "grad_norm": 2.018583059310913, + "learning_rate": 0.00015207479628844613, + "loss": 2.2545, + "step": 10180 + }, + { + "epoch": 0.2399792755875842, + "grad_norm": 2.3326144218444824, + "learning_rate": 0.0001520276953511375, + "loss": 2.2914, + "step": 10190 + }, + { + "epoch": 0.24021478027412746, + "grad_norm": 2.2414534091949463, + "learning_rate": 0.00015198059441382883, + "loss": 2.0861, + "step": 10200 + }, + { + "epoch": 0.24045028496067072, + "grad_norm": 2.206983804702759, + "learning_rate": 0.00015193349347652018, + "loss": 2.1644, + "step": 10210 + }, + { + "epoch": 0.240685789647214, + "grad_norm": 1.9949047565460205, + "learning_rate": 0.00015188639253921153, + "loss": 2.2007, + "step": 10220 + }, + { + "epoch": 0.24092129433375725, + "grad_norm": 1.8927348852157593, + "learning_rate": 0.00015183929160190288, + "loss": 2.0746, + "step": 10230 + }, + { + "epoch": 0.24115679902030052, + "grad_norm": 1.8133490085601807, + "learning_rate": 0.00015179219066459423, + "loss": 1.9463, + "step": 10240 + }, + { + "epoch": 0.24139230370684375, + "grad_norm": 2.7200329303741455, + "learning_rate": 0.00015174508972728558, + "loss": 2.0079, + "step": 10250 + }, + { + "epoch": 0.24162780839338702, + "grad_norm": 2.4350547790527344, + "learning_rate": 0.0001516979887899769, + "loss": 2.2918, + "step": 10260 + }, + { + "epoch": 0.24186331307993028, + "grad_norm": 1.9246656894683838, + "learning_rate": 0.00015165088785266828, + "loss": 2.117, + "step": 10270 + }, + { + "epoch": 0.24209881776647355, + "grad_norm": 2.2206227779388428, + "learning_rate": 0.0001516037869153596, + "loss": 2.225, + "step": 10280 + }, + { + "epoch": 0.2423343224530168, + "grad_norm": 2.1294167041778564, + "learning_rate": 0.00015155668597805098, + "loss": 2.0306, + "step": 10290 + }, + { + "epoch": 0.24256982713956007, + "grad_norm": 2.4349210262298584, + "learning_rate": 0.0001515095850407423, + "loss": 2.1363, + "step": 10300 + }, + { + "epoch": 0.24280533182610334, + "grad_norm": 1.8255261182785034, + "learning_rate": 0.00015146248410343366, + "loss": 2.2975, + "step": 10310 + }, + { + "epoch": 0.2430408365126466, + "grad_norm": 2.2579727172851562, + "learning_rate": 0.000151415383166125, + "loss": 2.0691, + "step": 10320 + }, + { + "epoch": 0.24327634119918987, + "grad_norm": 2.279855966567993, + "learning_rate": 0.00015136828222881636, + "loss": 2.1189, + "step": 10330 + }, + { + "epoch": 0.24351184588573313, + "grad_norm": 2.1615757942199707, + "learning_rate": 0.0001513211812915077, + "loss": 2.2606, + "step": 10340 + }, + { + "epoch": 0.2437473505722764, + "grad_norm": 2.1494510173797607, + "learning_rate": 0.00015127408035419906, + "loss": 2.0028, + "step": 10350 + }, + { + "epoch": 0.24398285525881966, + "grad_norm": 1.856366753578186, + "learning_rate": 0.0001512269794168904, + "loss": 2.015, + "step": 10360 + }, + { + "epoch": 0.24421835994536292, + "grad_norm": 2.0949535369873047, + "learning_rate": 0.00015117987847958176, + "loss": 1.9954, + "step": 10370 + }, + { + "epoch": 0.2444538646319062, + "grad_norm": 2.0382297039031982, + "learning_rate": 0.0001511327775422731, + "loss": 2.0136, + "step": 10380 + }, + { + "epoch": 0.24468936931844942, + "grad_norm": 3.209582567214966, + "learning_rate": 0.00015108567660496443, + "loss": 2.2051, + "step": 10390 + }, + { + "epoch": 0.2449248740049927, + "grad_norm": 2.9670584201812744, + "learning_rate": 0.0001510385756676558, + "loss": 2.0755, + "step": 10400 + }, + { + "epoch": 0.24516037869153595, + "grad_norm": 2.0415966510772705, + "learning_rate": 0.00015099147473034713, + "loss": 2.0836, + "step": 10410 + }, + { + "epoch": 0.24539588337807922, + "grad_norm": 2.374818801879883, + "learning_rate": 0.00015094437379303848, + "loss": 2.1602, + "step": 10420 + }, + { + "epoch": 0.24563138806462248, + "grad_norm": 2.551715850830078, + "learning_rate": 0.00015089727285572983, + "loss": 2.2528, + "step": 10430 + }, + { + "epoch": 0.24586689275116574, + "grad_norm": 1.8446077108383179, + "learning_rate": 0.00015085017191842118, + "loss": 2.2517, + "step": 10440 + }, + { + "epoch": 0.246102397437709, + "grad_norm": 2.6373045444488525, + "learning_rate": 0.00015080307098111253, + "loss": 2.0872, + "step": 10450 + }, + { + "epoch": 0.24633790212425227, + "grad_norm": 2.356964588165283, + "learning_rate": 0.00015075597004380388, + "loss": 2.0906, + "step": 10460 + }, + { + "epoch": 0.24657340681079554, + "grad_norm": 2.3910388946533203, + "learning_rate": 0.0001507088691064952, + "loss": 2.2638, + "step": 10470 + }, + { + "epoch": 0.2468089114973388, + "grad_norm": 2.311750650405884, + "learning_rate": 0.00015066176816918659, + "loss": 1.9749, + "step": 10480 + }, + { + "epoch": 0.24704441618388207, + "grad_norm": 2.235476016998291, + "learning_rate": 0.0001506146672318779, + "loss": 1.8174, + "step": 10490 + }, + { + "epoch": 0.24727992087042533, + "grad_norm": 2.281172275543213, + "learning_rate": 0.00015056756629456929, + "loss": 1.8255, + "step": 10500 + }, + { + "epoch": 0.2475154255569686, + "grad_norm": 2.279736042022705, + "learning_rate": 0.0001505204653572606, + "loss": 2.1736, + "step": 10510 + }, + { + "epoch": 0.24775093024351186, + "grad_norm": 2.279602289199829, + "learning_rate": 0.00015047336441995196, + "loss": 2.0634, + "step": 10520 + }, + { + "epoch": 0.24798643493005512, + "grad_norm": 2.272345542907715, + "learning_rate": 0.0001504262634826433, + "loss": 2.2241, + "step": 10530 + }, + { + "epoch": 0.24822193961659836, + "grad_norm": 1.9404467344284058, + "learning_rate": 0.00015037916254533466, + "loss": 2.2235, + "step": 10540 + }, + { + "epoch": 0.24845744430314162, + "grad_norm": 2.3488924503326416, + "learning_rate": 0.000150332061608026, + "loss": 2.2408, + "step": 10550 + }, + { + "epoch": 0.2486929489896849, + "grad_norm": 1.9650462865829468, + "learning_rate": 0.00015028496067071736, + "loss": 1.9652, + "step": 10560 + }, + { + "epoch": 0.24892845367622815, + "grad_norm": 2.162095785140991, + "learning_rate": 0.00015023785973340868, + "loss": 2.2118, + "step": 10570 + }, + { + "epoch": 0.24916395836277141, + "grad_norm": 2.4109175205230713, + "learning_rate": 0.00015019075879610006, + "loss": 2.1926, + "step": 10580 + }, + { + "epoch": 0.24939946304931468, + "grad_norm": 2.6191952228546143, + "learning_rate": 0.00015014365785879138, + "loss": 2.1851, + "step": 10590 + }, + { + "epoch": 0.24963496773585794, + "grad_norm": 2.78340220451355, + "learning_rate": 0.00015009655692148273, + "loss": 2.1431, + "step": 10600 + }, + { + "epoch": 0.2498704724224012, + "grad_norm": 1.8790323734283447, + "learning_rate": 0.0001500494559841741, + "loss": 2.088, + "step": 10610 + }, + { + "epoch": 0.25010597710894444, + "grad_norm": 2.0652871131896973, + "learning_rate": 0.00015000235504686543, + "loss": 2.1255, + "step": 10620 + }, + { + "epoch": 0.2503414817954877, + "grad_norm": 1.9595967531204224, + "learning_rate": 0.00014995525410955678, + "loss": 2.026, + "step": 10630 + }, + { + "epoch": 0.250576986482031, + "grad_norm": 2.293111562728882, + "learning_rate": 0.00014990815317224813, + "loss": 2.2182, + "step": 10640 + }, + { + "epoch": 0.25081249116857424, + "grad_norm": 1.931410789489746, + "learning_rate": 0.00014986105223493948, + "loss": 2.2731, + "step": 10650 + }, + { + "epoch": 0.2510479958551175, + "grad_norm": 2.6181063652038574, + "learning_rate": 0.00014981395129763084, + "loss": 2.2646, + "step": 10660 + }, + { + "epoch": 0.25128350054166076, + "grad_norm": 1.9321035146713257, + "learning_rate": 0.00014976685036032219, + "loss": 2.1447, + "step": 10670 + }, + { + "epoch": 0.25151900522820403, + "grad_norm": 3.149609327316284, + "learning_rate": 0.00014971974942301354, + "loss": 2.1219, + "step": 10680 + }, + { + "epoch": 0.2517545099147473, + "grad_norm": 1.9660497903823853, + "learning_rate": 0.00014967264848570489, + "loss": 2.2534, + "step": 10690 + }, + { + "epoch": 0.25199001460129056, + "grad_norm": 2.55802845954895, + "learning_rate": 0.0001496255475483962, + "loss": 2.2268, + "step": 10700 + }, + { + "epoch": 0.2522255192878338, + "grad_norm": 1.8749346733093262, + "learning_rate": 0.0001495784466110876, + "loss": 2.0165, + "step": 10710 + }, + { + "epoch": 0.2524610239743771, + "grad_norm": 1.9412798881530762, + "learning_rate": 0.0001495313456737789, + "loss": 2.2144, + "step": 10720 + }, + { + "epoch": 0.25269652866092035, + "grad_norm": 2.201239585876465, + "learning_rate": 0.00014948424473647026, + "loss": 2.1653, + "step": 10730 + }, + { + "epoch": 0.2529320333474636, + "grad_norm": 1.9565176963806152, + "learning_rate": 0.0001494371437991616, + "loss": 2.1649, + "step": 10740 + }, + { + "epoch": 0.2531675380340069, + "grad_norm": 2.220141649246216, + "learning_rate": 0.00014939004286185296, + "loss": 1.8715, + "step": 10750 + }, + { + "epoch": 0.25340304272055014, + "grad_norm": 2.311554193496704, + "learning_rate": 0.0001493429419245443, + "loss": 2.2611, + "step": 10760 + }, + { + "epoch": 0.2536385474070934, + "grad_norm": 2.3438363075256348, + "learning_rate": 0.00014929584098723566, + "loss": 2.1026, + "step": 10770 + }, + { + "epoch": 0.25387405209363667, + "grad_norm": 1.9707456827163696, + "learning_rate": 0.00014924874004992698, + "loss": 2.0283, + "step": 10780 + }, + { + "epoch": 0.25410955678017993, + "grad_norm": 3.091794729232788, + "learning_rate": 0.00014920163911261836, + "loss": 2.3363, + "step": 10790 + }, + { + "epoch": 0.2543450614667232, + "grad_norm": 2.3337857723236084, + "learning_rate": 0.00014915453817530968, + "loss": 2.1223, + "step": 10800 + }, + { + "epoch": 0.25458056615326646, + "grad_norm": 1.6483640670776367, + "learning_rate": 0.00014910743723800103, + "loss": 2.2096, + "step": 10810 + }, + { + "epoch": 0.2548160708398097, + "grad_norm": 2.229172945022583, + "learning_rate": 0.00014906033630069238, + "loss": 2.1877, + "step": 10820 + }, + { + "epoch": 0.255051575526353, + "grad_norm": 1.7530523538589478, + "learning_rate": 0.00014901323536338374, + "loss": 1.9589, + "step": 10830 + }, + { + "epoch": 0.25528708021289626, + "grad_norm": 1.962538242340088, + "learning_rate": 0.00014896613442607509, + "loss": 2.2333, + "step": 10840 + }, + { + "epoch": 0.2555225848994395, + "grad_norm": 3.094045877456665, + "learning_rate": 0.00014891903348876644, + "loss": 2.1465, + "step": 10850 + }, + { + "epoch": 0.2557580895859828, + "grad_norm": 1.9345427751541138, + "learning_rate": 0.00014887193255145776, + "loss": 2.1342, + "step": 10860 + }, + { + "epoch": 0.25599359427252605, + "grad_norm": 1.932407259941101, + "learning_rate": 0.00014882483161414914, + "loss": 2.0455, + "step": 10870 + }, + { + "epoch": 0.2562290989590693, + "grad_norm": 2.0643908977508545, + "learning_rate": 0.00014877773067684046, + "loss": 2.0413, + "step": 10880 + }, + { + "epoch": 0.2564646036456125, + "grad_norm": 2.0406455993652344, + "learning_rate": 0.00014873062973953184, + "loss": 2.2408, + "step": 10890 + }, + { + "epoch": 0.2567001083321558, + "grad_norm": 2.9993858337402344, + "learning_rate": 0.0001486835288022232, + "loss": 2.1404, + "step": 10900 + }, + { + "epoch": 0.25693561301869905, + "grad_norm": 2.122169017791748, + "learning_rate": 0.0001486364278649145, + "loss": 2.3794, + "step": 10910 + }, + { + "epoch": 0.2571711177052423, + "grad_norm": 2.0623533725738525, + "learning_rate": 0.0001485893269276059, + "loss": 2.089, + "step": 10920 + }, + { + "epoch": 0.2574066223917856, + "grad_norm": 2.2135274410247803, + "learning_rate": 0.0001485422259902972, + "loss": 2.0781, + "step": 10930 + }, + { + "epoch": 0.25764212707832884, + "grad_norm": 3.7605085372924805, + "learning_rate": 0.00014849512505298856, + "loss": 2.3425, + "step": 10940 + }, + { + "epoch": 0.2578776317648721, + "grad_norm": 2.248478412628174, + "learning_rate": 0.0001484480241156799, + "loss": 2.0831, + "step": 10950 + }, + { + "epoch": 0.25811313645141537, + "grad_norm": 2.040879249572754, + "learning_rate": 0.00014840092317837126, + "loss": 2.1721, + "step": 10960 + }, + { + "epoch": 0.25834864113795863, + "grad_norm": 2.4752190113067627, + "learning_rate": 0.0001483538222410626, + "loss": 1.8606, + "step": 10970 + }, + { + "epoch": 0.2585841458245019, + "grad_norm": 2.079008102416992, + "learning_rate": 0.00014830672130375396, + "loss": 2.1368, + "step": 10980 + }, + { + "epoch": 0.25881965051104516, + "grad_norm": 2.152157783508301, + "learning_rate": 0.00014825962036644528, + "loss": 2.2405, + "step": 10990 + }, + { + "epoch": 0.2590551551975884, + "grad_norm": 1.8461806774139404, + "learning_rate": 0.00014821251942913666, + "loss": 2.2071, + "step": 11000 + }, + { + "epoch": 0.2592906598841317, + "grad_norm": 2.190112590789795, + "learning_rate": 0.00014816541849182799, + "loss": 2.1474, + "step": 11010 + }, + { + "epoch": 0.25952616457067496, + "grad_norm": 1.828783631324768, + "learning_rate": 0.00014811831755451934, + "loss": 2.0584, + "step": 11020 + }, + { + "epoch": 0.2597616692572182, + "grad_norm": 2.171999454498291, + "learning_rate": 0.00014807121661721069, + "loss": 2.2661, + "step": 11030 + }, + { + "epoch": 0.2599971739437615, + "grad_norm": 2.530381679534912, + "learning_rate": 0.00014802411567990204, + "loss": 2.3015, + "step": 11040 + }, + { + "epoch": 0.26023267863030475, + "grad_norm": 2.840162515640259, + "learning_rate": 0.00014797701474259339, + "loss": 2.0476, + "step": 11050 + }, + { + "epoch": 0.260468183316848, + "grad_norm": 2.931011438369751, + "learning_rate": 0.00014792991380528474, + "loss": 2.1905, + "step": 11060 + }, + { + "epoch": 0.2607036880033913, + "grad_norm": 2.2790048122406006, + "learning_rate": 0.00014788281286797606, + "loss": 1.9775, + "step": 11070 + }, + { + "epoch": 0.26093919268993454, + "grad_norm": 2.563983678817749, + "learning_rate": 0.00014783571193066744, + "loss": 2.0517, + "step": 11080 + }, + { + "epoch": 0.2611746973764778, + "grad_norm": 1.9197920560836792, + "learning_rate": 0.00014779332108708963, + "loss": 2.0083, + "step": 11090 + }, + { + "epoch": 0.26141020206302107, + "grad_norm": 2.238065242767334, + "learning_rate": 0.00014774622014978098, + "loss": 2.1118, + "step": 11100 + }, + { + "epoch": 0.26164570674956433, + "grad_norm": 2.00567626953125, + "learning_rate": 0.00014769911921247233, + "loss": 2.0841, + "step": 11110 + }, + { + "epoch": 0.2618812114361076, + "grad_norm": 1.9711300134658813, + "learning_rate": 0.00014765201827516368, + "loss": 2.1229, + "step": 11120 + }, + { + "epoch": 0.26211671612265086, + "grad_norm": 2.36851167678833, + "learning_rate": 0.00014760491733785503, + "loss": 1.9788, + "step": 11130 + }, + { + "epoch": 0.2623522208091941, + "grad_norm": 2.2312638759613037, + "learning_rate": 0.00014755781640054638, + "loss": 2.1221, + "step": 11140 + }, + { + "epoch": 0.2625877254957374, + "grad_norm": 1.9578697681427002, + "learning_rate": 0.00014751071546323773, + "loss": 1.9844, + "step": 11150 + }, + { + "epoch": 0.26282323018228065, + "grad_norm": 2.0937769412994385, + "learning_rate": 0.00014746361452592908, + "loss": 2.1305, + "step": 11160 + }, + { + "epoch": 0.26305873486882386, + "grad_norm": 1.9958505630493164, + "learning_rate": 0.00014741651358862043, + "loss": 2.2201, + "step": 11170 + }, + { + "epoch": 0.2632942395553671, + "grad_norm": 2.8454930782318115, + "learning_rate": 0.00014736941265131178, + "loss": 2.1756, + "step": 11180 + }, + { + "epoch": 0.2635297442419104, + "grad_norm": 1.7872885465621948, + "learning_rate": 0.00014732231171400313, + "loss": 1.788, + "step": 11190 + }, + { + "epoch": 0.26376524892845365, + "grad_norm": 2.458237886428833, + "learning_rate": 0.00014727521077669445, + "loss": 2.1815, + "step": 11200 + }, + { + "epoch": 0.2640007536149969, + "grad_norm": 2.0657920837402344, + "learning_rate": 0.00014722810983938583, + "loss": 2.0694, + "step": 11210 + }, + { + "epoch": 0.2642362583015402, + "grad_norm": 2.7617380619049072, + "learning_rate": 0.00014718100890207715, + "loss": 2.196, + "step": 11220 + }, + { + "epoch": 0.26447176298808345, + "grad_norm": 2.043877601623535, + "learning_rate": 0.0001471339079647685, + "loss": 2.2654, + "step": 11230 + }, + { + "epoch": 0.2647072676746267, + "grad_norm": 2.868586540222168, + "learning_rate": 0.00014708680702745985, + "loss": 2.1117, + "step": 11240 + }, + { + "epoch": 0.26494277236117, + "grad_norm": 2.223196268081665, + "learning_rate": 0.0001470397060901512, + "loss": 2.3975, + "step": 11250 + }, + { + "epoch": 0.26517827704771324, + "grad_norm": 2.0349698066711426, + "learning_rate": 0.00014699260515284255, + "loss": 2.047, + "step": 11260 + }, + { + "epoch": 0.2654137817342565, + "grad_norm": 1.827580451965332, + "learning_rate": 0.0001469455042155339, + "loss": 2.0944, + "step": 11270 + }, + { + "epoch": 0.26564928642079977, + "grad_norm": 2.8184313774108887, + "learning_rate": 0.00014689840327822523, + "loss": 2.0817, + "step": 11280 + }, + { + "epoch": 0.26588479110734303, + "grad_norm": 2.1076481342315674, + "learning_rate": 0.0001468513023409166, + "loss": 2.153, + "step": 11290 + }, + { + "epoch": 0.2661202957938863, + "grad_norm": 1.8298046588897705, + "learning_rate": 0.00014680420140360793, + "loss": 2.2125, + "step": 11300 + }, + { + "epoch": 0.26635580048042956, + "grad_norm": 1.8453890085220337, + "learning_rate": 0.00014675710046629928, + "loss": 2.1606, + "step": 11310 + }, + { + "epoch": 0.2665913051669728, + "grad_norm": 2.0864975452423096, + "learning_rate": 0.00014670999952899063, + "loss": 2.0076, + "step": 11320 + }, + { + "epoch": 0.2668268098535161, + "grad_norm": 2.20062255859375, + "learning_rate": 0.00014666289859168198, + "loss": 1.9654, + "step": 11330 + }, + { + "epoch": 0.26706231454005935, + "grad_norm": 1.914099097251892, + "learning_rate": 0.00014661579765437333, + "loss": 2.0639, + "step": 11340 + }, + { + "epoch": 0.2672978192266026, + "grad_norm": 1.8117254972457886, + "learning_rate": 0.00014656869671706468, + "loss": 2.0437, + "step": 11350 + }, + { + "epoch": 0.2675333239131459, + "grad_norm": 2.076619863510132, + "learning_rate": 0.000146521595779756, + "loss": 2.0552, + "step": 11360 + }, + { + "epoch": 0.26776882859968915, + "grad_norm": 1.752306580543518, + "learning_rate": 0.00014647449484244738, + "loss": 1.9671, + "step": 11370 + }, + { + "epoch": 0.2680043332862324, + "grad_norm": 2.3570029735565186, + "learning_rate": 0.0001464273939051387, + "loss": 1.8996, + "step": 11380 + }, + { + "epoch": 0.2682398379727757, + "grad_norm": 1.9335840940475464, + "learning_rate": 0.00014638029296783008, + "loss": 2.2257, + "step": 11390 + }, + { + "epoch": 0.26847534265931894, + "grad_norm": 2.8043274879455566, + "learning_rate": 0.0001463331920305214, + "loss": 2.3472, + "step": 11400 + }, + { + "epoch": 0.2687108473458622, + "grad_norm": 1.8869709968566895, + "learning_rate": 0.00014628609109321275, + "loss": 2.3359, + "step": 11410 + }, + { + "epoch": 0.26894635203240547, + "grad_norm": 3.413790225982666, + "learning_rate": 0.0001462389901559041, + "loss": 2.2504, + "step": 11420 + }, + { + "epoch": 0.26918185671894873, + "grad_norm": 2.251716136932373, + "learning_rate": 0.00014619188921859545, + "loss": 2.1951, + "step": 11430 + }, + { + "epoch": 0.269417361405492, + "grad_norm": 2.0624914169311523, + "learning_rate": 0.0001461447882812868, + "loss": 2.0907, + "step": 11440 + }, + { + "epoch": 0.2696528660920352, + "grad_norm": 2.2086572647094727, + "learning_rate": 0.00014609768734397816, + "loss": 2.0458, + "step": 11450 + }, + { + "epoch": 0.26988837077857847, + "grad_norm": 2.272899627685547, + "learning_rate": 0.0001460505864066695, + "loss": 2.0175, + "step": 11460 + }, + { + "epoch": 0.27012387546512173, + "grad_norm": 1.962472677230835, + "learning_rate": 0.00014600348546936086, + "loss": 2.2018, + "step": 11470 + }, + { + "epoch": 0.270359380151665, + "grad_norm": 1.8349707126617432, + "learning_rate": 0.0001459563845320522, + "loss": 2.1957, + "step": 11480 + }, + { + "epoch": 0.27059488483820826, + "grad_norm": 2.317610740661621, + "learning_rate": 0.00014590928359474353, + "loss": 2.1421, + "step": 11490 + }, + { + "epoch": 0.2708303895247515, + "grad_norm": 3.183609962463379, + "learning_rate": 0.0001458621826574349, + "loss": 2.0522, + "step": 11500 + }, + { + "epoch": 0.2710658942112948, + "grad_norm": 2.0786678791046143, + "learning_rate": 0.00014581508172012623, + "loss": 2.0592, + "step": 11510 + }, + { + "epoch": 0.27130139889783805, + "grad_norm": 1.7079672813415527, + "learning_rate": 0.00014576798078281758, + "loss": 1.9237, + "step": 11520 + }, + { + "epoch": 0.2715369035843813, + "grad_norm": 1.9276167154312134, + "learning_rate": 0.00014572087984550893, + "loss": 2.4222, + "step": 11530 + }, + { + "epoch": 0.2717724082709246, + "grad_norm": 2.461214303970337, + "learning_rate": 0.00014567377890820028, + "loss": 2.2114, + "step": 11540 + }, + { + "epoch": 0.27200791295746785, + "grad_norm": 1.96316659450531, + "learning_rate": 0.00014562667797089163, + "loss": 2.2535, + "step": 11550 + }, + { + "epoch": 0.2722434176440111, + "grad_norm": 2.1543939113616943, + "learning_rate": 0.00014557957703358298, + "loss": 2.2106, + "step": 11560 + }, + { + "epoch": 0.2724789223305544, + "grad_norm": 1.8701822757720947, + "learning_rate": 0.0001455324760962743, + "loss": 1.9426, + "step": 11570 + }, + { + "epoch": 0.27271442701709764, + "grad_norm": 2.2132885456085205, + "learning_rate": 0.00014548537515896568, + "loss": 2.1604, + "step": 11580 + }, + { + "epoch": 0.2729499317036409, + "grad_norm": 2.094003200531006, + "learning_rate": 0.000145438274221657, + "loss": 2.1061, + "step": 11590 + }, + { + "epoch": 0.27318543639018417, + "grad_norm": 2.395878314971924, + "learning_rate": 0.00014539117328434838, + "loss": 2.0564, + "step": 11600 + }, + { + "epoch": 0.27342094107672743, + "grad_norm": 2.653024911880493, + "learning_rate": 0.0001453440723470397, + "loss": 2.3041, + "step": 11610 + }, + { + "epoch": 0.2736564457632707, + "grad_norm": 2.932143449783325, + "learning_rate": 0.00014529697140973106, + "loss": 1.9723, + "step": 11620 + }, + { + "epoch": 0.27389195044981396, + "grad_norm": 2.500082492828369, + "learning_rate": 0.0001452498704724224, + "loss": 1.9417, + "step": 11630 + }, + { + "epoch": 0.2741274551363572, + "grad_norm": 3.0111424922943115, + "learning_rate": 0.00014520276953511376, + "loss": 2.1019, + "step": 11640 + }, + { + "epoch": 0.2743629598229005, + "grad_norm": 2.4497432708740234, + "learning_rate": 0.0001451556685978051, + "loss": 2.0508, + "step": 11650 + }, + { + "epoch": 0.27459846450944375, + "grad_norm": 2.2490170001983643, + "learning_rate": 0.00014510856766049646, + "loss": 2.1141, + "step": 11660 + }, + { + "epoch": 0.274833969195987, + "grad_norm": 2.196950674057007, + "learning_rate": 0.00014506146672318778, + "loss": 2.0565, + "step": 11670 + }, + { + "epoch": 0.2750694738825303, + "grad_norm": 3.2146339416503906, + "learning_rate": 0.00014501436578587916, + "loss": 2.0995, + "step": 11680 + }, + { + "epoch": 0.27530497856907354, + "grad_norm": 2.50426983833313, + "learning_rate": 0.00014496726484857048, + "loss": 2.166, + "step": 11690 + }, + { + "epoch": 0.2755404832556168, + "grad_norm": 2.0356388092041016, + "learning_rate": 0.00014492016391126183, + "loss": 2.1584, + "step": 11700 + }, + { + "epoch": 0.2757759879421601, + "grad_norm": 2.600349187850952, + "learning_rate": 0.0001448730629739532, + "loss": 2.1718, + "step": 11710 + }, + { + "epoch": 0.27601149262870334, + "grad_norm": 3.562185764312744, + "learning_rate": 0.00014482596203664453, + "loss": 2.2295, + "step": 11720 + }, + { + "epoch": 0.2762469973152466, + "grad_norm": 1.928037405014038, + "learning_rate": 0.00014477886109933588, + "loss": 1.9995, + "step": 11730 + }, + { + "epoch": 0.2764825020017898, + "grad_norm": 1.4152915477752686, + "learning_rate": 0.00014473176016202723, + "loss": 2.1623, + "step": 11740 + }, + { + "epoch": 0.2767180066883331, + "grad_norm": 1.9397757053375244, + "learning_rate": 0.00014468465922471858, + "loss": 2.0217, + "step": 11750 + }, + { + "epoch": 0.27695351137487634, + "grad_norm": 1.7832798957824707, + "learning_rate": 0.00014463755828740993, + "loss": 2.0114, + "step": 11760 + }, + { + "epoch": 0.2771890160614196, + "grad_norm": 2.282412528991699, + "learning_rate": 0.00014459045735010128, + "loss": 2.1579, + "step": 11770 + }, + { + "epoch": 0.27742452074796287, + "grad_norm": 2.4182028770446777, + "learning_rate": 0.00014454335641279263, + "loss": 2.0862, + "step": 11780 + }, + { + "epoch": 0.27766002543450613, + "grad_norm": 2.954577922821045, + "learning_rate": 0.00014449625547548398, + "loss": 2.184, + "step": 11790 + }, + { + "epoch": 0.2778955301210494, + "grad_norm": 1.8422027826309204, + "learning_rate": 0.0001444491545381753, + "loss": 2.1267, + "step": 11800 + }, + { + "epoch": 0.27813103480759266, + "grad_norm": 1.9264764785766602, + "learning_rate": 0.00014440205360086668, + "loss": 2.2146, + "step": 11810 + }, + { + "epoch": 0.2783665394941359, + "grad_norm": 1.719955563545227, + "learning_rate": 0.000144354952663558, + "loss": 1.9979, + "step": 11820 + }, + { + "epoch": 0.2786020441806792, + "grad_norm": 2.0132710933685303, + "learning_rate": 0.00014430785172624936, + "loss": 2.1951, + "step": 11830 + }, + { + "epoch": 0.27883754886722245, + "grad_norm": 2.1974024772644043, + "learning_rate": 0.0001442607507889407, + "loss": 2.3713, + "step": 11840 + }, + { + "epoch": 0.2790730535537657, + "grad_norm": 2.18841814994812, + "learning_rate": 0.00014421364985163206, + "loss": 2.1615, + "step": 11850 + }, + { + "epoch": 0.279308558240309, + "grad_norm": 2.1697840690612793, + "learning_rate": 0.0001441665489143234, + "loss": 2.1027, + "step": 11860 + }, + { + "epoch": 0.27954406292685224, + "grad_norm": 2.2310283184051514, + "learning_rate": 0.00014411944797701476, + "loss": 2.3225, + "step": 11870 + }, + { + "epoch": 0.2797795676133955, + "grad_norm": 2.3003742694854736, + "learning_rate": 0.00014407234703970608, + "loss": 2.1095, + "step": 11880 + }, + { + "epoch": 0.28001507229993877, + "grad_norm": 1.808599591255188, + "learning_rate": 0.00014402524610239746, + "loss": 2.0941, + "step": 11890 + }, + { + "epoch": 0.28025057698648204, + "grad_norm": 2.0506701469421387, + "learning_rate": 0.00014397814516508878, + "loss": 2.3607, + "step": 11900 + }, + { + "epoch": 0.2804860816730253, + "grad_norm": 1.990747332572937, + "learning_rate": 0.00014393104422778013, + "loss": 2.0992, + "step": 11910 + }, + { + "epoch": 0.28072158635956856, + "grad_norm": 2.9683446884155273, + "learning_rate": 0.00014388394329047148, + "loss": 1.9899, + "step": 11920 + }, + { + "epoch": 0.28095709104611183, + "grad_norm": 2.2459323406219482, + "learning_rate": 0.00014383684235316283, + "loss": 1.9907, + "step": 11930 + }, + { + "epoch": 0.2811925957326551, + "grad_norm": 2.313927173614502, + "learning_rate": 0.00014378974141585418, + "loss": 2.1981, + "step": 11940 + }, + { + "epoch": 0.28142810041919836, + "grad_norm": 2.565319299697876, + "learning_rate": 0.00014374264047854553, + "loss": 2.2282, + "step": 11950 + }, + { + "epoch": 0.2816636051057416, + "grad_norm": 2.329613208770752, + "learning_rate": 0.00014369553954123686, + "loss": 2.0866, + "step": 11960 + }, + { + "epoch": 0.2818991097922849, + "grad_norm": 1.7780147790908813, + "learning_rate": 0.00014364843860392823, + "loss": 2.1109, + "step": 11970 + }, + { + "epoch": 0.28213461447882815, + "grad_norm": 2.3526809215545654, + "learning_rate": 0.00014360133766661956, + "loss": 2.0509, + "step": 11980 + }, + { + "epoch": 0.2823701191653714, + "grad_norm": 2.9026174545288086, + "learning_rate": 0.00014355423672931093, + "loss": 2.1536, + "step": 11990 + }, + { + "epoch": 0.2826056238519147, + "grad_norm": 2.3151416778564453, + "learning_rate": 0.00014350713579200228, + "loss": 2.2772, + "step": 12000 + }, + { + "epoch": 0.28284112853845794, + "grad_norm": 2.0522875785827637, + "learning_rate": 0.0001434600348546936, + "loss": 1.9419, + "step": 12010 + }, + { + "epoch": 0.28307663322500115, + "grad_norm": 1.7886687517166138, + "learning_rate": 0.00014341293391738498, + "loss": 2.1908, + "step": 12020 + }, + { + "epoch": 0.2833121379115444, + "grad_norm": 2.5073275566101074, + "learning_rate": 0.0001433658329800763, + "loss": 2.2582, + "step": 12030 + }, + { + "epoch": 0.2835476425980877, + "grad_norm": 1.9218273162841797, + "learning_rate": 0.00014331873204276766, + "loss": 2.1327, + "step": 12040 + }, + { + "epoch": 0.28378314728463094, + "grad_norm": 1.9071509838104248, + "learning_rate": 0.000143271631105459, + "loss": 2.1697, + "step": 12050 + }, + { + "epoch": 0.2840186519711742, + "grad_norm": 1.9115699529647827, + "learning_rate": 0.00014322453016815036, + "loss": 2.1689, + "step": 12060 + }, + { + "epoch": 0.28425415665771747, + "grad_norm": 3.1587018966674805, + "learning_rate": 0.0001431774292308417, + "loss": 2.2849, + "step": 12070 + }, + { + "epoch": 0.28448966134426074, + "grad_norm": 2.1622958183288574, + "learning_rate": 0.00014313032829353306, + "loss": 1.9378, + "step": 12080 + }, + { + "epoch": 0.284725166030804, + "grad_norm": 2.525998830795288, + "learning_rate": 0.00014308322735622438, + "loss": 2.277, + "step": 12090 + }, + { + "epoch": 0.28496067071734726, + "grad_norm": 1.5249601602554321, + "learning_rate": 0.00014303612641891576, + "loss": 1.9813, + "step": 12100 + }, + { + "epoch": 0.28519617540389053, + "grad_norm": 2.085453748703003, + "learning_rate": 0.00014298902548160708, + "loss": 2.3223, + "step": 12110 + }, + { + "epoch": 0.2854316800904338, + "grad_norm": 1.8140358924865723, + "learning_rate": 0.00014294192454429843, + "loss": 2.1393, + "step": 12120 + }, + { + "epoch": 0.28566718477697706, + "grad_norm": 2.5443224906921387, + "learning_rate": 0.00014289482360698978, + "loss": 2.2873, + "step": 12130 + }, + { + "epoch": 0.2859026894635203, + "grad_norm": 2.006328821182251, + "learning_rate": 0.00014284772266968113, + "loss": 2.1113, + "step": 12140 + }, + { + "epoch": 0.2861381941500636, + "grad_norm": 1.8038783073425293, + "learning_rate": 0.00014280062173237248, + "loss": 2.3057, + "step": 12150 + }, + { + "epoch": 0.28637369883660685, + "grad_norm": 2.357002019882202, + "learning_rate": 0.00014275352079506383, + "loss": 2.0994, + "step": 12160 + }, + { + "epoch": 0.2866092035231501, + "grad_norm": 2.0927555561065674, + "learning_rate": 0.00014270641985775516, + "loss": 1.9388, + "step": 12170 + }, + { + "epoch": 0.2868447082096934, + "grad_norm": 3.126340866088867, + "learning_rate": 0.00014265931892044653, + "loss": 2.2587, + "step": 12180 + }, + { + "epoch": 0.28708021289623664, + "grad_norm": 2.3898532390594482, + "learning_rate": 0.00014261221798313786, + "loss": 2.0545, + "step": 12190 + }, + { + "epoch": 0.2873157175827799, + "grad_norm": 2.6160011291503906, + "learning_rate": 0.00014256511704582923, + "loss": 1.9734, + "step": 12200 + }, + { + "epoch": 0.28755122226932317, + "grad_norm": 2.259218692779541, + "learning_rate": 0.00014251801610852056, + "loss": 2.0344, + "step": 12210 + }, + { + "epoch": 0.28778672695586643, + "grad_norm": 2.0822460651397705, + "learning_rate": 0.0001424709151712119, + "loss": 2.2014, + "step": 12220 + }, + { + "epoch": 0.2880222316424097, + "grad_norm": 2.0669522285461426, + "learning_rate": 0.00014242381423390326, + "loss": 2.0617, + "step": 12230 + }, + { + "epoch": 0.28825773632895296, + "grad_norm": 1.9552747011184692, + "learning_rate": 0.0001423767132965946, + "loss": 2.1155, + "step": 12240 + }, + { + "epoch": 0.2884932410154962, + "grad_norm": 2.3942112922668457, + "learning_rate": 0.00014232961235928596, + "loss": 2.0574, + "step": 12250 + }, + { + "epoch": 0.2887287457020395, + "grad_norm": 2.205756187438965, + "learning_rate": 0.0001422825114219773, + "loss": 2.1403, + "step": 12260 + }, + { + "epoch": 0.28896425038858276, + "grad_norm": 2.649390697479248, + "learning_rate": 0.00014223541048466863, + "loss": 2.1289, + "step": 12270 + }, + { + "epoch": 0.289199755075126, + "grad_norm": 2.3708388805389404, + "learning_rate": 0.00014218830954736, + "loss": 2.06, + "step": 12280 + }, + { + "epoch": 0.2894352597616693, + "grad_norm": 2.051314115524292, + "learning_rate": 0.00014214120861005136, + "loss": 2.2359, + "step": 12290 + }, + { + "epoch": 0.2896707644482125, + "grad_norm": 1.698848009109497, + "learning_rate": 0.00014209410767274268, + "loss": 2.0647, + "step": 12300 + }, + { + "epoch": 0.28990626913475576, + "grad_norm": 1.9865273237228394, + "learning_rate": 0.00014204700673543406, + "loss": 1.9384, + "step": 12310 + }, + { + "epoch": 0.290141773821299, + "grad_norm": 2.2270126342773438, + "learning_rate": 0.00014199990579812538, + "loss": 2.1169, + "step": 12320 + }, + { + "epoch": 0.2903772785078423, + "grad_norm": 1.6230186223983765, + "learning_rate": 0.00014195280486081673, + "loss": 1.9432, + "step": 12330 + }, + { + "epoch": 0.29061278319438555, + "grad_norm": 1.7483242750167847, + "learning_rate": 0.00014190570392350808, + "loss": 2.1609, + "step": 12340 + }, + { + "epoch": 0.2908482878809288, + "grad_norm": 1.7598001956939697, + "learning_rate": 0.00014185860298619943, + "loss": 2.396, + "step": 12350 + }, + { + "epoch": 0.2910837925674721, + "grad_norm": 2.14711856842041, + "learning_rate": 0.00014181150204889078, + "loss": 2.092, + "step": 12360 + }, + { + "epoch": 0.29131929725401534, + "grad_norm": 1.9929486513137817, + "learning_rate": 0.00014176440111158213, + "loss": 2.0726, + "step": 12370 + }, + { + "epoch": 0.2915548019405586, + "grad_norm": 1.9438811540603638, + "learning_rate": 0.00014171730017427348, + "loss": 1.9861, + "step": 12380 + }, + { + "epoch": 0.29179030662710187, + "grad_norm": 1.812244176864624, + "learning_rate": 0.00014167019923696483, + "loss": 2.2, + "step": 12390 + }, + { + "epoch": 0.29202581131364513, + "grad_norm": 1.8583556413650513, + "learning_rate": 0.00014162309829965616, + "loss": 2.0777, + "step": 12400 + }, + { + "epoch": 0.2922613160001884, + "grad_norm": 2.8509726524353027, + "learning_rate": 0.00014157599736234753, + "loss": 2.112, + "step": 12410 + }, + { + "epoch": 0.29249682068673166, + "grad_norm": 2.370173931121826, + "learning_rate": 0.00014152889642503886, + "loss": 2.1104, + "step": 12420 + }, + { + "epoch": 0.2927323253732749, + "grad_norm": 2.105886459350586, + "learning_rate": 0.0001414817954877302, + "loss": 2.1593, + "step": 12430 + }, + { + "epoch": 0.2929678300598182, + "grad_norm": 2.6540844440460205, + "learning_rate": 0.00014143469455042156, + "loss": 2.034, + "step": 12440 + }, + { + "epoch": 0.29320333474636145, + "grad_norm": 1.7813196182250977, + "learning_rate": 0.0001413875936131129, + "loss": 1.899, + "step": 12450 + }, + { + "epoch": 0.2934388394329047, + "grad_norm": 2.747343063354492, + "learning_rate": 0.00014134049267580426, + "loss": 2.1251, + "step": 12460 + }, + { + "epoch": 0.293674344119448, + "grad_norm": 2.120857000350952, + "learning_rate": 0.0001412933917384956, + "loss": 1.9557, + "step": 12470 + }, + { + "epoch": 0.29390984880599125, + "grad_norm": 2.8175089359283447, + "learning_rate": 0.00014124629080118693, + "loss": 2.0522, + "step": 12480 + }, + { + "epoch": 0.2941453534925345, + "grad_norm": 3.126023292541504, + "learning_rate": 0.0001411991898638783, + "loss": 2.06, + "step": 12490 + }, + { + "epoch": 0.2943808581790778, + "grad_norm": 2.139693260192871, + "learning_rate": 0.00014115208892656963, + "loss": 2.2009, + "step": 12500 + }, + { + "epoch": 0.29461636286562104, + "grad_norm": 2.007183790206909, + "learning_rate": 0.00014110498798926098, + "loss": 2.1605, + "step": 12510 + }, + { + "epoch": 0.2948518675521643, + "grad_norm": 1.9379314184188843, + "learning_rate": 0.00014105788705195233, + "loss": 2.1381, + "step": 12520 + }, + { + "epoch": 0.29508737223870757, + "grad_norm": 2.096290349960327, + "learning_rate": 0.00014101078611464368, + "loss": 2.2027, + "step": 12530 + }, + { + "epoch": 0.29532287692525083, + "grad_norm": 2.452683925628662, + "learning_rate": 0.00014096368517733503, + "loss": 2.1766, + "step": 12540 + }, + { + "epoch": 0.2955583816117941, + "grad_norm": 2.5896694660186768, + "learning_rate": 0.00014091658424002638, + "loss": 2.1109, + "step": 12550 + }, + { + "epoch": 0.29579388629833736, + "grad_norm": 2.125363349914551, + "learning_rate": 0.00014086948330271773, + "loss": 2.3477, + "step": 12560 + }, + { + "epoch": 0.2960293909848806, + "grad_norm": 1.9147471189498901, + "learning_rate": 0.00014082238236540908, + "loss": 2.4074, + "step": 12570 + }, + { + "epoch": 0.2962648956714239, + "grad_norm": 2.466128349304199, + "learning_rate": 0.00014077528142810043, + "loss": 2.161, + "step": 12580 + }, + { + "epoch": 0.2965004003579671, + "grad_norm": 2.0209436416625977, + "learning_rate": 0.00014072818049079179, + "loss": 2.0503, + "step": 12590 + }, + { + "epoch": 0.29673590504451036, + "grad_norm": 2.1198041439056396, + "learning_rate": 0.00014068107955348314, + "loss": 2.166, + "step": 12600 + }, + { + "epoch": 0.2969714097310536, + "grad_norm": 1.7354202270507812, + "learning_rate": 0.00014063397861617446, + "loss": 2.0945, + "step": 12610 + }, + { + "epoch": 0.2972069144175969, + "grad_norm": 1.8669902086257935, + "learning_rate": 0.00014058687767886584, + "loss": 1.9682, + "step": 12620 + }, + { + "epoch": 0.29744241910414015, + "grad_norm": 2.1740376949310303, + "learning_rate": 0.00014053977674155716, + "loss": 2.1025, + "step": 12630 + }, + { + "epoch": 0.2976779237906834, + "grad_norm": 2.275982141494751, + "learning_rate": 0.0001404926758042485, + "loss": 2.2597, + "step": 12640 + }, + { + "epoch": 0.2979134284772267, + "grad_norm": 2.172459125518799, + "learning_rate": 0.00014044557486693986, + "loss": 2.1002, + "step": 12650 + }, + { + "epoch": 0.29814893316376995, + "grad_norm": 2.4488515853881836, + "learning_rate": 0.0001403984739296312, + "loss": 1.9405, + "step": 12660 + }, + { + "epoch": 0.2983844378503132, + "grad_norm": 2.1682851314544678, + "learning_rate": 0.00014035137299232256, + "loss": 2.1472, + "step": 12670 + }, + { + "epoch": 0.2986199425368565, + "grad_norm": 1.9250439405441284, + "learning_rate": 0.0001403042720550139, + "loss": 2.1314, + "step": 12680 + }, + { + "epoch": 0.29885544722339974, + "grad_norm": 3.761707305908203, + "learning_rate": 0.00014025717111770523, + "loss": 2.1523, + "step": 12690 + }, + { + "epoch": 0.299090951909943, + "grad_norm": 2.87754487991333, + "learning_rate": 0.0001402100701803966, + "loss": 1.9867, + "step": 12700 + }, + { + "epoch": 0.29932645659648627, + "grad_norm": 2.3228402137756348, + "learning_rate": 0.00014016296924308793, + "loss": 2.015, + "step": 12710 + }, + { + "epoch": 0.29956196128302953, + "grad_norm": 1.869566798210144, + "learning_rate": 0.00014011586830577928, + "loss": 2.2132, + "step": 12720 + }, + { + "epoch": 0.2997974659695728, + "grad_norm": 2.2412784099578857, + "learning_rate": 0.00014006876736847063, + "loss": 2.3368, + "step": 12730 + }, + { + "epoch": 0.30003297065611606, + "grad_norm": 1.911073923110962, + "learning_rate": 0.00014002166643116198, + "loss": 2.0563, + "step": 12740 + }, + { + "epoch": 0.3002684753426593, + "grad_norm": 2.371375799179077, + "learning_rate": 0.00013997456549385333, + "loss": 2.1771, + "step": 12750 + }, + { + "epoch": 0.3005039800292026, + "grad_norm": 2.312708854675293, + "learning_rate": 0.00013992746455654469, + "loss": 2.158, + "step": 12760 + }, + { + "epoch": 0.30073948471574585, + "grad_norm": 2.721620559692383, + "learning_rate": 0.000139880363619236, + "loss": 2.2242, + "step": 12770 + }, + { + "epoch": 0.3009749894022891, + "grad_norm": 2.6075215339660645, + "learning_rate": 0.00013983326268192739, + "loss": 2.029, + "step": 12780 + }, + { + "epoch": 0.3012104940888324, + "grad_norm": 2.001995086669922, + "learning_rate": 0.0001397861617446187, + "loss": 2.1364, + "step": 12790 + }, + { + "epoch": 0.30144599877537565, + "grad_norm": 1.792315125465393, + "learning_rate": 0.00013973906080731009, + "loss": 2.0791, + "step": 12800 + }, + { + "epoch": 0.3016815034619189, + "grad_norm": 2.6156537532806396, + "learning_rate": 0.0001396919598700014, + "loss": 2.131, + "step": 12810 + }, + { + "epoch": 0.3019170081484622, + "grad_norm": 2.277663230895996, + "learning_rate": 0.00013964485893269276, + "loss": 2.03, + "step": 12820 + }, + { + "epoch": 0.30215251283500544, + "grad_norm": 2.09621000289917, + "learning_rate": 0.0001395977579953841, + "loss": 2.2744, + "step": 12830 + }, + { + "epoch": 0.3023880175215487, + "grad_norm": 2.4095897674560547, + "learning_rate": 0.00013955065705807546, + "loss": 1.8946, + "step": 12840 + }, + { + "epoch": 0.30262352220809197, + "grad_norm": 2.2622735500335693, + "learning_rate": 0.0001395035561207668, + "loss": 2.1998, + "step": 12850 + }, + { + "epoch": 0.30285902689463523, + "grad_norm": 2.8324317932128906, + "learning_rate": 0.00013945645518345816, + "loss": 2.4248, + "step": 12860 + }, + { + "epoch": 0.30309453158117844, + "grad_norm": 2.2162587642669678, + "learning_rate": 0.0001394093542461495, + "loss": 2.0623, + "step": 12870 + }, + { + "epoch": 0.3033300362677217, + "grad_norm": 2.325887441635132, + "learning_rate": 0.00013936225330884086, + "loss": 2.2047, + "step": 12880 + }, + { + "epoch": 0.30356554095426497, + "grad_norm": 2.152468681335449, + "learning_rate": 0.0001393151523715322, + "loss": 2.1984, + "step": 12890 + }, + { + "epoch": 0.30380104564080823, + "grad_norm": 2.0017037391662598, + "learning_rate": 0.00013926805143422353, + "loss": 2.1574, + "step": 12900 + }, + { + "epoch": 0.3040365503273515, + "grad_norm": 2.003835439682007, + "learning_rate": 0.0001392209504969149, + "loss": 2.2094, + "step": 12910 + }, + { + "epoch": 0.30427205501389476, + "grad_norm": 2.4444828033447266, + "learning_rate": 0.00013917384955960623, + "loss": 2.0709, + "step": 12920 + }, + { + "epoch": 0.304507559700438, + "grad_norm": 2.4966955184936523, + "learning_rate": 0.00013912674862229758, + "loss": 2.0336, + "step": 12930 + }, + { + "epoch": 0.3047430643869813, + "grad_norm": 2.560464859008789, + "learning_rate": 0.00013907964768498894, + "loss": 2.0573, + "step": 12940 + }, + { + "epoch": 0.30497856907352455, + "grad_norm": 2.135472059249878, + "learning_rate": 0.00013903254674768029, + "loss": 2.0721, + "step": 12950 + }, + { + "epoch": 0.3052140737600678, + "grad_norm": 1.9246833324432373, + "learning_rate": 0.00013898544581037164, + "loss": 2.1015, + "step": 12960 + }, + { + "epoch": 0.3054495784466111, + "grad_norm": 2.2237212657928467, + "learning_rate": 0.00013893834487306299, + "loss": 1.9604, + "step": 12970 + }, + { + "epoch": 0.30568508313315434, + "grad_norm": 2.493617057800293, + "learning_rate": 0.00013889124393575434, + "loss": 2.278, + "step": 12980 + }, + { + "epoch": 0.3059205878196976, + "grad_norm": 3.1759326457977295, + "learning_rate": 0.00013884414299844569, + "loss": 2.1974, + "step": 12990 + }, + { + "epoch": 0.3061560925062409, + "grad_norm": 2.2904551029205322, + "learning_rate": 0.000138797042061137, + "loss": 2.0009, + "step": 13000 + }, + { + "epoch": 0.30639159719278414, + "grad_norm": 2.1130900382995605, + "learning_rate": 0.0001387499411238284, + "loss": 2.1561, + "step": 13010 + }, + { + "epoch": 0.3066271018793274, + "grad_norm": 2.156583786010742, + "learning_rate": 0.0001387028401865197, + "loss": 2.1903, + "step": 13020 + }, + { + "epoch": 0.30686260656587067, + "grad_norm": 2.4466586112976074, + "learning_rate": 0.00013865573924921106, + "loss": 2.1301, + "step": 13030 + }, + { + "epoch": 0.30709811125241393, + "grad_norm": 2.5151896476745605, + "learning_rate": 0.0001386086383119024, + "loss": 2.2717, + "step": 13040 + }, + { + "epoch": 0.3073336159389572, + "grad_norm": 2.063638210296631, + "learning_rate": 0.00013856153737459376, + "loss": 2.0034, + "step": 13050 + }, + { + "epoch": 0.30756912062550046, + "grad_norm": 2.0933945178985596, + "learning_rate": 0.0001385144364372851, + "loss": 2.024, + "step": 13060 + }, + { + "epoch": 0.3078046253120437, + "grad_norm": 2.3742902278900146, + "learning_rate": 0.00013846733549997646, + "loss": 1.9053, + "step": 13070 + }, + { + "epoch": 0.308040129998587, + "grad_norm": 2.2515957355499268, + "learning_rate": 0.00013842023456266778, + "loss": 2.139, + "step": 13080 + }, + { + "epoch": 0.30827563468513025, + "grad_norm": 2.533169984817505, + "learning_rate": 0.00013837313362535916, + "loss": 2.2177, + "step": 13090 + }, + { + "epoch": 0.3085111393716735, + "grad_norm": 2.0766286849975586, + "learning_rate": 0.00013832603268805048, + "loss": 2.0354, + "step": 13100 + }, + { + "epoch": 0.3087466440582168, + "grad_norm": 2.1932218074798584, + "learning_rate": 0.00013827893175074184, + "loss": 2.0149, + "step": 13110 + }, + { + "epoch": 0.30898214874476004, + "grad_norm": 2.7338597774505615, + "learning_rate": 0.00013823183081343319, + "loss": 2.0193, + "step": 13120 + }, + { + "epoch": 0.3092176534313033, + "grad_norm": 2.09232497215271, + "learning_rate": 0.00013818472987612454, + "loss": 2.1508, + "step": 13130 + }, + { + "epoch": 0.30945315811784657, + "grad_norm": 2.1449930667877197, + "learning_rate": 0.0001381376289388159, + "loss": 1.9638, + "step": 13140 + }, + { + "epoch": 0.3096886628043898, + "grad_norm": 2.363706350326538, + "learning_rate": 0.00013809052800150724, + "loss": 2.3092, + "step": 13150 + }, + { + "epoch": 0.30992416749093304, + "grad_norm": 2.566270589828491, + "learning_rate": 0.00013804342706419859, + "loss": 2.0075, + "step": 13160 + }, + { + "epoch": 0.3101596721774763, + "grad_norm": 2.4858806133270264, + "learning_rate": 0.00013799632612688994, + "loss": 2.1862, + "step": 13170 + }, + { + "epoch": 0.3103951768640196, + "grad_norm": 2.256286144256592, + "learning_rate": 0.0001379492251895813, + "loss": 1.9553, + "step": 13180 + }, + { + "epoch": 0.31063068155056284, + "grad_norm": 2.1771228313446045, + "learning_rate": 0.00013790212425227264, + "loss": 2.0341, + "step": 13190 + }, + { + "epoch": 0.3108661862371061, + "grad_norm": 2.82309889793396, + "learning_rate": 0.000137855023314964, + "loss": 2.3205, + "step": 13200 + }, + { + "epoch": 0.31110169092364937, + "grad_norm": 2.4577043056488037, + "learning_rate": 0.0001378079223776553, + "loss": 2.0357, + "step": 13210 + }, + { + "epoch": 0.31133719561019263, + "grad_norm": 2.040903091430664, + "learning_rate": 0.0001377608214403467, + "loss": 2.0322, + "step": 13220 + }, + { + "epoch": 0.3115727002967359, + "grad_norm": 2.3443174362182617, + "learning_rate": 0.000137713720503038, + "loss": 2.0852, + "step": 13230 + }, + { + "epoch": 0.31180820498327916, + "grad_norm": 2.266204357147217, + "learning_rate": 0.00013766661956572936, + "loss": 2.3327, + "step": 13240 + }, + { + "epoch": 0.3120437096698224, + "grad_norm": 2.3548195362091064, + "learning_rate": 0.0001376195186284207, + "loss": 2.1512, + "step": 13250 + }, + { + "epoch": 0.3122792143563657, + "grad_norm": 1.9592384099960327, + "learning_rate": 0.00013757241769111206, + "loss": 2.0161, + "step": 13260 + }, + { + "epoch": 0.31251471904290895, + "grad_norm": 2.306349754333496, + "learning_rate": 0.0001375253167538034, + "loss": 2.1769, + "step": 13270 + }, + { + "epoch": 0.3127502237294522, + "grad_norm": 2.76021146774292, + "learning_rate": 0.00013747821581649476, + "loss": 2.3517, + "step": 13280 + }, + { + "epoch": 0.3129857284159955, + "grad_norm": 3.3570475578308105, + "learning_rate": 0.00013743111487918609, + "loss": 2.1244, + "step": 13290 + }, + { + "epoch": 0.31322123310253874, + "grad_norm": 2.5103135108947754, + "learning_rate": 0.00013738401394187746, + "loss": 2.057, + "step": 13300 + }, + { + "epoch": 0.313456737789082, + "grad_norm": 2.2112882137298584, + "learning_rate": 0.00013733691300456879, + "loss": 1.9779, + "step": 13310 + }, + { + "epoch": 0.31369224247562527, + "grad_norm": 2.6416406631469727, + "learning_rate": 0.00013728981206726014, + "loss": 2.1394, + "step": 13320 + }, + { + "epoch": 0.31392774716216854, + "grad_norm": 1.99900484085083, + "learning_rate": 0.00013724271112995149, + "loss": 2.3333, + "step": 13330 + }, + { + "epoch": 0.3141632518487118, + "grad_norm": 2.0924324989318848, + "learning_rate": 0.00013719561019264284, + "loss": 2.0693, + "step": 13340 + }, + { + "epoch": 0.31439875653525506, + "grad_norm": 2.4140758514404297, + "learning_rate": 0.0001371485092553342, + "loss": 2.2516, + "step": 13350 + }, + { + "epoch": 0.31463426122179833, + "grad_norm": 2.1066081523895264, + "learning_rate": 0.00013710140831802554, + "loss": 2.1283, + "step": 13360 + }, + { + "epoch": 0.3148697659083416, + "grad_norm": 1.8612606525421143, + "learning_rate": 0.00013705430738071686, + "loss": 2.0633, + "step": 13370 + }, + { + "epoch": 0.31510527059488486, + "grad_norm": 2.5588574409484863, + "learning_rate": 0.00013700720644340824, + "loss": 2.1888, + "step": 13380 + }, + { + "epoch": 0.3153407752814281, + "grad_norm": 1.7056317329406738, + "learning_rate": 0.00013696010550609956, + "loss": 1.9666, + "step": 13390 + }, + { + "epoch": 0.3155762799679714, + "grad_norm": 1.8611016273498535, + "learning_rate": 0.00013691300456879094, + "loss": 2.182, + "step": 13400 + }, + { + "epoch": 0.31581178465451465, + "grad_norm": 2.0268967151641846, + "learning_rate": 0.0001368659036314823, + "loss": 1.8033, + "step": 13410 + }, + { + "epoch": 0.3160472893410579, + "grad_norm": 1.7649638652801514, + "learning_rate": 0.0001368188026941736, + "loss": 1.9693, + "step": 13420 + }, + { + "epoch": 0.3162827940276011, + "grad_norm": 3.1262307167053223, + "learning_rate": 0.000136771701756865, + "loss": 2.1137, + "step": 13430 + }, + { + "epoch": 0.3165182987141444, + "grad_norm": 2.176802158355713, + "learning_rate": 0.0001367246008195563, + "loss": 2.0533, + "step": 13440 + }, + { + "epoch": 0.31675380340068765, + "grad_norm": 2.7924866676330566, + "learning_rate": 0.00013667749988224766, + "loss": 1.9473, + "step": 13450 + }, + { + "epoch": 0.3169893080872309, + "grad_norm": 2.8177592754364014, + "learning_rate": 0.000136630398944939, + "loss": 2.0061, + "step": 13460 + }, + { + "epoch": 0.3172248127737742, + "grad_norm": 3.690781354904175, + "learning_rate": 0.00013658329800763036, + "loss": 2.2083, + "step": 13470 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 2.0106966495513916, + "learning_rate": 0.0001365361970703217, + "loss": 1.9977, + "step": 13480 + }, + { + "epoch": 0.3176958221468607, + "grad_norm": 2.4227206707000732, + "learning_rate": 0.00013648909613301306, + "loss": 2.0599, + "step": 13490 + }, + { + "epoch": 0.31793132683340397, + "grad_norm": 2.597426176071167, + "learning_rate": 0.00013644199519570439, + "loss": 2.0478, + "step": 13500 + }, + { + "epoch": 0.31816683151994724, + "grad_norm": 2.364302158355713, + "learning_rate": 0.00013639489425839576, + "loss": 2.1535, + "step": 13510 + }, + { + "epoch": 0.3184023362064905, + "grad_norm": 1.8173555135726929, + "learning_rate": 0.0001363477933210871, + "loss": 2.0284, + "step": 13520 + }, + { + "epoch": 0.31863784089303376, + "grad_norm": 2.0568530559539795, + "learning_rate": 0.00013630069238377844, + "loss": 1.9601, + "step": 13530 + }, + { + "epoch": 0.318873345579577, + "grad_norm": 2.300703525543213, + "learning_rate": 0.0001362535914464698, + "loss": 2.1593, + "step": 13540 + }, + { + "epoch": 0.3191088502661203, + "grad_norm": 2.1626505851745605, + "learning_rate": 0.00013620649050916114, + "loss": 2.0686, + "step": 13550 + }, + { + "epoch": 0.31934435495266356, + "grad_norm": 2.2925024032592773, + "learning_rate": 0.0001361593895718525, + "loss": 2.0937, + "step": 13560 + }, + { + "epoch": 0.3195798596392068, + "grad_norm": 2.871372938156128, + "learning_rate": 0.00013611228863454384, + "loss": 2.2753, + "step": 13570 + }, + { + "epoch": 0.3198153643257501, + "grad_norm": 2.3106420040130615, + "learning_rate": 0.0001360651876972352, + "loss": 2.2251, + "step": 13580 + }, + { + "epoch": 0.32005086901229335, + "grad_norm": 2.192394256591797, + "learning_rate": 0.00013601808675992654, + "loss": 2.1868, + "step": 13590 + }, + { + "epoch": 0.3202863736988366, + "grad_norm": 2.862781286239624, + "learning_rate": 0.00013597098582261786, + "loss": 2.1841, + "step": 13600 + }, + { + "epoch": 0.3205218783853799, + "grad_norm": 2.3184902667999268, + "learning_rate": 0.00013592388488530924, + "loss": 2.3279, + "step": 13610 + }, + { + "epoch": 0.32075738307192314, + "grad_norm": 2.3771584033966064, + "learning_rate": 0.00013587678394800056, + "loss": 2.2126, + "step": 13620 + }, + { + "epoch": 0.3209928877584664, + "grad_norm": 1.6889021396636963, + "learning_rate": 0.0001358296830106919, + "loss": 2.0039, + "step": 13630 + }, + { + "epoch": 0.32122839244500967, + "grad_norm": 2.208984613418579, + "learning_rate": 0.00013578258207338326, + "loss": 1.9929, + "step": 13640 + }, + { + "epoch": 0.32146389713155293, + "grad_norm": 2.5552730560302734, + "learning_rate": 0.0001357354811360746, + "loss": 2.0772, + "step": 13650 + }, + { + "epoch": 0.3216994018180962, + "grad_norm": 2.576536178588867, + "learning_rate": 0.00013568838019876596, + "loss": 2.0566, + "step": 13660 + }, + { + "epoch": 0.32193490650463946, + "grad_norm": 3.376354694366455, + "learning_rate": 0.0001356412792614573, + "loss": 2.0092, + "step": 13670 + }, + { + "epoch": 0.3221704111911827, + "grad_norm": 2.312920331954956, + "learning_rate": 0.00013559417832414864, + "loss": 2.1, + "step": 13680 + }, + { + "epoch": 0.322405915877726, + "grad_norm": 2.1818175315856934, + "learning_rate": 0.00013554707738684001, + "loss": 1.9808, + "step": 13690 + }, + { + "epoch": 0.32264142056426925, + "grad_norm": 2.547168493270874, + "learning_rate": 0.00013549997644953136, + "loss": 2.1154, + "step": 13700 + }, + { + "epoch": 0.3228769252508125, + "grad_norm": 1.7971171140670776, + "learning_rate": 0.0001354528755122227, + "loss": 2.126, + "step": 13710 + }, + { + "epoch": 0.3231124299373557, + "grad_norm": 2.0792088508605957, + "learning_rate": 0.00013540577457491406, + "loss": 1.9622, + "step": 13720 + }, + { + "epoch": 0.323347934623899, + "grad_norm": 1.9572780132293701, + "learning_rate": 0.0001353586736376054, + "loss": 1.9715, + "step": 13730 + }, + { + "epoch": 0.32358343931044226, + "grad_norm": 2.749486207962036, + "learning_rate": 0.00013531157270029676, + "loss": 2.2712, + "step": 13740 + }, + { + "epoch": 0.3238189439969855, + "grad_norm": 2.157297372817993, + "learning_rate": 0.0001352644717629881, + "loss": 2.2583, + "step": 13750 + }, + { + "epoch": 0.3240544486835288, + "grad_norm": 2.2807443141937256, + "learning_rate": 0.00013521737082567944, + "loss": 2.1891, + "step": 13760 + }, + { + "epoch": 0.32428995337007205, + "grad_norm": 1.8038594722747803, + "learning_rate": 0.0001351702698883708, + "loss": 1.9942, + "step": 13770 + }, + { + "epoch": 0.3245254580566153, + "grad_norm": 2.015502691268921, + "learning_rate": 0.00013512316895106214, + "loss": 1.9229, + "step": 13780 + }, + { + "epoch": 0.3247609627431586, + "grad_norm": 2.0219409465789795, + "learning_rate": 0.0001350760680137535, + "loss": 2.0995, + "step": 13790 + }, + { + "epoch": 0.32499646742970184, + "grad_norm": 2.005467176437378, + "learning_rate": 0.00013502896707644484, + "loss": 2.196, + "step": 13800 + }, + { + "epoch": 0.3252319721162451, + "grad_norm": 2.505066394805908, + "learning_rate": 0.00013498186613913616, + "loss": 2.0781, + "step": 13810 + }, + { + "epoch": 0.32546747680278837, + "grad_norm": 2.1414785385131836, + "learning_rate": 0.00013493476520182754, + "loss": 2.0771, + "step": 13820 + }, + { + "epoch": 0.32570298148933163, + "grad_norm": 2.4025304317474365, + "learning_rate": 0.00013488766426451886, + "loss": 2.2001, + "step": 13830 + }, + { + "epoch": 0.3259384861758749, + "grad_norm": 2.1574246883392334, + "learning_rate": 0.0001348405633272102, + "loss": 2.0484, + "step": 13840 + }, + { + "epoch": 0.32617399086241816, + "grad_norm": 2.238400936126709, + "learning_rate": 0.00013479346238990156, + "loss": 2.0084, + "step": 13850 + }, + { + "epoch": 0.3264094955489614, + "grad_norm": 1.7974083423614502, + "learning_rate": 0.00013474636145259291, + "loss": 2.1113, + "step": 13860 + }, + { + "epoch": 0.3266450002355047, + "grad_norm": 2.17846941947937, + "learning_rate": 0.00013469926051528426, + "loss": 2.3447, + "step": 13870 + }, + { + "epoch": 0.32688050492204795, + "grad_norm": 2.0620272159576416, + "learning_rate": 0.00013465215957797561, + "loss": 2.1059, + "step": 13880 + }, + { + "epoch": 0.3271160096085912, + "grad_norm": 2.4582271575927734, + "learning_rate": 0.00013460505864066694, + "loss": 1.8575, + "step": 13890 + }, + { + "epoch": 0.3273515142951345, + "grad_norm": 2.7777035236358643, + "learning_rate": 0.00013455795770335831, + "loss": 2.1932, + "step": 13900 + }, + { + "epoch": 0.32758701898167775, + "grad_norm": 1.6070411205291748, + "learning_rate": 0.00013451085676604964, + "loss": 1.9579, + "step": 13910 + }, + { + "epoch": 0.327822523668221, + "grad_norm": 2.466210126876831, + "learning_rate": 0.000134463755828741, + "loss": 1.9896, + "step": 13920 + }, + { + "epoch": 0.3280580283547643, + "grad_norm": 1.9184893369674683, + "learning_rate": 0.00013441665489143234, + "loss": 2.2178, + "step": 13930 + }, + { + "epoch": 0.32829353304130754, + "grad_norm": 2.1110756397247314, + "learning_rate": 0.0001343695539541237, + "loss": 2.1961, + "step": 13940 + }, + { + "epoch": 0.3285290377278508, + "grad_norm": 2.830181121826172, + "learning_rate": 0.00013432245301681504, + "loss": 2.1987, + "step": 13950 + }, + { + "epoch": 0.32876454241439407, + "grad_norm": 2.1129209995269775, + "learning_rate": 0.0001342753520795064, + "loss": 2.0927, + "step": 13960 + }, + { + "epoch": 0.32900004710093733, + "grad_norm": 2.1881232261657715, + "learning_rate": 0.0001342282511421977, + "loss": 1.8427, + "step": 13970 + }, + { + "epoch": 0.3292355517874806, + "grad_norm": 2.1353397369384766, + "learning_rate": 0.0001341811502048891, + "loss": 1.9289, + "step": 13980 + }, + { + "epoch": 0.32947105647402386, + "grad_norm": 3.527761459350586, + "learning_rate": 0.00013413404926758044, + "loss": 2.2088, + "step": 13990 + }, + { + "epoch": 0.32970656116056707, + "grad_norm": 2.0877022743225098, + "learning_rate": 0.0001340869483302718, + "loss": 2.1233, + "step": 14000 + }, + { + "epoch": 0.32994206584711033, + "grad_norm": 1.560817837715149, + "learning_rate": 0.00013403984739296314, + "loss": 2.0715, + "step": 14010 + }, + { + "epoch": 0.3301775705336536, + "grad_norm": 2.216294765472412, + "learning_rate": 0.00013399274645565446, + "loss": 2.0866, + "step": 14020 + }, + { + "epoch": 0.33041307522019686, + "grad_norm": 2.3411684036254883, + "learning_rate": 0.00013394564551834584, + "loss": 2.1824, + "step": 14030 + }, + { + "epoch": 0.3306485799067401, + "grad_norm": 2.613354206085205, + "learning_rate": 0.00013389854458103716, + "loss": 2.3151, + "step": 14040 + }, + { + "epoch": 0.3308840845932834, + "grad_norm": 2.107166290283203, + "learning_rate": 0.00013385144364372851, + "loss": 2.091, + "step": 14050 + }, + { + "epoch": 0.33111958927982665, + "grad_norm": 2.1655850410461426, + "learning_rate": 0.00013380434270641986, + "loss": 2.2143, + "step": 14060 + }, + { + "epoch": 0.3313550939663699, + "grad_norm": 1.9823925495147705, + "learning_rate": 0.00013375724176911121, + "loss": 2.1066, + "step": 14070 + }, + { + "epoch": 0.3315905986529132, + "grad_norm": 2.26995849609375, + "learning_rate": 0.00013371014083180256, + "loss": 1.9626, + "step": 14080 + }, + { + "epoch": 0.33182610333945645, + "grad_norm": 1.9416958093643188, + "learning_rate": 0.00013366303989449392, + "loss": 2.2461, + "step": 14090 + }, + { + "epoch": 0.3320616080259997, + "grad_norm": 2.685157060623169, + "learning_rate": 0.00013361593895718524, + "loss": 1.9703, + "step": 14100 + }, + { + "epoch": 0.332297112712543, + "grad_norm": 2.3456876277923584, + "learning_rate": 0.00013356883801987662, + "loss": 2.2025, + "step": 14110 + }, + { + "epoch": 0.33253261739908624, + "grad_norm": 2.766294002532959, + "learning_rate": 0.00013352173708256794, + "loss": 2.2302, + "step": 14120 + }, + { + "epoch": 0.3327681220856295, + "grad_norm": 2.4374420642852783, + "learning_rate": 0.0001334746361452593, + "loss": 2.1317, + "step": 14130 + }, + { + "epoch": 0.33300362677217277, + "grad_norm": 1.8526066541671753, + "learning_rate": 0.00013342753520795064, + "loss": 2.0103, + "step": 14140 + }, + { + "epoch": 0.33323913145871603, + "grad_norm": 2.461101770401001, + "learning_rate": 0.000133380434270642, + "loss": 2.0952, + "step": 14150 + }, + { + "epoch": 0.3334746361452593, + "grad_norm": 1.950975775718689, + "learning_rate": 0.00013333333333333334, + "loss": 2.0181, + "step": 14160 + }, + { + "epoch": 0.33371014083180256, + "grad_norm": 2.2419090270996094, + "learning_rate": 0.0001332862323960247, + "loss": 2.226, + "step": 14170 + }, + { + "epoch": 0.3339456455183458, + "grad_norm": 2.077028751373291, + "learning_rate": 0.000133239131458716, + "loss": 2.0689, + "step": 14180 + }, + { + "epoch": 0.3341811502048891, + "grad_norm": 1.9552679061889648, + "learning_rate": 0.0001331920305214074, + "loss": 2.219, + "step": 14190 + }, + { + "epoch": 0.33441665489143235, + "grad_norm": 1.9980392456054688, + "learning_rate": 0.00013314492958409871, + "loss": 2.1455, + "step": 14200 + }, + { + "epoch": 0.3346521595779756, + "grad_norm": 2.1066527366638184, + "learning_rate": 0.0001330978286467901, + "loss": 2.413, + "step": 14210 + }, + { + "epoch": 0.3348876642645189, + "grad_norm": 2.242450475692749, + "learning_rate": 0.00013305072770948141, + "loss": 2.0, + "step": 14220 + }, + { + "epoch": 0.33512316895106214, + "grad_norm": 2.308070659637451, + "learning_rate": 0.00013300362677217276, + "loss": 1.9766, + "step": 14230 + }, + { + "epoch": 0.3353586736376054, + "grad_norm": 3.0701427459716797, + "learning_rate": 0.00013295652583486411, + "loss": 2.0551, + "step": 14240 + }, + { + "epoch": 0.3355941783241487, + "grad_norm": 2.204287052154541, + "learning_rate": 0.00013290942489755546, + "loss": 2.1141, + "step": 14250 + }, + { + "epoch": 0.33582968301069194, + "grad_norm": 2.2963192462921143, + "learning_rate": 0.00013286232396024682, + "loss": 1.8639, + "step": 14260 + }, + { + "epoch": 0.3360651876972352, + "grad_norm": 1.9334232807159424, + "learning_rate": 0.00013281522302293817, + "loss": 1.9872, + "step": 14270 + }, + { + "epoch": 0.3363006923837784, + "grad_norm": 2.3119375705718994, + "learning_rate": 0.00013276812208562952, + "loss": 2.2071, + "step": 14280 + }, + { + "epoch": 0.3365361970703217, + "grad_norm": 1.685998797416687, + "learning_rate": 0.00013272102114832087, + "loss": 2.2305, + "step": 14290 + }, + { + "epoch": 0.33677170175686494, + "grad_norm": 1.8673522472381592, + "learning_rate": 0.00013267392021101222, + "loss": 2.0198, + "step": 14300 + }, + { + "epoch": 0.3370072064434082, + "grad_norm": 2.057691812515259, + "learning_rate": 0.00013262681927370354, + "loss": 2.1267, + "step": 14310 + }, + { + "epoch": 0.33724271112995147, + "grad_norm": 2.4289650917053223, + "learning_rate": 0.00013257971833639492, + "loss": 2.1753, + "step": 14320 + }, + { + "epoch": 0.33747821581649473, + "grad_norm": 2.0615077018737793, + "learning_rate": 0.00013253261739908624, + "loss": 2.1745, + "step": 14330 + }, + { + "epoch": 0.337713720503038, + "grad_norm": 1.9922257661819458, + "learning_rate": 0.0001324855164617776, + "loss": 2.041, + "step": 14340 + }, + { + "epoch": 0.33794922518958126, + "grad_norm": 2.150548219680786, + "learning_rate": 0.00013243841552446894, + "loss": 2.054, + "step": 14350 + }, + { + "epoch": 0.3381847298761245, + "grad_norm": 1.9815877676010132, + "learning_rate": 0.0001323913145871603, + "loss": 2.2345, + "step": 14360 + }, + { + "epoch": 0.3384202345626678, + "grad_norm": 3.1625378131866455, + "learning_rate": 0.00013234421364985164, + "loss": 2.0449, + "step": 14370 + }, + { + "epoch": 0.33865573924921105, + "grad_norm": 2.5849571228027344, + "learning_rate": 0.000132297112712543, + "loss": 2.1497, + "step": 14380 + }, + { + "epoch": 0.3388912439357543, + "grad_norm": 3.0751826763153076, + "learning_rate": 0.00013225001177523434, + "loss": 2.0963, + "step": 14390 + }, + { + "epoch": 0.3391267486222976, + "grad_norm": 1.572991967201233, + "learning_rate": 0.0001322029108379257, + "loss": 1.9016, + "step": 14400 + }, + { + "epoch": 0.33936225330884084, + "grad_norm": 2.5276403427124023, + "learning_rate": 0.00013215580990061701, + "loss": 2.3332, + "step": 14410 + }, + { + "epoch": 0.3395977579953841, + "grad_norm": 1.7656186819076538, + "learning_rate": 0.0001321087089633084, + "loss": 2.0629, + "step": 14420 + }, + { + "epoch": 0.3398332626819274, + "grad_norm": 1.8778975009918213, + "learning_rate": 0.00013206160802599972, + "loss": 2.2139, + "step": 14430 + }, + { + "epoch": 0.34006876736847064, + "grad_norm": 2.696181535720825, + "learning_rate": 0.00013201450708869107, + "loss": 1.9954, + "step": 14440 + }, + { + "epoch": 0.3403042720550139, + "grad_norm": 1.6417945623397827, + "learning_rate": 0.00013196740615138242, + "loss": 1.9359, + "step": 14450 + }, + { + "epoch": 0.34053977674155717, + "grad_norm": 2.1908366680145264, + "learning_rate": 0.00013192030521407377, + "loss": 1.9072, + "step": 14460 + }, + { + "epoch": 0.34077528142810043, + "grad_norm": 2.7889277935028076, + "learning_rate": 0.00013187320427676512, + "loss": 2.0709, + "step": 14470 + }, + { + "epoch": 0.3410107861146437, + "grad_norm": 2.474809408187866, + "learning_rate": 0.00013182610333945647, + "loss": 2.155, + "step": 14480 + }, + { + "epoch": 0.34124629080118696, + "grad_norm": 2.2163426876068115, + "learning_rate": 0.0001317790024021478, + "loss": 2.1198, + "step": 14490 + }, + { + "epoch": 0.3414817954877302, + "grad_norm": 2.4669189453125, + "learning_rate": 0.00013173190146483917, + "loss": 2.1101, + "step": 14500 + }, + { + "epoch": 0.3417173001742735, + "grad_norm": 2.4319045543670654, + "learning_rate": 0.0001316848005275305, + "loss": 2.208, + "step": 14510 + }, + { + "epoch": 0.34195280486081675, + "grad_norm": 2.244265556335449, + "learning_rate": 0.00013163769959022184, + "loss": 2.0715, + "step": 14520 + }, + { + "epoch": 0.34218830954736, + "grad_norm": 2.1139960289001465, + "learning_rate": 0.0001315905986529132, + "loss": 1.9087, + "step": 14530 + }, + { + "epoch": 0.3424238142339033, + "grad_norm": 2.030888080596924, + "learning_rate": 0.00013154349771560454, + "loss": 2.0678, + "step": 14540 + }, + { + "epoch": 0.34265931892044654, + "grad_norm": 3.3537919521331787, + "learning_rate": 0.00013149639677829592, + "loss": 2.1527, + "step": 14550 + }, + { + "epoch": 0.34289482360698975, + "grad_norm": 2.137396812438965, + "learning_rate": 0.00013144929584098724, + "loss": 2.2981, + "step": 14560 + }, + { + "epoch": 0.343130328293533, + "grad_norm": 3.0466394424438477, + "learning_rate": 0.0001314021949036786, + "loss": 2.1918, + "step": 14570 + }, + { + "epoch": 0.3433658329800763, + "grad_norm": 2.6642491817474365, + "learning_rate": 0.00013135509396636994, + "loss": 2.2627, + "step": 14580 + }, + { + "epoch": 0.34360133766661954, + "grad_norm": 1.9378482103347778, + "learning_rate": 0.0001313079930290613, + "loss": 2.0302, + "step": 14590 + }, + { + "epoch": 0.3438368423531628, + "grad_norm": 2.130795955657959, + "learning_rate": 0.00013126089209175264, + "loss": 2.0724, + "step": 14600 + }, + { + "epoch": 0.3440723470397061, + "grad_norm": 3.243426561355591, + "learning_rate": 0.000131213791154444, + "loss": 2.1017, + "step": 14610 + }, + { + "epoch": 0.34430785172624934, + "grad_norm": 2.094165086746216, + "learning_rate": 0.00013116669021713532, + "loss": 2.2293, + "step": 14620 + }, + { + "epoch": 0.3445433564127926, + "grad_norm": 2.4611902236938477, + "learning_rate": 0.0001311195892798267, + "loss": 2.0473, + "step": 14630 + }, + { + "epoch": 0.34477886109933586, + "grad_norm": 2.694459915161133, + "learning_rate": 0.00013107248834251802, + "loss": 2.0456, + "step": 14640 + }, + { + "epoch": 0.34501436578587913, + "grad_norm": 1.9303042888641357, + "learning_rate": 0.00013102538740520937, + "loss": 2.0716, + "step": 14650 + }, + { + "epoch": 0.3452498704724224, + "grad_norm": 2.9043257236480713, + "learning_rate": 0.00013097828646790072, + "loss": 2.1234, + "step": 14660 + }, + { + "epoch": 0.34548537515896566, + "grad_norm": 2.1518394947052, + "learning_rate": 0.00013093118553059207, + "loss": 2.1043, + "step": 14670 + }, + { + "epoch": 0.3457208798455089, + "grad_norm": 2.1524922847747803, + "learning_rate": 0.00013088408459328342, + "loss": 2.2017, + "step": 14680 + }, + { + "epoch": 0.3459563845320522, + "grad_norm": 3.270179510116577, + "learning_rate": 0.00013083698365597477, + "loss": 2.0904, + "step": 14690 + }, + { + "epoch": 0.34619188921859545, + "grad_norm": 2.349952220916748, + "learning_rate": 0.0001307898827186661, + "loss": 1.9607, + "step": 14700 + }, + { + "epoch": 0.3464273939051387, + "grad_norm": 2.087453842163086, + "learning_rate": 0.00013074278178135747, + "loss": 2.0567, + "step": 14710 + }, + { + "epoch": 0.346662898591682, + "grad_norm": 1.739876389503479, + "learning_rate": 0.0001306956808440488, + "loss": 1.9899, + "step": 14720 + }, + { + "epoch": 0.34689840327822524, + "grad_norm": 2.5640616416931152, + "learning_rate": 0.00013064857990674014, + "loss": 2.126, + "step": 14730 + }, + { + "epoch": 0.3471339079647685, + "grad_norm": 2.1961350440979004, + "learning_rate": 0.0001306014789694315, + "loss": 2.2033, + "step": 14740 + }, + { + "epoch": 0.34736941265131177, + "grad_norm": 2.4550042152404785, + "learning_rate": 0.00013055437803212284, + "loss": 2.2094, + "step": 14750 + }, + { + "epoch": 0.34760491733785503, + "grad_norm": 2.9043076038360596, + "learning_rate": 0.0001305072770948142, + "loss": 2.1208, + "step": 14760 + }, + { + "epoch": 0.3478404220243983, + "grad_norm": 2.2261176109313965, + "learning_rate": 0.00013046017615750554, + "loss": 2.3374, + "step": 14770 + }, + { + "epoch": 0.34807592671094156, + "grad_norm": 3.6236088275909424, + "learning_rate": 0.00013041307522019687, + "loss": 2.1553, + "step": 14780 + }, + { + "epoch": 0.3483114313974848, + "grad_norm": 1.7490025758743286, + "learning_rate": 0.00013036597428288824, + "loss": 2.0613, + "step": 14790 + }, + { + "epoch": 0.3485469360840281, + "grad_norm": 1.8174374103546143, + "learning_rate": 0.00013031887334557957, + "loss": 1.9779, + "step": 14800 + }, + { + "epoch": 0.34878244077057136, + "grad_norm": 2.0383830070495605, + "learning_rate": 0.00013027177240827094, + "loss": 2.0438, + "step": 14810 + }, + { + "epoch": 0.3490179454571146, + "grad_norm": 2.6745285987854004, + "learning_rate": 0.0001302246714709623, + "loss": 2.0689, + "step": 14820 + }, + { + "epoch": 0.3492534501436579, + "grad_norm": 2.264727830886841, + "learning_rate": 0.00013017757053365362, + "loss": 2.3443, + "step": 14830 + }, + { + "epoch": 0.34948895483020115, + "grad_norm": 1.7950100898742676, + "learning_rate": 0.000130130469596345, + "loss": 2.101, + "step": 14840 + }, + { + "epoch": 0.34972445951674436, + "grad_norm": 2.4111218452453613, + "learning_rate": 0.00013008336865903632, + "loss": 1.9223, + "step": 14850 + }, + { + "epoch": 0.3499599642032876, + "grad_norm": 2.078031539916992, + "learning_rate": 0.00013003626772172767, + "loss": 2.1652, + "step": 14860 + }, + { + "epoch": 0.3501954688898309, + "grad_norm": 2.400324583053589, + "learning_rate": 0.00012998916678441902, + "loss": 1.9705, + "step": 14870 + }, + { + "epoch": 0.35043097357637415, + "grad_norm": 1.9067310094833374, + "learning_rate": 0.00012994206584711037, + "loss": 2.0825, + "step": 14880 + }, + { + "epoch": 0.3506664782629174, + "grad_norm": 2.236149549484253, + "learning_rate": 0.00012989496490980172, + "loss": 2.2749, + "step": 14890 + }, + { + "epoch": 0.3509019829494607, + "grad_norm": 1.9904561042785645, + "learning_rate": 0.00012984786397249307, + "loss": 2.0971, + "step": 14900 + }, + { + "epoch": 0.35113748763600394, + "grad_norm": 2.2129764556884766, + "learning_rate": 0.0001298007630351844, + "loss": 2.2559, + "step": 14910 + }, + { + "epoch": 0.3513729923225472, + "grad_norm": 2.020630121231079, + "learning_rate": 0.00012975366209787577, + "loss": 1.968, + "step": 14920 + }, + { + "epoch": 0.35160849700909047, + "grad_norm": 3.8411476612091064, + "learning_rate": 0.0001297065611605671, + "loss": 2.0724, + "step": 14930 + }, + { + "epoch": 0.35184400169563373, + "grad_norm": 4.267243385314941, + "learning_rate": 0.00012965946022325844, + "loss": 1.9732, + "step": 14940 + }, + { + "epoch": 0.352079506382177, + "grad_norm": 2.2827847003936768, + "learning_rate": 0.0001296123592859498, + "loss": 2.3295, + "step": 14950 + }, + { + "epoch": 0.35231501106872026, + "grad_norm": 2.276663064956665, + "learning_rate": 0.00012956525834864114, + "loss": 2.2312, + "step": 14960 + }, + { + "epoch": 0.3525505157552635, + "grad_norm": 2.2448136806488037, + "learning_rate": 0.0001295181574113325, + "loss": 2.0823, + "step": 14970 + }, + { + "epoch": 0.3527860204418068, + "grad_norm": 2.307004690170288, + "learning_rate": 0.00012947105647402384, + "loss": 1.9903, + "step": 14980 + }, + { + "epoch": 0.35302152512835006, + "grad_norm": 2.2261812686920166, + "learning_rate": 0.0001294239555367152, + "loss": 2.0342, + "step": 14990 + }, + { + "epoch": 0.3532570298148933, + "grad_norm": 2.9633865356445312, + "learning_rate": 0.00012937685459940654, + "loss": 1.9893, + "step": 15000 + }, + { + "epoch": 0.3534925345014366, + "grad_norm": 2.297091007232666, + "learning_rate": 0.00012932975366209787, + "loss": 2.258, + "step": 15010 + }, + { + "epoch": 0.35372803918797985, + "grad_norm": 3.055051803588867, + "learning_rate": 0.00012928265272478924, + "loss": 2.0553, + "step": 15020 + }, + { + "epoch": 0.3539635438745231, + "grad_norm": 1.8271070718765259, + "learning_rate": 0.00012923555178748057, + "loss": 2.1569, + "step": 15030 + }, + { + "epoch": 0.3541990485610664, + "grad_norm": 2.4504764080047607, + "learning_rate": 0.00012918845085017192, + "loss": 2.1445, + "step": 15040 + }, + { + "epoch": 0.35443455324760964, + "grad_norm": 3.338831901550293, + "learning_rate": 0.00012914134991286327, + "loss": 2.151, + "step": 15050 + }, + { + "epoch": 0.3546700579341529, + "grad_norm": 1.9933847188949585, + "learning_rate": 0.00012909424897555462, + "loss": 1.9229, + "step": 15060 + }, + { + "epoch": 0.35490556262069617, + "grad_norm": 1.8507821559906006, + "learning_rate": 0.00012904714803824597, + "loss": 1.8161, + "step": 15070 + }, + { + "epoch": 0.35514106730723943, + "grad_norm": 2.110309362411499, + "learning_rate": 0.00012900004710093732, + "loss": 2.107, + "step": 15080 + }, + { + "epoch": 0.3553765719937827, + "grad_norm": 2.447514057159424, + "learning_rate": 0.00012895294616362864, + "loss": 2.0256, + "step": 15090 + }, + { + "epoch": 0.35561207668032596, + "grad_norm": 2.630833864212036, + "learning_rate": 0.00012890584522632002, + "loss": 2.2369, + "step": 15100 + }, + { + "epoch": 0.3558475813668692, + "grad_norm": 2.210677146911621, + "learning_rate": 0.00012885874428901137, + "loss": 2.3155, + "step": 15110 + }, + { + "epoch": 0.3560830860534125, + "grad_norm": 2.5040535926818848, + "learning_rate": 0.0001288116433517027, + "loss": 2.1744, + "step": 15120 + }, + { + "epoch": 0.3563185907399557, + "grad_norm": 2.41166615486145, + "learning_rate": 0.00012876454241439407, + "loss": 1.9641, + "step": 15130 + }, + { + "epoch": 0.35655409542649896, + "grad_norm": 2.1737725734710693, + "learning_rate": 0.0001287174414770854, + "loss": 2.0927, + "step": 15140 + }, + { + "epoch": 0.3567896001130422, + "grad_norm": 2.3262150287628174, + "learning_rate": 0.00012867034053977677, + "loss": 1.8485, + "step": 15150 + }, + { + "epoch": 0.3570251047995855, + "grad_norm": 2.214301586151123, + "learning_rate": 0.0001286232396024681, + "loss": 2.0541, + "step": 15160 + }, + { + "epoch": 0.35726060948612876, + "grad_norm": 3.0727179050445557, + "learning_rate": 0.0001285808487588903, + "loss": 2.2335, + "step": 15170 + }, + { + "epoch": 0.357496114172672, + "grad_norm": 1.55754816532135, + "learning_rate": 0.00012853374782158166, + "loss": 1.7324, + "step": 15180 + }, + { + "epoch": 0.3577316188592153, + "grad_norm": 2.219801425933838, + "learning_rate": 0.000128486646884273, + "loss": 2.1554, + "step": 15190 + }, + { + "epoch": 0.35796712354575855, + "grad_norm": 2.0826430320739746, + "learning_rate": 0.00012843954594696433, + "loss": 2.0151, + "step": 15200 + }, + { + "epoch": 0.3582026282323018, + "grad_norm": 2.083486557006836, + "learning_rate": 0.0001283924450096557, + "loss": 2.1411, + "step": 15210 + }, + { + "epoch": 0.3584381329188451, + "grad_norm": 2.3072783946990967, + "learning_rate": 0.00012834534407234704, + "loss": 2.006, + "step": 15220 + }, + { + "epoch": 0.35867363760538834, + "grad_norm": 2.440856456756592, + "learning_rate": 0.00012829824313503839, + "loss": 2.0854, + "step": 15230 + }, + { + "epoch": 0.3589091422919316, + "grad_norm": 2.845301866531372, + "learning_rate": 0.00012825114219772974, + "loss": 2.3434, + "step": 15240 + }, + { + "epoch": 0.35914464697847487, + "grad_norm": 2.953895330429077, + "learning_rate": 0.00012820404126042109, + "loss": 2.0988, + "step": 15250 + }, + { + "epoch": 0.35938015166501813, + "grad_norm": 2.4946932792663574, + "learning_rate": 0.00012815694032311244, + "loss": 2.0624, + "step": 15260 + }, + { + "epoch": 0.3596156563515614, + "grad_norm": 2.2791571617126465, + "learning_rate": 0.00012810983938580379, + "loss": 2.0612, + "step": 15270 + }, + { + "epoch": 0.35985116103810466, + "grad_norm": 2.0594568252563477, + "learning_rate": 0.00012806273844849514, + "loss": 2.2905, + "step": 15280 + }, + { + "epoch": 0.3600866657246479, + "grad_norm": 1.937827467918396, + "learning_rate": 0.0001280156375111865, + "loss": 2.2477, + "step": 15290 + }, + { + "epoch": 0.3603221704111912, + "grad_norm": 2.1612651348114014, + "learning_rate": 0.0001279685365738778, + "loss": 2.064, + "step": 15300 + }, + { + "epoch": 0.36055767509773445, + "grad_norm": 2.1727564334869385, + "learning_rate": 0.0001279214356365692, + "loss": 2.1897, + "step": 15310 + }, + { + "epoch": 0.3607931797842777, + "grad_norm": 1.9623788595199585, + "learning_rate": 0.0001278743346992605, + "loss": 2.0795, + "step": 15320 + }, + { + "epoch": 0.361028684470821, + "grad_norm": 2.5158348083496094, + "learning_rate": 0.00012782723376195186, + "loss": 2.3262, + "step": 15330 + }, + { + "epoch": 0.36126418915736425, + "grad_norm": 1.673125982284546, + "learning_rate": 0.0001277801328246432, + "loss": 2.0423, + "step": 15340 + }, + { + "epoch": 0.3614996938439075, + "grad_norm": 2.2631309032440186, + "learning_rate": 0.00012773303188733456, + "loss": 1.9951, + "step": 15350 + }, + { + "epoch": 0.3617351985304508, + "grad_norm": 2.103239059448242, + "learning_rate": 0.0001276859309500259, + "loss": 2.1707, + "step": 15360 + }, + { + "epoch": 0.36197070321699404, + "grad_norm": 2.2068543434143066, + "learning_rate": 0.00012763883001271726, + "loss": 2.0808, + "step": 15370 + }, + { + "epoch": 0.3622062079035373, + "grad_norm": 2.3455021381378174, + "learning_rate": 0.0001275917290754086, + "loss": 2.1042, + "step": 15380 + }, + { + "epoch": 0.36244171259008057, + "grad_norm": 2.655992031097412, + "learning_rate": 0.00012754462813809996, + "loss": 2.165, + "step": 15390 + }, + { + "epoch": 0.36267721727662383, + "grad_norm": 2.5498814582824707, + "learning_rate": 0.0001274975272007913, + "loss": 2.251, + "step": 15400 + }, + { + "epoch": 0.36291272196316704, + "grad_norm": 2.270501136779785, + "learning_rate": 0.00012745042626348264, + "loss": 1.9743, + "step": 15410 + }, + { + "epoch": 0.3631482266497103, + "grad_norm": 1.8444944620132446, + "learning_rate": 0.000127403325326174, + "loss": 2.2257, + "step": 15420 + }, + { + "epoch": 0.36338373133625357, + "grad_norm": 3.1018927097320557, + "learning_rate": 0.00012735622438886534, + "loss": 2.0163, + "step": 15430 + }, + { + "epoch": 0.36361923602279683, + "grad_norm": 3.2892568111419678, + "learning_rate": 0.0001273091234515567, + "loss": 2.0607, + "step": 15440 + }, + { + "epoch": 0.3638547407093401, + "grad_norm": 2.4903275966644287, + "learning_rate": 0.00012726202251424804, + "loss": 2.0741, + "step": 15450 + }, + { + "epoch": 0.36409024539588336, + "grad_norm": 2.295133590698242, + "learning_rate": 0.0001272149215769394, + "loss": 2.3047, + "step": 15460 + }, + { + "epoch": 0.3643257500824266, + "grad_norm": 1.8592169284820557, + "learning_rate": 0.00012716782063963074, + "loss": 2.1098, + "step": 15470 + }, + { + "epoch": 0.3645612547689699, + "grad_norm": 1.8355640172958374, + "learning_rate": 0.0001271207197023221, + "loss": 2.0824, + "step": 15480 + }, + { + "epoch": 0.36479675945551315, + "grad_norm": 2.295816421508789, + "learning_rate": 0.00012707361876501344, + "loss": 2.1756, + "step": 15490 + }, + { + "epoch": 0.3650322641420564, + "grad_norm": 2.0585267543792725, + "learning_rate": 0.0001270265178277048, + "loss": 2.3163, + "step": 15500 + }, + { + "epoch": 0.3652677688285997, + "grad_norm": 1.753099799156189, + "learning_rate": 0.0001269794168903961, + "loss": 2.1565, + "step": 15510 + }, + { + "epoch": 0.36550327351514295, + "grad_norm": 2.6697375774383545, + "learning_rate": 0.0001269323159530875, + "loss": 2.2484, + "step": 15520 + }, + { + "epoch": 0.3657387782016862, + "grad_norm": 2.1869733333587646, + "learning_rate": 0.0001268852150157788, + "loss": 2.1681, + "step": 15530 + }, + { + "epoch": 0.3659742828882295, + "grad_norm": 2.7051522731781006, + "learning_rate": 0.00012683811407847016, + "loss": 2.0296, + "step": 15540 + }, + { + "epoch": 0.36620978757477274, + "grad_norm": 2.1270742416381836, + "learning_rate": 0.0001267910131411615, + "loss": 1.9708, + "step": 15550 + }, + { + "epoch": 0.366445292261316, + "grad_norm": 2.645512342453003, + "learning_rate": 0.00012674391220385286, + "loss": 2.1499, + "step": 15560 + }, + { + "epoch": 0.36668079694785927, + "grad_norm": 2.9818031787872314, + "learning_rate": 0.0001266968112665442, + "loss": 2.0822, + "step": 15570 + }, + { + "epoch": 0.36691630163440253, + "grad_norm": 3.0397183895111084, + "learning_rate": 0.00012664971032923556, + "loss": 2.1616, + "step": 15580 + }, + { + "epoch": 0.3671518063209458, + "grad_norm": 2.2161061763763428, + "learning_rate": 0.00012660260939192689, + "loss": 1.9094, + "step": 15590 + }, + { + "epoch": 0.36738731100748906, + "grad_norm": 1.962997555732727, + "learning_rate": 0.00012655550845461826, + "loss": 2.0234, + "step": 15600 + }, + { + "epoch": 0.3676228156940323, + "grad_norm": 2.1313388347625732, + "learning_rate": 0.00012650840751730959, + "loss": 2.04, + "step": 15610 + }, + { + "epoch": 0.3678583203805756, + "grad_norm": 2.517944574356079, + "learning_rate": 0.00012646130658000094, + "loss": 2.0798, + "step": 15620 + }, + { + "epoch": 0.36809382506711885, + "grad_norm": 2.792917251586914, + "learning_rate": 0.0001264142056426923, + "loss": 1.9457, + "step": 15630 + }, + { + "epoch": 0.3683293297536621, + "grad_norm": 2.193131685256958, + "learning_rate": 0.00012636710470538364, + "loss": 2.1932, + "step": 15640 + }, + { + "epoch": 0.3685648344402054, + "grad_norm": 2.234732151031494, + "learning_rate": 0.00012632000376807501, + "loss": 2.1695, + "step": 15650 + }, + { + "epoch": 0.36880033912674864, + "grad_norm": 2.553999900817871, + "learning_rate": 0.00012627290283076634, + "loss": 2.1587, + "step": 15660 + }, + { + "epoch": 0.3690358438132919, + "grad_norm": 2.0366036891937256, + "learning_rate": 0.0001262258018934577, + "loss": 2.1545, + "step": 15670 + }, + { + "epoch": 0.3692713484998352, + "grad_norm": 1.8725734949111938, + "learning_rate": 0.00012617870095614904, + "loss": 2.1266, + "step": 15680 + }, + { + "epoch": 0.36950685318637844, + "grad_norm": 2.4001288414001465, + "learning_rate": 0.0001261316000188404, + "loss": 2.1415, + "step": 15690 + }, + { + "epoch": 0.36974235787292165, + "grad_norm": 2.269136428833008, + "learning_rate": 0.00012608449908153174, + "loss": 1.9927, + "step": 15700 + }, + { + "epoch": 0.3699778625594649, + "grad_norm": 2.2865076065063477, + "learning_rate": 0.0001260373981442231, + "loss": 2.3061, + "step": 15710 + }, + { + "epoch": 0.3702133672460082, + "grad_norm": 2.2969698905944824, + "learning_rate": 0.0001259902972069144, + "loss": 2.1753, + "step": 15720 + }, + { + "epoch": 0.37044887193255144, + "grad_norm": 2.6389291286468506, + "learning_rate": 0.0001259431962696058, + "loss": 1.9136, + "step": 15730 + }, + { + "epoch": 0.3706843766190947, + "grad_norm": 3.111884355545044, + "learning_rate": 0.0001258960953322971, + "loss": 1.9592, + "step": 15740 + }, + { + "epoch": 0.37091988130563797, + "grad_norm": 1.8911104202270508, + "learning_rate": 0.00012584899439498846, + "loss": 1.9782, + "step": 15750 + }, + { + "epoch": 0.37115538599218123, + "grad_norm": 2.410327672958374, + "learning_rate": 0.0001258018934576798, + "loss": 2.1944, + "step": 15760 + }, + { + "epoch": 0.3713908906787245, + "grad_norm": 2.6267807483673096, + "learning_rate": 0.00012575479252037116, + "loss": 2.1023, + "step": 15770 + }, + { + "epoch": 0.37162639536526776, + "grad_norm": 2.455199718475342, + "learning_rate": 0.0001257076915830625, + "loss": 2.2238, + "step": 15780 + }, + { + "epoch": 0.371861900051811, + "grad_norm": 2.131810188293457, + "learning_rate": 0.00012566059064575386, + "loss": 2.1704, + "step": 15790 + }, + { + "epoch": 0.3720974047383543, + "grad_norm": 2.0262787342071533, + "learning_rate": 0.0001256134897084452, + "loss": 2.0336, + "step": 15800 + }, + { + "epoch": 0.37233290942489755, + "grad_norm": 2.7119791507720947, + "learning_rate": 0.00012556638877113656, + "loss": 2.0177, + "step": 15810 + }, + { + "epoch": 0.3725684141114408, + "grad_norm": 2.81471586227417, + "learning_rate": 0.0001255192878338279, + "loss": 1.9555, + "step": 15820 + }, + { + "epoch": 0.3728039187979841, + "grad_norm": 1.8543721437454224, + "learning_rate": 0.00012547218689651924, + "loss": 2.1615, + "step": 15830 + }, + { + "epoch": 0.37303942348452734, + "grad_norm": 2.129915952682495, + "learning_rate": 0.0001254250859592106, + "loss": 2.0439, + "step": 15840 + }, + { + "epoch": 0.3732749281710706, + "grad_norm": 1.9343938827514648, + "learning_rate": 0.00012537798502190194, + "loss": 1.944, + "step": 15850 + }, + { + "epoch": 0.37351043285761387, + "grad_norm": 2.403035879135132, + "learning_rate": 0.0001253308840845933, + "loss": 2.0897, + "step": 15860 + }, + { + "epoch": 0.37374593754415714, + "grad_norm": 1.8205158710479736, + "learning_rate": 0.00012528378314728464, + "loss": 2.1931, + "step": 15870 + }, + { + "epoch": 0.3739814422307004, + "grad_norm": 2.119952917098999, + "learning_rate": 0.000125236682209976, + "loss": 2.254, + "step": 15880 + }, + { + "epoch": 0.37421694691724366, + "grad_norm": 2.8081493377685547, + "learning_rate": 0.00012518958127266734, + "loss": 2.0929, + "step": 15890 + }, + { + "epoch": 0.37445245160378693, + "grad_norm": 1.801512598991394, + "learning_rate": 0.00012514248033535866, + "loss": 2.1442, + "step": 15900 + }, + { + "epoch": 0.3746879562903302, + "grad_norm": 2.782501697540283, + "learning_rate": 0.00012509537939805004, + "loss": 2.1213, + "step": 15910 + }, + { + "epoch": 0.37492346097687346, + "grad_norm": 2.08080792427063, + "learning_rate": 0.00012504827846074136, + "loss": 2.0139, + "step": 15920 + }, + { + "epoch": 0.3751589656634167, + "grad_norm": 2.6191723346710205, + "learning_rate": 0.0001250011775234327, + "loss": 2.1154, + "step": 15930 + }, + { + "epoch": 0.37539447034996, + "grad_norm": 2.5812697410583496, + "learning_rate": 0.0001249540765861241, + "loss": 1.9264, + "step": 15940 + }, + { + "epoch": 0.37562997503650325, + "grad_norm": 1.9726451635360718, + "learning_rate": 0.0001249069756488154, + "loss": 2.1646, + "step": 15950 + }, + { + "epoch": 0.3758654797230465, + "grad_norm": 1.8344637155532837, + "learning_rate": 0.00012485987471150676, + "loss": 2.1906, + "step": 15960 + }, + { + "epoch": 0.3761009844095898, + "grad_norm": 2.092193365097046, + "learning_rate": 0.00012481277377419811, + "loss": 2.2604, + "step": 15970 + }, + { + "epoch": 0.376336489096133, + "grad_norm": 1.8692036867141724, + "learning_rate": 0.00012476567283688946, + "loss": 1.9886, + "step": 15980 + }, + { + "epoch": 0.37657199378267625, + "grad_norm": 1.7763489484786987, + "learning_rate": 0.00012471857189958081, + "loss": 2.0102, + "step": 15990 + }, + { + "epoch": 0.3768074984692195, + "grad_norm": 2.593400478363037, + "learning_rate": 0.00012467147096227216, + "loss": 2.1123, + "step": 16000 + }, + { + "epoch": 0.3770430031557628, + "grad_norm": 2.239337205886841, + "learning_rate": 0.0001246243700249635, + "loss": 2.2664, + "step": 16010 + }, + { + "epoch": 0.37727850784230604, + "grad_norm": 2.6145036220550537, + "learning_rate": 0.00012457726908765486, + "loss": 2.1313, + "step": 16020 + }, + { + "epoch": 0.3775140125288493, + "grad_norm": 2.3163740634918213, + "learning_rate": 0.0001245301681503462, + "loss": 2.1796, + "step": 16030 + }, + { + "epoch": 0.37774951721539257, + "grad_norm": 1.7631512880325317, + "learning_rate": 0.00012448306721303757, + "loss": 2.0957, + "step": 16040 + }, + { + "epoch": 0.37798502190193584, + "grad_norm": 1.9558098316192627, + "learning_rate": 0.0001244359662757289, + "loss": 2.0811, + "step": 16050 + }, + { + "epoch": 0.3782205265884791, + "grad_norm": 1.9874584674835205, + "learning_rate": 0.00012438886533842024, + "loss": 1.7742, + "step": 16060 + }, + { + "epoch": 0.37845603127502236, + "grad_norm": 2.0579781532287598, + "learning_rate": 0.0001243417644011116, + "loss": 1.9229, + "step": 16070 + }, + { + "epoch": 0.37869153596156563, + "grad_norm": 2.639272928237915, + "learning_rate": 0.00012429466346380294, + "loss": 2.294, + "step": 16080 + }, + { + "epoch": 0.3789270406481089, + "grad_norm": 2.7305397987365723, + "learning_rate": 0.0001242475625264943, + "loss": 2.1147, + "step": 16090 + }, + { + "epoch": 0.37916254533465216, + "grad_norm": 3.5461792945861816, + "learning_rate": 0.00012420046158918564, + "loss": 2.1957, + "step": 16100 + }, + { + "epoch": 0.3793980500211954, + "grad_norm": 2.2670328617095947, + "learning_rate": 0.00012415336065187696, + "loss": 2.0242, + "step": 16110 + }, + { + "epoch": 0.3796335547077387, + "grad_norm": 2.6801137924194336, + "learning_rate": 0.00012410625971456834, + "loss": 2.0179, + "step": 16120 + }, + { + "epoch": 0.37986905939428195, + "grad_norm": 2.0788025856018066, + "learning_rate": 0.00012405915877725966, + "loss": 1.8448, + "step": 16130 + }, + { + "epoch": 0.3801045640808252, + "grad_norm": 1.919297695159912, + "learning_rate": 0.00012401205783995101, + "loss": 1.9961, + "step": 16140 + }, + { + "epoch": 0.3803400687673685, + "grad_norm": 1.8758867979049683, + "learning_rate": 0.00012396495690264236, + "loss": 2.3458, + "step": 16150 + }, + { + "epoch": 0.38057557345391174, + "grad_norm": 2.559508800506592, + "learning_rate": 0.00012391785596533371, + "loss": 2.1396, + "step": 16160 + }, + { + "epoch": 0.380811078140455, + "grad_norm": 1.8591848611831665, + "learning_rate": 0.00012387075502802506, + "loss": 2.0795, + "step": 16170 + }, + { + "epoch": 0.38104658282699827, + "grad_norm": 1.8270467519760132, + "learning_rate": 0.00012382365409071641, + "loss": 1.9482, + "step": 16180 + }, + { + "epoch": 0.38128208751354153, + "grad_norm": 1.938462257385254, + "learning_rate": 0.00012377655315340774, + "loss": 2.0556, + "step": 16190 + }, + { + "epoch": 0.3815175922000848, + "grad_norm": 2.203063726425171, + "learning_rate": 0.00012372945221609912, + "loss": 2.1188, + "step": 16200 + }, + { + "epoch": 0.38175309688662806, + "grad_norm": 2.3647751808166504, + "learning_rate": 0.00012368235127879047, + "loss": 1.9052, + "step": 16210 + }, + { + "epoch": 0.3819886015731713, + "grad_norm": 2.129561185836792, + "learning_rate": 0.0001236352503414818, + "loss": 2.1535, + "step": 16220 + }, + { + "epoch": 0.3822241062597146, + "grad_norm": 2.4006335735321045, + "learning_rate": 0.00012358814940417317, + "loss": 2.0127, + "step": 16230 + }, + { + "epoch": 0.38245961094625786, + "grad_norm": 2.2458090782165527, + "learning_rate": 0.0001235410484668645, + "loss": 2.1334, + "step": 16240 + }, + { + "epoch": 0.3826951156328011, + "grad_norm": 2.183962106704712, + "learning_rate": 0.00012349394752955587, + "loss": 2.0084, + "step": 16250 + }, + { + "epoch": 0.38293062031934433, + "grad_norm": 2.067183017730713, + "learning_rate": 0.0001234468465922472, + "loss": 1.9644, + "step": 16260 + }, + { + "epoch": 0.3831661250058876, + "grad_norm": 1.8971290588378906, + "learning_rate": 0.00012339974565493854, + "loss": 1.9136, + "step": 16270 + }, + { + "epoch": 0.38340162969243086, + "grad_norm": 2.4257447719573975, + "learning_rate": 0.0001233526447176299, + "loss": 2.3739, + "step": 16280 + }, + { + "epoch": 0.3836371343789741, + "grad_norm": 2.2010610103607178, + "learning_rate": 0.00012330554378032124, + "loss": 2.1275, + "step": 16290 + }, + { + "epoch": 0.3838726390655174, + "grad_norm": 2.2034873962402344, + "learning_rate": 0.0001232584428430126, + "loss": 2.007, + "step": 16300 + }, + { + "epoch": 0.38410814375206065, + "grad_norm": 2.489978551864624, + "learning_rate": 0.00012321134190570394, + "loss": 2.0106, + "step": 16310 + }, + { + "epoch": 0.3843436484386039, + "grad_norm": 2.219968795776367, + "learning_rate": 0.00012316424096839526, + "loss": 1.946, + "step": 16320 + }, + { + "epoch": 0.3845791531251472, + "grad_norm": 3.080683946609497, + "learning_rate": 0.00012311714003108664, + "loss": 2.1341, + "step": 16330 + }, + { + "epoch": 0.38481465781169044, + "grad_norm": 2.717668294906616, + "learning_rate": 0.00012307003909377796, + "loss": 2.0214, + "step": 16340 + }, + { + "epoch": 0.3850501624982337, + "grad_norm": 2.038290023803711, + "learning_rate": 0.00012302293815646931, + "loss": 1.8628, + "step": 16350 + }, + { + "epoch": 0.38528566718477697, + "grad_norm": 2.27325177192688, + "learning_rate": 0.00012297583721916066, + "loss": 2.0714, + "step": 16360 + }, + { + "epoch": 0.38552117187132023, + "grad_norm": 2.770235776901245, + "learning_rate": 0.00012292873628185202, + "loss": 2.1704, + "step": 16370 + }, + { + "epoch": 0.3857566765578635, + "grad_norm": 2.967625379562378, + "learning_rate": 0.00012288163534454337, + "loss": 2.0807, + "step": 16380 + }, + { + "epoch": 0.38599218124440676, + "grad_norm": 2.048448085784912, + "learning_rate": 0.00012283453440723472, + "loss": 2.1044, + "step": 16390 + }, + { + "epoch": 0.38622768593095, + "grad_norm": 2.9608845710754395, + "learning_rate": 0.00012278743346992604, + "loss": 2.1001, + "step": 16400 + }, + { + "epoch": 0.3864631906174933, + "grad_norm": 1.9301419258117676, + "learning_rate": 0.00012274033253261742, + "loss": 2.0542, + "step": 16410 + }, + { + "epoch": 0.38669869530403655, + "grad_norm": 1.865768313407898, + "learning_rate": 0.00012269323159530874, + "loss": 2.001, + "step": 16420 + }, + { + "epoch": 0.3869341999905798, + "grad_norm": 2.2581074237823486, + "learning_rate": 0.0001226461306580001, + "loss": 2.0847, + "step": 16430 + }, + { + "epoch": 0.3871697046771231, + "grad_norm": 1.6205297708511353, + "learning_rate": 0.00012259902972069144, + "loss": 2.1412, + "step": 16440 + }, + { + "epoch": 0.38740520936366635, + "grad_norm": 2.4812774658203125, + "learning_rate": 0.0001225519287833828, + "loss": 2.1315, + "step": 16450 + }, + { + "epoch": 0.3876407140502096, + "grad_norm": 2.2032692432403564, + "learning_rate": 0.00012250482784607414, + "loss": 2.108, + "step": 16460 + }, + { + "epoch": 0.3878762187367529, + "grad_norm": 2.2076354026794434, + "learning_rate": 0.0001224577269087655, + "loss": 2.1367, + "step": 16470 + }, + { + "epoch": 0.38811172342329614, + "grad_norm": 2.061929225921631, + "learning_rate": 0.0001224106259714568, + "loss": 1.8931, + "step": 16480 + }, + { + "epoch": 0.3883472281098394, + "grad_norm": 1.9706366062164307, + "learning_rate": 0.0001223635250341482, + "loss": 2.1087, + "step": 16490 + }, + { + "epoch": 0.38858273279638267, + "grad_norm": 2.2798683643341064, + "learning_rate": 0.00012231642409683954, + "loss": 2.152, + "step": 16500 + }, + { + "epoch": 0.38881823748292593, + "grad_norm": 2.740971326828003, + "learning_rate": 0.0001222693231595309, + "loss": 1.9273, + "step": 16510 + }, + { + "epoch": 0.3890537421694692, + "grad_norm": 2.453092575073242, + "learning_rate": 0.00012222222222222224, + "loss": 1.8969, + "step": 16520 + }, + { + "epoch": 0.38928924685601246, + "grad_norm": 2.553710460662842, + "learning_rate": 0.00012217512128491356, + "loss": 2.0968, + "step": 16530 + }, + { + "epoch": 0.38952475154255567, + "grad_norm": 2.732374429702759, + "learning_rate": 0.00012212802034760494, + "loss": 2.2074, + "step": 16540 + }, + { + "epoch": 0.38976025622909893, + "grad_norm": 2.342780113220215, + "learning_rate": 0.00012208091941029627, + "loss": 2.0617, + "step": 16550 + }, + { + "epoch": 0.3899957609156422, + "grad_norm": 1.9254868030548096, + "learning_rate": 0.00012203381847298763, + "loss": 1.8196, + "step": 16560 + }, + { + "epoch": 0.39023126560218546, + "grad_norm": 1.8668605089187622, + "learning_rate": 0.00012198671753567897, + "loss": 2.1088, + "step": 16570 + }, + { + "epoch": 0.3904667702887287, + "grad_norm": 2.149210214614868, + "learning_rate": 0.00012193961659837032, + "loss": 2.232, + "step": 16580 + }, + { + "epoch": 0.390702274975272, + "grad_norm": 2.4574036598205566, + "learning_rate": 0.00012189251566106165, + "loss": 2.0236, + "step": 16590 + }, + { + "epoch": 0.39093777966181525, + "grad_norm": 1.9799808263778687, + "learning_rate": 0.00012184541472375302, + "loss": 2.1751, + "step": 16600 + }, + { + "epoch": 0.3911732843483585, + "grad_norm": 2.1461141109466553, + "learning_rate": 0.00012179831378644435, + "loss": 2.1239, + "step": 16610 + }, + { + "epoch": 0.3914087890349018, + "grad_norm": 2.3437371253967285, + "learning_rate": 0.0001217512128491357, + "loss": 2.2036, + "step": 16620 + }, + { + "epoch": 0.39164429372144505, + "grad_norm": 2.012018918991089, + "learning_rate": 0.00012170411191182704, + "loss": 2.1692, + "step": 16630 + }, + { + "epoch": 0.3918797984079883, + "grad_norm": 2.889059066772461, + "learning_rate": 0.0001216570109745184, + "loss": 2.2988, + "step": 16640 + }, + { + "epoch": 0.3921153030945316, + "grad_norm": 2.31487774848938, + "learning_rate": 0.00012160991003720974, + "loss": 2.1439, + "step": 16650 + }, + { + "epoch": 0.39235080778107484, + "grad_norm": 1.9940412044525146, + "learning_rate": 0.0001215628090999011, + "loss": 2.1955, + "step": 16660 + }, + { + "epoch": 0.3925863124676181, + "grad_norm": 2.186189889907837, + "learning_rate": 0.00012151570816259243, + "loss": 1.9478, + "step": 16670 + }, + { + "epoch": 0.39282181715416137, + "grad_norm": 1.8489048480987549, + "learning_rate": 0.00012146860722528379, + "loss": 2.0117, + "step": 16680 + }, + { + "epoch": 0.39305732184070463, + "grad_norm": 2.188730239868164, + "learning_rate": 0.00012142150628797513, + "loss": 2.367, + "step": 16690 + }, + { + "epoch": 0.3932928265272479, + "grad_norm": 2.6312897205352783, + "learning_rate": 0.00012137440535066649, + "loss": 2.1655, + "step": 16700 + }, + { + "epoch": 0.39352833121379116, + "grad_norm": 2.244814872741699, + "learning_rate": 0.00012132730441335783, + "loss": 2.0669, + "step": 16710 + }, + { + "epoch": 0.3937638359003344, + "grad_norm": 3.1631484031677246, + "learning_rate": 0.00012128020347604918, + "loss": 2.3648, + "step": 16720 + }, + { + "epoch": 0.3939993405868777, + "grad_norm": 2.074009418487549, + "learning_rate": 0.00012123310253874052, + "loss": 2.192, + "step": 16730 + }, + { + "epoch": 0.39423484527342095, + "grad_norm": 2.1103250980377197, + "learning_rate": 0.00012118600160143188, + "loss": 1.9476, + "step": 16740 + }, + { + "epoch": 0.3944703499599642, + "grad_norm": 2.7866339683532715, + "learning_rate": 0.00012113890066412322, + "loss": 1.9956, + "step": 16750 + }, + { + "epoch": 0.3947058546465075, + "grad_norm": 2.7085251808166504, + "learning_rate": 0.00012109179972681457, + "loss": 2.2952, + "step": 16760 + }, + { + "epoch": 0.39494135933305075, + "grad_norm": 1.9454951286315918, + "learning_rate": 0.0001210446987895059, + "loss": 2.1263, + "step": 16770 + }, + { + "epoch": 0.395176864019594, + "grad_norm": 3.369178056716919, + "learning_rate": 0.00012099759785219727, + "loss": 2.0465, + "step": 16780 + }, + { + "epoch": 0.3954123687061373, + "grad_norm": 2.1586029529571533, + "learning_rate": 0.00012095049691488862, + "loss": 2.1699, + "step": 16790 + }, + { + "epoch": 0.39564787339268054, + "grad_norm": 3.4209299087524414, + "learning_rate": 0.00012090339597757995, + "loss": 1.9791, + "step": 16800 + }, + { + "epoch": 0.3958833780792238, + "grad_norm": 2.1973490715026855, + "learning_rate": 0.00012085629504027132, + "loss": 2.1723, + "step": 16810 + }, + { + "epoch": 0.39611888276576707, + "grad_norm": 2.067542791366577, + "learning_rate": 0.00012080919410296265, + "loss": 1.8388, + "step": 16820 + }, + { + "epoch": 0.3963543874523103, + "grad_norm": 2.0267772674560547, + "learning_rate": 0.000120762093165654, + "loss": 2.1342, + "step": 16830 + }, + { + "epoch": 0.39658989213885354, + "grad_norm": 1.9653459787368774, + "learning_rate": 0.00012071499222834534, + "loss": 2.1676, + "step": 16840 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 2.003641366958618, + "learning_rate": 0.0001206678912910367, + "loss": 2.119, + "step": 16850 + }, + { + "epoch": 0.39706090151194007, + "grad_norm": 2.4826791286468506, + "learning_rate": 0.00012062079035372804, + "loss": 1.9719, + "step": 16860 + }, + { + "epoch": 0.39729640619848333, + "grad_norm": 2.9601070880889893, + "learning_rate": 0.0001205736894164194, + "loss": 1.9954, + "step": 16870 + }, + { + "epoch": 0.3975319108850266, + "grad_norm": 2.5066933631896973, + "learning_rate": 0.00012052658847911074, + "loss": 2.0153, + "step": 16880 + }, + { + "epoch": 0.39776741557156986, + "grad_norm": 2.712014675140381, + "learning_rate": 0.00012047948754180209, + "loss": 2.2096, + "step": 16890 + }, + { + "epoch": 0.3980029202581131, + "grad_norm": 1.7923743724822998, + "learning_rate": 0.00012043238660449343, + "loss": 2.2143, + "step": 16900 + }, + { + "epoch": 0.3982384249446564, + "grad_norm": 2.5782217979431152, + "learning_rate": 0.00012038528566718479, + "loss": 2.2435, + "step": 16910 + }, + { + "epoch": 0.39847392963119965, + "grad_norm": 2.224064350128174, + "learning_rate": 0.00012033818472987613, + "loss": 2.1383, + "step": 16920 + }, + { + "epoch": 0.3987094343177429, + "grad_norm": 2.3517062664031982, + "learning_rate": 0.00012029108379256748, + "loss": 1.7965, + "step": 16930 + }, + { + "epoch": 0.3989449390042862, + "grad_norm": 3.8429908752441406, + "learning_rate": 0.00012024398285525882, + "loss": 2.0901, + "step": 16940 + }, + { + "epoch": 0.39918044369082945, + "grad_norm": 2.49701189994812, + "learning_rate": 0.00012019688191795018, + "loss": 2.075, + "step": 16950 + }, + { + "epoch": 0.3994159483773727, + "grad_norm": 2.411953926086426, + "learning_rate": 0.00012014978098064152, + "loss": 2.2769, + "step": 16960 + }, + { + "epoch": 0.399651453063916, + "grad_norm": 2.9988455772399902, + "learning_rate": 0.00012010268004333287, + "loss": 1.979, + "step": 16970 + }, + { + "epoch": 0.39988695775045924, + "grad_norm": 3.0603721141815186, + "learning_rate": 0.0001200555791060242, + "loss": 2.0761, + "step": 16980 + }, + { + "epoch": 0.4001224624370025, + "grad_norm": 2.097357988357544, + "learning_rate": 0.00012000847816871557, + "loss": 2.1266, + "step": 16990 + }, + { + "epoch": 0.40035796712354577, + "grad_norm": 2.6588997840881348, + "learning_rate": 0.0001199613772314069, + "loss": 2.1037, + "step": 17000 + }, + { + "epoch": 0.40059347181008903, + "grad_norm": 2.0685205459594727, + "learning_rate": 0.00011991427629409825, + "loss": 2.0696, + "step": 17010 + }, + { + "epoch": 0.4008289764966323, + "grad_norm": 2.6148645877838135, + "learning_rate": 0.00011986717535678959, + "loss": 2.0947, + "step": 17020 + }, + { + "epoch": 0.40106448118317556, + "grad_norm": 2.0467002391815186, + "learning_rate": 0.00011982007441948096, + "loss": 2.2171, + "step": 17030 + }, + { + "epoch": 0.4012999858697188, + "grad_norm": 2.4288554191589355, + "learning_rate": 0.00011977297348217229, + "loss": 2.1036, + "step": 17040 + }, + { + "epoch": 0.4015354905562621, + "grad_norm": 2.3127670288085938, + "learning_rate": 0.00011972587254486364, + "loss": 2.0517, + "step": 17050 + }, + { + "epoch": 0.40177099524280535, + "grad_norm": 2.3823368549346924, + "learning_rate": 0.000119678771607555, + "loss": 2.0046, + "step": 17060 + }, + { + "epoch": 0.4020064999293486, + "grad_norm": 2.0719432830810547, + "learning_rate": 0.00011963167067024634, + "loss": 2.0568, + "step": 17070 + }, + { + "epoch": 0.4022420046158919, + "grad_norm": 2.4580512046813965, + "learning_rate": 0.0001195845697329377, + "loss": 2.0559, + "step": 17080 + }, + { + "epoch": 0.40247750930243514, + "grad_norm": 2.124021291732788, + "learning_rate": 0.00011953746879562904, + "loss": 2.086, + "step": 17090 + }, + { + "epoch": 0.4027130139889784, + "grad_norm": 2.4031240940093994, + "learning_rate": 0.00011949036785832039, + "loss": 2.1624, + "step": 17100 + }, + { + "epoch": 0.4029485186755216, + "grad_norm": 2.5331597328186035, + "learning_rate": 0.00011944326692101173, + "loss": 2.0293, + "step": 17110 + }, + { + "epoch": 0.4031840233620649, + "grad_norm": 2.3105568885803223, + "learning_rate": 0.0001193961659837031, + "loss": 2.0536, + "step": 17120 + }, + { + "epoch": 0.40341952804860814, + "grad_norm": 1.935504674911499, + "learning_rate": 0.00011934906504639443, + "loss": 2.0363, + "step": 17130 + }, + { + "epoch": 0.4036550327351514, + "grad_norm": 1.818901777267456, + "learning_rate": 0.00011930196410908578, + "loss": 2.1138, + "step": 17140 + }, + { + "epoch": 0.4038905374216947, + "grad_norm": 2.2937822341918945, + "learning_rate": 0.00011925486317177712, + "loss": 2.2075, + "step": 17150 + }, + { + "epoch": 0.40412604210823794, + "grad_norm": 2.4323720932006836, + "learning_rate": 0.00011920776223446848, + "loss": 1.8704, + "step": 17160 + }, + { + "epoch": 0.4043615467947812, + "grad_norm": 2.0773727893829346, + "learning_rate": 0.00011916066129715982, + "loss": 2.1124, + "step": 17170 + }, + { + "epoch": 0.40459705148132447, + "grad_norm": 2.424532413482666, + "learning_rate": 0.00011911356035985117, + "loss": 1.9422, + "step": 17180 + }, + { + "epoch": 0.40483255616786773, + "grad_norm": 2.1791512966156006, + "learning_rate": 0.0001190664594225425, + "loss": 2.0547, + "step": 17190 + }, + { + "epoch": 0.405068060854411, + "grad_norm": 2.939730167388916, + "learning_rate": 0.00011901935848523387, + "loss": 1.9766, + "step": 17200 + }, + { + "epoch": 0.40530356554095426, + "grad_norm": 2.1646764278411865, + "learning_rate": 0.0001189722575479252, + "loss": 1.9632, + "step": 17210 + }, + { + "epoch": 0.4055390702274975, + "grad_norm": 3.135854721069336, + "learning_rate": 0.00011892515661061656, + "loss": 1.942, + "step": 17220 + }, + { + "epoch": 0.4057745749140408, + "grad_norm": 2.3029401302337646, + "learning_rate": 0.00011887805567330789, + "loss": 2.3985, + "step": 17230 + }, + { + "epoch": 0.40601007960058405, + "grad_norm": 2.5653719902038574, + "learning_rate": 0.00011883095473599926, + "loss": 2.125, + "step": 17240 + }, + { + "epoch": 0.4062455842871273, + "grad_norm": 2.4413366317749023, + "learning_rate": 0.00011878385379869059, + "loss": 2.2774, + "step": 17250 + }, + { + "epoch": 0.4064810889736706, + "grad_norm": 1.920428991317749, + "learning_rate": 0.00011873675286138196, + "loss": 2.3723, + "step": 17260 + }, + { + "epoch": 0.40671659366021384, + "grad_norm": 2.025369644165039, + "learning_rate": 0.00011868965192407328, + "loss": 2.1473, + "step": 17270 + }, + { + "epoch": 0.4069520983467571, + "grad_norm": 2.081719160079956, + "learning_rate": 0.00011864255098676464, + "loss": 2.2024, + "step": 17280 + }, + { + "epoch": 0.40718760303330037, + "grad_norm": 1.9507285356521606, + "learning_rate": 0.00011859545004945598, + "loss": 2.0511, + "step": 17290 + }, + { + "epoch": 0.40742310771984364, + "grad_norm": 2.085679531097412, + "learning_rate": 0.00011854834911214734, + "loss": 2.0069, + "step": 17300 + }, + { + "epoch": 0.4076586124063869, + "grad_norm": 2.4207077026367188, + "learning_rate": 0.00011850124817483868, + "loss": 2.1838, + "step": 17310 + }, + { + "epoch": 0.40789411709293016, + "grad_norm": 1.945299744606018, + "learning_rate": 0.00011845414723753003, + "loss": 2.1365, + "step": 17320 + }, + { + "epoch": 0.40812962177947343, + "grad_norm": 2.0724103450775146, + "learning_rate": 0.00011840704630022137, + "loss": 2.1791, + "step": 17330 + }, + { + "epoch": 0.4083651264660167, + "grad_norm": 2.00799298286438, + "learning_rate": 0.00011835994536291273, + "loss": 2.103, + "step": 17340 + }, + { + "epoch": 0.40860063115255996, + "grad_norm": 2.6130778789520264, + "learning_rate": 0.00011831284442560408, + "loss": 2.1468, + "step": 17350 + }, + { + "epoch": 0.4088361358391032, + "grad_norm": 2.042046308517456, + "learning_rate": 0.00011826574348829542, + "loss": 1.9138, + "step": 17360 + }, + { + "epoch": 0.4090716405256465, + "grad_norm": 2.197761058807373, + "learning_rate": 0.00011821864255098678, + "loss": 2.0824, + "step": 17370 + }, + { + "epoch": 0.40930714521218975, + "grad_norm": 2.178068161010742, + "learning_rate": 0.00011817154161367812, + "loss": 2.0455, + "step": 17380 + }, + { + "epoch": 0.40954264989873296, + "grad_norm": 1.9603676795959473, + "learning_rate": 0.00011812444067636947, + "loss": 2.0522, + "step": 17390 + }, + { + "epoch": 0.4097781545852762, + "grad_norm": 1.8290703296661377, + "learning_rate": 0.0001180773397390608, + "loss": 2.0019, + "step": 17400 + }, + { + "epoch": 0.4100136592718195, + "grad_norm": 2.015998601913452, + "learning_rate": 0.00011803023880175217, + "loss": 2.2494, + "step": 17410 + }, + { + "epoch": 0.41024916395836275, + "grad_norm": 2.149958610534668, + "learning_rate": 0.0001179831378644435, + "loss": 1.9823, + "step": 17420 + }, + { + "epoch": 0.410484668644906, + "grad_norm": 1.841565489768982, + "learning_rate": 0.00011793603692713486, + "loss": 1.9568, + "step": 17430 + }, + { + "epoch": 0.4107201733314493, + "grad_norm": 2.855804204940796, + "learning_rate": 0.00011788893598982619, + "loss": 2.1627, + "step": 17440 + }, + { + "epoch": 0.41095567801799254, + "grad_norm": 2.0609381198883057, + "learning_rate": 0.00011784183505251756, + "loss": 2.0022, + "step": 17450 + }, + { + "epoch": 0.4111911827045358, + "grad_norm": 2.362701416015625, + "learning_rate": 0.0001177947341152089, + "loss": 2.2441, + "step": 17460 + }, + { + "epoch": 0.41142668739107907, + "grad_norm": 1.9817253351211548, + "learning_rate": 0.00011774763317790026, + "loss": 1.8491, + "step": 17470 + }, + { + "epoch": 0.41166219207762234, + "grad_norm": 1.9875420331954956, + "learning_rate": 0.0001177005322405916, + "loss": 1.8271, + "step": 17480 + }, + { + "epoch": 0.4118976967641656, + "grad_norm": 1.8128178119659424, + "learning_rate": 0.00011765343130328294, + "loss": 2.0102, + "step": 17490 + }, + { + "epoch": 0.41213320145070886, + "grad_norm": 2.7656726837158203, + "learning_rate": 0.00011760633036597428, + "loss": 2.0932, + "step": 17500 + }, + { + "epoch": 0.41236870613725213, + "grad_norm": 2.31899094581604, + "learning_rate": 0.00011755922942866564, + "loss": 2.3904, + "step": 17510 + }, + { + "epoch": 0.4126042108237954, + "grad_norm": 1.7432920932769775, + "learning_rate": 0.00011751212849135698, + "loss": 2.0688, + "step": 17520 + }, + { + "epoch": 0.41283971551033866, + "grad_norm": 2.6849148273468018, + "learning_rate": 0.00011746502755404833, + "loss": 1.9904, + "step": 17530 + }, + { + "epoch": 0.4130752201968819, + "grad_norm": 2.1877646446228027, + "learning_rate": 0.00011741792661673967, + "loss": 2.1276, + "step": 17540 + }, + { + "epoch": 0.4133107248834252, + "grad_norm": 2.3878908157348633, + "learning_rate": 0.00011737082567943103, + "loss": 2.0121, + "step": 17550 + }, + { + "epoch": 0.41354622956996845, + "grad_norm": 2.5052924156188965, + "learning_rate": 0.00011732372474212237, + "loss": 2.1646, + "step": 17560 + }, + { + "epoch": 0.4137817342565117, + "grad_norm": 1.9085092544555664, + "learning_rate": 0.00011727662380481372, + "loss": 2.0815, + "step": 17570 + }, + { + "epoch": 0.414017238943055, + "grad_norm": 2.4492745399475098, + "learning_rate": 0.00011722952286750506, + "loss": 2.0763, + "step": 17580 + }, + { + "epoch": 0.41425274362959824, + "grad_norm": 2.189110279083252, + "learning_rate": 0.00011718242193019642, + "loss": 2.114, + "step": 17590 + }, + { + "epoch": 0.4144882483161415, + "grad_norm": 2.4104158878326416, + "learning_rate": 0.00011713532099288776, + "loss": 2.19, + "step": 17600 + }, + { + "epoch": 0.41472375300268477, + "grad_norm": 1.884446620941162, + "learning_rate": 0.0001170882200555791, + "loss": 1.9784, + "step": 17610 + }, + { + "epoch": 0.41495925768922803, + "grad_norm": 2.4759879112243652, + "learning_rate": 0.00011704111911827047, + "loss": 2.2371, + "step": 17620 + }, + { + "epoch": 0.4151947623757713, + "grad_norm": 2.4873902797698975, + "learning_rate": 0.00011699401818096181, + "loss": 2.1282, + "step": 17630 + }, + { + "epoch": 0.41543026706231456, + "grad_norm": 1.9872322082519531, + "learning_rate": 0.00011694691724365317, + "loss": 2.1676, + "step": 17640 + }, + { + "epoch": 0.4156657717488578, + "grad_norm": 2.117286205291748, + "learning_rate": 0.0001168998163063445, + "loss": 2.1725, + "step": 17650 + }, + { + "epoch": 0.4159012764354011, + "grad_norm": 2.261207342147827, + "learning_rate": 0.00011685271536903586, + "loss": 2.1271, + "step": 17660 + }, + { + "epoch": 0.4161367811219443, + "grad_norm": 2.076218843460083, + "learning_rate": 0.0001168056144317272, + "loss": 1.926, + "step": 17670 + }, + { + "epoch": 0.41637228580848756, + "grad_norm": 2.5241241455078125, + "learning_rate": 0.00011675851349441856, + "loss": 1.9504, + "step": 17680 + }, + { + "epoch": 0.4166077904950308, + "grad_norm": 2.2576680183410645, + "learning_rate": 0.0001167114125571099, + "loss": 2.2149, + "step": 17690 + }, + { + "epoch": 0.4168432951815741, + "grad_norm": 2.1343159675598145, + "learning_rate": 0.00011666431161980125, + "loss": 2.1592, + "step": 17700 + }, + { + "epoch": 0.41707879986811736, + "grad_norm": 2.8269872665405273, + "learning_rate": 0.00011661721068249258, + "loss": 1.8489, + "step": 17710 + }, + { + "epoch": 0.4173143045546606, + "grad_norm": 2.6557393074035645, + "learning_rate": 0.00011657010974518395, + "loss": 2.3097, + "step": 17720 + }, + { + "epoch": 0.4175498092412039, + "grad_norm": 1.9406561851501465, + "learning_rate": 0.00011652300880787528, + "loss": 2.2424, + "step": 17730 + }, + { + "epoch": 0.41778531392774715, + "grad_norm": 2.8886189460754395, + "learning_rate": 0.00011647590787056663, + "loss": 1.9974, + "step": 17740 + }, + { + "epoch": 0.4180208186142904, + "grad_norm": 2.958347797393799, + "learning_rate": 0.00011642880693325797, + "loss": 2.0821, + "step": 17750 + }, + { + "epoch": 0.4182563233008337, + "grad_norm": 2.3907742500305176, + "learning_rate": 0.00011638170599594933, + "loss": 2.2051, + "step": 17760 + }, + { + "epoch": 0.41849182798737694, + "grad_norm": 2.605231523513794, + "learning_rate": 0.00011633460505864067, + "loss": 2.1911, + "step": 17770 + }, + { + "epoch": 0.4187273326739202, + "grad_norm": 2.1181795597076416, + "learning_rate": 0.00011628750412133202, + "loss": 1.8307, + "step": 17780 + }, + { + "epoch": 0.41896283736046347, + "grad_norm": 2.3362369537353516, + "learning_rate": 0.00011624040318402336, + "loss": 1.9971, + "step": 17790 + }, + { + "epoch": 0.41919834204700673, + "grad_norm": 2.4740500450134277, + "learning_rate": 0.00011619330224671472, + "loss": 2.0515, + "step": 17800 + }, + { + "epoch": 0.41943384673355, + "grad_norm": 2.383115768432617, + "learning_rate": 0.00011614620130940606, + "loss": 2.1157, + "step": 17810 + }, + { + "epoch": 0.41966935142009326, + "grad_norm": 1.487096905708313, + "learning_rate": 0.00011609910037209741, + "loss": 2.0911, + "step": 17820 + }, + { + "epoch": 0.4199048561066365, + "grad_norm": 2.061518430709839, + "learning_rate": 0.00011605199943478874, + "loss": 2.2484, + "step": 17830 + }, + { + "epoch": 0.4201403607931798, + "grad_norm": 2.2005579471588135, + "learning_rate": 0.00011600489849748011, + "loss": 2.0508, + "step": 17840 + }, + { + "epoch": 0.42037586547972305, + "grad_norm": 2.021574020385742, + "learning_rate": 0.00011595779756017144, + "loss": 2.0133, + "step": 17850 + }, + { + "epoch": 0.4206113701662663, + "grad_norm": 2.3463683128356934, + "learning_rate": 0.00011591069662286281, + "loss": 1.9324, + "step": 17860 + }, + { + "epoch": 0.4208468748528096, + "grad_norm": 2.0511763095855713, + "learning_rate": 0.00011586359568555413, + "loss": 1.8826, + "step": 17870 + }, + { + "epoch": 0.42108237953935285, + "grad_norm": 1.8911633491516113, + "learning_rate": 0.0001158164947482455, + "loss": 1.9932, + "step": 17880 + }, + { + "epoch": 0.4213178842258961, + "grad_norm": 2.640658140182495, + "learning_rate": 0.00011576939381093683, + "loss": 2.0184, + "step": 17890 + }, + { + "epoch": 0.4215533889124394, + "grad_norm": 2.112999677658081, + "learning_rate": 0.0001157222928736282, + "loss": 2.2726, + "step": 17900 + }, + { + "epoch": 0.42178889359898264, + "grad_norm": 2.526036262512207, + "learning_rate": 0.00011567519193631955, + "loss": 2.1118, + "step": 17910 + }, + { + "epoch": 0.4220243982855259, + "grad_norm": 2.948558807373047, + "learning_rate": 0.00011562809099901088, + "loss": 2.2965, + "step": 17920 + }, + { + "epoch": 0.42225990297206917, + "grad_norm": 2.221510648727417, + "learning_rate": 0.00011558099006170225, + "loss": 2.0943, + "step": 17930 + }, + { + "epoch": 0.42249540765861243, + "grad_norm": 1.9982761144638062, + "learning_rate": 0.00011553388912439358, + "loss": 2.1344, + "step": 17940 + }, + { + "epoch": 0.4227309123451557, + "grad_norm": 1.816463589668274, + "learning_rate": 0.00011548678818708493, + "loss": 2.1454, + "step": 17950 + }, + { + "epoch": 0.4229664170316989, + "grad_norm": 1.9133336544036865, + "learning_rate": 0.00011543968724977627, + "loss": 2.2219, + "step": 17960 + }, + { + "epoch": 0.42320192171824217, + "grad_norm": 2.87747859954834, + "learning_rate": 0.00011539258631246763, + "loss": 2.2701, + "step": 17970 + }, + { + "epoch": 0.42343742640478543, + "grad_norm": 2.2570955753326416, + "learning_rate": 0.00011534548537515897, + "loss": 2.2363, + "step": 17980 + }, + { + "epoch": 0.4236729310913287, + "grad_norm": 2.1838762760162354, + "learning_rate": 0.00011529838443785032, + "loss": 2.1544, + "step": 17990 + }, + { + "epoch": 0.42390843577787196, + "grad_norm": 2.118103504180908, + "learning_rate": 0.00011525128350054166, + "loss": 2.2437, + "step": 18000 + }, + { + "epoch": 0.4241439404644152, + "grad_norm": 1.956613540649414, + "learning_rate": 0.00011520418256323302, + "loss": 2.0544, + "step": 18010 + }, + { + "epoch": 0.4243794451509585, + "grad_norm": 2.5147013664245605, + "learning_rate": 0.00011515708162592436, + "loss": 2.1949, + "step": 18020 + }, + { + "epoch": 0.42461494983750175, + "grad_norm": 1.6883703470230103, + "learning_rate": 0.00011510998068861571, + "loss": 2.1606, + "step": 18030 + }, + { + "epoch": 0.424850454524045, + "grad_norm": 3.019367218017578, + "learning_rate": 0.00011506287975130705, + "loss": 2.1466, + "step": 18040 + }, + { + "epoch": 0.4250859592105883, + "grad_norm": 2.5443851947784424, + "learning_rate": 0.00011501577881399841, + "loss": 2.0999, + "step": 18050 + }, + { + "epoch": 0.42532146389713155, + "grad_norm": 1.759558081626892, + "learning_rate": 0.00011496867787668975, + "loss": 2.0157, + "step": 18060 + }, + { + "epoch": 0.4255569685836748, + "grad_norm": 2.4723591804504395, + "learning_rate": 0.00011492157693938111, + "loss": 2.2185, + "step": 18070 + }, + { + "epoch": 0.4257924732702181, + "grad_norm": 2.276010274887085, + "learning_rate": 0.00011487447600207245, + "loss": 2.197, + "step": 18080 + }, + { + "epoch": 0.42602797795676134, + "grad_norm": 2.547868013381958, + "learning_rate": 0.0001148273750647638, + "loss": 2.0393, + "step": 18090 + }, + { + "epoch": 0.4262634826433046, + "grad_norm": 2.285099983215332, + "learning_rate": 0.00011478027412745513, + "loss": 1.9714, + "step": 18100 + }, + { + "epoch": 0.42649898732984787, + "grad_norm": 1.7757441997528076, + "learning_rate": 0.0001147331731901465, + "loss": 2.0319, + "step": 18110 + }, + { + "epoch": 0.42673449201639113, + "grad_norm": 2.2466018199920654, + "learning_rate": 0.00011468607225283783, + "loss": 2.0001, + "step": 18120 + }, + { + "epoch": 0.4269699967029344, + "grad_norm": 2.274280548095703, + "learning_rate": 0.00011463897131552918, + "loss": 2.0503, + "step": 18130 + }, + { + "epoch": 0.42720550138947766, + "grad_norm": 2.588524341583252, + "learning_rate": 0.00011459187037822052, + "loss": 2.3182, + "step": 18140 + }, + { + "epoch": 0.4274410060760209, + "grad_norm": 2.081986904144287, + "learning_rate": 0.00011454476944091188, + "loss": 2.1953, + "step": 18150 + }, + { + "epoch": 0.4276765107625642, + "grad_norm": 2.046691417694092, + "learning_rate": 0.00011449766850360322, + "loss": 2.1025, + "step": 18160 + }, + { + "epoch": 0.42791201544910745, + "grad_norm": 2.94069504737854, + "learning_rate": 0.00011445056756629457, + "loss": 2.2368, + "step": 18170 + }, + { + "epoch": 0.4281475201356507, + "grad_norm": 2.245175361633301, + "learning_rate": 0.00011440346662898591, + "loss": 2.1502, + "step": 18180 + }, + { + "epoch": 0.428383024822194, + "grad_norm": 2.029442310333252, + "learning_rate": 0.00011435636569167727, + "loss": 2.2961, + "step": 18190 + }, + { + "epoch": 0.42861852950873724, + "grad_norm": 2.311154842376709, + "learning_rate": 0.00011430926475436862, + "loss": 2.0729, + "step": 18200 + }, + { + "epoch": 0.4288540341952805, + "grad_norm": 2.393167018890381, + "learning_rate": 0.00011426216381705996, + "loss": 2.0608, + "step": 18210 + }, + { + "epoch": 0.4290895388818238, + "grad_norm": 2.606497049331665, + "learning_rate": 0.00011421506287975132, + "loss": 2.0255, + "step": 18220 + }, + { + "epoch": 0.42932504356836704, + "grad_norm": 1.9125550985336304, + "learning_rate": 0.00011416796194244266, + "loss": 2.0945, + "step": 18230 + }, + { + "epoch": 0.42956054825491025, + "grad_norm": 2.4020402431488037, + "learning_rate": 0.00011412086100513402, + "loss": 2.0022, + "step": 18240 + }, + { + "epoch": 0.4297960529414535, + "grad_norm": 1.5923528671264648, + "learning_rate": 0.00011407376006782535, + "loss": 2.0348, + "step": 18250 + }, + { + "epoch": 0.4300315576279968, + "grad_norm": 2.0022480487823486, + "learning_rate": 0.00011402665913051671, + "loss": 2.1634, + "step": 18260 + }, + { + "epoch": 0.43026706231454004, + "grad_norm": 2.2088112831115723, + "learning_rate": 0.00011397955819320805, + "loss": 2.2046, + "step": 18270 + }, + { + "epoch": 0.4305025670010833, + "grad_norm": 1.8067007064819336, + "learning_rate": 0.00011393245725589941, + "loss": 2.0242, + "step": 18280 + }, + { + "epoch": 0.43073807168762657, + "grad_norm": 2.725207567214966, + "learning_rate": 0.00011388535631859075, + "loss": 1.8446, + "step": 18290 + }, + { + "epoch": 0.43097357637416983, + "grad_norm": 3.046445608139038, + "learning_rate": 0.0001138382553812821, + "loss": 2.1722, + "step": 18300 + }, + { + "epoch": 0.4312090810607131, + "grad_norm": 2.3470003604888916, + "learning_rate": 0.00011379115444397343, + "loss": 2.0065, + "step": 18310 + }, + { + "epoch": 0.43144458574725636, + "grad_norm": 3.09189772605896, + "learning_rate": 0.0001137440535066648, + "loss": 1.9948, + "step": 18320 + }, + { + "epoch": 0.4316800904337996, + "grad_norm": 2.13586163520813, + "learning_rate": 0.00011369695256935613, + "loss": 1.9383, + "step": 18330 + }, + { + "epoch": 0.4319155951203429, + "grad_norm": 2.6252524852752686, + "learning_rate": 0.00011364985163204748, + "loss": 1.9324, + "step": 18340 + }, + { + "epoch": 0.43215109980688615, + "grad_norm": 2.855048179626465, + "learning_rate": 0.00011360275069473882, + "loss": 2.3409, + "step": 18350 + }, + { + "epoch": 0.4323866044934294, + "grad_norm": 2.435730457305908, + "learning_rate": 0.00011355564975743019, + "loss": 2.1404, + "step": 18360 + }, + { + "epoch": 0.4326221091799727, + "grad_norm": 2.411512613296509, + "learning_rate": 0.00011350854882012152, + "loss": 2.2621, + "step": 18370 + }, + { + "epoch": 0.43285761386651594, + "grad_norm": 2.974522590637207, + "learning_rate": 0.00011346144788281287, + "loss": 2.1951, + "step": 18380 + }, + { + "epoch": 0.4330931185530592, + "grad_norm": 1.7726364135742188, + "learning_rate": 0.00011341434694550421, + "loss": 1.8892, + "step": 18390 + }, + { + "epoch": 0.4333286232396025, + "grad_norm": 2.825831174850464, + "learning_rate": 0.00011336724600819557, + "loss": 2.1635, + "step": 18400 + }, + { + "epoch": 0.43356412792614574, + "grad_norm": 2.5960309505462646, + "learning_rate": 0.00011332014507088691, + "loss": 2.0728, + "step": 18410 + }, + { + "epoch": 0.433799632612689, + "grad_norm": 2.0427327156066895, + "learning_rate": 0.00011327304413357826, + "loss": 2.0275, + "step": 18420 + }, + { + "epoch": 0.43403513729923227, + "grad_norm": 2.6198508739471436, + "learning_rate": 0.0001132259431962696, + "loss": 2.0241, + "step": 18430 + }, + { + "epoch": 0.43427064198577553, + "grad_norm": 2.224815845489502, + "learning_rate": 0.00011317884225896096, + "loss": 2.0585, + "step": 18440 + }, + { + "epoch": 0.4345061466723188, + "grad_norm": 2.4373631477355957, + "learning_rate": 0.0001131317413216523, + "loss": 2.0333, + "step": 18450 + }, + { + "epoch": 0.43474165135886206, + "grad_norm": 1.711761713027954, + "learning_rate": 0.00011308464038434366, + "loss": 1.7788, + "step": 18460 + }, + { + "epoch": 0.4349771560454053, + "grad_norm": 3.0095267295837402, + "learning_rate": 0.00011303753944703501, + "loss": 2.2233, + "step": 18470 + }, + { + "epoch": 0.4352126607319486, + "grad_norm": 2.0507631301879883, + "learning_rate": 0.00011299043850972635, + "loss": 2.019, + "step": 18480 + }, + { + "epoch": 0.43544816541849185, + "grad_norm": 2.733670711517334, + "learning_rate": 0.00011294333757241771, + "loss": 2.074, + "step": 18490 + }, + { + "epoch": 0.4356836701050351, + "grad_norm": 2.1250648498535156, + "learning_rate": 0.00011289623663510905, + "loss": 2.0513, + "step": 18500 + }, + { + "epoch": 0.4359191747915784, + "grad_norm": 3.0923218727111816, + "learning_rate": 0.0001128491356978004, + "loss": 2.3304, + "step": 18510 + }, + { + "epoch": 0.4361546794781216, + "grad_norm": 3.0069260597229004, + "learning_rate": 0.00011280203476049173, + "loss": 2.2824, + "step": 18520 + }, + { + "epoch": 0.43639018416466485, + "grad_norm": 2.1305959224700928, + "learning_rate": 0.0001127549338231831, + "loss": 2.0499, + "step": 18530 + }, + { + "epoch": 0.4366256888512081, + "grad_norm": 2.1745736598968506, + "learning_rate": 0.00011270783288587444, + "loss": 1.9903, + "step": 18540 + }, + { + "epoch": 0.4368611935377514, + "grad_norm": 2.113219738006592, + "learning_rate": 0.00011266073194856579, + "loss": 2.1466, + "step": 18550 + }, + { + "epoch": 0.43709669822429464, + "grad_norm": 2.403501510620117, + "learning_rate": 0.00011261363101125712, + "loss": 2.1115, + "step": 18560 + }, + { + "epoch": 0.4373322029108379, + "grad_norm": 2.2626802921295166, + "learning_rate": 0.00011256653007394849, + "loss": 2.089, + "step": 18570 + }, + { + "epoch": 0.4375677075973812, + "grad_norm": 2.3562400341033936, + "learning_rate": 0.00011251942913663982, + "loss": 2.3542, + "step": 18580 + }, + { + "epoch": 0.43780321228392444, + "grad_norm": 1.9294075965881348, + "learning_rate": 0.00011247232819933117, + "loss": 2.1626, + "step": 18590 + }, + { + "epoch": 0.4380387169704677, + "grad_norm": 2.3301680088043213, + "learning_rate": 0.00011242522726202251, + "loss": 2.047, + "step": 18600 + }, + { + "epoch": 0.43827422165701097, + "grad_norm": 2.25012469291687, + "learning_rate": 0.00011237812632471387, + "loss": 2.1893, + "step": 18610 + }, + { + "epoch": 0.43850972634355423, + "grad_norm": 2.4669597148895264, + "learning_rate": 0.00011233102538740521, + "loss": 1.9656, + "step": 18620 + }, + { + "epoch": 0.4387452310300975, + "grad_norm": 2.1987082958221436, + "learning_rate": 0.00011228392445009656, + "loss": 2.1176, + "step": 18630 + }, + { + "epoch": 0.43898073571664076, + "grad_norm": 2.573086738586426, + "learning_rate": 0.0001122368235127879, + "loss": 1.9486, + "step": 18640 + }, + { + "epoch": 0.439216240403184, + "grad_norm": 2.510371446609497, + "learning_rate": 0.00011218972257547926, + "loss": 2.2401, + "step": 18650 + }, + { + "epoch": 0.4394517450897273, + "grad_norm": 2.1042065620422363, + "learning_rate": 0.0001121426216381706, + "loss": 1.9531, + "step": 18660 + }, + { + "epoch": 0.43968724977627055, + "grad_norm": 2.6314985752105713, + "learning_rate": 0.00011209552070086196, + "loss": 2.126, + "step": 18670 + }, + { + "epoch": 0.4399227544628138, + "grad_norm": 1.911363959312439, + "learning_rate": 0.0001120484197635533, + "loss": 2.2783, + "step": 18680 + }, + { + "epoch": 0.4401582591493571, + "grad_norm": 3.6765568256378174, + "learning_rate": 0.00011200131882624465, + "loss": 2.0344, + "step": 18690 + }, + { + "epoch": 0.44039376383590034, + "grad_norm": 1.8842840194702148, + "learning_rate": 0.00011195421788893599, + "loss": 2.0042, + "step": 18700 + }, + { + "epoch": 0.4406292685224436, + "grad_norm": 2.389233112335205, + "learning_rate": 0.00011190711695162735, + "loss": 2.0097, + "step": 18710 + }, + { + "epoch": 0.44086477320898687, + "grad_norm": 1.9120182991027832, + "learning_rate": 0.00011186001601431869, + "loss": 1.9488, + "step": 18720 + }, + { + "epoch": 0.44110027789553014, + "grad_norm": 3.073310613632202, + "learning_rate": 0.00011181291507701004, + "loss": 1.8932, + "step": 18730 + }, + { + "epoch": 0.4413357825820734, + "grad_norm": 2.4336462020874023, + "learning_rate": 0.00011176581413970137, + "loss": 2.028, + "step": 18740 + }, + { + "epoch": 0.44157128726861666, + "grad_norm": 2.4928979873657227, + "learning_rate": 0.00011171871320239274, + "loss": 2.0821, + "step": 18750 + }, + { + "epoch": 0.4418067919551599, + "grad_norm": 2.694518804550171, + "learning_rate": 0.00011167161226508409, + "loss": 2.0103, + "step": 18760 + }, + { + "epoch": 0.4420422966417032, + "grad_norm": 1.9793258905410767, + "learning_rate": 0.00011162451132777542, + "loss": 2.0106, + "step": 18770 + }, + { + "epoch": 0.44227780132824646, + "grad_norm": 3.150414228439331, + "learning_rate": 0.00011157741039046679, + "loss": 2.2266, + "step": 18780 + }, + { + "epoch": 0.4425133060147897, + "grad_norm": 2.1798579692840576, + "learning_rate": 0.00011153030945315812, + "loss": 2.1064, + "step": 18790 + }, + { + "epoch": 0.44274881070133293, + "grad_norm": 2.7977168560028076, + "learning_rate": 0.00011148320851584947, + "loss": 2.179, + "step": 18800 + }, + { + "epoch": 0.4429843153878762, + "grad_norm": 1.93820059299469, + "learning_rate": 0.00011143610757854081, + "loss": 2.2933, + "step": 18810 + }, + { + "epoch": 0.44321982007441946, + "grad_norm": 2.5885021686553955, + "learning_rate": 0.00011138900664123217, + "loss": 2.1091, + "step": 18820 + }, + { + "epoch": 0.4434553247609627, + "grad_norm": 2.002413034439087, + "learning_rate": 0.00011134190570392351, + "loss": 1.8793, + "step": 18830 + }, + { + "epoch": 0.443690829447506, + "grad_norm": 2.016279697418213, + "learning_rate": 0.00011129480476661487, + "loss": 1.9843, + "step": 18840 + }, + { + "epoch": 0.44392633413404925, + "grad_norm": 1.719465732574463, + "learning_rate": 0.0001112477038293062, + "loss": 2.0675, + "step": 18850 + }, + { + "epoch": 0.4441618388205925, + "grad_norm": 2.9975037574768066, + "learning_rate": 0.00011120060289199756, + "loss": 1.9329, + "step": 18860 + }, + { + "epoch": 0.4443973435071358, + "grad_norm": 2.3108158111572266, + "learning_rate": 0.0001111535019546889, + "loss": 2.1686, + "step": 18870 + }, + { + "epoch": 0.44463284819367904, + "grad_norm": 2.196322202682495, + "learning_rate": 0.00011110640101738026, + "loss": 2.1436, + "step": 18880 + }, + { + "epoch": 0.4448683528802223, + "grad_norm": 2.0889739990234375, + "learning_rate": 0.0001110593000800716, + "loss": 2.1503, + "step": 18890 + }, + { + "epoch": 0.44510385756676557, + "grad_norm": 1.8750911951065063, + "learning_rate": 0.00011101219914276295, + "loss": 2.1539, + "step": 18900 + }, + { + "epoch": 0.44533936225330883, + "grad_norm": 2.5251245498657227, + "learning_rate": 0.00011096509820545429, + "loss": 2.0683, + "step": 18910 + }, + { + "epoch": 0.4455748669398521, + "grad_norm": 2.4985053539276123, + "learning_rate": 0.00011091799726814565, + "loss": 2.0526, + "step": 18920 + }, + { + "epoch": 0.44581037162639536, + "grad_norm": 2.0587680339813232, + "learning_rate": 0.00011087089633083699, + "loss": 2.1422, + "step": 18930 + }, + { + "epoch": 0.4460458763129386, + "grad_norm": 2.103192090988159, + "learning_rate": 0.00011082379539352834, + "loss": 1.9882, + "step": 18940 + }, + { + "epoch": 0.4462813809994819, + "grad_norm": 2.437361717224121, + "learning_rate": 0.00011077669445621967, + "loss": 2.1671, + "step": 18950 + }, + { + "epoch": 0.44651688568602516, + "grad_norm": 2.0805160999298096, + "learning_rate": 0.00011072959351891104, + "loss": 2.0893, + "step": 18960 + }, + { + "epoch": 0.4467523903725684, + "grad_norm": 2.183110475540161, + "learning_rate": 0.00011068249258160237, + "loss": 1.9785, + "step": 18970 + }, + { + "epoch": 0.4469878950591117, + "grad_norm": 3.0522234439849854, + "learning_rate": 0.00011063539164429372, + "loss": 2.1078, + "step": 18980 + }, + { + "epoch": 0.44722339974565495, + "grad_norm": 2.6682941913604736, + "learning_rate": 0.00011058829070698506, + "loss": 2.1685, + "step": 18990 + }, + { + "epoch": 0.4474589044321982, + "grad_norm": 2.421186685562134, + "learning_rate": 0.00011054118976967642, + "loss": 2.1228, + "step": 19000 + }, + { + "epoch": 0.4476944091187415, + "grad_norm": 1.8149057626724243, + "learning_rate": 0.00011049408883236776, + "loss": 2.1178, + "step": 19010 + }, + { + "epoch": 0.44792991380528474, + "grad_norm": 2.045279026031494, + "learning_rate": 0.00011044698789505911, + "loss": 1.934, + "step": 19020 + }, + { + "epoch": 0.448165418491828, + "grad_norm": 2.152418375015259, + "learning_rate": 0.00011039988695775045, + "loss": 2.0527, + "step": 19030 + }, + { + "epoch": 0.44840092317837127, + "grad_norm": 2.6455652713775635, + "learning_rate": 0.00011035278602044181, + "loss": 1.9045, + "step": 19040 + }, + { + "epoch": 0.44863642786491453, + "grad_norm": 1.9313437938690186, + "learning_rate": 0.00011030568508313318, + "loss": 2.0172, + "step": 19050 + }, + { + "epoch": 0.4488719325514578, + "grad_norm": 2.253701686859131, + "learning_rate": 0.00011025858414582451, + "loss": 2.164, + "step": 19060 + }, + { + "epoch": 0.44910743723800106, + "grad_norm": 1.9324719905853271, + "learning_rate": 0.00011021148320851586, + "loss": 2.333, + "step": 19070 + }, + { + "epoch": 0.4493429419245443, + "grad_norm": 1.903435230255127, + "learning_rate": 0.0001101643822712072, + "loss": 2.0204, + "step": 19080 + }, + { + "epoch": 0.44957844661108753, + "grad_norm": 2.1830732822418213, + "learning_rate": 0.00011011728133389856, + "loss": 2.2265, + "step": 19090 + }, + { + "epoch": 0.4498139512976308, + "grad_norm": 2.450608253479004, + "learning_rate": 0.0001100701803965899, + "loss": 2.047, + "step": 19100 + }, + { + "epoch": 0.45004945598417406, + "grad_norm": 2.1484196186065674, + "learning_rate": 0.00011002307945928125, + "loss": 1.9849, + "step": 19110 + }, + { + "epoch": 0.4502849606707173, + "grad_norm": 2.1579840183258057, + "learning_rate": 0.00010997597852197259, + "loss": 2.1648, + "step": 19120 + }, + { + "epoch": 0.4505204653572606, + "grad_norm": 2.8706860542297363, + "learning_rate": 0.00010992887758466395, + "loss": 1.9726, + "step": 19130 + }, + { + "epoch": 0.45075597004380386, + "grad_norm": 2.238607406616211, + "learning_rate": 0.00010988177664735529, + "loss": 2.056, + "step": 19140 + }, + { + "epoch": 0.4509914747303471, + "grad_norm": 1.9871670007705688, + "learning_rate": 0.00010983467571004664, + "loss": 2.1684, + "step": 19150 + }, + { + "epoch": 0.4512269794168904, + "grad_norm": 2.0434229373931885, + "learning_rate": 0.00010978757477273797, + "loss": 2.1818, + "step": 19160 + }, + { + "epoch": 0.45146248410343365, + "grad_norm": 1.8030943870544434, + "learning_rate": 0.00010974047383542934, + "loss": 1.9667, + "step": 19170 + }, + { + "epoch": 0.4516979887899769, + "grad_norm": 2.399859666824341, + "learning_rate": 0.00010969337289812067, + "loss": 1.9836, + "step": 19180 + }, + { + "epoch": 0.4519334934765202, + "grad_norm": 2.7550058364868164, + "learning_rate": 0.00010964627196081203, + "loss": 2.1151, + "step": 19190 + }, + { + "epoch": 0.45216899816306344, + "grad_norm": 2.3162660598754883, + "learning_rate": 0.00010959917102350336, + "loss": 2.0955, + "step": 19200 + }, + { + "epoch": 0.4524045028496067, + "grad_norm": 1.7464721202850342, + "learning_rate": 0.00010955678017992559, + "loss": 1.9926, + "step": 19210 + }, + { + "epoch": 0.45264000753614997, + "grad_norm": 3.1640796661376953, + "learning_rate": 0.00010950967924261693, + "loss": 2.0717, + "step": 19220 + }, + { + "epoch": 0.45287551222269323, + "grad_norm": 1.9667413234710693, + "learning_rate": 0.00010946257830530828, + "loss": 2.1224, + "step": 19230 + }, + { + "epoch": 0.4531110169092365, + "grad_norm": 2.176348924636841, + "learning_rate": 0.00010941547736799962, + "loss": 2.0679, + "step": 19240 + }, + { + "epoch": 0.45334652159577976, + "grad_norm": 2.226438045501709, + "learning_rate": 0.00010936837643069098, + "loss": 2.1582, + "step": 19250 + }, + { + "epoch": 0.453582026282323, + "grad_norm": 2.002314805984497, + "learning_rate": 0.00010932127549338232, + "loss": 2.0523, + "step": 19260 + }, + { + "epoch": 0.4538175309688663, + "grad_norm": 3.0285584926605225, + "learning_rate": 0.00010927417455607367, + "loss": 2.1907, + "step": 19270 + }, + { + "epoch": 0.45405303565540955, + "grad_norm": 3.506699323654175, + "learning_rate": 0.000109227073618765, + "loss": 2.1037, + "step": 19280 + }, + { + "epoch": 0.4542885403419528, + "grad_norm": 2.3388898372650146, + "learning_rate": 0.00010917997268145637, + "loss": 2.0536, + "step": 19290 + }, + { + "epoch": 0.4545240450284961, + "grad_norm": 1.9860022068023682, + "learning_rate": 0.00010913287174414772, + "loss": 2.1253, + "step": 19300 + }, + { + "epoch": 0.45475954971503935, + "grad_norm": 2.931128740310669, + "learning_rate": 0.00010908577080683905, + "loss": 2.1019, + "step": 19310 + }, + { + "epoch": 0.4549950544015826, + "grad_norm": 1.9610087871551514, + "learning_rate": 0.00010903866986953042, + "loss": 2.114, + "step": 19320 + }, + { + "epoch": 0.4552305590881259, + "grad_norm": 2.159877300262451, + "learning_rate": 0.00010899156893222176, + "loss": 1.8286, + "step": 19330 + }, + { + "epoch": 0.45546606377466914, + "grad_norm": 1.8019903898239136, + "learning_rate": 0.00010894446799491312, + "loss": 2.1, + "step": 19340 + }, + { + "epoch": 0.4557015684612124, + "grad_norm": 2.368715524673462, + "learning_rate": 0.00010889736705760444, + "loss": 2.1636, + "step": 19350 + }, + { + "epoch": 0.45593707314775567, + "grad_norm": 2.2068965435028076, + "learning_rate": 0.0001088502661202958, + "loss": 2.1377, + "step": 19360 + }, + { + "epoch": 0.4561725778342989, + "grad_norm": 2.2656490802764893, + "learning_rate": 0.00010880316518298714, + "loss": 2.1331, + "step": 19370 + }, + { + "epoch": 0.45640808252084214, + "grad_norm": 1.6999800205230713, + "learning_rate": 0.0001087560642456785, + "loss": 1.8618, + "step": 19380 + }, + { + "epoch": 0.4566435872073854, + "grad_norm": 2.0297725200653076, + "learning_rate": 0.00010870896330836984, + "loss": 2.0886, + "step": 19390 + }, + { + "epoch": 0.45687909189392867, + "grad_norm": 2.10459566116333, + "learning_rate": 0.0001086618623710612, + "loss": 2.0182, + "step": 19400 + }, + { + "epoch": 0.45711459658047193, + "grad_norm": 2.357388496398926, + "learning_rate": 0.00010861476143375253, + "loss": 1.9764, + "step": 19410 + }, + { + "epoch": 0.4573501012670152, + "grad_norm": 2.0262765884399414, + "learning_rate": 0.0001085676604964439, + "loss": 2.3218, + "step": 19420 + }, + { + "epoch": 0.45758560595355846, + "grad_norm": 3.8287534713745117, + "learning_rate": 0.00010852055955913523, + "loss": 2.1722, + "step": 19430 + }, + { + "epoch": 0.4578211106401017, + "grad_norm": 2.3780245780944824, + "learning_rate": 0.00010847345862182658, + "loss": 2.0486, + "step": 19440 + }, + { + "epoch": 0.458056615326645, + "grad_norm": 2.3219287395477295, + "learning_rate": 0.00010842635768451792, + "loss": 2.2132, + "step": 19450 + }, + { + "epoch": 0.45829212001318825, + "grad_norm": 2.278029680252075, + "learning_rate": 0.00010837925674720928, + "loss": 1.9851, + "step": 19460 + }, + { + "epoch": 0.4585276246997315, + "grad_norm": 2.079396963119507, + "learning_rate": 0.00010833215580990062, + "loss": 2.2242, + "step": 19470 + }, + { + "epoch": 0.4587631293862748, + "grad_norm": 2.292618751525879, + "learning_rate": 0.00010828505487259197, + "loss": 1.912, + "step": 19480 + }, + { + "epoch": 0.45899863407281805, + "grad_norm": 2.677804708480835, + "learning_rate": 0.0001082379539352833, + "loss": 2.1219, + "step": 19490 + }, + { + "epoch": 0.4592341387593613, + "grad_norm": 2.149505138397217, + "learning_rate": 0.00010819085299797467, + "loss": 2.0128, + "step": 19500 + }, + { + "epoch": 0.4594696434459046, + "grad_norm": 2.596064329147339, + "learning_rate": 0.000108143752060666, + "loss": 2.1132, + "step": 19510 + }, + { + "epoch": 0.45970514813244784, + "grad_norm": 2.409451484680176, + "learning_rate": 0.00010809665112335736, + "loss": 1.9036, + "step": 19520 + }, + { + "epoch": 0.4599406528189911, + "grad_norm": 2.5537643432617188, + "learning_rate": 0.00010804955018604869, + "loss": 1.9008, + "step": 19530 + }, + { + "epoch": 0.46017615750553437, + "grad_norm": 1.7899531126022339, + "learning_rate": 0.00010800244924874006, + "loss": 1.829, + "step": 19540 + }, + { + "epoch": 0.46041166219207763, + "grad_norm": 1.7132242918014526, + "learning_rate": 0.00010795534831143139, + "loss": 2.1235, + "step": 19550 + }, + { + "epoch": 0.4606471668786209, + "grad_norm": 2.0346591472625732, + "learning_rate": 0.00010790824737412276, + "loss": 2.0902, + "step": 19560 + }, + { + "epoch": 0.46088267156516416, + "grad_norm": 3.4156370162963867, + "learning_rate": 0.00010786114643681408, + "loss": 2.2832, + "step": 19570 + }, + { + "epoch": 0.4611181762517074, + "grad_norm": 2.067667007446289, + "learning_rate": 0.00010781404549950544, + "loss": 2.1939, + "step": 19580 + }, + { + "epoch": 0.4613536809382507, + "grad_norm": 1.9971282482147217, + "learning_rate": 0.00010776694456219681, + "loss": 2.1049, + "step": 19590 + }, + { + "epoch": 0.46158918562479395, + "grad_norm": 2.331735849380493, + "learning_rate": 0.00010771984362488814, + "loss": 2.3262, + "step": 19600 + }, + { + "epoch": 0.4618246903113372, + "grad_norm": 2.4964492321014404, + "learning_rate": 0.0001076727426875795, + "loss": 1.8584, + "step": 19610 + }, + { + "epoch": 0.4620601949978805, + "grad_norm": 2.145075559616089, + "learning_rate": 0.00010762564175027083, + "loss": 2.1807, + "step": 19620 + }, + { + "epoch": 0.46229569968442374, + "grad_norm": 2.1231913566589355, + "learning_rate": 0.00010758325090669306, + "loss": 2.0905, + "step": 19630 + }, + { + "epoch": 0.462531204370967, + "grad_norm": 2.6678659915924072, + "learning_rate": 0.00010753614996938439, + "loss": 1.9073, + "step": 19640 + }, + { + "epoch": 0.4627667090575102, + "grad_norm": 1.718123435974121, + "learning_rate": 0.00010748904903207575, + "loss": 2.1715, + "step": 19650 + }, + { + "epoch": 0.4630022137440535, + "grad_norm": 2.190896511077881, + "learning_rate": 0.00010744194809476709, + "loss": 2.1435, + "step": 19660 + }, + { + "epoch": 0.46323771843059675, + "grad_norm": 2.8094241619110107, + "learning_rate": 0.00010739484715745845, + "loss": 1.9762, + "step": 19670 + }, + { + "epoch": 0.46347322311714, + "grad_norm": 1.8374676704406738, + "learning_rate": 0.00010734774622014979, + "loss": 2.0374, + "step": 19680 + }, + { + "epoch": 0.4637087278036833, + "grad_norm": 2.1029560565948486, + "learning_rate": 0.00010730064528284114, + "loss": 2.3126, + "step": 19690 + }, + { + "epoch": 0.46394423249022654, + "grad_norm": 2.3267154693603516, + "learning_rate": 0.00010725354434553247, + "loss": 2.0234, + "step": 19700 + }, + { + "epoch": 0.4641797371767698, + "grad_norm": 2.216888189315796, + "learning_rate": 0.00010720644340822384, + "loss": 2.0581, + "step": 19710 + }, + { + "epoch": 0.46441524186331307, + "grad_norm": 2.1676642894744873, + "learning_rate": 0.00010715934247091517, + "loss": 2.0509, + "step": 19720 + }, + { + "epoch": 0.46465074654985633, + "grad_norm": 2.206120014190674, + "learning_rate": 0.00010711224153360652, + "loss": 2.0382, + "step": 19730 + }, + { + "epoch": 0.4648862512363996, + "grad_norm": 2.1404688358306885, + "learning_rate": 0.00010706514059629786, + "loss": 2.086, + "step": 19740 + }, + { + "epoch": 0.46512175592294286, + "grad_norm": 2.2367289066314697, + "learning_rate": 0.00010701803965898922, + "loss": 2.2163, + "step": 19750 + }, + { + "epoch": 0.4653572606094861, + "grad_norm": 1.8468304872512817, + "learning_rate": 0.00010697093872168056, + "loss": 2.1602, + "step": 19760 + }, + { + "epoch": 0.4655927652960294, + "grad_norm": 2.4168541431427, + "learning_rate": 0.00010692383778437191, + "loss": 2.0497, + "step": 19770 + }, + { + "epoch": 0.46582826998257265, + "grad_norm": 1.9554232358932495, + "learning_rate": 0.00010687673684706325, + "loss": 2.1027, + "step": 19780 + }, + { + "epoch": 0.4660637746691159, + "grad_norm": 2.9504382610321045, + "learning_rate": 0.00010682963590975461, + "loss": 1.9, + "step": 19790 + }, + { + "epoch": 0.4662992793556592, + "grad_norm": 3.071281671524048, + "learning_rate": 0.00010678253497244595, + "loss": 2.0025, + "step": 19800 + }, + { + "epoch": 0.46653478404220244, + "grad_norm": 2.1430885791778564, + "learning_rate": 0.0001067354340351373, + "loss": 1.8642, + "step": 19810 + }, + { + "epoch": 0.4667702887287457, + "grad_norm": 1.6342874765396118, + "learning_rate": 0.00010668833309782864, + "loss": 2.0941, + "step": 19820 + }, + { + "epoch": 0.467005793415289, + "grad_norm": 1.8183525800704956, + "learning_rate": 0.00010664123216052, + "loss": 2.0612, + "step": 19830 + }, + { + "epoch": 0.46724129810183224, + "grad_norm": 2.2085859775543213, + "learning_rate": 0.00010659413122321136, + "loss": 2.0461, + "step": 19840 + }, + { + "epoch": 0.4674768027883755, + "grad_norm": 2.12282395362854, + "learning_rate": 0.0001065470302859027, + "loss": 2.0324, + "step": 19850 + }, + { + "epoch": 0.46771230747491876, + "grad_norm": 2.824951648712158, + "learning_rate": 0.00010649992934859405, + "loss": 1.9978, + "step": 19860 + }, + { + "epoch": 0.46794781216146203, + "grad_norm": 1.9392645359039307, + "learning_rate": 0.00010645282841128539, + "loss": 2.2445, + "step": 19870 + }, + { + "epoch": 0.4681833168480053, + "grad_norm": 2.1611478328704834, + "learning_rate": 0.00010640572747397675, + "loss": 2.1556, + "step": 19880 + }, + { + "epoch": 0.46841882153454856, + "grad_norm": 2.281257390975952, + "learning_rate": 0.00010635862653666809, + "loss": 2.0552, + "step": 19890 + }, + { + "epoch": 0.4686543262210918, + "grad_norm": 2.2869932651519775, + "learning_rate": 0.00010631152559935944, + "loss": 2.1354, + "step": 19900 + }, + { + "epoch": 0.4688898309076351, + "grad_norm": 2.583472967147827, + "learning_rate": 0.00010626442466205077, + "loss": 2.0894, + "step": 19910 + }, + { + "epoch": 0.46912533559417835, + "grad_norm": 1.9615871906280518, + "learning_rate": 0.00010621732372474214, + "loss": 2.0168, + "step": 19920 + }, + { + "epoch": 0.4693608402807216, + "grad_norm": 2.0687925815582275, + "learning_rate": 0.00010617022278743348, + "loss": 1.9977, + "step": 19930 + }, + { + "epoch": 0.4695963449672648, + "grad_norm": 1.8753052949905396, + "learning_rate": 0.00010612312185012483, + "loss": 1.9732, + "step": 19940 + }, + { + "epoch": 0.4698318496538081, + "grad_norm": 2.3094046115875244, + "learning_rate": 0.00010607602091281616, + "loss": 2.1751, + "step": 19950 + }, + { + "epoch": 0.47006735434035135, + "grad_norm": 1.8566091060638428, + "learning_rate": 0.00010602891997550753, + "loss": 2.097, + "step": 19960 + }, + { + "epoch": 0.4703028590268946, + "grad_norm": 2.8540568351745605, + "learning_rate": 0.00010598181903819886, + "loss": 2.0623, + "step": 19970 + }, + { + "epoch": 0.4705383637134379, + "grad_norm": 2.5797126293182373, + "learning_rate": 0.00010593471810089021, + "loss": 2.0501, + "step": 19980 + }, + { + "epoch": 0.47077386839998114, + "grad_norm": 2.1992506980895996, + "learning_rate": 0.00010588761716358155, + "loss": 1.9637, + "step": 19990 + }, + { + "epoch": 0.4710093730865244, + "grad_norm": 1.7316781282424927, + "learning_rate": 0.00010584051622627291, + "loss": 2.156, + "step": 20000 + }, + { + "epoch": 0.47124487777306767, + "grad_norm": 2.292141914367676, + "learning_rate": 0.00010579341528896425, + "loss": 2.1193, + "step": 20010 + }, + { + "epoch": 0.47148038245961094, + "grad_norm": 2.6842830181121826, + "learning_rate": 0.0001057463143516556, + "loss": 2.0507, + "step": 20020 + }, + { + "epoch": 0.4717158871461542, + "grad_norm": 1.8671191930770874, + "learning_rate": 0.00010569921341434694, + "loss": 1.8824, + "step": 20030 + }, + { + "epoch": 0.47195139183269746, + "grad_norm": 2.062864303588867, + "learning_rate": 0.0001056521124770383, + "loss": 2.1032, + "step": 20040 + }, + { + "epoch": 0.47218689651924073, + "grad_norm": 1.6221140623092651, + "learning_rate": 0.00010560501153972964, + "loss": 2.2215, + "step": 20050 + }, + { + "epoch": 0.472422401205784, + "grad_norm": 3.1304547786712646, + "learning_rate": 0.000105557910602421, + "loss": 2.106, + "step": 20060 + }, + { + "epoch": 0.47265790589232726, + "grad_norm": 2.085540533065796, + "learning_rate": 0.00010551080966511234, + "loss": 2.1274, + "step": 20070 + }, + { + "epoch": 0.4728934105788705, + "grad_norm": 1.9874452352523804, + "learning_rate": 0.00010546370872780369, + "loss": 1.8995, + "step": 20080 + }, + { + "epoch": 0.4731289152654138, + "grad_norm": 2.646638870239258, + "learning_rate": 0.00010541660779049502, + "loss": 2.1751, + "step": 20090 + }, + { + "epoch": 0.47336441995195705, + "grad_norm": 2.6359567642211914, + "learning_rate": 0.00010536950685318639, + "loss": 1.9397, + "step": 20100 + }, + { + "epoch": 0.4735999246385003, + "grad_norm": 2.21799635887146, + "learning_rate": 0.00010532240591587774, + "loss": 2.1049, + "step": 20110 + }, + { + "epoch": 0.4738354293250436, + "grad_norm": 2.5067179203033447, + "learning_rate": 0.00010527530497856908, + "loss": 2.174, + "step": 20120 + }, + { + "epoch": 0.47407093401158684, + "grad_norm": 2.724372148513794, + "learning_rate": 0.00010522820404126044, + "loss": 2.1434, + "step": 20130 + }, + { + "epoch": 0.4743064386981301, + "grad_norm": 2.247986078262329, + "learning_rate": 0.00010518110310395178, + "loss": 2.0298, + "step": 20140 + }, + { + "epoch": 0.47454194338467337, + "grad_norm": 1.7842614650726318, + "learning_rate": 0.00010513400216664313, + "loss": 2.0235, + "step": 20150 + }, + { + "epoch": 0.47477744807121663, + "grad_norm": 2.239865303039551, + "learning_rate": 0.00010508690122933446, + "loss": 2.0488, + "step": 20160 + }, + { + "epoch": 0.4750129527577599, + "grad_norm": 1.8381290435791016, + "learning_rate": 0.00010503980029202583, + "loss": 2.137, + "step": 20170 + }, + { + "epoch": 0.47524845744430316, + "grad_norm": 1.837793231010437, + "learning_rate": 0.00010499269935471716, + "loss": 1.7239, + "step": 20180 + }, + { + "epoch": 0.4754839621308464, + "grad_norm": 2.2856781482696533, + "learning_rate": 0.00010494559841740851, + "loss": 2.0022, + "step": 20190 + }, + { + "epoch": 0.4757194668173897, + "grad_norm": 2.448535919189453, + "learning_rate": 0.00010489849748009985, + "loss": 2.191, + "step": 20200 + }, + { + "epoch": 0.47595497150393296, + "grad_norm": 2.522270679473877, + "learning_rate": 0.00010485139654279121, + "loss": 2.1016, + "step": 20210 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 2.3444366455078125, + "learning_rate": 0.00010480429560548255, + "loss": 2.1629, + "step": 20220 + }, + { + "epoch": 0.47642598087701943, + "grad_norm": 1.8811753988265991, + "learning_rate": 0.00010475719466817391, + "loss": 1.9501, + "step": 20230 + }, + { + "epoch": 0.4766614855635627, + "grad_norm": 2.2614917755126953, + "learning_rate": 0.00010471009373086524, + "loss": 1.9971, + "step": 20240 + }, + { + "epoch": 0.47689699025010596, + "grad_norm": 2.2525382041931152, + "learning_rate": 0.0001046629927935566, + "loss": 2.0558, + "step": 20250 + }, + { + "epoch": 0.4771324949366492, + "grad_norm": 3.0018701553344727, + "learning_rate": 0.00010461589185624794, + "loss": 2.1558, + "step": 20260 + }, + { + "epoch": 0.4773679996231925, + "grad_norm": 2.163959264755249, + "learning_rate": 0.0001045687909189393, + "loss": 2.0081, + "step": 20270 + }, + { + "epoch": 0.47760350430973575, + "grad_norm": 1.9860162734985352, + "learning_rate": 0.00010452168998163064, + "loss": 2.0474, + "step": 20280 + }, + { + "epoch": 0.477839008996279, + "grad_norm": 2.7075717449188232, + "learning_rate": 0.00010447458904432199, + "loss": 2.0118, + "step": 20290 + }, + { + "epoch": 0.4780745136828223, + "grad_norm": 2.2994916439056396, + "learning_rate": 0.00010442748810701333, + "loss": 1.9586, + "step": 20300 + }, + { + "epoch": 0.47831001836936554, + "grad_norm": 1.9415532350540161, + "learning_rate": 0.00010438038716970469, + "loss": 2.1362, + "step": 20310 + }, + { + "epoch": 0.4785455230559088, + "grad_norm": 2.319530725479126, + "learning_rate": 0.00010433328623239603, + "loss": 1.8642, + "step": 20320 + }, + { + "epoch": 0.47878102774245207, + "grad_norm": 2.8633079528808594, + "learning_rate": 0.00010428618529508738, + "loss": 2.2701, + "step": 20330 + }, + { + "epoch": 0.47901653242899533, + "grad_norm": 3.3309438228607178, + "learning_rate": 0.00010423908435777871, + "loss": 2.0667, + "step": 20340 + }, + { + "epoch": 0.4792520371155386, + "grad_norm": 1.8573964834213257, + "learning_rate": 0.00010419198342047008, + "loss": 2.1726, + "step": 20350 + }, + { + "epoch": 0.47948754180208186, + "grad_norm": 2.2213289737701416, + "learning_rate": 0.00010414488248316141, + "loss": 2.1585, + "step": 20360 + }, + { + "epoch": 0.4797230464886251, + "grad_norm": 2.0454204082489014, + "learning_rate": 0.00010409778154585276, + "loss": 2.1965, + "step": 20370 + }, + { + "epoch": 0.4799585511751684, + "grad_norm": 2.0503110885620117, + "learning_rate": 0.0001040506806085441, + "loss": 2.0648, + "step": 20380 + }, + { + "epoch": 0.48019405586171166, + "grad_norm": 2.948557138442993, + "learning_rate": 0.00010400357967123546, + "loss": 1.9269, + "step": 20390 + }, + { + "epoch": 0.4804295605482549, + "grad_norm": 2.1079161167144775, + "learning_rate": 0.00010395647873392681, + "loss": 1.9769, + "step": 20400 + }, + { + "epoch": 0.4806650652347982, + "grad_norm": 2.086078643798828, + "learning_rate": 0.00010390937779661815, + "loss": 2.0752, + "step": 20410 + }, + { + "epoch": 0.48090056992134145, + "grad_norm": 1.7482393980026245, + "learning_rate": 0.00010386227685930952, + "loss": 1.9872, + "step": 20420 + }, + { + "epoch": 0.4811360746078847, + "grad_norm": 2.2785568237304688, + "learning_rate": 0.00010381517592200085, + "loss": 2.1667, + "step": 20430 + }, + { + "epoch": 0.481371579294428, + "grad_norm": 2.2354800701141357, + "learning_rate": 0.00010376807498469222, + "loss": 2.0142, + "step": 20440 + }, + { + "epoch": 0.48160708398097124, + "grad_norm": 2.2600879669189453, + "learning_rate": 0.00010372097404738355, + "loss": 2.1101, + "step": 20450 + }, + { + "epoch": 0.4818425886675145, + "grad_norm": 2.2491447925567627, + "learning_rate": 0.0001036738731100749, + "loss": 2.0255, + "step": 20460 + }, + { + "epoch": 0.48207809335405777, + "grad_norm": 3.314981698989868, + "learning_rate": 0.00010362677217276624, + "loss": 1.8906, + "step": 20470 + }, + { + "epoch": 0.48231359804060103, + "grad_norm": 2.204580783843994, + "learning_rate": 0.0001035796712354576, + "loss": 2.235, + "step": 20480 + }, + { + "epoch": 0.4825491027271443, + "grad_norm": 2.513540029525757, + "learning_rate": 0.00010353257029814894, + "loss": 1.9714, + "step": 20490 + }, + { + "epoch": 0.4827846074136875, + "grad_norm": 2.1390328407287598, + "learning_rate": 0.00010348546936084029, + "loss": 2.242, + "step": 20500 + }, + { + "epoch": 0.48302011210023077, + "grad_norm": 1.9267793893814087, + "learning_rate": 0.00010343836842353163, + "loss": 2.2083, + "step": 20510 + }, + { + "epoch": 0.48325561678677403, + "grad_norm": 1.821351408958435, + "learning_rate": 0.00010339126748622299, + "loss": 2.0652, + "step": 20520 + }, + { + "epoch": 0.4834911214733173, + "grad_norm": 2.3727545738220215, + "learning_rate": 0.00010334416654891433, + "loss": 2.2172, + "step": 20530 + }, + { + "epoch": 0.48372662615986056, + "grad_norm": 2.450948476791382, + "learning_rate": 0.00010329706561160568, + "loss": 2.0812, + "step": 20540 + }, + { + "epoch": 0.4839621308464038, + "grad_norm": 2.104326009750366, + "learning_rate": 0.00010324996467429701, + "loss": 2.1296, + "step": 20550 + }, + { + "epoch": 0.4841976355329471, + "grad_norm": 1.6081938743591309, + "learning_rate": 0.00010320286373698838, + "loss": 2.1119, + "step": 20560 + }, + { + "epoch": 0.48443314021949035, + "grad_norm": 2.259737491607666, + "learning_rate": 0.00010315576279967971, + "loss": 2.0855, + "step": 20570 + }, + { + "epoch": 0.4846686449060336, + "grad_norm": 2.6324872970581055, + "learning_rate": 0.00010310866186237106, + "loss": 2.1031, + "step": 20580 + }, + { + "epoch": 0.4849041495925769, + "grad_norm": 2.676846504211426, + "learning_rate": 0.0001030615609250624, + "loss": 1.8389, + "step": 20590 + }, + { + "epoch": 0.48513965427912015, + "grad_norm": 2.062814712524414, + "learning_rate": 0.00010301445998775377, + "loss": 2.0708, + "step": 20600 + }, + { + "epoch": 0.4853751589656634, + "grad_norm": 1.8760517835617065, + "learning_rate": 0.0001029673590504451, + "loss": 1.9559, + "step": 20610 + }, + { + "epoch": 0.4856106636522067, + "grad_norm": 2.316432476043701, + "learning_rate": 0.00010292025811313645, + "loss": 2.0927, + "step": 20620 + }, + { + "epoch": 0.48584616833874994, + "grad_norm": 2.2129321098327637, + "learning_rate": 0.00010287315717582779, + "loss": 2.0794, + "step": 20630 + }, + { + "epoch": 0.4860816730252932, + "grad_norm": 2.3312692642211914, + "learning_rate": 0.00010282605623851915, + "loss": 1.9387, + "step": 20640 + }, + { + "epoch": 0.48631717771183647, + "grad_norm": 1.807623267173767, + "learning_rate": 0.00010277895530121049, + "loss": 1.984, + "step": 20650 + }, + { + "epoch": 0.48655268239837973, + "grad_norm": 2.3233418464660645, + "learning_rate": 0.00010273185436390185, + "loss": 1.8548, + "step": 20660 + }, + { + "epoch": 0.486788187084923, + "grad_norm": 2.37127423286438, + "learning_rate": 0.00010268475342659319, + "loss": 2.2169, + "step": 20670 + }, + { + "epoch": 0.48702369177146626, + "grad_norm": 2.1091103553771973, + "learning_rate": 0.00010263765248928454, + "loss": 2.0467, + "step": 20680 + }, + { + "epoch": 0.4872591964580095, + "grad_norm": 3.436962127685547, + "learning_rate": 0.0001025905515519759, + "loss": 2.1332, + "step": 20690 + }, + { + "epoch": 0.4874947011445528, + "grad_norm": 2.2271640300750732, + "learning_rate": 0.00010254345061466724, + "loss": 2.0451, + "step": 20700 + }, + { + "epoch": 0.48773020583109605, + "grad_norm": 2.0807104110717773, + "learning_rate": 0.00010249634967735859, + "loss": 1.9054, + "step": 20710 + }, + { + "epoch": 0.4879657105176393, + "grad_norm": 2.3256382942199707, + "learning_rate": 0.00010244924874004993, + "loss": 1.9937, + "step": 20720 + }, + { + "epoch": 0.4882012152041826, + "grad_norm": 1.8084198236465454, + "learning_rate": 0.00010240214780274129, + "loss": 2.0964, + "step": 20730 + }, + { + "epoch": 0.48843671989072585, + "grad_norm": 2.9819064140319824, + "learning_rate": 0.00010235504686543263, + "loss": 2.0176, + "step": 20740 + }, + { + "epoch": 0.4886722245772691, + "grad_norm": 2.723691701889038, + "learning_rate": 0.00010230794592812398, + "loss": 2.2222, + "step": 20750 + }, + { + "epoch": 0.4889077292638124, + "grad_norm": 2.697124719619751, + "learning_rate": 0.00010226084499081532, + "loss": 2.1921, + "step": 20760 + }, + { + "epoch": 0.48914323395035564, + "grad_norm": 2.0441181659698486, + "learning_rate": 0.00010221374405350668, + "loss": 2.1415, + "step": 20770 + }, + { + "epoch": 0.48937873863689885, + "grad_norm": 2.7658491134643555, + "learning_rate": 0.00010216664311619802, + "loss": 2.068, + "step": 20780 + }, + { + "epoch": 0.4896142433234421, + "grad_norm": 2.291435718536377, + "learning_rate": 0.00010211954217888937, + "loss": 1.9272, + "step": 20790 + }, + { + "epoch": 0.4898497480099854, + "grad_norm": 2.1761622428894043, + "learning_rate": 0.0001020724412415807, + "loss": 2.1504, + "step": 20800 + }, + { + "epoch": 0.49008525269652864, + "grad_norm": 1.9706686735153198, + "learning_rate": 0.00010202534030427207, + "loss": 2.1641, + "step": 20810 + }, + { + "epoch": 0.4903207573830719, + "grad_norm": 2.0886855125427246, + "learning_rate": 0.0001019782393669634, + "loss": 2.0652, + "step": 20820 + }, + { + "epoch": 0.49055626206961517, + "grad_norm": 2.300699234008789, + "learning_rate": 0.00010193113842965477, + "loss": 1.9582, + "step": 20830 + }, + { + "epoch": 0.49079176675615843, + "grad_norm": 2.276219606399536, + "learning_rate": 0.00010188403749234609, + "loss": 2.3446, + "step": 20840 + }, + { + "epoch": 0.4910272714427017, + "grad_norm": 1.8282259702682495, + "learning_rate": 0.00010183693655503745, + "loss": 2.1345, + "step": 20850 + }, + { + "epoch": 0.49126277612924496, + "grad_norm": 2.4664013385772705, + "learning_rate": 0.00010178983561772879, + "loss": 2.081, + "step": 20860 + }, + { + "epoch": 0.4914982808157882, + "grad_norm": 2.2036657333374023, + "learning_rate": 0.00010174273468042015, + "loss": 1.9418, + "step": 20870 + }, + { + "epoch": 0.4917337855023315, + "grad_norm": 2.229975938796997, + "learning_rate": 0.00010169563374311149, + "loss": 2.0532, + "step": 20880 + }, + { + "epoch": 0.49196929018887475, + "grad_norm": 2.0748679637908936, + "learning_rate": 0.00010164853280580284, + "loss": 2.0162, + "step": 20890 + }, + { + "epoch": 0.492204794875418, + "grad_norm": 2.1549928188323975, + "learning_rate": 0.00010160143186849418, + "loss": 2.1265, + "step": 20900 + }, + { + "epoch": 0.4924402995619613, + "grad_norm": 2.1036789417266846, + "learning_rate": 0.00010155433093118554, + "loss": 2.1916, + "step": 20910 + }, + { + "epoch": 0.49267580424850455, + "grad_norm": 1.9950534105300903, + "learning_rate": 0.00010150722999387688, + "loss": 1.9724, + "step": 20920 + }, + { + "epoch": 0.4929113089350478, + "grad_norm": 1.8147923946380615, + "learning_rate": 0.00010146012905656823, + "loss": 1.9993, + "step": 20930 + }, + { + "epoch": 0.4931468136215911, + "grad_norm": 3.3798739910125732, + "learning_rate": 0.00010141302811925957, + "loss": 2.1009, + "step": 20940 + }, + { + "epoch": 0.49338231830813434, + "grad_norm": 2.460247039794922, + "learning_rate": 0.00010136592718195093, + "loss": 2.2822, + "step": 20950 + }, + { + "epoch": 0.4936178229946776, + "grad_norm": 2.2641265392303467, + "learning_rate": 0.00010131882624464228, + "loss": 2.0259, + "step": 20960 + }, + { + "epoch": 0.49385332768122087, + "grad_norm": 2.0152974128723145, + "learning_rate": 0.00010127172530733362, + "loss": 2.2931, + "step": 20970 + }, + { + "epoch": 0.49408883236776413, + "grad_norm": 2.167557716369629, + "learning_rate": 0.00010122462437002498, + "loss": 1.9011, + "step": 20980 + }, + { + "epoch": 0.4943243370543074, + "grad_norm": 2.114088773727417, + "learning_rate": 0.00010117752343271632, + "loss": 1.8406, + "step": 20990 + }, + { + "epoch": 0.49455984174085066, + "grad_norm": 2.915698289871216, + "learning_rate": 0.00010113042249540767, + "loss": 2.1208, + "step": 21000 + }, + { + "epoch": 0.4947953464273939, + "grad_norm": 2.237755060195923, + "learning_rate": 0.000101083321558099, + "loss": 2.0806, + "step": 21010 + }, + { + "epoch": 0.4950308511139372, + "grad_norm": 2.4972400665283203, + "learning_rate": 0.00010103622062079037, + "loss": 2.1965, + "step": 21020 + }, + { + "epoch": 0.49526635580048045, + "grad_norm": 1.718467116355896, + "learning_rate": 0.0001009891196834817, + "loss": 1.7707, + "step": 21030 + }, + { + "epoch": 0.4955018604870237, + "grad_norm": 2.2876641750335693, + "learning_rate": 0.00010094201874617307, + "loss": 2.0414, + "step": 21040 + }, + { + "epoch": 0.495737365173567, + "grad_norm": 2.5221569538116455, + "learning_rate": 0.0001008949178088644, + "loss": 1.9567, + "step": 21050 + }, + { + "epoch": 0.49597286986011024, + "grad_norm": 2.7397263050079346, + "learning_rate": 0.00010084781687155575, + "loss": 1.938, + "step": 21060 + }, + { + "epoch": 0.49620837454665345, + "grad_norm": 1.9480063915252686, + "learning_rate": 0.00010080071593424709, + "loss": 2.033, + "step": 21070 + }, + { + "epoch": 0.4964438792331967, + "grad_norm": 2.3631954193115234, + "learning_rate": 0.00010075361499693846, + "loss": 2.1295, + "step": 21080 + }, + { + "epoch": 0.49667938391974, + "grad_norm": 2.4683914184570312, + "learning_rate": 0.00010070651405962979, + "loss": 1.9156, + "step": 21090 + }, + { + "epoch": 0.49691488860628324, + "grad_norm": 2.1928350925445557, + "learning_rate": 0.00010065941312232114, + "loss": 2.1845, + "step": 21100 + }, + { + "epoch": 0.4971503932928265, + "grad_norm": 1.8933937549591064, + "learning_rate": 0.00010061231218501248, + "loss": 2.0413, + "step": 21110 + }, + { + "epoch": 0.4973858979793698, + "grad_norm": 2.532449722290039, + "learning_rate": 0.00010056521124770384, + "loss": 2.1513, + "step": 21120 + }, + { + "epoch": 0.49762140266591304, + "grad_norm": 2.1574230194091797, + "learning_rate": 0.00010051811031039518, + "loss": 1.8182, + "step": 21130 + }, + { + "epoch": 0.4978569073524563, + "grad_norm": 2.202259063720703, + "learning_rate": 0.00010047100937308653, + "loss": 2.2352, + "step": 21140 + }, + { + "epoch": 0.49809241203899957, + "grad_norm": 2.3444535732269287, + "learning_rate": 0.00010042390843577787, + "loss": 2.0119, + "step": 21150 + }, + { + "epoch": 0.49832791672554283, + "grad_norm": 2.5278213024139404, + "learning_rate": 0.00010037680749846923, + "loss": 2.1794, + "step": 21160 + }, + { + "epoch": 0.4985634214120861, + "grad_norm": 1.6672074794769287, + "learning_rate": 0.00010032970656116057, + "loss": 1.8915, + "step": 21170 + }, + { + "epoch": 0.49879892609862936, + "grad_norm": 1.9059944152832031, + "learning_rate": 0.00010028260562385192, + "loss": 1.9642, + "step": 21180 + }, + { + "epoch": 0.4990344307851726, + "grad_norm": 2.001873254776001, + "learning_rate": 0.00010023550468654325, + "loss": 2.1203, + "step": 21190 + }, + { + "epoch": 0.4992699354717159, + "grad_norm": 2.2856180667877197, + "learning_rate": 0.00010018840374923462, + "loss": 2.1251, + "step": 21200 + }, + { + "epoch": 0.49950544015825915, + "grad_norm": 3.174320936203003, + "learning_rate": 0.00010014130281192595, + "loss": 2.0888, + "step": 21210 + }, + { + "epoch": 0.4997409448448024, + "grad_norm": 3.9642059803009033, + "learning_rate": 0.0001000942018746173, + "loss": 2.13, + "step": 21220 + }, + { + "epoch": 0.4999764495313457, + "grad_norm": 2.8816041946411133, + "learning_rate": 0.00010004710093730864, + "loss": 2.2852, + "step": 21230 + }, + { + "epoch": 0.5002119542178889, + "grad_norm": 2.1514241695404053, + "learning_rate": 0.0001, + "loss": 2.175, + "step": 21240 + }, + { + "epoch": 0.5004474589044322, + "grad_norm": 2.823493242263794, + "learning_rate": 9.995289906269136e-05, + "loss": 1.9793, + "step": 21250 + }, + { + "epoch": 0.5006829635909754, + "grad_norm": 2.3700454235076904, + "learning_rate": 9.99057981253827e-05, + "loss": 2.2788, + "step": 21260 + }, + { + "epoch": 0.5009184682775187, + "grad_norm": 2.2594282627105713, + "learning_rate": 9.985869718807404e-05, + "loss": 2.0639, + "step": 21270 + }, + { + "epoch": 0.501153972964062, + "grad_norm": 2.6550183296203613, + "learning_rate": 9.981159625076539e-05, + "loss": 2.1829, + "step": 21280 + }, + { + "epoch": 0.5013894776506053, + "grad_norm": 2.1802735328674316, + "learning_rate": 9.976449531345674e-05, + "loss": 2.0109, + "step": 21290 + }, + { + "epoch": 0.5016249823371485, + "grad_norm": 2.058255672454834, + "learning_rate": 9.971739437614809e-05, + "loss": 2.2871, + "step": 21300 + }, + { + "epoch": 0.5018604870236918, + "grad_norm": 2.2070486545562744, + "learning_rate": 9.967029343883943e-05, + "loss": 2.062, + "step": 21310 + }, + { + "epoch": 0.502095991710235, + "grad_norm": 2.2400360107421875, + "learning_rate": 9.962319250153078e-05, + "loss": 2.0432, + "step": 21320 + }, + { + "epoch": 0.5023314963967783, + "grad_norm": 1.9597783088684082, + "learning_rate": 9.957609156422213e-05, + "loss": 2.0918, + "step": 21330 + }, + { + "epoch": 0.5025670010833215, + "grad_norm": 2.0394837856292725, + "learning_rate": 9.952899062691348e-05, + "loss": 2.0768, + "step": 21340 + }, + { + "epoch": 0.5028025057698648, + "grad_norm": 2.9763147830963135, + "learning_rate": 9.948188968960482e-05, + "loss": 2.2218, + "step": 21350 + }, + { + "epoch": 0.5030380104564081, + "grad_norm": 2.2641072273254395, + "learning_rate": 9.943478875229617e-05, + "loss": 2.1642, + "step": 21360 + }, + { + "epoch": 0.5032735151429514, + "grad_norm": 1.8236515522003174, + "learning_rate": 9.938768781498752e-05, + "loss": 2.0839, + "step": 21370 + }, + { + "epoch": 0.5035090198294946, + "grad_norm": 2.490896224975586, + "learning_rate": 9.934058687767887e-05, + "loss": 2.2359, + "step": 21380 + }, + { + "epoch": 0.5037445245160379, + "grad_norm": 2.3176302909851074, + "learning_rate": 9.929348594037022e-05, + "loss": 2.2307, + "step": 21390 + }, + { + "epoch": 0.5039800292025811, + "grad_norm": 2.2265188694000244, + "learning_rate": 9.924638500306157e-05, + "loss": 2.0561, + "step": 21400 + }, + { + "epoch": 0.5042155338891244, + "grad_norm": 2.182687759399414, + "learning_rate": 9.919928406575292e-05, + "loss": 2.1138, + "step": 21410 + }, + { + "epoch": 0.5044510385756676, + "grad_norm": 2.3917243480682373, + "learning_rate": 9.915218312844427e-05, + "loss": 2.1989, + "step": 21420 + }, + { + "epoch": 0.504686543262211, + "grad_norm": 2.182008981704712, + "learning_rate": 9.91050821911356e-05, + "loss": 1.9135, + "step": 21430 + }, + { + "epoch": 0.5049220479487542, + "grad_norm": 2.1667606830596924, + "learning_rate": 9.905798125382696e-05, + "loss": 1.977, + "step": 21440 + }, + { + "epoch": 0.5051575526352975, + "grad_norm": 2.5774123668670654, + "learning_rate": 9.90108803165183e-05, + "loss": 2.0104, + "step": 21450 + }, + { + "epoch": 0.5053930573218407, + "grad_norm": 2.3878164291381836, + "learning_rate": 9.896377937920966e-05, + "loss": 1.9225, + "step": 21460 + }, + { + "epoch": 0.505628562008384, + "grad_norm": 2.676603078842163, + "learning_rate": 9.8916678441901e-05, + "loss": 2.1486, + "step": 21470 + }, + { + "epoch": 0.5058640666949272, + "grad_norm": 1.9781603813171387, + "learning_rate": 9.886957750459234e-05, + "loss": 2.0173, + "step": 21480 + }, + { + "epoch": 0.5060995713814704, + "grad_norm": 2.2742934226989746, + "learning_rate": 9.882247656728369e-05, + "loss": 2.0306, + "step": 21490 + }, + { + "epoch": 0.5063350760680138, + "grad_norm": 1.8691270351409912, + "learning_rate": 9.877537562997504e-05, + "loss": 1.8714, + "step": 21500 + }, + { + "epoch": 0.506570580754557, + "grad_norm": 2.0995771884918213, + "learning_rate": 9.87282746926664e-05, + "loss": 1.931, + "step": 21510 + }, + { + "epoch": 0.5068060854411003, + "grad_norm": 2.2483112812042236, + "learning_rate": 9.868117375535773e-05, + "loss": 2.1824, + "step": 21520 + }, + { + "epoch": 0.5070415901276435, + "grad_norm": 2.2055604457855225, + "learning_rate": 9.863407281804908e-05, + "loss": 1.8965, + "step": 21530 + }, + { + "epoch": 0.5072770948141868, + "grad_norm": 2.187610626220703, + "learning_rate": 9.858697188074043e-05, + "loss": 2.1238, + "step": 21540 + }, + { + "epoch": 0.50751259950073, + "grad_norm": 2.0155301094055176, + "learning_rate": 9.853987094343178e-05, + "loss": 2.2547, + "step": 21550 + }, + { + "epoch": 0.5077481041872733, + "grad_norm": 2.0881471633911133, + "learning_rate": 9.849277000612313e-05, + "loss": 2.05, + "step": 21560 + }, + { + "epoch": 0.5079836088738166, + "grad_norm": 2.319646120071411, + "learning_rate": 9.844566906881447e-05, + "loss": 1.9783, + "step": 21570 + }, + { + "epoch": 0.5082191135603599, + "grad_norm": 2.136197805404663, + "learning_rate": 9.839856813150582e-05, + "loss": 2.0, + "step": 21580 + }, + { + "epoch": 0.5084546182469031, + "grad_norm": 2.998568058013916, + "learning_rate": 9.835146719419717e-05, + "loss": 2.1183, + "step": 21590 + }, + { + "epoch": 0.5086901229334464, + "grad_norm": 2.355353355407715, + "learning_rate": 9.830436625688852e-05, + "loss": 2.2414, + "step": 21600 + }, + { + "epoch": 0.5089256276199896, + "grad_norm": 1.928250789642334, + "learning_rate": 9.825726531957986e-05, + "loss": 1.8088, + "step": 21610 + }, + { + "epoch": 0.5091611323065329, + "grad_norm": 3.0732262134552, + "learning_rate": 9.82101643822712e-05, + "loss": 1.9908, + "step": 21620 + }, + { + "epoch": 0.5093966369930761, + "grad_norm": 3.366936445236206, + "learning_rate": 9.816306344496256e-05, + "loss": 2.067, + "step": 21630 + }, + { + "epoch": 0.5096321416796195, + "grad_norm": 2.2484872341156006, + "learning_rate": 9.81159625076539e-05, + "loss": 2.1143, + "step": 21640 + }, + { + "epoch": 0.5098676463661627, + "grad_norm": 2.183173894882202, + "learning_rate": 9.806886157034524e-05, + "loss": 2.1001, + "step": 21650 + }, + { + "epoch": 0.510103151052706, + "grad_norm": 2.780048131942749, + "learning_rate": 9.802176063303659e-05, + "loss": 2.2141, + "step": 21660 + }, + { + "epoch": 0.5103386557392492, + "grad_norm": 2.5140750408172607, + "learning_rate": 9.797465969572796e-05, + "loss": 2.136, + "step": 21670 + }, + { + "epoch": 0.5105741604257925, + "grad_norm": 2.4135138988494873, + "learning_rate": 9.792755875841931e-05, + "loss": 1.924, + "step": 21680 + }, + { + "epoch": 0.5108096651123357, + "grad_norm": 2.8856024742126465, + "learning_rate": 9.788045782111064e-05, + "loss": 1.9122, + "step": 21690 + }, + { + "epoch": 0.511045169798879, + "grad_norm": 2.015821695327759, + "learning_rate": 9.7833356883802e-05, + "loss": 2.2464, + "step": 21700 + }, + { + "epoch": 0.5112806744854222, + "grad_norm": 3.2356677055358887, + "learning_rate": 9.778625594649334e-05, + "loss": 2.0949, + "step": 21710 + }, + { + "epoch": 0.5115161791719656, + "grad_norm": 1.8908910751342773, + "learning_rate": 9.77391550091847e-05, + "loss": 1.8944, + "step": 21720 + }, + { + "epoch": 0.5117516838585088, + "grad_norm": 1.9458872079849243, + "learning_rate": 9.769205407187603e-05, + "loss": 2.1298, + "step": 21730 + }, + { + "epoch": 0.5119871885450521, + "grad_norm": 2.1977505683898926, + "learning_rate": 9.764495313456738e-05, + "loss": 2.3225, + "step": 21740 + }, + { + "epoch": 0.5122226932315953, + "grad_norm": 2.247251033782959, + "learning_rate": 9.759785219725873e-05, + "loss": 2.0982, + "step": 21750 + }, + { + "epoch": 0.5124581979181386, + "grad_norm": 2.334291934967041, + "learning_rate": 9.755075125995008e-05, + "loss": 2.144, + "step": 21760 + }, + { + "epoch": 0.5126937026046818, + "grad_norm": 2.0786211490631104, + "learning_rate": 9.750365032264143e-05, + "loss": 2.1501, + "step": 21770 + }, + { + "epoch": 0.512929207291225, + "grad_norm": 2.1926138401031494, + "learning_rate": 9.745654938533277e-05, + "loss": 2.0627, + "step": 21780 + }, + { + "epoch": 0.5131647119777684, + "grad_norm": 2.517853021621704, + "learning_rate": 9.740944844802412e-05, + "loss": 2.0631, + "step": 21790 + }, + { + "epoch": 0.5134002166643116, + "grad_norm": 1.8308531045913696, + "learning_rate": 9.736234751071547e-05, + "loss": 2.1762, + "step": 21800 + }, + { + "epoch": 0.5136357213508549, + "grad_norm": 2.1084744930267334, + "learning_rate": 9.731524657340682e-05, + "loss": 1.9098, + "step": 21810 + }, + { + "epoch": 0.5138712260373981, + "grad_norm": 2.3798017501831055, + "learning_rate": 9.726814563609816e-05, + "loss": 1.8055, + "step": 21820 + }, + { + "epoch": 0.5141067307239414, + "grad_norm": 2.443279266357422, + "learning_rate": 9.72210446987895e-05, + "loss": 2.0454, + "step": 21830 + }, + { + "epoch": 0.5143422354104846, + "grad_norm": 2.0576086044311523, + "learning_rate": 9.717394376148086e-05, + "loss": 2.2606, + "step": 21840 + }, + { + "epoch": 0.514577740097028, + "grad_norm": 1.869069218635559, + "learning_rate": 9.712684282417221e-05, + "loss": 1.8852, + "step": 21850 + }, + { + "epoch": 0.5148132447835712, + "grad_norm": 1.9165115356445312, + "learning_rate": 9.707974188686356e-05, + "loss": 1.9244, + "step": 21860 + }, + { + "epoch": 0.5150487494701145, + "grad_norm": 1.7431786060333252, + "learning_rate": 9.70326409495549e-05, + "loss": 1.9097, + "step": 21870 + }, + { + "epoch": 0.5152842541566577, + "grad_norm": 2.2474775314331055, + "learning_rate": 9.698554001224624e-05, + "loss": 2.1901, + "step": 21880 + }, + { + "epoch": 0.515519758843201, + "grad_norm": 1.9225223064422607, + "learning_rate": 9.69384390749376e-05, + "loss": 2.217, + "step": 21890 + }, + { + "epoch": 0.5157552635297442, + "grad_norm": 1.9866782426834106, + "learning_rate": 9.689133813762894e-05, + "loss": 1.8987, + "step": 21900 + }, + { + "epoch": 0.5159907682162875, + "grad_norm": 2.126774549484253, + "learning_rate": 9.684423720032028e-05, + "loss": 2.0953, + "step": 21910 + }, + { + "epoch": 0.5162262729028307, + "grad_norm": 1.8921127319335938, + "learning_rate": 9.679713626301163e-05, + "loss": 2.0589, + "step": 21920 + }, + { + "epoch": 0.5164617775893741, + "grad_norm": 2.78843092918396, + "learning_rate": 9.675003532570298e-05, + "loss": 1.9555, + "step": 21930 + }, + { + "epoch": 0.5166972822759173, + "grad_norm": 2.5705997943878174, + "learning_rate": 9.670293438839433e-05, + "loss": 2.0471, + "step": 21940 + }, + { + "epoch": 0.5169327869624606, + "grad_norm": 1.8780876398086548, + "learning_rate": 9.665583345108568e-05, + "loss": 2.0409, + "step": 21950 + }, + { + "epoch": 0.5171682916490038, + "grad_norm": 2.1440744400024414, + "learning_rate": 9.660873251377703e-05, + "loss": 2.0148, + "step": 21960 + }, + { + "epoch": 0.5174037963355471, + "grad_norm": 2.174579381942749, + "learning_rate": 9.656163157646838e-05, + "loss": 2.1547, + "step": 21970 + }, + { + "epoch": 0.5176393010220903, + "grad_norm": 3.240619421005249, + "learning_rate": 9.651453063915973e-05, + "loss": 2.0667, + "step": 21980 + }, + { + "epoch": 0.5178748057086336, + "grad_norm": 2.3205692768096924, + "learning_rate": 9.646742970185107e-05, + "loss": 1.8602, + "step": 21990 + }, + { + "epoch": 0.5181103103951769, + "grad_norm": 3.049022912979126, + "learning_rate": 9.642032876454242e-05, + "loss": 2.0942, + "step": 22000 + }, + { + "epoch": 0.5183458150817202, + "grad_norm": 2.585932970046997, + "learning_rate": 9.637322782723377e-05, + "loss": 1.9998, + "step": 22010 + }, + { + "epoch": 0.5185813197682634, + "grad_norm": 2.4109232425689697, + "learning_rate": 9.632612688992512e-05, + "loss": 2.075, + "step": 22020 + }, + { + "epoch": 0.5188168244548067, + "grad_norm": 2.334937334060669, + "learning_rate": 9.627902595261646e-05, + "loss": 1.8412, + "step": 22030 + }, + { + "epoch": 0.5190523291413499, + "grad_norm": 3.0095884799957275, + "learning_rate": 9.623192501530781e-05, + "loss": 1.918, + "step": 22040 + }, + { + "epoch": 0.5192878338278932, + "grad_norm": 1.9258203506469727, + "learning_rate": 9.618482407799916e-05, + "loss": 1.9809, + "step": 22050 + }, + { + "epoch": 0.5195233385144364, + "grad_norm": 1.8226529359817505, + "learning_rate": 9.613772314069051e-05, + "loss": 1.9827, + "step": 22060 + }, + { + "epoch": 0.5197588432009796, + "grad_norm": 2.051090955734253, + "learning_rate": 9.609062220338186e-05, + "loss": 1.9504, + "step": 22070 + }, + { + "epoch": 0.519994347887523, + "grad_norm": 2.140054941177368, + "learning_rate": 9.60435212660732e-05, + "loss": 2.0869, + "step": 22080 + }, + { + "epoch": 0.5202298525740662, + "grad_norm": 2.1044740676879883, + "learning_rate": 9.599642032876455e-05, + "loss": 1.9475, + "step": 22090 + }, + { + "epoch": 0.5204653572606095, + "grad_norm": 1.9756555557250977, + "learning_rate": 9.59493193914559e-05, + "loss": 2.0775, + "step": 22100 + }, + { + "epoch": 0.5207008619471527, + "grad_norm": 2.549635171890259, + "learning_rate": 9.590221845414725e-05, + "loss": 1.8268, + "step": 22110 + }, + { + "epoch": 0.520936366633696, + "grad_norm": 2.121861696243286, + "learning_rate": 9.585511751683858e-05, + "loss": 2.103, + "step": 22120 + }, + { + "epoch": 0.5211718713202392, + "grad_norm": 2.389665365219116, + "learning_rate": 9.580801657952993e-05, + "loss": 2.2068, + "step": 22130 + }, + { + "epoch": 0.5214073760067826, + "grad_norm": 2.340815782546997, + "learning_rate": 9.576091564222128e-05, + "loss": 2.0227, + "step": 22140 + }, + { + "epoch": 0.5216428806933258, + "grad_norm": 2.191857099533081, + "learning_rate": 9.571381470491263e-05, + "loss": 2.1419, + "step": 22150 + }, + { + "epoch": 0.5218783853798691, + "grad_norm": 3.036268711090088, + "learning_rate": 9.566671376760398e-05, + "loss": 2.0012, + "step": 22160 + }, + { + "epoch": 0.5221138900664123, + "grad_norm": 2.6708240509033203, + "learning_rate": 9.561961283029532e-05, + "loss": 2.052, + "step": 22170 + }, + { + "epoch": 0.5223493947529556, + "grad_norm": 2.439678907394409, + "learning_rate": 9.557251189298667e-05, + "loss": 2.2863, + "step": 22180 + }, + { + "epoch": 0.5225848994394988, + "grad_norm": 2.5144474506378174, + "learning_rate": 9.552541095567802e-05, + "loss": 2.1124, + "step": 22190 + }, + { + "epoch": 0.5228204041260421, + "grad_norm": 2.6806702613830566, + "learning_rate": 9.547831001836937e-05, + "loss": 2.1127, + "step": 22200 + }, + { + "epoch": 0.5230559088125853, + "grad_norm": 2.2948286533355713, + "learning_rate": 9.543120908106071e-05, + "loss": 2.0362, + "step": 22210 + }, + { + "epoch": 0.5232914134991287, + "grad_norm": 2.144993543624878, + "learning_rate": 9.538410814375206e-05, + "loss": 1.8412, + "step": 22220 + }, + { + "epoch": 0.5235269181856719, + "grad_norm": 2.0969650745391846, + "learning_rate": 9.533700720644342e-05, + "loss": 2.1297, + "step": 22230 + }, + { + "epoch": 0.5237624228722152, + "grad_norm": 1.9239368438720703, + "learning_rate": 9.528990626913477e-05, + "loss": 2.0009, + "step": 22240 + }, + { + "epoch": 0.5239979275587584, + "grad_norm": 2.1357004642486572, + "learning_rate": 9.524280533182611e-05, + "loss": 2.1151, + "step": 22250 + }, + { + "epoch": 0.5242334322453017, + "grad_norm": 2.1568477153778076, + "learning_rate": 9.519570439451746e-05, + "loss": 1.991, + "step": 22260 + }, + { + "epoch": 0.5244689369318449, + "grad_norm": 2.371934175491333, + "learning_rate": 9.514860345720881e-05, + "loss": 2.1708, + "step": 22270 + }, + { + "epoch": 0.5247044416183883, + "grad_norm": 1.8997218608856201, + "learning_rate": 9.510150251990016e-05, + "loss": 2.0686, + "step": 22280 + }, + { + "epoch": 0.5249399463049315, + "grad_norm": 2.1850790977478027, + "learning_rate": 9.50544015825915e-05, + "loss": 2.0206, + "step": 22290 + }, + { + "epoch": 0.5251754509914748, + "grad_norm": 2.4998154640197754, + "learning_rate": 9.500730064528285e-05, + "loss": 1.874, + "step": 22300 + }, + { + "epoch": 0.525410955678018, + "grad_norm": 2.6623775959014893, + "learning_rate": 9.49601997079742e-05, + "loss": 2.1144, + "step": 22310 + }, + { + "epoch": 0.5256464603645613, + "grad_norm": 2.4677085876464844, + "learning_rate": 9.491309877066555e-05, + "loss": 2.0148, + "step": 22320 + }, + { + "epoch": 0.5258819650511045, + "grad_norm": 2.5146234035491943, + "learning_rate": 9.486599783335688e-05, + "loss": 2.0617, + "step": 22330 + }, + { + "epoch": 0.5261174697376477, + "grad_norm": 1.9805279970169067, + "learning_rate": 9.481889689604823e-05, + "loss": 2.0317, + "step": 22340 + }, + { + "epoch": 0.526352974424191, + "grad_norm": 1.9576618671417236, + "learning_rate": 9.477179595873958e-05, + "loss": 2.1339, + "step": 22350 + }, + { + "epoch": 0.5265884791107343, + "grad_norm": 2.2992379665374756, + "learning_rate": 9.472469502143093e-05, + "loss": 2.1361, + "step": 22360 + }, + { + "epoch": 0.5268239837972776, + "grad_norm": 2.034731149673462, + "learning_rate": 9.467759408412228e-05, + "loss": 2.062, + "step": 22370 + }, + { + "epoch": 0.5270594884838208, + "grad_norm": 2.1001486778259277, + "learning_rate": 9.463049314681362e-05, + "loss": 2.1102, + "step": 22380 + }, + { + "epoch": 0.5272949931703641, + "grad_norm": 1.7902547121047974, + "learning_rate": 9.458339220950497e-05, + "loss": 1.8913, + "step": 22390 + }, + { + "epoch": 0.5275304978569073, + "grad_norm": 2.0695128440856934, + "learning_rate": 9.453629127219632e-05, + "loss": 2.0724, + "step": 22400 + }, + { + "epoch": 0.5277660025434506, + "grad_norm": 2.392047643661499, + "learning_rate": 9.448919033488767e-05, + "loss": 1.8279, + "step": 22410 + }, + { + "epoch": 0.5280015072299938, + "grad_norm": 2.0684807300567627, + "learning_rate": 9.444208939757901e-05, + "loss": 2.2671, + "step": 22420 + }, + { + "epoch": 0.5282370119165372, + "grad_norm": 1.8215241432189941, + "learning_rate": 9.439498846027036e-05, + "loss": 2.1644, + "step": 22430 + }, + { + "epoch": 0.5284725166030804, + "grad_norm": 2.322417974472046, + "learning_rate": 9.434788752296171e-05, + "loss": 2.1318, + "step": 22440 + }, + { + "epoch": 0.5287080212896237, + "grad_norm": 2.333282232284546, + "learning_rate": 9.430078658565306e-05, + "loss": 1.8483, + "step": 22450 + }, + { + "epoch": 0.5289435259761669, + "grad_norm": 2.832303285598755, + "learning_rate": 9.425368564834441e-05, + "loss": 2.0394, + "step": 22460 + }, + { + "epoch": 0.5291790306627102, + "grad_norm": 1.6820281744003296, + "learning_rate": 9.420658471103575e-05, + "loss": 2.027, + "step": 22470 + }, + { + "epoch": 0.5294145353492534, + "grad_norm": 3.3374030590057373, + "learning_rate": 9.41594837737271e-05, + "loss": 2.1277, + "step": 22480 + }, + { + "epoch": 0.5296500400357967, + "grad_norm": 2.0465030670166016, + "learning_rate": 9.411238283641845e-05, + "loss": 2.0185, + "step": 22490 + }, + { + "epoch": 0.52988554472234, + "grad_norm": 2.40169620513916, + "learning_rate": 9.40652818991098e-05, + "loss": 2.2946, + "step": 22500 + }, + { + "epoch": 0.5301210494088833, + "grad_norm": 2.6130552291870117, + "learning_rate": 9.401818096180113e-05, + "loss": 2.3308, + "step": 22510 + }, + { + "epoch": 0.5303565540954265, + "grad_norm": 2.0632524490356445, + "learning_rate": 9.39710800244925e-05, + "loss": 2.1752, + "step": 22520 + }, + { + "epoch": 0.5305920587819698, + "grad_norm": 1.7699297666549683, + "learning_rate": 9.392397908718385e-05, + "loss": 2.0812, + "step": 22530 + }, + { + "epoch": 0.530827563468513, + "grad_norm": 2.3439061641693115, + "learning_rate": 9.38768781498752e-05, + "loss": 2.0295, + "step": 22540 + }, + { + "epoch": 0.5310630681550563, + "grad_norm": 2.736743688583374, + "learning_rate": 9.382977721256653e-05, + "loss": 2.0404, + "step": 22550 + }, + { + "epoch": 0.5312985728415995, + "grad_norm": 1.7512779235839844, + "learning_rate": 9.378267627525788e-05, + "loss": 2.0293, + "step": 22560 + }, + { + "epoch": 0.5315340775281429, + "grad_norm": 2.9181957244873047, + "learning_rate": 9.373557533794923e-05, + "loss": 2.0772, + "step": 22570 + }, + { + "epoch": 0.5317695822146861, + "grad_norm": 2.2531585693359375, + "learning_rate": 9.368847440064059e-05, + "loss": 2.1138, + "step": 22580 + }, + { + "epoch": 0.5320050869012294, + "grad_norm": 2.3378305435180664, + "learning_rate": 9.364137346333192e-05, + "loss": 2.164, + "step": 22590 + }, + { + "epoch": 0.5322405915877726, + "grad_norm": 2.257312536239624, + "learning_rate": 9.359427252602327e-05, + "loss": 1.9478, + "step": 22600 + }, + { + "epoch": 0.5324760962743159, + "grad_norm": 3.4792850017547607, + "learning_rate": 9.354717158871462e-05, + "loss": 1.9697, + "step": 22610 + }, + { + "epoch": 0.5327116009608591, + "grad_norm": 2.011868476867676, + "learning_rate": 9.350007065140597e-05, + "loss": 1.9607, + "step": 22620 + }, + { + "epoch": 0.5329471056474023, + "grad_norm": 3.508305072784424, + "learning_rate": 9.345296971409731e-05, + "loss": 2.0881, + "step": 22630 + }, + { + "epoch": 0.5331826103339457, + "grad_norm": 2.206843137741089, + "learning_rate": 9.340586877678866e-05, + "loss": 2.0199, + "step": 22640 + }, + { + "epoch": 0.5334181150204889, + "grad_norm": 2.1505041122436523, + "learning_rate": 9.335876783948001e-05, + "loss": 1.9688, + "step": 22650 + }, + { + "epoch": 0.5336536197070322, + "grad_norm": 2.0961718559265137, + "learning_rate": 9.331166690217136e-05, + "loss": 2.0561, + "step": 22660 + }, + { + "epoch": 0.5338891243935754, + "grad_norm": 1.9985352754592896, + "learning_rate": 9.326456596486271e-05, + "loss": 1.9848, + "step": 22670 + }, + { + "epoch": 0.5341246290801187, + "grad_norm": 3.043963670730591, + "learning_rate": 9.321746502755405e-05, + "loss": 2.0881, + "step": 22680 + }, + { + "epoch": 0.5343601337666619, + "grad_norm": 2.3572945594787598, + "learning_rate": 9.31703640902454e-05, + "loss": 1.995, + "step": 22690 + }, + { + "epoch": 0.5345956384532052, + "grad_norm": 2.074956178665161, + "learning_rate": 9.312326315293675e-05, + "loss": 2.0339, + "step": 22700 + }, + { + "epoch": 0.5348311431397484, + "grad_norm": 1.7180410623550415, + "learning_rate": 9.30761622156281e-05, + "loss": 1.9256, + "step": 22710 + }, + { + "epoch": 0.5350666478262918, + "grad_norm": 2.5363078117370605, + "learning_rate": 9.302906127831943e-05, + "loss": 2.2295, + "step": 22720 + }, + { + "epoch": 0.535302152512835, + "grad_norm": 2.200470209121704, + "learning_rate": 9.298196034101078e-05, + "loss": 1.904, + "step": 22730 + }, + { + "epoch": 0.5355376571993783, + "grad_norm": 1.8723726272583008, + "learning_rate": 9.293485940370213e-05, + "loss": 2.2491, + "step": 22740 + }, + { + "epoch": 0.5357731618859215, + "grad_norm": 2.703275442123413, + "learning_rate": 9.288775846639349e-05, + "loss": 1.7582, + "step": 22750 + }, + { + "epoch": 0.5360086665724648, + "grad_norm": 2.2764389514923096, + "learning_rate": 9.284065752908484e-05, + "loss": 1.9832, + "step": 22760 + }, + { + "epoch": 0.536244171259008, + "grad_norm": 2.089346170425415, + "learning_rate": 9.279355659177617e-05, + "loss": 2.2175, + "step": 22770 + }, + { + "epoch": 0.5364796759455513, + "grad_norm": 2.090502977371216, + "learning_rate": 9.274645565446752e-05, + "loss": 2.1681, + "step": 22780 + }, + { + "epoch": 0.5367151806320946, + "grad_norm": 3.0695157051086426, + "learning_rate": 9.269935471715887e-05, + "loss": 2.205, + "step": 22790 + }, + { + "epoch": 0.5369506853186379, + "grad_norm": 2.0072319507598877, + "learning_rate": 9.265225377985022e-05, + "loss": 1.9657, + "step": 22800 + }, + { + "epoch": 0.5371861900051811, + "grad_norm": 2.8740854263305664, + "learning_rate": 9.260515284254157e-05, + "loss": 2.011, + "step": 22810 + }, + { + "epoch": 0.5374216946917244, + "grad_norm": 2.9721429347991943, + "learning_rate": 9.255805190523292e-05, + "loss": 2.0737, + "step": 22820 + }, + { + "epoch": 0.5376571993782676, + "grad_norm": 1.8961126804351807, + "learning_rate": 9.251095096792427e-05, + "loss": 1.89, + "step": 22830 + }, + { + "epoch": 0.5378927040648109, + "grad_norm": 2.3074028491973877, + "learning_rate": 9.246385003061562e-05, + "loss": 2.0739, + "step": 22840 + }, + { + "epoch": 0.5381282087513541, + "grad_norm": 2.082486391067505, + "learning_rate": 9.241674909330696e-05, + "loss": 2.0767, + "step": 22850 + }, + { + "epoch": 0.5383637134378975, + "grad_norm": 2.289553165435791, + "learning_rate": 9.236964815599831e-05, + "loss": 2.1389, + "step": 22860 + }, + { + "epoch": 0.5385992181244407, + "grad_norm": 2.9601359367370605, + "learning_rate": 9.232254721868966e-05, + "loss": 1.984, + "step": 22870 + }, + { + "epoch": 0.538834722810984, + "grad_norm": 2.5509331226348877, + "learning_rate": 9.227544628138101e-05, + "loss": 2.1656, + "step": 22880 + }, + { + "epoch": 0.5390702274975272, + "grad_norm": 2.1479685306549072, + "learning_rate": 9.222834534407235e-05, + "loss": 2.0994, + "step": 22890 + }, + { + "epoch": 0.5393057321840704, + "grad_norm": 1.8056848049163818, + "learning_rate": 9.21812444067637e-05, + "loss": 1.9269, + "step": 22900 + }, + { + "epoch": 0.5395412368706137, + "grad_norm": 2.1940340995788574, + "learning_rate": 9.213414346945505e-05, + "loss": 1.9795, + "step": 22910 + }, + { + "epoch": 0.5397767415571569, + "grad_norm": 2.247976541519165, + "learning_rate": 9.20870425321464e-05, + "loss": 2.0206, + "step": 22920 + }, + { + "epoch": 0.5400122462437003, + "grad_norm": 2.516195774078369, + "learning_rate": 9.203994159483774e-05, + "loss": 1.9513, + "step": 22930 + }, + { + "epoch": 0.5402477509302435, + "grad_norm": 1.9935544729232788, + "learning_rate": 9.199284065752909e-05, + "loss": 1.8158, + "step": 22940 + }, + { + "epoch": 0.5404832556167868, + "grad_norm": 1.9285348653793335, + "learning_rate": 9.194573972022044e-05, + "loss": 2.0684, + "step": 22950 + }, + { + "epoch": 0.54071876030333, + "grad_norm": 2.0052757263183594, + "learning_rate": 9.189863878291179e-05, + "loss": 1.8693, + "step": 22960 + }, + { + "epoch": 0.5409542649898733, + "grad_norm": 3.399982213973999, + "learning_rate": 9.185153784560314e-05, + "loss": 2.2221, + "step": 22970 + }, + { + "epoch": 0.5411897696764165, + "grad_norm": 1.7858917713165283, + "learning_rate": 9.180443690829447e-05, + "loss": 1.9837, + "step": 22980 + }, + { + "epoch": 0.5414252743629598, + "grad_norm": 2.4309234619140625, + "learning_rate": 9.175733597098582e-05, + "loss": 2.0491, + "step": 22990 + }, + { + "epoch": 0.541660779049503, + "grad_norm": 1.9532793760299683, + "learning_rate": 9.171023503367717e-05, + "loss": 2.0639, + "step": 23000 + }, + { + "epoch": 0.5418962837360464, + "grad_norm": 2.7318344116210938, + "learning_rate": 9.166313409636852e-05, + "loss": 2.1516, + "step": 23010 + }, + { + "epoch": 0.5421317884225896, + "grad_norm": 1.8187711238861084, + "learning_rate": 9.161603315905986e-05, + "loss": 1.8155, + "step": 23020 + }, + { + "epoch": 0.5423672931091329, + "grad_norm": 2.425743818283081, + "learning_rate": 9.156893222175121e-05, + "loss": 2.1712, + "step": 23030 + }, + { + "epoch": 0.5426027977956761, + "grad_norm": 1.8586986064910889, + "learning_rate": 9.152183128444256e-05, + "loss": 2.03, + "step": 23040 + }, + { + "epoch": 0.5428383024822194, + "grad_norm": 2.399402618408203, + "learning_rate": 9.147473034713391e-05, + "loss": 1.9666, + "step": 23050 + }, + { + "epoch": 0.5430738071687626, + "grad_norm": 1.966139554977417, + "learning_rate": 9.142762940982526e-05, + "loss": 1.9428, + "step": 23060 + }, + { + "epoch": 0.543309311855306, + "grad_norm": 2.4574363231658936, + "learning_rate": 9.13805284725166e-05, + "loss": 2.115, + "step": 23070 + }, + { + "epoch": 0.5435448165418492, + "grad_norm": 2.235666275024414, + "learning_rate": 9.133342753520796e-05, + "loss": 2.0983, + "step": 23080 + }, + { + "epoch": 0.5437803212283925, + "grad_norm": 1.8291510343551636, + "learning_rate": 9.128632659789931e-05, + "loss": 2.0416, + "step": 23090 + }, + { + "epoch": 0.5440158259149357, + "grad_norm": 2.8294384479522705, + "learning_rate": 9.123922566059065e-05, + "loss": 2.2959, + "step": 23100 + }, + { + "epoch": 0.544251330601479, + "grad_norm": 2.171060085296631, + "learning_rate": 9.1192124723282e-05, + "loss": 2.1052, + "step": 23110 + }, + { + "epoch": 0.5444868352880222, + "grad_norm": 2.0458946228027344, + "learning_rate": 9.114502378597335e-05, + "loss": 2.029, + "step": 23120 + }, + { + "epoch": 0.5447223399745655, + "grad_norm": 1.9284874200820923, + "learning_rate": 9.10979228486647e-05, + "loss": 2.0337, + "step": 23130 + }, + { + "epoch": 0.5449578446611087, + "grad_norm": 1.6422959566116333, + "learning_rate": 9.105082191135605e-05, + "loss": 2.2123, + "step": 23140 + }, + { + "epoch": 0.5451933493476521, + "grad_norm": 2.0372812747955322, + "learning_rate": 9.100372097404739e-05, + "loss": 2.0597, + "step": 23150 + }, + { + "epoch": 0.5454288540341953, + "grad_norm": 2.334294319152832, + "learning_rate": 9.095662003673874e-05, + "loss": 1.9422, + "step": 23160 + }, + { + "epoch": 0.5456643587207386, + "grad_norm": 2.039754629135132, + "learning_rate": 9.090951909943009e-05, + "loss": 2.1631, + "step": 23170 + }, + { + "epoch": 0.5458998634072818, + "grad_norm": 1.9584988355636597, + "learning_rate": 9.086241816212144e-05, + "loss": 2.1069, + "step": 23180 + }, + { + "epoch": 0.546135368093825, + "grad_norm": 2.1057446002960205, + "learning_rate": 9.081531722481277e-05, + "loss": 2.0025, + "step": 23190 + }, + { + "epoch": 0.5463708727803683, + "grad_norm": 2.6134400367736816, + "learning_rate": 9.076821628750412e-05, + "loss": 2.2541, + "step": 23200 + }, + { + "epoch": 0.5466063774669115, + "grad_norm": 1.9767898321151733, + "learning_rate": 9.072111535019547e-05, + "loss": 1.9391, + "step": 23210 + }, + { + "epoch": 0.5468418821534549, + "grad_norm": 2.1497464179992676, + "learning_rate": 9.067401441288682e-05, + "loss": 1.992, + "step": 23220 + }, + { + "epoch": 0.5470773868399981, + "grad_norm": 1.8777291774749756, + "learning_rate": 9.062691347557816e-05, + "loss": 2.0687, + "step": 23230 + }, + { + "epoch": 0.5473128915265414, + "grad_norm": 2.7456014156341553, + "learning_rate": 9.057981253826951e-05, + "loss": 1.9403, + "step": 23240 + }, + { + "epoch": 0.5475483962130846, + "grad_norm": 1.9173210859298706, + "learning_rate": 9.053271160096086e-05, + "loss": 2.362, + "step": 23250 + }, + { + "epoch": 0.5477839008996279, + "grad_norm": 3.4497084617614746, + "learning_rate": 9.048561066365221e-05, + "loss": 2.3116, + "step": 23260 + }, + { + "epoch": 0.5480194055861711, + "grad_norm": 1.9495487213134766, + "learning_rate": 9.043850972634356e-05, + "loss": 2.2498, + "step": 23270 + }, + { + "epoch": 0.5482549102727144, + "grad_norm": 2.1822617053985596, + "learning_rate": 9.03914087890349e-05, + "loss": 2.1444, + "step": 23280 + }, + { + "epoch": 0.5484904149592577, + "grad_norm": 2.625450849533081, + "learning_rate": 9.034430785172625e-05, + "loss": 2.1074, + "step": 23290 + }, + { + "epoch": 0.548725919645801, + "grad_norm": 2.899829149246216, + "learning_rate": 9.02972069144176e-05, + "loss": 2.1126, + "step": 23300 + }, + { + "epoch": 0.5489614243323442, + "grad_norm": 3.0588479042053223, + "learning_rate": 9.025010597710895e-05, + "loss": 1.9202, + "step": 23310 + }, + { + "epoch": 0.5491969290188875, + "grad_norm": 2.1747286319732666, + "learning_rate": 9.020300503980029e-05, + "loss": 1.9893, + "step": 23320 + }, + { + "epoch": 0.5494324337054307, + "grad_norm": 2.146778106689453, + "learning_rate": 9.015590410249164e-05, + "loss": 1.988, + "step": 23330 + }, + { + "epoch": 0.549667938391974, + "grad_norm": 1.8845597505569458, + "learning_rate": 9.010880316518299e-05, + "loss": 2.1452, + "step": 23340 + }, + { + "epoch": 0.5499034430785172, + "grad_norm": 2.016979694366455, + "learning_rate": 9.006170222787434e-05, + "loss": 2.1675, + "step": 23350 + }, + { + "epoch": 0.5501389477650606, + "grad_norm": 2.911409378051758, + "learning_rate": 9.001460129056569e-05, + "loss": 2.0028, + "step": 23360 + }, + { + "epoch": 0.5503744524516038, + "grad_norm": 2.8870551586151123, + "learning_rate": 8.996750035325704e-05, + "loss": 1.9068, + "step": 23370 + }, + { + "epoch": 0.5506099571381471, + "grad_norm": 2.1140267848968506, + "learning_rate": 8.992039941594839e-05, + "loss": 2.1994, + "step": 23380 + }, + { + "epoch": 0.5508454618246903, + "grad_norm": 1.9857615232467651, + "learning_rate": 8.987329847863974e-05, + "loss": 2.1649, + "step": 23390 + }, + { + "epoch": 0.5510809665112336, + "grad_norm": 2.7637999057769775, + "learning_rate": 8.982619754133107e-05, + "loss": 1.9992, + "step": 23400 + }, + { + "epoch": 0.5513164711977768, + "grad_norm": 2.225193738937378, + "learning_rate": 8.977909660402243e-05, + "loss": 2.1342, + "step": 23410 + }, + { + "epoch": 0.5515519758843201, + "grad_norm": 1.7296162843704224, + "learning_rate": 8.973199566671378e-05, + "loss": 2.0405, + "step": 23420 + }, + { + "epoch": 0.5517874805708634, + "grad_norm": 2.2399344444274902, + "learning_rate": 8.968489472940513e-05, + "loss": 2.0995, + "step": 23430 + }, + { + "epoch": 0.5520229852574067, + "grad_norm": 2.2582974433898926, + "learning_rate": 8.963779379209648e-05, + "loss": 1.9908, + "step": 23440 + }, + { + "epoch": 0.5522584899439499, + "grad_norm": 2.6607015132904053, + "learning_rate": 8.959069285478781e-05, + "loss": 1.9292, + "step": 23450 + }, + { + "epoch": 0.5524939946304932, + "grad_norm": 2.3309760093688965, + "learning_rate": 8.954359191747916e-05, + "loss": 1.874, + "step": 23460 + }, + { + "epoch": 0.5527294993170364, + "grad_norm": 2.2204902172088623, + "learning_rate": 8.949649098017051e-05, + "loss": 2.064, + "step": 23470 + }, + { + "epoch": 0.5529650040035796, + "grad_norm": 1.4835526943206787, + "learning_rate": 8.944939004286186e-05, + "loss": 1.9785, + "step": 23480 + }, + { + "epoch": 0.5532005086901229, + "grad_norm": 2.3408985137939453, + "learning_rate": 8.94022891055532e-05, + "loss": 2.0047, + "step": 23490 + }, + { + "epoch": 0.5534360133766661, + "grad_norm": 2.300576686859131, + "learning_rate": 8.935518816824455e-05, + "loss": 2.1159, + "step": 23500 + }, + { + "epoch": 0.5536715180632095, + "grad_norm": 1.9129767417907715, + "learning_rate": 8.93080872309359e-05, + "loss": 2.0221, + "step": 23510 + }, + { + "epoch": 0.5539070227497527, + "grad_norm": 1.8746204376220703, + "learning_rate": 8.926098629362725e-05, + "loss": 2.1288, + "step": 23520 + }, + { + "epoch": 0.554142527436296, + "grad_norm": 1.7646563053131104, + "learning_rate": 8.921388535631859e-05, + "loss": 2.2262, + "step": 23530 + }, + { + "epoch": 0.5543780321228392, + "grad_norm": 1.9448425769805908, + "learning_rate": 8.916678441900994e-05, + "loss": 1.8702, + "step": 23540 + }, + { + "epoch": 0.5546135368093825, + "grad_norm": 2.126228094100952, + "learning_rate": 8.911968348170129e-05, + "loss": 2.1124, + "step": 23550 + }, + { + "epoch": 0.5548490414959257, + "grad_norm": 1.686448097229004, + "learning_rate": 8.907258254439264e-05, + "loss": 2.0523, + "step": 23560 + }, + { + "epoch": 0.555084546182469, + "grad_norm": 2.2924816608428955, + "learning_rate": 8.902548160708399e-05, + "loss": 2.1922, + "step": 23570 + }, + { + "epoch": 0.5553200508690123, + "grad_norm": 2.001551866531372, + "learning_rate": 8.897838066977532e-05, + "loss": 1.8586, + "step": 23580 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.2796730995178223, + "learning_rate": 8.893127973246668e-05, + "loss": 2.1119, + "step": 23590 + }, + { + "epoch": 0.5557910602420988, + "grad_norm": 1.9059703350067139, + "learning_rate": 8.888417879515803e-05, + "loss": 2.1778, + "step": 23600 + }, + { + "epoch": 0.5560265649286421, + "grad_norm": 2.31387996673584, + "learning_rate": 8.883707785784938e-05, + "loss": 1.9897, + "step": 23610 + }, + { + "epoch": 0.5562620696151853, + "grad_norm": 2.014240264892578, + "learning_rate": 8.878997692054071e-05, + "loss": 2.12, + "step": 23620 + }, + { + "epoch": 0.5564975743017286, + "grad_norm": 2.1083483695983887, + "learning_rate": 8.874287598323206e-05, + "loss": 1.9658, + "step": 23630 + }, + { + "epoch": 0.5567330789882718, + "grad_norm": 1.9519131183624268, + "learning_rate": 8.869577504592341e-05, + "loss": 2.2165, + "step": 23640 + }, + { + "epoch": 0.5569685836748152, + "grad_norm": 1.703078031539917, + "learning_rate": 8.864867410861478e-05, + "loss": 2.1628, + "step": 23650 + }, + { + "epoch": 0.5572040883613584, + "grad_norm": 2.055586814880371, + "learning_rate": 8.860157317130611e-05, + "loss": 1.9498, + "step": 23660 + }, + { + "epoch": 0.5574395930479017, + "grad_norm": 2.0325064659118652, + "learning_rate": 8.855447223399746e-05, + "loss": 1.9808, + "step": 23670 + }, + { + "epoch": 0.5576750977344449, + "grad_norm": 1.8139878511428833, + "learning_rate": 8.850737129668881e-05, + "loss": 2.0656, + "step": 23680 + }, + { + "epoch": 0.5579106024209882, + "grad_norm": 2.5634472370147705, + "learning_rate": 8.846027035938016e-05, + "loss": 1.9946, + "step": 23690 + }, + { + "epoch": 0.5581461071075314, + "grad_norm": 2.3983256816864014, + "learning_rate": 8.84131694220715e-05, + "loss": 2.0557, + "step": 23700 + }, + { + "epoch": 0.5583816117940748, + "grad_norm": 1.7592830657958984, + "learning_rate": 8.836606848476285e-05, + "loss": 1.9625, + "step": 23710 + }, + { + "epoch": 0.558617116480618, + "grad_norm": 1.9398764371871948, + "learning_rate": 8.83189675474542e-05, + "loss": 1.9973, + "step": 23720 + }, + { + "epoch": 0.5588526211671613, + "grad_norm": 2.4670112133026123, + "learning_rate": 8.827186661014555e-05, + "loss": 2.0338, + "step": 23730 + }, + { + "epoch": 0.5590881258537045, + "grad_norm": 2.532526969909668, + "learning_rate": 8.82247656728369e-05, + "loss": 2.0994, + "step": 23740 + }, + { + "epoch": 0.5593236305402477, + "grad_norm": 3.6450741291046143, + "learning_rate": 8.81823748292591e-05, + "loss": 2.1508, + "step": 23750 + }, + { + "epoch": 0.559559135226791, + "grad_norm": 2.395998239517212, + "learning_rate": 8.813527389195046e-05, + "loss": 2.035, + "step": 23760 + }, + { + "epoch": 0.5597946399133342, + "grad_norm": 2.092289686203003, + "learning_rate": 8.80881729546418e-05, + "loss": 2.0636, + "step": 23770 + }, + { + "epoch": 0.5600301445998775, + "grad_norm": 1.8740358352661133, + "learning_rate": 8.804107201733314e-05, + "loss": 1.9921, + "step": 23780 + }, + { + "epoch": 0.5602656492864208, + "grad_norm": 3.254704475402832, + "learning_rate": 8.79939710800245e-05, + "loss": 2.0609, + "step": 23790 + }, + { + "epoch": 0.5605011539729641, + "grad_norm": 2.3679122924804688, + "learning_rate": 8.794687014271584e-05, + "loss": 1.9789, + "step": 23800 + }, + { + "epoch": 0.5607366586595073, + "grad_norm": 2.638920783996582, + "learning_rate": 8.78997692054072e-05, + "loss": 2.0542, + "step": 23810 + }, + { + "epoch": 0.5609721633460506, + "grad_norm": 2.1278998851776123, + "learning_rate": 8.785266826809853e-05, + "loss": 2.185, + "step": 23820 + }, + { + "epoch": 0.5612076680325938, + "grad_norm": 2.04278826713562, + "learning_rate": 8.780556733078988e-05, + "loss": 2.1442, + "step": 23830 + }, + { + "epoch": 0.5614431727191371, + "grad_norm": 2.327648639678955, + "learning_rate": 8.775846639348123e-05, + "loss": 2.2638, + "step": 23840 + }, + { + "epoch": 0.5616786774056803, + "grad_norm": 2.190995454788208, + "learning_rate": 8.771136545617258e-05, + "loss": 2.3052, + "step": 23850 + }, + { + "epoch": 0.5619141820922237, + "grad_norm": 2.5929126739501953, + "learning_rate": 8.766426451886393e-05, + "loss": 2.078, + "step": 23860 + }, + { + "epoch": 0.5621496867787669, + "grad_norm": 2.1054494380950928, + "learning_rate": 8.761716358155527e-05, + "loss": 2.1903, + "step": 23870 + }, + { + "epoch": 0.5623851914653102, + "grad_norm": 2.804368495941162, + "learning_rate": 8.757006264424662e-05, + "loss": 1.8636, + "step": 23880 + }, + { + "epoch": 0.5626206961518534, + "grad_norm": 2.6225225925445557, + "learning_rate": 8.752296170693797e-05, + "loss": 2.0436, + "step": 23890 + }, + { + "epoch": 0.5628562008383967, + "grad_norm": 1.7755427360534668, + "learning_rate": 8.747586076962932e-05, + "loss": 2.1628, + "step": 23900 + }, + { + "epoch": 0.5630917055249399, + "grad_norm": 2.0387985706329346, + "learning_rate": 8.742875983232067e-05, + "loss": 2.1442, + "step": 23910 + }, + { + "epoch": 0.5633272102114832, + "grad_norm": 2.0750441551208496, + "learning_rate": 8.738165889501202e-05, + "loss": 2.0374, + "step": 23920 + }, + { + "epoch": 0.5635627148980265, + "grad_norm": 2.1440858840942383, + "learning_rate": 8.733455795770337e-05, + "loss": 2.0789, + "step": 23930 + }, + { + "epoch": 0.5637982195845698, + "grad_norm": 2.1340370178222656, + "learning_rate": 8.728745702039472e-05, + "loss": 2.2198, + "step": 23940 + }, + { + "epoch": 0.564033724271113, + "grad_norm": 2.3632407188415527, + "learning_rate": 8.724035608308606e-05, + "loss": 2.0647, + "step": 23950 + }, + { + "epoch": 0.5642692289576563, + "grad_norm": 2.03859543800354, + "learning_rate": 8.719325514577741e-05, + "loss": 2.1071, + "step": 23960 + }, + { + "epoch": 0.5645047336441995, + "grad_norm": 2.2486722469329834, + "learning_rate": 8.714615420846876e-05, + "loss": 2.0443, + "step": 23970 + }, + { + "epoch": 0.5647402383307428, + "grad_norm": 1.8128294944763184, + "learning_rate": 8.709905327116011e-05, + "loss": 1.9782, + "step": 23980 + }, + { + "epoch": 0.564975743017286, + "grad_norm": 2.3327388763427734, + "learning_rate": 8.705195233385144e-05, + "loss": 2.1579, + "step": 23990 + }, + { + "epoch": 0.5652112477038294, + "grad_norm": 2.0959858894348145, + "learning_rate": 8.70048513965428e-05, + "loss": 1.8995, + "step": 24000 + }, + { + "epoch": 0.5654467523903726, + "grad_norm": 2.87188458442688, + "learning_rate": 8.695775045923414e-05, + "loss": 2.0005, + "step": 24010 + }, + { + "epoch": 0.5656822570769159, + "grad_norm": 2.495626449584961, + "learning_rate": 8.69106495219255e-05, + "loss": 2.3682, + "step": 24020 + }, + { + "epoch": 0.5659177617634591, + "grad_norm": 3.239908456802368, + "learning_rate": 8.686354858461683e-05, + "loss": 1.9192, + "step": 24030 + }, + { + "epoch": 0.5661532664500023, + "grad_norm": 1.99828040599823, + "learning_rate": 8.681644764730818e-05, + "loss": 2.3154, + "step": 24040 + }, + { + "epoch": 0.5663887711365456, + "grad_norm": 2.1217734813690186, + "learning_rate": 8.676934670999953e-05, + "loss": 1.9546, + "step": 24050 + }, + { + "epoch": 0.5666242758230888, + "grad_norm": 3.101858377456665, + "learning_rate": 8.672224577269088e-05, + "loss": 2.0897, + "step": 24060 + }, + { + "epoch": 0.5668597805096321, + "grad_norm": 2.358698606491089, + "learning_rate": 8.667514483538223e-05, + "loss": 2.0906, + "step": 24070 + }, + { + "epoch": 0.5670952851961754, + "grad_norm": 2.2802388668060303, + "learning_rate": 8.662804389807357e-05, + "loss": 2.0595, + "step": 24080 + }, + { + "epoch": 0.5673307898827187, + "grad_norm": 2.175506114959717, + "learning_rate": 8.658094296076492e-05, + "loss": 1.8499, + "step": 24090 + }, + { + "epoch": 0.5675662945692619, + "grad_norm": 1.782609224319458, + "learning_rate": 8.653384202345627e-05, + "loss": 2.1053, + "step": 24100 + }, + { + "epoch": 0.5678017992558052, + "grad_norm": 1.9120488166809082, + "learning_rate": 8.648674108614762e-05, + "loss": 2.2141, + "step": 24110 + }, + { + "epoch": 0.5680373039423484, + "grad_norm": 3.216996908187866, + "learning_rate": 8.643964014883896e-05, + "loss": 2.0973, + "step": 24120 + }, + { + "epoch": 0.5682728086288917, + "grad_norm": 2.160984754562378, + "learning_rate": 8.639253921153031e-05, + "loss": 2.1362, + "step": 24130 + }, + { + "epoch": 0.5685083133154349, + "grad_norm": 1.6789969205856323, + "learning_rate": 8.634543827422166e-05, + "loss": 2.1757, + "step": 24140 + }, + { + "epoch": 0.5687438180019783, + "grad_norm": 2.3847877979278564, + "learning_rate": 8.629833733691301e-05, + "loss": 2.2551, + "step": 24150 + }, + { + "epoch": 0.5689793226885215, + "grad_norm": 2.110700845718384, + "learning_rate": 8.625123639960436e-05, + "loss": 2.1488, + "step": 24160 + }, + { + "epoch": 0.5692148273750648, + "grad_norm": 2.052661657333374, + "learning_rate": 8.62041354622957e-05, + "loss": 2.166, + "step": 24170 + }, + { + "epoch": 0.569450332061608, + "grad_norm": 2.335162401199341, + "learning_rate": 8.615703452498704e-05, + "loss": 1.9717, + "step": 24180 + }, + { + "epoch": 0.5696858367481513, + "grad_norm": 3.1710004806518555, + "learning_rate": 8.610993358767841e-05, + "loss": 1.8111, + "step": 24190 + }, + { + "epoch": 0.5699213414346945, + "grad_norm": 2.3458499908447266, + "learning_rate": 8.606283265036975e-05, + "loss": 2.159, + "step": 24200 + }, + { + "epoch": 0.5701568461212378, + "grad_norm": 2.641172170639038, + "learning_rate": 8.60157317130611e-05, + "loss": 1.9753, + "step": 24210 + }, + { + "epoch": 0.5703923508077811, + "grad_norm": 1.8560911417007446, + "learning_rate": 8.596863077575245e-05, + "loss": 2.1409, + "step": 24220 + }, + { + "epoch": 0.5706278554943244, + "grad_norm": 2.3327791690826416, + "learning_rate": 8.59215298384438e-05, + "loss": 2.0098, + "step": 24230 + }, + { + "epoch": 0.5708633601808676, + "grad_norm": 2.3636698722839355, + "learning_rate": 8.587442890113515e-05, + "loss": 2.1375, + "step": 24240 + }, + { + "epoch": 0.5710988648674109, + "grad_norm": 2.747985601425171, + "learning_rate": 8.582732796382648e-05, + "loss": 2.1451, + "step": 24250 + }, + { + "epoch": 0.5713343695539541, + "grad_norm": 2.1141562461853027, + "learning_rate": 8.578022702651783e-05, + "loss": 2.1145, + "step": 24260 + }, + { + "epoch": 0.5715698742404974, + "grad_norm": 2.0584163665771484, + "learning_rate": 8.573312608920918e-05, + "loss": 1.919, + "step": 24270 + }, + { + "epoch": 0.5718053789270406, + "grad_norm": 2.1240551471710205, + "learning_rate": 8.568602515190053e-05, + "loss": 2.0322, + "step": 24280 + }, + { + "epoch": 0.572040883613584, + "grad_norm": 2.4911723136901855, + "learning_rate": 8.563892421459187e-05, + "loss": 2.4528, + "step": 24290 + }, + { + "epoch": 0.5722763883001272, + "grad_norm": 1.859668493270874, + "learning_rate": 8.559182327728322e-05, + "loss": 2.027, + "step": 24300 + }, + { + "epoch": 0.5725118929866705, + "grad_norm": 1.7338566780090332, + "learning_rate": 8.554472233997457e-05, + "loss": 1.9711, + "step": 24310 + }, + { + "epoch": 0.5727473976732137, + "grad_norm": 2.1398377418518066, + "learning_rate": 8.549762140266592e-05, + "loss": 2.1034, + "step": 24320 + }, + { + "epoch": 0.5729829023597569, + "grad_norm": 1.8699679374694824, + "learning_rate": 8.545052046535726e-05, + "loss": 1.9032, + "step": 24330 + }, + { + "epoch": 0.5732184070463002, + "grad_norm": 2.045719623565674, + "learning_rate": 8.540341952804861e-05, + "loss": 1.9111, + "step": 24340 + }, + { + "epoch": 0.5734539117328434, + "grad_norm": 2.614588499069214, + "learning_rate": 8.535631859073996e-05, + "loss": 2.0196, + "step": 24350 + }, + { + "epoch": 0.5736894164193868, + "grad_norm": 2.135077953338623, + "learning_rate": 8.530921765343131e-05, + "loss": 2.2211, + "step": 24360 + }, + { + "epoch": 0.57392492110593, + "grad_norm": 2.3194169998168945, + "learning_rate": 8.526211671612266e-05, + "loss": 2.1909, + "step": 24370 + }, + { + "epoch": 0.5741604257924733, + "grad_norm": 2.06208872795105, + "learning_rate": 8.5215015778814e-05, + "loss": 2.099, + "step": 24380 + }, + { + "epoch": 0.5743959304790165, + "grad_norm": 1.9081873893737793, + "learning_rate": 8.516791484150535e-05, + "loss": 1.8989, + "step": 24390 + }, + { + "epoch": 0.5746314351655598, + "grad_norm": 1.925196886062622, + "learning_rate": 8.51208139041967e-05, + "loss": 2.1537, + "step": 24400 + }, + { + "epoch": 0.574866939852103, + "grad_norm": 2.0393810272216797, + "learning_rate": 8.507371296688805e-05, + "loss": 1.9381, + "step": 24410 + }, + { + "epoch": 0.5751024445386463, + "grad_norm": 2.061901807785034, + "learning_rate": 8.502661202957938e-05, + "loss": 2.0184, + "step": 24420 + }, + { + "epoch": 0.5753379492251895, + "grad_norm": 1.582486629486084, + "learning_rate": 8.497951109227073e-05, + "loss": 2.1767, + "step": 24430 + }, + { + "epoch": 0.5755734539117329, + "grad_norm": 2.24311900138855, + "learning_rate": 8.493241015496208e-05, + "loss": 2.1155, + "step": 24440 + }, + { + "epoch": 0.5758089585982761, + "grad_norm": 1.794895052909851, + "learning_rate": 8.488530921765343e-05, + "loss": 1.8587, + "step": 24450 + }, + { + "epoch": 0.5760444632848194, + "grad_norm": 2.3999338150024414, + "learning_rate": 8.483820828034478e-05, + "loss": 2.1511, + "step": 24460 + }, + { + "epoch": 0.5762799679713626, + "grad_norm": 2.538761854171753, + "learning_rate": 8.479110734303613e-05, + "loss": 2.1788, + "step": 24470 + }, + { + "epoch": 0.5765154726579059, + "grad_norm": 2.7868943214416504, + "learning_rate": 8.474400640572748e-05, + "loss": 2.0722, + "step": 24480 + }, + { + "epoch": 0.5767509773444491, + "grad_norm": 2.052523136138916, + "learning_rate": 8.469690546841883e-05, + "loss": 2.1654, + "step": 24490 + }, + { + "epoch": 0.5769864820309925, + "grad_norm": 2.1687915325164795, + "learning_rate": 8.464980453111017e-05, + "loss": 1.9479, + "step": 24500 + }, + { + "epoch": 0.5772219867175357, + "grad_norm": 2.0487418174743652, + "learning_rate": 8.460270359380152e-05, + "loss": 2.1456, + "step": 24510 + }, + { + "epoch": 0.577457491404079, + "grad_norm": 2.201139211654663, + "learning_rate": 8.455560265649287e-05, + "loss": 2.0357, + "step": 24520 + }, + { + "epoch": 0.5776929960906222, + "grad_norm": 2.2305378913879395, + "learning_rate": 8.450850171918422e-05, + "loss": 2.1552, + "step": 24530 + }, + { + "epoch": 0.5779285007771655, + "grad_norm": 4.121927261352539, + "learning_rate": 8.446140078187557e-05, + "loss": 2.0478, + "step": 24540 + }, + { + "epoch": 0.5781640054637087, + "grad_norm": 2.2727227210998535, + "learning_rate": 8.441429984456691e-05, + "loss": 1.9688, + "step": 24550 + }, + { + "epoch": 0.578399510150252, + "grad_norm": 2.3096418380737305, + "learning_rate": 8.436719890725826e-05, + "loss": 2.0188, + "step": 24560 + }, + { + "epoch": 0.5786350148367952, + "grad_norm": 1.787701964378357, + "learning_rate": 8.432009796994961e-05, + "loss": 1.9433, + "step": 24570 + }, + { + "epoch": 0.5788705195233386, + "grad_norm": 1.5643645524978638, + "learning_rate": 8.427299703264096e-05, + "loss": 1.9101, + "step": 24580 + }, + { + "epoch": 0.5791060242098818, + "grad_norm": 2.1164591312408447, + "learning_rate": 8.42258960953323e-05, + "loss": 2.0142, + "step": 24590 + }, + { + "epoch": 0.579341528896425, + "grad_norm": 1.9861196279525757, + "learning_rate": 8.417879515802365e-05, + "loss": 2.1013, + "step": 24600 + }, + { + "epoch": 0.5795770335829683, + "grad_norm": 2.388425588607788, + "learning_rate": 8.4131694220715e-05, + "loss": 1.9681, + "step": 24610 + }, + { + "epoch": 0.5798125382695115, + "grad_norm": 1.6500285863876343, + "learning_rate": 8.408459328340635e-05, + "loss": 1.992, + "step": 24620 + }, + { + "epoch": 0.5800480429560548, + "grad_norm": 1.7825500965118408, + "learning_rate": 8.403749234609768e-05, + "loss": 2.0607, + "step": 24630 + }, + { + "epoch": 0.580283547642598, + "grad_norm": 2.313638210296631, + "learning_rate": 8.399039140878903e-05, + "loss": 2.0777, + "step": 24640 + }, + { + "epoch": 0.5805190523291414, + "grad_norm": 2.432238817214966, + "learning_rate": 8.394329047148038e-05, + "loss": 2.0948, + "step": 24650 + }, + { + "epoch": 0.5807545570156846, + "grad_norm": 2.8354523181915283, + "learning_rate": 8.389618953417173e-05, + "loss": 2.1771, + "step": 24660 + }, + { + "epoch": 0.5809900617022279, + "grad_norm": 2.3798136711120605, + "learning_rate": 8.384908859686308e-05, + "loss": 1.9815, + "step": 24670 + }, + { + "epoch": 0.5812255663887711, + "grad_norm": 2.092031478881836, + "learning_rate": 8.380198765955442e-05, + "loss": 2.0798, + "step": 24680 + }, + { + "epoch": 0.5814610710753144, + "grad_norm": 2.465062379837036, + "learning_rate": 8.375488672224577e-05, + "loss": 1.8865, + "step": 24690 + }, + { + "epoch": 0.5816965757618576, + "grad_norm": 2.506052017211914, + "learning_rate": 8.370778578493712e-05, + "loss": 1.9352, + "step": 24700 + }, + { + "epoch": 0.581932080448401, + "grad_norm": 2.2928619384765625, + "learning_rate": 8.366068484762847e-05, + "loss": 1.8513, + "step": 24710 + }, + { + "epoch": 0.5821675851349442, + "grad_norm": 1.858624815940857, + "learning_rate": 8.361358391031981e-05, + "loss": 2.095, + "step": 24720 + }, + { + "epoch": 0.5824030898214875, + "grad_norm": 2.981670379638672, + "learning_rate": 8.356648297301116e-05, + "loss": 1.978, + "step": 24730 + }, + { + "epoch": 0.5826385945080307, + "grad_norm": 2.1256954669952393, + "learning_rate": 8.351938203570251e-05, + "loss": 1.9985, + "step": 24740 + }, + { + "epoch": 0.582874099194574, + "grad_norm": 1.9734386205673218, + "learning_rate": 8.347228109839387e-05, + "loss": 2.0097, + "step": 24750 + }, + { + "epoch": 0.5831096038811172, + "grad_norm": 2.111104965209961, + "learning_rate": 8.342518016108521e-05, + "loss": 2.0503, + "step": 24760 + }, + { + "epoch": 0.5833451085676605, + "grad_norm": 1.7222291231155396, + "learning_rate": 8.337807922377656e-05, + "loss": 1.8505, + "step": 24770 + }, + { + "epoch": 0.5835806132542037, + "grad_norm": 2.5214107036590576, + "learning_rate": 8.333097828646791e-05, + "loss": 2.0998, + "step": 24780 + }, + { + "epoch": 0.5838161179407471, + "grad_norm": 2.7706222534179688, + "learning_rate": 8.328387734915926e-05, + "loss": 2.0881, + "step": 24790 + }, + { + "epoch": 0.5840516226272903, + "grad_norm": 2.2346792221069336, + "learning_rate": 8.32367764118506e-05, + "loss": 2.0906, + "step": 24800 + }, + { + "epoch": 0.5842871273138336, + "grad_norm": 2.1181459426879883, + "learning_rate": 8.318967547454195e-05, + "loss": 1.9335, + "step": 24810 + }, + { + "epoch": 0.5845226320003768, + "grad_norm": 2.1477532386779785, + "learning_rate": 8.31425745372333e-05, + "loss": 2.1249, + "step": 24820 + }, + { + "epoch": 0.5847581366869201, + "grad_norm": 2.5332417488098145, + "learning_rate": 8.309547359992465e-05, + "loss": 2.1898, + "step": 24830 + }, + { + "epoch": 0.5849936413734633, + "grad_norm": 1.6639565229415894, + "learning_rate": 8.3048372662616e-05, + "loss": 2.077, + "step": 24840 + }, + { + "epoch": 0.5852291460600066, + "grad_norm": 2.576613187789917, + "learning_rate": 8.300127172530733e-05, + "loss": 2.1235, + "step": 24850 + }, + { + "epoch": 0.5854646507465499, + "grad_norm": 2.24748158454895, + "learning_rate": 8.295417078799869e-05, + "loss": 1.7241, + "step": 24860 + }, + { + "epoch": 0.5857001554330932, + "grad_norm": 2.139662265777588, + "learning_rate": 8.290706985069004e-05, + "loss": 2.0476, + "step": 24870 + }, + { + "epoch": 0.5859356601196364, + "grad_norm": 2.513113260269165, + "learning_rate": 8.285996891338139e-05, + "loss": 2.0448, + "step": 24880 + }, + { + "epoch": 0.5861711648061796, + "grad_norm": 2.219991445541382, + "learning_rate": 8.281286797607272e-05, + "loss": 1.7964, + "step": 24890 + }, + { + "epoch": 0.5864066694927229, + "grad_norm": 2.668653964996338, + "learning_rate": 8.276576703876407e-05, + "loss": 2.2925, + "step": 24900 + }, + { + "epoch": 0.5866421741792661, + "grad_norm": 2.3607492446899414, + "learning_rate": 8.271866610145542e-05, + "loss": 2.0662, + "step": 24910 + }, + { + "epoch": 0.5868776788658094, + "grad_norm": 2.165482521057129, + "learning_rate": 8.267156516414677e-05, + "loss": 2.0108, + "step": 24920 + }, + { + "epoch": 0.5871131835523526, + "grad_norm": 3.355048418045044, + "learning_rate": 8.262446422683811e-05, + "loss": 2.2239, + "step": 24930 + }, + { + "epoch": 0.587348688238896, + "grad_norm": 2.3312196731567383, + "learning_rate": 8.257736328952946e-05, + "loss": 2.1449, + "step": 24940 + }, + { + "epoch": 0.5875841929254392, + "grad_norm": 2.766026735305786, + "learning_rate": 8.253026235222081e-05, + "loss": 2.0531, + "step": 24950 + }, + { + "epoch": 0.5878196976119825, + "grad_norm": 2.3361899852752686, + "learning_rate": 8.248316141491216e-05, + "loss": 2.029, + "step": 24960 + }, + { + "epoch": 0.5880552022985257, + "grad_norm": 2.563586950302124, + "learning_rate": 8.243606047760351e-05, + "loss": 2.0073, + "step": 24970 + }, + { + "epoch": 0.588290706985069, + "grad_norm": 3.044583320617676, + "learning_rate": 8.238895954029485e-05, + "loss": 2.3071, + "step": 24980 + }, + { + "epoch": 0.5885262116716122, + "grad_norm": 1.7707631587982178, + "learning_rate": 8.23418586029862e-05, + "loss": 1.9584, + "step": 24990 + }, + { + "epoch": 0.5887617163581556, + "grad_norm": 3.5477962493896484, + "learning_rate": 8.229475766567755e-05, + "loss": 2.0358, + "step": 25000 + }, + { + "epoch": 0.5889972210446988, + "grad_norm": 2.394061326980591, + "learning_rate": 8.22476567283689e-05, + "loss": 2.0315, + "step": 25010 + }, + { + "epoch": 0.5892327257312421, + "grad_norm": 2.087583065032959, + "learning_rate": 8.220055579106023e-05, + "loss": 2.0567, + "step": 25020 + }, + { + "epoch": 0.5894682304177853, + "grad_norm": 1.9279582500457764, + "learning_rate": 8.21534548537516e-05, + "loss": 2.2003, + "step": 25030 + }, + { + "epoch": 0.5897037351043286, + "grad_norm": 2.105403184890747, + "learning_rate": 8.210635391644295e-05, + "loss": 1.9165, + "step": 25040 + }, + { + "epoch": 0.5899392397908718, + "grad_norm": 2.1151294708251953, + "learning_rate": 8.20592529791343e-05, + "loss": 2.1332, + "step": 25050 + }, + { + "epoch": 0.5901747444774151, + "grad_norm": 2.34464693069458, + "learning_rate": 8.201215204182564e-05, + "loss": 1.9142, + "step": 25060 + }, + { + "epoch": 0.5904102491639583, + "grad_norm": 2.0007407665252686, + "learning_rate": 8.196505110451699e-05, + "loss": 1.9881, + "step": 25070 + }, + { + "epoch": 0.5906457538505017, + "grad_norm": 2.4901375770568848, + "learning_rate": 8.191795016720834e-05, + "loss": 2.0313, + "step": 25080 + }, + { + "epoch": 0.5908812585370449, + "grad_norm": 1.5879149436950684, + "learning_rate": 8.187084922989969e-05, + "loss": 2.0404, + "step": 25090 + }, + { + "epoch": 0.5911167632235882, + "grad_norm": 1.9847928285598755, + "learning_rate": 8.182374829259102e-05, + "loss": 1.9612, + "step": 25100 + }, + { + "epoch": 0.5913522679101314, + "grad_norm": 1.7049063444137573, + "learning_rate": 8.177664735528237e-05, + "loss": 2.1503, + "step": 25110 + }, + { + "epoch": 0.5915877725966747, + "grad_norm": 2.089395046234131, + "learning_rate": 8.172954641797372e-05, + "loss": 2.1733, + "step": 25120 + }, + { + "epoch": 0.5918232772832179, + "grad_norm": 2.2690000534057617, + "learning_rate": 8.168244548066507e-05, + "loss": 2.1087, + "step": 25130 + }, + { + "epoch": 0.5920587819697612, + "grad_norm": 2.3506076335906982, + "learning_rate": 8.163534454335642e-05, + "loss": 1.9857, + "step": 25140 + }, + { + "epoch": 0.5922942866563045, + "grad_norm": 2.1546356678009033, + "learning_rate": 8.158824360604776e-05, + "loss": 1.8125, + "step": 25150 + }, + { + "epoch": 0.5925297913428478, + "grad_norm": 2.6273109912872314, + "learning_rate": 8.154114266873911e-05, + "loss": 2.1505, + "step": 25160 + }, + { + "epoch": 0.592765296029391, + "grad_norm": 2.0121755599975586, + "learning_rate": 8.149404173143046e-05, + "loss": 2.2848, + "step": 25170 + }, + { + "epoch": 0.5930008007159342, + "grad_norm": 2.0071730613708496, + "learning_rate": 8.144694079412181e-05, + "loss": 2.0596, + "step": 25180 + }, + { + "epoch": 0.5932363054024775, + "grad_norm": 1.9122415781021118, + "learning_rate": 8.139983985681315e-05, + "loss": 1.9869, + "step": 25190 + }, + { + "epoch": 0.5934718100890207, + "grad_norm": 2.8985416889190674, + "learning_rate": 8.13527389195045e-05, + "loss": 1.9928, + "step": 25200 + }, + { + "epoch": 0.593707314775564, + "grad_norm": 2.471587657928467, + "learning_rate": 8.130563798219585e-05, + "loss": 1.9349, + "step": 25210 + }, + { + "epoch": 0.5939428194621073, + "grad_norm": 2.232741594314575, + "learning_rate": 8.12585370448872e-05, + "loss": 2.3072, + "step": 25220 + }, + { + "epoch": 0.5941783241486506, + "grad_norm": 2.7327969074249268, + "learning_rate": 8.121143610757854e-05, + "loss": 2.0982, + "step": 25230 + }, + { + "epoch": 0.5944138288351938, + "grad_norm": 2.561145305633545, + "learning_rate": 8.116433517026989e-05, + "loss": 1.9554, + "step": 25240 + }, + { + "epoch": 0.5946493335217371, + "grad_norm": 2.3863377571105957, + "learning_rate": 8.111723423296124e-05, + "loss": 1.9331, + "step": 25250 + }, + { + "epoch": 0.5948848382082803, + "grad_norm": 2.0999460220336914, + "learning_rate": 8.107484338938345e-05, + "loss": 1.9116, + "step": 25260 + }, + { + "epoch": 0.5951203428948236, + "grad_norm": 2.5434398651123047, + "learning_rate": 8.102774245207479e-05, + "loss": 2.1479, + "step": 25270 + }, + { + "epoch": 0.5953558475813668, + "grad_norm": 2.038438558578491, + "learning_rate": 8.098064151476614e-05, + "loss": 2.0945, + "step": 25280 + }, + { + "epoch": 0.5955913522679102, + "grad_norm": 2.3757026195526123, + "learning_rate": 8.09335405774575e-05, + "loss": 2.142, + "step": 25290 + }, + { + "epoch": 0.5958268569544534, + "grad_norm": 2.268975257873535, + "learning_rate": 8.088643964014884e-05, + "loss": 2.0171, + "step": 25300 + }, + { + "epoch": 0.5960623616409967, + "grad_norm": 2.189600944519043, + "learning_rate": 8.083933870284019e-05, + "loss": 2.0769, + "step": 25310 + }, + { + "epoch": 0.5962978663275399, + "grad_norm": 2.25864577293396, + "learning_rate": 8.079223776553154e-05, + "loss": 2.0467, + "step": 25320 + }, + { + "epoch": 0.5965333710140832, + "grad_norm": 1.8093814849853516, + "learning_rate": 8.074513682822289e-05, + "loss": 1.9886, + "step": 25330 + }, + { + "epoch": 0.5967688757006264, + "grad_norm": 2.4894871711730957, + "learning_rate": 8.069803589091424e-05, + "loss": 2.0495, + "step": 25340 + }, + { + "epoch": 0.5970043803871697, + "grad_norm": 2.965756893157959, + "learning_rate": 8.065093495360558e-05, + "loss": 2.233, + "step": 25350 + }, + { + "epoch": 0.597239885073713, + "grad_norm": 2.278423309326172, + "learning_rate": 8.060383401629693e-05, + "loss": 2.0152, + "step": 25360 + }, + { + "epoch": 0.5974753897602563, + "grad_norm": 2.2974514961242676, + "learning_rate": 8.055673307898828e-05, + "loss": 2.2189, + "step": 25370 + }, + { + "epoch": 0.5977108944467995, + "grad_norm": 2.401662588119507, + "learning_rate": 8.050963214167963e-05, + "loss": 1.9795, + "step": 25380 + }, + { + "epoch": 0.5979463991333428, + "grad_norm": 2.429227113723755, + "learning_rate": 8.046253120437097e-05, + "loss": 1.9638, + "step": 25390 + }, + { + "epoch": 0.598181903819886, + "grad_norm": 2.7841548919677734, + "learning_rate": 8.041543026706232e-05, + "loss": 2.1462, + "step": 25400 + }, + { + "epoch": 0.5984174085064293, + "grad_norm": 2.3505892753601074, + "learning_rate": 8.036832932975367e-05, + "loss": 2.1676, + "step": 25410 + }, + { + "epoch": 0.5986529131929725, + "grad_norm": 1.8642781972885132, + "learning_rate": 8.032122839244502e-05, + "loss": 2.2374, + "step": 25420 + }, + { + "epoch": 0.5988884178795159, + "grad_norm": 2.466585874557495, + "learning_rate": 8.027412745513637e-05, + "loss": 2.1254, + "step": 25430 + }, + { + "epoch": 0.5991239225660591, + "grad_norm": 2.247736692428589, + "learning_rate": 8.02270265178277e-05, + "loss": 1.8839, + "step": 25440 + }, + { + "epoch": 0.5993594272526023, + "grad_norm": 2.2101500034332275, + "learning_rate": 8.017992558051905e-05, + "loss": 2.0983, + "step": 25450 + }, + { + "epoch": 0.5995949319391456, + "grad_norm": 2.6747212409973145, + "learning_rate": 8.01328246432104e-05, + "loss": 2.0788, + "step": 25460 + }, + { + "epoch": 0.5998304366256888, + "grad_norm": 2.289003610610962, + "learning_rate": 8.008572370590176e-05, + "loss": 2.1053, + "step": 25470 + }, + { + "epoch": 0.6000659413122321, + "grad_norm": 2.8795812129974365, + "learning_rate": 8.003862276859309e-05, + "loss": 1.9574, + "step": 25480 + }, + { + "epoch": 0.6003014459987753, + "grad_norm": 1.9603990316390991, + "learning_rate": 7.999152183128444e-05, + "loss": 2.1414, + "step": 25490 + }, + { + "epoch": 0.6005369506853186, + "grad_norm": 1.8929415941238403, + "learning_rate": 7.994442089397579e-05, + "loss": 2.0685, + "step": 25500 + }, + { + "epoch": 0.6007724553718619, + "grad_norm": 2.2865395545959473, + "learning_rate": 7.989731995666714e-05, + "loss": 1.9385, + "step": 25510 + }, + { + "epoch": 0.6010079600584052, + "grad_norm": 1.7141823768615723, + "learning_rate": 7.985021901935848e-05, + "loss": 1.9145, + "step": 25520 + }, + { + "epoch": 0.6012434647449484, + "grad_norm": 2.2416176795959473, + "learning_rate": 7.980311808204983e-05, + "loss": 1.9598, + "step": 25530 + }, + { + "epoch": 0.6014789694314917, + "grad_norm": 1.8684344291687012, + "learning_rate": 7.975601714474118e-05, + "loss": 2.0327, + "step": 25540 + }, + { + "epoch": 0.6017144741180349, + "grad_norm": 2.3493103981018066, + "learning_rate": 7.970891620743253e-05, + "loss": 1.9675, + "step": 25550 + }, + { + "epoch": 0.6019499788045782, + "grad_norm": 2.669125556945801, + "learning_rate": 7.966181527012388e-05, + "loss": 2.1429, + "step": 25560 + }, + { + "epoch": 0.6021854834911214, + "grad_norm": 2.1872849464416504, + "learning_rate": 7.961471433281523e-05, + "loss": 2.1035, + "step": 25570 + }, + { + "epoch": 0.6024209881776648, + "grad_norm": 2.5659477710723877, + "learning_rate": 7.956761339550658e-05, + "loss": 2.1668, + "step": 25580 + }, + { + "epoch": 0.602656492864208, + "grad_norm": 1.815410852432251, + "learning_rate": 7.952051245819793e-05, + "loss": 1.9685, + "step": 25590 + }, + { + "epoch": 0.6028919975507513, + "grad_norm": 2.178764820098877, + "learning_rate": 7.947341152088927e-05, + "loss": 2.0261, + "step": 25600 + }, + { + "epoch": 0.6031275022372945, + "grad_norm": 2.0727059841156006, + "learning_rate": 7.942631058358062e-05, + "loss": 2.3205, + "step": 25610 + }, + { + "epoch": 0.6033630069238378, + "grad_norm": 2.4925308227539062, + "learning_rate": 7.937920964627197e-05, + "loss": 1.9224, + "step": 25620 + }, + { + "epoch": 0.603598511610381, + "grad_norm": 2.7875311374664307, + "learning_rate": 7.933210870896332e-05, + "loss": 2.1901, + "step": 25630 + }, + { + "epoch": 0.6038340162969243, + "grad_norm": 2.135871648788452, + "learning_rate": 7.928500777165467e-05, + "loss": 2.0171, + "step": 25640 + }, + { + "epoch": 0.6040695209834676, + "grad_norm": 2.568267345428467, + "learning_rate": 7.9237906834346e-05, + "loss": 2.2376, + "step": 25650 + }, + { + "epoch": 0.6043050256700109, + "grad_norm": 2.8752918243408203, + "learning_rate": 7.919080589703736e-05, + "loss": 2.0801, + "step": 25660 + }, + { + "epoch": 0.6045405303565541, + "grad_norm": 2.950716257095337, + "learning_rate": 7.91437049597287e-05, + "loss": 2.1433, + "step": 25670 + }, + { + "epoch": 0.6047760350430974, + "grad_norm": 1.806060791015625, + "learning_rate": 7.909660402242006e-05, + "loss": 2.0425, + "step": 25680 + }, + { + "epoch": 0.6050115397296406, + "grad_norm": 2.009774923324585, + "learning_rate": 7.904950308511139e-05, + "loss": 1.9264, + "step": 25690 + }, + { + "epoch": 0.6052470444161839, + "grad_norm": 2.4653844833374023, + "learning_rate": 7.900240214780274e-05, + "loss": 2.2729, + "step": 25700 + }, + { + "epoch": 0.6054825491027271, + "grad_norm": 2.0661404132843018, + "learning_rate": 7.895530121049409e-05, + "loss": 1.8604, + "step": 25710 + }, + { + "epoch": 0.6057180537892705, + "grad_norm": 2.1067216396331787, + "learning_rate": 7.890820027318544e-05, + "loss": 1.918, + "step": 25720 + }, + { + "epoch": 0.6059535584758137, + "grad_norm": 1.9731528759002686, + "learning_rate": 7.88610993358768e-05, + "loss": 2.0417, + "step": 25730 + }, + { + "epoch": 0.6061890631623569, + "grad_norm": 2.0043580532073975, + "learning_rate": 7.881399839856813e-05, + "loss": 1.8275, + "step": 25740 + }, + { + "epoch": 0.6064245678489002, + "grad_norm": 2.3526785373687744, + "learning_rate": 7.876689746125948e-05, + "loss": 1.9985, + "step": 25750 + }, + { + "epoch": 0.6066600725354434, + "grad_norm": 2.168198585510254, + "learning_rate": 7.871979652395083e-05, + "loss": 1.9785, + "step": 25760 + }, + { + "epoch": 0.6068955772219867, + "grad_norm": 1.7501553297042847, + "learning_rate": 7.867269558664218e-05, + "loss": 2.0646, + "step": 25770 + }, + { + "epoch": 0.6071310819085299, + "grad_norm": 2.0135562419891357, + "learning_rate": 7.862559464933352e-05, + "loss": 2.152, + "step": 25780 + }, + { + "epoch": 0.6073665865950733, + "grad_norm": 1.9424461126327515, + "learning_rate": 7.857849371202487e-05, + "loss": 2.0319, + "step": 25790 + }, + { + "epoch": 0.6076020912816165, + "grad_norm": 2.0011091232299805, + "learning_rate": 7.853139277471622e-05, + "loss": 1.9419, + "step": 25800 + }, + { + "epoch": 0.6078375959681598, + "grad_norm": 1.9479938745498657, + "learning_rate": 7.848429183740757e-05, + "loss": 2.0519, + "step": 25810 + }, + { + "epoch": 0.608073100654703, + "grad_norm": 2.629399538040161, + "learning_rate": 7.84371909000989e-05, + "loss": 1.8832, + "step": 25820 + }, + { + "epoch": 0.6083086053412463, + "grad_norm": 2.2955732345581055, + "learning_rate": 7.839008996279026e-05, + "loss": 1.8042, + "step": 25830 + }, + { + "epoch": 0.6085441100277895, + "grad_norm": 2.8424713611602783, + "learning_rate": 7.83429890254816e-05, + "loss": 2.4232, + "step": 25840 + }, + { + "epoch": 0.6087796147143328, + "grad_norm": 2.084066152572632, + "learning_rate": 7.829588808817296e-05, + "loss": 2.0076, + "step": 25850 + }, + { + "epoch": 0.609015119400876, + "grad_norm": 2.988767623901367, + "learning_rate": 7.82487871508643e-05, + "loss": 1.9684, + "step": 25860 + }, + { + "epoch": 0.6092506240874194, + "grad_norm": 2.3572447299957275, + "learning_rate": 7.820168621355566e-05, + "loss": 1.9121, + "step": 25870 + }, + { + "epoch": 0.6094861287739626, + "grad_norm": 1.948307752609253, + "learning_rate": 7.8154585276247e-05, + "loss": 1.6575, + "step": 25880 + }, + { + "epoch": 0.6097216334605059, + "grad_norm": 4.080255031585693, + "learning_rate": 7.810748433893836e-05, + "loss": 2.0366, + "step": 25890 + }, + { + "epoch": 0.6099571381470491, + "grad_norm": 2.5627658367156982, + "learning_rate": 7.80603834016297e-05, + "loss": 1.7901, + "step": 25900 + }, + { + "epoch": 0.6101926428335924, + "grad_norm": 1.9693503379821777, + "learning_rate": 7.801328246432104e-05, + "loss": 1.9767, + "step": 25910 + }, + { + "epoch": 0.6104281475201356, + "grad_norm": 2.0714879035949707, + "learning_rate": 7.79661815270124e-05, + "loss": 2.0898, + "step": 25920 + }, + { + "epoch": 0.610663652206679, + "grad_norm": 2.8542380332946777, + "learning_rate": 7.791908058970374e-05, + "loss": 2.2137, + "step": 25930 + }, + { + "epoch": 0.6108991568932222, + "grad_norm": 2.3239822387695312, + "learning_rate": 7.78719796523951e-05, + "loss": 1.8162, + "step": 25940 + }, + { + "epoch": 0.6111346615797655, + "grad_norm": 1.9721574783325195, + "learning_rate": 7.782487871508643e-05, + "loss": 2.1246, + "step": 25950 + }, + { + "epoch": 0.6113701662663087, + "grad_norm": 2.5218284130096436, + "learning_rate": 7.777777777777778e-05, + "loss": 1.9872, + "step": 25960 + }, + { + "epoch": 0.611605670952852, + "grad_norm": 1.9830020666122437, + "learning_rate": 7.773067684046913e-05, + "loss": 1.8981, + "step": 25970 + }, + { + "epoch": 0.6118411756393952, + "grad_norm": 2.9052417278289795, + "learning_rate": 7.768357590316048e-05, + "loss": 1.961, + "step": 25980 + }, + { + "epoch": 0.6120766803259385, + "grad_norm": 2.380767583847046, + "learning_rate": 7.763647496585182e-05, + "loss": 2.0117, + "step": 25990 + }, + { + "epoch": 0.6123121850124817, + "grad_norm": 2.563429117202759, + "learning_rate": 7.758937402854317e-05, + "loss": 2.0541, + "step": 26000 + }, + { + "epoch": 0.612547689699025, + "grad_norm": 2.067661762237549, + "learning_rate": 7.754227309123452e-05, + "loss": 1.9734, + "step": 26010 + }, + { + "epoch": 0.6127831943855683, + "grad_norm": 2.019895315170288, + "learning_rate": 7.749517215392587e-05, + "loss": 2.2234, + "step": 26020 + }, + { + "epoch": 0.6130186990721115, + "grad_norm": 2.9683279991149902, + "learning_rate": 7.74480712166172e-05, + "loss": 1.9863, + "step": 26030 + }, + { + "epoch": 0.6132542037586548, + "grad_norm": 2.322253465652466, + "learning_rate": 7.740097027930856e-05, + "loss": 2.0671, + "step": 26040 + }, + { + "epoch": 0.613489708445198, + "grad_norm": 2.235363483428955, + "learning_rate": 7.73538693419999e-05, + "loss": 1.9467, + "step": 26050 + }, + { + "epoch": 0.6137252131317413, + "grad_norm": 2.408618927001953, + "learning_rate": 7.730676840469126e-05, + "loss": 2.1104, + "step": 26060 + }, + { + "epoch": 0.6139607178182845, + "grad_norm": 2.392561912536621, + "learning_rate": 7.725966746738261e-05, + "loss": 1.8468, + "step": 26070 + }, + { + "epoch": 0.6141962225048279, + "grad_norm": 2.707282781600952, + "learning_rate": 7.721256653007394e-05, + "loss": 2.0011, + "step": 26080 + }, + { + "epoch": 0.6144317271913711, + "grad_norm": 2.337557792663574, + "learning_rate": 7.71654655927653e-05, + "loss": 2.026, + "step": 26090 + }, + { + "epoch": 0.6146672318779144, + "grad_norm": 2.9671690464019775, + "learning_rate": 7.711836465545664e-05, + "loss": 1.965, + "step": 26100 + }, + { + "epoch": 0.6149027365644576, + "grad_norm": 1.8119666576385498, + "learning_rate": 7.7071263718148e-05, + "loss": 2.0404, + "step": 26110 + }, + { + "epoch": 0.6151382412510009, + "grad_norm": 2.940488576889038, + "learning_rate": 7.702416278083933e-05, + "loss": 1.7759, + "step": 26120 + }, + { + "epoch": 0.6153737459375441, + "grad_norm": 2.25976300239563, + "learning_rate": 7.697706184353068e-05, + "loss": 1.7815, + "step": 26130 + }, + { + "epoch": 0.6156092506240874, + "grad_norm": 2.183560371398926, + "learning_rate": 7.692996090622205e-05, + "loss": 1.8272, + "step": 26140 + }, + { + "epoch": 0.6158447553106307, + "grad_norm": 2.320969820022583, + "learning_rate": 7.68828599689134e-05, + "loss": 2.1829, + "step": 26150 + }, + { + "epoch": 0.616080259997174, + "grad_norm": 2.730713129043579, + "learning_rate": 7.683575903160473e-05, + "loss": 1.908, + "step": 26160 + }, + { + "epoch": 0.6163157646837172, + "grad_norm": 2.4461238384246826, + "learning_rate": 7.678865809429608e-05, + "loss": 2.1768, + "step": 26170 + }, + { + "epoch": 0.6165512693702605, + "grad_norm": 2.157590866088867, + "learning_rate": 7.674155715698743e-05, + "loss": 2.1194, + "step": 26180 + }, + { + "epoch": 0.6167867740568037, + "grad_norm": 2.2713019847869873, + "learning_rate": 7.669445621967878e-05, + "loss": 2.0888, + "step": 26190 + }, + { + "epoch": 0.617022278743347, + "grad_norm": 2.798720598220825, + "learning_rate": 7.664735528237012e-05, + "loss": 2.0496, + "step": 26200 + }, + { + "epoch": 0.6172577834298902, + "grad_norm": 2.4818081855773926, + "learning_rate": 7.660025434506147e-05, + "loss": 2.2425, + "step": 26210 + }, + { + "epoch": 0.6174932881164336, + "grad_norm": 2.5907058715820312, + "learning_rate": 7.655315340775282e-05, + "loss": 1.9915, + "step": 26220 + }, + { + "epoch": 0.6177287928029768, + "grad_norm": 3.659034490585327, + "learning_rate": 7.650605247044417e-05, + "loss": 1.9745, + "step": 26230 + }, + { + "epoch": 0.6179642974895201, + "grad_norm": 1.8911350965499878, + "learning_rate": 7.645895153313552e-05, + "loss": 2.02, + "step": 26240 + }, + { + "epoch": 0.6181998021760633, + "grad_norm": 2.086848020553589, + "learning_rate": 7.641185059582686e-05, + "loss": 2.0886, + "step": 26250 + }, + { + "epoch": 0.6184353068626066, + "grad_norm": 2.335862636566162, + "learning_rate": 7.636474965851821e-05, + "loss": 2.13, + "step": 26260 + }, + { + "epoch": 0.6186708115491498, + "grad_norm": 3.3991475105285645, + "learning_rate": 7.631764872120956e-05, + "loss": 2.0224, + "step": 26270 + }, + { + "epoch": 0.6189063162356931, + "grad_norm": 2.554853677749634, + "learning_rate": 7.627054778390091e-05, + "loss": 2.0263, + "step": 26280 + }, + { + "epoch": 0.6191418209222364, + "grad_norm": 1.8681471347808838, + "learning_rate": 7.622344684659224e-05, + "loss": 1.8256, + "step": 26290 + }, + { + "epoch": 0.6193773256087796, + "grad_norm": 2.5911004543304443, + "learning_rate": 7.61763459092836e-05, + "loss": 2.1467, + "step": 26300 + }, + { + "epoch": 0.6196128302953229, + "grad_norm": 2.3044159412384033, + "learning_rate": 7.612924497197495e-05, + "loss": 1.8694, + "step": 26310 + }, + { + "epoch": 0.6198483349818661, + "grad_norm": 2.6165413856506348, + "learning_rate": 7.60821440346663e-05, + "loss": 1.9561, + "step": 26320 + }, + { + "epoch": 0.6200838396684094, + "grad_norm": 2.0098507404327393, + "learning_rate": 7.603504309735763e-05, + "loss": 2.0378, + "step": 26330 + }, + { + "epoch": 0.6203193443549526, + "grad_norm": 3.309293508529663, + "learning_rate": 7.598794216004898e-05, + "loss": 1.8842, + "step": 26340 + }, + { + "epoch": 0.6205548490414959, + "grad_norm": 2.2279319763183594, + "learning_rate": 7.594084122274033e-05, + "loss": 2.1463, + "step": 26350 + }, + { + "epoch": 0.6207903537280391, + "grad_norm": 1.7420648336410522, + "learning_rate": 7.589374028543168e-05, + "loss": 2.033, + "step": 26360 + }, + { + "epoch": 0.6210258584145825, + "grad_norm": 2.068329334259033, + "learning_rate": 7.584663934812303e-05, + "loss": 2.0667, + "step": 26370 + }, + { + "epoch": 0.6212613631011257, + "grad_norm": 2.148442506790161, + "learning_rate": 7.579953841081437e-05, + "loss": 2.0649, + "step": 26380 + }, + { + "epoch": 0.621496867787669, + "grad_norm": 2.486476421356201, + "learning_rate": 7.575243747350572e-05, + "loss": 2.0631, + "step": 26390 + }, + { + "epoch": 0.6217323724742122, + "grad_norm": 2.4887843132019043, + "learning_rate": 7.570533653619707e-05, + "loss": 1.7455, + "step": 26400 + }, + { + "epoch": 0.6219678771607555, + "grad_norm": 1.8628402948379517, + "learning_rate": 7.565823559888842e-05, + "loss": 2.0665, + "step": 26410 + }, + { + "epoch": 0.6222033818472987, + "grad_norm": 1.9637489318847656, + "learning_rate": 7.561113466157977e-05, + "loss": 1.9645, + "step": 26420 + }, + { + "epoch": 0.622438886533842, + "grad_norm": 2.734393835067749, + "learning_rate": 7.556403372427112e-05, + "loss": 2.0079, + "step": 26430 + }, + { + "epoch": 0.6226743912203853, + "grad_norm": 2.7208571434020996, + "learning_rate": 7.551693278696247e-05, + "loss": 2.0066, + "step": 26440 + }, + { + "epoch": 0.6229098959069286, + "grad_norm": 2.311509132385254, + "learning_rate": 7.546983184965382e-05, + "loss": 2.0782, + "step": 26450 + }, + { + "epoch": 0.6231454005934718, + "grad_norm": 2.43361759185791, + "learning_rate": 7.542273091234516e-05, + "loss": 2.1844, + "step": 26460 + }, + { + "epoch": 0.6233809052800151, + "grad_norm": 2.6297848224639893, + "learning_rate": 7.537562997503651e-05, + "loss": 1.9422, + "step": 26470 + }, + { + "epoch": 0.6236164099665583, + "grad_norm": 1.997184157371521, + "learning_rate": 7.532852903772786e-05, + "loss": 2.1683, + "step": 26480 + }, + { + "epoch": 0.6238519146531016, + "grad_norm": 1.8917272090911865, + "learning_rate": 7.528142810041921e-05, + "loss": 1.9385, + "step": 26490 + }, + { + "epoch": 0.6240874193396448, + "grad_norm": 2.0081610679626465, + "learning_rate": 7.523432716311055e-05, + "loss": 1.9071, + "step": 26500 + }, + { + "epoch": 0.6243229240261882, + "grad_norm": 2.179267644882202, + "learning_rate": 7.51872262258019e-05, + "loss": 2.1131, + "step": 26510 + }, + { + "epoch": 0.6245584287127314, + "grad_norm": 1.9860879182815552, + "learning_rate": 7.514012528849325e-05, + "loss": 2.3245, + "step": 26520 + }, + { + "epoch": 0.6247939333992747, + "grad_norm": 1.9527974128723145, + "learning_rate": 7.50930243511846e-05, + "loss": 1.9865, + "step": 26530 + }, + { + "epoch": 0.6250294380858179, + "grad_norm": 2.507073402404785, + "learning_rate": 7.504592341387595e-05, + "loss": 1.8888, + "step": 26540 + }, + { + "epoch": 0.6252649427723612, + "grad_norm": 2.8235909938812256, + "learning_rate": 7.499882247656728e-05, + "loss": 2.0307, + "step": 26550 + }, + { + "epoch": 0.6255004474589044, + "grad_norm": 3.5919525623321533, + "learning_rate": 7.495172153925863e-05, + "loss": 2.1907, + "step": 26560 + }, + { + "epoch": 0.6257359521454477, + "grad_norm": 1.6083260774612427, + "learning_rate": 7.490462060194998e-05, + "loss": 1.8715, + "step": 26570 + }, + { + "epoch": 0.625971456831991, + "grad_norm": 2.9224612712860107, + "learning_rate": 7.485751966464133e-05, + "loss": 2.1417, + "step": 26580 + }, + { + "epoch": 0.6262069615185342, + "grad_norm": 2.0312814712524414, + "learning_rate": 7.481041872733267e-05, + "loss": 1.9067, + "step": 26590 + }, + { + "epoch": 0.6264424662050775, + "grad_norm": 2.3932294845581055, + "learning_rate": 7.476331779002402e-05, + "loss": 2.0652, + "step": 26600 + }, + { + "epoch": 0.6266779708916207, + "grad_norm": 2.058734893798828, + "learning_rate": 7.471621685271537e-05, + "loss": 2.1212, + "step": 26610 + }, + { + "epoch": 0.626913475578164, + "grad_norm": 2.9603049755096436, + "learning_rate": 7.466911591540672e-05, + "loss": 1.917, + "step": 26620 + }, + { + "epoch": 0.6271489802647072, + "grad_norm": 2.741225004196167, + "learning_rate": 7.462201497809806e-05, + "loss": 2.095, + "step": 26630 + }, + { + "epoch": 0.6273844849512505, + "grad_norm": 2.8252007961273193, + "learning_rate": 7.457491404078941e-05, + "loss": 2.045, + "step": 26640 + }, + { + "epoch": 0.6276199896377938, + "grad_norm": 2.179786443710327, + "learning_rate": 7.452781310348076e-05, + "loss": 2.0414, + "step": 26650 + }, + { + "epoch": 0.6278554943243371, + "grad_norm": 2.8023617267608643, + "learning_rate": 7.448071216617211e-05, + "loss": 2.2393, + "step": 26660 + }, + { + "epoch": 0.6280909990108803, + "grad_norm": 1.851933240890503, + "learning_rate": 7.443361122886346e-05, + "loss": 2.1044, + "step": 26670 + }, + { + "epoch": 0.6283265036974236, + "grad_norm": 2.2054028511047363, + "learning_rate": 7.43865102915548e-05, + "loss": 2.328, + "step": 26680 + }, + { + "epoch": 0.6285620083839668, + "grad_norm": 2.3730199337005615, + "learning_rate": 7.433940935424615e-05, + "loss": 1.9667, + "step": 26690 + }, + { + "epoch": 0.6287975130705101, + "grad_norm": 2.3404037952423096, + "learning_rate": 7.429230841693751e-05, + "loss": 2.0277, + "step": 26700 + }, + { + "epoch": 0.6290330177570533, + "grad_norm": 2.6131813526153564, + "learning_rate": 7.424520747962885e-05, + "loss": 2.0922, + "step": 26710 + }, + { + "epoch": 0.6292685224435967, + "grad_norm": 1.9105520248413086, + "learning_rate": 7.41981065423202e-05, + "loss": 2.2015, + "step": 26720 + }, + { + "epoch": 0.6295040271301399, + "grad_norm": 2.223731517791748, + "learning_rate": 7.415100560501155e-05, + "loss": 1.8634, + "step": 26730 + }, + { + "epoch": 0.6297395318166832, + "grad_norm": 2.680253028869629, + "learning_rate": 7.41039046677029e-05, + "loss": 2.0635, + "step": 26740 + }, + { + "epoch": 0.6299750365032264, + "grad_norm": 3.308995008468628, + "learning_rate": 7.405680373039425e-05, + "loss": 1.8926, + "step": 26750 + }, + { + "epoch": 0.6302105411897697, + "grad_norm": 2.192030668258667, + "learning_rate": 7.400970279308558e-05, + "loss": 1.7986, + "step": 26760 + }, + { + "epoch": 0.6304460458763129, + "grad_norm": 2.320146322250366, + "learning_rate": 7.396260185577693e-05, + "loss": 2.1723, + "step": 26770 + }, + { + "epoch": 0.6306815505628562, + "grad_norm": 2.5888264179229736, + "learning_rate": 7.391550091846828e-05, + "loss": 1.9864, + "step": 26780 + }, + { + "epoch": 0.6309170552493994, + "grad_norm": 2.5209715366363525, + "learning_rate": 7.386839998115963e-05, + "loss": 2.0133, + "step": 26790 + }, + { + "epoch": 0.6311525599359428, + "grad_norm": 2.5114974975585938, + "learning_rate": 7.382129904385097e-05, + "loss": 2.0248, + "step": 26800 + }, + { + "epoch": 0.631388064622486, + "grad_norm": 1.4139314889907837, + "learning_rate": 7.377419810654232e-05, + "loss": 1.7762, + "step": 26810 + }, + { + "epoch": 0.6316235693090293, + "grad_norm": 2.1081104278564453, + "learning_rate": 7.372709716923367e-05, + "loss": 1.9416, + "step": 26820 + }, + { + "epoch": 0.6318590739955725, + "grad_norm": 2.1463840007781982, + "learning_rate": 7.367999623192502e-05, + "loss": 1.8741, + "step": 26830 + }, + { + "epoch": 0.6320945786821158, + "grad_norm": 2.403310775756836, + "learning_rate": 7.363289529461637e-05, + "loss": 1.974, + "step": 26840 + }, + { + "epoch": 0.632330083368659, + "grad_norm": 2.203840732574463, + "learning_rate": 7.358579435730771e-05, + "loss": 1.9, + "step": 26850 + }, + { + "epoch": 0.6325655880552022, + "grad_norm": 2.0691256523132324, + "learning_rate": 7.353869341999906e-05, + "loss": 1.9828, + "step": 26860 + }, + { + "epoch": 0.6328010927417456, + "grad_norm": 1.5303398370742798, + "learning_rate": 7.349159248269041e-05, + "loss": 1.8315, + "step": 26870 + }, + { + "epoch": 0.6330365974282888, + "grad_norm": 2.035423994064331, + "learning_rate": 7.344449154538176e-05, + "loss": 2.2133, + "step": 26880 + }, + { + "epoch": 0.6332721021148321, + "grad_norm": 2.0543622970581055, + "learning_rate": 7.33973906080731e-05, + "loss": 1.8439, + "step": 26890 + }, + { + "epoch": 0.6335076068013753, + "grad_norm": 2.3338544368743896, + "learning_rate": 7.335028967076445e-05, + "loss": 2.1695, + "step": 26900 + }, + { + "epoch": 0.6337431114879186, + "grad_norm": 2.5324740409851074, + "learning_rate": 7.33031887334558e-05, + "loss": 2.1254, + "step": 26910 + }, + { + "epoch": 0.6339786161744618, + "grad_norm": 2.218935489654541, + "learning_rate": 7.325608779614715e-05, + "loss": 2.0474, + "step": 26920 + }, + { + "epoch": 0.6342141208610051, + "grad_norm": 2.561553478240967, + "learning_rate": 7.320898685883848e-05, + "loss": 2.3904, + "step": 26930 + }, + { + "epoch": 0.6344496255475484, + "grad_norm": 2.866525411605835, + "learning_rate": 7.316188592152983e-05, + "loss": 2.0126, + "step": 26940 + }, + { + "epoch": 0.6346851302340917, + "grad_norm": 1.68939208984375, + "learning_rate": 7.311478498422118e-05, + "loss": 2.0345, + "step": 26950 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 2.2209982872009277, + "learning_rate": 7.306768404691253e-05, + "loss": 2.2129, + "step": 26960 + }, + { + "epoch": 0.6351561396071782, + "grad_norm": 2.152632713317871, + "learning_rate": 7.302058310960389e-05, + "loss": 2.0254, + "step": 26970 + }, + { + "epoch": 0.6353916442937214, + "grad_norm": 2.0267491340637207, + "learning_rate": 7.297348217229522e-05, + "loss": 2.1108, + "step": 26980 + }, + { + "epoch": 0.6356271489802647, + "grad_norm": 1.9867489337921143, + "learning_rate": 7.292638123498659e-05, + "loss": 2.1763, + "step": 26990 + }, + { + "epoch": 0.6358626536668079, + "grad_norm": 3.316333532333374, + "learning_rate": 7.287928029767794e-05, + "loss": 2.1017, + "step": 27000 + }, + { + "epoch": 0.6360981583533513, + "grad_norm": 1.8042023181915283, + "learning_rate": 7.283217936036927e-05, + "loss": 2.1005, + "step": 27010 + }, + { + "epoch": 0.6363336630398945, + "grad_norm": 2.227996587753296, + "learning_rate": 7.278507842306062e-05, + "loss": 2.1378, + "step": 27020 + }, + { + "epoch": 0.6365691677264378, + "grad_norm": 2.519883155822754, + "learning_rate": 7.273797748575197e-05, + "loss": 1.8913, + "step": 27030 + }, + { + "epoch": 0.636804672412981, + "grad_norm": 1.6812520027160645, + "learning_rate": 7.269087654844332e-05, + "loss": 1.8019, + "step": 27040 + }, + { + "epoch": 0.6370401770995243, + "grad_norm": 3.9242050647735596, + "learning_rate": 7.264377561113467e-05, + "loss": 2.3015, + "step": 27050 + }, + { + "epoch": 0.6372756817860675, + "grad_norm": 1.8133604526519775, + "learning_rate": 7.259667467382601e-05, + "loss": 1.9485, + "step": 27060 + }, + { + "epoch": 0.6375111864726108, + "grad_norm": 2.5527639389038086, + "learning_rate": 7.254957373651736e-05, + "loss": 2.0823, + "step": 27070 + }, + { + "epoch": 0.637746691159154, + "grad_norm": 2.00186824798584, + "learning_rate": 7.250247279920871e-05, + "loss": 2.0701, + "step": 27080 + }, + { + "epoch": 0.6379821958456974, + "grad_norm": 2.2295942306518555, + "learning_rate": 7.245537186190006e-05, + "loss": 2.0213, + "step": 27090 + }, + { + "epoch": 0.6382177005322406, + "grad_norm": 2.3956990242004395, + "learning_rate": 7.24082709245914e-05, + "loss": 1.8728, + "step": 27100 + }, + { + "epoch": 0.6384532052187839, + "grad_norm": 2.1132164001464844, + "learning_rate": 7.236116998728275e-05, + "loss": 1.8988, + "step": 27110 + }, + { + "epoch": 0.6386887099053271, + "grad_norm": 2.5249156951904297, + "learning_rate": 7.23140690499741e-05, + "loss": 2.051, + "step": 27120 + }, + { + "epoch": 0.6389242145918704, + "grad_norm": 2.1515731811523438, + "learning_rate": 7.226696811266545e-05, + "loss": 1.8113, + "step": 27130 + }, + { + "epoch": 0.6391597192784136, + "grad_norm": 1.9574979543685913, + "learning_rate": 7.22198671753568e-05, + "loss": 2.2399, + "step": 27140 + }, + { + "epoch": 0.6393952239649568, + "grad_norm": 2.0978519916534424, + "learning_rate": 7.217276623804814e-05, + "loss": 2.333, + "step": 27150 + }, + { + "epoch": 0.6396307286515002, + "grad_norm": 1.8691319227218628, + "learning_rate": 7.212566530073949e-05, + "loss": 2.0774, + "step": 27160 + }, + { + "epoch": 0.6398662333380434, + "grad_norm": 1.9516493082046509, + "learning_rate": 7.207856436343084e-05, + "loss": 2.2324, + "step": 27170 + }, + { + "epoch": 0.6401017380245867, + "grad_norm": 1.61760675907135, + "learning_rate": 7.203146342612219e-05, + "loss": 2.127, + "step": 27180 + }, + { + "epoch": 0.6403372427111299, + "grad_norm": 1.9777883291244507, + "learning_rate": 7.198436248881352e-05, + "loss": 2.0067, + "step": 27190 + }, + { + "epoch": 0.6405727473976732, + "grad_norm": 2.270724058151245, + "learning_rate": 7.193726155150487e-05, + "loss": 1.8146, + "step": 27200 + }, + { + "epoch": 0.6408082520842164, + "grad_norm": 2.2753586769104004, + "learning_rate": 7.189016061419622e-05, + "loss": 2.1406, + "step": 27210 + }, + { + "epoch": 0.6410437567707598, + "grad_norm": 2.114849328994751, + "learning_rate": 7.184305967688757e-05, + "loss": 1.9977, + "step": 27220 + }, + { + "epoch": 0.641279261457303, + "grad_norm": 2.480898857116699, + "learning_rate": 7.179595873957891e-05, + "loss": 1.8545, + "step": 27230 + }, + { + "epoch": 0.6415147661438463, + "grad_norm": 2.312798500061035, + "learning_rate": 7.174885780227026e-05, + "loss": 2.0963, + "step": 27240 + }, + { + "epoch": 0.6417502708303895, + "grad_norm": 2.0719785690307617, + "learning_rate": 7.170175686496161e-05, + "loss": 2.0581, + "step": 27250 + }, + { + "epoch": 0.6419857755169328, + "grad_norm": 2.2538905143737793, + "learning_rate": 7.165465592765296e-05, + "loss": 1.9055, + "step": 27260 + }, + { + "epoch": 0.642221280203476, + "grad_norm": 2.6148452758789062, + "learning_rate": 7.160755499034431e-05, + "loss": 2.2561, + "step": 27270 + }, + { + "epoch": 0.6424567848900193, + "grad_norm": 1.9259710311889648, + "learning_rate": 7.156045405303566e-05, + "loss": 2.2007, + "step": 27280 + }, + { + "epoch": 0.6426922895765625, + "grad_norm": 1.789009690284729, + "learning_rate": 7.151335311572701e-05, + "loss": 1.9831, + "step": 27290 + }, + { + "epoch": 0.6429277942631059, + "grad_norm": 3.458051919937134, + "learning_rate": 7.146625217841836e-05, + "loss": 2.0307, + "step": 27300 + }, + { + "epoch": 0.6431632989496491, + "grad_norm": 2.757291078567505, + "learning_rate": 7.14191512411097e-05, + "loss": 2.1219, + "step": 27310 + }, + { + "epoch": 0.6433988036361924, + "grad_norm": 1.8680845499038696, + "learning_rate": 7.137205030380105e-05, + "loss": 2.0296, + "step": 27320 + }, + { + "epoch": 0.6436343083227356, + "grad_norm": 2.32859468460083, + "learning_rate": 7.13249493664924e-05, + "loss": 2.1007, + "step": 27330 + }, + { + "epoch": 0.6438698130092789, + "grad_norm": 2.07627272605896, + "learning_rate": 7.127784842918375e-05, + "loss": 1.9358, + "step": 27340 + }, + { + "epoch": 0.6441053176958221, + "grad_norm": 2.0837743282318115, + "learning_rate": 7.12307474918751e-05, + "loss": 1.8441, + "step": 27350 + }, + { + "epoch": 0.6443408223823655, + "grad_norm": 2.417036771774292, + "learning_rate": 7.118364655456644e-05, + "loss": 1.9943, + "step": 27360 + }, + { + "epoch": 0.6445763270689087, + "grad_norm": 2.3873021602630615, + "learning_rate": 7.113654561725779e-05, + "loss": 2.0055, + "step": 27370 + }, + { + "epoch": 0.644811831755452, + "grad_norm": 1.9441145658493042, + "learning_rate": 7.108944467994914e-05, + "loss": 2.0131, + "step": 27380 + }, + { + "epoch": 0.6450473364419952, + "grad_norm": 2.0448830127716064, + "learning_rate": 7.104234374264049e-05, + "loss": 2.2472, + "step": 27390 + }, + { + "epoch": 0.6452828411285385, + "grad_norm": 2.079862594604492, + "learning_rate": 7.099524280533182e-05, + "loss": 1.8943, + "step": 27400 + }, + { + "epoch": 0.6455183458150817, + "grad_norm": 3.530986785888672, + "learning_rate": 7.094814186802317e-05, + "loss": 2.0522, + "step": 27410 + }, + { + "epoch": 0.645753850501625, + "grad_norm": 2.296212673187256, + "learning_rate": 7.090104093071452e-05, + "loss": 1.7455, + "step": 27420 + }, + { + "epoch": 0.6459893551881682, + "grad_norm": 2.3414154052734375, + "learning_rate": 7.085393999340587e-05, + "loss": 2.0542, + "step": 27430 + }, + { + "epoch": 0.6462248598747115, + "grad_norm": 2.2467596530914307, + "learning_rate": 7.080683905609722e-05, + "loss": 2.0408, + "step": 27440 + }, + { + "epoch": 0.6464603645612548, + "grad_norm": 1.901523232460022, + "learning_rate": 7.075973811878856e-05, + "loss": 2.0676, + "step": 27450 + }, + { + "epoch": 0.646695869247798, + "grad_norm": 2.3312385082244873, + "learning_rate": 7.071263718147991e-05, + "loss": 2.1138, + "step": 27460 + }, + { + "epoch": 0.6469313739343413, + "grad_norm": 2.5286500453948975, + "learning_rate": 7.066553624417126e-05, + "loss": 2.1407, + "step": 27470 + }, + { + "epoch": 0.6471668786208845, + "grad_norm": 1.9842777252197266, + "learning_rate": 7.062314540059347e-05, + "loss": 1.928, + "step": 27480 + }, + { + "epoch": 0.6474023833074278, + "grad_norm": 2.1205496788024902, + "learning_rate": 7.057604446328482e-05, + "loss": 2.0597, + "step": 27490 + }, + { + "epoch": 0.647637887993971, + "grad_norm": 2.428333044052124, + "learning_rate": 7.052894352597617e-05, + "loss": 2.0553, + "step": 27500 + }, + { + "epoch": 0.6478733926805144, + "grad_norm": 2.098172664642334, + "learning_rate": 7.048184258866752e-05, + "loss": 2.135, + "step": 27510 + }, + { + "epoch": 0.6481088973670576, + "grad_norm": 2.455638885498047, + "learning_rate": 7.043474165135887e-05, + "loss": 2.1021, + "step": 27520 + }, + { + "epoch": 0.6483444020536009, + "grad_norm": 1.9910706281661987, + "learning_rate": 7.038764071405022e-05, + "loss": 2.0287, + "step": 27530 + }, + { + "epoch": 0.6485799067401441, + "grad_norm": 1.8840402364730835, + "learning_rate": 7.034053977674157e-05, + "loss": 2.1737, + "step": 27540 + }, + { + "epoch": 0.6488154114266874, + "grad_norm": 2.0080809593200684, + "learning_rate": 7.029343883943292e-05, + "loss": 1.8489, + "step": 27550 + }, + { + "epoch": 0.6490509161132306, + "grad_norm": 2.091917037963867, + "learning_rate": 7.024633790212425e-05, + "loss": 2.0131, + "step": 27560 + }, + { + "epoch": 0.649286420799774, + "grad_norm": 1.8030489683151245, + "learning_rate": 7.01992369648156e-05, + "loss": 2.0919, + "step": 27570 + }, + { + "epoch": 0.6495219254863172, + "grad_norm": 1.896256923675537, + "learning_rate": 7.015213602750696e-05, + "loss": 2.0142, + "step": 27580 + }, + { + "epoch": 0.6497574301728605, + "grad_norm": 1.7844651937484741, + "learning_rate": 7.01050350901983e-05, + "loss": 2.2077, + "step": 27590 + }, + { + "epoch": 0.6499929348594037, + "grad_norm": 2.182560920715332, + "learning_rate": 7.005793415288964e-05, + "loss": 1.7973, + "step": 27600 + }, + { + "epoch": 0.650228439545947, + "grad_norm": 2.0249404907226562, + "learning_rate": 7.001083321558099e-05, + "loss": 2.1121, + "step": 27610 + }, + { + "epoch": 0.6504639442324902, + "grad_norm": 2.052584409713745, + "learning_rate": 6.996373227827234e-05, + "loss": 2.2913, + "step": 27620 + }, + { + "epoch": 0.6506994489190335, + "grad_norm": 1.8792239427566528, + "learning_rate": 6.991663134096369e-05, + "loss": 2.059, + "step": 27630 + }, + { + "epoch": 0.6509349536055767, + "grad_norm": 2.087036371231079, + "learning_rate": 6.986953040365504e-05, + "loss": 2.0064, + "step": 27640 + }, + { + "epoch": 0.6511704582921201, + "grad_norm": 2.4446024894714355, + "learning_rate": 6.982242946634638e-05, + "loss": 2.0678, + "step": 27650 + }, + { + "epoch": 0.6514059629786633, + "grad_norm": 2.256411552429199, + "learning_rate": 6.977532852903773e-05, + "loss": 1.9454, + "step": 27660 + }, + { + "epoch": 0.6516414676652066, + "grad_norm": 2.468778610229492, + "learning_rate": 6.972822759172908e-05, + "loss": 2.1624, + "step": 27670 + }, + { + "epoch": 0.6518769723517498, + "grad_norm": 2.614762544631958, + "learning_rate": 6.968112665442043e-05, + "loss": 2.1327, + "step": 27680 + }, + { + "epoch": 0.6521124770382931, + "grad_norm": 1.6540695428848267, + "learning_rate": 6.963402571711177e-05, + "loss": 1.7609, + "step": 27690 + }, + { + "epoch": 0.6523479817248363, + "grad_norm": 1.6981443166732788, + "learning_rate": 6.958692477980312e-05, + "loss": 1.9458, + "step": 27700 + }, + { + "epoch": 0.6525834864113795, + "grad_norm": 2.1197662353515625, + "learning_rate": 6.953982384249447e-05, + "loss": 2.1603, + "step": 27710 + }, + { + "epoch": 0.6528189910979229, + "grad_norm": 2.753307580947876, + "learning_rate": 6.949272290518582e-05, + "loss": 2.192, + "step": 27720 + }, + { + "epoch": 0.6530544957844661, + "grad_norm": 1.7629789113998413, + "learning_rate": 6.944562196787717e-05, + "loss": 1.9333, + "step": 27730 + }, + { + "epoch": 0.6532900004710094, + "grad_norm": 2.513230323791504, + "learning_rate": 6.93985210305685e-05, + "loss": 2.0639, + "step": 27740 + }, + { + "epoch": 0.6535255051575526, + "grad_norm": 1.9997378587722778, + "learning_rate": 6.935142009325986e-05, + "loss": 1.8298, + "step": 27750 + }, + { + "epoch": 0.6537610098440959, + "grad_norm": 2.183000087738037, + "learning_rate": 6.93043191559512e-05, + "loss": 1.8049, + "step": 27760 + }, + { + "epoch": 0.6539965145306391, + "grad_norm": 2.316758155822754, + "learning_rate": 6.925721821864256e-05, + "loss": 2.2343, + "step": 27770 + }, + { + "epoch": 0.6542320192171824, + "grad_norm": 2.8688292503356934, + "learning_rate": 6.921011728133389e-05, + "loss": 2.0771, + "step": 27780 + }, + { + "epoch": 0.6544675239037256, + "grad_norm": 2.846386671066284, + "learning_rate": 6.916301634402524e-05, + "loss": 1.9481, + "step": 27790 + }, + { + "epoch": 0.654703028590269, + "grad_norm": 2.241668462753296, + "learning_rate": 6.911591540671659e-05, + "loss": 2.0466, + "step": 27800 + }, + { + "epoch": 0.6549385332768122, + "grad_norm": 1.862688422203064, + "learning_rate": 6.906881446940796e-05, + "loss": 2.0278, + "step": 27810 + }, + { + "epoch": 0.6551740379633555, + "grad_norm": 2.669524908065796, + "learning_rate": 6.902171353209929e-05, + "loss": 1.9347, + "step": 27820 + }, + { + "epoch": 0.6554095426498987, + "grad_norm": 1.9434707164764404, + "learning_rate": 6.897461259479064e-05, + "loss": 1.9777, + "step": 27830 + }, + { + "epoch": 0.655645047336442, + "grad_norm": 1.9469919204711914, + "learning_rate": 6.8927511657482e-05, + "loss": 2.1351, + "step": 27840 + }, + { + "epoch": 0.6558805520229852, + "grad_norm": 2.0823862552642822, + "learning_rate": 6.888041072017334e-05, + "loss": 2.1103, + "step": 27850 + }, + { + "epoch": 0.6561160567095286, + "grad_norm": 2.011646270751953, + "learning_rate": 6.883330978286468e-05, + "loss": 2.0807, + "step": 27860 + }, + { + "epoch": 0.6563515613960718, + "grad_norm": 2.085239887237549, + "learning_rate": 6.878620884555603e-05, + "loss": 2.0131, + "step": 27870 + }, + { + "epoch": 0.6565870660826151, + "grad_norm": 2.445866346359253, + "learning_rate": 6.873910790824738e-05, + "loss": 1.9386, + "step": 27880 + }, + { + "epoch": 0.6568225707691583, + "grad_norm": 2.3337929248809814, + "learning_rate": 6.869200697093873e-05, + "loss": 1.9866, + "step": 27890 + }, + { + "epoch": 0.6570580754557016, + "grad_norm": 2.558706045150757, + "learning_rate": 6.864490603363007e-05, + "loss": 1.9361, + "step": 27900 + }, + { + "epoch": 0.6572935801422448, + "grad_norm": 1.7763642072677612, + "learning_rate": 6.859780509632142e-05, + "loss": 2.147, + "step": 27910 + }, + { + "epoch": 0.6575290848287881, + "grad_norm": 2.353127956390381, + "learning_rate": 6.855070415901277e-05, + "loss": 1.972, + "step": 27920 + }, + { + "epoch": 0.6577645895153313, + "grad_norm": 2.1729345321655273, + "learning_rate": 6.850360322170412e-05, + "loss": 1.9824, + "step": 27930 + }, + { + "epoch": 0.6580000942018747, + "grad_norm": 2.3141558170318604, + "learning_rate": 6.845650228439547e-05, + "loss": 1.8974, + "step": 27940 + }, + { + "epoch": 0.6582355988884179, + "grad_norm": 2.9359517097473145, + "learning_rate": 6.84094013470868e-05, + "loss": 2.0506, + "step": 27950 + }, + { + "epoch": 0.6584711035749612, + "grad_norm": 2.285006046295166, + "learning_rate": 6.836230040977816e-05, + "loss": 2.1571, + "step": 27960 + }, + { + "epoch": 0.6587066082615044, + "grad_norm": 2.683715581893921, + "learning_rate": 6.83151994724695e-05, + "loss": 2.3031, + "step": 27970 + }, + { + "epoch": 0.6589421129480477, + "grad_norm": 2.042071580886841, + "learning_rate": 6.826809853516086e-05, + "loss": 1.9662, + "step": 27980 + }, + { + "epoch": 0.6591776176345909, + "grad_norm": 2.244554042816162, + "learning_rate": 6.822099759785219e-05, + "loss": 2.1763, + "step": 27990 + }, + { + "epoch": 0.6594131223211341, + "grad_norm": 2.151461124420166, + "learning_rate": 6.817389666054354e-05, + "loss": 1.8774, + "step": 28000 + }, + { + "epoch": 0.6596486270076775, + "grad_norm": 2.07513427734375, + "learning_rate": 6.81267957232349e-05, + "loss": 1.9599, + "step": 28010 + }, + { + "epoch": 0.6598841316942207, + "grad_norm": 1.9893085956573486, + "learning_rate": 6.807969478592624e-05, + "loss": 2.1541, + "step": 28020 + }, + { + "epoch": 0.660119636380764, + "grad_norm": 3.486293315887451, + "learning_rate": 6.80325938486176e-05, + "loss": 2.1084, + "step": 28030 + }, + { + "epoch": 0.6603551410673072, + "grad_norm": 2.266904830932617, + "learning_rate": 6.798549291130893e-05, + "loss": 1.9884, + "step": 28040 + }, + { + "epoch": 0.6605906457538505, + "grad_norm": 2.0467066764831543, + "learning_rate": 6.793839197400028e-05, + "loss": 2.172, + "step": 28050 + }, + { + "epoch": 0.6608261504403937, + "grad_norm": 2.1317572593688965, + "learning_rate": 6.789129103669163e-05, + "loss": 2.1142, + "step": 28060 + }, + { + "epoch": 0.661061655126937, + "grad_norm": 2.3432118892669678, + "learning_rate": 6.784419009938298e-05, + "loss": 1.9512, + "step": 28070 + }, + { + "epoch": 0.6612971598134803, + "grad_norm": 2.4152426719665527, + "learning_rate": 6.779708916207432e-05, + "loss": 2.0989, + "step": 28080 + }, + { + "epoch": 0.6615326645000236, + "grad_norm": 1.8867883682250977, + "learning_rate": 6.774998822476568e-05, + "loss": 1.9836, + "step": 28090 + }, + { + "epoch": 0.6617681691865668, + "grad_norm": 2.182126998901367, + "learning_rate": 6.770288728745703e-05, + "loss": 1.8761, + "step": 28100 + }, + { + "epoch": 0.6620036738731101, + "grad_norm": 2.5316390991210938, + "learning_rate": 6.765578635014838e-05, + "loss": 1.9713, + "step": 28110 + }, + { + "epoch": 0.6622391785596533, + "grad_norm": 2.1727163791656494, + "learning_rate": 6.760868541283972e-05, + "loss": 1.8554, + "step": 28120 + }, + { + "epoch": 0.6624746832461966, + "grad_norm": 3.6611173152923584, + "learning_rate": 6.756158447553107e-05, + "loss": 1.9843, + "step": 28130 + }, + { + "epoch": 0.6627101879327398, + "grad_norm": 2.4131314754486084, + "learning_rate": 6.751448353822242e-05, + "loss": 2.0242, + "step": 28140 + }, + { + "epoch": 0.6629456926192832, + "grad_norm": 1.8160946369171143, + "learning_rate": 6.746738260091377e-05, + "loss": 2.0622, + "step": 28150 + }, + { + "epoch": 0.6631811973058264, + "grad_norm": 3.188413381576538, + "learning_rate": 6.74202816636051e-05, + "loss": 2.2114, + "step": 28160 + }, + { + "epoch": 0.6634167019923697, + "grad_norm": 2.4607508182525635, + "learning_rate": 6.737318072629646e-05, + "loss": 2.0314, + "step": 28170 + }, + { + "epoch": 0.6636522066789129, + "grad_norm": 2.2782578468322754, + "learning_rate": 6.732607978898781e-05, + "loss": 1.9546, + "step": 28180 + }, + { + "epoch": 0.6638877113654562, + "grad_norm": 1.997504472732544, + "learning_rate": 6.727897885167916e-05, + "loss": 2.0726, + "step": 28190 + }, + { + "epoch": 0.6641232160519994, + "grad_norm": 2.3978195190429688, + "learning_rate": 6.72318779143705e-05, + "loss": 2.1611, + "step": 28200 + }, + { + "epoch": 0.6643587207385427, + "grad_norm": 2.00547456741333, + "learning_rate": 6.718477697706184e-05, + "loss": 2.0716, + "step": 28210 + }, + { + "epoch": 0.664594225425086, + "grad_norm": 1.96976637840271, + "learning_rate": 6.71376760397532e-05, + "loss": 1.9797, + "step": 28220 + }, + { + "epoch": 0.6648297301116293, + "grad_norm": 2.197422981262207, + "learning_rate": 6.709057510244454e-05, + "loss": 1.9887, + "step": 28230 + }, + { + "epoch": 0.6650652347981725, + "grad_norm": 1.326515555381775, + "learning_rate": 6.70434741651359e-05, + "loss": 1.9457, + "step": 28240 + }, + { + "epoch": 0.6653007394847158, + "grad_norm": 1.9585696458816528, + "learning_rate": 6.699637322782723e-05, + "loss": 2.1483, + "step": 28250 + }, + { + "epoch": 0.665536244171259, + "grad_norm": 2.264357328414917, + "learning_rate": 6.694927229051858e-05, + "loss": 1.9455, + "step": 28260 + }, + { + "epoch": 0.6657717488578023, + "grad_norm": 2.1170806884765625, + "learning_rate": 6.690217135320993e-05, + "loss": 2.1229, + "step": 28270 + }, + { + "epoch": 0.6660072535443455, + "grad_norm": 2.3567802906036377, + "learning_rate": 6.685507041590128e-05, + "loss": 2.2026, + "step": 28280 + }, + { + "epoch": 0.6662427582308887, + "grad_norm": 2.09476637840271, + "learning_rate": 6.680796947859262e-05, + "loss": 1.9125, + "step": 28290 + }, + { + "epoch": 0.6664782629174321, + "grad_norm": 1.959568738937378, + "learning_rate": 6.676086854128397e-05, + "loss": 2.085, + "step": 28300 + }, + { + "epoch": 0.6667137676039753, + "grad_norm": 3.108013153076172, + "learning_rate": 6.671376760397532e-05, + "loss": 1.9585, + "step": 28310 + }, + { + "epoch": 0.6669492722905186, + "grad_norm": 2.3948521614074707, + "learning_rate": 6.666666666666667e-05, + "loss": 1.8998, + "step": 28320 + }, + { + "epoch": 0.6671847769770618, + "grad_norm": 1.9991596937179565, + "learning_rate": 6.6619565729358e-05, + "loss": 1.9824, + "step": 28330 + }, + { + "epoch": 0.6674202816636051, + "grad_norm": 1.759979486465454, + "learning_rate": 6.657246479204936e-05, + "loss": 1.9744, + "step": 28340 + }, + { + "epoch": 0.6676557863501483, + "grad_norm": 2.247377634048462, + "learning_rate": 6.652536385474071e-05, + "loss": 2.1226, + "step": 28350 + }, + { + "epoch": 0.6678912910366916, + "grad_norm": 3.3564887046813965, + "learning_rate": 6.647826291743206e-05, + "loss": 2.0988, + "step": 28360 + }, + { + "epoch": 0.6681267957232349, + "grad_norm": 2.1177890300750732, + "learning_rate": 6.643116198012341e-05, + "loss": 1.6984, + "step": 28370 + }, + { + "epoch": 0.6683623004097782, + "grad_norm": 2.21870756149292, + "learning_rate": 6.638406104281476e-05, + "loss": 2.1458, + "step": 28380 + }, + { + "epoch": 0.6685978050963214, + "grad_norm": 1.8025680780410767, + "learning_rate": 6.633696010550611e-05, + "loss": 1.9258, + "step": 28390 + }, + { + "epoch": 0.6688333097828647, + "grad_norm": 1.9146382808685303, + "learning_rate": 6.628985916819746e-05, + "loss": 1.879, + "step": 28400 + }, + { + "epoch": 0.6690688144694079, + "grad_norm": 2.1155905723571777, + "learning_rate": 6.62427582308888e-05, + "loss": 2.2079, + "step": 28410 + }, + { + "epoch": 0.6693043191559512, + "grad_norm": 2.081460475921631, + "learning_rate": 6.619565729358015e-05, + "loss": 2.0226, + "step": 28420 + }, + { + "epoch": 0.6695398238424944, + "grad_norm": 2.065375328063965, + "learning_rate": 6.61485563562715e-05, + "loss": 2.0476, + "step": 28430 + }, + { + "epoch": 0.6697753285290378, + "grad_norm": 1.6890816688537598, + "learning_rate": 6.610145541896285e-05, + "loss": 1.9391, + "step": 28440 + }, + { + "epoch": 0.670010833215581, + "grad_norm": 2.4403939247131348, + "learning_rate": 6.60543544816542e-05, + "loss": 1.8528, + "step": 28450 + }, + { + "epoch": 0.6702463379021243, + "grad_norm": 2.6802732944488525, + "learning_rate": 6.600725354434553e-05, + "loss": 2.1087, + "step": 28460 + }, + { + "epoch": 0.6704818425886675, + "grad_norm": 1.8135290145874023, + "learning_rate": 6.596015260703688e-05, + "loss": 2.1068, + "step": 28470 + }, + { + "epoch": 0.6707173472752108, + "grad_norm": 2.0007054805755615, + "learning_rate": 6.591305166972823e-05, + "loss": 2.113, + "step": 28480 + }, + { + "epoch": 0.670952851961754, + "grad_norm": 2.349604368209839, + "learning_rate": 6.586595073241958e-05, + "loss": 1.7037, + "step": 28490 + }, + { + "epoch": 0.6711883566482973, + "grad_norm": 2.2065269947052, + "learning_rate": 6.581884979511092e-05, + "loss": 2.3166, + "step": 28500 + }, + { + "epoch": 0.6714238613348406, + "grad_norm": 2.0659828186035156, + "learning_rate": 6.577174885780227e-05, + "loss": 2.0392, + "step": 28510 + }, + { + "epoch": 0.6716593660213839, + "grad_norm": 2.7772090435028076, + "learning_rate": 6.572464792049362e-05, + "loss": 1.9277, + "step": 28520 + }, + { + "epoch": 0.6718948707079271, + "grad_norm": 2.1206893920898438, + "learning_rate": 6.567754698318497e-05, + "loss": 1.8924, + "step": 28530 + }, + { + "epoch": 0.6721303753944704, + "grad_norm": 2.0614333152770996, + "learning_rate": 6.563044604587632e-05, + "loss": 2.1831, + "step": 28540 + }, + { + "epoch": 0.6723658800810136, + "grad_norm": 1.9979201555252075, + "learning_rate": 6.558334510856766e-05, + "loss": 1.9522, + "step": 28550 + }, + { + "epoch": 0.6726013847675568, + "grad_norm": 1.968994379043579, + "learning_rate": 6.553624417125901e-05, + "loss": 1.9722, + "step": 28560 + }, + { + "epoch": 0.6728368894541001, + "grad_norm": 1.783308982849121, + "learning_rate": 6.548914323395036e-05, + "loss": 2.1729, + "step": 28570 + }, + { + "epoch": 0.6730723941406433, + "grad_norm": 2.494913101196289, + "learning_rate": 6.544204229664171e-05, + "loss": 2.1268, + "step": 28580 + }, + { + "epoch": 0.6733078988271867, + "grad_norm": 2.0739006996154785, + "learning_rate": 6.539494135933305e-05, + "loss": 2.0764, + "step": 28590 + }, + { + "epoch": 0.6735434035137299, + "grad_norm": 3.6099002361297607, + "learning_rate": 6.53478404220244e-05, + "loss": 2.0012, + "step": 28600 + }, + { + "epoch": 0.6737789082002732, + "grad_norm": 2.3188111782073975, + "learning_rate": 6.530073948471575e-05, + "loss": 2.0252, + "step": 28610 + }, + { + "epoch": 0.6740144128868164, + "grad_norm": 1.7235386371612549, + "learning_rate": 6.525834864113795e-05, + "loss": 1.9303, + "step": 28620 + }, + { + "epoch": 0.6742499175733597, + "grad_norm": 1.7218267917633057, + "learning_rate": 6.521124770382931e-05, + "loss": 1.9484, + "step": 28630 + }, + { + "epoch": 0.6744854222599029, + "grad_norm": 2.311770439147949, + "learning_rate": 6.516414676652066e-05, + "loss": 2.0987, + "step": 28640 + }, + { + "epoch": 0.6747209269464463, + "grad_norm": 2.6366100311279297, + "learning_rate": 6.511704582921201e-05, + "loss": 1.9514, + "step": 28650 + }, + { + "epoch": 0.6749564316329895, + "grad_norm": 2.66902494430542, + "learning_rate": 6.506994489190335e-05, + "loss": 1.9368, + "step": 28660 + }, + { + "epoch": 0.6751919363195328, + "grad_norm": 2.354694366455078, + "learning_rate": 6.50228439545947e-05, + "loss": 1.9255, + "step": 28670 + }, + { + "epoch": 0.675427441006076, + "grad_norm": 2.0443005561828613, + "learning_rate": 6.497574301728605e-05, + "loss": 2.0929, + "step": 28680 + }, + { + "epoch": 0.6756629456926193, + "grad_norm": 1.8448538780212402, + "learning_rate": 6.49286420799774e-05, + "loss": 2.0847, + "step": 28690 + }, + { + "epoch": 0.6758984503791625, + "grad_norm": 2.754966974258423, + "learning_rate": 6.488154114266874e-05, + "loss": 2.1979, + "step": 28700 + }, + { + "epoch": 0.6761339550657058, + "grad_norm": 2.3212506771087646, + "learning_rate": 6.483444020536009e-05, + "loss": 2.062, + "step": 28710 + }, + { + "epoch": 0.676369459752249, + "grad_norm": 2.090217351913452, + "learning_rate": 6.478733926805144e-05, + "loss": 1.9911, + "step": 28720 + }, + { + "epoch": 0.6766049644387924, + "grad_norm": 1.9577314853668213, + "learning_rate": 6.474023833074279e-05, + "loss": 1.9376, + "step": 28730 + }, + { + "epoch": 0.6768404691253356, + "grad_norm": 1.8566160202026367, + "learning_rate": 6.469313739343414e-05, + "loss": 2.0811, + "step": 28740 + }, + { + "epoch": 0.6770759738118789, + "grad_norm": 2.3416032791137695, + "learning_rate": 6.464603645612548e-05, + "loss": 2.1295, + "step": 28750 + }, + { + "epoch": 0.6773114784984221, + "grad_norm": 1.6582307815551758, + "learning_rate": 6.459893551881683e-05, + "loss": 2.0106, + "step": 28760 + }, + { + "epoch": 0.6775469831849654, + "grad_norm": 2.144157886505127, + "learning_rate": 6.455183458150818e-05, + "loss": 1.9546, + "step": 28770 + }, + { + "epoch": 0.6777824878715086, + "grad_norm": 2.9796035289764404, + "learning_rate": 6.450473364419953e-05, + "loss": 1.9336, + "step": 28780 + }, + { + "epoch": 0.678017992558052, + "grad_norm": 2.0936977863311768, + "learning_rate": 6.445763270689086e-05, + "loss": 2.077, + "step": 28790 + }, + { + "epoch": 0.6782534972445952, + "grad_norm": 1.9568135738372803, + "learning_rate": 6.441053176958221e-05, + "loss": 2.1443, + "step": 28800 + }, + { + "epoch": 0.6784890019311385, + "grad_norm": 2.607478618621826, + "learning_rate": 6.436343083227356e-05, + "loss": 2.139, + "step": 28810 + }, + { + "epoch": 0.6787245066176817, + "grad_norm": 2.0148494243621826, + "learning_rate": 6.431632989496491e-05, + "loss": 1.7023, + "step": 28820 + }, + { + "epoch": 0.678960011304225, + "grad_norm": 2.429159164428711, + "learning_rate": 6.426922895765626e-05, + "loss": 1.9094, + "step": 28830 + }, + { + "epoch": 0.6791955159907682, + "grad_norm": 2.1411314010620117, + "learning_rate": 6.42221280203476e-05, + "loss": 2.1745, + "step": 28840 + }, + { + "epoch": 0.6794310206773114, + "grad_norm": 2.190335988998413, + "learning_rate": 6.417502708303895e-05, + "loss": 2.139, + "step": 28850 + }, + { + "epoch": 0.6796665253638547, + "grad_norm": 2.283425807952881, + "learning_rate": 6.41279261457303e-05, + "loss": 1.9698, + "step": 28860 + }, + { + "epoch": 0.679902030050398, + "grad_norm": 2.0648112297058105, + "learning_rate": 6.408082520842165e-05, + "loss": 2.0706, + "step": 28870 + }, + { + "epoch": 0.6801375347369413, + "grad_norm": 1.8722620010375977, + "learning_rate": 6.403372427111299e-05, + "loss": 2.0114, + "step": 28880 + }, + { + "epoch": 0.6803730394234845, + "grad_norm": 2.4573814868927, + "learning_rate": 6.398662333380434e-05, + "loss": 1.9978, + "step": 28890 + }, + { + "epoch": 0.6806085441100278, + "grad_norm": 2.1509108543395996, + "learning_rate": 6.393952239649569e-05, + "loss": 2.0784, + "step": 28900 + }, + { + "epoch": 0.680844048796571, + "grad_norm": 2.1195876598358154, + "learning_rate": 6.389242145918705e-05, + "loss": 2.1025, + "step": 28910 + }, + { + "epoch": 0.6810795534831143, + "grad_norm": 2.044261932373047, + "learning_rate": 6.384532052187839e-05, + "loss": 1.786, + "step": 28920 + }, + { + "epoch": 0.6813150581696575, + "grad_norm": 3.030416250228882, + "learning_rate": 6.379821958456974e-05, + "loss": 2.1121, + "step": 28930 + }, + { + "epoch": 0.6815505628562009, + "grad_norm": 2.920388698577881, + "learning_rate": 6.375111864726109e-05, + "loss": 2.0764, + "step": 28940 + }, + { + "epoch": 0.6817860675427441, + "grad_norm": 1.9454491138458252, + "learning_rate": 6.370401770995244e-05, + "loss": 2.0691, + "step": 28950 + }, + { + "epoch": 0.6820215722292874, + "grad_norm": 1.834728479385376, + "learning_rate": 6.365691677264378e-05, + "loss": 1.8553, + "step": 28960 + }, + { + "epoch": 0.6822570769158306, + "grad_norm": 2.1100876331329346, + "learning_rate": 6.360981583533513e-05, + "loss": 1.8956, + "step": 28970 + }, + { + "epoch": 0.6824925816023739, + "grad_norm": 1.9620522260665894, + "learning_rate": 6.356271489802648e-05, + "loss": 1.7718, + "step": 28980 + }, + { + "epoch": 0.6827280862889171, + "grad_norm": 2.126603126525879, + "learning_rate": 6.351561396071783e-05, + "loss": 1.9046, + "step": 28990 + }, + { + "epoch": 0.6829635909754604, + "grad_norm": 2.21046781539917, + "learning_rate": 6.346851302340916e-05, + "loss": 2.054, + "step": 29000 + }, + { + "epoch": 0.6831990956620037, + "grad_norm": 2.2389838695526123, + "learning_rate": 6.342141208610051e-05, + "loss": 2.0818, + "step": 29010 + }, + { + "epoch": 0.683434600348547, + "grad_norm": 2.375826358795166, + "learning_rate": 6.337431114879186e-05, + "loss": 1.9759, + "step": 29020 + }, + { + "epoch": 0.6836701050350902, + "grad_norm": 1.9584667682647705, + "learning_rate": 6.332721021148322e-05, + "loss": 2.0202, + "step": 29030 + }, + { + "epoch": 0.6839056097216335, + "grad_norm": 2.5491538047790527, + "learning_rate": 6.328010927417457e-05, + "loss": 2.0253, + "step": 29040 + }, + { + "epoch": 0.6841411144081767, + "grad_norm": 1.8970832824707031, + "learning_rate": 6.32330083368659e-05, + "loss": 1.842, + "step": 29050 + }, + { + "epoch": 0.68437661909472, + "grad_norm": 2.02691912651062, + "learning_rate": 6.318590739955725e-05, + "loss": 1.788, + "step": 29060 + }, + { + "epoch": 0.6846121237812632, + "grad_norm": 1.954071283340454, + "learning_rate": 6.31388064622486e-05, + "loss": 1.9792, + "step": 29070 + }, + { + "epoch": 0.6848476284678066, + "grad_norm": 2.6439526081085205, + "learning_rate": 6.309170552493995e-05, + "loss": 1.8697, + "step": 29080 + }, + { + "epoch": 0.6850831331543498, + "grad_norm": 2.4627387523651123, + "learning_rate": 6.304460458763129e-05, + "loss": 2.2861, + "step": 29090 + }, + { + "epoch": 0.6853186378408931, + "grad_norm": 2.732792615890503, + "learning_rate": 6.299750365032264e-05, + "loss": 1.9502, + "step": 29100 + }, + { + "epoch": 0.6855541425274363, + "grad_norm": 2.2963614463806152, + "learning_rate": 6.295040271301399e-05, + "loss": 2.0442, + "step": 29110 + }, + { + "epoch": 0.6857896472139795, + "grad_norm": 2.23295259475708, + "learning_rate": 6.290330177570534e-05, + "loss": 2.2432, + "step": 29120 + }, + { + "epoch": 0.6860251519005228, + "grad_norm": 2.274610996246338, + "learning_rate": 6.285620083839669e-05, + "loss": 2.1089, + "step": 29130 + }, + { + "epoch": 0.686260656587066, + "grad_norm": 2.2117106914520264, + "learning_rate": 6.280909990108803e-05, + "loss": 2.1116, + "step": 29140 + }, + { + "epoch": 0.6864961612736094, + "grad_norm": 1.9509594440460205, + "learning_rate": 6.276199896377938e-05, + "loss": 2.033, + "step": 29150 + }, + { + "epoch": 0.6867316659601526, + "grad_norm": 2.090939521789551, + "learning_rate": 6.271489802647073e-05, + "loss": 2.1308, + "step": 29160 + }, + { + "epoch": 0.6869671706466959, + "grad_norm": 1.9330859184265137, + "learning_rate": 6.266779708916208e-05, + "loss": 2.0138, + "step": 29170 + }, + { + "epoch": 0.6872026753332391, + "grad_norm": 2.7607204914093018, + "learning_rate": 6.262069615185341e-05, + "loss": 1.8859, + "step": 29180 + }, + { + "epoch": 0.6874381800197824, + "grad_norm": 1.8978123664855957, + "learning_rate": 6.257359521454478e-05, + "loss": 1.9501, + "step": 29190 + }, + { + "epoch": 0.6876736847063256, + "grad_norm": 1.9100096225738525, + "learning_rate": 6.252649427723613e-05, + "loss": 2.1775, + "step": 29200 + }, + { + "epoch": 0.6879091893928689, + "grad_norm": 2.1695330142974854, + "learning_rate": 6.247939333992748e-05, + "loss": 1.8777, + "step": 29210 + }, + { + "epoch": 0.6881446940794121, + "grad_norm": 2.1711602210998535, + "learning_rate": 6.243229240261882e-05, + "loss": 1.9542, + "step": 29220 + }, + { + "epoch": 0.6883801987659555, + "grad_norm": 1.9122905731201172, + "learning_rate": 6.238519146531017e-05, + "loss": 2.0965, + "step": 29230 + }, + { + "epoch": 0.6886157034524987, + "grad_norm": 2.8225343227386475, + "learning_rate": 6.233809052800152e-05, + "loss": 1.8979, + "step": 29240 + }, + { + "epoch": 0.688851208139042, + "grad_norm": 2.511843204498291, + "learning_rate": 6.229098959069287e-05, + "loss": 1.8626, + "step": 29250 + }, + { + "epoch": 0.6890867128255852, + "grad_norm": 2.351067066192627, + "learning_rate": 6.22438886533842e-05, + "loss": 1.9113, + "step": 29260 + }, + { + "epoch": 0.6893222175121285, + "grad_norm": 2.117947578430176, + "learning_rate": 6.219678771607555e-05, + "loss": 2.2033, + "step": 29270 + }, + { + "epoch": 0.6895577221986717, + "grad_norm": 2.907036542892456, + "learning_rate": 6.21496867787669e-05, + "loss": 1.9599, + "step": 29280 + }, + { + "epoch": 0.689793226885215, + "grad_norm": 2.5090277194976807, + "learning_rate": 6.210258584145825e-05, + "loss": 2.0079, + "step": 29290 + }, + { + "epoch": 0.6900287315717583, + "grad_norm": 2.303168296813965, + "learning_rate": 6.205548490414959e-05, + "loss": 2.1558, + "step": 29300 + }, + { + "epoch": 0.6902642362583016, + "grad_norm": 1.9022759199142456, + "learning_rate": 6.200838396684094e-05, + "loss": 1.9063, + "step": 29310 + }, + { + "epoch": 0.6904997409448448, + "grad_norm": 2.1879916191101074, + "learning_rate": 6.196128302953229e-05, + "loss": 1.8982, + "step": 29320 + }, + { + "epoch": 0.6907352456313881, + "grad_norm": 2.6008474826812744, + "learning_rate": 6.191418209222364e-05, + "loss": 2.2362, + "step": 29330 + }, + { + "epoch": 0.6909707503179313, + "grad_norm": 2.61698317527771, + "learning_rate": 6.186708115491499e-05, + "loss": 2.0725, + "step": 29340 + }, + { + "epoch": 0.6912062550044746, + "grad_norm": 2.21352219581604, + "learning_rate": 6.181998021760633e-05, + "loss": 2.0445, + "step": 29350 + }, + { + "epoch": 0.6914417596910178, + "grad_norm": 2.037073850631714, + "learning_rate": 6.177287928029768e-05, + "loss": 1.8788, + "step": 29360 + }, + { + "epoch": 0.6916772643775612, + "grad_norm": 2.1956939697265625, + "learning_rate": 6.172577834298903e-05, + "loss": 2.0399, + "step": 29370 + }, + { + "epoch": 0.6919127690641044, + "grad_norm": 2.5270023345947266, + "learning_rate": 6.167867740568038e-05, + "loss": 2.2344, + "step": 29380 + }, + { + "epoch": 0.6921482737506477, + "grad_norm": 2.476280689239502, + "learning_rate": 6.163157646837172e-05, + "loss": 1.9711, + "step": 29390 + }, + { + "epoch": 0.6923837784371909, + "grad_norm": 1.4749599695205688, + "learning_rate": 6.158447553106307e-05, + "loss": 2.0054, + "step": 29400 + }, + { + "epoch": 0.6926192831237341, + "grad_norm": 2.379751443862915, + "learning_rate": 6.153737459375442e-05, + "loss": 1.9132, + "step": 29410 + }, + { + "epoch": 0.6928547878102774, + "grad_norm": 2.010408401489258, + "learning_rate": 6.149027365644577e-05, + "loss": 2.0497, + "step": 29420 + }, + { + "epoch": 0.6930902924968206, + "grad_norm": 1.9814579486846924, + "learning_rate": 6.144317271913712e-05, + "loss": 2.0795, + "step": 29430 + }, + { + "epoch": 0.693325797183364, + "grad_norm": 2.5224850177764893, + "learning_rate": 6.139607178182845e-05, + "loss": 1.8552, + "step": 29440 + }, + { + "epoch": 0.6935613018699072, + "grad_norm": 2.3376238346099854, + "learning_rate": 6.13489708445198e-05, + "loss": 1.7499, + "step": 29450 + }, + { + "epoch": 0.6937968065564505, + "grad_norm": 2.1768531799316406, + "learning_rate": 6.130186990721115e-05, + "loss": 2.1749, + "step": 29460 + }, + { + "epoch": 0.6940323112429937, + "grad_norm": 2.1843111515045166, + "learning_rate": 6.12547689699025e-05, + "loss": 2.0094, + "step": 29470 + }, + { + "epoch": 0.694267815929537, + "grad_norm": 2.0782411098480225, + "learning_rate": 6.120766803259385e-05, + "loss": 2.1589, + "step": 29480 + }, + { + "epoch": 0.6945033206160802, + "grad_norm": 2.4557056427001953, + "learning_rate": 6.11605670952852e-05, + "loss": 2.164, + "step": 29490 + }, + { + "epoch": 0.6947388253026235, + "grad_norm": 2.396660804748535, + "learning_rate": 6.111346615797655e-05, + "loss": 2.012, + "step": 29500 + }, + { + "epoch": 0.6949743299891668, + "grad_norm": 2.3271138668060303, + "learning_rate": 6.10663652206679e-05, + "loss": 1.9762, + "step": 29510 + }, + { + "epoch": 0.6952098346757101, + "grad_norm": 2.339451789855957, + "learning_rate": 6.101926428335925e-05, + "loss": 1.7365, + "step": 29520 + }, + { + "epoch": 0.6954453393622533, + "grad_norm": 1.9744147062301636, + "learning_rate": 6.097216334605059e-05, + "loss": 2.0507, + "step": 29530 + }, + { + "epoch": 0.6956808440487966, + "grad_norm": 3.988935708999634, + "learning_rate": 6.092506240874194e-05, + "loss": 1.9501, + "step": 29540 + }, + { + "epoch": 0.6959163487353398, + "grad_norm": 2.2221195697784424, + "learning_rate": 6.0877961471433286e-05, + "loss": 2.123, + "step": 29550 + }, + { + "epoch": 0.6961518534218831, + "grad_norm": 2.0011703968048096, + "learning_rate": 6.0830860534124636e-05, + "loss": 1.9586, + "step": 29560 + }, + { + "epoch": 0.6963873581084263, + "grad_norm": 2.5858304500579834, + "learning_rate": 6.078375959681598e-05, + "loss": 2.2042, + "step": 29570 + }, + { + "epoch": 0.6966228627949697, + "grad_norm": 2.5795745849609375, + "learning_rate": 6.073665865950733e-05, + "loss": 2.159, + "step": 29580 + }, + { + "epoch": 0.6968583674815129, + "grad_norm": 2.2104289531707764, + "learning_rate": 6.068955772219867e-05, + "loss": 1.865, + "step": 29590 + }, + { + "epoch": 0.6970938721680562, + "grad_norm": 1.7108197212219238, + "learning_rate": 6.064245678489002e-05, + "loss": 2.1158, + "step": 29600 + }, + { + "epoch": 0.6973293768545994, + "grad_norm": 2.610651731491089, + "learning_rate": 6.059535584758137e-05, + "loss": 2.0603, + "step": 29610 + }, + { + "epoch": 0.6975648815411427, + "grad_norm": 1.6342675685882568, + "learning_rate": 6.054825491027272e-05, + "loss": 1.963, + "step": 29620 + }, + { + "epoch": 0.6978003862276859, + "grad_norm": 2.5833704471588135, + "learning_rate": 6.050115397296407e-05, + "loss": 2.1255, + "step": 29630 + }, + { + "epoch": 0.6980358909142292, + "grad_norm": 2.2657909393310547, + "learning_rate": 6.045405303565541e-05, + "loss": 1.8765, + "step": 29640 + }, + { + "epoch": 0.6982713956007724, + "grad_norm": 2.124436378479004, + "learning_rate": 6.040695209834676e-05, + "loss": 2.2452, + "step": 29650 + }, + { + "epoch": 0.6985069002873158, + "grad_norm": 2.2420623302459717, + "learning_rate": 6.0359851161038104e-05, + "loss": 2.1147, + "step": 29660 + }, + { + "epoch": 0.698742404973859, + "grad_norm": 3.1197521686553955, + "learning_rate": 6.0312750223729455e-05, + "loss": 2.1481, + "step": 29670 + }, + { + "epoch": 0.6989779096604023, + "grad_norm": 2.0020577907562256, + "learning_rate": 6.02656492864208e-05, + "loss": 2.1455, + "step": 29680 + }, + { + "epoch": 0.6992134143469455, + "grad_norm": 3.46406888961792, + "learning_rate": 6.021854834911215e-05, + "loss": 1.8329, + "step": 29690 + }, + { + "epoch": 0.6994489190334887, + "grad_norm": 2.138166904449463, + "learning_rate": 6.017144741180349e-05, + "loss": 2.2264, + "step": 29700 + }, + { + "epoch": 0.699684423720032, + "grad_norm": 2.236649751663208, + "learning_rate": 6.012434647449484e-05, + "loss": 1.9857, + "step": 29710 + }, + { + "epoch": 0.6999199284065752, + "grad_norm": 2.371490955352783, + "learning_rate": 6.0077245537186185e-05, + "loss": 2.2641, + "step": 29720 + }, + { + "epoch": 0.7001554330931186, + "grad_norm": 1.9338253736495972, + "learning_rate": 6.0030144599877536e-05, + "loss": 1.7444, + "step": 29730 + }, + { + "epoch": 0.7003909377796618, + "grad_norm": 2.1143953800201416, + "learning_rate": 5.9983043662568886e-05, + "loss": 2.0956, + "step": 29740 + }, + { + "epoch": 0.7006264424662051, + "grad_norm": 2.2336223125457764, + "learning_rate": 5.993594272526023e-05, + "loss": 2.2865, + "step": 29750 + }, + { + "epoch": 0.7008619471527483, + "grad_norm": 2.3659603595733643, + "learning_rate": 5.9888841787951586e-05, + "loss": 1.9793, + "step": 29760 + }, + { + "epoch": 0.7010974518392916, + "grad_norm": 2.853128671646118, + "learning_rate": 5.984174085064294e-05, + "loss": 2.1234, + "step": 29770 + }, + { + "epoch": 0.7013329565258348, + "grad_norm": 2.3596596717834473, + "learning_rate": 5.979463991333428e-05, + "loss": 1.8743, + "step": 29780 + }, + { + "epoch": 0.7015684612123781, + "grad_norm": 1.9511758089065552, + "learning_rate": 5.974753897602563e-05, + "loss": 2.1135, + "step": 29790 + }, + { + "epoch": 0.7018039658989214, + "grad_norm": 2.2908072471618652, + "learning_rate": 5.9700438038716974e-05, + "loss": 2.2518, + "step": 29800 + }, + { + "epoch": 0.7020394705854647, + "grad_norm": 2.339449167251587, + "learning_rate": 5.9653337101408324e-05, + "loss": 2.2305, + "step": 29810 + }, + { + "epoch": 0.7022749752720079, + "grad_norm": 2.239489793777466, + "learning_rate": 5.9606236164099674e-05, + "loss": 2.0766, + "step": 29820 + }, + { + "epoch": 0.7025104799585512, + "grad_norm": 1.768304467201233, + "learning_rate": 5.955913522679102e-05, + "loss": 2.1229, + "step": 29830 + }, + { + "epoch": 0.7027459846450944, + "grad_norm": 1.8999934196472168, + "learning_rate": 5.951203428948237e-05, + "loss": 1.9637, + "step": 29840 + }, + { + "epoch": 0.7029814893316377, + "grad_norm": 2.1406350135803223, + "learning_rate": 5.946493335217371e-05, + "loss": 1.8846, + "step": 29850 + }, + { + "epoch": 0.7032169940181809, + "grad_norm": 2.1771881580352783, + "learning_rate": 5.941783241486506e-05, + "loss": 1.9483, + "step": 29860 + }, + { + "epoch": 0.7034524987047243, + "grad_norm": 1.9195053577423096, + "learning_rate": 5.9370731477556405e-05, + "loss": 1.8552, + "step": 29870 + }, + { + "epoch": 0.7036880033912675, + "grad_norm": 2.817622423171997, + "learning_rate": 5.9323630540247756e-05, + "loss": 2.0414, + "step": 29880 + }, + { + "epoch": 0.7039235080778108, + "grad_norm": 1.7897940874099731, + "learning_rate": 5.92765296029391e-05, + "loss": 1.8892, + "step": 29890 + }, + { + "epoch": 0.704159012764354, + "grad_norm": 2.211215019226074, + "learning_rate": 5.922942866563045e-05, + "loss": 2.002, + "step": 29900 + }, + { + "epoch": 0.7043945174508973, + "grad_norm": 2.0046420097351074, + "learning_rate": 5.918232772832179e-05, + "loss": 1.9973, + "step": 29910 + }, + { + "epoch": 0.7046300221374405, + "grad_norm": 2.126875638961792, + "learning_rate": 5.913522679101314e-05, + "loss": 2.044, + "step": 29920 + }, + { + "epoch": 0.7048655268239838, + "grad_norm": 2.909090995788574, + "learning_rate": 5.908812585370449e-05, + "loss": 1.7947, + "step": 29930 + }, + { + "epoch": 0.705101031510527, + "grad_norm": 2.0971028804779053, + "learning_rate": 5.904102491639584e-05, + "loss": 1.9566, + "step": 29940 + }, + { + "epoch": 0.7053365361970704, + "grad_norm": 2.6434972286224365, + "learning_rate": 5.899392397908719e-05, + "loss": 2.1791, + "step": 29950 + }, + { + "epoch": 0.7055720408836136, + "grad_norm": 1.7850927114486694, + "learning_rate": 5.894682304177853e-05, + "loss": 2.0034, + "step": 29960 + }, + { + "epoch": 0.7058075455701568, + "grad_norm": 1.7622065544128418, + "learning_rate": 5.889972210446988e-05, + "loss": 1.9323, + "step": 29970 + }, + { + "epoch": 0.7060430502567001, + "grad_norm": 2.0412206649780273, + "learning_rate": 5.8852621167161224e-05, + "loss": 2.0019, + "step": 29980 + }, + { + "epoch": 0.7062785549432433, + "grad_norm": 1.7954665422439575, + "learning_rate": 5.8805520229852574e-05, + "loss": 1.9332, + "step": 29990 + }, + { + "epoch": 0.7065140596297866, + "grad_norm": 4.530249118804932, + "learning_rate": 5.875841929254392e-05, + "loss": 2.0373, + "step": 30000 + }, + { + "epoch": 0.7067495643163298, + "grad_norm": 1.6885002851486206, + "learning_rate": 5.871131835523527e-05, + "loss": 1.9966, + "step": 30010 + }, + { + "epoch": 0.7069850690028732, + "grad_norm": 2.3572335243225098, + "learning_rate": 5.866421741792661e-05, + "loss": 1.9012, + "step": 30020 + }, + { + "epoch": 0.7072205736894164, + "grad_norm": 1.8825509548187256, + "learning_rate": 5.861711648061796e-05, + "loss": 2.2303, + "step": 30030 + }, + { + "epoch": 0.7074560783759597, + "grad_norm": 3.2787723541259766, + "learning_rate": 5.857001554330932e-05, + "loss": 2.2594, + "step": 30040 + }, + { + "epoch": 0.7076915830625029, + "grad_norm": 2.52473783493042, + "learning_rate": 5.852291460600067e-05, + "loss": 2.1244, + "step": 30050 + }, + { + "epoch": 0.7079270877490462, + "grad_norm": 2.1791837215423584, + "learning_rate": 5.847581366869201e-05, + "loss": 1.973, + "step": 30060 + }, + { + "epoch": 0.7081625924355894, + "grad_norm": 1.9965521097183228, + "learning_rate": 5.842871273138336e-05, + "loss": 1.8938, + "step": 30070 + }, + { + "epoch": 0.7083980971221328, + "grad_norm": 1.438650131225586, + "learning_rate": 5.8381611794074706e-05, + "loss": 1.9799, + "step": 30080 + }, + { + "epoch": 0.708633601808676, + "grad_norm": 1.926436185836792, + "learning_rate": 5.8334510856766056e-05, + "loss": 2.0451, + "step": 30090 + }, + { + "epoch": 0.7088691064952193, + "grad_norm": 1.62642240524292, + "learning_rate": 5.82874099194574e-05, + "loss": 2.0659, + "step": 30100 + }, + { + "epoch": 0.7091046111817625, + "grad_norm": 2.5094661712646484, + "learning_rate": 5.824030898214875e-05, + "loss": 1.9782, + "step": 30110 + }, + { + "epoch": 0.7093401158683058, + "grad_norm": 1.6265511512756348, + "learning_rate": 5.81932080448401e-05, + "loss": 1.9198, + "step": 30120 + }, + { + "epoch": 0.709575620554849, + "grad_norm": 2.2201929092407227, + "learning_rate": 5.8146107107531444e-05, + "loss": 1.962, + "step": 30130 + }, + { + "epoch": 0.7098111252413923, + "grad_norm": 2.028233051300049, + "learning_rate": 5.8099006170222794e-05, + "loss": 2.0659, + "step": 30140 + }, + { + "epoch": 0.7100466299279355, + "grad_norm": 2.96724796295166, + "learning_rate": 5.805190523291414e-05, + "loss": 2.0536, + "step": 30150 + }, + { + "epoch": 0.7102821346144789, + "grad_norm": 2.110179901123047, + "learning_rate": 5.800480429560549e-05, + "loss": 2.0094, + "step": 30160 + }, + { + "epoch": 0.7105176393010221, + "grad_norm": 2.6137187480926514, + "learning_rate": 5.795770335829683e-05, + "loss": 1.9766, + "step": 30170 + }, + { + "epoch": 0.7107531439875654, + "grad_norm": 2.006112813949585, + "learning_rate": 5.791060242098818e-05, + "loss": 2.0879, + "step": 30180 + }, + { + "epoch": 0.7109886486741086, + "grad_norm": 2.337247848510742, + "learning_rate": 5.7863501483679525e-05, + "loss": 2.038, + "step": 30190 + }, + { + "epoch": 0.7112241533606519, + "grad_norm": 1.9541200399398804, + "learning_rate": 5.7816400546370875e-05, + "loss": 1.9604, + "step": 30200 + }, + { + "epoch": 0.7114596580471951, + "grad_norm": 1.8182893991470337, + "learning_rate": 5.776929960906222e-05, + "loss": 1.8738, + "step": 30210 + }, + { + "epoch": 0.7116951627337385, + "grad_norm": 2.3723807334899902, + "learning_rate": 5.772219867175357e-05, + "loss": 2.1882, + "step": 30220 + }, + { + "epoch": 0.7119306674202817, + "grad_norm": 2.4405813217163086, + "learning_rate": 5.767509773444492e-05, + "loss": 1.9004, + "step": 30230 + }, + { + "epoch": 0.712166172106825, + "grad_norm": 2.4666452407836914, + "learning_rate": 5.762799679713626e-05, + "loss": 2.0491, + "step": 30240 + }, + { + "epoch": 0.7124016767933682, + "grad_norm": 2.4087648391723633, + "learning_rate": 5.758089585982761e-05, + "loss": 2.1029, + "step": 30250 + }, + { + "epoch": 0.7126371814799114, + "grad_norm": 2.2330007553100586, + "learning_rate": 5.7533794922518956e-05, + "loss": 1.9501, + "step": 30260 + }, + { + "epoch": 0.7128726861664547, + "grad_norm": 2.045724868774414, + "learning_rate": 5.748669398521031e-05, + "loss": 2.0743, + "step": 30270 + }, + { + "epoch": 0.7131081908529979, + "grad_norm": 2.37623929977417, + "learning_rate": 5.743959304790165e-05, + "loss": 2.086, + "step": 30280 + }, + { + "epoch": 0.7133436955395412, + "grad_norm": 2.594832181930542, + "learning_rate": 5.7392492110593e-05, + "loss": 2.0116, + "step": 30290 + }, + { + "epoch": 0.7135792002260845, + "grad_norm": 2.3839049339294434, + "learning_rate": 5.7345391173284344e-05, + "loss": 2.1416, + "step": 30300 + }, + { + "epoch": 0.7138147049126278, + "grad_norm": 2.3938348293304443, + "learning_rate": 5.7298290235975694e-05, + "loss": 2.15, + "step": 30310 + }, + { + "epoch": 0.714050209599171, + "grad_norm": 1.7177648544311523, + "learning_rate": 5.725118929866705e-05, + "loss": 1.8613, + "step": 30320 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 2.2762975692749023, + "learning_rate": 5.72040883613584e-05, + "loss": 1.8133, + "step": 30330 + }, + { + "epoch": 0.7145212189722575, + "grad_norm": 2.3679895401000977, + "learning_rate": 5.7156987424049745e-05, + "loss": 1.9782, + "step": 30340 + }, + { + "epoch": 0.7147567236588008, + "grad_norm": 2.099062204360962, + "learning_rate": 5.7109886486741095e-05, + "loss": 2.065, + "step": 30350 + }, + { + "epoch": 0.714992228345344, + "grad_norm": 1.8509316444396973, + "learning_rate": 5.706278554943244e-05, + "loss": 2.0064, + "step": 30360 + }, + { + "epoch": 0.7152277330318874, + "grad_norm": 2.6055731773376465, + "learning_rate": 5.701568461212379e-05, + "loss": 2.21, + "step": 30370 + }, + { + "epoch": 0.7154632377184306, + "grad_norm": 2.160001754760742, + "learning_rate": 5.696858367481513e-05, + "loss": 2.0922, + "step": 30380 + }, + { + "epoch": 0.7156987424049739, + "grad_norm": 2.1145455837249756, + "learning_rate": 5.692148273750648e-05, + "loss": 2.0778, + "step": 30390 + }, + { + "epoch": 0.7159342470915171, + "grad_norm": 2.1301040649414062, + "learning_rate": 5.6874381800197826e-05, + "loss": 1.9435, + "step": 30400 + }, + { + "epoch": 0.7161697517780604, + "grad_norm": 2.1226649284362793, + "learning_rate": 5.6827280862889176e-05, + "loss": 2.0785, + "step": 30410 + }, + { + "epoch": 0.7164052564646036, + "grad_norm": 2.950019359588623, + "learning_rate": 5.6780179925580526e-05, + "loss": 1.7644, + "step": 30420 + }, + { + "epoch": 0.7166407611511469, + "grad_norm": 2.4695847034454346, + "learning_rate": 5.673307898827187e-05, + "loss": 2.2652, + "step": 30430 + }, + { + "epoch": 0.7168762658376902, + "grad_norm": 2.0165791511535645, + "learning_rate": 5.668597805096322e-05, + "loss": 1.9458, + "step": 30440 + }, + { + "epoch": 0.7171117705242335, + "grad_norm": 2.2012386322021484, + "learning_rate": 5.6638877113654564e-05, + "loss": 1.9355, + "step": 30450 + }, + { + "epoch": 0.7173472752107767, + "grad_norm": 2.5340282917022705, + "learning_rate": 5.6591776176345914e-05, + "loss": 2.1552, + "step": 30460 + }, + { + "epoch": 0.71758277989732, + "grad_norm": 2.569276809692383, + "learning_rate": 5.654467523903726e-05, + "loss": 1.962, + "step": 30470 + }, + { + "epoch": 0.7178182845838632, + "grad_norm": 2.128749132156372, + "learning_rate": 5.649757430172861e-05, + "loss": 2.036, + "step": 30480 + }, + { + "epoch": 0.7180537892704065, + "grad_norm": 2.065203905105591, + "learning_rate": 5.645047336441995e-05, + "loss": 2.0853, + "step": 30490 + }, + { + "epoch": 0.7182892939569497, + "grad_norm": 2.0871028900146484, + "learning_rate": 5.64033724271113e-05, + "loss": 2.2986, + "step": 30500 + }, + { + "epoch": 0.7185247986434931, + "grad_norm": 1.4431718587875366, + "learning_rate": 5.6356271489802645e-05, + "loss": 1.9797, + "step": 30510 + }, + { + "epoch": 0.7187603033300363, + "grad_norm": 2.06838321685791, + "learning_rate": 5.6309170552493995e-05, + "loss": 1.945, + "step": 30520 + }, + { + "epoch": 0.7189958080165796, + "grad_norm": 1.6888794898986816, + "learning_rate": 5.6262069615185345e-05, + "loss": 1.9909, + "step": 30530 + }, + { + "epoch": 0.7192313127031228, + "grad_norm": 2.028390884399414, + "learning_rate": 5.621496867787669e-05, + "loss": 1.8028, + "step": 30540 + }, + { + "epoch": 0.719466817389666, + "grad_norm": 2.6491353511810303, + "learning_rate": 5.616786774056804e-05, + "loss": 2.1648, + "step": 30550 + }, + { + "epoch": 0.7197023220762093, + "grad_norm": 2.0216126441955566, + "learning_rate": 5.612076680325938e-05, + "loss": 1.9735, + "step": 30560 + }, + { + "epoch": 0.7199378267627525, + "grad_norm": 3.1546339988708496, + "learning_rate": 5.607366586595073e-05, + "loss": 2.1136, + "step": 30570 + }, + { + "epoch": 0.7201733314492959, + "grad_norm": 2.0467398166656494, + "learning_rate": 5.6026564928642076e-05, + "loss": 1.8179, + "step": 30580 + }, + { + "epoch": 0.7204088361358391, + "grad_norm": 1.9955779314041138, + "learning_rate": 5.5979463991333426e-05, + "loss": 1.9321, + "step": 30590 + }, + { + "epoch": 0.7206443408223824, + "grad_norm": 2.4097037315368652, + "learning_rate": 5.593236305402477e-05, + "loss": 2.0949, + "step": 30600 + }, + { + "epoch": 0.7208798455089256, + "grad_norm": 2.0927698612213135, + "learning_rate": 5.5885262116716134e-05, + "loss": 1.9115, + "step": 30610 + }, + { + "epoch": 0.7211153501954689, + "grad_norm": 2.086876630783081, + "learning_rate": 5.583816117940748e-05, + "loss": 2.3412, + "step": 30620 + }, + { + "epoch": 0.7213508548820121, + "grad_norm": 1.9654732942581177, + "learning_rate": 5.579106024209883e-05, + "loss": 2.1572, + "step": 30630 + }, + { + "epoch": 0.7215863595685554, + "grad_norm": 1.9568740129470825, + "learning_rate": 5.574395930479017e-05, + "loss": 1.8459, + "step": 30640 + }, + { + "epoch": 0.7218218642550986, + "grad_norm": 1.8348342180252075, + "learning_rate": 5.569685836748152e-05, + "loss": 1.8352, + "step": 30650 + }, + { + "epoch": 0.722057368941642, + "grad_norm": 2.3031091690063477, + "learning_rate": 5.5649757430172865e-05, + "loss": 1.9543, + "step": 30660 + }, + { + "epoch": 0.7222928736281852, + "grad_norm": 2.4159276485443115, + "learning_rate": 5.5602656492864215e-05, + "loss": 2.1472, + "step": 30670 + }, + { + "epoch": 0.7225283783147285, + "grad_norm": 2.4604334831237793, + "learning_rate": 5.555555555555556e-05, + "loss": 2.1093, + "step": 30680 + }, + { + "epoch": 0.7227638830012717, + "grad_norm": 2.7177553176879883, + "learning_rate": 5.550845461824691e-05, + "loss": 2.0667, + "step": 30690 + }, + { + "epoch": 0.722999387687815, + "grad_norm": 2.469963788986206, + "learning_rate": 5.546135368093825e-05, + "loss": 2.0233, + "step": 30700 + }, + { + "epoch": 0.7232348923743582, + "grad_norm": 2.972891330718994, + "learning_rate": 5.54142527436296e-05, + "loss": 1.9273, + "step": 30710 + }, + { + "epoch": 0.7234703970609015, + "grad_norm": 2.8329389095306396, + "learning_rate": 5.536715180632095e-05, + "loss": 2.0522, + "step": 30720 + }, + { + "epoch": 0.7237059017474448, + "grad_norm": 2.5914034843444824, + "learning_rate": 5.5320050869012296e-05, + "loss": 1.9087, + "step": 30730 + }, + { + "epoch": 0.7239414064339881, + "grad_norm": 2.375659227371216, + "learning_rate": 5.5272949931703646e-05, + "loss": 1.9656, + "step": 30740 + }, + { + "epoch": 0.7241769111205313, + "grad_norm": 2.802253246307373, + "learning_rate": 5.522584899439499e-05, + "loss": 1.9603, + "step": 30750 + }, + { + "epoch": 0.7244124158070746, + "grad_norm": 2.0423829555511475, + "learning_rate": 5.517874805708634e-05, + "loss": 2.0772, + "step": 30760 + }, + { + "epoch": 0.7246479204936178, + "grad_norm": 2.187680959701538, + "learning_rate": 5.513164711977768e-05, + "loss": 1.9225, + "step": 30770 + }, + { + "epoch": 0.7248834251801611, + "grad_norm": 2.442068576812744, + "learning_rate": 5.5084546182469034e-05, + "loss": 1.9933, + "step": 30780 + }, + { + "epoch": 0.7251189298667043, + "grad_norm": 2.3445839881896973, + "learning_rate": 5.503744524516038e-05, + "loss": 2.2684, + "step": 30790 + }, + { + "epoch": 0.7253544345532477, + "grad_norm": 2.0452582836151123, + "learning_rate": 5.499034430785173e-05, + "loss": 2.1643, + "step": 30800 + }, + { + "epoch": 0.7255899392397909, + "grad_norm": 2.025725841522217, + "learning_rate": 5.494324337054307e-05, + "loss": 1.9175, + "step": 30810 + }, + { + "epoch": 0.7258254439263341, + "grad_norm": 2.7299511432647705, + "learning_rate": 5.489614243323442e-05, + "loss": 2.0061, + "step": 30820 + }, + { + "epoch": 0.7260609486128774, + "grad_norm": 2.5968010425567627, + "learning_rate": 5.484904149592577e-05, + "loss": 1.905, + "step": 30830 + }, + { + "epoch": 0.7262964532994206, + "grad_norm": 2.217876434326172, + "learning_rate": 5.4801940558617115e-05, + "loss": 2.0114, + "step": 30840 + }, + { + "epoch": 0.7265319579859639, + "grad_norm": 3.317579984664917, + "learning_rate": 5.4754839621308465e-05, + "loss": 1.9557, + "step": 30850 + }, + { + "epoch": 0.7267674626725071, + "grad_norm": 3.0695741176605225, + "learning_rate": 5.470773868399981e-05, + "loss": 2.0315, + "step": 30860 + }, + { + "epoch": 0.7270029673590505, + "grad_norm": 2.6726536750793457, + "learning_rate": 5.466063774669116e-05, + "loss": 1.9777, + "step": 30870 + }, + { + "epoch": 0.7272384720455937, + "grad_norm": 2.8172085285186768, + "learning_rate": 5.46135368093825e-05, + "loss": 2.044, + "step": 30880 + }, + { + "epoch": 0.727473976732137, + "grad_norm": 2.0183653831481934, + "learning_rate": 5.456643587207386e-05, + "loss": 1.9781, + "step": 30890 + }, + { + "epoch": 0.7277094814186802, + "grad_norm": 2.0858523845672607, + "learning_rate": 5.451933493476521e-05, + "loss": 1.7131, + "step": 30900 + }, + { + "epoch": 0.7279449861052235, + "grad_norm": 2.115558624267578, + "learning_rate": 5.447223399745656e-05, + "loss": 1.992, + "step": 30910 + }, + { + "epoch": 0.7281804907917667, + "grad_norm": 1.972657322883606, + "learning_rate": 5.44251330601479e-05, + "loss": 1.8837, + "step": 30920 + }, + { + "epoch": 0.72841599547831, + "grad_norm": 1.986677885055542, + "learning_rate": 5.437803212283925e-05, + "loss": 2.0074, + "step": 30930 + }, + { + "epoch": 0.7286515001648532, + "grad_norm": 2.1113574504852295, + "learning_rate": 5.43309311855306e-05, + "loss": 2.0117, + "step": 30940 + }, + { + "epoch": 0.7288870048513966, + "grad_norm": 2.277940034866333, + "learning_rate": 5.428383024822195e-05, + "loss": 2.0174, + "step": 30950 + }, + { + "epoch": 0.7291225095379398, + "grad_norm": 2.1765830516815186, + "learning_rate": 5.423672931091329e-05, + "loss": 2.0942, + "step": 30960 + }, + { + "epoch": 0.7293580142244831, + "grad_norm": 2.458887815475464, + "learning_rate": 5.418962837360464e-05, + "loss": 2.0839, + "step": 30970 + }, + { + "epoch": 0.7295935189110263, + "grad_norm": 2.4584996700286865, + "learning_rate": 5.4142527436295984e-05, + "loss": 1.8682, + "step": 30980 + }, + { + "epoch": 0.7298290235975696, + "grad_norm": 2.042698860168457, + "learning_rate": 5.4095426498987334e-05, + "loss": 1.9744, + "step": 30990 + }, + { + "epoch": 0.7300645282841128, + "grad_norm": 2.2433009147644043, + "learning_rate": 5.404832556167868e-05, + "loss": 2.0678, + "step": 31000 + }, + { + "epoch": 0.7303000329706562, + "grad_norm": 3.455995559692383, + "learning_rate": 5.400122462437003e-05, + "loss": 2.0532, + "step": 31010 + }, + { + "epoch": 0.7305355376571994, + "grad_norm": 3.3080856800079346, + "learning_rate": 5.395412368706138e-05, + "loss": 2.1967, + "step": 31020 + }, + { + "epoch": 0.7307710423437427, + "grad_norm": 1.755066990852356, + "learning_rate": 5.390702274975272e-05, + "loss": 1.9065, + "step": 31030 + }, + { + "epoch": 0.7310065470302859, + "grad_norm": 2.1286559104919434, + "learning_rate": 5.385992181244407e-05, + "loss": 1.8758, + "step": 31040 + }, + { + "epoch": 0.7312420517168292, + "grad_norm": 1.8122589588165283, + "learning_rate": 5.3812820875135416e-05, + "loss": 1.9088, + "step": 31050 + }, + { + "epoch": 0.7314775564033724, + "grad_norm": 2.3942129611968994, + "learning_rate": 5.3765719937826766e-05, + "loss": 1.9794, + "step": 31060 + }, + { + "epoch": 0.7317130610899157, + "grad_norm": 2.5759265422821045, + "learning_rate": 5.371861900051811e-05, + "loss": 2.1059, + "step": 31070 + }, + { + "epoch": 0.731948565776459, + "grad_norm": 2.152968406677246, + "learning_rate": 5.367151806320946e-05, + "loss": 2.1384, + "step": 31080 + }, + { + "epoch": 0.7321840704630023, + "grad_norm": 1.7909654378890991, + "learning_rate": 5.36244171259008e-05, + "loss": 2.1023, + "step": 31090 + }, + { + "epoch": 0.7324195751495455, + "grad_norm": 2.8516340255737305, + "learning_rate": 5.357731618859215e-05, + "loss": 2.0101, + "step": 31100 + }, + { + "epoch": 0.7326550798360887, + "grad_norm": 3.418398857116699, + "learning_rate": 5.35302152512835e-05, + "loss": 2.0228, + "step": 31110 + }, + { + "epoch": 0.732890584522632, + "grad_norm": 1.9390182495117188, + "learning_rate": 5.348311431397485e-05, + "loss": 1.8994, + "step": 31120 + }, + { + "epoch": 0.7331260892091752, + "grad_norm": 2.929248809814453, + "learning_rate": 5.34360133766662e-05, + "loss": 2.017, + "step": 31130 + }, + { + "epoch": 0.7333615938957185, + "grad_norm": 2.084099531173706, + "learning_rate": 5.338891243935754e-05, + "loss": 2.0448, + "step": 31140 + }, + { + "epoch": 0.7335970985822617, + "grad_norm": 1.9358183145523071, + "learning_rate": 5.334181150204889e-05, + "loss": 1.9732, + "step": 31150 + }, + { + "epoch": 0.7338326032688051, + "grad_norm": 1.9566363096237183, + "learning_rate": 5.3294710564740234e-05, + "loss": 2.023, + "step": 31160 + }, + { + "epoch": 0.7340681079553483, + "grad_norm": 2.517813205718994, + "learning_rate": 5.324760962743159e-05, + "loss": 1.9894, + "step": 31170 + }, + { + "epoch": 0.7343036126418916, + "grad_norm": 1.871909737586975, + "learning_rate": 5.320050869012294e-05, + "loss": 2.1048, + "step": 31180 + }, + { + "epoch": 0.7345391173284348, + "grad_norm": 1.6476243734359741, + "learning_rate": 5.3153407752814285e-05, + "loss": 1.8816, + "step": 31190 + }, + { + "epoch": 0.7347746220149781, + "grad_norm": 2.7056660652160645, + "learning_rate": 5.3106306815505635e-05, + "loss": 1.8774, + "step": 31200 + }, + { + "epoch": 0.7350101267015213, + "grad_norm": 2.0394725799560547, + "learning_rate": 5.3059205878196986e-05, + "loss": 1.9307, + "step": 31210 + }, + { + "epoch": 0.7352456313880646, + "grad_norm": 2.695197105407715, + "learning_rate": 5.301210494088833e-05, + "loss": 1.842, + "step": 31220 + }, + { + "epoch": 0.7354811360746079, + "grad_norm": 2.9238481521606445, + "learning_rate": 5.296500400357968e-05, + "loss": 2.1305, + "step": 31230 + }, + { + "epoch": 0.7357166407611512, + "grad_norm": 2.0291872024536133, + "learning_rate": 5.291790306627102e-05, + "loss": 1.9462, + "step": 31240 + }, + { + "epoch": 0.7359521454476944, + "grad_norm": 2.4080393314361572, + "learning_rate": 5.287080212896237e-05, + "loss": 2.1098, + "step": 31250 + }, + { + "epoch": 0.7361876501342377, + "grad_norm": 2.3340492248535156, + "learning_rate": 5.2823701191653717e-05, + "loss": 2.135, + "step": 31260 + }, + { + "epoch": 0.7364231548207809, + "grad_norm": 2.0928733348846436, + "learning_rate": 5.277660025434507e-05, + "loss": 2.1051, + "step": 31270 + }, + { + "epoch": 0.7366586595073242, + "grad_norm": 2.603736162185669, + "learning_rate": 5.272949931703641e-05, + "loss": 2.2034, + "step": 31280 + }, + { + "epoch": 0.7368941641938674, + "grad_norm": 3.005927324295044, + "learning_rate": 5.268239837972776e-05, + "loss": 2.0459, + "step": 31290 + }, + { + "epoch": 0.7371296688804108, + "grad_norm": 2.0738449096679688, + "learning_rate": 5.2635297442419104e-05, + "loss": 2.1972, + "step": 31300 + }, + { + "epoch": 0.737365173566954, + "grad_norm": 2.542039394378662, + "learning_rate": 5.2588196505110454e-05, + "loss": 1.8309, + "step": 31310 + }, + { + "epoch": 0.7376006782534973, + "grad_norm": 2.0776071548461914, + "learning_rate": 5.2541095567801804e-05, + "loss": 2.2318, + "step": 31320 + }, + { + "epoch": 0.7378361829400405, + "grad_norm": 2.2851767539978027, + "learning_rate": 5.249399463049315e-05, + "loss": 2.1569, + "step": 31330 + }, + { + "epoch": 0.7380716876265838, + "grad_norm": 2.047200918197632, + "learning_rate": 5.24468936931845e-05, + "loss": 1.9189, + "step": 31340 + }, + { + "epoch": 0.738307192313127, + "grad_norm": 2.037782669067383, + "learning_rate": 5.239979275587584e-05, + "loss": 2.0439, + "step": 31350 + }, + { + "epoch": 0.7385426969996703, + "grad_norm": 2.144557476043701, + "learning_rate": 5.235269181856719e-05, + "loss": 1.9919, + "step": 31360 + }, + { + "epoch": 0.7387782016862136, + "grad_norm": 1.9935516119003296, + "learning_rate": 5.2305590881258535e-05, + "loss": 1.9855, + "step": 31370 + }, + { + "epoch": 0.7390137063727569, + "grad_norm": 2.8330445289611816, + "learning_rate": 5.2258489943949886e-05, + "loss": 2.1518, + "step": 31380 + }, + { + "epoch": 0.7392492110593001, + "grad_norm": 2.2874932289123535, + "learning_rate": 5.221138900664123e-05, + "loss": 2.0948, + "step": 31390 + }, + { + "epoch": 0.7394847157458433, + "grad_norm": 2.6140425205230713, + "learning_rate": 5.216428806933258e-05, + "loss": 2.0383, + "step": 31400 + }, + { + "epoch": 0.7397202204323866, + "grad_norm": 2.7529566287994385, + "learning_rate": 5.211718713202392e-05, + "loss": 2.0262, + "step": 31410 + }, + { + "epoch": 0.7399557251189298, + "grad_norm": 1.8774211406707764, + "learning_rate": 5.207008619471527e-05, + "loss": 1.7626, + "step": 31420 + }, + { + "epoch": 0.7401912298054731, + "grad_norm": 2.481081008911133, + "learning_rate": 5.202298525740662e-05, + "loss": 2.2189, + "step": 31430 + }, + { + "epoch": 0.7404267344920163, + "grad_norm": 3.3495407104492188, + "learning_rate": 5.197588432009797e-05, + "loss": 2.0925, + "step": 31440 + }, + { + "epoch": 0.7406622391785597, + "grad_norm": 2.11091685295105, + "learning_rate": 5.1928783382789324e-05, + "loss": 2.1724, + "step": 31450 + }, + { + "epoch": 0.7408977438651029, + "grad_norm": 2.0129551887512207, + "learning_rate": 5.1881682445480674e-05, + "loss": 2.1202, + "step": 31460 + }, + { + "epoch": 0.7411332485516462, + "grad_norm": 1.9652073383331299, + "learning_rate": 5.183458150817202e-05, + "loss": 1.9064, + "step": 31470 + }, + { + "epoch": 0.7413687532381894, + "grad_norm": 2.4702093601226807, + "learning_rate": 5.178748057086337e-05, + "loss": 2.042, + "step": 31480 + }, + { + "epoch": 0.7416042579247327, + "grad_norm": 1.9683281183242798, + "learning_rate": 5.174037963355471e-05, + "loss": 1.8423, + "step": 31490 + }, + { + "epoch": 0.7418397626112759, + "grad_norm": 1.8819502592086792, + "learning_rate": 5.169327869624606e-05, + "loss": 1.9363, + "step": 31500 + }, + { + "epoch": 0.7420752672978193, + "grad_norm": 2.266874074935913, + "learning_rate": 5.164617775893741e-05, + "loss": 1.9024, + "step": 31510 + }, + { + "epoch": 0.7423107719843625, + "grad_norm": 2.113044023513794, + "learning_rate": 5.1599076821628755e-05, + "loss": 1.9809, + "step": 31520 + }, + { + "epoch": 0.7425462766709058, + "grad_norm": 2.2748920917510986, + "learning_rate": 5.1551975884320105e-05, + "loss": 2.0686, + "step": 31530 + }, + { + "epoch": 0.742781781357449, + "grad_norm": 2.078479051589966, + "learning_rate": 5.150487494701145e-05, + "loss": 2.1953, + "step": 31540 + }, + { + "epoch": 0.7430172860439923, + "grad_norm": 2.49847412109375, + "learning_rate": 5.14577740097028e-05, + "loss": 2.0343, + "step": 31550 + }, + { + "epoch": 0.7432527907305355, + "grad_norm": 2.6483688354492188, + "learning_rate": 5.141067307239414e-05, + "loss": 1.9767, + "step": 31560 + }, + { + "epoch": 0.7434882954170788, + "grad_norm": 2.081569194793701, + "learning_rate": 5.136357213508549e-05, + "loss": 2.0236, + "step": 31570 + }, + { + "epoch": 0.743723800103622, + "grad_norm": 2.007598876953125, + "learning_rate": 5.1316471197776836e-05, + "loss": 2.079, + "step": 31580 + }, + { + "epoch": 0.7439593047901654, + "grad_norm": 2.3295485973358154, + "learning_rate": 5.1269370260468186e-05, + "loss": 1.9723, + "step": 31590 + }, + { + "epoch": 0.7441948094767086, + "grad_norm": 2.001807689666748, + "learning_rate": 5.122226932315953e-05, + "loss": 2.0025, + "step": 31600 + }, + { + "epoch": 0.7444303141632519, + "grad_norm": 2.1212780475616455, + "learning_rate": 5.117516838585088e-05, + "loss": 2.1097, + "step": 31610 + }, + { + "epoch": 0.7446658188497951, + "grad_norm": 1.8547629117965698, + "learning_rate": 5.112806744854223e-05, + "loss": 1.9761, + "step": 31620 + }, + { + "epoch": 0.7449013235363384, + "grad_norm": 1.8750020265579224, + "learning_rate": 5.1080966511233574e-05, + "loss": 2.0775, + "step": 31630 + }, + { + "epoch": 0.7451368282228816, + "grad_norm": 2.390493869781494, + "learning_rate": 5.1033865573924924e-05, + "loss": 2.0353, + "step": 31640 + }, + { + "epoch": 0.745372332909425, + "grad_norm": 1.8279821872711182, + "learning_rate": 5.098676463661627e-05, + "loss": 1.958, + "step": 31650 + }, + { + "epoch": 0.7456078375959682, + "grad_norm": 2.5792455673217773, + "learning_rate": 5.093966369930762e-05, + "loss": 2.2457, + "step": 31660 + }, + { + "epoch": 0.7458433422825114, + "grad_norm": 2.464423894882202, + "learning_rate": 5.089256276199896e-05, + "loss": 1.8873, + "step": 31670 + }, + { + "epoch": 0.7460788469690547, + "grad_norm": 1.911726951599121, + "learning_rate": 5.084546182469031e-05, + "loss": 2.0966, + "step": 31680 + }, + { + "epoch": 0.7463143516555979, + "grad_norm": 2.0924248695373535, + "learning_rate": 5.0798360887381655e-05, + "loss": 2.1157, + "step": 31690 + }, + { + "epoch": 0.7465498563421412, + "grad_norm": 2.2013473510742188, + "learning_rate": 5.0751259950073005e-05, + "loss": 2.05, + "step": 31700 + }, + { + "epoch": 0.7467853610286844, + "grad_norm": 2.036799430847168, + "learning_rate": 5.070415901276435e-05, + "loss": 2.1727, + "step": 31710 + }, + { + "epoch": 0.7470208657152277, + "grad_norm": 1.8770381212234497, + "learning_rate": 5.06570580754557e-05, + "loss": 2.0502, + "step": 31720 + }, + { + "epoch": 0.747256370401771, + "grad_norm": 1.9372336864471436, + "learning_rate": 5.060995713814705e-05, + "loss": 2.1371, + "step": 31730 + }, + { + "epoch": 0.7474918750883143, + "grad_norm": 2.189121961593628, + "learning_rate": 5.0562856200838406e-05, + "loss": 2.0499, + "step": 31740 + }, + { + "epoch": 0.7477273797748575, + "grad_norm": 2.1549293994903564, + "learning_rate": 5.051575526352975e-05, + "loss": 1.9287, + "step": 31750 + }, + { + "epoch": 0.7479628844614008, + "grad_norm": 2.0482614040374756, + "learning_rate": 5.04686543262211e-05, + "loss": 2.0564, + "step": 31760 + }, + { + "epoch": 0.748198389147944, + "grad_norm": 2.111997604370117, + "learning_rate": 5.0421553388912443e-05, + "loss": 2.176, + "step": 31770 + }, + { + "epoch": 0.7484338938344873, + "grad_norm": 2.4631214141845703, + "learning_rate": 5.0374452451603794e-05, + "loss": 1.9805, + "step": 31780 + }, + { + "epoch": 0.7486693985210305, + "grad_norm": 3.002066135406494, + "learning_rate": 5.032735151429514e-05, + "loss": 2.091, + "step": 31790 + }, + { + "epoch": 0.7489049032075739, + "grad_norm": 2.605787992477417, + "learning_rate": 5.028025057698649e-05, + "loss": 1.8293, + "step": 31800 + }, + { + "epoch": 0.7491404078941171, + "grad_norm": 1.7942407131195068, + "learning_rate": 5.023314963967784e-05, + "loss": 2.0559, + "step": 31810 + }, + { + "epoch": 0.7493759125806604, + "grad_norm": 2.3381400108337402, + "learning_rate": 5.018604870236918e-05, + "loss": 2.1534, + "step": 31820 + }, + { + "epoch": 0.7496114172672036, + "grad_norm": 2.1282408237457275, + "learning_rate": 5.013894776506053e-05, + "loss": 1.6971, + "step": 31830 + }, + { + "epoch": 0.7498469219537469, + "grad_norm": 2.5377397537231445, + "learning_rate": 5.0091846827751875e-05, + "loss": 2.1498, + "step": 31840 + }, + { + "epoch": 0.7500824266402901, + "grad_norm": 2.1873764991760254, + "learning_rate": 5.0044745890443225e-05, + "loss": 1.9524, + "step": 31850 + }, + { + "epoch": 0.7503179313268334, + "grad_norm": 1.9060252904891968, + "learning_rate": 4.999764495313457e-05, + "loss": 1.9569, + "step": 31860 + }, + { + "epoch": 0.7505534360133767, + "grad_norm": 1.914854645729065, + "learning_rate": 4.995054401582592e-05, + "loss": 1.9475, + "step": 31870 + }, + { + "epoch": 0.75078894069992, + "grad_norm": 3.2591161727905273, + "learning_rate": 4.990344307851726e-05, + "loss": 2.0404, + "step": 31880 + }, + { + "epoch": 0.7510244453864632, + "grad_norm": 1.9363534450531006, + "learning_rate": 4.985634214120861e-05, + "loss": 1.9256, + "step": 31890 + }, + { + "epoch": 0.7512599500730065, + "grad_norm": 2.1461710929870605, + "learning_rate": 4.9809241203899956e-05, + "loss": 1.9352, + "step": 31900 + }, + { + "epoch": 0.7514954547595497, + "grad_norm": 2.079094648361206, + "learning_rate": 4.9762140266591306e-05, + "loss": 1.9824, + "step": 31910 + }, + { + "epoch": 0.751730959446093, + "grad_norm": 3.0540053844451904, + "learning_rate": 4.9715039329282656e-05, + "loss": 2.2878, + "step": 31920 + }, + { + "epoch": 0.7519664641326362, + "grad_norm": 2.7210094928741455, + "learning_rate": 4.9667938391974e-05, + "loss": 1.8646, + "step": 31930 + }, + { + "epoch": 0.7522019688191796, + "grad_norm": 2.256350517272949, + "learning_rate": 4.962083745466535e-05, + "loss": 1.6932, + "step": 31940 + }, + { + "epoch": 0.7524374735057228, + "grad_norm": 1.946227788925171, + "learning_rate": 4.95737365173567e-05, + "loss": 2.0174, + "step": 31950 + }, + { + "epoch": 0.752672978192266, + "grad_norm": 2.2353408336639404, + "learning_rate": 4.952663558004805e-05, + "loss": 2.0512, + "step": 31960 + }, + { + "epoch": 0.7529084828788093, + "grad_norm": 2.020702362060547, + "learning_rate": 4.9479534642739394e-05, + "loss": 2.1851, + "step": 31970 + }, + { + "epoch": 0.7531439875653525, + "grad_norm": 2.0567731857299805, + "learning_rate": 4.9432433705430744e-05, + "loss": 2.0855, + "step": 31980 + }, + { + "epoch": 0.7533794922518958, + "grad_norm": 2.680443525314331, + "learning_rate": 4.938533276812209e-05, + "loss": 2.0761, + "step": 31990 + }, + { + "epoch": 0.753614996938439, + "grad_norm": 2.444499969482422, + "learning_rate": 4.933823183081344e-05, + "loss": 1.9018, + "step": 32000 + }, + { + "epoch": 0.7538505016249823, + "grad_norm": 2.7612671852111816, + "learning_rate": 4.929113089350478e-05, + "loss": 1.9357, + "step": 32010 + }, + { + "epoch": 0.7540860063115256, + "grad_norm": 2.018686532974243, + "learning_rate": 4.924402995619613e-05, + "loss": 1.9346, + "step": 32020 + }, + { + "epoch": 0.7543215109980689, + "grad_norm": 2.3052027225494385, + "learning_rate": 4.9196929018887475e-05, + "loss": 1.9173, + "step": 32030 + }, + { + "epoch": 0.7545570156846121, + "grad_norm": 2.0256094932556152, + "learning_rate": 4.9149828081578826e-05, + "loss": 1.9119, + "step": 32040 + }, + { + "epoch": 0.7547925203711554, + "grad_norm": 2.7691421508789062, + "learning_rate": 4.910272714427017e-05, + "loss": 1.9702, + "step": 32050 + }, + { + "epoch": 0.7550280250576986, + "grad_norm": 2.235715866088867, + "learning_rate": 4.905562620696152e-05, + "loss": 1.9636, + "step": 32060 + }, + { + "epoch": 0.7552635297442419, + "grad_norm": 2.506202459335327, + "learning_rate": 4.900852526965287e-05, + "loss": 1.9745, + "step": 32070 + }, + { + "epoch": 0.7554990344307851, + "grad_norm": 2.220933198928833, + "learning_rate": 4.896142433234421e-05, + "loss": 2.2018, + "step": 32080 + }, + { + "epoch": 0.7557345391173285, + "grad_norm": 2.0806100368499756, + "learning_rate": 4.891432339503556e-05, + "loss": 2.0028, + "step": 32090 + }, + { + "epoch": 0.7559700438038717, + "grad_norm": 2.027374744415283, + "learning_rate": 4.8867222457726913e-05, + "loss": 2.0422, + "step": 32100 + }, + { + "epoch": 0.756205548490415, + "grad_norm": 2.054565191268921, + "learning_rate": 4.8820121520418264e-05, + "loss": 1.9676, + "step": 32110 + }, + { + "epoch": 0.7564410531769582, + "grad_norm": 2.5844099521636963, + "learning_rate": 4.877302058310961e-05, + "loss": 2.1372, + "step": 32120 + }, + { + "epoch": 0.7566765578635015, + "grad_norm": 2.344665765762329, + "learning_rate": 4.872591964580096e-05, + "loss": 2.2227, + "step": 32130 + }, + { + "epoch": 0.7569120625500447, + "grad_norm": 2.02775502204895, + "learning_rate": 4.86788187084923e-05, + "loss": 1.8766, + "step": 32140 + }, + { + "epoch": 0.757147567236588, + "grad_norm": 1.9430595636367798, + "learning_rate": 4.863171777118365e-05, + "loss": 1.9213, + "step": 32150 + }, + { + "epoch": 0.7573830719231313, + "grad_norm": 1.9600436687469482, + "learning_rate": 4.8584616833874995e-05, + "loss": 1.7373, + "step": 32160 + }, + { + "epoch": 0.7576185766096746, + "grad_norm": 2.5143792629241943, + "learning_rate": 4.8537515896566345e-05, + "loss": 2.1279, + "step": 32170 + }, + { + "epoch": 0.7578540812962178, + "grad_norm": 1.9432998895645142, + "learning_rate": 4.849041495925769e-05, + "loss": 2.0553, + "step": 32180 + }, + { + "epoch": 0.7580895859827611, + "grad_norm": 2.147117853164673, + "learning_rate": 4.844331402194904e-05, + "loss": 1.9338, + "step": 32190 + }, + { + "epoch": 0.7583250906693043, + "grad_norm": 1.7949907779693604, + "learning_rate": 4.839621308464038e-05, + "loss": 2.0411, + "step": 32200 + }, + { + "epoch": 0.7585605953558476, + "grad_norm": 1.945996880531311, + "learning_rate": 4.834911214733173e-05, + "loss": 1.9042, + "step": 32210 + }, + { + "epoch": 0.7587961000423908, + "grad_norm": 3.3498096466064453, + "learning_rate": 4.830201121002308e-05, + "loss": 2.2681, + "step": 32220 + }, + { + "epoch": 0.759031604728934, + "grad_norm": 1.85516357421875, + "learning_rate": 4.825491027271443e-05, + "loss": 1.9829, + "step": 32230 + }, + { + "epoch": 0.7592671094154774, + "grad_norm": 1.8677126169204712, + "learning_rate": 4.8207809335405776e-05, + "loss": 1.8494, + "step": 32240 + }, + { + "epoch": 0.7595026141020206, + "grad_norm": 1.955237865447998, + "learning_rate": 4.8160708398097126e-05, + "loss": 1.9311, + "step": 32250 + }, + { + "epoch": 0.7597381187885639, + "grad_norm": 2.534681558609009, + "learning_rate": 4.811360746078848e-05, + "loss": 2.2401, + "step": 32260 + }, + { + "epoch": 0.7599736234751071, + "grad_norm": 1.8097875118255615, + "learning_rate": 4.806650652347982e-05, + "loss": 1.8716, + "step": 32270 + }, + { + "epoch": 0.7602091281616504, + "grad_norm": 2.7228007316589355, + "learning_rate": 4.801940558617117e-05, + "loss": 1.9849, + "step": 32280 + }, + { + "epoch": 0.7604446328481936, + "grad_norm": 2.104480028152466, + "learning_rate": 4.7972304648862514e-05, + "loss": 2.1219, + "step": 32290 + }, + { + "epoch": 0.760680137534737, + "grad_norm": 2.193178176879883, + "learning_rate": 4.7925203711553864e-05, + "loss": 1.8933, + "step": 32300 + }, + { + "epoch": 0.7609156422212802, + "grad_norm": 2.3160879611968994, + "learning_rate": 4.787810277424521e-05, + "loss": 2.0682, + "step": 32310 + }, + { + "epoch": 0.7611511469078235, + "grad_norm": 1.9633328914642334, + "learning_rate": 4.783100183693656e-05, + "loss": 1.893, + "step": 32320 + }, + { + "epoch": 0.7613866515943667, + "grad_norm": 2.395254611968994, + "learning_rate": 4.77839008996279e-05, + "loss": 1.9148, + "step": 32330 + }, + { + "epoch": 0.76162215628091, + "grad_norm": 2.234740972518921, + "learning_rate": 4.773679996231925e-05, + "loss": 1.9468, + "step": 32340 + }, + { + "epoch": 0.7618576609674532, + "grad_norm": 2.9619855880737305, + "learning_rate": 4.7689699025010595e-05, + "loss": 1.7942, + "step": 32350 + }, + { + "epoch": 0.7620931656539965, + "grad_norm": 2.4581329822540283, + "learning_rate": 4.7642598087701945e-05, + "loss": 1.9489, + "step": 32360 + }, + { + "epoch": 0.7623286703405397, + "grad_norm": 2.132481098175049, + "learning_rate": 4.7595497150393295e-05, + "loss": 1.8993, + "step": 32370 + }, + { + "epoch": 0.7625641750270831, + "grad_norm": 2.9130518436431885, + "learning_rate": 4.7548396213084646e-05, + "loss": 2.1784, + "step": 32380 + }, + { + "epoch": 0.7627996797136263, + "grad_norm": 2.35318922996521, + "learning_rate": 4.750129527577599e-05, + "loss": 2.0111, + "step": 32390 + }, + { + "epoch": 0.7630351844001696, + "grad_norm": 2.9324629306793213, + "learning_rate": 4.745419433846734e-05, + "loss": 1.9419, + "step": 32400 + }, + { + "epoch": 0.7632706890867128, + "grad_norm": 1.9878653287887573, + "learning_rate": 4.740709340115869e-05, + "loss": 1.9623, + "step": 32410 + }, + { + "epoch": 0.7635061937732561, + "grad_norm": 1.8808915615081787, + "learning_rate": 4.735999246385003e-05, + "loss": 2.12, + "step": 32420 + }, + { + "epoch": 0.7637416984597993, + "grad_norm": 1.9779167175292969, + "learning_rate": 4.7312891526541383e-05, + "loss": 1.775, + "step": 32430 + }, + { + "epoch": 0.7639772031463427, + "grad_norm": 2.064002752304077, + "learning_rate": 4.726579058923273e-05, + "loss": 2.0461, + "step": 32440 + }, + { + "epoch": 0.7642127078328859, + "grad_norm": 1.897849440574646, + "learning_rate": 4.721868965192408e-05, + "loss": 1.8057, + "step": 32450 + }, + { + "epoch": 0.7644482125194292, + "grad_norm": 2.096017837524414, + "learning_rate": 4.717158871461542e-05, + "loss": 1.9373, + "step": 32460 + }, + { + "epoch": 0.7646837172059724, + "grad_norm": 2.7307066917419434, + "learning_rate": 4.712448777730677e-05, + "loss": 2.1702, + "step": 32470 + }, + { + "epoch": 0.7649192218925157, + "grad_norm": 1.8949542045593262, + "learning_rate": 4.7077386839998114e-05, + "loss": 1.9492, + "step": 32480 + }, + { + "epoch": 0.7651547265790589, + "grad_norm": 2.4673449993133545, + "learning_rate": 4.7030285902689465e-05, + "loss": 2.3952, + "step": 32490 + }, + { + "epoch": 0.7653902312656022, + "grad_norm": 2.2674577236175537, + "learning_rate": 4.698318496538081e-05, + "loss": 1.9935, + "step": 32500 + }, + { + "epoch": 0.7656257359521454, + "grad_norm": 2.0791029930114746, + "learning_rate": 4.6936084028072165e-05, + "loss": 2.0564, + "step": 32510 + }, + { + "epoch": 0.7658612406386887, + "grad_norm": 2.5843989849090576, + "learning_rate": 4.688898309076351e-05, + "loss": 1.7758, + "step": 32520 + }, + { + "epoch": 0.766096745325232, + "grad_norm": 2.141845226287842, + "learning_rate": 4.684188215345486e-05, + "loss": 1.9268, + "step": 32530 + }, + { + "epoch": 0.7663322500117752, + "grad_norm": 2.0539655685424805, + "learning_rate": 4.67947812161462e-05, + "loss": 1.9727, + "step": 32540 + }, + { + "epoch": 0.7665677546983185, + "grad_norm": 2.356175184249878, + "learning_rate": 4.674768027883755e-05, + "loss": 2.1268, + "step": 32550 + }, + { + "epoch": 0.7668032593848617, + "grad_norm": 2.6625282764434814, + "learning_rate": 4.67005793415289e-05, + "loss": 2.0887, + "step": 32560 + }, + { + "epoch": 0.767038764071405, + "grad_norm": 2.2994375228881836, + "learning_rate": 4.6653478404220246e-05, + "loss": 1.7722, + "step": 32570 + }, + { + "epoch": 0.7672742687579482, + "grad_norm": 2.1485538482666016, + "learning_rate": 4.6606377466911596e-05, + "loss": 1.914, + "step": 32580 + }, + { + "epoch": 0.7675097734444916, + "grad_norm": 3.24705171585083, + "learning_rate": 4.655927652960294e-05, + "loss": 1.9643, + "step": 32590 + }, + { + "epoch": 0.7677452781310348, + "grad_norm": 2.247652292251587, + "learning_rate": 4.651217559229429e-05, + "loss": 2.2326, + "step": 32600 + }, + { + "epoch": 0.7679807828175781, + "grad_norm": 2.2194790840148926, + "learning_rate": 4.6465074654985634e-05, + "loss": 2.0224, + "step": 32610 + }, + { + "epoch": 0.7682162875041213, + "grad_norm": 2.246136426925659, + "learning_rate": 4.6417973717676984e-05, + "loss": 2.1937, + "step": 32620 + }, + { + "epoch": 0.7684517921906646, + "grad_norm": 1.978644847869873, + "learning_rate": 4.637087278036833e-05, + "loss": 2.0933, + "step": 32630 + }, + { + "epoch": 0.7686872968772078, + "grad_norm": 2.822878360748291, + "learning_rate": 4.632377184305968e-05, + "loss": 2.1852, + "step": 32640 + }, + { + "epoch": 0.7689228015637511, + "grad_norm": 1.890211820602417, + "learning_rate": 4.627667090575102e-05, + "loss": 1.9076, + "step": 32650 + }, + { + "epoch": 0.7691583062502944, + "grad_norm": 2.3078582286834717, + "learning_rate": 4.622956996844238e-05, + "loss": 1.9756, + "step": 32660 + }, + { + "epoch": 0.7693938109368377, + "grad_norm": 2.1552579402923584, + "learning_rate": 4.618246903113372e-05, + "loss": 2.0046, + "step": 32670 + }, + { + "epoch": 0.7696293156233809, + "grad_norm": 2.208951711654663, + "learning_rate": 4.613536809382507e-05, + "loss": 2.1688, + "step": 32680 + }, + { + "epoch": 0.7698648203099242, + "grad_norm": 2.6358840465545654, + "learning_rate": 4.6088267156516415e-05, + "loss": 2.0845, + "step": 32690 + }, + { + "epoch": 0.7701003249964674, + "grad_norm": 2.166287422180176, + "learning_rate": 4.6041166219207765e-05, + "loss": 2.0353, + "step": 32700 + }, + { + "epoch": 0.7703358296830107, + "grad_norm": 2.0117709636688232, + "learning_rate": 4.5994065281899116e-05, + "loss": 2.0734, + "step": 32710 + }, + { + "epoch": 0.7705713343695539, + "grad_norm": 2.068957805633545, + "learning_rate": 4.594696434459046e-05, + "loss": 2.0584, + "step": 32720 + }, + { + "epoch": 0.7708068390560973, + "grad_norm": 2.884422540664673, + "learning_rate": 4.589986340728181e-05, + "loss": 2.0943, + "step": 32730 + }, + { + "epoch": 0.7710423437426405, + "grad_norm": 2.284824848175049, + "learning_rate": 4.585276246997315e-05, + "loss": 2.0243, + "step": 32740 + }, + { + "epoch": 0.7712778484291838, + "grad_norm": 2.4009203910827637, + "learning_rate": 4.58056615326645e-05, + "loss": 2.0731, + "step": 32750 + }, + { + "epoch": 0.771513353115727, + "grad_norm": 2.728984832763672, + "learning_rate": 4.5758560595355847e-05, + "loss": 2.1744, + "step": 32760 + }, + { + "epoch": 0.7717488578022703, + "grad_norm": 2.1261510848999023, + "learning_rate": 4.57114596580472e-05, + "loss": 2.0703, + "step": 32770 + }, + { + "epoch": 0.7719843624888135, + "grad_norm": 2.4731087684631348, + "learning_rate": 4.566435872073854e-05, + "loss": 1.8765, + "step": 32780 + }, + { + "epoch": 0.7722198671753568, + "grad_norm": 2.0476648807525635, + "learning_rate": 4.561725778342989e-05, + "loss": 2.0803, + "step": 32790 + }, + { + "epoch": 0.7724553718619, + "grad_norm": 2.126819372177124, + "learning_rate": 4.557015684612124e-05, + "loss": 1.9244, + "step": 32800 + }, + { + "epoch": 0.7726908765484433, + "grad_norm": 1.6341440677642822, + "learning_rate": 4.552305590881259e-05, + "loss": 2.0313, + "step": 32810 + }, + { + "epoch": 0.7729263812349866, + "grad_norm": 2.055776596069336, + "learning_rate": 4.5475954971503935e-05, + "loss": 2.0049, + "step": 32820 + }, + { + "epoch": 0.7731618859215298, + "grad_norm": 2.976670265197754, + "learning_rate": 4.5428854034195285e-05, + "loss": 2.3065, + "step": 32830 + }, + { + "epoch": 0.7733973906080731, + "grad_norm": 2.6509125232696533, + "learning_rate": 4.538175309688663e-05, + "loss": 2.1162, + "step": 32840 + }, + { + "epoch": 0.7736328952946163, + "grad_norm": 2.617938756942749, + "learning_rate": 4.533465215957798e-05, + "loss": 2.0919, + "step": 32850 + }, + { + "epoch": 0.7738683999811596, + "grad_norm": 2.4530789852142334, + "learning_rate": 4.528755122226933e-05, + "loss": 2.009, + "step": 32860 + }, + { + "epoch": 0.7741039046677028, + "grad_norm": 2.889514207839966, + "learning_rate": 4.524045028496067e-05, + "loss": 1.8075, + "step": 32870 + }, + { + "epoch": 0.7743394093542462, + "grad_norm": 2.127507448196411, + "learning_rate": 4.519334934765202e-05, + "loss": 2.0197, + "step": 32880 + }, + { + "epoch": 0.7745749140407894, + "grad_norm": 2.079528570175171, + "learning_rate": 4.5146248410343366e-05, + "loss": 2.0391, + "step": 32890 + }, + { + "epoch": 0.7748104187273327, + "grad_norm": 2.2359161376953125, + "learning_rate": 4.5099147473034716e-05, + "loss": 2.0698, + "step": 32900 + }, + { + "epoch": 0.7750459234138759, + "grad_norm": 1.9062904119491577, + "learning_rate": 4.505204653572606e-05, + "loss": 2.1326, + "step": 32910 + }, + { + "epoch": 0.7752814281004192, + "grad_norm": 2.303953170776367, + "learning_rate": 4.500494559841741e-05, + "loss": 2.0753, + "step": 32920 + }, + { + "epoch": 0.7755169327869624, + "grad_norm": 1.8101952075958252, + "learning_rate": 4.495784466110875e-05, + "loss": 1.8815, + "step": 32930 + }, + { + "epoch": 0.7757524374735058, + "grad_norm": 3.982452630996704, + "learning_rate": 4.491074372380011e-05, + "loss": 1.9328, + "step": 32940 + }, + { + "epoch": 0.775987942160049, + "grad_norm": 2.1649701595306396, + "learning_rate": 4.4863642786491454e-05, + "loss": 1.8076, + "step": 32950 + }, + { + "epoch": 0.7762234468465923, + "grad_norm": 1.8240478038787842, + "learning_rate": 4.4816541849182804e-05, + "loss": 2.0128, + "step": 32960 + }, + { + "epoch": 0.7764589515331355, + "grad_norm": 1.8030163049697876, + "learning_rate": 4.476944091187415e-05, + "loss": 2.0234, + "step": 32970 + }, + { + "epoch": 0.7766944562196788, + "grad_norm": 1.9468859434127808, + "learning_rate": 4.47223399745655e-05, + "loss": 1.9977, + "step": 32980 + }, + { + "epoch": 0.776929960906222, + "grad_norm": 2.0538527965545654, + "learning_rate": 4.467523903725684e-05, + "loss": 2.141, + "step": 32990 + }, + { + "epoch": 0.7771654655927653, + "grad_norm": 2.1472928524017334, + "learning_rate": 4.462813809994819e-05, + "loss": 2.1504, + "step": 33000 + }, + { + "epoch": 0.7774009702793085, + "grad_norm": 2.2194812297821045, + "learning_rate": 4.458103716263954e-05, + "loss": 2.0039, + "step": 33010 + }, + { + "epoch": 0.7776364749658519, + "grad_norm": 4.218422889709473, + "learning_rate": 4.4533936225330885e-05, + "loss": 1.922, + "step": 33020 + }, + { + "epoch": 0.7778719796523951, + "grad_norm": 2.516357421875, + "learning_rate": 4.4486835288022235e-05, + "loss": 2.0226, + "step": 33030 + }, + { + "epoch": 0.7781074843389384, + "grad_norm": 2.656029224395752, + "learning_rate": 4.443973435071358e-05, + "loss": 1.9103, + "step": 33040 + }, + { + "epoch": 0.7783429890254816, + "grad_norm": 2.5791871547698975, + "learning_rate": 4.439263341340493e-05, + "loss": 1.9794, + "step": 33050 + }, + { + "epoch": 0.7785784937120249, + "grad_norm": 2.0614676475524902, + "learning_rate": 4.434553247609627e-05, + "loss": 1.8569, + "step": 33060 + }, + { + "epoch": 0.7788139983985681, + "grad_norm": 3.1220650672912598, + "learning_rate": 4.429843153878762e-05, + "loss": 2.0549, + "step": 33070 + }, + { + "epoch": 0.7790495030851113, + "grad_norm": NaN, + "learning_rate": 4.425133060147897e-05, + "loss": 1.9899, + "step": 33080 + }, + { + "epoch": 0.7792850077716547, + "grad_norm": 2.013601541519165, + "learning_rate": 4.4208939757901184e-05, + "loss": 1.9919, + "step": 33090 + }, + { + "epoch": 0.7795205124581979, + "grad_norm": 2.3044540882110596, + "learning_rate": 4.4161838820592534e-05, + "loss": 2.0695, + "step": 33100 + }, + { + "epoch": 0.7797560171447412, + "grad_norm": 2.1326963901519775, + "learning_rate": 4.411473788328388e-05, + "loss": 2.0311, + "step": 33110 + }, + { + "epoch": 0.7799915218312844, + "grad_norm": 2.112426280975342, + "learning_rate": 4.406763694597523e-05, + "loss": 1.8339, + "step": 33120 + }, + { + "epoch": 0.7802270265178277, + "grad_norm": 1.8452937602996826, + "learning_rate": 4.402053600866657e-05, + "loss": 2.0248, + "step": 33130 + }, + { + "epoch": 0.7804625312043709, + "grad_norm": 2.265781879425049, + "learning_rate": 4.397343507135792e-05, + "loss": 1.9383, + "step": 33140 + }, + { + "epoch": 0.7806980358909142, + "grad_norm": 2.641981601715088, + "learning_rate": 4.3926334134049265e-05, + "loss": 2.059, + "step": 33150 + }, + { + "epoch": 0.7809335405774575, + "grad_norm": 2.4268856048583984, + "learning_rate": 4.3879233196740616e-05, + "loss": 2.2207, + "step": 33160 + }, + { + "epoch": 0.7811690452640008, + "grad_norm": 2.2304317951202393, + "learning_rate": 4.3832132259431966e-05, + "loss": 2.1449, + "step": 33170 + }, + { + "epoch": 0.781404549950544, + "grad_norm": 2.062983274459839, + "learning_rate": 4.378503132212331e-05, + "loss": 1.8112, + "step": 33180 + }, + { + "epoch": 0.7816400546370873, + "grad_norm": 1.8598015308380127, + "learning_rate": 4.373793038481466e-05, + "loss": 1.8849, + "step": 33190 + }, + { + "epoch": 0.7818755593236305, + "grad_norm": 1.722225546836853, + "learning_rate": 4.369082944750601e-05, + "loss": 2.0335, + "step": 33200 + }, + { + "epoch": 0.7821110640101738, + "grad_norm": 4.0551958084106445, + "learning_rate": 4.364372851019736e-05, + "loss": 2.0743, + "step": 33210 + }, + { + "epoch": 0.782346568696717, + "grad_norm": 1.9605753421783447, + "learning_rate": 4.360133766661957e-05, + "loss": 2.0431, + "step": 33220 + }, + { + "epoch": 0.7825820733832604, + "grad_norm": 2.312108278274536, + "learning_rate": 4.3554236729310915e-05, + "loss": 1.9877, + "step": 33230 + }, + { + "epoch": 0.7828175780698036, + "grad_norm": 2.241905927658081, + "learning_rate": 4.3507135792002265e-05, + "loss": 2.0147, + "step": 33240 + }, + { + "epoch": 0.7830530827563469, + "grad_norm": 2.4801273345947266, + "learning_rate": 4.346003485469361e-05, + "loss": 1.9757, + "step": 33250 + }, + { + "epoch": 0.7832885874428901, + "grad_norm": 2.7878518104553223, + "learning_rate": 4.341293391738496e-05, + "loss": 2.0236, + "step": 33260 + }, + { + "epoch": 0.7835240921294334, + "grad_norm": 1.9572044610977173, + "learning_rate": 4.33658329800763e-05, + "loss": 2.1393, + "step": 33270 + }, + { + "epoch": 0.7837595968159766, + "grad_norm": 1.8226878643035889, + "learning_rate": 4.331873204276765e-05, + "loss": 1.9322, + "step": 33280 + }, + { + "epoch": 0.7839951015025199, + "grad_norm": 1.8301420211791992, + "learning_rate": 4.3271631105458996e-05, + "loss": 2.0656, + "step": 33290 + }, + { + "epoch": 0.7842306061890632, + "grad_norm": 3.304522752761841, + "learning_rate": 4.3224530168150346e-05, + "loss": 1.7974, + "step": 33300 + }, + { + "epoch": 0.7844661108756065, + "grad_norm": 2.201037645339966, + "learning_rate": 4.3177429230841696e-05, + "loss": 1.9841, + "step": 33310 + }, + { + "epoch": 0.7847016155621497, + "grad_norm": 1.9371416568756104, + "learning_rate": 4.3130328293533047e-05, + "loss": 2.1516, + "step": 33320 + }, + { + "epoch": 0.784937120248693, + "grad_norm": 2.4234302043914795, + "learning_rate": 4.308322735622439e-05, + "loss": 2.0129, + "step": 33330 + }, + { + "epoch": 0.7851726249352362, + "grad_norm": 1.8257781267166138, + "learning_rate": 4.303612641891574e-05, + "loss": 2.0591, + "step": 33340 + }, + { + "epoch": 0.7854081296217795, + "grad_norm": 2.209580659866333, + "learning_rate": 4.2989025481607084e-05, + "loss": 2.033, + "step": 33350 + }, + { + "epoch": 0.7856436343083227, + "grad_norm": 2.0317904949188232, + "learning_rate": 4.2941924544298434e-05, + "loss": 2.0874, + "step": 33360 + }, + { + "epoch": 0.785879138994866, + "grad_norm": 2.2737340927124023, + "learning_rate": 4.2894823606989784e-05, + "loss": 2.0869, + "step": 33370 + }, + { + "epoch": 0.7861146436814093, + "grad_norm": 1.857039213180542, + "learning_rate": 4.284772266968113e-05, + "loss": 2.1403, + "step": 33380 + }, + { + "epoch": 0.7863501483679525, + "grad_norm": 2.16298508644104, + "learning_rate": 4.280062173237248e-05, + "loss": 1.8499, + "step": 33390 + }, + { + "epoch": 0.7865856530544958, + "grad_norm": 2.2282299995422363, + "learning_rate": 4.275352079506382e-05, + "loss": 2.1735, + "step": 33400 + }, + { + "epoch": 0.786821157741039, + "grad_norm": 1.993444800376892, + "learning_rate": 4.270641985775517e-05, + "loss": 2.1354, + "step": 33410 + }, + { + "epoch": 0.7870566624275823, + "grad_norm": 2.0155577659606934, + "learning_rate": 4.2659318920446515e-05, + "loss": 1.7992, + "step": 33420 + }, + { + "epoch": 0.7872921671141255, + "grad_norm": 2.0637929439544678, + "learning_rate": 4.2612217983137865e-05, + "loss": 1.9629, + "step": 33430 + }, + { + "epoch": 0.7875276718006688, + "grad_norm": 2.51766300201416, + "learning_rate": 4.256511704582921e-05, + "loss": 1.8442, + "step": 33440 + }, + { + "epoch": 0.7877631764872121, + "grad_norm": 2.253438949584961, + "learning_rate": 4.2518016108520566e-05, + "loss": 1.9771, + "step": 33450 + }, + { + "epoch": 0.7879986811737554, + "grad_norm": 2.1923258304595947, + "learning_rate": 4.247091517121191e-05, + "loss": 2.2046, + "step": 33460 + }, + { + "epoch": 0.7882341858602986, + "grad_norm": 1.9330971240997314, + "learning_rate": 4.242381423390326e-05, + "loss": 1.9438, + "step": 33470 + }, + { + "epoch": 0.7884696905468419, + "grad_norm": 1.7735408544540405, + "learning_rate": 4.23767132965946e-05, + "loss": 2.0522, + "step": 33480 + }, + { + "epoch": 0.7887051952333851, + "grad_norm": 2.54807448387146, + "learning_rate": 4.232961235928595e-05, + "loss": 1.9661, + "step": 33490 + }, + { + "epoch": 0.7889406999199284, + "grad_norm": 1.8328182697296143, + "learning_rate": 4.22825114219773e-05, + "loss": 2.1688, + "step": 33500 + }, + { + "epoch": 0.7891762046064716, + "grad_norm": 2.4814724922180176, + "learning_rate": 4.223541048466865e-05, + "loss": 2.03, + "step": 33510 + }, + { + "epoch": 0.789411709293015, + "grad_norm": 2.5519399642944336, + "learning_rate": 4.218830954736e-05, + "loss": 2.1314, + "step": 33520 + }, + { + "epoch": 0.7896472139795582, + "grad_norm": 2.523111343383789, + "learning_rate": 4.214120861005134e-05, + "loss": 1.9807, + "step": 33530 + }, + { + "epoch": 0.7898827186661015, + "grad_norm": 2.3048107624053955, + "learning_rate": 4.209410767274269e-05, + "loss": 2.0443, + "step": 33540 + }, + { + "epoch": 0.7901182233526447, + "grad_norm": 2.162158966064453, + "learning_rate": 4.2047006735434034e-05, + "loss": 2.0237, + "step": 33550 + }, + { + "epoch": 0.790353728039188, + "grad_norm": 2.4160468578338623, + "learning_rate": 4.1999905798125385e-05, + "loss": 2.1968, + "step": 33560 + }, + { + "epoch": 0.7905892327257312, + "grad_norm": 1.7282012701034546, + "learning_rate": 4.195280486081673e-05, + "loss": 1.9573, + "step": 33570 + }, + { + "epoch": 0.7908247374122745, + "grad_norm": 2.083235025405884, + "learning_rate": 4.190570392350808e-05, + "loss": 2.0517, + "step": 33580 + }, + { + "epoch": 0.7910602420988178, + "grad_norm": 2.5902810096740723, + "learning_rate": 4.185860298619943e-05, + "loss": 2.072, + "step": 33590 + }, + { + "epoch": 0.7912957467853611, + "grad_norm": 2.1250321865081787, + "learning_rate": 4.181150204889078e-05, + "loss": 2.1724, + "step": 33600 + }, + { + "epoch": 0.7915312514719043, + "grad_norm": 2.0615923404693604, + "learning_rate": 4.176440111158212e-05, + "loss": 2.0236, + "step": 33610 + }, + { + "epoch": 0.7917667561584476, + "grad_norm": 2.1386961936950684, + "learning_rate": 4.171730017427347e-05, + "loss": 2.1057, + "step": 33620 + }, + { + "epoch": 0.7920022608449908, + "grad_norm": 2.5183324813842773, + "learning_rate": 4.1670199236964816e-05, + "loss": 2.0803, + "step": 33630 + }, + { + "epoch": 0.7922377655315341, + "grad_norm": 2.7656891345977783, + "learning_rate": 4.1623098299656166e-05, + "loss": 2.0875, + "step": 33640 + }, + { + "epoch": 0.7924732702180773, + "grad_norm": 1.7094897031784058, + "learning_rate": 4.157599736234751e-05, + "loss": 2.0225, + "step": 33650 + }, + { + "epoch": 0.7927087749046206, + "grad_norm": 2.0408098697662354, + "learning_rate": 4.152889642503886e-05, + "loss": 1.9889, + "step": 33660 + }, + { + "epoch": 0.7929442795911639, + "grad_norm": 2.6148180961608887, + "learning_rate": 4.148179548773021e-05, + "loss": 2.3067, + "step": 33670 + }, + { + "epoch": 0.7931797842777071, + "grad_norm": 1.6949024200439453, + "learning_rate": 4.1434694550421554e-05, + "loss": 2.2897, + "step": 33680 + }, + { + "epoch": 0.7934152889642504, + "grad_norm": 2.3153419494628906, + "learning_rate": 4.1387593613112904e-05, + "loss": 1.8178, + "step": 33690 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 2.5159363746643066, + "learning_rate": 4.134049267580425e-05, + "loss": 2.291, + "step": 33700 + }, + { + "epoch": 0.7938862983373369, + "grad_norm": 1.6296318769454956, + "learning_rate": 4.12933917384956e-05, + "loss": 1.9528, + "step": 33710 + }, + { + "epoch": 0.7941218030238801, + "grad_norm": 2.256610870361328, + "learning_rate": 4.124629080118694e-05, + "loss": 2.1584, + "step": 33720 + }, + { + "epoch": 0.7943573077104235, + "grad_norm": 1.8672707080841064, + "learning_rate": 4.11991898638783e-05, + "loss": 2.0653, + "step": 33730 + }, + { + "epoch": 0.7945928123969667, + "grad_norm": 3.621340751647949, + "learning_rate": 4.115208892656964e-05, + "loss": 2.2101, + "step": 33740 + }, + { + "epoch": 0.79482831708351, + "grad_norm": 2.5111987590789795, + "learning_rate": 4.110498798926099e-05, + "loss": 2.0825, + "step": 33750 + }, + { + "epoch": 0.7950638217700532, + "grad_norm": 2.316218614578247, + "learning_rate": 4.1057887051952335e-05, + "loss": 1.979, + "step": 33760 + }, + { + "epoch": 0.7952993264565965, + "grad_norm": 3.524585247039795, + "learning_rate": 4.1010786114643686e-05, + "loss": 1.9178, + "step": 33770 + }, + { + "epoch": 0.7955348311431397, + "grad_norm": 2.1990773677825928, + "learning_rate": 4.096368517733503e-05, + "loss": 2.0901, + "step": 33780 + }, + { + "epoch": 0.795770335829683, + "grad_norm": 3.369415760040283, + "learning_rate": 4.091658424002638e-05, + "loss": 1.7894, + "step": 33790 + }, + { + "epoch": 0.7960058405162262, + "grad_norm": 1.9027162790298462, + "learning_rate": 4.086948330271772e-05, + "loss": 2.0495, + "step": 33800 + }, + { + "epoch": 0.7962413452027696, + "grad_norm": 2.3677480220794678, + "learning_rate": 4.082238236540907e-05, + "loss": 2.124, + "step": 33810 + }, + { + "epoch": 0.7964768498893128, + "grad_norm": 2.0192174911499023, + "learning_rate": 4.077528142810042e-05, + "loss": 1.9715, + "step": 33820 + }, + { + "epoch": 0.7967123545758561, + "grad_norm": 2.7535741329193115, + "learning_rate": 4.072818049079177e-05, + "loss": 1.9513, + "step": 33830 + }, + { + "epoch": 0.7969478592623993, + "grad_norm": 1.9463825225830078, + "learning_rate": 4.068107955348312e-05, + "loss": 1.8706, + "step": 33840 + }, + { + "epoch": 0.7971833639489426, + "grad_norm": 2.1753220558166504, + "learning_rate": 4.063397861617446e-05, + "loss": 2.2774, + "step": 33850 + }, + { + "epoch": 0.7974188686354858, + "grad_norm": 1.5415219068527222, + "learning_rate": 4.058687767886581e-05, + "loss": 2.0952, + "step": 33860 + }, + { + "epoch": 0.7976543733220292, + "grad_norm": 2.0215823650360107, + "learning_rate": 4.053977674155716e-05, + "loss": 1.9228, + "step": 33870 + }, + { + "epoch": 0.7978898780085724, + "grad_norm": 1.6013972759246826, + "learning_rate": 4.049267580424851e-05, + "loss": 2.037, + "step": 33880 + }, + { + "epoch": 0.7981253826951157, + "grad_norm": 2.624075174331665, + "learning_rate": 4.0445574866939855e-05, + "loss": 1.9571, + "step": 33890 + }, + { + "epoch": 0.7983608873816589, + "grad_norm": 2.599543571472168, + "learning_rate": 4.0398473929631205e-05, + "loss": 2.0544, + "step": 33900 + }, + { + "epoch": 0.7985963920682022, + "grad_norm": 1.950036883354187, + "learning_rate": 4.035137299232255e-05, + "loss": 2.0248, + "step": 33910 + }, + { + "epoch": 0.7988318967547454, + "grad_norm": 2.2384231090545654, + "learning_rate": 4.03042720550139e-05, + "loss": 2.0692, + "step": 33920 + }, + { + "epoch": 0.7990674014412886, + "grad_norm": 1.6910631656646729, + "learning_rate": 4.025717111770524e-05, + "loss": 1.9093, + "step": 33930 + }, + { + "epoch": 0.799302906127832, + "grad_norm": 2.5212926864624023, + "learning_rate": 4.021007018039659e-05, + "loss": 1.9333, + "step": 33940 + }, + { + "epoch": 0.7995384108143752, + "grad_norm": 2.185635566711426, + "learning_rate": 4.0162969243087936e-05, + "loss": 2.2586, + "step": 33950 + }, + { + "epoch": 0.7997739155009185, + "grad_norm": 3.235745429992676, + "learning_rate": 4.0115868305779286e-05, + "loss": 2.0323, + "step": 33960 + }, + { + "epoch": 0.8000094201874617, + "grad_norm": 1.8148341178894043, + "learning_rate": 4.0068767368470636e-05, + "loss": 2.091, + "step": 33970 + }, + { + "epoch": 0.800244924874005, + "grad_norm": 2.3433852195739746, + "learning_rate": 4.002166643116198e-05, + "loss": 1.7543, + "step": 33980 + }, + { + "epoch": 0.8004804295605482, + "grad_norm": 1.957887053489685, + "learning_rate": 3.997456549385333e-05, + "loss": 1.9187, + "step": 33990 + }, + { + "epoch": 0.8007159342470915, + "grad_norm": 2.367794990539551, + "learning_rate": 3.9927464556544673e-05, + "loss": 2.0184, + "step": 34000 + }, + { + "epoch": 0.8009514389336347, + "grad_norm": 2.2443687915802, + "learning_rate": 3.9880363619236024e-05, + "loss": 1.9743, + "step": 34010 + }, + { + "epoch": 0.8011869436201781, + "grad_norm": 1.9915101528167725, + "learning_rate": 3.9833262681927374e-05, + "loss": 1.9113, + "step": 34020 + }, + { + "epoch": 0.8014224483067213, + "grad_norm": 2.523921489715576, + "learning_rate": 3.9786161744618724e-05, + "loss": 1.9004, + "step": 34030 + }, + { + "epoch": 0.8016579529932646, + "grad_norm": 2.2113091945648193, + "learning_rate": 3.973906080731007e-05, + "loss": 2.2037, + "step": 34040 + }, + { + "epoch": 0.8018934576798078, + "grad_norm": 1.9317169189453125, + "learning_rate": 3.969195987000142e-05, + "loss": 2.1463, + "step": 34050 + }, + { + "epoch": 0.8021289623663511, + "grad_norm": 1.804018259048462, + "learning_rate": 3.964485893269276e-05, + "loss": 2.1225, + "step": 34060 + }, + { + "epoch": 0.8023644670528943, + "grad_norm": 1.9248290061950684, + "learning_rate": 3.959775799538411e-05, + "loss": 2.0875, + "step": 34070 + }, + { + "epoch": 0.8025999717394376, + "grad_norm": 2.0950820446014404, + "learning_rate": 3.9550657058075455e-05, + "loss": 2.1594, + "step": 34080 + }, + { + "epoch": 0.8028354764259809, + "grad_norm": 2.13773775100708, + "learning_rate": 3.9503556120766805e-05, + "loss": 1.6516, + "step": 34090 + }, + { + "epoch": 0.8030709811125242, + "grad_norm": 1.8024755716323853, + "learning_rate": 3.945645518345815e-05, + "loss": 1.9598, + "step": 34100 + }, + { + "epoch": 0.8033064857990674, + "grad_norm": 2.8858230113983154, + "learning_rate": 3.94093542461495e-05, + "loss": 1.8878, + "step": 34110 + }, + { + "epoch": 0.8035419904856107, + "grad_norm": 2.173196315765381, + "learning_rate": 3.936225330884085e-05, + "loss": 1.8901, + "step": 34120 + }, + { + "epoch": 0.8037774951721539, + "grad_norm": 2.5591368675231934, + "learning_rate": 3.931515237153219e-05, + "loss": 1.82, + "step": 34130 + }, + { + "epoch": 0.8040129998586972, + "grad_norm": 2.587899923324585, + "learning_rate": 3.926805143422354e-05, + "loss": 2.0842, + "step": 34140 + }, + { + "epoch": 0.8042485045452404, + "grad_norm": 1.9660160541534424, + "learning_rate": 3.9220950496914886e-05, + "loss": 1.9926, + "step": 34150 + }, + { + "epoch": 0.8044840092317838, + "grad_norm": 2.291707754135132, + "learning_rate": 3.9173849559606243e-05, + "loss": 1.9393, + "step": 34160 + }, + { + "epoch": 0.804719513918327, + "grad_norm": 2.452038288116455, + "learning_rate": 3.912674862229759e-05, + "loss": 2.0658, + "step": 34170 + }, + { + "epoch": 0.8049550186048703, + "grad_norm": 1.9962801933288574, + "learning_rate": 3.907964768498894e-05, + "loss": 2.0453, + "step": 34180 + }, + { + "epoch": 0.8051905232914135, + "grad_norm": 2.2015650272369385, + "learning_rate": 3.903254674768028e-05, + "loss": 2.1626, + "step": 34190 + }, + { + "epoch": 0.8054260279779568, + "grad_norm": 2.1929237842559814, + "learning_rate": 3.898544581037163e-05, + "loss": 2.1904, + "step": 34200 + }, + { + "epoch": 0.8056615326645, + "grad_norm": 2.3259692192077637, + "learning_rate": 3.8938344873062974e-05, + "loss": 1.9968, + "step": 34210 + }, + { + "epoch": 0.8058970373510432, + "grad_norm": 2.7438056468963623, + "learning_rate": 3.8891243935754325e-05, + "loss": 2.2442, + "step": 34220 + }, + { + "epoch": 0.8061325420375866, + "grad_norm": 2.609450340270996, + "learning_rate": 3.884414299844567e-05, + "loss": 1.9798, + "step": 34230 + }, + { + "epoch": 0.8063680467241298, + "grad_norm": 2.7057836055755615, + "learning_rate": 3.879704206113702e-05, + "loss": 1.6244, + "step": 34240 + }, + { + "epoch": 0.8066035514106731, + "grad_norm": 2.6504015922546387, + "learning_rate": 3.874994112382836e-05, + "loss": 2.1608, + "step": 34250 + }, + { + "epoch": 0.8068390560972163, + "grad_norm": 2.614208936691284, + "learning_rate": 3.870284018651971e-05, + "loss": 2.085, + "step": 34260 + }, + { + "epoch": 0.8070745607837596, + "grad_norm": 2.1055519580841064, + "learning_rate": 3.865573924921106e-05, + "loss": 2.0212, + "step": 34270 + }, + { + "epoch": 0.8073100654703028, + "grad_norm": 2.314775228500366, + "learning_rate": 3.8608638311902406e-05, + "loss": 2.0582, + "step": 34280 + }, + { + "epoch": 0.8075455701568461, + "grad_norm": 2.320479393005371, + "learning_rate": 3.8561537374593756e-05, + "loss": 1.8997, + "step": 34290 + }, + { + "epoch": 0.8077810748433893, + "grad_norm": 2.615222692489624, + "learning_rate": 3.8514436437285106e-05, + "loss": 2.1215, + "step": 34300 + }, + { + "epoch": 0.8080165795299327, + "grad_norm": 1.9585306644439697, + "learning_rate": 3.8467335499976456e-05, + "loss": 1.8919, + "step": 34310 + }, + { + "epoch": 0.8082520842164759, + "grad_norm": 1.8004975318908691, + "learning_rate": 3.84202345626678e-05, + "loss": 2.1203, + "step": 34320 + }, + { + "epoch": 0.8084875889030192, + "grad_norm": 1.973084568977356, + "learning_rate": 3.837313362535915e-05, + "loss": 2.0844, + "step": 34330 + }, + { + "epoch": 0.8087230935895624, + "grad_norm": 2.2893195152282715, + "learning_rate": 3.8326032688050494e-05, + "loss": 1.9385, + "step": 34340 + }, + { + "epoch": 0.8089585982761057, + "grad_norm": 2.5790085792541504, + "learning_rate": 3.8278931750741844e-05, + "loss": 1.9747, + "step": 34350 + }, + { + "epoch": 0.8091941029626489, + "grad_norm": 2.1595449447631836, + "learning_rate": 3.823183081343319e-05, + "loss": 2.1628, + "step": 34360 + }, + { + "epoch": 0.8094296076491923, + "grad_norm": 2.5433499813079834, + "learning_rate": 3.818472987612454e-05, + "loss": 1.9291, + "step": 34370 + }, + { + "epoch": 0.8096651123357355, + "grad_norm": 2.732644557952881, + "learning_rate": 3.813762893881588e-05, + "loss": 2.1055, + "step": 34380 + }, + { + "epoch": 0.8099006170222788, + "grad_norm": 1.8708646297454834, + "learning_rate": 3.809052800150723e-05, + "loss": 1.9993, + "step": 34390 + }, + { + "epoch": 0.810136121708822, + "grad_norm": 1.994516134262085, + "learning_rate": 3.8043427064198575e-05, + "loss": 2.0301, + "step": 34400 + }, + { + "epoch": 0.8103716263953653, + "grad_norm": 2.354677677154541, + "learning_rate": 3.7996326126889925e-05, + "loss": 2.0536, + "step": 34410 + }, + { + "epoch": 0.8106071310819085, + "grad_norm": 1.9902782440185547, + "learning_rate": 3.7949225189581275e-05, + "loss": 2.0342, + "step": 34420 + }, + { + "epoch": 0.8108426357684518, + "grad_norm": 2.1519672870635986, + "learning_rate": 3.790212425227262e-05, + "loss": 2.0471, + "step": 34430 + }, + { + "epoch": 0.811078140454995, + "grad_norm": 2.525325059890747, + "learning_rate": 3.785502331496397e-05, + "loss": 1.9082, + "step": 34440 + }, + { + "epoch": 0.8113136451415384, + "grad_norm": 1.9717092514038086, + "learning_rate": 3.780792237765532e-05, + "loss": 2.0308, + "step": 34450 + }, + { + "epoch": 0.8115491498280816, + "grad_norm": 2.371563196182251, + "learning_rate": 3.776082144034667e-05, + "loss": 2.2107, + "step": 34460 + }, + { + "epoch": 0.8117846545146249, + "grad_norm": 2.0133941173553467, + "learning_rate": 3.771372050303801e-05, + "loss": 2.2433, + "step": 34470 + }, + { + "epoch": 0.8120201592011681, + "grad_norm": 2.075589418411255, + "learning_rate": 3.766661956572936e-05, + "loss": 2.0989, + "step": 34480 + }, + { + "epoch": 0.8122556638877113, + "grad_norm": 2.919072389602661, + "learning_rate": 3.761951862842071e-05, + "loss": 2.1238, + "step": 34490 + }, + { + "epoch": 0.8124911685742546, + "grad_norm": 2.5039098262786865, + "learning_rate": 3.757241769111206e-05, + "loss": 2.1452, + "step": 34500 + }, + { + "epoch": 0.8127266732607978, + "grad_norm": 2.3447000980377197, + "learning_rate": 3.75253167538034e-05, + "loss": 2.1173, + "step": 34510 + }, + { + "epoch": 0.8129621779473412, + "grad_norm": 2.079946279525757, + "learning_rate": 3.747821581649475e-05, + "loss": 1.7781, + "step": 34520 + }, + { + "epoch": 0.8131976826338844, + "grad_norm": 2.0972554683685303, + "learning_rate": 3.7431114879186094e-05, + "loss": 1.8383, + "step": 34530 + }, + { + "epoch": 0.8134331873204277, + "grad_norm": 1.8633981943130493, + "learning_rate": 3.7384013941877444e-05, + "loss": 1.967, + "step": 34540 + }, + { + "epoch": 0.8136686920069709, + "grad_norm": 1.938206434249878, + "learning_rate": 3.733691300456879e-05, + "loss": 2.0227, + "step": 34550 + }, + { + "epoch": 0.8139041966935142, + "grad_norm": 2.096994400024414, + "learning_rate": 3.728981206726014e-05, + "loss": 2.109, + "step": 34560 + }, + { + "epoch": 0.8141397013800574, + "grad_norm": 2.675692319869995, + "learning_rate": 3.724271112995149e-05, + "loss": 2.0283, + "step": 34570 + }, + { + "epoch": 0.8143752060666007, + "grad_norm": 2.3348910808563232, + "learning_rate": 3.719561019264284e-05, + "loss": 2.0057, + "step": 34580 + }, + { + "epoch": 0.814610710753144, + "grad_norm": 1.5937950611114502, + "learning_rate": 3.714850925533418e-05, + "loss": 1.9629, + "step": 34590 + }, + { + "epoch": 0.8148462154396873, + "grad_norm": 2.59680438041687, + "learning_rate": 3.710140831802553e-05, + "loss": 1.8433, + "step": 34600 + }, + { + "epoch": 0.8150817201262305, + "grad_norm": 2.0707480907440186, + "learning_rate": 3.705430738071688e-05, + "loss": 1.9767, + "step": 34610 + }, + { + "epoch": 0.8153172248127738, + "grad_norm": 2.1386687755584717, + "learning_rate": 3.7007206443408226e-05, + "loss": 1.9353, + "step": 34620 + }, + { + "epoch": 0.815552729499317, + "grad_norm": 2.207897663116455, + "learning_rate": 3.6960105506099576e-05, + "loss": 2.0739, + "step": 34630 + }, + { + "epoch": 0.8157882341858603, + "grad_norm": 1.8906501531600952, + "learning_rate": 3.691300456879092e-05, + "loss": 2.0127, + "step": 34640 + }, + { + "epoch": 0.8160237388724035, + "grad_norm": 2.1605560779571533, + "learning_rate": 3.686590363148227e-05, + "loss": 1.8724, + "step": 34650 + }, + { + "epoch": 0.8162592435589469, + "grad_norm": 2.182072401046753, + "learning_rate": 3.681880269417361e-05, + "loss": 1.968, + "step": 34660 + }, + { + "epoch": 0.8164947482454901, + "grad_norm": 2.064140796661377, + "learning_rate": 3.6771701756864964e-05, + "loss": 2.0005, + "step": 34670 + }, + { + "epoch": 0.8167302529320334, + "grad_norm": 2.037813186645508, + "learning_rate": 3.672460081955631e-05, + "loss": 2.0115, + "step": 34680 + }, + { + "epoch": 0.8169657576185766, + "grad_norm": 2.205673933029175, + "learning_rate": 3.667749988224766e-05, + "loss": 1.8754, + "step": 34690 + }, + { + "epoch": 0.8172012623051199, + "grad_norm": 2.1787984371185303, + "learning_rate": 3.6630398944939e-05, + "loss": 2.0636, + "step": 34700 + }, + { + "epoch": 0.8174367669916631, + "grad_norm": 2.24696946144104, + "learning_rate": 3.658329800763035e-05, + "loss": 2.071, + "step": 34710 + }, + { + "epoch": 0.8176722716782064, + "grad_norm": 1.690644383430481, + "learning_rate": 3.65361970703217e-05, + "loss": 1.9747, + "step": 34720 + }, + { + "epoch": 0.8179077763647497, + "grad_norm": 2.699493646621704, + "learning_rate": 3.648909613301305e-05, + "loss": 2.1167, + "step": 34730 + }, + { + "epoch": 0.818143281051293, + "grad_norm": 2.260894536972046, + "learning_rate": 3.6441995195704395e-05, + "loss": 1.8883, + "step": 34740 + }, + { + "epoch": 0.8183787857378362, + "grad_norm": 1.964855670928955, + "learning_rate": 3.6394894258395745e-05, + "loss": 2.0409, + "step": 34750 + }, + { + "epoch": 0.8186142904243795, + "grad_norm": 2.360180616378784, + "learning_rate": 3.6347793321087095e-05, + "loss": 1.9799, + "step": 34760 + }, + { + "epoch": 0.8188497951109227, + "grad_norm": 2.9807870388031006, + "learning_rate": 3.630069238377844e-05, + "loss": 2.2196, + "step": 34770 + }, + { + "epoch": 0.8190852997974659, + "grad_norm": 2.496344566345215, + "learning_rate": 3.625359144646979e-05, + "loss": 1.8034, + "step": 34780 + }, + { + "epoch": 0.8193208044840092, + "grad_norm": 2.7705297470092773, + "learning_rate": 3.620649050916113e-05, + "loss": 1.8999, + "step": 34790 + }, + { + "epoch": 0.8195563091705524, + "grad_norm": 2.4351303577423096, + "learning_rate": 3.615938957185248e-05, + "loss": 1.8937, + "step": 34800 + }, + { + "epoch": 0.8197918138570958, + "grad_norm": 2.3407814502716064, + "learning_rate": 3.6112288634543826e-05, + "loss": 2.2182, + "step": 34810 + }, + { + "epoch": 0.820027318543639, + "grad_norm": 1.9275591373443604, + "learning_rate": 3.606518769723518e-05, + "loss": 2.0223, + "step": 34820 + }, + { + "epoch": 0.8202628232301823, + "grad_norm": 2.0065784454345703, + "learning_rate": 3.601808675992652e-05, + "loss": 2.2715, + "step": 34830 + }, + { + "epoch": 0.8204983279167255, + "grad_norm": 2.351534366607666, + "learning_rate": 3.597098582261787e-05, + "loss": 2.1069, + "step": 34840 + }, + { + "epoch": 0.8207338326032688, + "grad_norm": 2.254944086074829, + "learning_rate": 3.5923884885309214e-05, + "loss": 2.1348, + "step": 34850 + }, + { + "epoch": 0.820969337289812, + "grad_norm": 4.640080451965332, + "learning_rate": 3.587678394800057e-05, + "loss": 1.981, + "step": 34860 + }, + { + "epoch": 0.8212048419763553, + "grad_norm": 1.7675470113754272, + "learning_rate": 3.5829683010691914e-05, + "loss": 1.9438, + "step": 34870 + }, + { + "epoch": 0.8214403466628986, + "grad_norm": 1.8417401313781738, + "learning_rate": 3.5782582073383265e-05, + "loss": 1.9817, + "step": 34880 + }, + { + "epoch": 0.8216758513494419, + "grad_norm": 2.11193585395813, + "learning_rate": 3.573548113607461e-05, + "loss": 2.0047, + "step": 34890 + }, + { + "epoch": 0.8219113560359851, + "grad_norm": 1.9333069324493408, + "learning_rate": 3.568838019876596e-05, + "loss": 1.9611, + "step": 34900 + }, + { + "epoch": 0.8221468607225284, + "grad_norm": 1.8805556297302246, + "learning_rate": 3.564127926145731e-05, + "loss": 1.9072, + "step": 34910 + }, + { + "epoch": 0.8223823654090716, + "grad_norm": 2.1735150814056396, + "learning_rate": 3.559417832414865e-05, + "loss": 1.8782, + "step": 34920 + }, + { + "epoch": 0.8226178700956149, + "grad_norm": 2.4724745750427246, + "learning_rate": 3.554707738684e-05, + "loss": 2.0225, + "step": 34930 + }, + { + "epoch": 0.8228533747821581, + "grad_norm": 2.609894037246704, + "learning_rate": 3.5499976449531346e-05, + "loss": 2.1728, + "step": 34940 + }, + { + "epoch": 0.8230888794687015, + "grad_norm": 2.194990634918213, + "learning_rate": 3.5452875512222696e-05, + "loss": 1.8967, + "step": 34950 + }, + { + "epoch": 0.8233243841552447, + "grad_norm": 1.8386231660842896, + "learning_rate": 3.540577457491404e-05, + "loss": 2.0285, + "step": 34960 + }, + { + "epoch": 0.823559888841788, + "grad_norm": 2.252108573913574, + "learning_rate": 3.535867363760539e-05, + "loss": 2.0548, + "step": 34970 + }, + { + "epoch": 0.8237953935283312, + "grad_norm": 1.995373249053955, + "learning_rate": 3.531157270029673e-05, + "loss": 2.0336, + "step": 34980 + }, + { + "epoch": 0.8240308982148745, + "grad_norm": 2.212472438812256, + "learning_rate": 3.526447176298808e-05, + "loss": 2.0237, + "step": 34990 + }, + { + "epoch": 0.8242664029014177, + "grad_norm": 2.2565536499023438, + "learning_rate": 3.5217370825679434e-05, + "loss": 2.2211, + "step": 35000 + }, + { + "epoch": 0.824501907587961, + "grad_norm": 2.493603229522705, + "learning_rate": 3.5170269888370784e-05, + "loss": 2.1334, + "step": 35010 + }, + { + "epoch": 0.8247374122745043, + "grad_norm": 2.159635305404663, + "learning_rate": 3.512316895106213e-05, + "loss": 1.9526, + "step": 35020 + }, + { + "epoch": 0.8249729169610476, + "grad_norm": 1.947871208190918, + "learning_rate": 3.507606801375348e-05, + "loss": 1.9906, + "step": 35030 + }, + { + "epoch": 0.8252084216475908, + "grad_norm": 2.8672406673431396, + "learning_rate": 3.502896707644482e-05, + "loss": 2.2269, + "step": 35040 + }, + { + "epoch": 0.8254439263341341, + "grad_norm": 2.212104082107544, + "learning_rate": 3.498186613913617e-05, + "loss": 1.9107, + "step": 35050 + }, + { + "epoch": 0.8256794310206773, + "grad_norm": 2.2118680477142334, + "learning_rate": 3.493476520182752e-05, + "loss": 2.1372, + "step": 35060 + }, + { + "epoch": 0.8259149357072205, + "grad_norm": 2.289013385772705, + "learning_rate": 3.4887664264518865e-05, + "loss": 1.9752, + "step": 35070 + }, + { + "epoch": 0.8261504403937638, + "grad_norm": 2.4700567722320557, + "learning_rate": 3.4840563327210215e-05, + "loss": 2.0911, + "step": 35080 + }, + { + "epoch": 0.826385945080307, + "grad_norm": 1.9941130876541138, + "learning_rate": 3.479346238990156e-05, + "loss": 1.8645, + "step": 35090 + }, + { + "epoch": 0.8266214497668504, + "grad_norm": 2.4892849922180176, + "learning_rate": 3.474636145259291e-05, + "loss": 1.8108, + "step": 35100 + }, + { + "epoch": 0.8268569544533936, + "grad_norm": 1.7279706001281738, + "learning_rate": 3.469926051528425e-05, + "loss": 1.8917, + "step": 35110 + }, + { + "epoch": 0.8270924591399369, + "grad_norm": 1.975010871887207, + "learning_rate": 3.46521595779756e-05, + "loss": 1.9208, + "step": 35120 + }, + { + "epoch": 0.8273279638264801, + "grad_norm": 2.1341896057128906, + "learning_rate": 3.4605058640666946e-05, + "loss": 2.0969, + "step": 35130 + }, + { + "epoch": 0.8275634685130234, + "grad_norm": 2.393791437149048, + "learning_rate": 3.4557957703358296e-05, + "loss": 1.9602, + "step": 35140 + }, + { + "epoch": 0.8277989731995666, + "grad_norm": 2.1387171745300293, + "learning_rate": 3.4510856766049647e-05, + "loss": 2.0583, + "step": 35150 + }, + { + "epoch": 0.82803447788611, + "grad_norm": 1.9722431898117065, + "learning_rate": 3.4463755828741e-05, + "loss": 2.0649, + "step": 35160 + }, + { + "epoch": 0.8282699825726532, + "grad_norm": 2.32572340965271, + "learning_rate": 3.441665489143234e-05, + "loss": 2.1126, + "step": 35170 + }, + { + "epoch": 0.8285054872591965, + "grad_norm": 2.5280470848083496, + "learning_rate": 3.436955395412369e-05, + "loss": 2.0072, + "step": 35180 + }, + { + "epoch": 0.8287409919457397, + "grad_norm": 2.0874745845794678, + "learning_rate": 3.4322453016815034e-05, + "loss": 2.0661, + "step": 35190 + }, + { + "epoch": 0.828976496632283, + "grad_norm": 3.3035924434661865, + "learning_rate": 3.4275352079506384e-05, + "loss": 1.9418, + "step": 35200 + }, + { + "epoch": 0.8292120013188262, + "grad_norm": 2.1365528106689453, + "learning_rate": 3.4228251142197735e-05, + "loss": 1.8834, + "step": 35210 + }, + { + "epoch": 0.8294475060053695, + "grad_norm": 1.7832911014556885, + "learning_rate": 3.418115020488908e-05, + "loss": 1.9537, + "step": 35220 + }, + { + "epoch": 0.8296830106919127, + "grad_norm": 2.0132415294647217, + "learning_rate": 3.413404926758043e-05, + "loss": 2.0083, + "step": 35230 + }, + { + "epoch": 0.8299185153784561, + "grad_norm": 2.6595003604888916, + "learning_rate": 3.408694833027177e-05, + "loss": 2.0959, + "step": 35240 + }, + { + "epoch": 0.8301540200649993, + "grad_norm": 1.921191692352295, + "learning_rate": 3.403984739296312e-05, + "loss": 1.9271, + "step": 35250 + }, + { + "epoch": 0.8303895247515426, + "grad_norm": 2.7424440383911133, + "learning_rate": 3.3992746455654465e-05, + "loss": 2.0904, + "step": 35260 + }, + { + "epoch": 0.8306250294380858, + "grad_norm": 5.584036827087402, + "learning_rate": 3.3945645518345816e-05, + "loss": 2.0697, + "step": 35270 + }, + { + "epoch": 0.8308605341246291, + "grad_norm": 2.842258930206299, + "learning_rate": 3.389854458103716e-05, + "loss": 1.9869, + "step": 35280 + }, + { + "epoch": 0.8310960388111723, + "grad_norm": 1.9061213731765747, + "learning_rate": 3.3851443643728516e-05, + "loss": 2.3207, + "step": 35290 + }, + { + "epoch": 0.8313315434977157, + "grad_norm": 2.775942087173462, + "learning_rate": 3.380434270641986e-05, + "loss": 1.9147, + "step": 35300 + }, + { + "epoch": 0.8315670481842589, + "grad_norm": 3.2270853519439697, + "learning_rate": 3.375724176911121e-05, + "loss": 2.175, + "step": 35310 + }, + { + "epoch": 0.8318025528708022, + "grad_norm": 1.8016290664672852, + "learning_rate": 3.371014083180255e-05, + "loss": 1.8835, + "step": 35320 + }, + { + "epoch": 0.8320380575573454, + "grad_norm": 2.776650905609131, + "learning_rate": 3.3663039894493904e-05, + "loss": 1.7118, + "step": 35330 + }, + { + "epoch": 0.8322735622438886, + "grad_norm": 2.003655433654785, + "learning_rate": 3.361593895718525e-05, + "loss": 1.9875, + "step": 35340 + }, + { + "epoch": 0.8325090669304319, + "grad_norm": 2.3793911933898926, + "learning_rate": 3.35688380198766e-05, + "loss": 2.1075, + "step": 35350 + }, + { + "epoch": 0.8327445716169751, + "grad_norm": 2.2091314792633057, + "learning_rate": 3.352173708256795e-05, + "loss": 2.068, + "step": 35360 + }, + { + "epoch": 0.8329800763035184, + "grad_norm": 2.356450319290161, + "learning_rate": 3.347463614525929e-05, + "loss": 2.0795, + "step": 35370 + }, + { + "epoch": 0.8332155809900617, + "grad_norm": 1.9805959463119507, + "learning_rate": 3.342753520795064e-05, + "loss": 2.0017, + "step": 35380 + }, + { + "epoch": 0.833451085676605, + "grad_norm": 1.8965378999710083, + "learning_rate": 3.3380434270641985e-05, + "loss": 2.0095, + "step": 35390 + }, + { + "epoch": 0.8336865903631482, + "grad_norm": 2.3308956623077393, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.6953, + "step": 35400 + }, + { + "epoch": 0.8339220950496915, + "grad_norm": 1.7506887912750244, + "learning_rate": 3.328623239602468e-05, + "loss": 2.011, + "step": 35410 + }, + { + "epoch": 0.8341575997362347, + "grad_norm": 1.6992536783218384, + "learning_rate": 3.323913145871603e-05, + "loss": 1.9904, + "step": 35420 + }, + { + "epoch": 0.834393104422778, + "grad_norm": 2.503995418548584, + "learning_rate": 3.319203052140738e-05, + "loss": 2.0786, + "step": 35430 + }, + { + "epoch": 0.8346286091093212, + "grad_norm": 1.9842063188552856, + "learning_rate": 3.314492958409873e-05, + "loss": 1.9527, + "step": 35440 + }, + { + "epoch": 0.8348641137958646, + "grad_norm": 1.7640032768249512, + "learning_rate": 3.309782864679007e-05, + "loss": 1.8065, + "step": 35450 + }, + { + "epoch": 0.8350996184824078, + "grad_norm": 2.2963879108428955, + "learning_rate": 3.305072770948142e-05, + "loss": 1.8249, + "step": 35460 + }, + { + "epoch": 0.8353351231689511, + "grad_norm": 1.9893380403518677, + "learning_rate": 3.3003626772172766e-05, + "loss": 2.1082, + "step": 35470 + }, + { + "epoch": 0.8355706278554943, + "grad_norm": 1.7458503246307373, + "learning_rate": 3.2956525834864117e-05, + "loss": 1.9985, + "step": 35480 + }, + { + "epoch": 0.8358061325420376, + "grad_norm": 1.997781753540039, + "learning_rate": 3.290942489755546e-05, + "loss": 2.1124, + "step": 35490 + }, + { + "epoch": 0.8360416372285808, + "grad_norm": 3.002406358718872, + "learning_rate": 3.286232396024681e-05, + "loss": 2.0895, + "step": 35500 + }, + { + "epoch": 0.8362771419151241, + "grad_norm": 1.5130270719528198, + "learning_rate": 3.281522302293816e-05, + "loss": 1.8941, + "step": 35510 + }, + { + "epoch": 0.8365126466016674, + "grad_norm": 1.7578104734420776, + "learning_rate": 3.2768122085629504e-05, + "loss": 2.0347, + "step": 35520 + }, + { + "epoch": 0.8367481512882107, + "grad_norm": 2.7366182804107666, + "learning_rate": 3.2721021148320854e-05, + "loss": 1.8944, + "step": 35530 + }, + { + "epoch": 0.8369836559747539, + "grad_norm": 2.484748601913452, + "learning_rate": 3.26739202110122e-05, + "loss": 1.9773, + "step": 35540 + }, + { + "epoch": 0.8372191606612972, + "grad_norm": 2.0619559288024902, + "learning_rate": 3.262681927370355e-05, + "loss": 2.0534, + "step": 35550 + }, + { + "epoch": 0.8374546653478404, + "grad_norm": 2.169790744781494, + "learning_rate": 3.257971833639489e-05, + "loss": 1.9748, + "step": 35560 + }, + { + "epoch": 0.8376901700343837, + "grad_norm": 2.2871201038360596, + "learning_rate": 3.253261739908625e-05, + "loss": 2.1623, + "step": 35570 + }, + { + "epoch": 0.8379256747209269, + "grad_norm": 2.192305326461792, + "learning_rate": 3.248551646177759e-05, + "loss": 1.9842, + "step": 35580 + }, + { + "epoch": 0.8381611794074703, + "grad_norm": 1.6627674102783203, + "learning_rate": 3.243841552446894e-05, + "loss": 2.0446, + "step": 35590 + }, + { + "epoch": 0.8383966840940135, + "grad_norm": 1.8032068014144897, + "learning_rate": 3.2391314587160286e-05, + "loss": 1.9092, + "step": 35600 + }, + { + "epoch": 0.8386321887805568, + "grad_norm": 2.1932010650634766, + "learning_rate": 3.2344213649851636e-05, + "loss": 2.0882, + "step": 35610 + }, + { + "epoch": 0.8388676934671, + "grad_norm": 2.2689177989959717, + "learning_rate": 3.229711271254298e-05, + "loss": 2.0097, + "step": 35620 + }, + { + "epoch": 0.8391031981536432, + "grad_norm": 1.843926191329956, + "learning_rate": 3.225001177523433e-05, + "loss": 2.1152, + "step": 35630 + }, + { + "epoch": 0.8393387028401865, + "grad_norm": 1.8309720754623413, + "learning_rate": 3.220291083792567e-05, + "loss": 1.7824, + "step": 35640 + }, + { + "epoch": 0.8395742075267297, + "grad_norm": 1.8428617715835571, + "learning_rate": 3.215580990061702e-05, + "loss": 2.0781, + "step": 35650 + }, + { + "epoch": 0.839809712213273, + "grad_norm": 3.1680748462677, + "learning_rate": 3.2108708963308374e-05, + "loss": 2.0001, + "step": 35660 + }, + { + "epoch": 0.8400452168998163, + "grad_norm": 2.44405198097229, + "learning_rate": 3.206160802599972e-05, + "loss": 1.9798, + "step": 35670 + }, + { + "epoch": 0.8402807215863596, + "grad_norm": 2.168524980545044, + "learning_rate": 3.201450708869107e-05, + "loss": 1.7701, + "step": 35680 + }, + { + "epoch": 0.8405162262729028, + "grad_norm": 2.2438528537750244, + "learning_rate": 3.196740615138241e-05, + "loss": 2.1856, + "step": 35690 + }, + { + "epoch": 0.8407517309594461, + "grad_norm": 3.037527084350586, + "learning_rate": 3.192030521407376e-05, + "loss": 2.1172, + "step": 35700 + }, + { + "epoch": 0.8409872356459893, + "grad_norm": 2.3657798767089844, + "learning_rate": 3.187320427676511e-05, + "loss": 2.2592, + "step": 35710 + }, + { + "epoch": 0.8412227403325326, + "grad_norm": 2.6857681274414062, + "learning_rate": 3.182610333945646e-05, + "loss": 1.8174, + "step": 35720 + }, + { + "epoch": 0.8414582450190758, + "grad_norm": 2.249049186706543, + "learning_rate": 3.1779002402147805e-05, + "loss": 2.0437, + "step": 35730 + }, + { + "epoch": 0.8416937497056192, + "grad_norm": 2.1826348304748535, + "learning_rate": 3.1731901464839155e-05, + "loss": 2.1848, + "step": 35740 + }, + { + "epoch": 0.8419292543921624, + "grad_norm": 2.04245924949646, + "learning_rate": 3.16848005275305e-05, + "loss": 1.8733, + "step": 35750 + }, + { + "epoch": 0.8421647590787057, + "grad_norm": 2.0764975547790527, + "learning_rate": 3.163769959022185e-05, + "loss": 1.9855, + "step": 35760 + }, + { + "epoch": 0.8424002637652489, + "grad_norm": 2.581373453140259, + "learning_rate": 3.159059865291319e-05, + "loss": 1.7349, + "step": 35770 + }, + { + "epoch": 0.8426357684517922, + "grad_norm": 2.6888554096221924, + "learning_rate": 3.154349771560454e-05, + "loss": 2.0537, + "step": 35780 + }, + { + "epoch": 0.8428712731383354, + "grad_norm": 1.8182003498077393, + "learning_rate": 3.1496396778295886e-05, + "loss": 2.1486, + "step": 35790 + }, + { + "epoch": 0.8431067778248788, + "grad_norm": 2.2902069091796875, + "learning_rate": 3.1449295840987236e-05, + "loss": 2.056, + "step": 35800 + }, + { + "epoch": 0.843342282511422, + "grad_norm": 1.7687901258468628, + "learning_rate": 3.1402194903678587e-05, + "loss": 2.1473, + "step": 35810 + }, + { + "epoch": 0.8435777871979653, + "grad_norm": 1.9152394533157349, + "learning_rate": 3.135509396636993e-05, + "loss": 2.008, + "step": 35820 + }, + { + "epoch": 0.8438132918845085, + "grad_norm": 2.308910846710205, + "learning_rate": 3.130799302906128e-05, + "loss": 1.9928, + "step": 35830 + }, + { + "epoch": 0.8440487965710518, + "grad_norm": 1.848122477531433, + "learning_rate": 3.1260892091752624e-05, + "loss": 2.1119, + "step": 35840 + }, + { + "epoch": 0.844284301257595, + "grad_norm": 2.760970115661621, + "learning_rate": 3.121379115444398e-05, + "loss": 1.9572, + "step": 35850 + }, + { + "epoch": 0.8445198059441383, + "grad_norm": 2.340911865234375, + "learning_rate": 3.1166690217135324e-05, + "loss": 2.0947, + "step": 35860 + }, + { + "epoch": 0.8447553106306815, + "grad_norm": 1.8320900201797485, + "learning_rate": 3.1119589279826674e-05, + "loss": 1.8869, + "step": 35870 + }, + { + "epoch": 0.8449908153172249, + "grad_norm": 2.2665231227874756, + "learning_rate": 3.107248834251802e-05, + "loss": 2.0459, + "step": 35880 + }, + { + "epoch": 0.8452263200037681, + "grad_norm": 2.3234121799468994, + "learning_rate": 3.102538740520937e-05, + "loss": 1.8317, + "step": 35890 + }, + { + "epoch": 0.8454618246903114, + "grad_norm": 2.1993918418884277, + "learning_rate": 3.097828646790071e-05, + "loss": 2.1804, + "step": 35900 + }, + { + "epoch": 0.8456973293768546, + "grad_norm": 2.1090517044067383, + "learning_rate": 3.093118553059206e-05, + "loss": 1.921, + "step": 35910 + }, + { + "epoch": 0.8459328340633978, + "grad_norm": 2.0984067916870117, + "learning_rate": 3.0884084593283405e-05, + "loss": 1.9881, + "step": 35920 + }, + { + "epoch": 0.8461683387499411, + "grad_norm": 1.9138286113739014, + "learning_rate": 3.0836983655974756e-05, + "loss": 1.965, + "step": 35930 + }, + { + "epoch": 0.8464038434364843, + "grad_norm": 2.4115593433380127, + "learning_rate": 3.07898827186661e-05, + "loss": 2.0701, + "step": 35940 + }, + { + "epoch": 0.8466393481230277, + "grad_norm": 2.430145025253296, + "learning_rate": 3.074278178135745e-05, + "loss": 1.8851, + "step": 35950 + }, + { + "epoch": 0.8468748528095709, + "grad_norm": 2.130075216293335, + "learning_rate": 3.06956808440488e-05, + "loss": 2.0154, + "step": 35960 + }, + { + "epoch": 0.8471103574961142, + "grad_norm": 2.5573925971984863, + "learning_rate": 3.064857990674014e-05, + "loss": 2.2392, + "step": 35970 + }, + { + "epoch": 0.8473458621826574, + "grad_norm": 1.901427984237671, + "learning_rate": 3.060147896943149e-05, + "loss": 1.7748, + "step": 35980 + }, + { + "epoch": 0.8475813668692007, + "grad_norm": 2.2052485942840576, + "learning_rate": 3.0554378032122843e-05, + "loss": 1.8585, + "step": 35990 + }, + { + "epoch": 0.8478168715557439, + "grad_norm": 2.0382041931152344, + "learning_rate": 3.050727709481419e-05, + "loss": 1.9512, + "step": 36000 + }, + { + "epoch": 0.8480523762422872, + "grad_norm": 2.326741933822632, + "learning_rate": 3.0460176157505537e-05, + "loss": 2.1158, + "step": 36010 + }, + { + "epoch": 0.8482878809288305, + "grad_norm": 2.1949450969696045, + "learning_rate": 3.0413075220196884e-05, + "loss": 1.7401, + "step": 36020 + }, + { + "epoch": 0.8485233856153738, + "grad_norm": 1.9501028060913086, + "learning_rate": 3.036597428288823e-05, + "loss": 2.1736, + "step": 36030 + }, + { + "epoch": 0.848758890301917, + "grad_norm": 1.9012770652770996, + "learning_rate": 3.0318873345579578e-05, + "loss": 1.9965, + "step": 36040 + }, + { + "epoch": 0.8489943949884603, + "grad_norm": 2.4797487258911133, + "learning_rate": 3.0271772408270928e-05, + "loss": 2.0385, + "step": 36050 + }, + { + "epoch": 0.8492298996750035, + "grad_norm": 2.465705394744873, + "learning_rate": 3.0224671470962275e-05, + "loss": 1.9674, + "step": 36060 + }, + { + "epoch": 0.8494654043615468, + "grad_norm": 3.0579581260681152, + "learning_rate": 3.0177570533653622e-05, + "loss": 1.9021, + "step": 36070 + }, + { + "epoch": 0.84970090904809, + "grad_norm": 1.4716150760650635, + "learning_rate": 3.013046959634497e-05, + "loss": 1.9811, + "step": 36080 + }, + { + "epoch": 0.8499364137346334, + "grad_norm": 2.094615936279297, + "learning_rate": 3.0083368659036315e-05, + "loss": 2.1144, + "step": 36090 + }, + { + "epoch": 0.8501719184211766, + "grad_norm": 2.2217042446136475, + "learning_rate": 3.0036267721727662e-05, + "loss": 2.1761, + "step": 36100 + }, + { + "epoch": 0.8504074231077199, + "grad_norm": 2.1508212089538574, + "learning_rate": 2.998916678441901e-05, + "loss": 2.0179, + "step": 36110 + }, + { + "epoch": 0.8506429277942631, + "grad_norm": 2.1316213607788086, + "learning_rate": 2.9942065847110356e-05, + "loss": 2.0658, + "step": 36120 + }, + { + "epoch": 0.8508784324808064, + "grad_norm": 2.383114814758301, + "learning_rate": 2.9894964909801703e-05, + "loss": 1.9316, + "step": 36130 + }, + { + "epoch": 0.8511139371673496, + "grad_norm": 2.28301739692688, + "learning_rate": 2.9847863972493056e-05, + "loss": 2.1035, + "step": 36140 + }, + { + "epoch": 0.8513494418538929, + "grad_norm": 2.0573742389678955, + "learning_rate": 2.9800763035184403e-05, + "loss": 2.0037, + "step": 36150 + }, + { + "epoch": 0.8515849465404361, + "grad_norm": 2.18030047416687, + "learning_rate": 2.975366209787575e-05, + "loss": 1.925, + "step": 36160 + }, + { + "epoch": 0.8518204512269795, + "grad_norm": 2.2723090648651123, + "learning_rate": 2.9706561160567097e-05, + "loss": 2.1188, + "step": 36170 + }, + { + "epoch": 0.8520559559135227, + "grad_norm": 2.1768040657043457, + "learning_rate": 2.9659460223258444e-05, + "loss": 1.9652, + "step": 36180 + }, + { + "epoch": 0.8522914606000659, + "grad_norm": 2.63767671585083, + "learning_rate": 2.961235928594979e-05, + "loss": 2.0274, + "step": 36190 + }, + { + "epoch": 0.8525269652866092, + "grad_norm": 2.380066394805908, + "learning_rate": 2.956525834864114e-05, + "loss": 1.9089, + "step": 36200 + }, + { + "epoch": 0.8527624699731524, + "grad_norm": 1.9944638013839722, + "learning_rate": 2.9518157411332488e-05, + "loss": 1.815, + "step": 36210 + }, + { + "epoch": 0.8529979746596957, + "grad_norm": 1.8857179880142212, + "learning_rate": 2.9471056474023835e-05, + "loss": 1.8514, + "step": 36220 + }, + { + "epoch": 0.8532334793462389, + "grad_norm": 1.8483797311782837, + "learning_rate": 2.942395553671518e-05, + "loss": 1.9566, + "step": 36230 + }, + { + "epoch": 0.8534689840327823, + "grad_norm": 2.12707781791687, + "learning_rate": 2.937685459940653e-05, + "loss": 2.0217, + "step": 36240 + }, + { + "epoch": 0.8537044887193255, + "grad_norm": 2.1420063972473145, + "learning_rate": 2.9329753662097875e-05, + "loss": 2.2063, + "step": 36250 + }, + { + "epoch": 0.8539399934058688, + "grad_norm": 2.08191180229187, + "learning_rate": 2.9282652724789222e-05, + "loss": 1.9304, + "step": 36260 + }, + { + "epoch": 0.854175498092412, + "grad_norm": 1.8427835702896118, + "learning_rate": 2.923555178748057e-05, + "loss": 1.7183, + "step": 36270 + }, + { + "epoch": 0.8544110027789553, + "grad_norm": 2.656949996948242, + "learning_rate": 2.9188450850171923e-05, + "loss": 2.0153, + "step": 36280 + }, + { + "epoch": 0.8546465074654985, + "grad_norm": 1.9904245138168335, + "learning_rate": 2.914134991286327e-05, + "loss": 1.9786, + "step": 36290 + }, + { + "epoch": 0.8548820121520418, + "grad_norm": 2.09130859375, + "learning_rate": 2.9094248975554616e-05, + "loss": 1.9968, + "step": 36300 + }, + { + "epoch": 0.8551175168385851, + "grad_norm": 2.567244529724121, + "learning_rate": 2.9047148038245963e-05, + "loss": 1.9054, + "step": 36310 + }, + { + "epoch": 0.8553530215251284, + "grad_norm": 2.4783241748809814, + "learning_rate": 2.900004710093731e-05, + "loss": 1.9424, + "step": 36320 + }, + { + "epoch": 0.8555885262116716, + "grad_norm": 2.3222262859344482, + "learning_rate": 2.8952946163628657e-05, + "loss": 2.0085, + "step": 36330 + }, + { + "epoch": 0.8558240308982149, + "grad_norm": 2.0765700340270996, + "learning_rate": 2.8905845226320004e-05, + "loss": 1.9701, + "step": 36340 + }, + { + "epoch": 0.8560595355847581, + "grad_norm": 2.526127815246582, + "learning_rate": 2.8858744289011354e-05, + "loss": 2.0037, + "step": 36350 + }, + { + "epoch": 0.8562950402713014, + "grad_norm": 2.3003344535827637, + "learning_rate": 2.88116433517027e-05, + "loss": 2.0852, + "step": 36360 + }, + { + "epoch": 0.8565305449578446, + "grad_norm": 2.28483247756958, + "learning_rate": 2.8764542414394048e-05, + "loss": 2.1826, + "step": 36370 + }, + { + "epoch": 0.856766049644388, + "grad_norm": 2.5736210346221924, + "learning_rate": 2.8717441477085395e-05, + "loss": 1.9093, + "step": 36380 + }, + { + "epoch": 0.8570015543309312, + "grad_norm": 2.280210018157959, + "learning_rate": 2.867034053977674e-05, + "loss": 2.1804, + "step": 36390 + }, + { + "epoch": 0.8572370590174745, + "grad_norm": 2.255749225616455, + "learning_rate": 2.8623239602468088e-05, + "loss": 2.0126, + "step": 36400 + }, + { + "epoch": 0.8574725637040177, + "grad_norm": 2.0187699794769287, + "learning_rate": 2.8576138665159435e-05, + "loss": 2.2085, + "step": 36410 + }, + { + "epoch": 0.857708068390561, + "grad_norm": 3.545119285583496, + "learning_rate": 2.852903772785079e-05, + "loss": 2.1997, + "step": 36420 + }, + { + "epoch": 0.8579435730771042, + "grad_norm": 2.619332790374756, + "learning_rate": 2.8481936790542136e-05, + "loss": 2.0218, + "step": 36430 + }, + { + "epoch": 0.8581790777636475, + "grad_norm": 2.5285348892211914, + "learning_rate": 2.8434835853233483e-05, + "loss": 1.9183, + "step": 36440 + }, + { + "epoch": 0.8584145824501908, + "grad_norm": 3.5454587936401367, + "learning_rate": 2.838773491592483e-05, + "loss": 2.0172, + "step": 36450 + }, + { + "epoch": 0.8586500871367341, + "grad_norm": 1.9408241510391235, + "learning_rate": 2.8340633978616176e-05, + "loss": 2.2235, + "step": 36460 + }, + { + "epoch": 0.8588855918232773, + "grad_norm": 2.6820948123931885, + "learning_rate": 2.8293533041307523e-05, + "loss": 2.005, + "step": 36470 + }, + { + "epoch": 0.8591210965098205, + "grad_norm": 1.9332228899002075, + "learning_rate": 2.824643210399887e-05, + "loss": 1.9547, + "step": 36480 + }, + { + "epoch": 0.8593566011963638, + "grad_norm": 1.865426778793335, + "learning_rate": 2.8199331166690217e-05, + "loss": 1.8152, + "step": 36490 + }, + { + "epoch": 0.859592105882907, + "grad_norm": 1.7702504396438599, + "learning_rate": 2.8152230229381567e-05, + "loss": 2.0508, + "step": 36500 + }, + { + "epoch": 0.8598276105694503, + "grad_norm": 1.7681245803833008, + "learning_rate": 2.8105129292072914e-05, + "loss": 1.9025, + "step": 36510 + }, + { + "epoch": 0.8600631152559935, + "grad_norm": 1.7548736333847046, + "learning_rate": 2.805802835476426e-05, + "loss": 1.7362, + "step": 36520 + }, + { + "epoch": 0.8602986199425369, + "grad_norm": 2.2081425189971924, + "learning_rate": 2.8010927417455608e-05, + "loss": 2.0412, + "step": 36530 + }, + { + "epoch": 0.8605341246290801, + "grad_norm": 1.8118314743041992, + "learning_rate": 2.7963826480146954e-05, + "loss": 1.9552, + "step": 36540 + }, + { + "epoch": 0.8607696293156234, + "grad_norm": 2.4875271320343018, + "learning_rate": 2.79167255428383e-05, + "loss": 2.0385, + "step": 36550 + }, + { + "epoch": 0.8610051340021666, + "grad_norm": 2.542076349258423, + "learning_rate": 2.7869624605529655e-05, + "loss": 2.1191, + "step": 36560 + }, + { + "epoch": 0.8612406386887099, + "grad_norm": 2.1214327812194824, + "learning_rate": 2.7822523668221002e-05, + "loss": 2.0365, + "step": 36570 + }, + { + "epoch": 0.8614761433752531, + "grad_norm": 4.163021564483643, + "learning_rate": 2.777542273091235e-05, + "loss": 2.0507, + "step": 36580 + }, + { + "epoch": 0.8617116480617965, + "grad_norm": 2.633694648742676, + "learning_rate": 2.7728321793603696e-05, + "loss": 1.9404, + "step": 36590 + }, + { + "epoch": 0.8619471527483397, + "grad_norm": 2.0306925773620605, + "learning_rate": 2.7681220856295042e-05, + "loss": 1.8616, + "step": 36600 + }, + { + "epoch": 0.862182657434883, + "grad_norm": 2.5609326362609863, + "learning_rate": 2.763411991898639e-05, + "loss": 2.0449, + "step": 36610 + }, + { + "epoch": 0.8624181621214262, + "grad_norm": 3.2793045043945312, + "learning_rate": 2.7587018981677736e-05, + "loss": 1.8033, + "step": 36620 + }, + { + "epoch": 0.8626536668079695, + "grad_norm": 2.455267906188965, + "learning_rate": 2.7539918044369083e-05, + "loss": 2.0809, + "step": 36630 + }, + { + "epoch": 0.8628891714945127, + "grad_norm": 2.3325164318084717, + "learning_rate": 2.749281710706043e-05, + "loss": 2.2098, + "step": 36640 + }, + { + "epoch": 0.863124676181056, + "grad_norm": 2.911341667175293, + "learning_rate": 2.744571616975178e-05, + "loss": 1.9998, + "step": 36650 + }, + { + "epoch": 0.8633601808675992, + "grad_norm": 1.7594044208526611, + "learning_rate": 2.7398615232443127e-05, + "loss": 1.9491, + "step": 36660 + }, + { + "epoch": 0.8635956855541426, + "grad_norm": 1.8261598348617554, + "learning_rate": 2.7351514295134474e-05, + "loss": 1.9535, + "step": 36670 + }, + { + "epoch": 0.8638311902406858, + "grad_norm": 2.3168840408325195, + "learning_rate": 2.730441335782582e-05, + "loss": 2.0609, + "step": 36680 + }, + { + "epoch": 0.8640666949272291, + "grad_norm": NaN, + "learning_rate": 2.7257312420517167e-05, + "loss": 2.0912, + "step": 36690 + }, + { + "epoch": 0.8643021996137723, + "grad_norm": 2.733128786087036, + "learning_rate": 2.7214921576939385e-05, + "loss": 1.8064, + "step": 36700 + }, + { + "epoch": 0.8645377043003156, + "grad_norm": 2.325484275817871, + "learning_rate": 2.7167820639630732e-05, + "loss": 2.0417, + "step": 36710 + }, + { + "epoch": 0.8647732089868588, + "grad_norm": 2.0244216918945312, + "learning_rate": 2.712071970232208e-05, + "loss": 1.9551, + "step": 36720 + }, + { + "epoch": 0.8650087136734022, + "grad_norm": 3.063344955444336, + "learning_rate": 2.7073618765013426e-05, + "loss": 2.0306, + "step": 36730 + }, + { + "epoch": 0.8652442183599454, + "grad_norm": 2.5917723178863525, + "learning_rate": 2.7026517827704773e-05, + "loss": 1.9062, + "step": 36740 + }, + { + "epoch": 0.8654797230464887, + "grad_norm": 1.9023863077163696, + "learning_rate": 2.697941689039612e-05, + "loss": 2.0226, + "step": 36750 + }, + { + "epoch": 0.8657152277330319, + "grad_norm": 2.4680817127227783, + "learning_rate": 2.6932315953087467e-05, + "loss": 2.0635, + "step": 36760 + }, + { + "epoch": 0.8659507324195751, + "grad_norm": 1.7801569700241089, + "learning_rate": 2.6885215015778813e-05, + "loss": 1.7529, + "step": 36770 + }, + { + "epoch": 0.8661862371061184, + "grad_norm": 2.0391175746917725, + "learning_rate": 2.683811407847016e-05, + "loss": 2.1877, + "step": 36780 + }, + { + "epoch": 0.8664217417926616, + "grad_norm": 1.8722394704818726, + "learning_rate": 2.6791013141161507e-05, + "loss": 2.0154, + "step": 36790 + }, + { + "epoch": 0.866657246479205, + "grad_norm": 1.8816248178482056, + "learning_rate": 2.6743912203852854e-05, + "loss": 1.9464, + "step": 36800 + }, + { + "epoch": 0.8668927511657482, + "grad_norm": 2.1142876148223877, + "learning_rate": 2.6696811266544208e-05, + "loss": 2.108, + "step": 36810 + }, + { + "epoch": 0.8671282558522915, + "grad_norm": 2.861016273498535, + "learning_rate": 2.6649710329235554e-05, + "loss": 1.998, + "step": 36820 + }, + { + "epoch": 0.8673637605388347, + "grad_norm": 2.2118663787841797, + "learning_rate": 2.66026093919269e-05, + "loss": 2.0853, + "step": 36830 + }, + { + "epoch": 0.867599265225378, + "grad_norm": 2.2175252437591553, + "learning_rate": 2.6555508454618248e-05, + "loss": 1.8626, + "step": 36840 + }, + { + "epoch": 0.8678347699119212, + "grad_norm": 1.7482407093048096, + "learning_rate": 2.65084075173096e-05, + "loss": 2.0537, + "step": 36850 + }, + { + "epoch": 0.8680702745984645, + "grad_norm": 1.8986228704452515, + "learning_rate": 2.6461306580000945e-05, + "loss": 2.0676, + "step": 36860 + }, + { + "epoch": 0.8683057792850077, + "grad_norm": 3.3696067333221436, + "learning_rate": 2.6414205642692292e-05, + "loss": 2.1246, + "step": 36870 + }, + { + "epoch": 0.8685412839715511, + "grad_norm": 2.3474841117858887, + "learning_rate": 2.636710470538364e-05, + "loss": 2.0152, + "step": 36880 + }, + { + "epoch": 0.8687767886580943, + "grad_norm": 2.315103769302368, + "learning_rate": 2.6320003768074986e-05, + "loss": 2.1758, + "step": 36890 + }, + { + "epoch": 0.8690122933446376, + "grad_norm": 1.727292776107788, + "learning_rate": 2.6272902830766333e-05, + "loss": 2.0967, + "step": 36900 + }, + { + "epoch": 0.8692477980311808, + "grad_norm": 2.4542670249938965, + "learning_rate": 2.622580189345768e-05, + "loss": 1.9784, + "step": 36910 + }, + { + "epoch": 0.8694833027177241, + "grad_norm": 2.198284149169922, + "learning_rate": 2.6178700956149026e-05, + "loss": 1.8952, + "step": 36920 + }, + { + "epoch": 0.8697188074042673, + "grad_norm": 3.0475144386291504, + "learning_rate": 2.6131600018840373e-05, + "loss": 1.8998, + "step": 36930 + }, + { + "epoch": 0.8699543120908106, + "grad_norm": 1.985602855682373, + "learning_rate": 2.608449908153172e-05, + "loss": 1.948, + "step": 36940 + }, + { + "epoch": 0.8701898167773539, + "grad_norm": 2.757429838180542, + "learning_rate": 2.6037398144223067e-05, + "loss": 1.9312, + "step": 36950 + }, + { + "epoch": 0.8704253214638972, + "grad_norm": 2.614267349243164, + "learning_rate": 2.599029720691442e-05, + "loss": 2.0093, + "step": 36960 + }, + { + "epoch": 0.8706608261504404, + "grad_norm": 1.770949363708496, + "learning_rate": 2.5943196269605767e-05, + "loss": 1.9344, + "step": 36970 + }, + { + "epoch": 0.8708963308369837, + "grad_norm": 2.9338138103485107, + "learning_rate": 2.5896095332297114e-05, + "loss": 1.9617, + "step": 36980 + }, + { + "epoch": 0.8711318355235269, + "grad_norm": 2.102163076400757, + "learning_rate": 2.584899439498846e-05, + "loss": 2.0294, + "step": 36990 + }, + { + "epoch": 0.8713673402100702, + "grad_norm": 1.8639206886291504, + "learning_rate": 2.580189345767981e-05, + "loss": 1.9084, + "step": 37000 + }, + { + "epoch": 0.8716028448966134, + "grad_norm": 2.295996904373169, + "learning_rate": 2.5754792520371158e-05, + "loss": 2.081, + "step": 37010 + }, + { + "epoch": 0.8718383495831568, + "grad_norm": 2.4227380752563477, + "learning_rate": 2.5707691583062505e-05, + "loss": 2.0755, + "step": 37020 + }, + { + "epoch": 0.8720738542697, + "grad_norm": 2.3559842109680176, + "learning_rate": 2.5660590645753852e-05, + "loss": 1.8186, + "step": 37030 + }, + { + "epoch": 0.8723093589562432, + "grad_norm": 2.341525077819824, + "learning_rate": 2.56134897084452e-05, + "loss": 2.0243, + "step": 37040 + }, + { + "epoch": 0.8725448636427865, + "grad_norm": 3.58907413482666, + "learning_rate": 2.5566388771136546e-05, + "loss": 2.1113, + "step": 37050 + }, + { + "epoch": 0.8727803683293297, + "grad_norm": 1.8478801250457764, + "learning_rate": 2.5519287833827893e-05, + "loss": 1.8545, + "step": 37060 + }, + { + "epoch": 0.873015873015873, + "grad_norm": 2.5865366458892822, + "learning_rate": 2.547218689651924e-05, + "loss": 2.1964, + "step": 37070 + }, + { + "epoch": 0.8732513777024162, + "grad_norm": 2.055079460144043, + "learning_rate": 2.5425085959210586e-05, + "loss": 1.9852, + "step": 37080 + }, + { + "epoch": 0.8734868823889596, + "grad_norm": 1.9942408800125122, + "learning_rate": 2.5377985021901933e-05, + "loss": 2.0406, + "step": 37090 + }, + { + "epoch": 0.8737223870755028, + "grad_norm": 2.4995577335357666, + "learning_rate": 2.5330884084593287e-05, + "loss": 2.0003, + "step": 37100 + }, + { + "epoch": 0.8739578917620461, + "grad_norm": 2.066697597503662, + "learning_rate": 2.5283783147284634e-05, + "loss": 1.9404, + "step": 37110 + }, + { + "epoch": 0.8741933964485893, + "grad_norm": 2.102442741394043, + "learning_rate": 2.523668220997598e-05, + "loss": 1.9775, + "step": 37120 + }, + { + "epoch": 0.8744289011351326, + "grad_norm": 2.435946464538574, + "learning_rate": 2.5189581272667327e-05, + "loss": 2.0967, + "step": 37130 + }, + { + "epoch": 0.8746644058216758, + "grad_norm": 1.9978781938552856, + "learning_rate": 2.5142480335358674e-05, + "loss": 2.0675, + "step": 37140 + }, + { + "epoch": 0.8748999105082191, + "grad_norm": 2.7619597911834717, + "learning_rate": 2.5095379398050024e-05, + "loss": 2.0668, + "step": 37150 + }, + { + "epoch": 0.8751354151947623, + "grad_norm": 1.664589285850525, + "learning_rate": 2.504827846074137e-05, + "loss": 2.1728, + "step": 37160 + }, + { + "epoch": 0.8753709198813057, + "grad_norm": 2.1075856685638428, + "learning_rate": 2.5001177523432718e-05, + "loss": 1.7206, + "step": 37170 + }, + { + "epoch": 0.8756064245678489, + "grad_norm": 2.175349473953247, + "learning_rate": 2.4954076586124065e-05, + "loss": 1.9852, + "step": 37180 + }, + { + "epoch": 0.8758419292543922, + "grad_norm": 2.1654629707336426, + "learning_rate": 2.4906975648815412e-05, + "loss": 1.9823, + "step": 37190 + }, + { + "epoch": 0.8760774339409354, + "grad_norm": 2.405961036682129, + "learning_rate": 2.4859874711506762e-05, + "loss": 1.8534, + "step": 37200 + }, + { + "epoch": 0.8763129386274787, + "grad_norm": 1.9819753170013428, + "learning_rate": 2.481277377419811e-05, + "loss": 2.1591, + "step": 37210 + }, + { + "epoch": 0.8765484433140219, + "grad_norm": 2.0572049617767334, + "learning_rate": 2.4765672836889456e-05, + "loss": 1.9411, + "step": 37220 + }, + { + "epoch": 0.8767839480005652, + "grad_norm": 2.2193071842193604, + "learning_rate": 2.4718571899580803e-05, + "loss": 1.9532, + "step": 37230 + }, + { + "epoch": 0.8770194526871085, + "grad_norm": 2.0980918407440186, + "learning_rate": 2.467147096227215e-05, + "loss": 1.9071, + "step": 37240 + }, + { + "epoch": 0.8772549573736518, + "grad_norm": 2.7470831871032715, + "learning_rate": 2.4624370024963496e-05, + "loss": 2.072, + "step": 37250 + }, + { + "epoch": 0.877490462060195, + "grad_norm": 1.7608436346054077, + "learning_rate": 2.4577269087654843e-05, + "loss": 2.0747, + "step": 37260 + }, + { + "epoch": 0.8777259667467383, + "grad_norm": 1.77220618724823, + "learning_rate": 2.4530168150346193e-05, + "loss": 1.7371, + "step": 37270 + }, + { + "epoch": 0.8779614714332815, + "grad_norm": 2.003990411758423, + "learning_rate": 2.448306721303754e-05, + "loss": 2.0556, + "step": 37280 + }, + { + "epoch": 0.8781969761198248, + "grad_norm": 2.2261626720428467, + "learning_rate": 2.4435966275728887e-05, + "loss": 2.1423, + "step": 37290 + }, + { + "epoch": 0.878432480806368, + "grad_norm": 2.8093926906585693, + "learning_rate": 2.4388865338420237e-05, + "loss": 2.0423, + "step": 37300 + }, + { + "epoch": 0.8786679854929114, + "grad_norm": 2.5685336589813232, + "learning_rate": 2.4341764401111584e-05, + "loss": 2.134, + "step": 37310 + }, + { + "epoch": 0.8789034901794546, + "grad_norm": 2.4057536125183105, + "learning_rate": 2.429466346380293e-05, + "loss": 2.2208, + "step": 37320 + }, + { + "epoch": 0.8791389948659978, + "grad_norm": 1.4305607080459595, + "learning_rate": 2.4247562526494278e-05, + "loss": 2.0589, + "step": 37330 + }, + { + "epoch": 0.8793744995525411, + "grad_norm": 2.37786865234375, + "learning_rate": 2.4200461589185628e-05, + "loss": 1.9452, + "step": 37340 + }, + { + "epoch": 0.8796100042390843, + "grad_norm": 1.7371529340744019, + "learning_rate": 2.4153360651876975e-05, + "loss": 1.9545, + "step": 37350 + }, + { + "epoch": 0.8798455089256276, + "grad_norm": 2.432138681411743, + "learning_rate": 2.4106259714568322e-05, + "loss": 1.9643, + "step": 37360 + }, + { + "epoch": 0.8800810136121708, + "grad_norm": 2.241659641265869, + "learning_rate": 2.405915877725967e-05, + "loss": 1.946, + "step": 37370 + }, + { + "epoch": 0.8803165182987142, + "grad_norm": 2.5859363079071045, + "learning_rate": 2.4012057839951016e-05, + "loss": 2.0878, + "step": 37380 + }, + { + "epoch": 0.8805520229852574, + "grad_norm": 2.5846035480499268, + "learning_rate": 2.3964956902642363e-05, + "loss": 1.9119, + "step": 37390 + }, + { + "epoch": 0.8807875276718007, + "grad_norm": 3.664036512374878, + "learning_rate": 2.391785596533371e-05, + "loss": 1.9178, + "step": 37400 + }, + { + "epoch": 0.8810230323583439, + "grad_norm": 2.499345064163208, + "learning_rate": 2.3870755028025056e-05, + "loss": 2.0054, + "step": 37410 + }, + { + "epoch": 0.8812585370448872, + "grad_norm": 1.7014585733413696, + "learning_rate": 2.3823654090716406e-05, + "loss": 2.0352, + "step": 37420 + }, + { + "epoch": 0.8814940417314304, + "grad_norm": 2.9908242225646973, + "learning_rate": 2.3776553153407753e-05, + "loss": 1.9084, + "step": 37430 + }, + { + "epoch": 0.8817295464179737, + "grad_norm": 2.379347324371338, + "learning_rate": 2.37294522160991e-05, + "loss": 1.9052, + "step": 37440 + }, + { + "epoch": 0.881965051104517, + "grad_norm": 1.8825994729995728, + "learning_rate": 2.368235127879045e-05, + "loss": 2.0933, + "step": 37450 + }, + { + "epoch": 0.8822005557910603, + "grad_norm": 2.112344980239868, + "learning_rate": 2.3635250341481797e-05, + "loss": 1.945, + "step": 37460 + }, + { + "epoch": 0.8824360604776035, + "grad_norm": 2.0043137073516846, + "learning_rate": 2.3588149404173144e-05, + "loss": 2.0133, + "step": 37470 + }, + { + "epoch": 0.8826715651641468, + "grad_norm": 2.380242347717285, + "learning_rate": 2.354104846686449e-05, + "loss": 2.0031, + "step": 37480 + }, + { + "epoch": 0.88290706985069, + "grad_norm": 2.028883457183838, + "learning_rate": 2.349394752955584e-05, + "loss": 1.8129, + "step": 37490 + }, + { + "epoch": 0.8831425745372333, + "grad_norm": 2.6352202892303467, + "learning_rate": 2.3446846592247188e-05, + "loss": 2.1235, + "step": 37500 + }, + { + "epoch": 0.8833780792237765, + "grad_norm": 2.017411470413208, + "learning_rate": 2.3399745654938535e-05, + "loss": 1.8616, + "step": 37510 + }, + { + "epoch": 0.8836135839103199, + "grad_norm": 2.081916093826294, + "learning_rate": 2.3352644717629882e-05, + "loss": 1.8437, + "step": 37520 + }, + { + "epoch": 0.8838490885968631, + "grad_norm": 2.4663143157958984, + "learning_rate": 2.330554378032123e-05, + "loss": 1.7767, + "step": 37530 + }, + { + "epoch": 0.8840845932834064, + "grad_norm": 2.4353020191192627, + "learning_rate": 2.3258442843012576e-05, + "loss": 2.2355, + "step": 37540 + }, + { + "epoch": 0.8843200979699496, + "grad_norm": 2.1264777183532715, + "learning_rate": 2.3211341905703922e-05, + "loss": 2.0394, + "step": 37550 + }, + { + "epoch": 0.8845556026564929, + "grad_norm": 2.2275872230529785, + "learning_rate": 2.3164240968395273e-05, + "loss": 2.2761, + "step": 37560 + }, + { + "epoch": 0.8847911073430361, + "grad_norm": 4.1986284255981445, + "learning_rate": 2.311714003108662e-05, + "loss": 1.9435, + "step": 37570 + }, + { + "epoch": 0.8850266120295794, + "grad_norm": 2.330110549926758, + "learning_rate": 2.3070039093777966e-05, + "loss": 1.7825, + "step": 37580 + }, + { + "epoch": 0.8852621167161226, + "grad_norm": 2.903902769088745, + "learning_rate": 2.3022938156469313e-05, + "loss": 2.0841, + "step": 37590 + }, + { + "epoch": 0.8854976214026659, + "grad_norm": 2.5278985500335693, + "learning_rate": 2.2975837219160663e-05, + "loss": 2.2633, + "step": 37600 + }, + { + "epoch": 0.8857331260892092, + "grad_norm": 2.081477165222168, + "learning_rate": 2.292873628185201e-05, + "loss": 2.0897, + "step": 37610 + }, + { + "epoch": 0.8859686307757524, + "grad_norm": 1.9763598442077637, + "learning_rate": 2.2881635344543357e-05, + "loss": 1.9917, + "step": 37620 + }, + { + "epoch": 0.8862041354622957, + "grad_norm": 2.136435031890869, + "learning_rate": 2.2834534407234707e-05, + "loss": 1.8923, + "step": 37630 + }, + { + "epoch": 0.8864396401488389, + "grad_norm": 2.021329402923584, + "learning_rate": 2.2787433469926054e-05, + "loss": 2.1512, + "step": 37640 + }, + { + "epoch": 0.8866751448353822, + "grad_norm": 2.108199119567871, + "learning_rate": 2.27403325326174e-05, + "loss": 2.1143, + "step": 37650 + }, + { + "epoch": 0.8869106495219254, + "grad_norm": 3.078599691390991, + "learning_rate": 2.2693231595308748e-05, + "loss": 2.0842, + "step": 37660 + }, + { + "epoch": 0.8871461542084688, + "grad_norm": 2.235524892807007, + "learning_rate": 2.2646130658000095e-05, + "loss": 1.945, + "step": 37670 + }, + { + "epoch": 0.887381658895012, + "grad_norm": 2.1809022426605225, + "learning_rate": 2.259902972069144e-05, + "loss": 1.9934, + "step": 37680 + }, + { + "epoch": 0.8876171635815553, + "grad_norm": 2.0931341648101807, + "learning_rate": 2.255192878338279e-05, + "loss": 2.0573, + "step": 37690 + }, + { + "epoch": 0.8878526682680985, + "grad_norm": 2.221452236175537, + "learning_rate": 2.250482784607414e-05, + "loss": 1.8676, + "step": 37700 + }, + { + "epoch": 0.8880881729546418, + "grad_norm": 2.1494803428649902, + "learning_rate": 2.2457726908765486e-05, + "loss": 2.0305, + "step": 37710 + }, + { + "epoch": 0.888323677641185, + "grad_norm": 2.0528335571289062, + "learning_rate": 2.2410625971456832e-05, + "loss": 1.8773, + "step": 37720 + }, + { + "epoch": 0.8885591823277283, + "grad_norm": 2.3743772506713867, + "learning_rate": 2.236352503414818e-05, + "loss": 2.1906, + "step": 37730 + }, + { + "epoch": 0.8887946870142716, + "grad_norm": 2.228240966796875, + "learning_rate": 2.2316424096839526e-05, + "loss": 1.9563, + "step": 37740 + }, + { + "epoch": 0.8890301917008149, + "grad_norm": 2.1875791549682617, + "learning_rate": 2.2269323159530876e-05, + "loss": 1.8761, + "step": 37750 + }, + { + "epoch": 0.8892656963873581, + "grad_norm": 3.2131400108337402, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.999, + "step": 37760 + }, + { + "epoch": 0.8895012010739014, + "grad_norm": 2.3823819160461426, + "learning_rate": 2.2175121284913574e-05, + "loss": 1.9455, + "step": 37770 + }, + { + "epoch": 0.8897367057604446, + "grad_norm": 2.280933141708374, + "learning_rate": 2.212802034760492e-05, + "loss": 1.9539, + "step": 37780 + }, + { + "epoch": 0.8899722104469879, + "grad_norm": 2.0002353191375732, + "learning_rate": 2.2080919410296267e-05, + "loss": 1.891, + "step": 37790 + }, + { + "epoch": 0.8902077151335311, + "grad_norm": 1.8115776777267456, + "learning_rate": 2.2033818472987614e-05, + "loss": 2.0144, + "step": 37800 + }, + { + "epoch": 0.8904432198200745, + "grad_norm": 2.2819020748138428, + "learning_rate": 2.198671753567896e-05, + "loss": 1.8285, + "step": 37810 + }, + { + "epoch": 0.8906787245066177, + "grad_norm": 2.1518003940582275, + "learning_rate": 2.1939616598370308e-05, + "loss": 1.9067, + "step": 37820 + }, + { + "epoch": 0.890914229193161, + "grad_norm": 2.508162498474121, + "learning_rate": 2.1892515661061655e-05, + "loss": 2.061, + "step": 37830 + }, + { + "epoch": 0.8911497338797042, + "grad_norm": 2.1272640228271484, + "learning_rate": 2.1845414723753005e-05, + "loss": 1.9815, + "step": 37840 + }, + { + "epoch": 0.8913852385662475, + "grad_norm": 3.0076818466186523, + "learning_rate": 2.1798313786444352e-05, + "loss": 2.0412, + "step": 37850 + }, + { + "epoch": 0.8916207432527907, + "grad_norm": 2.511852264404297, + "learning_rate": 2.17512128491357e-05, + "loss": 2.0381, + "step": 37860 + }, + { + "epoch": 0.891856247939334, + "grad_norm": 2.035937547683716, + "learning_rate": 2.1704111911827045e-05, + "loss": 1.9446, + "step": 37870 + }, + { + "epoch": 0.8920917526258773, + "grad_norm": 1.9787641763687134, + "learning_rate": 2.1657010974518392e-05, + "loss": 1.9567, + "step": 37880 + }, + { + "epoch": 0.8923272573124205, + "grad_norm": 2.1247246265411377, + "learning_rate": 2.160991003720974e-05, + "loss": 2.0504, + "step": 37890 + }, + { + "epoch": 0.8925627619989638, + "grad_norm": 2.798614740371704, + "learning_rate": 2.156280909990109e-05, + "loss": 1.9327, + "step": 37900 + }, + { + "epoch": 0.892798266685507, + "grad_norm": 2.846179246902466, + "learning_rate": 2.1515708162592436e-05, + "loss": 2.185, + "step": 37910 + }, + { + "epoch": 0.8930337713720503, + "grad_norm": 2.097317934036255, + "learning_rate": 2.1468607225283787e-05, + "loss": 1.8594, + "step": 37920 + }, + { + "epoch": 0.8932692760585935, + "grad_norm": 1.8242942094802856, + "learning_rate": 2.1421506287975133e-05, + "loss": 2.1531, + "step": 37930 + }, + { + "epoch": 0.8935047807451368, + "grad_norm": 2.1491048336029053, + "learning_rate": 2.137440535066648e-05, + "loss": 1.8671, + "step": 37940 + }, + { + "epoch": 0.89374028543168, + "grad_norm": 2.0003557205200195, + "learning_rate": 2.1327304413357827e-05, + "loss": 2.0186, + "step": 37950 + }, + { + "epoch": 0.8939757901182234, + "grad_norm": 2.0959033966064453, + "learning_rate": 2.1280203476049174e-05, + "loss": 2.1963, + "step": 37960 + }, + { + "epoch": 0.8942112948047666, + "grad_norm": 1.9621938467025757, + "learning_rate": 2.123310253874052e-05, + "loss": 2.3335, + "step": 37970 + }, + { + "epoch": 0.8944467994913099, + "grad_norm": 2.0647966861724854, + "learning_rate": 2.118600160143187e-05, + "loss": 2.1695, + "step": 37980 + }, + { + "epoch": 0.8946823041778531, + "grad_norm": 1.7961957454681396, + "learning_rate": 2.1138900664123218e-05, + "loss": 1.8972, + "step": 37990 + }, + { + "epoch": 0.8949178088643964, + "grad_norm": 2.169931411743164, + "learning_rate": 2.1091799726814565e-05, + "loss": 2.2119, + "step": 38000 + }, + { + "epoch": 0.8951533135509396, + "grad_norm": 2.0266969203948975, + "learning_rate": 2.104469878950591e-05, + "loss": 1.8381, + "step": 38010 + }, + { + "epoch": 0.895388818237483, + "grad_norm": 1.9402709007263184, + "learning_rate": 2.099759785219726e-05, + "loss": 2.0565, + "step": 38020 + }, + { + "epoch": 0.8956243229240262, + "grad_norm": 1.9579286575317383, + "learning_rate": 2.0950496914888605e-05, + "loss": 1.8636, + "step": 38030 + }, + { + "epoch": 0.8958598276105695, + "grad_norm": 2.2410125732421875, + "learning_rate": 2.0903395977579952e-05, + "loss": 2.0196, + "step": 38040 + }, + { + "epoch": 0.8960953322971127, + "grad_norm": 2.3416740894317627, + "learning_rate": 2.0856295040271302e-05, + "loss": 1.8177, + "step": 38050 + }, + { + "epoch": 0.896330836983656, + "grad_norm": 1.8137468099594116, + "learning_rate": 2.080919410296265e-05, + "loss": 1.9998, + "step": 38060 + }, + { + "epoch": 0.8965663416701992, + "grad_norm": 2.1465566158294678, + "learning_rate": 2.0762093165654e-05, + "loss": 1.9154, + "step": 38070 + }, + { + "epoch": 0.8968018463567425, + "grad_norm": 2.6183629035949707, + "learning_rate": 2.0714992228345346e-05, + "loss": 2.0203, + "step": 38080 + }, + { + "epoch": 0.8970373510432857, + "grad_norm": 3.3519785404205322, + "learning_rate": 2.0667891291036693e-05, + "loss": 2.0246, + "step": 38090 + }, + { + "epoch": 0.8972728557298291, + "grad_norm": 2.2263829708099365, + "learning_rate": 2.062079035372804e-05, + "loss": 2.158, + "step": 38100 + }, + { + "epoch": 0.8975083604163723, + "grad_norm": 2.257312536239624, + "learning_rate": 2.0573689416419387e-05, + "loss": 1.8202, + "step": 38110 + }, + { + "epoch": 0.8977438651029156, + "grad_norm": 3.83290696144104, + "learning_rate": 2.0526588479110737e-05, + "loss": 2.1171, + "step": 38120 + }, + { + "epoch": 0.8979793697894588, + "grad_norm": 1.9540990591049194, + "learning_rate": 2.0479487541802084e-05, + "loss": 2.0101, + "step": 38130 + }, + { + "epoch": 0.8982148744760021, + "grad_norm": 2.0866284370422363, + "learning_rate": 2.043238660449343e-05, + "loss": 1.8839, + "step": 38140 + }, + { + "epoch": 0.8984503791625453, + "grad_norm": 1.6635223627090454, + "learning_rate": 2.0385285667184778e-05, + "loss": 2.076, + "step": 38150 + }, + { + "epoch": 0.8986858838490887, + "grad_norm": 2.1118903160095215, + "learning_rate": 2.0338184729876125e-05, + "loss": 2.0658, + "step": 38160 + }, + { + "epoch": 0.8989213885356319, + "grad_norm": 1.8942980766296387, + "learning_rate": 2.029108379256747e-05, + "loss": 2.0003, + "step": 38170 + }, + { + "epoch": 0.8991568932221751, + "grad_norm": 2.1378395557403564, + "learning_rate": 2.024398285525882e-05, + "loss": 2.0265, + "step": 38180 + }, + { + "epoch": 0.8993923979087184, + "grad_norm": 2.5827856063842773, + "learning_rate": 2.019688191795017e-05, + "loss": 1.9765, + "step": 38190 + }, + { + "epoch": 0.8996279025952616, + "grad_norm": 2.509906530380249, + "learning_rate": 2.0149780980641515e-05, + "loss": 1.8868, + "step": 38200 + }, + { + "epoch": 0.8998634072818049, + "grad_norm": 2.7832489013671875, + "learning_rate": 2.0102680043332862e-05, + "loss": 1.9318, + "step": 38210 + }, + { + "epoch": 0.9000989119683481, + "grad_norm": 2.0925934314727783, + "learning_rate": 2.0055579106024213e-05, + "loss": 1.7913, + "step": 38220 + }, + { + "epoch": 0.9003344166548914, + "grad_norm": 2.3135039806365967, + "learning_rate": 2.000847816871556e-05, + "loss": 2.1221, + "step": 38230 + }, + { + "epoch": 0.9005699213414347, + "grad_norm": 2.136977195739746, + "learning_rate": 1.9961377231406906e-05, + "loss": 2.0687, + "step": 38240 + }, + { + "epoch": 0.900805426027978, + "grad_norm": 2.6171984672546387, + "learning_rate": 1.9914276294098253e-05, + "loss": 1.9261, + "step": 38250 + }, + { + "epoch": 0.9010409307145212, + "grad_norm": 3.6002085208892822, + "learning_rate": 1.9867175356789603e-05, + "loss": 2.0192, + "step": 38260 + }, + { + "epoch": 0.9012764354010645, + "grad_norm": 1.5538113117218018, + "learning_rate": 1.982007441948095e-05, + "loss": 2.016, + "step": 38270 + }, + { + "epoch": 0.9015119400876077, + "grad_norm": 2.0241286754608154, + "learning_rate": 1.9772973482172297e-05, + "loss": 1.8286, + "step": 38280 + }, + { + "epoch": 0.901747444774151, + "grad_norm": 1.9949616193771362, + "learning_rate": 1.9725872544863644e-05, + "loss": 1.8788, + "step": 38290 + }, + { + "epoch": 0.9019829494606942, + "grad_norm": 2.359792947769165, + "learning_rate": 1.967877160755499e-05, + "loss": 1.8023, + "step": 38300 + }, + { + "epoch": 0.9022184541472376, + "grad_norm": 2.121299982070923, + "learning_rate": 1.9631670670246338e-05, + "loss": 2.0192, + "step": 38310 + }, + { + "epoch": 0.9024539588337808, + "grad_norm": 2.227768659591675, + "learning_rate": 1.9584569732937684e-05, + "loss": 2.0309, + "step": 38320 + }, + { + "epoch": 0.9026894635203241, + "grad_norm": 2.0290210247039795, + "learning_rate": 1.9537468795629035e-05, + "loss": 2.1192, + "step": 38330 + }, + { + "epoch": 0.9029249682068673, + "grad_norm": 2.3042876720428467, + "learning_rate": 1.949036785832038e-05, + "loss": 2.1374, + "step": 38340 + }, + { + "epoch": 0.9031604728934106, + "grad_norm": 2.5515758991241455, + "learning_rate": 1.944326692101173e-05, + "loss": 2.2315, + "step": 38350 + }, + { + "epoch": 0.9033959775799538, + "grad_norm": 2.387439012527466, + "learning_rate": 1.9396165983703075e-05, + "loss": 2.1361, + "step": 38360 + }, + { + "epoch": 0.9036314822664971, + "grad_norm": 1.8408795595169067, + "learning_rate": 1.9349065046394426e-05, + "loss": 1.9147, + "step": 38370 + }, + { + "epoch": 0.9038669869530404, + "grad_norm": 1.412001132965088, + "learning_rate": 1.9301964109085772e-05, + "loss": 1.9571, + "step": 38380 + }, + { + "epoch": 0.9041024916395837, + "grad_norm": 1.8655277490615845, + "learning_rate": 1.925486317177712e-05, + "loss": 1.9687, + "step": 38390 + }, + { + "epoch": 0.9043379963261269, + "grad_norm": 2.4135653972625732, + "learning_rate": 1.9207762234468466e-05, + "loss": 2.0018, + "step": 38400 + }, + { + "epoch": 0.9045735010126702, + "grad_norm": 2.3086419105529785, + "learning_rate": 1.9160661297159816e-05, + "loss": 2.178, + "step": 38410 + }, + { + "epoch": 0.9048090056992134, + "grad_norm": 1.9893290996551514, + "learning_rate": 1.9113560359851163e-05, + "loss": 2.228, + "step": 38420 + }, + { + "epoch": 0.9050445103857567, + "grad_norm": 2.6201305389404297, + "learning_rate": 1.906645942254251e-05, + "loss": 2.059, + "step": 38430 + }, + { + "epoch": 0.9052800150722999, + "grad_norm": 2.3512847423553467, + "learning_rate": 1.9019358485233857e-05, + "loss": 2.0451, + "step": 38440 + }, + { + "epoch": 0.9055155197588431, + "grad_norm": 2.233941078186035, + "learning_rate": 1.8972257547925204e-05, + "loss": 1.9015, + "step": 38450 + }, + { + "epoch": 0.9057510244453865, + "grad_norm": 3.108917713165283, + "learning_rate": 1.892515661061655e-05, + "loss": 1.979, + "step": 38460 + }, + { + "epoch": 0.9059865291319297, + "grad_norm": 2.284076452255249, + "learning_rate": 1.8878055673307898e-05, + "loss": 1.9659, + "step": 38470 + }, + { + "epoch": 0.906222033818473, + "grad_norm": 2.1886470317840576, + "learning_rate": 1.8830954735999248e-05, + "loss": 2.0916, + "step": 38480 + }, + { + "epoch": 0.9064575385050162, + "grad_norm": 1.983109712600708, + "learning_rate": 1.8783853798690595e-05, + "loss": 2.0744, + "step": 38490 + }, + { + "epoch": 0.9066930431915595, + "grad_norm": 1.651294231414795, + "learning_rate": 1.873675286138194e-05, + "loss": 1.8051, + "step": 38500 + }, + { + "epoch": 0.9069285478781027, + "grad_norm": 2.303051710128784, + "learning_rate": 1.868965192407329e-05, + "loss": 1.7336, + "step": 38510 + }, + { + "epoch": 0.907164052564646, + "grad_norm": 1.448872447013855, + "learning_rate": 1.864255098676464e-05, + "loss": 1.9543, + "step": 38520 + }, + { + "epoch": 0.9073995572511893, + "grad_norm": 2.634509563446045, + "learning_rate": 1.8595450049455985e-05, + "loss": 2.0457, + "step": 38530 + }, + { + "epoch": 0.9076350619377326, + "grad_norm": 2.3734359741210938, + "learning_rate": 1.8548349112147332e-05, + "loss": 2.0179, + "step": 38540 + }, + { + "epoch": 0.9078705666242758, + "grad_norm": 2.052739381790161, + "learning_rate": 1.8501248174838683e-05, + "loss": 2.1796, + "step": 38550 + }, + { + "epoch": 0.9081060713108191, + "grad_norm": 2.4247121810913086, + "learning_rate": 1.845414723753003e-05, + "loss": 2.0285, + "step": 38560 + }, + { + "epoch": 0.9083415759973623, + "grad_norm": 2.3661036491394043, + "learning_rate": 1.8407046300221376e-05, + "loss": 2.1629, + "step": 38570 + }, + { + "epoch": 0.9085770806839056, + "grad_norm": 1.7307307720184326, + "learning_rate": 1.8359945362912723e-05, + "loss": 2.0182, + "step": 38580 + }, + { + "epoch": 0.9088125853704488, + "grad_norm": 2.318981885910034, + "learning_rate": 1.831284442560407e-05, + "loss": 2.031, + "step": 38590 + }, + { + "epoch": 0.9090480900569922, + "grad_norm": 2.0078811645507812, + "learning_rate": 1.8265743488295417e-05, + "loss": 1.8833, + "step": 38600 + }, + { + "epoch": 0.9092835947435354, + "grad_norm": 1.9469083547592163, + "learning_rate": 1.8218642550986764e-05, + "loss": 2.0904, + "step": 38610 + }, + { + "epoch": 0.9095190994300787, + "grad_norm": 2.0202481746673584, + "learning_rate": 1.8171541613678114e-05, + "loss": 1.8596, + "step": 38620 + }, + { + "epoch": 0.9097546041166219, + "grad_norm": 2.1429693698883057, + "learning_rate": 1.812444067636946e-05, + "loss": 1.774, + "step": 38630 + }, + { + "epoch": 0.9099901088031652, + "grad_norm": 2.0817933082580566, + "learning_rate": 1.8077339739060808e-05, + "loss": 2.0136, + "step": 38640 + }, + { + "epoch": 0.9102256134897084, + "grad_norm": 2.4810056686401367, + "learning_rate": 1.8030238801752154e-05, + "loss": 2.0795, + "step": 38650 + }, + { + "epoch": 0.9104611181762517, + "grad_norm": 2.75999116897583, + "learning_rate": 1.79831378644435e-05, + "loss": 2.0032, + "step": 38660 + }, + { + "epoch": 0.910696622862795, + "grad_norm": 2.1488144397735596, + "learning_rate": 1.793603692713485e-05, + "loss": 2.3905, + "step": 38670 + }, + { + "epoch": 0.9109321275493383, + "grad_norm": 2.028859853744507, + "learning_rate": 1.78889359898262e-05, + "loss": 2.0345, + "step": 38680 + }, + { + "epoch": 0.9111676322358815, + "grad_norm": 1.8787860870361328, + "learning_rate": 1.784183505251755e-05, + "loss": 1.9495, + "step": 38690 + }, + { + "epoch": 0.9114031369224248, + "grad_norm": 2.0410072803497314, + "learning_rate": 1.7794734115208896e-05, + "loss": 2.0186, + "step": 38700 + }, + { + "epoch": 0.911638641608968, + "grad_norm": 2.9783804416656494, + "learning_rate": 1.7747633177900242e-05, + "loss": 1.8578, + "step": 38710 + }, + { + "epoch": 0.9118741462955113, + "grad_norm": 2.104757785797119, + "learning_rate": 1.770053224059159e-05, + "loss": 1.8201, + "step": 38720 + }, + { + "epoch": 0.9121096509820545, + "grad_norm": 1.9207528829574585, + "learning_rate": 1.7653431303282936e-05, + "loss": 1.8914, + "step": 38730 + }, + { + "epoch": 0.9123451556685978, + "grad_norm": 2.920927047729492, + "learning_rate": 1.7606330365974283e-05, + "loss": 1.9848, + "step": 38740 + }, + { + "epoch": 0.9125806603551411, + "grad_norm": 2.510042190551758, + "learning_rate": 1.755922942866563e-05, + "loss": 2.1576, + "step": 38750 + }, + { + "epoch": 0.9128161650416843, + "grad_norm": 2.3450679779052734, + "learning_rate": 1.751212849135698e-05, + "loss": 2.2116, + "step": 38760 + }, + { + "epoch": 0.9130516697282276, + "grad_norm": 1.95484459400177, + "learning_rate": 1.7465027554048327e-05, + "loss": 1.9605, + "step": 38770 + }, + { + "epoch": 0.9132871744147708, + "grad_norm": 2.005983352661133, + "learning_rate": 1.7417926616739674e-05, + "loss": 2.0368, + "step": 38780 + }, + { + "epoch": 0.9135226791013141, + "grad_norm": 2.1589179039001465, + "learning_rate": 1.737082567943102e-05, + "loss": 1.906, + "step": 38790 + }, + { + "epoch": 0.9137581837878573, + "grad_norm": 1.929579257965088, + "learning_rate": 1.7323724742122367e-05, + "loss": 2.0555, + "step": 38800 + }, + { + "epoch": 0.9139936884744007, + "grad_norm": 2.2730865478515625, + "learning_rate": 1.7276623804813714e-05, + "loss": 2.1307, + "step": 38810 + }, + { + "epoch": 0.9142291931609439, + "grad_norm": 1.4058763980865479, + "learning_rate": 1.7229522867505065e-05, + "loss": 1.6729, + "step": 38820 + }, + { + "epoch": 0.9144646978474872, + "grad_norm": 2.0500447750091553, + "learning_rate": 1.718242193019641e-05, + "loss": 1.8054, + "step": 38830 + }, + { + "epoch": 0.9147002025340304, + "grad_norm": 2.374552011489868, + "learning_rate": 1.713532099288776e-05, + "loss": 1.8979, + "step": 38840 + }, + { + "epoch": 0.9149357072205737, + "grad_norm": 2.021498441696167, + "learning_rate": 1.708822005557911e-05, + "loss": 2.1172, + "step": 38850 + }, + { + "epoch": 0.9151712119071169, + "grad_norm": 2.2136051654815674, + "learning_rate": 1.7041119118270455e-05, + "loss": 2.0001, + "step": 38860 + }, + { + "epoch": 0.9154067165936602, + "grad_norm": 2.6992013454437256, + "learning_rate": 1.6994018180961802e-05, + "loss": 1.8245, + "step": 38870 + }, + { + "epoch": 0.9156422212802034, + "grad_norm": 2.2743868827819824, + "learning_rate": 1.694691724365315e-05, + "loss": 1.8511, + "step": 38880 + }, + { + "epoch": 0.9158777259667468, + "grad_norm": 2.4155523777008057, + "learning_rate": 1.6899816306344496e-05, + "loss": 1.9802, + "step": 38890 + }, + { + "epoch": 0.91611323065329, + "grad_norm": 1.777964472770691, + "learning_rate": 1.6852715369035846e-05, + "loss": 2.0244, + "step": 38900 + }, + { + "epoch": 0.9163487353398333, + "grad_norm": 2.0305655002593994, + "learning_rate": 1.6805614431727193e-05, + "loss": 1.9335, + "step": 38910 + }, + { + "epoch": 0.9165842400263765, + "grad_norm": 2.5534350872039795, + "learning_rate": 1.675851349441854e-05, + "loss": 2.0425, + "step": 38920 + }, + { + "epoch": 0.9168197447129198, + "grad_norm": 1.9499086141586304, + "learning_rate": 1.6711412557109887e-05, + "loss": 2.0643, + "step": 38930 + }, + { + "epoch": 0.917055249399463, + "grad_norm": 2.4477667808532715, + "learning_rate": 1.6664311619801234e-05, + "loss": 1.8278, + "step": 38940 + }, + { + "epoch": 0.9172907540860064, + "grad_norm": 2.7401411533355713, + "learning_rate": 1.661721068249258e-05, + "loss": 2.1896, + "step": 38950 + }, + { + "epoch": 0.9175262587725496, + "grad_norm": 2.0226340293884277, + "learning_rate": 1.6570109745183927e-05, + "loss": 1.9066, + "step": 38960 + }, + { + "epoch": 0.9177617634590929, + "grad_norm": 2.22379469871521, + "learning_rate": 1.6523008807875278e-05, + "loss": 2.0999, + "step": 38970 + }, + { + "epoch": 0.9179972681456361, + "grad_norm": 2.1687543392181396, + "learning_rate": 1.6475907870566624e-05, + "loss": 2.0633, + "step": 38980 + }, + { + "epoch": 0.9182327728321794, + "grad_norm": 3.160351514816284, + "learning_rate": 1.6428806933257975e-05, + "loss": 2.1236, + "step": 38990 + }, + { + "epoch": 0.9184682775187226, + "grad_norm": 2.306687355041504, + "learning_rate": 1.638170599594932e-05, + "loss": 2.1413, + "step": 39000 + }, + { + "epoch": 0.9187037822052659, + "grad_norm": 2.567837953567505, + "learning_rate": 1.633460505864067e-05, + "loss": 1.8291, + "step": 39010 + }, + { + "epoch": 0.9189392868918091, + "grad_norm": 2.327446937561035, + "learning_rate": 1.6287504121332015e-05, + "loss": 2.0234, + "step": 39020 + }, + { + "epoch": 0.9191747915783524, + "grad_norm": 2.052375555038452, + "learning_rate": 1.6240403184023362e-05, + "loss": 1.9689, + "step": 39030 + }, + { + "epoch": 0.9194102962648957, + "grad_norm": 1.6674343347549438, + "learning_rate": 1.6193302246714712e-05, + "loss": 1.9624, + "step": 39040 + }, + { + "epoch": 0.9196458009514389, + "grad_norm": 2.403968572616577, + "learning_rate": 1.614620130940606e-05, + "loss": 2.0186, + "step": 39050 + }, + { + "epoch": 0.9198813056379822, + "grad_norm": 2.198310613632202, + "learning_rate": 1.6099100372097406e-05, + "loss": 1.9116, + "step": 39060 + }, + { + "epoch": 0.9201168103245254, + "grad_norm": 1.7685959339141846, + "learning_rate": 1.6051999434788753e-05, + "loss": 1.7279, + "step": 39070 + }, + { + "epoch": 0.9203523150110687, + "grad_norm": 2.2193026542663574, + "learning_rate": 1.60048984974801e-05, + "loss": 2.064, + "step": 39080 + }, + { + "epoch": 0.9205878196976119, + "grad_norm": 2.313854455947876, + "learning_rate": 1.5957797560171447e-05, + "loss": 1.9564, + "step": 39090 + }, + { + "epoch": 0.9208233243841553, + "grad_norm": 3.075056314468384, + "learning_rate": 1.5910696622862793e-05, + "loss": 1.8532, + "step": 39100 + }, + { + "epoch": 0.9210588290706985, + "grad_norm": 2.181581974029541, + "learning_rate": 1.5863595685554144e-05, + "loss": 2.0414, + "step": 39110 + }, + { + "epoch": 0.9212943337572418, + "grad_norm": 1.9923847913742065, + "learning_rate": 1.581649474824549e-05, + "loss": 1.8529, + "step": 39120 + }, + { + "epoch": 0.921529838443785, + "grad_norm": 2.1557722091674805, + "learning_rate": 1.5769393810936837e-05, + "loss": 1.8607, + "step": 39130 + }, + { + "epoch": 0.9217653431303283, + "grad_norm": 1.7339023351669312, + "learning_rate": 1.5722292873628188e-05, + "loss": 1.9141, + "step": 39140 + }, + { + "epoch": 0.9220008478168715, + "grad_norm": 1.983236312866211, + "learning_rate": 1.5675191936319535e-05, + "loss": 2.1342, + "step": 39150 + }, + { + "epoch": 0.9222363525034148, + "grad_norm": 2.0628926753997803, + "learning_rate": 1.562809099901088e-05, + "loss": 2.0345, + "step": 39160 + }, + { + "epoch": 0.922471857189958, + "grad_norm": 2.1119203567504883, + "learning_rate": 1.5580990061702228e-05, + "loss": 2.033, + "step": 39170 + }, + { + "epoch": 0.9227073618765014, + "grad_norm": 2.4918603897094727, + "learning_rate": 1.553388912439358e-05, + "loss": 1.9512, + "step": 39180 + }, + { + "epoch": 0.9229428665630446, + "grad_norm": 1.8734686374664307, + "learning_rate": 1.5486788187084925e-05, + "loss": 1.8077, + "step": 39190 + }, + { + "epoch": 0.9231783712495879, + "grad_norm": 1.4692710638046265, + "learning_rate": 1.5439687249776272e-05, + "loss": 2.012, + "step": 39200 + }, + { + "epoch": 0.9234138759361311, + "grad_norm": 1.7556036710739136, + "learning_rate": 1.539258631246762e-05, + "loss": 2.0538, + "step": 39210 + }, + { + "epoch": 0.9236493806226744, + "grad_norm": 1.9275639057159424, + "learning_rate": 1.5345485375158966e-05, + "loss": 2.2427, + "step": 39220 + }, + { + "epoch": 0.9238848853092176, + "grad_norm": 2.047739267349243, + "learning_rate": 1.5298384437850313e-05, + "loss": 1.9106, + "step": 39230 + }, + { + "epoch": 0.924120389995761, + "grad_norm": 2.382171630859375, + "learning_rate": 1.525128350054166e-05, + "loss": 2.0102, + "step": 39240 + }, + { + "epoch": 0.9243558946823042, + "grad_norm": 1.8619623184204102, + "learning_rate": 1.520418256323301e-05, + "loss": 2.1269, + "step": 39250 + }, + { + "epoch": 0.9245913993688475, + "grad_norm": 2.0923447608947754, + "learning_rate": 1.5157081625924357e-05, + "loss": 2.0977, + "step": 39260 + }, + { + "epoch": 0.9248269040553907, + "grad_norm": 1.8551141023635864, + "learning_rate": 1.5109980688615705e-05, + "loss": 1.8932, + "step": 39270 + }, + { + "epoch": 0.925062408741934, + "grad_norm": 2.1643242835998535, + "learning_rate": 1.5062879751307052e-05, + "loss": 2.0233, + "step": 39280 + }, + { + "epoch": 0.9252979134284772, + "grad_norm": 1.629560947418213, + "learning_rate": 1.5015778813998399e-05, + "loss": 1.8223, + "step": 39290 + }, + { + "epoch": 0.9255334181150204, + "grad_norm": 1.8448771238327026, + "learning_rate": 1.4968677876689746e-05, + "loss": 2.0332, + "step": 39300 + }, + { + "epoch": 0.9257689228015638, + "grad_norm": 2.0944831371307373, + "learning_rate": 1.4921576939381093e-05, + "loss": 1.8888, + "step": 39310 + }, + { + "epoch": 0.926004427488107, + "grad_norm": 2.4087986946105957, + "learning_rate": 1.4874476002072443e-05, + "loss": 2.0431, + "step": 39320 + }, + { + "epoch": 0.9262399321746503, + "grad_norm": 2.948134422302246, + "learning_rate": 1.482737506476379e-05, + "loss": 1.7793, + "step": 39330 + }, + { + "epoch": 0.9264754368611935, + "grad_norm": 1.8383265733718872, + "learning_rate": 1.4780274127455138e-05, + "loss": 2.0095, + "step": 39340 + }, + { + "epoch": 0.9267109415477368, + "grad_norm": 1.9835010766983032, + "learning_rate": 1.4733173190146485e-05, + "loss": 2.0252, + "step": 39350 + }, + { + "epoch": 0.92694644623428, + "grad_norm": 2.6888811588287354, + "learning_rate": 1.4686072252837832e-05, + "loss": 2.1416, + "step": 39360 + }, + { + "epoch": 0.9271819509208233, + "grad_norm": 2.017975330352783, + "learning_rate": 1.4638971315529179e-05, + "loss": 1.879, + "step": 39370 + }, + { + "epoch": 0.9274174556073665, + "grad_norm": 2.2227847576141357, + "learning_rate": 1.4591870378220526e-05, + "loss": 2.1047, + "step": 39380 + }, + { + "epoch": 0.9276529602939099, + "grad_norm": 1.885798454284668, + "learning_rate": 1.4544769440911876e-05, + "loss": 1.9589, + "step": 39390 + }, + { + "epoch": 0.9278884649804531, + "grad_norm": 1.9954543113708496, + "learning_rate": 1.4497668503603223e-05, + "loss": 2.0047, + "step": 39400 + }, + { + "epoch": 0.9281239696669964, + "grad_norm": 2.5150036811828613, + "learning_rate": 1.445056756629457e-05, + "loss": 2.0924, + "step": 39410 + }, + { + "epoch": 0.9283594743535396, + "grad_norm": 2.4229278564453125, + "learning_rate": 1.4403466628985918e-05, + "loss": 2.015, + "step": 39420 + }, + { + "epoch": 0.9285949790400829, + "grad_norm": 2.048858404159546, + "learning_rate": 1.4356365691677265e-05, + "loss": 1.9736, + "step": 39430 + }, + { + "epoch": 0.9288304837266261, + "grad_norm": 2.147585868835449, + "learning_rate": 1.4309264754368612e-05, + "loss": 1.9895, + "step": 39440 + }, + { + "epoch": 0.9290659884131695, + "grad_norm": 1.9288424253463745, + "learning_rate": 1.4262163817059959e-05, + "loss": 2.0262, + "step": 39450 + }, + { + "epoch": 0.9293014930997127, + "grad_norm": 1.9802799224853516, + "learning_rate": 1.4215062879751306e-05, + "loss": 1.9161, + "step": 39460 + }, + { + "epoch": 0.929536997786256, + "grad_norm": 2.0962471961975098, + "learning_rate": 1.4167961942442656e-05, + "loss": 1.9732, + "step": 39470 + }, + { + "epoch": 0.9297725024727992, + "grad_norm": 2.103632688522339, + "learning_rate": 1.4120861005134003e-05, + "loss": 1.8515, + "step": 39480 + }, + { + "epoch": 0.9300080071593425, + "grad_norm": 2.154771327972412, + "learning_rate": 1.4073760067825351e-05, + "loss": 1.7922, + "step": 39490 + }, + { + "epoch": 0.9302435118458857, + "grad_norm": 2.3538451194763184, + "learning_rate": 1.4026659130516698e-05, + "loss": 2.0359, + "step": 39500 + }, + { + "epoch": 0.930479016532429, + "grad_norm": 2.0333411693573, + "learning_rate": 1.3979558193208045e-05, + "loss": 2.1012, + "step": 39510 + }, + { + "epoch": 0.9307145212189722, + "grad_norm": 2.59187388420105, + "learning_rate": 1.3932457255899392e-05, + "loss": 2.2647, + "step": 39520 + }, + { + "epoch": 0.9309500259055156, + "grad_norm": 2.3052356243133545, + "learning_rate": 1.3885356318590739e-05, + "loss": 2.0736, + "step": 39530 + }, + { + "epoch": 0.9311855305920588, + "grad_norm": 2.131145477294922, + "learning_rate": 1.3838255381282089e-05, + "loss": 2.0845, + "step": 39540 + }, + { + "epoch": 0.9314210352786021, + "grad_norm": 2.3522706031799316, + "learning_rate": 1.3791154443973436e-05, + "loss": 1.9605, + "step": 39550 + }, + { + "epoch": 0.9316565399651453, + "grad_norm": 2.067479372024536, + "learning_rate": 1.3744053506664783e-05, + "loss": 2.138, + "step": 39560 + }, + { + "epoch": 0.9318920446516886, + "grad_norm": 1.7099672555923462, + "learning_rate": 1.3696952569356131e-05, + "loss": 1.7891, + "step": 39570 + }, + { + "epoch": 0.9321275493382318, + "grad_norm": 2.0473127365112305, + "learning_rate": 1.3649851632047478e-05, + "loss": 2.1291, + "step": 39580 + }, + { + "epoch": 0.932363054024775, + "grad_norm": 2.2088100910186768, + "learning_rate": 1.3602750694738825e-05, + "loss": 1.9411, + "step": 39590 + }, + { + "epoch": 0.9325985587113184, + "grad_norm": 2.0423855781555176, + "learning_rate": 1.3555649757430172e-05, + "loss": 2.0322, + "step": 39600 + }, + { + "epoch": 0.9328340633978616, + "grad_norm": 3.2174267768859863, + "learning_rate": 1.3508548820121522e-05, + "loss": 2.0514, + "step": 39610 + }, + { + "epoch": 0.9330695680844049, + "grad_norm": 1.992333173751831, + "learning_rate": 1.3461447882812869e-05, + "loss": 1.7315, + "step": 39620 + }, + { + "epoch": 0.9333050727709481, + "grad_norm": 1.7157886028289795, + "learning_rate": 1.3414346945504216e-05, + "loss": 2.0937, + "step": 39630 + }, + { + "epoch": 0.9335405774574914, + "grad_norm": 2.721675157546997, + "learning_rate": 1.3367246008195564e-05, + "loss": 2.0178, + "step": 39640 + }, + { + "epoch": 0.9337760821440346, + "grad_norm": 2.151686429977417, + "learning_rate": 1.3320145070886911e-05, + "loss": 2.0767, + "step": 39650 + }, + { + "epoch": 0.934011586830578, + "grad_norm": 2.0822360515594482, + "learning_rate": 1.3273044133578258e-05, + "loss": 1.8964, + "step": 39660 + }, + { + "epoch": 0.9342470915171212, + "grad_norm": 2.0849602222442627, + "learning_rate": 1.3225943196269605e-05, + "loss": 2.074, + "step": 39670 + }, + { + "epoch": 0.9344825962036645, + "grad_norm": 1.9029971361160278, + "learning_rate": 1.3178842258960955e-05, + "loss": 1.7693, + "step": 39680 + }, + { + "epoch": 0.9347181008902077, + "grad_norm": 1.9549709558486938, + "learning_rate": 1.3131741321652302e-05, + "loss": 1.866, + "step": 39690 + }, + { + "epoch": 0.934953605576751, + "grad_norm": 3.2078988552093506, + "learning_rate": 1.3084640384343649e-05, + "loss": 2.0639, + "step": 39700 + }, + { + "epoch": 0.9351891102632942, + "grad_norm": 2.0642852783203125, + "learning_rate": 1.3037539447034996e-05, + "loss": 2.2178, + "step": 39710 + }, + { + "epoch": 0.9354246149498375, + "grad_norm": 2.6901581287384033, + "learning_rate": 1.2990438509726344e-05, + "loss": 1.9731, + "step": 39720 + }, + { + "epoch": 0.9356601196363807, + "grad_norm": 2.5783872604370117, + "learning_rate": 1.2943337572417691e-05, + "loss": 2.009, + "step": 39730 + }, + { + "epoch": 0.9358956243229241, + "grad_norm": 2.1882944107055664, + "learning_rate": 1.2896236635109038e-05, + "loss": 1.7685, + "step": 39740 + }, + { + "epoch": 0.9361311290094673, + "grad_norm": 2.3130943775177, + "learning_rate": 1.2849135697800388e-05, + "loss": 1.8615, + "step": 39750 + }, + { + "epoch": 0.9363666336960106, + "grad_norm": 2.8880362510681152, + "learning_rate": 1.2802034760491735e-05, + "loss": 1.6732, + "step": 39760 + }, + { + "epoch": 0.9366021383825538, + "grad_norm": 2.318843364715576, + "learning_rate": 1.2754933823183082e-05, + "loss": 1.7798, + "step": 39770 + }, + { + "epoch": 0.9368376430690971, + "grad_norm": 2.4206154346466064, + "learning_rate": 1.2707832885874429e-05, + "loss": 1.8153, + "step": 39780 + }, + { + "epoch": 0.9370731477556403, + "grad_norm": 2.1725382804870605, + "learning_rate": 1.2660731948565777e-05, + "loss": 1.8012, + "step": 39790 + }, + { + "epoch": 0.9373086524421836, + "grad_norm": 2.6982181072235107, + "learning_rate": 1.2613631011257124e-05, + "loss": 1.9887, + "step": 39800 + }, + { + "epoch": 0.9375441571287269, + "grad_norm": 2.1166913509368896, + "learning_rate": 1.2566530073948471e-05, + "loss": 1.793, + "step": 39810 + }, + { + "epoch": 0.9377796618152702, + "grad_norm": 2.248126745223999, + "learning_rate": 1.2519429136639821e-05, + "loss": 1.9725, + "step": 39820 + }, + { + "epoch": 0.9380151665018134, + "grad_norm": 2.303269386291504, + "learning_rate": 1.2472328199331166e-05, + "loss": 1.9097, + "step": 39830 + }, + { + "epoch": 0.9382506711883567, + "grad_norm": 2.438401460647583, + "learning_rate": 1.2425227262022515e-05, + "loss": 1.8483, + "step": 39840 + }, + { + "epoch": 0.9384861758748999, + "grad_norm": 1.9419487714767456, + "learning_rate": 1.2378126324713862e-05, + "loss": 1.9396, + "step": 39850 + }, + { + "epoch": 0.9387216805614432, + "grad_norm": 2.5072529315948486, + "learning_rate": 1.2331025387405209e-05, + "loss": 1.9367, + "step": 39860 + }, + { + "epoch": 0.9389571852479864, + "grad_norm": 2.39945387840271, + "learning_rate": 1.2283924450096557e-05, + "loss": 1.8372, + "step": 39870 + }, + { + "epoch": 0.9391926899345296, + "grad_norm": 2.786134958267212, + "learning_rate": 1.2236823512787906e-05, + "loss": 1.9416, + "step": 39880 + }, + { + "epoch": 0.939428194621073, + "grad_norm": 2.2066538333892822, + "learning_rate": 1.2189722575479253e-05, + "loss": 2.0354, + "step": 39890 + }, + { + "epoch": 0.9396636993076162, + "grad_norm": 2.544435739517212, + "learning_rate": 1.21426216381706e-05, + "loss": 2.005, + "step": 39900 + }, + { + "epoch": 0.9398992039941595, + "grad_norm": 2.4211769104003906, + "learning_rate": 1.2095520700861948e-05, + "loss": 1.986, + "step": 39910 + }, + { + "epoch": 0.9401347086807027, + "grad_norm": 1.885127305984497, + "learning_rate": 1.2048419763553295e-05, + "loss": 2.0807, + "step": 39920 + }, + { + "epoch": 0.940370213367246, + "grad_norm": 2.2707126140594482, + "learning_rate": 1.2001318826244642e-05, + "loss": 2.1215, + "step": 39930 + }, + { + "epoch": 0.9406057180537892, + "grad_norm": 2.53247332572937, + "learning_rate": 1.195421788893599e-05, + "loss": 2.2294, + "step": 39940 + }, + { + "epoch": 0.9408412227403326, + "grad_norm": 2.236628532409668, + "learning_rate": 1.1907116951627339e-05, + "loss": 2.0286, + "step": 39950 + }, + { + "epoch": 0.9410767274268758, + "grad_norm": 2.0977280139923096, + "learning_rate": 1.1860016014318686e-05, + "loss": 1.9303, + "step": 39960 + }, + { + "epoch": 0.9413122321134191, + "grad_norm": 1.9017722606658936, + "learning_rate": 1.1812915077010033e-05, + "loss": 1.9777, + "step": 39970 + }, + { + "epoch": 0.9415477367999623, + "grad_norm": 2.0843467712402344, + "learning_rate": 1.1765814139701381e-05, + "loss": 2.1042, + "step": 39980 + }, + { + "epoch": 0.9417832414865056, + "grad_norm": 2.946749687194824, + "learning_rate": 1.1718713202392728e-05, + "loss": 2.0549, + "step": 39990 + }, + { + "epoch": 0.9420187461730488, + "grad_norm": 2.1347475051879883, + "learning_rate": 1.1671612265084075e-05, + "loss": 1.8955, + "step": 40000 + }, + { + "epoch": 0.9422542508595921, + "grad_norm": 3.1052968502044678, + "learning_rate": 1.1624511327775423e-05, + "loss": 2.1096, + "step": 40010 + }, + { + "epoch": 0.9424897555461353, + "grad_norm": 2.082920551300049, + "learning_rate": 1.157741039046677e-05, + "loss": 1.8824, + "step": 40020 + }, + { + "epoch": 0.9427252602326787, + "grad_norm": 2.2279791831970215, + "learning_rate": 1.1530309453158119e-05, + "loss": 2.0551, + "step": 40030 + }, + { + "epoch": 0.9429607649192219, + "grad_norm": 2.04917311668396, + "learning_rate": 1.1483208515849466e-05, + "loss": 1.9149, + "step": 40040 + }, + { + "epoch": 0.9431962696057652, + "grad_norm": 1.9186104536056519, + "learning_rate": 1.1436107578540814e-05, + "loss": 1.6671, + "step": 40050 + }, + { + "epoch": 0.9434317742923084, + "grad_norm": 2.3989248275756836, + "learning_rate": 1.1389006641232161e-05, + "loss": 2.007, + "step": 40060 + }, + { + "epoch": 0.9436672789788517, + "grad_norm": 2.3239734172821045, + "learning_rate": 1.1341905703923508e-05, + "loss": 2.064, + "step": 40070 + }, + { + "epoch": 0.9439027836653949, + "grad_norm": 1.8902684450149536, + "learning_rate": 1.1294804766614857e-05, + "loss": 2.0352, + "step": 40080 + }, + { + "epoch": 0.9441382883519382, + "grad_norm": 6.378442764282227, + "learning_rate": 1.1247703829306203e-05, + "loss": 1.9671, + "step": 40090 + }, + { + "epoch": 0.9443737930384815, + "grad_norm": 1.7690014839172363, + "learning_rate": 1.1200602891997552e-05, + "loss": 1.741, + "step": 40100 + }, + { + "epoch": 0.9446092977250248, + "grad_norm": 2.239579677581787, + "learning_rate": 1.1153501954688899e-05, + "loss": 1.8264, + "step": 40110 + }, + { + "epoch": 0.944844802411568, + "grad_norm": 2.8370306491851807, + "learning_rate": 1.1106401017380247e-05, + "loss": 1.9176, + "step": 40120 + }, + { + "epoch": 0.9450803070981113, + "grad_norm": 2.293163537979126, + "learning_rate": 1.1059300080071594e-05, + "loss": 1.9672, + "step": 40130 + }, + { + "epoch": 0.9453158117846545, + "grad_norm": 1.929100751876831, + "learning_rate": 1.1012199142762941e-05, + "loss": 1.9704, + "step": 40140 + }, + { + "epoch": 0.9455513164711977, + "grad_norm": 2.4982588291168213, + "learning_rate": 1.096509820545429e-05, + "loss": 1.925, + "step": 40150 + }, + { + "epoch": 0.945786821157741, + "grad_norm": 1.8431040048599243, + "learning_rate": 1.0917997268145636e-05, + "loss": 2.0691, + "step": 40160 + }, + { + "epoch": 0.9460223258442843, + "grad_norm": 1.8826316595077515, + "learning_rate": 1.0870896330836983e-05, + "loss": 2.2448, + "step": 40170 + }, + { + "epoch": 0.9462578305308276, + "grad_norm": 1.895750880241394, + "learning_rate": 1.0823795393528332e-05, + "loss": 2.0793, + "step": 40180 + }, + { + "epoch": 0.9464933352173708, + "grad_norm": 2.6968462467193604, + "learning_rate": 1.077669445621968e-05, + "loss": 2.0888, + "step": 40190 + }, + { + "epoch": 0.9467288399039141, + "grad_norm": 1.8779919147491455, + "learning_rate": 1.0729593518911027e-05, + "loss": 1.7711, + "step": 40200 + }, + { + "epoch": 0.9469643445904573, + "grad_norm": 2.1035165786743164, + "learning_rate": 1.0682492581602374e-05, + "loss": 1.892, + "step": 40210 + }, + { + "epoch": 0.9471998492770006, + "grad_norm": 3.0900187492370605, + "learning_rate": 1.0635391644293723e-05, + "loss": 1.9634, + "step": 40220 + }, + { + "epoch": 0.9474353539635438, + "grad_norm": 2.7870278358459473, + "learning_rate": 1.058829070698507e-05, + "loss": 2.0701, + "step": 40230 + }, + { + "epoch": 0.9476708586500872, + "grad_norm": 2.2766451835632324, + "learning_rate": 1.0541189769676416e-05, + "loss": 1.9357, + "step": 40240 + }, + { + "epoch": 0.9479063633366304, + "grad_norm": 1.974714756011963, + "learning_rate": 1.0494088832367765e-05, + "loss": 1.8973, + "step": 40250 + }, + { + "epoch": 0.9481418680231737, + "grad_norm": 2.290137767791748, + "learning_rate": 1.0446987895059113e-05, + "loss": 2.0768, + "step": 40260 + }, + { + "epoch": 0.9483773727097169, + "grad_norm": 1.5795409679412842, + "learning_rate": 1.039988695775046e-05, + "loss": 1.8673, + "step": 40270 + }, + { + "epoch": 0.9486128773962602, + "grad_norm": 2.04624605178833, + "learning_rate": 1.0352786020441807e-05, + "loss": 2.0825, + "step": 40280 + }, + { + "epoch": 0.9488483820828034, + "grad_norm": 2.657115936279297, + "learning_rate": 1.0305685083133154e-05, + "loss": 1.9653, + "step": 40290 + }, + { + "epoch": 0.9490838867693467, + "grad_norm": 1.960942029953003, + "learning_rate": 1.0258584145824503e-05, + "loss": 2.1169, + "step": 40300 + }, + { + "epoch": 0.94931939145589, + "grad_norm": 2.3039536476135254, + "learning_rate": 1.021148320851585e-05, + "loss": 1.9415, + "step": 40310 + }, + { + "epoch": 0.9495548961424333, + "grad_norm": 1.9402930736541748, + "learning_rate": 1.0164382271207196e-05, + "loss": 1.9241, + "step": 40320 + }, + { + "epoch": 0.9497904008289765, + "grad_norm": 2.1505539417266846, + "learning_rate": 1.0117281333898545e-05, + "loss": 1.8727, + "step": 40330 + }, + { + "epoch": 0.9500259055155198, + "grad_norm": 2.9176759719848633, + "learning_rate": 1.0070180396589893e-05, + "loss": 1.8201, + "step": 40340 + }, + { + "epoch": 0.950261410202063, + "grad_norm": 1.9368810653686523, + "learning_rate": 1.002307945928124e-05, + "loss": 1.9419, + "step": 40350 + }, + { + "epoch": 0.9504969148886063, + "grad_norm": 2.8279006481170654, + "learning_rate": 9.975978521972587e-06, + "loss": 1.9023, + "step": 40360 + }, + { + "epoch": 0.9507324195751495, + "grad_norm": 1.806648850440979, + "learning_rate": 9.928877584663936e-06, + "loss": 2.0467, + "step": 40370 + }, + { + "epoch": 0.9509679242616929, + "grad_norm": 2.361128807067871, + "learning_rate": 9.881776647355283e-06, + "loss": 2.1681, + "step": 40380 + }, + { + "epoch": 0.9512034289482361, + "grad_norm": 2.5333521366119385, + "learning_rate": 9.83467571004663e-06, + "loss": 1.8089, + "step": 40390 + }, + { + "epoch": 0.9514389336347794, + "grad_norm": 2.0021731853485107, + "learning_rate": 9.787574772737978e-06, + "loss": 2.0067, + "step": 40400 + }, + { + "epoch": 0.9516744383213226, + "grad_norm": 1.8866750001907349, + "learning_rate": 9.740473835429327e-06, + "loss": 2.0784, + "step": 40410 + }, + { + "epoch": 0.9519099430078659, + "grad_norm": 2.2933566570281982, + "learning_rate": 9.693372898120673e-06, + "loss": 1.9378, + "step": 40420 + }, + { + "epoch": 0.9521454476944091, + "grad_norm": 2.6166129112243652, + "learning_rate": 9.64627196081202e-06, + "loss": 1.978, + "step": 40430 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 2.130694627761841, + "learning_rate": 9.599171023503369e-06, + "loss": 1.9178, + "step": 40440 + }, + { + "epoch": 0.9526164570674956, + "grad_norm": 2.019354820251465, + "learning_rate": 9.552070086194716e-06, + "loss": 1.9989, + "step": 40450 + }, + { + "epoch": 0.9528519617540389, + "grad_norm": 2.3226945400238037, + "learning_rate": 9.504969148886062e-06, + "loss": 1.8383, + "step": 40460 + }, + { + "epoch": 0.9530874664405822, + "grad_norm": 3.102097511291504, + "learning_rate": 9.457868211577411e-06, + "loss": 2.1891, + "step": 40470 + }, + { + "epoch": 0.9533229711271254, + "grad_norm": 2.6576321125030518, + "learning_rate": 9.410767274268758e-06, + "loss": 2.1156, + "step": 40480 + }, + { + "epoch": 0.9535584758136687, + "grad_norm": 2.235243797302246, + "learning_rate": 9.363666336960106e-06, + "loss": 2.0167, + "step": 40490 + }, + { + "epoch": 0.9537939805002119, + "grad_norm": 4.053108215332031, + "learning_rate": 9.316565399651453e-06, + "loss": 1.975, + "step": 40500 + }, + { + "epoch": 0.9540294851867552, + "grad_norm": 3.1133532524108887, + "learning_rate": 9.269464462342802e-06, + "loss": 2.1291, + "step": 40510 + }, + { + "epoch": 0.9542649898732984, + "grad_norm": 1.7396987676620483, + "learning_rate": 9.222363525034149e-06, + "loss": 2.0171, + "step": 40520 + }, + { + "epoch": 0.9545004945598418, + "grad_norm": 2.360766649246216, + "learning_rate": 9.175262587725496e-06, + "loss": 1.8423, + "step": 40530 + }, + { + "epoch": 0.954735999246385, + "grad_norm": 1.9602024555206299, + "learning_rate": 9.128161650416844e-06, + "loss": 1.8434, + "step": 40540 + }, + { + "epoch": 0.9549715039329283, + "grad_norm": 1.9240727424621582, + "learning_rate": 9.081060713108191e-06, + "loss": 2.0884, + "step": 40550 + }, + { + "epoch": 0.9552070086194715, + "grad_norm": 1.481034517288208, + "learning_rate": 9.03395977579954e-06, + "loss": 2.115, + "step": 40560 + }, + { + "epoch": 0.9554425133060148, + "grad_norm": 2.2302894592285156, + "learning_rate": 8.986858838490886e-06, + "loss": 1.9161, + "step": 40570 + }, + { + "epoch": 0.955678017992558, + "grad_norm": 2.155421733856201, + "learning_rate": 8.939757901182235e-06, + "loss": 2.0161, + "step": 40580 + }, + { + "epoch": 0.9559135226791013, + "grad_norm": 2.0020880699157715, + "learning_rate": 8.892656963873582e-06, + "loss": 1.7731, + "step": 40590 + }, + { + "epoch": 0.9561490273656446, + "grad_norm": 2.630326271057129, + "learning_rate": 8.845556026564929e-06, + "loss": 2.0842, + "step": 40600 + }, + { + "epoch": 0.9563845320521879, + "grad_norm": 2.0237209796905518, + "learning_rate": 8.798455089256277e-06, + "loss": 1.8431, + "step": 40610 + }, + { + "epoch": 0.9566200367387311, + "grad_norm": 2.217899799346924, + "learning_rate": 8.751354151947624e-06, + "loss": 1.9634, + "step": 40620 + }, + { + "epoch": 0.9568555414252744, + "grad_norm": 2.581691026687622, + "learning_rate": 8.704253214638971e-06, + "loss": 2.1512, + "step": 40630 + }, + { + "epoch": 0.9570910461118176, + "grad_norm": 1.7218579053878784, + "learning_rate": 8.65715227733032e-06, + "loss": 1.9162, + "step": 40640 + }, + { + "epoch": 0.9573265507983609, + "grad_norm": 2.563023328781128, + "learning_rate": 8.610051340021668e-06, + "loss": 2.0915, + "step": 40650 + }, + { + "epoch": 0.9575620554849041, + "grad_norm": 2.261781692504883, + "learning_rate": 8.562950402713015e-06, + "loss": 2.2342, + "step": 40660 + }, + { + "epoch": 0.9577975601714475, + "grad_norm": 2.305806875228882, + "learning_rate": 8.515849465404362e-06, + "loss": 2.096, + "step": 40670 + }, + { + "epoch": 0.9580330648579907, + "grad_norm": 2.0696635246276855, + "learning_rate": 8.46874852809571e-06, + "loss": 2.0407, + "step": 40680 + }, + { + "epoch": 0.958268569544534, + "grad_norm": 2.0293238162994385, + "learning_rate": 8.421647590787057e-06, + "loss": 1.9653, + "step": 40690 + }, + { + "epoch": 0.9585040742310772, + "grad_norm": 1.9397081136703491, + "learning_rate": 8.374546653478404e-06, + "loss": 2.1373, + "step": 40700 + }, + { + "epoch": 0.9587395789176204, + "grad_norm": 2.025054931640625, + "learning_rate": 8.32744571616975e-06, + "loss": 2.0634, + "step": 40710 + }, + { + "epoch": 0.9589750836041637, + "grad_norm": 2.3190510272979736, + "learning_rate": 8.2803447788611e-06, + "loss": 1.8368, + "step": 40720 + }, + { + "epoch": 0.9592105882907069, + "grad_norm": 2.0201010704040527, + "learning_rate": 8.233243841552448e-06, + "loss": 1.9479, + "step": 40730 + }, + { + "epoch": 0.9594460929772503, + "grad_norm": 2.5316407680511475, + "learning_rate": 8.186142904243795e-06, + "loss": 2.0755, + "step": 40740 + }, + { + "epoch": 0.9596815976637935, + "grad_norm": 1.6399452686309814, + "learning_rate": 8.139041966935143e-06, + "loss": 1.9441, + "step": 40750 + }, + { + "epoch": 0.9599171023503368, + "grad_norm": 2.183414936065674, + "learning_rate": 8.09194102962649e-06, + "loss": 1.8982, + "step": 40760 + }, + { + "epoch": 0.96015260703688, + "grad_norm": 2.358241319656372, + "learning_rate": 8.044840092317837e-06, + "loss": 2.098, + "step": 40770 + }, + { + "epoch": 0.9603881117234233, + "grad_norm": 2.2123827934265137, + "learning_rate": 7.997739155009184e-06, + "loss": 1.9308, + "step": 40780 + }, + { + "epoch": 0.9606236164099665, + "grad_norm": 2.3104357719421387, + "learning_rate": 7.950638217700532e-06, + "loss": 2.118, + "step": 40790 + }, + { + "epoch": 0.9608591210965098, + "grad_norm": 2.0428285598754883, + "learning_rate": 7.903537280391881e-06, + "loss": 2.0509, + "step": 40800 + }, + { + "epoch": 0.961094625783053, + "grad_norm": 1.9097564220428467, + "learning_rate": 7.856436343083228e-06, + "loss": 1.8856, + "step": 40810 + }, + { + "epoch": 0.9613301304695964, + "grad_norm": 2.2115395069122314, + "learning_rate": 7.809335405774575e-06, + "loss": 1.8819, + "step": 40820 + }, + { + "epoch": 0.9615656351561396, + "grad_norm": 1.5304468870162964, + "learning_rate": 7.762234468465923e-06, + "loss": 1.7923, + "step": 40830 + }, + { + "epoch": 0.9618011398426829, + "grad_norm": 2.6274614334106445, + "learning_rate": 7.71513353115727e-06, + "loss": 2.2177, + "step": 40840 + }, + { + "epoch": 0.9620366445292261, + "grad_norm": 2.414781093597412, + "learning_rate": 7.668032593848617e-06, + "loss": 2.0156, + "step": 40850 + }, + { + "epoch": 0.9622721492157694, + "grad_norm": 2.1377460956573486, + "learning_rate": 7.620931656539966e-06, + "loss": 2.076, + "step": 40860 + }, + { + "epoch": 0.9625076539023126, + "grad_norm": 1.8874634504318237, + "learning_rate": 7.573830719231313e-06, + "loss": 2.1652, + "step": 40870 + }, + { + "epoch": 0.962743158588856, + "grad_norm": 2.2391018867492676, + "learning_rate": 7.52672978192266e-06, + "loss": 1.9276, + "step": 40880 + }, + { + "epoch": 0.9629786632753992, + "grad_norm": 2.3317453861236572, + "learning_rate": 7.479628844614008e-06, + "loss": 2.1197, + "step": 40890 + }, + { + "epoch": 0.9632141679619425, + "grad_norm": 2.019249677658081, + "learning_rate": 7.432527907305356e-06, + "loss": 2.2113, + "step": 40900 + }, + { + "epoch": 0.9634496726484857, + "grad_norm": 2.417649984359741, + "learning_rate": 7.385426969996703e-06, + "loss": 1.7919, + "step": 40910 + }, + { + "epoch": 0.963685177335029, + "grad_norm": 1.9419840574264526, + "learning_rate": 7.33832603268805e-06, + "loss": 1.8799, + "step": 40920 + }, + { + "epoch": 0.9639206820215722, + "grad_norm": 2.6572020053863525, + "learning_rate": 7.291225095379399e-06, + "loss": 2.0273, + "step": 40930 + }, + { + "epoch": 0.9641561867081155, + "grad_norm": 1.8981895446777344, + "learning_rate": 7.244124158070746e-06, + "loss": 2.0674, + "step": 40940 + }, + { + "epoch": 0.9643916913946587, + "grad_norm": 2.1581413745880127, + "learning_rate": 7.197023220762093e-06, + "loss": 1.9784, + "step": 40950 + }, + { + "epoch": 0.9646271960812021, + "grad_norm": 1.9728518724441528, + "learning_rate": 7.149922283453441e-06, + "loss": 1.9786, + "step": 40960 + }, + { + "epoch": 0.9648627007677453, + "grad_norm": 1.9783124923706055, + "learning_rate": 7.102821346144789e-06, + "loss": 1.9053, + "step": 40970 + }, + { + "epoch": 0.9650982054542886, + "grad_norm": 1.9872087240219116, + "learning_rate": 7.055720408836136e-06, + "loss": 1.867, + "step": 40980 + }, + { + "epoch": 0.9653337101408318, + "grad_norm": 2.02413010597229, + "learning_rate": 7.008619471527483e-06, + "loss": 2.0919, + "step": 40990 + }, + { + "epoch": 0.965569214827375, + "grad_norm": 2.385622978210449, + "learning_rate": 6.961518534218832e-06, + "loss": 1.9556, + "step": 41000 + }, + { + "epoch": 0.9658047195139183, + "grad_norm": 1.7820403575897217, + "learning_rate": 6.914417596910179e-06, + "loss": 1.8152, + "step": 41010 + }, + { + "epoch": 0.9660402242004615, + "grad_norm": 2.060683250427246, + "learning_rate": 6.867316659601526e-06, + "loss": 2.093, + "step": 41020 + }, + { + "epoch": 0.9662757288870049, + "grad_norm": 4.170253753662109, + "learning_rate": 6.820215722292873e-06, + "loss": 2.0762, + "step": 41030 + }, + { + "epoch": 0.9665112335735481, + "grad_norm": 2.2294230461120605, + "learning_rate": 6.773114784984222e-06, + "loss": 1.8819, + "step": 41040 + }, + { + "epoch": 0.9667467382600914, + "grad_norm": 1.990818977355957, + "learning_rate": 6.726013847675569e-06, + "loss": 2.1329, + "step": 41050 + }, + { + "epoch": 0.9669822429466346, + "grad_norm": 1.956518530845642, + "learning_rate": 6.678912910366916e-06, + "loss": 2.1007, + "step": 41060 + }, + { + "epoch": 0.9672177476331779, + "grad_norm": 2.288674831390381, + "learning_rate": 6.631811973058265e-06, + "loss": 2.0032, + "step": 41070 + }, + { + "epoch": 0.9674532523197211, + "grad_norm": 2.927011728286743, + "learning_rate": 6.584711035749612e-06, + "loss": 2.0179, + "step": 41080 + }, + { + "epoch": 0.9676887570062644, + "grad_norm": 2.0494003295898438, + "learning_rate": 6.537610098440959e-06, + "loss": 2.1701, + "step": 41090 + }, + { + "epoch": 0.9679242616928077, + "grad_norm": 1.8264861106872559, + "learning_rate": 6.490509161132306e-06, + "loss": 2.0592, + "step": 41100 + }, + { + "epoch": 0.968159766379351, + "grad_norm": 2.0611650943756104, + "learning_rate": 6.443408223823655e-06, + "loss": 2.1585, + "step": 41110 + }, + { + "epoch": 0.9683952710658942, + "grad_norm": 2.264493703842163, + "learning_rate": 6.396307286515002e-06, + "loss": 1.9042, + "step": 41120 + }, + { + "epoch": 0.9686307757524375, + "grad_norm": 2.1851072311401367, + "learning_rate": 6.349206349206349e-06, + "loss": 1.8886, + "step": 41130 + }, + { + "epoch": 0.9688662804389807, + "grad_norm": 2.544363021850586, + "learning_rate": 6.302105411897698e-06, + "loss": 2.0393, + "step": 41140 + }, + { + "epoch": 0.969101785125524, + "grad_norm": 2.145167827606201, + "learning_rate": 6.255004474589045e-06, + "loss": 2.1154, + "step": 41150 + }, + { + "epoch": 0.9693372898120672, + "grad_norm": 2.0689308643341064, + "learning_rate": 6.207903537280392e-06, + "loss": 2.2185, + "step": 41160 + }, + { + "epoch": 0.9695727944986106, + "grad_norm": 1.8527134656906128, + "learning_rate": 6.16080259997174e-06, + "loss": 1.9672, + "step": 41170 + }, + { + "epoch": 0.9698082991851538, + "grad_norm": 1.9993964433670044, + "learning_rate": 6.113701662663087e-06, + "loss": 2.0707, + "step": 41180 + }, + { + "epoch": 0.9700438038716971, + "grad_norm": 2.341604709625244, + "learning_rate": 6.066600725354435e-06, + "loss": 1.7601, + "step": 41190 + }, + { + "epoch": 0.9702793085582403, + "grad_norm": 2.557854413986206, + "learning_rate": 6.019499788045782e-06, + "loss": 2.0317, + "step": 41200 + }, + { + "epoch": 0.9705148132447836, + "grad_norm": 2.4617807865142822, + "learning_rate": 5.97239885073713e-06, + "loss": 2.0245, + "step": 41210 + }, + { + "epoch": 0.9707503179313268, + "grad_norm": 2.309467077255249, + "learning_rate": 5.925297913428478e-06, + "loss": 2.0089, + "step": 41220 + }, + { + "epoch": 0.9709858226178701, + "grad_norm": 2.5535097122192383, + "learning_rate": 5.878196976119825e-06, + "loss": 2.198, + "step": 41230 + }, + { + "epoch": 0.9712213273044134, + "grad_norm": 2.2563560009002686, + "learning_rate": 5.831096038811173e-06, + "loss": 2.2526, + "step": 41240 + }, + { + "epoch": 0.9714568319909567, + "grad_norm": 2.1398732662200928, + "learning_rate": 5.78399510150252e-06, + "loss": 1.8543, + "step": 41250 + }, + { + "epoch": 0.9716923366774999, + "grad_norm": 1.8591779470443726, + "learning_rate": 5.736894164193868e-06, + "loss": 1.8086, + "step": 41260 + }, + { + "epoch": 0.9719278413640432, + "grad_norm": 1.8352787494659424, + "learning_rate": 5.6897932268852146e-06, + "loss": 1.9678, + "step": 41270 + }, + { + "epoch": 0.9721633460505864, + "grad_norm": 2.3946144580841064, + "learning_rate": 5.642692289576563e-06, + "loss": 2.0764, + "step": 41280 + }, + { + "epoch": 0.9723988507371296, + "grad_norm": 1.8993937969207764, + "learning_rate": 5.595591352267911e-06, + "loss": 2.0317, + "step": 41290 + }, + { + "epoch": 0.9726343554236729, + "grad_norm": 1.8595757484436035, + "learning_rate": 5.548490414959258e-06, + "loss": 2.11, + "step": 41300 + }, + { + "epoch": 0.9728698601102161, + "grad_norm": 2.409909725189209, + "learning_rate": 5.501389477650605e-06, + "loss": 2.1661, + "step": 41310 + }, + { + "epoch": 0.9731053647967595, + "grad_norm": 2.140245199203491, + "learning_rate": 5.454288540341953e-06, + "loss": 1.9142, + "step": 41320 + }, + { + "epoch": 0.9733408694833027, + "grad_norm": 1.9340143203735352, + "learning_rate": 5.407187603033301e-06, + "loss": 2.1344, + "step": 41330 + }, + { + "epoch": 0.973576374169846, + "grad_norm": 2.362156867980957, + "learning_rate": 5.364796759455514e-06, + "loss": 1.6951, + "step": 41340 + }, + { + "epoch": 0.9738118788563892, + "grad_norm": 1.840448260307312, + "learning_rate": 5.317695822146861e-06, + "loss": 1.9706, + "step": 41350 + }, + { + "epoch": 0.9740473835429325, + "grad_norm": 1.9519342184066772, + "learning_rate": 5.270594884838208e-06, + "loss": 2.0591, + "step": 41360 + }, + { + "epoch": 0.9742828882294757, + "grad_norm": 1.9666930437088013, + "learning_rate": 5.223493947529557e-06, + "loss": 2.0745, + "step": 41370 + }, + { + "epoch": 0.974518392916019, + "grad_norm": 2.266784429550171, + "learning_rate": 5.176393010220904e-06, + "loss": 2.0185, + "step": 41380 + }, + { + "epoch": 0.9747538976025623, + "grad_norm": 2.675137758255005, + "learning_rate": 5.129292072912251e-06, + "loss": 1.9687, + "step": 41390 + }, + { + "epoch": 0.9749894022891056, + "grad_norm": 2.1197988986968994, + "learning_rate": 5.082191135603598e-06, + "loss": 1.9073, + "step": 41400 + }, + { + "epoch": 0.9752249069756488, + "grad_norm": 2.488123655319214, + "learning_rate": 5.035090198294947e-06, + "loss": 1.9533, + "step": 41410 + }, + { + "epoch": 0.9754604116621921, + "grad_norm": 2.570176362991333, + "learning_rate": 4.9879892609862936e-06, + "loss": 2.0849, + "step": 41420 + }, + { + "epoch": 0.9756959163487353, + "grad_norm": 2.56309175491333, + "learning_rate": 4.940888323677641e-06, + "loss": 2.1509, + "step": 41430 + }, + { + "epoch": 0.9759314210352786, + "grad_norm": 1.848645806312561, + "learning_rate": 4.893787386368989e-06, + "loss": 2.0104, + "step": 41440 + }, + { + "epoch": 0.9761669257218218, + "grad_norm": 1.9632664918899536, + "learning_rate": 4.846686449060337e-06, + "loss": 1.8198, + "step": 41450 + }, + { + "epoch": 0.9764024304083652, + "grad_norm": 2.0473501682281494, + "learning_rate": 4.799585511751684e-06, + "loss": 2.0671, + "step": 41460 + }, + { + "epoch": 0.9766379350949084, + "grad_norm": 2.3164265155792236, + "learning_rate": 4.752484574443031e-06, + "loss": 2.0143, + "step": 41470 + }, + { + "epoch": 0.9768734397814517, + "grad_norm": 2.0020151138305664, + "learning_rate": 4.705383637134379e-06, + "loss": 1.9446, + "step": 41480 + }, + { + "epoch": 0.9771089444679949, + "grad_norm": 2.8933792114257812, + "learning_rate": 4.658282699825727e-06, + "loss": 2.1086, + "step": 41490 + }, + { + "epoch": 0.9773444491545382, + "grad_norm": 1.7804278135299683, + "learning_rate": 4.611181762517074e-06, + "loss": 1.8856, + "step": 41500 + }, + { + "epoch": 0.9775799538410814, + "grad_norm": 1.9400500059127808, + "learning_rate": 4.564080825208422e-06, + "loss": 1.9144, + "step": 41510 + }, + { + "epoch": 0.9778154585276247, + "grad_norm": 2.1494195461273193, + "learning_rate": 4.51697988789977e-06, + "loss": 1.944, + "step": 41520 + }, + { + "epoch": 0.978050963214168, + "grad_norm": 1.9199514389038086, + "learning_rate": 4.4698789505911175e-06, + "loss": 1.8664, + "step": 41530 + }, + { + "epoch": 0.9782864679007113, + "grad_norm": 2.4373841285705566, + "learning_rate": 4.422778013282464e-06, + "loss": 1.9979, + "step": 41540 + }, + { + "epoch": 0.9785219725872545, + "grad_norm": 1.8263568878173828, + "learning_rate": 4.375677075973812e-06, + "loss": 2.0598, + "step": 41550 + }, + { + "epoch": 0.9787574772737977, + "grad_norm": 2.0048863887786865, + "learning_rate": 4.32857613866516e-06, + "loss": 1.8226, + "step": 41560 + }, + { + "epoch": 0.978992981960341, + "grad_norm": 2.0698230266571045, + "learning_rate": 4.281475201356507e-06, + "loss": 1.8908, + "step": 41570 + }, + { + "epoch": 0.9792284866468842, + "grad_norm": 2.3763415813446045, + "learning_rate": 4.234374264047855e-06, + "loss": 2.1119, + "step": 41580 + }, + { + "epoch": 0.9794639913334275, + "grad_norm": 2.1369435787200928, + "learning_rate": 4.187273326739202e-06, + "loss": 1.7217, + "step": 41590 + }, + { + "epoch": 0.9796994960199708, + "grad_norm": 2.2070326805114746, + "learning_rate": 4.14017238943055e-06, + "loss": 1.8161, + "step": 41600 + }, + { + "epoch": 0.9799350007065141, + "grad_norm": 2.9065499305725098, + "learning_rate": 4.093071452121897e-06, + "loss": 2.0499, + "step": 41610 + }, + { + "epoch": 0.9801705053930573, + "grad_norm": 2.06013560295105, + "learning_rate": 4.045970514813245e-06, + "loss": 1.8116, + "step": 41620 + }, + { + "epoch": 0.9804060100796006, + "grad_norm": 1.7519750595092773, + "learning_rate": 3.998869577504592e-06, + "loss": 1.9417, + "step": 41630 + }, + { + "epoch": 0.9806415147661438, + "grad_norm": 2.1897356510162354, + "learning_rate": 3.9517686401959405e-06, + "loss": 1.9161, + "step": 41640 + }, + { + "epoch": 0.9808770194526871, + "grad_norm": 2.206636428833008, + "learning_rate": 3.904667702887287e-06, + "loss": 1.8641, + "step": 41650 + }, + { + "epoch": 0.9811125241392303, + "grad_norm": 2.4233133792877197, + "learning_rate": 3.857566765578635e-06, + "loss": 1.8659, + "step": 41660 + }, + { + "epoch": 0.9813480288257737, + "grad_norm": 2.3047232627868652, + "learning_rate": 3.810465828269983e-06, + "loss": 1.9812, + "step": 41670 + }, + { + "epoch": 0.9815835335123169, + "grad_norm": 2.138484001159668, + "learning_rate": 3.76336489096133e-06, + "loss": 1.9185, + "step": 41680 + }, + { + "epoch": 0.9818190381988602, + "grad_norm": 2.0543298721313477, + "learning_rate": 3.716263953652678e-06, + "loss": 1.943, + "step": 41690 + }, + { + "epoch": 0.9820545428854034, + "grad_norm": 2.8174407482147217, + "learning_rate": 3.669163016344025e-06, + "loss": 2.2036, + "step": 41700 + }, + { + "epoch": 0.9822900475719467, + "grad_norm": 1.9986740350723267, + "learning_rate": 3.622062079035373e-06, + "loss": 1.9317, + "step": 41710 + }, + { + "epoch": 0.9825255522584899, + "grad_norm": 2.6993019580841064, + "learning_rate": 3.5749611417267204e-06, + "loss": 2.1434, + "step": 41720 + }, + { + "epoch": 0.9827610569450332, + "grad_norm": 2.0281319618225098, + "learning_rate": 3.527860204418068e-06, + "loss": 1.7839, + "step": 41730 + }, + { + "epoch": 0.9829965616315764, + "grad_norm": 2.107598304748535, + "learning_rate": 3.480759267109416e-06, + "loss": 2.0329, + "step": 41740 + }, + { + "epoch": 0.9832320663181198, + "grad_norm": 2.4774956703186035, + "learning_rate": 3.433658329800763e-06, + "loss": 2.0114, + "step": 41750 + }, + { + "epoch": 0.983467571004663, + "grad_norm": 2.693948268890381, + "learning_rate": 3.386557392492111e-06, + "loss": 2.0334, + "step": 41760 + }, + { + "epoch": 0.9837030756912063, + "grad_norm": 2.2076406478881836, + "learning_rate": 3.339456455183458e-06, + "loss": 2.1783, + "step": 41770 + }, + { + "epoch": 0.9839385803777495, + "grad_norm": 2.1578211784362793, + "learning_rate": 3.292355517874806e-06, + "loss": 1.9784, + "step": 41780 + }, + { + "epoch": 0.9841740850642928, + "grad_norm": 2.496828079223633, + "learning_rate": 3.245254580566153e-06, + "loss": 1.8877, + "step": 41790 + }, + { + "epoch": 0.984409589750836, + "grad_norm": 2.085418701171875, + "learning_rate": 3.198153643257501e-06, + "loss": 1.8715, + "step": 41800 + }, + { + "epoch": 0.9846450944373794, + "grad_norm": 2.5021634101867676, + "learning_rate": 3.151052705948849e-06, + "loss": 1.8963, + "step": 41810 + }, + { + "epoch": 0.9848805991239226, + "grad_norm": 2.096789836883545, + "learning_rate": 3.103951768640196e-06, + "loss": 1.9181, + "step": 41820 + }, + { + "epoch": 0.9851161038104659, + "grad_norm": 2.0791969299316406, + "learning_rate": 3.0568508313315435e-06, + "loss": 1.8519, + "step": 41830 + }, + { + "epoch": 0.9853516084970091, + "grad_norm": 2.2031409740448, + "learning_rate": 3.009749894022891e-06, + "loss": 2.0038, + "step": 41840 + }, + { + "epoch": 0.9855871131835523, + "grad_norm": 1.7402150630950928, + "learning_rate": 2.962648956714239e-06, + "loss": 1.968, + "step": 41850 + }, + { + "epoch": 0.9858226178700956, + "grad_norm": 1.8533766269683838, + "learning_rate": 2.9155480194055866e-06, + "loss": 2.0505, + "step": 41860 + }, + { + "epoch": 0.9860581225566388, + "grad_norm": 4.589505672454834, + "learning_rate": 2.868447082096934e-06, + "loss": 2.0808, + "step": 41870 + }, + { + "epoch": 0.9862936272431821, + "grad_norm": 1.68135404586792, + "learning_rate": 2.8213461447882816e-06, + "loss": 2.0021, + "step": 41880 + }, + { + "epoch": 0.9865291319297254, + "grad_norm": 2.261399745941162, + "learning_rate": 2.774245207479629e-06, + "loss": 1.9306, + "step": 41890 + }, + { + "epoch": 0.9867646366162687, + "grad_norm": 1.9954190254211426, + "learning_rate": 2.7271442701709765e-06, + "loss": 2.1601, + "step": 41900 + }, + { + "epoch": 0.9870001413028119, + "grad_norm": 2.2311384677886963, + "learning_rate": 2.680043332862324e-06, + "loss": 1.9097, + "step": 41910 + }, + { + "epoch": 0.9872356459893552, + "grad_norm": 1.8550012111663818, + "learning_rate": 2.632942395553672e-06, + "loss": 2.1214, + "step": 41920 + }, + { + "epoch": 0.9874711506758984, + "grad_norm": 2.2542874813079834, + "learning_rate": 2.5858414582450192e-06, + "loss": 1.8132, + "step": 41930 + }, + { + "epoch": 0.9877066553624417, + "grad_norm": 2.300264596939087, + "learning_rate": 2.538740520936367e-06, + "loss": 2.0594, + "step": 41940 + }, + { + "epoch": 0.9879421600489849, + "grad_norm": 2.213343620300293, + "learning_rate": 2.4916395836277142e-06, + "loss": 2.2302, + "step": 41950 + }, + { + "epoch": 0.9881776647355283, + "grad_norm": 2.0097548961639404, + "learning_rate": 2.444538646319062e-06, + "loss": 2.0405, + "step": 41960 + }, + { + "epoch": 0.9884131694220715, + "grad_norm": 2.0618903636932373, + "learning_rate": 2.397437709010409e-06, + "loss": 1.9674, + "step": 41970 + }, + { + "epoch": 0.9886486741086148, + "grad_norm": 2.110823631286621, + "learning_rate": 2.350336771701757e-06, + "loss": 2.25, + "step": 41980 + }, + { + "epoch": 0.988884178795158, + "grad_norm": 2.1684558391571045, + "learning_rate": 2.303235834393104e-06, + "loss": 1.8892, + "step": 41990 + }, + { + "epoch": 0.9891196834817013, + "grad_norm": 1.705733060836792, + "learning_rate": 2.2561348970844523e-06, + "loss": 1.9763, + "step": 42000 + }, + { + "epoch": 0.9893551881682445, + "grad_norm": 1.6287431716918945, + "learning_rate": 2.2090339597757996e-06, + "loss": 1.9039, + "step": 42010 + }, + { + "epoch": 0.9895906928547878, + "grad_norm": 2.455014944076538, + "learning_rate": 2.1619330224671473e-06, + "loss": 2.0905, + "step": 42020 + }, + { + "epoch": 0.989826197541331, + "grad_norm": 1.862353801727295, + "learning_rate": 2.1148320851584946e-06, + "loss": 1.9751, + "step": 42030 + }, + { + "epoch": 0.9900617022278744, + "grad_norm": 2.33832049369812, + "learning_rate": 2.0677311478498423e-06, + "loss": 1.888, + "step": 42040 + }, + { + "epoch": 0.9902972069144176, + "grad_norm": 2.076282262802124, + "learning_rate": 2.02063021054119e-06, + "loss": 1.9962, + "step": 42050 + }, + { + "epoch": 0.9905327116009609, + "grad_norm": 2.276674509048462, + "learning_rate": 1.9735292732325373e-06, + "loss": 1.9948, + "step": 42060 + }, + { + "epoch": 0.9907682162875041, + "grad_norm": 2.114805221557617, + "learning_rate": 1.926428335923885e-06, + "loss": 2.2751, + "step": 42070 + }, + { + "epoch": 0.9910037209740474, + "grad_norm": 1.974159598350525, + "learning_rate": 1.8793273986152327e-06, + "loss": 2.0404, + "step": 42080 + }, + { + "epoch": 0.9912392256605906, + "grad_norm": 2.0039196014404297, + "learning_rate": 1.8322264613065802e-06, + "loss": 2.0517, + "step": 42090 + }, + { + "epoch": 0.991474730347134, + "grad_norm": 1.9755150079727173, + "learning_rate": 1.7851255239979276e-06, + "loss": 2.1042, + "step": 42100 + }, + { + "epoch": 0.9917102350336772, + "grad_norm": 2.443861722946167, + "learning_rate": 1.7380245866892751e-06, + "loss": 2.0485, + "step": 42110 + }, + { + "epoch": 0.9919457397202205, + "grad_norm": 2.066046953201294, + "learning_rate": 1.6909236493806226e-06, + "loss": 1.917, + "step": 42120 + }, + { + "epoch": 0.9921812444067637, + "grad_norm": 2.3617396354675293, + "learning_rate": 1.6438227120719701e-06, + "loss": 1.9634, + "step": 42130 + }, + { + "epoch": 0.9924167490933069, + "grad_norm": 2.2212107181549072, + "learning_rate": 1.5967217747633178e-06, + "loss": 2.0678, + "step": 42140 + }, + { + "epoch": 0.9926522537798502, + "grad_norm": 1.842693567276001, + "learning_rate": 1.5496208374546653e-06, + "loss": 2.0056, + "step": 42150 + }, + { + "epoch": 0.9928877584663934, + "grad_norm": 2.0039405822753906, + "learning_rate": 1.502519900146013e-06, + "loss": 2.053, + "step": 42160 + }, + { + "epoch": 0.9931232631529368, + "grad_norm": 2.3391904830932617, + "learning_rate": 1.4554189628373605e-06, + "loss": 2.02, + "step": 42170 + }, + { + "epoch": 0.99335876783948, + "grad_norm": 2.3647069931030273, + "learning_rate": 1.4083180255287082e-06, + "loss": 2.0623, + "step": 42180 + }, + { + "epoch": 0.9935942725260233, + "grad_norm": 2.896048069000244, + "learning_rate": 1.3612170882200557e-06, + "loss": 2.1104, + "step": 42190 + }, + { + "epoch": 0.9938297772125665, + "grad_norm": 1.7982397079467773, + "learning_rate": 1.3141161509114032e-06, + "loss": 1.8518, + "step": 42200 + }, + { + "epoch": 0.9940652818991098, + "grad_norm": 2.1359305381774902, + "learning_rate": 1.2670152136027509e-06, + "loss": 1.9961, + "step": 42210 + }, + { + "epoch": 0.994300786585653, + "grad_norm": 1.8116016387939453, + "learning_rate": 1.2199142762940984e-06, + "loss": 2.1378, + "step": 42220 + }, + { + "epoch": 0.9945362912721963, + "grad_norm": 2.0384104251861572, + "learning_rate": 1.1728133389854459e-06, + "loss": 2.0067, + "step": 42230 + }, + { + "epoch": 0.9947717959587395, + "grad_norm": 1.8383530378341675, + "learning_rate": 1.1257124016767934e-06, + "loss": 2.0092, + "step": 42240 + }, + { + "epoch": 0.9950073006452829, + "grad_norm": 2.583355665206909, + "learning_rate": 1.078611464368141e-06, + "loss": 1.9981, + "step": 42250 + }, + { + "epoch": 0.9952428053318261, + "grad_norm": 1.9402409791946411, + "learning_rate": 1.0315105270594886e-06, + "loss": 1.9412, + "step": 42260 + }, + { + "epoch": 0.9954783100183694, + "grad_norm": 2.124499797821045, + "learning_rate": 9.84409589750836e-07, + "loss": 2.0968, + "step": 42270 + }, + { + "epoch": 0.9957138147049126, + "grad_norm": 1.9460561275482178, + "learning_rate": 9.373086524421838e-07, + "loss": 1.9956, + "step": 42280 + }, + { + "epoch": 0.9959493193914559, + "grad_norm": 2.0415124893188477, + "learning_rate": 8.902077151335312e-07, + "loss": 1.9489, + "step": 42290 + }, + { + "epoch": 0.9961848240779991, + "grad_norm": 2.739859104156494, + "learning_rate": 8.431067778248787e-07, + "loss": 2.151, + "step": 42300 + }, + { + "epoch": 0.9964203287645425, + "grad_norm": 2.373000144958496, + "learning_rate": 7.960058405162262e-07, + "loss": 1.7111, + "step": 42310 + }, + { + "epoch": 0.9966558334510857, + "grad_norm": 2.3256282806396484, + "learning_rate": 7.489049032075738e-07, + "loss": 1.9377, + "step": 42320 + }, + { + "epoch": 0.996891338137629, + "grad_norm": 2.3008763790130615, + "learning_rate": 7.018039658989214e-07, + "loss": 1.9568, + "step": 42330 + }, + { + "epoch": 0.9971268428241722, + "grad_norm": 2.1482903957366943, + "learning_rate": 6.547030285902689e-07, + "loss": 1.9677, + "step": 42340 + }, + { + "epoch": 0.9973623475107155, + "grad_norm": 2.5959582328796387, + "learning_rate": 6.076020912816165e-07, + "loss": 2.0476, + "step": 42350 + }, + { + "epoch": 0.9975978521972587, + "grad_norm": 1.8489488363265991, + "learning_rate": 5.605011539729641e-07, + "loss": 1.8465, + "step": 42360 + }, + { + "epoch": 0.997833356883802, + "grad_norm": 2.9984638690948486, + "learning_rate": 5.134002166643117e-07, + "loss": 2.171, + "step": 42370 + }, + { + "epoch": 0.9980688615703452, + "grad_norm": 1.8435646295547485, + "learning_rate": 4.6629927935565915e-07, + "loss": 1.93, + "step": 42380 + }, + { + "epoch": 0.9983043662568886, + "grad_norm": 2.5411810874938965, + "learning_rate": 4.1919834204700674e-07, + "loss": 1.9748, + "step": 42390 + }, + { + "epoch": 0.9985398709434318, + "grad_norm": 2.412177085876465, + "learning_rate": 3.7209740473835434e-07, + "loss": 1.8883, + "step": 42400 + }, + { + "epoch": 0.998775375629975, + "grad_norm": 1.8273178339004517, + "learning_rate": 3.249964674297019e-07, + "loss": 1.9645, + "step": 42410 + }, + { + "epoch": 0.9990108803165183, + "grad_norm": 2.5287861824035645, + "learning_rate": 2.7789553012104943e-07, + "loss": 1.88, + "step": 42420 + }, + { + "epoch": 0.9992463850030615, + "grad_norm": 2.011352300643921, + "learning_rate": 2.3079459281239697e-07, + "loss": 2.0222, + "step": 42430 + }, + { + "epoch": 0.9994818896896048, + "grad_norm": 2.3615148067474365, + "learning_rate": 1.8369365550374452e-07, + "loss": 1.9643, + "step": 42440 + }, + { + "epoch": 0.999717394376148, + "grad_norm": 2.0360193252563477, + "learning_rate": 1.365927181950921e-07, + "loss": 1.8166, + "step": 42450 + }, + { + "epoch": 0.9999528990626914, + "grad_norm": 2.6184275150299072, + "learning_rate": 8.949178088643965e-08, + "loss": 1.9765, + "step": 42460 + } + ], + "logging_steps": 10, + "max_steps": 42462, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.784270210118451e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}