diff --git "a/model/trainer_state.json" "b/model/trainer_state.json" new file mode 100644--- /dev/null +++ "b/model/trainer_state.json" @@ -0,0 +1,90124 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8089093158918155, + "eval_steps": 500, + "global_step": 128700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.285231669711076e-05, + "grad_norm": 110.09329223632812, + "learning_rate": 1.4000000000000001e-06, + "loss": 13.7679, + "step": 10 + }, + { + "epoch": 0.0001257046333942215, + "grad_norm": 96.14527130126953, + "learning_rate": 3.4000000000000005e-06, + "loss": 12.6173, + "step": 20 + }, + { + "epoch": 0.00018855695009133228, + "grad_norm": 43.914146423339844, + "learning_rate": 5.400000000000001e-06, + "loss": 10.4195, + "step": 30 + }, + { + "epoch": 0.000251409266788443, + "grad_norm": 24.050556182861328, + "learning_rate": 7.4e-06, + "loss": 9.2707, + "step": 40 + }, + { + "epoch": 0.0003142615834855538, + "grad_norm": 17.52002716064453, + "learning_rate": 9.4e-06, + "loss": 8.893, + "step": 50 + }, + { + "epoch": 0.00037711390018266457, + "grad_norm": 28.04815673828125, + "learning_rate": 1.14e-05, + "loss": 8.4257, + "step": 60 + }, + { + "epoch": 0.0004399662168797753, + "grad_norm": 16.464113235473633, + "learning_rate": 1.3400000000000002e-05, + "loss": 7.9375, + "step": 70 + }, + { + "epoch": 0.000502818533576886, + "grad_norm": 14.842829704284668, + "learning_rate": 1.54e-05, + "loss": 7.7756, + "step": 80 + }, + { + "epoch": 0.0005656708502739968, + "grad_norm": 21.78369903564453, + "learning_rate": 1.7400000000000003e-05, + "loss": 7.4797, + "step": 90 + }, + { + "epoch": 0.0006285231669711076, + "grad_norm": 15.628141403198242, + "learning_rate": 1.94e-05, + "loss": 7.532, + "step": 100 + }, + { + "epoch": 0.0006913754836682184, + "grad_norm": 15.642536163330078, + "learning_rate": 1.9999706629338745e-05, 
+ "loss": 7.2184, + "step": 110 + }, + { + "epoch": 0.0007542278003653291, + "grad_norm": 15.071418762207031, + "learning_rate": 1.9999287528394092e-05, + "loss": 6.8758, + "step": 120 + }, + { + "epoch": 0.0008170801170624398, + "grad_norm": 13.564769744873047, + "learning_rate": 1.9998868427449436e-05, + "loss": 6.9533, + "step": 130 + }, + { + "epoch": 0.0008799324337595506, + "grad_norm": 12.832381248474121, + "learning_rate": 1.9998449326504783e-05, + "loss": 6.7193, + "step": 140 + }, + { + "epoch": 0.0009427847504566613, + "grad_norm": 12.85204792022705, + "learning_rate": 1.999803022556013e-05, + "loss": 6.6503, + "step": 150 + }, + { + "epoch": 0.001005637067153772, + "grad_norm": 10.495243072509766, + "learning_rate": 1.9997611124615477e-05, + "loss": 6.5755, + "step": 160 + }, + { + "epoch": 0.0010684893838508829, + "grad_norm": 12.84036636352539, + "learning_rate": 1.9997192023670824e-05, + "loss": 6.3054, + "step": 170 + }, + { + "epoch": 0.0011313417005479936, + "grad_norm": 10.062732696533203, + "learning_rate": 1.9996772922726168e-05, + "loss": 6.4009, + "step": 180 + }, + { + "epoch": 0.0011941940172451044, + "grad_norm": 10.66489315032959, + "learning_rate": 1.9996353821781515e-05, + "loss": 5.9644, + "step": 190 + }, + { + "epoch": 0.0012570463339422152, + "grad_norm": 10.236940383911133, + "learning_rate": 1.9995934720836862e-05, + "loss": 6.0764, + "step": 200 + }, + { + "epoch": 0.001319898650639326, + "grad_norm": 11.32441520690918, + "learning_rate": 1.999551561989221e-05, + "loss": 5.8501, + "step": 210 + }, + { + "epoch": 0.0013827509673364367, + "grad_norm": 10.349939346313477, + "learning_rate": 1.9995096518947556e-05, + "loss": 5.9544, + "step": 220 + }, + { + "epoch": 0.0014456032840335475, + "grad_norm": 10.603028297424316, + "learning_rate": 1.99946774180029e-05, + "loss": 5.7299, + "step": 230 + }, + { + "epoch": 0.0015084556007306583, + "grad_norm": 11.104246139526367, + "learning_rate": 1.9994258317058247e-05, + "loss": 5.708, + 
"step": 240 + }, + { + "epoch": 0.001571307917427769, + "grad_norm": 9.362375259399414, + "learning_rate": 1.9993839216113594e-05, + "loss": 5.6093, + "step": 250 + }, + { + "epoch": 0.0016341602341248796, + "grad_norm": 9.127053260803223, + "learning_rate": 1.999342011516894e-05, + "loss": 5.5071, + "step": 260 + }, + { + "epoch": 0.0016970125508219904, + "grad_norm": 8.870572090148926, + "learning_rate": 1.9993001014224288e-05, + "loss": 5.4138, + "step": 270 + }, + { + "epoch": 0.0017598648675191011, + "grad_norm": 10.999969482421875, + "learning_rate": 1.9992581913279635e-05, + "loss": 5.4419, + "step": 280 + }, + { + "epoch": 0.001822717184216212, + "grad_norm": 8.93590259552002, + "learning_rate": 1.9992162812334982e-05, + "loss": 5.4908, + "step": 290 + }, + { + "epoch": 0.0018855695009133227, + "grad_norm": 8.729954719543457, + "learning_rate": 1.999174371139033e-05, + "loss": 5.2426, + "step": 300 + }, + { + "epoch": 0.0019484218176104334, + "grad_norm": 10.1044340133667, + "learning_rate": 1.9991324610445673e-05, + "loss": 5.2254, + "step": 310 + }, + { + "epoch": 0.002011274134307544, + "grad_norm": 9.7942533493042, + "learning_rate": 1.999090550950102e-05, + "loss": 5.0436, + "step": 320 + }, + { + "epoch": 0.002074126451004655, + "grad_norm": 10.541589736938477, + "learning_rate": 1.9990486408556367e-05, + "loss": 4.803, + "step": 330 + }, + { + "epoch": 0.0021369787677017658, + "grad_norm": 9.875314712524414, + "learning_rate": 1.9990067307611714e-05, + "loss": 4.9165, + "step": 340 + }, + { + "epoch": 0.0021998310843988765, + "grad_norm": 9.487140655517578, + "learning_rate": 1.998964820666706e-05, + "loss": 5.1093, + "step": 350 + }, + { + "epoch": 0.0022626834010959873, + "grad_norm": 9.752985954284668, + "learning_rate": 1.9989229105722405e-05, + "loss": 5.0373, + "step": 360 + }, + { + "epoch": 0.002325535717793098, + "grad_norm": 10.569908142089844, + "learning_rate": 1.9988810004777752e-05, + "loss": 4.9041, + "step": 370 + }, + { + "epoch": 
0.002388388034490209, + "grad_norm": 8.912810325622559, + "learning_rate": 1.99883909038331e-05, + "loss": 4.7678, + "step": 380 + }, + { + "epoch": 0.0024512403511873196, + "grad_norm": 9.076282501220703, + "learning_rate": 1.9987971802888446e-05, + "loss": 4.6359, + "step": 390 + }, + { + "epoch": 0.0025140926678844304, + "grad_norm": 10.559319496154785, + "learning_rate": 1.998755270194379e-05, + "loss": 4.5624, + "step": 400 + }, + { + "epoch": 0.002576944984581541, + "grad_norm": 13.224327087402344, + "learning_rate": 1.9987133600999137e-05, + "loss": 4.6897, + "step": 410 + }, + { + "epoch": 0.002639797301278652, + "grad_norm": 10.025303840637207, + "learning_rate": 1.9986714500054484e-05, + "loss": 4.6429, + "step": 420 + }, + { + "epoch": 0.0027026496179757627, + "grad_norm": 9.266767501831055, + "learning_rate": 1.998629539910983e-05, + "loss": 4.6152, + "step": 430 + }, + { + "epoch": 0.0027655019346728735, + "grad_norm": 9.802778244018555, + "learning_rate": 1.9985876298165178e-05, + "loss": 4.8169, + "step": 440 + }, + { + "epoch": 0.0028283542513699842, + "grad_norm": 9.405644416809082, + "learning_rate": 1.9985457197220525e-05, + "loss": 4.3837, + "step": 450 + }, + { + "epoch": 0.002891206568067095, + "grad_norm": 11.649224281311035, + "learning_rate": 1.9985038096275872e-05, + "loss": 4.5792, + "step": 460 + }, + { + "epoch": 0.0029540588847642058, + "grad_norm": 11.123177528381348, + "learning_rate": 1.998461899533122e-05, + "loss": 4.5444, + "step": 470 + }, + { + "epoch": 0.0030169112014613165, + "grad_norm": 10.083602905273438, + "learning_rate": 1.9984199894386563e-05, + "loss": 4.5796, + "step": 480 + }, + { + "epoch": 0.0030797635181584273, + "grad_norm": 9.665024757385254, + "learning_rate": 1.998378079344191e-05, + "loss": 4.5045, + "step": 490 + }, + { + "epoch": 0.003142615834855538, + "grad_norm": 10.009722709655762, + "learning_rate": 1.9983361692497257e-05, + "loss": 4.5415, + "step": 500 + }, + { + "epoch": 0.003205468151552649, + 
"grad_norm": 8.68890380859375, + "learning_rate": 1.9982942591552604e-05, + "loss": 4.3695, + "step": 510 + }, + { + "epoch": 0.003268320468249759, + "grad_norm": 9.695719718933105, + "learning_rate": 1.998252349060795e-05, + "loss": 4.4293, + "step": 520 + }, + { + "epoch": 0.00333117278494687, + "grad_norm": 11.342803001403809, + "learning_rate": 1.9982104389663295e-05, + "loss": 4.6023, + "step": 530 + }, + { + "epoch": 0.0033940251016439807, + "grad_norm": 9.516191482543945, + "learning_rate": 1.9981685288718642e-05, + "loss": 4.3556, + "step": 540 + }, + { + "epoch": 0.0034568774183410915, + "grad_norm": 9.393511772155762, + "learning_rate": 1.998126618777399e-05, + "loss": 4.4067, + "step": 550 + }, + { + "epoch": 0.0035197297350382023, + "grad_norm": 10.67137336730957, + "learning_rate": 1.9980847086829336e-05, + "loss": 4.375, + "step": 560 + }, + { + "epoch": 0.003582582051735313, + "grad_norm": 9.595892906188965, + "learning_rate": 1.9980427985884683e-05, + "loss": 4.5256, + "step": 570 + }, + { + "epoch": 0.003645434368432424, + "grad_norm": 8.754829406738281, + "learning_rate": 1.9980008884940027e-05, + "loss": 4.4646, + "step": 580 + }, + { + "epoch": 0.0037082866851295346, + "grad_norm": 11.057015419006348, + "learning_rate": 1.9979589783995374e-05, + "loss": 4.5457, + "step": 590 + }, + { + "epoch": 0.0037711390018266453, + "grad_norm": 10.04822826385498, + "learning_rate": 1.997917068305072e-05, + "loss": 4.2818, + "step": 600 + }, + { + "epoch": 0.003833991318523756, + "grad_norm": 10.355257034301758, + "learning_rate": 1.9978793492200532e-05, + "loss": 4.309, + "step": 610 + }, + { + "epoch": 0.003896843635220867, + "grad_norm": 8.481429100036621, + "learning_rate": 1.997837439125588e-05, + "loss": 4.1791, + "step": 620 + }, + { + "epoch": 0.003959695951917978, + "grad_norm": 8.190858840942383, + "learning_rate": 1.9977955290311226e-05, + "loss": 4.1775, + "step": 630 + }, + { + "epoch": 0.004022548268615088, + "grad_norm": 9.911380767822266, + 
"learning_rate": 1.9977536189366573e-05, + "loss": 4.3927, + "step": 640 + }, + { + "epoch": 0.004085400585312199, + "grad_norm": 9.007355690002441, + "learning_rate": 1.9977117088421917e-05, + "loss": 4.2628, + "step": 650 + }, + { + "epoch": 0.00414825290200931, + "grad_norm": 8.924678802490234, + "learning_rate": 1.9976697987477264e-05, + "loss": 4.2525, + "step": 660 + }, + { + "epoch": 0.004211105218706421, + "grad_norm": 11.19166088104248, + "learning_rate": 1.997627888653261e-05, + "loss": 4.1394, + "step": 670 + }, + { + "epoch": 0.0042739575354035315, + "grad_norm": 10.57885456085205, + "learning_rate": 1.997585978558796e-05, + "loss": 4.2921, + "step": 680 + }, + { + "epoch": 0.004336809852100642, + "grad_norm": 9.213665008544922, + "learning_rate": 1.9975440684643305e-05, + "loss": 4.2321, + "step": 690 + }, + { + "epoch": 0.004399662168797753, + "grad_norm": 9.66899299621582, + "learning_rate": 1.997502158369865e-05, + "loss": 4.1874, + "step": 700 + }, + { + "epoch": 0.004462514485494864, + "grad_norm": 9.520936012268066, + "learning_rate": 1.9974602482753996e-05, + "loss": 4.2696, + "step": 710 + }, + { + "epoch": 0.004525366802191975, + "grad_norm": 10.274086952209473, + "learning_rate": 1.9974183381809343e-05, + "loss": 4.2235, + "step": 720 + }, + { + "epoch": 0.004588219118889085, + "grad_norm": 8.612703323364258, + "learning_rate": 1.997376428086469e-05, + "loss": 4.4711, + "step": 730 + }, + { + "epoch": 0.004651071435586196, + "grad_norm": 10.191298484802246, + "learning_rate": 1.9973345179920037e-05, + "loss": 4.0802, + "step": 740 + }, + { + "epoch": 0.004713923752283307, + "grad_norm": 17.114593505859375, + "learning_rate": 1.9972926078975384e-05, + "loss": 4.1454, + "step": 750 + }, + { + "epoch": 0.004776776068980418, + "grad_norm": 8.410555839538574, + "learning_rate": 1.997250697803073e-05, + "loss": 4.1697, + "step": 760 + }, + { + "epoch": 0.0048396283856775284, + "grad_norm": 9.421134948730469, + "learning_rate": 
1.997208787708608e-05, + "loss": 4.2371, + "step": 770 + }, + { + "epoch": 0.004902480702374639, + "grad_norm": 8.89936637878418, + "learning_rate": 1.9971668776141426e-05, + "loss": 4.1051, + "step": 780 + }, + { + "epoch": 0.00496533301907175, + "grad_norm": 10.175434112548828, + "learning_rate": 1.997124967519677e-05, + "loss": 4.3075, + "step": 790 + }, + { + "epoch": 0.005028185335768861, + "grad_norm": 8.717267990112305, + "learning_rate": 1.9970830574252116e-05, + "loss": 4.2175, + "step": 800 + }, + { + "epoch": 0.0050910376524659715, + "grad_norm": 8.255484580993652, + "learning_rate": 1.9970411473307463e-05, + "loss": 4.3324, + "step": 810 + }, + { + "epoch": 0.005153889969163082, + "grad_norm": 8.949331283569336, + "learning_rate": 1.996999237236281e-05, + "loss": 3.9377, + "step": 820 + }, + { + "epoch": 0.005216742285860193, + "grad_norm": 7.299438953399658, + "learning_rate": 1.9969573271418154e-05, + "loss": 3.894, + "step": 830 + }, + { + "epoch": 0.005279594602557304, + "grad_norm": 8.529239654541016, + "learning_rate": 1.99691541704735e-05, + "loss": 4.0175, + "step": 840 + }, + { + "epoch": 0.005342446919254415, + "grad_norm": 8.880363464355469, + "learning_rate": 1.996873506952885e-05, + "loss": 4.0875, + "step": 850 + }, + { + "epoch": 0.005405299235951525, + "grad_norm": 7.495782375335693, + "learning_rate": 1.9968315968584195e-05, + "loss": 3.9243, + "step": 860 + }, + { + "epoch": 0.005468151552648636, + "grad_norm": 11.282350540161133, + "learning_rate": 1.9967896867639543e-05, + "loss": 4.1782, + "step": 870 + }, + { + "epoch": 0.005531003869345747, + "grad_norm": 8.926542282104492, + "learning_rate": 1.9967477766694886e-05, + "loss": 4.0449, + "step": 880 + }, + { + "epoch": 0.005593856186042858, + "grad_norm": 8.818291664123535, + "learning_rate": 1.9967058665750233e-05, + "loss": 4.0877, + "step": 890 + }, + { + "epoch": 0.0056567085027399685, + "grad_norm": 9.448590278625488, + "learning_rate": 1.996663956480558e-05, + "loss": 4.1351, 
+ "step": 900 + }, + { + "epoch": 0.005719560819437079, + "grad_norm": 9.017233848571777, + "learning_rate": 1.9966220463860927e-05, + "loss": 4.0607, + "step": 910 + }, + { + "epoch": 0.00578241313613419, + "grad_norm": 9.787857055664062, + "learning_rate": 1.9965801362916274e-05, + "loss": 4.0828, + "step": 920 + }, + { + "epoch": 0.005845265452831301, + "grad_norm": 16.65804100036621, + "learning_rate": 1.9965382261971618e-05, + "loss": 4.1905, + "step": 930 + }, + { + "epoch": 0.0059081177695284115, + "grad_norm": 8.049735069274902, + "learning_rate": 1.9964963161026965e-05, + "loss": 4.2229, + "step": 940 + }, + { + "epoch": 0.005970970086225522, + "grad_norm": 9.003173828125, + "learning_rate": 1.9964544060082312e-05, + "loss": 3.8822, + "step": 950 + }, + { + "epoch": 0.006033822402922633, + "grad_norm": 10.698800086975098, + "learning_rate": 1.996412495913766e-05, + "loss": 4.1255, + "step": 960 + }, + { + "epoch": 0.006096674719619744, + "grad_norm": 8.154327392578125, + "learning_rate": 1.9963705858193006e-05, + "loss": 4.1607, + "step": 970 + }, + { + "epoch": 0.006159527036316855, + "grad_norm": 8.608023643493652, + "learning_rate": 1.9963286757248354e-05, + "loss": 3.9839, + "step": 980 + }, + { + "epoch": 0.006222379353013965, + "grad_norm": 9.532076835632324, + "learning_rate": 1.99628676563037e-05, + "loss": 4.1282, + "step": 990 + }, + { + "epoch": 0.006285231669711076, + "grad_norm": 7.2183637619018555, + "learning_rate": 1.9962448555359048e-05, + "loss": 4.0516, + "step": 1000 + }, + { + "epoch": 0.006348083986408187, + "grad_norm": 8.70899486541748, + "learning_rate": 1.9962071364508856e-05, + "loss": 3.94, + "step": 1010 + }, + { + "epoch": 0.006410936303105298, + "grad_norm": 7.688412666320801, + "learning_rate": 1.9961652263564203e-05, + "loss": 4.2123, + "step": 1020 + }, + { + "epoch": 0.006473788619802408, + "grad_norm": 11.714567184448242, + "learning_rate": 1.996123316261955e-05, + "loss": 3.9293, + "step": 1030 + }, + { + "epoch": 
0.006536640936499518, + "grad_norm": 10.026140213012695, + "learning_rate": 1.9960814061674897e-05, + "loss": 3.9482, + "step": 1040 + }, + { + "epoch": 0.006599493253196629, + "grad_norm": 7.529440879821777, + "learning_rate": 1.9960394960730244e-05, + "loss": 4.0032, + "step": 1050 + }, + { + "epoch": 0.00666234556989374, + "grad_norm": 9.811161041259766, + "learning_rate": 1.995997585978559e-05, + "loss": 4.0338, + "step": 1060 + }, + { + "epoch": 0.006725197886590851, + "grad_norm": 6.567491054534912, + "learning_rate": 1.9959556758840938e-05, + "loss": 4.0207, + "step": 1070 + }, + { + "epoch": 0.0067880502032879615, + "grad_norm": 8.35252571105957, + "learning_rate": 1.9959137657896282e-05, + "loss": 3.9025, + "step": 1080 + }, + { + "epoch": 0.006850902519985072, + "grad_norm": 10.21352767944336, + "learning_rate": 1.995871855695163e-05, + "loss": 4.0365, + "step": 1090 + }, + { + "epoch": 0.006913754836682183, + "grad_norm": 7.785412311553955, + "learning_rate": 1.9958299456006976e-05, + "loss": 3.7487, + "step": 1100 + }, + { + "epoch": 0.006976607153379294, + "grad_norm": 9.56949520111084, + "learning_rate": 1.9957880355062323e-05, + "loss": 4.1561, + "step": 1110 + }, + { + "epoch": 0.0070394594700764045, + "grad_norm": 9.330686569213867, + "learning_rate": 1.995746125411767e-05, + "loss": 3.6948, + "step": 1120 + }, + { + "epoch": 0.007102311786773515, + "grad_norm": 7.871035099029541, + "learning_rate": 1.9957042153173014e-05, + "loss": 3.8271, + "step": 1130 + }, + { + "epoch": 0.007165164103470626, + "grad_norm": 8.37169361114502, + "learning_rate": 1.995662305222836e-05, + "loss": 3.9274, + "step": 1140 + }, + { + "epoch": 0.007228016420167737, + "grad_norm": 7.6004319190979, + "learning_rate": 1.9956203951283708e-05, + "loss": 3.7638, + "step": 1150 + }, + { + "epoch": 0.007290868736864848, + "grad_norm": 8.623445510864258, + "learning_rate": 1.9955784850339055e-05, + "loss": 4.1957, + "step": 1160 + }, + { + "epoch": 0.007353721053561958, + 
"grad_norm": 7.707347869873047, + "learning_rate": 1.99553657493944e-05, + "loss": 3.9143, + "step": 1170 + }, + { + "epoch": 0.007416573370259069, + "grad_norm": 8.317936897277832, + "learning_rate": 1.9954946648449746e-05, + "loss": 3.8486, + "step": 1180 + }, + { + "epoch": 0.00747942568695618, + "grad_norm": 8.231162071228027, + "learning_rate": 1.9954527547505093e-05, + "loss": 3.999, + "step": 1190 + }, + { + "epoch": 0.007542278003653291, + "grad_norm": 24.25721549987793, + "learning_rate": 1.995410844656044e-05, + "loss": 3.956, + "step": 1200 + }, + { + "epoch": 0.0076051303203504015, + "grad_norm": 7.126772403717041, + "learning_rate": 1.9953689345615787e-05, + "loss": 3.7612, + "step": 1210 + }, + { + "epoch": 0.007667982637047512, + "grad_norm": 7.953426837921143, + "learning_rate": 1.9953270244671134e-05, + "loss": 3.8871, + "step": 1220 + }, + { + "epoch": 0.007730834953744623, + "grad_norm": 9.395339965820312, + "learning_rate": 1.995285114372648e-05, + "loss": 3.9739, + "step": 1230 + }, + { + "epoch": 0.007793687270441734, + "grad_norm": 9.223485946655273, + "learning_rate": 1.9952432042781825e-05, + "loss": 3.8702, + "step": 1240 + }, + { + "epoch": 0.007856539587138845, + "grad_norm": 8.946688652038574, + "learning_rate": 1.9952012941837172e-05, + "loss": 3.7169, + "step": 1250 + }, + { + "epoch": 0.007919391903835955, + "grad_norm": 9.54202938079834, + "learning_rate": 1.995159384089252e-05, + "loss": 4.0095, + "step": 1260 + }, + { + "epoch": 0.007982244220533067, + "grad_norm": 8.55982780456543, + "learning_rate": 1.9951174739947866e-05, + "loss": 4.0312, + "step": 1270 + }, + { + "epoch": 0.008045096537230177, + "grad_norm": 9.204590797424316, + "learning_rate": 1.9950755639003213e-05, + "loss": 3.9259, + "step": 1280 + }, + { + "epoch": 0.008107948853927288, + "grad_norm": 8.297924995422363, + "learning_rate": 1.995033653805856e-05, + "loss": 3.9343, + "step": 1290 + }, + { + "epoch": 0.008170801170624398, + "grad_norm": 8.083635330200195, + 
"learning_rate": 1.9949917437113907e-05, + "loss": 3.8386, + "step": 1300 + }, + { + "epoch": 0.00823365348732151, + "grad_norm": 8.369839668273926, + "learning_rate": 1.994949833616925e-05, + "loss": 3.8577, + "step": 1310 + }, + { + "epoch": 0.00829650580401862, + "grad_norm": 10.199877738952637, + "learning_rate": 1.9949079235224598e-05, + "loss": 3.8577, + "step": 1320 + }, + { + "epoch": 0.008359358120715732, + "grad_norm": 8.548699378967285, + "learning_rate": 1.9948660134279945e-05, + "loss": 3.8987, + "step": 1330 + }, + { + "epoch": 0.008422210437412841, + "grad_norm": 9.549871444702148, + "learning_rate": 1.9948241033335292e-05, + "loss": 3.7709, + "step": 1340 + }, + { + "epoch": 0.008485062754109953, + "grad_norm": 8.54775619506836, + "learning_rate": 1.9947821932390636e-05, + "loss": 3.7397, + "step": 1350 + }, + { + "epoch": 0.008547915070807063, + "grad_norm": 9.24419116973877, + "learning_rate": 1.9947402831445983e-05, + "loss": 3.8185, + "step": 1360 + }, + { + "epoch": 0.008610767387504173, + "grad_norm": 8.104684829711914, + "learning_rate": 1.994698373050133e-05, + "loss": 3.8048, + "step": 1370 + }, + { + "epoch": 0.008673619704201285, + "grad_norm": 8.316015243530273, + "learning_rate": 1.9946564629556677e-05, + "loss": 3.7468, + "step": 1380 + }, + { + "epoch": 0.008736472020898394, + "grad_norm": 8.843308448791504, + "learning_rate": 1.994614552861202e-05, + "loss": 3.9003, + "step": 1390 + }, + { + "epoch": 0.008799324337595506, + "grad_norm": 8.556550025939941, + "learning_rate": 1.9945726427667368e-05, + "loss": 3.7636, + "step": 1400 + }, + { + "epoch": 0.008862176654292616, + "grad_norm": 8.34255599975586, + "learning_rate": 1.9945307326722715e-05, + "loss": 3.8535, + "step": 1410 + }, + { + "epoch": 0.008925028970989728, + "grad_norm": 7.886868953704834, + "learning_rate": 1.9944888225778062e-05, + "loss": 3.9768, + "step": 1420 + }, + { + "epoch": 0.008987881287686838, + "grad_norm": 7.469964027404785, + "learning_rate": 
1.994446912483341e-05, + "loss": 3.9373, + "step": 1430 + }, + { + "epoch": 0.00905073360438395, + "grad_norm": 9.098652839660645, + "learning_rate": 1.9944050023888756e-05, + "loss": 3.7668, + "step": 1440 + }, + { + "epoch": 0.009113585921081059, + "grad_norm": 8.704818725585938, + "learning_rate": 1.9943630922944103e-05, + "loss": 3.9797, + "step": 1450 + }, + { + "epoch": 0.00917643823777817, + "grad_norm": 9.686690330505371, + "learning_rate": 1.994321182199945e-05, + "loss": 3.8918, + "step": 1460 + }, + { + "epoch": 0.00923929055447528, + "grad_norm": 8.338467597961426, + "learning_rate": 1.9942792721054797e-05, + "loss": 3.9657, + "step": 1470 + }, + { + "epoch": 0.009302142871172392, + "grad_norm": 8.457193374633789, + "learning_rate": 1.994237362011014e-05, + "loss": 3.7973, + "step": 1480 + }, + { + "epoch": 0.009364995187869502, + "grad_norm": 9.84638500213623, + "learning_rate": 1.9941954519165488e-05, + "loss": 3.7545, + "step": 1490 + }, + { + "epoch": 0.009427847504566614, + "grad_norm": 7.546156406402588, + "learning_rate": 1.9941535418220835e-05, + "loss": 3.9961, + "step": 1500 + }, + { + "epoch": 0.009490699821263724, + "grad_norm": 10.12843132019043, + "learning_rate": 1.9941116317276182e-05, + "loss": 3.9926, + "step": 1510 + }, + { + "epoch": 0.009553552137960835, + "grad_norm": 9.499560356140137, + "learning_rate": 1.994069721633153e-05, + "loss": 3.7154, + "step": 1520 + }, + { + "epoch": 0.009616404454657945, + "grad_norm": 7.583631992340088, + "learning_rate": 1.9940278115386873e-05, + "loss": 3.605, + "step": 1530 + }, + { + "epoch": 0.009679256771355057, + "grad_norm": 7.655325889587402, + "learning_rate": 1.993985901444222e-05, + "loss": 3.8484, + "step": 1540 + }, + { + "epoch": 0.009742109088052167, + "grad_norm": 6.684481143951416, + "learning_rate": 1.9939439913497567e-05, + "loss": 3.8044, + "step": 1550 + }, + { + "epoch": 0.009804961404749278, + "grad_norm": 8.056528091430664, + "learning_rate": 1.9939020812552914e-05, + "loss": 
3.7472, + "step": 1560 + }, + { + "epoch": 0.009867813721446388, + "grad_norm": 9.075079917907715, + "learning_rate": 1.9938601711608258e-05, + "loss": 3.6593, + "step": 1570 + }, + { + "epoch": 0.0099306660381435, + "grad_norm": 7.955049514770508, + "learning_rate": 1.9938182610663605e-05, + "loss": 3.6238, + "step": 1580 + }, + { + "epoch": 0.00999351835484061, + "grad_norm": 9.027932167053223, + "learning_rate": 1.9937763509718952e-05, + "loss": 3.8524, + "step": 1590 + }, + { + "epoch": 0.010056370671537722, + "grad_norm": 9.155461311340332, + "learning_rate": 1.99373444087743e-05, + "loss": 3.7894, + "step": 1600 + }, + { + "epoch": 0.010119222988234831, + "grad_norm": 8.733969688415527, + "learning_rate": 1.9936925307829646e-05, + "loss": 3.758, + "step": 1610 + }, + { + "epoch": 0.010182075304931943, + "grad_norm": 8.7479248046875, + "learning_rate": 1.993650620688499e-05, + "loss": 3.7762, + "step": 1620 + }, + { + "epoch": 0.010244927621629053, + "grad_norm": 7.365170001983643, + "learning_rate": 1.9936087105940337e-05, + "loss": 3.7251, + "step": 1630 + }, + { + "epoch": 0.010307779938326165, + "grad_norm": 8.145270347595215, + "learning_rate": 1.9935668004995684e-05, + "loss": 3.6831, + "step": 1640 + }, + { + "epoch": 0.010370632255023274, + "grad_norm": 8.109509468078613, + "learning_rate": 1.993524890405103e-05, + "loss": 3.5559, + "step": 1650 + }, + { + "epoch": 0.010433484571720386, + "grad_norm": 8.524164199829102, + "learning_rate": 1.9934829803106378e-05, + "loss": 3.7954, + "step": 1660 + }, + { + "epoch": 0.010496336888417496, + "grad_norm": 9.258058547973633, + "learning_rate": 1.9934410702161725e-05, + "loss": 3.774, + "step": 1670 + }, + { + "epoch": 0.010559189205114608, + "grad_norm": 9.292672157287598, + "learning_rate": 1.9933991601217072e-05, + "loss": 3.5493, + "step": 1680 + }, + { + "epoch": 0.010622041521811718, + "grad_norm": 15.687127113342285, + "learning_rate": 1.993357250027242e-05, + "loss": 3.8524, + "step": 1690 + }, + { + 
"epoch": 0.01068489383850883, + "grad_norm": 7.195960521697998, + "learning_rate": 1.9933153399327763e-05, + "loss": 3.716, + "step": 1700 + }, + { + "epoch": 0.010747746155205939, + "grad_norm": 7.604008674621582, + "learning_rate": 1.993273429838311e-05, + "loss": 3.8347, + "step": 1710 + }, + { + "epoch": 0.01081059847190305, + "grad_norm": 8.035575866699219, + "learning_rate": 1.9932315197438457e-05, + "loss": 3.7588, + "step": 1720 + }, + { + "epoch": 0.01087345078860016, + "grad_norm": 7.609543323516846, + "learning_rate": 1.9931896096493804e-05, + "loss": 3.8961, + "step": 1730 + }, + { + "epoch": 0.010936303105297272, + "grad_norm": 7.832934856414795, + "learning_rate": 1.993147699554915e-05, + "loss": 3.6334, + "step": 1740 + }, + { + "epoch": 0.010999155421994382, + "grad_norm": 8.620970726013184, + "learning_rate": 1.9931057894604495e-05, + "loss": 3.6378, + "step": 1750 + }, + { + "epoch": 0.011062007738691494, + "grad_norm": 7.927221298217773, + "learning_rate": 1.9930638793659842e-05, + "loss": 3.6141, + "step": 1760 + }, + { + "epoch": 0.011124860055388604, + "grad_norm": 11.173967361450195, + "learning_rate": 1.993021969271519e-05, + "loss": 3.8173, + "step": 1770 + }, + { + "epoch": 0.011187712372085715, + "grad_norm": 7.172491550445557, + "learning_rate": 1.9929800591770536e-05, + "loss": 3.725, + "step": 1780 + }, + { + "epoch": 0.011250564688782825, + "grad_norm": 7.763185977935791, + "learning_rate": 1.992938149082588e-05, + "loss": 3.6435, + "step": 1790 + }, + { + "epoch": 0.011313417005479937, + "grad_norm": 7.742452621459961, + "learning_rate": 1.9928962389881227e-05, + "loss": 3.6609, + "step": 1800 + }, + { + "epoch": 0.011376269322177047, + "grad_norm": 8.668071746826172, + "learning_rate": 1.9928543288936574e-05, + "loss": 3.5743, + "step": 1810 + }, + { + "epoch": 0.011439121638874158, + "grad_norm": 7.531178951263428, + "learning_rate": 1.992812418799192e-05, + "loss": 3.6773, + "step": 1820 + }, + { + "epoch": 0.011501973955571268, + 
"grad_norm": 8.770427703857422, + "learning_rate": 1.9927705087047268e-05, + "loss": 3.6403, + "step": 1830 + }, + { + "epoch": 0.01156482627226838, + "grad_norm": 7.917853832244873, + "learning_rate": 1.9927285986102615e-05, + "loss": 3.8062, + "step": 1840 + }, + { + "epoch": 0.01162767858896549, + "grad_norm": 7.949713706970215, + "learning_rate": 1.9926866885157962e-05, + "loss": 3.7471, + "step": 1850 + }, + { + "epoch": 0.011690530905662602, + "grad_norm": 8.924261093139648, + "learning_rate": 1.992644778421331e-05, + "loss": 3.559, + "step": 1860 + }, + { + "epoch": 0.011753383222359711, + "grad_norm": 9.18735122680664, + "learning_rate": 1.9926028683268653e-05, + "loss": 3.7138, + "step": 1870 + }, + { + "epoch": 0.011816235539056823, + "grad_norm": 7.951308727264404, + "learning_rate": 1.9925609582324e-05, + "loss": 3.6578, + "step": 1880 + }, + { + "epoch": 0.011879087855753933, + "grad_norm": 8.34437370300293, + "learning_rate": 1.9925190481379347e-05, + "loss": 3.7569, + "step": 1890 + }, + { + "epoch": 0.011941940172451045, + "grad_norm": 10.448899269104004, + "learning_rate": 1.9924771380434694e-05, + "loss": 3.8515, + "step": 1900 + }, + { + "epoch": 0.012004792489148155, + "grad_norm": 8.515726089477539, + "learning_rate": 1.992435227949004e-05, + "loss": 3.5718, + "step": 1910 + }, + { + "epoch": 0.012067644805845266, + "grad_norm": 8.738914489746094, + "learning_rate": 1.9923933178545388e-05, + "loss": 3.8008, + "step": 1920 + }, + { + "epoch": 0.012130497122542376, + "grad_norm": 7.068699836730957, + "learning_rate": 1.9923514077600732e-05, + "loss": 3.5684, + "step": 1930 + }, + { + "epoch": 0.012193349439239488, + "grad_norm": 7.559966087341309, + "learning_rate": 1.992309497665608e-05, + "loss": 3.752, + "step": 1940 + }, + { + "epoch": 0.012256201755936598, + "grad_norm": 8.261521339416504, + "learning_rate": 1.9922675875711426e-05, + "loss": 3.5821, + "step": 1950 + }, + { + "epoch": 0.01231905407263371, + "grad_norm": 7.729598522186279, + 
"learning_rate": 1.9922256774766773e-05, + "loss": 3.6635, + "step": 1960 + }, + { + "epoch": 0.01238190638933082, + "grad_norm": 7.8209733963012695, + "learning_rate": 1.9921837673822117e-05, + "loss": 3.746, + "step": 1970 + }, + { + "epoch": 0.01244475870602793, + "grad_norm": 7.871613502502441, + "learning_rate": 1.9921418572877464e-05, + "loss": 3.4679, + "step": 1980 + }, + { + "epoch": 0.01250761102272504, + "grad_norm": 7.5880913734436035, + "learning_rate": 1.992099947193281e-05, + "loss": 3.6665, + "step": 1990 + }, + { + "epoch": 0.012570463339422152, + "grad_norm": 9.116113662719727, + "learning_rate": 1.9920580370988158e-05, + "loss": 3.7091, + "step": 2000 + }, + { + "epoch": 0.012633315656119262, + "grad_norm": 8.949613571166992, + "learning_rate": 1.9920161270043505e-05, + "loss": 3.659, + "step": 2010 + }, + { + "epoch": 0.012696167972816374, + "grad_norm": 9.201729774475098, + "learning_rate": 1.991974216909885e-05, + "loss": 3.8791, + "step": 2020 + }, + { + "epoch": 0.012759020289513484, + "grad_norm": 7.439731121063232, + "learning_rate": 1.9919323068154196e-05, + "loss": 3.7566, + "step": 2030 + }, + { + "epoch": 0.012821872606210595, + "grad_norm": 8.156105041503906, + "learning_rate": 1.9918903967209543e-05, + "loss": 3.7228, + "step": 2040 + }, + { + "epoch": 0.012884724922907705, + "grad_norm": 9.137293815612793, + "learning_rate": 1.991848486626489e-05, + "loss": 3.6265, + "step": 2050 + }, + { + "epoch": 0.012947577239604815, + "grad_norm": 7.509158134460449, + "learning_rate": 1.9918065765320237e-05, + "loss": 3.7887, + "step": 2060 + }, + { + "epoch": 0.013010429556301927, + "grad_norm": 7.798402309417725, + "learning_rate": 1.9917646664375584e-05, + "loss": 3.6766, + "step": 2070 + }, + { + "epoch": 0.013073281872999037, + "grad_norm": 7.323843002319336, + "learning_rate": 1.991722756343093e-05, + "loss": 3.644, + "step": 2080 + }, + { + "epoch": 0.013136134189696148, + "grad_norm": 8.679986000061035, + "learning_rate": 
1.9916808462486278e-05, + "loss": 3.6055, + "step": 2090 + }, + { + "epoch": 0.013198986506393258, + "grad_norm": 9.180718421936035, + "learning_rate": 1.9916389361541622e-05, + "loss": 3.873, + "step": 2100 + }, + { + "epoch": 0.01326183882309037, + "grad_norm": 7.690215587615967, + "learning_rate": 1.991597026059697e-05, + "loss": 3.6968, + "step": 2110 + }, + { + "epoch": 0.01332469113978748, + "grad_norm": 7.911571025848389, + "learning_rate": 1.9915551159652316e-05, + "loss": 3.8175, + "step": 2120 + }, + { + "epoch": 0.013387543456484591, + "grad_norm": 8.48082447052002, + "learning_rate": 1.9915132058707663e-05, + "loss": 3.6924, + "step": 2130 + }, + { + "epoch": 0.013450395773181701, + "grad_norm": 7.75557279586792, + "learning_rate": 1.991471295776301e-05, + "loss": 3.6379, + "step": 2140 + }, + { + "epoch": 0.013513248089878813, + "grad_norm": 7.316931247711182, + "learning_rate": 1.9914293856818354e-05, + "loss": 3.6726, + "step": 2150 + }, + { + "epoch": 0.013576100406575923, + "grad_norm": 8.54746150970459, + "learning_rate": 1.99138747558737e-05, + "loss": 3.6168, + "step": 2160 + }, + { + "epoch": 0.013638952723273035, + "grad_norm": 7.163753032684326, + "learning_rate": 1.9913455654929048e-05, + "loss": 3.4964, + "step": 2170 + }, + { + "epoch": 0.013701805039970144, + "grad_norm": 6.571826457977295, + "learning_rate": 1.9913036553984395e-05, + "loss": 3.7582, + "step": 2180 + }, + { + "epoch": 0.013764657356667256, + "grad_norm": 8.665351867675781, + "learning_rate": 1.991261745303974e-05, + "loss": 3.5776, + "step": 2190 + }, + { + "epoch": 0.013827509673364366, + "grad_norm": 7.472128391265869, + "learning_rate": 1.9912198352095086e-05, + "loss": 3.563, + "step": 2200 + }, + { + "epoch": 0.013890361990061478, + "grad_norm": 8.150984764099121, + "learning_rate": 1.9911779251150433e-05, + "loss": 3.4579, + "step": 2210 + }, + { + "epoch": 0.013953214306758588, + "grad_norm": 7.981696128845215, + "learning_rate": 1.991136015020578e-05, + "loss": 
3.3768, + "step": 2220 + }, + { + "epoch": 0.0140160666234557, + "grad_norm": 8.98642635345459, + "learning_rate": 1.9910941049261127e-05, + "loss": 3.6364, + "step": 2230 + }, + { + "epoch": 0.014078918940152809, + "grad_norm": 7.533083438873291, + "learning_rate": 1.991052194831647e-05, + "loss": 3.714, + "step": 2240 + }, + { + "epoch": 0.01414177125684992, + "grad_norm": 7.571574687957764, + "learning_rate": 1.9910102847371818e-05, + "loss": 3.5226, + "step": 2250 + }, + { + "epoch": 0.01420462357354703, + "grad_norm": 7.607986927032471, + "learning_rate": 1.9909683746427165e-05, + "loss": 3.5229, + "step": 2260 + }, + { + "epoch": 0.014267475890244142, + "grad_norm": 8.505463600158691, + "learning_rate": 1.9909264645482512e-05, + "loss": 3.5028, + "step": 2270 + }, + { + "epoch": 0.014330328206941252, + "grad_norm": 8.641060829162598, + "learning_rate": 1.990884554453786e-05, + "loss": 3.6277, + "step": 2280 + }, + { + "epoch": 0.014393180523638364, + "grad_norm": 7.744467258453369, + "learning_rate": 1.9908426443593206e-05, + "loss": 3.5696, + "step": 2290 + }, + { + "epoch": 0.014456032840335474, + "grad_norm": 7.258744716644287, + "learning_rate": 1.9908007342648553e-05, + "loss": 3.7013, + "step": 2300 + }, + { + "epoch": 0.014518885157032585, + "grad_norm": 7.771141052246094, + "learning_rate": 1.99075882417039e-05, + "loss": 3.5281, + "step": 2310 + }, + { + "epoch": 0.014581737473729695, + "grad_norm": 9.042888641357422, + "learning_rate": 1.9907169140759247e-05, + "loss": 3.365, + "step": 2320 + }, + { + "epoch": 0.014644589790426807, + "grad_norm": 8.447803497314453, + "learning_rate": 1.990675003981459e-05, + "loss": 3.6213, + "step": 2330 + }, + { + "epoch": 0.014707442107123917, + "grad_norm": 7.852933406829834, + "learning_rate": 1.9906330938869938e-05, + "loss": 3.4386, + "step": 2340 + }, + { + "epoch": 0.014770294423821028, + "grad_norm": 6.793748378753662, + "learning_rate": 1.9905911837925285e-05, + "loss": 3.5708, + "step": 2350 + }, + { + 
"epoch": 0.014833146740518138, + "grad_norm": 9.815299987792969, + "learning_rate": 1.9905492736980632e-05, + "loss": 3.4692, + "step": 2360 + }, + { + "epoch": 0.01489599905721525, + "grad_norm": 8.411046981811523, + "learning_rate": 1.9905073636035976e-05, + "loss": 3.4869, + "step": 2370 + }, + { + "epoch": 0.01495885137391236, + "grad_norm": 8.23367977142334, + "learning_rate": 1.9904654535091323e-05, + "loss": 3.4829, + "step": 2380 + }, + { + "epoch": 0.015021703690609471, + "grad_norm": 7.69027042388916, + "learning_rate": 1.990423543414667e-05, + "loss": 3.6509, + "step": 2390 + }, + { + "epoch": 0.015084556007306581, + "grad_norm": 7.956418514251709, + "learning_rate": 1.9903816333202017e-05, + "loss": 3.4739, + "step": 2400 + }, + { + "epoch": 0.015147408324003693, + "grad_norm": 8.069219589233398, + "learning_rate": 1.990339723225736e-05, + "loss": 3.345, + "step": 2410 + }, + { + "epoch": 0.015210260640700803, + "grad_norm": 7.235134124755859, + "learning_rate": 1.9902978131312708e-05, + "loss": 3.4255, + "step": 2420 + }, + { + "epoch": 0.015273112957397915, + "grad_norm": 7.432069301605225, + "learning_rate": 1.9902559030368055e-05, + "loss": 3.4666, + "step": 2430 + }, + { + "epoch": 0.015335965274095024, + "grad_norm": 7.206005573272705, + "learning_rate": 1.9902139929423402e-05, + "loss": 3.5934, + "step": 2440 + }, + { + "epoch": 0.015398817590792136, + "grad_norm": 9.905259132385254, + "learning_rate": 1.990172082847875e-05, + "loss": 3.6262, + "step": 2450 + }, + { + "epoch": 0.015461669907489246, + "grad_norm": 8.01630973815918, + "learning_rate": 1.9901301727534096e-05, + "loss": 3.5275, + "step": 2460 + }, + { + "epoch": 0.015524522224186358, + "grad_norm": 7.935608863830566, + "learning_rate": 1.9900882626589443e-05, + "loss": 3.5331, + "step": 2470 + }, + { + "epoch": 0.015587374540883468, + "grad_norm": 8.589484214782715, + "learning_rate": 1.990046352564479e-05, + "loss": 3.614, + "step": 2480 + }, + { + "epoch": 0.01565022685758058, + 
"grad_norm": 8.494813919067383, + "learning_rate": 1.9900044424700134e-05, + "loss": 3.4546, + "step": 2490 + }, + { + "epoch": 0.01571307917427769, + "grad_norm": 6.927222728729248, + "learning_rate": 1.989962532375548e-05, + "loss": 3.4779, + "step": 2500 + }, + { + "epoch": 0.0157759314909748, + "grad_norm": 6.986412048339844, + "learning_rate": 1.9899206222810828e-05, + "loss": 3.643, + "step": 2510 + }, + { + "epoch": 0.01583878380767191, + "grad_norm": 7.346634864807129, + "learning_rate": 1.9898787121866175e-05, + "loss": 3.547, + "step": 2520 + }, + { + "epoch": 0.015901636124369022, + "grad_norm": 8.32447624206543, + "learning_rate": 1.9898368020921522e-05, + "loss": 3.5548, + "step": 2530 + }, + { + "epoch": 0.015964488441066134, + "grad_norm": 8.137677192687988, + "learning_rate": 1.989794891997687e-05, + "loss": 3.4738, + "step": 2540 + }, + { + "epoch": 0.016027340757763242, + "grad_norm": 9.068408012390137, + "learning_rate": 1.9897529819032213e-05, + "loss": 3.3477, + "step": 2550 + }, + { + "epoch": 0.016090193074460354, + "grad_norm": 8.339146614074707, + "learning_rate": 1.989711071808756e-05, + "loss": 3.6623, + "step": 2560 + }, + { + "epoch": 0.016153045391157465, + "grad_norm": 8.223811149597168, + "learning_rate": 1.9896691617142907e-05, + "loss": 3.5537, + "step": 2570 + }, + { + "epoch": 0.016215897707854577, + "grad_norm": 9.707834243774414, + "learning_rate": 1.9896272516198254e-05, + "loss": 3.6568, + "step": 2580 + }, + { + "epoch": 0.016278750024551685, + "grad_norm": 7.555861473083496, + "learning_rate": 1.9895853415253598e-05, + "loss": 3.5347, + "step": 2590 + }, + { + "epoch": 0.016341602341248797, + "grad_norm": 9.049849510192871, + "learning_rate": 1.9895434314308945e-05, + "loss": 3.6334, + "step": 2600 + }, + { + "epoch": 0.01640445465794591, + "grad_norm": 7.760476589202881, + "learning_rate": 1.9895015213364292e-05, + "loss": 3.483, + "step": 2610 + }, + { + "epoch": 0.01646730697464302, + "grad_norm": 8.10847282409668, + 
"learning_rate": 1.989459611241964e-05, + "loss": 3.6302, + "step": 2620 + }, + { + "epoch": 0.016530159291340128, + "grad_norm": 8.256227493286133, + "learning_rate": 1.9894177011474986e-05, + "loss": 3.7809, + "step": 2630 + }, + { + "epoch": 0.01659301160803724, + "grad_norm": 7.8532915115356445, + "learning_rate": 1.989375791053033e-05, + "loss": 3.518, + "step": 2640 + }, + { + "epoch": 0.01665586392473435, + "grad_norm": 8.061572074890137, + "learning_rate": 1.9893338809585677e-05, + "loss": 3.4883, + "step": 2650 + }, + { + "epoch": 0.016718716241431463, + "grad_norm": 7.70623254776001, + "learning_rate": 1.9892919708641024e-05, + "loss": 3.573, + "step": 2660 + }, + { + "epoch": 0.01678156855812857, + "grad_norm": 7.417579174041748, + "learning_rate": 1.989250060769637e-05, + "loss": 3.4534, + "step": 2670 + }, + { + "epoch": 0.016844420874825683, + "grad_norm": 7.6423020362854, + "learning_rate": 1.9892081506751718e-05, + "loss": 3.4455, + "step": 2680 + }, + { + "epoch": 0.016907273191522795, + "grad_norm": 9.982197761535645, + "learning_rate": 1.9891662405807065e-05, + "loss": 3.4562, + "step": 2690 + }, + { + "epoch": 0.016970125508219906, + "grad_norm": 7.2399444580078125, + "learning_rate": 1.9891243304862412e-05, + "loss": 3.4619, + "step": 2700 + }, + { + "epoch": 0.017032977824917014, + "grad_norm": 7.721668243408203, + "learning_rate": 1.989082420391776e-05, + "loss": 3.334, + "step": 2710 + }, + { + "epoch": 0.017095830141614126, + "grad_norm": 7.352929592132568, + "learning_rate": 1.9890405102973103e-05, + "loss": 3.4966, + "step": 2720 + }, + { + "epoch": 0.017158682458311238, + "grad_norm": 7.685946941375732, + "learning_rate": 1.988998600202845e-05, + "loss": 3.2705, + "step": 2730 + }, + { + "epoch": 0.017221534775008346, + "grad_norm": 8.082391738891602, + "learning_rate": 1.9889566901083797e-05, + "loss": 3.3639, + "step": 2740 + }, + { + "epoch": 0.017284387091705457, + "grad_norm": 7.36880350112915, + "learning_rate": 
1.9889147800139144e-05, + "loss": 3.362, + "step": 2750 + }, + { + "epoch": 0.01734723940840257, + "grad_norm": 8.787617683410645, + "learning_rate": 1.988872869919449e-05, + "loss": 3.4222, + "step": 2760 + }, + { + "epoch": 0.01741009172509968, + "grad_norm": 8.28075122833252, + "learning_rate": 1.9888309598249835e-05, + "loss": 3.5957, + "step": 2770 + }, + { + "epoch": 0.01747294404179679, + "grad_norm": 9.33713436126709, + "learning_rate": 1.9887890497305182e-05, + "loss": 3.4035, + "step": 2780 + }, + { + "epoch": 0.0175357963584939, + "grad_norm": 8.47854995727539, + "learning_rate": 1.988747139636053e-05, + "loss": 3.4141, + "step": 2790 + }, + { + "epoch": 0.017598648675191012, + "grad_norm": 6.677530288696289, + "learning_rate": 1.9887052295415876e-05, + "loss": 3.389, + "step": 2800 + }, + { + "epoch": 0.017661500991888124, + "grad_norm": 8.675537109375, + "learning_rate": 1.988663319447122e-05, + "loss": 3.3471, + "step": 2810 + }, + { + "epoch": 0.017724353308585232, + "grad_norm": 9.454487800598145, + "learning_rate": 1.9886214093526567e-05, + "loss": 3.6361, + "step": 2820 + }, + { + "epoch": 0.017787205625282344, + "grad_norm": 7.696407318115234, + "learning_rate": 1.9885794992581914e-05, + "loss": 3.3272, + "step": 2830 + }, + { + "epoch": 0.017850057941979455, + "grad_norm": 8.311444282531738, + "learning_rate": 1.988537589163726e-05, + "loss": 3.2998, + "step": 2840 + }, + { + "epoch": 0.017912910258676567, + "grad_norm": 7.864684104919434, + "learning_rate": 1.9884956790692608e-05, + "loss": 3.5902, + "step": 2850 + }, + { + "epoch": 0.017975762575373675, + "grad_norm": 7.836400985717773, + "learning_rate": 1.9884537689747955e-05, + "loss": 3.4457, + "step": 2860 + }, + { + "epoch": 0.018038614892070787, + "grad_norm": 7.825150966644287, + "learning_rate": 1.98841185888033e-05, + "loss": 3.6025, + "step": 2870 + }, + { + "epoch": 0.0181014672087679, + "grad_norm": 8.043344497680664, + "learning_rate": 1.9883699487858646e-05, + "loss": 3.3072, + 
"step": 2880 + }, + { + "epoch": 0.01816431952546501, + "grad_norm": 8.412161827087402, + "learning_rate": 1.9883280386913993e-05, + "loss": 3.4438, + "step": 2890 + }, + { + "epoch": 0.018227171842162118, + "grad_norm": 8.524395942687988, + "learning_rate": 1.988286128596934e-05, + "loss": 3.2877, + "step": 2900 + }, + { + "epoch": 0.01829002415885923, + "grad_norm": 8.123482704162598, + "learning_rate": 1.9882442185024687e-05, + "loss": 3.2552, + "step": 2910 + }, + { + "epoch": 0.01835287647555634, + "grad_norm": 8.951177597045898, + "learning_rate": 1.9882023084080034e-05, + "loss": 3.5767, + "step": 2920 + }, + { + "epoch": 0.018415728792253453, + "grad_norm": 7.002005100250244, + "learning_rate": 1.988160398313538e-05, + "loss": 3.4357, + "step": 2930 + }, + { + "epoch": 0.01847858110895056, + "grad_norm": 8.552858352661133, + "learning_rate": 1.988118488219073e-05, + "loss": 3.5486, + "step": 2940 + }, + { + "epoch": 0.018541433425647673, + "grad_norm": 8.0409517288208, + "learning_rate": 1.9880765781246072e-05, + "loss": 3.4463, + "step": 2950 + }, + { + "epoch": 0.018604285742344785, + "grad_norm": 8.174979209899902, + "learning_rate": 1.988034668030142e-05, + "loss": 3.3729, + "step": 2960 + }, + { + "epoch": 0.018667138059041896, + "grad_norm": 8.62437915802002, + "learning_rate": 1.9879927579356766e-05, + "loss": 3.5334, + "step": 2970 + }, + { + "epoch": 0.018729990375739004, + "grad_norm": 7.793143272399902, + "learning_rate": 1.9879508478412113e-05, + "loss": 3.3313, + "step": 2980 + }, + { + "epoch": 0.018792842692436116, + "grad_norm": 7.714635372161865, + "learning_rate": 1.9879089377467457e-05, + "loss": 3.573, + "step": 2990 + }, + { + "epoch": 0.018855695009133228, + "grad_norm": 8.211463928222656, + "learning_rate": 1.9878670276522804e-05, + "loss": 3.4991, + "step": 3000 + }, + { + "epoch": 0.01891854732583034, + "grad_norm": 7.586797714233398, + "learning_rate": 1.987825117557815e-05, + "loss": 3.5643, + "step": 3010 + }, + { + "epoch": 
0.018981399642527447, + "grad_norm": 7.573616027832031, + "learning_rate": 1.9877832074633498e-05, + "loss": 3.6384, + "step": 3020 + }, + { + "epoch": 0.01904425195922456, + "grad_norm": 7.321767807006836, + "learning_rate": 1.9877412973688842e-05, + "loss": 3.3316, + "step": 3030 + }, + { + "epoch": 0.01910710427592167, + "grad_norm": 8.411580085754395, + "learning_rate": 1.987699387274419e-05, + "loss": 3.2568, + "step": 3040 + }, + { + "epoch": 0.019169956592618782, + "grad_norm": 9.95434284210205, + "learning_rate": 1.9876574771799536e-05, + "loss": 3.3447, + "step": 3050 + }, + { + "epoch": 0.01923280890931589, + "grad_norm": 7.271300315856934, + "learning_rate": 1.9876155670854883e-05, + "loss": 3.1763, + "step": 3060 + }, + { + "epoch": 0.019295661226013002, + "grad_norm": 6.9696502685546875, + "learning_rate": 1.987573656991023e-05, + "loss": 3.4721, + "step": 3070 + }, + { + "epoch": 0.019358513542710114, + "grad_norm": 11.968955039978027, + "learning_rate": 1.9875317468965577e-05, + "loss": 3.4949, + "step": 3080 + }, + { + "epoch": 0.019421365859407225, + "grad_norm": 7.978536605834961, + "learning_rate": 1.9874898368020924e-05, + "loss": 3.484, + "step": 3090 + }, + { + "epoch": 0.019484218176104334, + "grad_norm": 8.469392776489258, + "learning_rate": 1.987447926707627e-05, + "loss": 3.4285, + "step": 3100 + }, + { + "epoch": 0.019547070492801445, + "grad_norm": 8.721917152404785, + "learning_rate": 1.987406016613162e-05, + "loss": 3.5232, + "step": 3110 + }, + { + "epoch": 0.019609922809498557, + "grad_norm": 7.684720516204834, + "learning_rate": 1.9873641065186962e-05, + "loss": 3.4044, + "step": 3120 + }, + { + "epoch": 0.01967277512619567, + "grad_norm": 9.374492645263672, + "learning_rate": 1.987322196424231e-05, + "loss": 3.3665, + "step": 3130 + }, + { + "epoch": 0.019735627442892777, + "grad_norm": 8.347115516662598, + "learning_rate": 1.9872802863297656e-05, + "loss": 3.3971, + "step": 3140 + }, + { + "epoch": 0.01979847975958989, + 
"grad_norm": 9.327836036682129, + "learning_rate": 1.9872383762353003e-05, + "loss": 3.2726, + "step": 3150 + }, + { + "epoch": 0.019861332076287, + "grad_norm": 9.030014038085938, + "learning_rate": 1.987196466140835e-05, + "loss": 3.3135, + "step": 3160 + }, + { + "epoch": 0.01992418439298411, + "grad_norm": 8.689590454101562, + "learning_rate": 1.9871545560463694e-05, + "loss": 3.2192, + "step": 3170 + }, + { + "epoch": 0.01998703670968122, + "grad_norm": 7.588329792022705, + "learning_rate": 1.987112645951904e-05, + "loss": 3.4882, + "step": 3180 + }, + { + "epoch": 0.02004988902637833, + "grad_norm": 7.4032392501831055, + "learning_rate": 1.9870707358574388e-05, + "loss": 3.4922, + "step": 3190 + }, + { + "epoch": 0.020112741343075443, + "grad_norm": 7.394667625427246, + "learning_rate": 1.9870288257629735e-05, + "loss": 3.3366, + "step": 3200 + }, + { + "epoch": 0.020175593659772555, + "grad_norm": 8.536340713500977, + "learning_rate": 1.986986915668508e-05, + "loss": 3.5691, + "step": 3210 + }, + { + "epoch": 0.020238445976469663, + "grad_norm": 7.568734169006348, + "learning_rate": 1.9869450055740426e-05, + "loss": 3.2338, + "step": 3220 + }, + { + "epoch": 0.020301298293166774, + "grad_norm": 8.91421127319336, + "learning_rate": 1.9869030954795773e-05, + "loss": 3.3526, + "step": 3230 + }, + { + "epoch": 0.020364150609863886, + "grad_norm": 8.39038372039795, + "learning_rate": 1.986861185385112e-05, + "loss": 3.3413, + "step": 3240 + }, + { + "epoch": 0.020427002926560998, + "grad_norm": 11.154706954956055, + "learning_rate": 1.9868192752906467e-05, + "loss": 3.3824, + "step": 3250 + }, + { + "epoch": 0.020489855243258106, + "grad_norm": 7.8558220863342285, + "learning_rate": 1.986777365196181e-05, + "loss": 3.3635, + "step": 3260 + }, + { + "epoch": 0.020552707559955218, + "grad_norm": 10.399998664855957, + "learning_rate": 1.9867354551017158e-05, + "loss": 3.4351, + "step": 3270 + }, + { + "epoch": 0.02061555987665233, + "grad_norm": 9.94041919708252, + 
"learning_rate": 1.9866935450072505e-05, + "loss": 3.4726, + "step": 3280 + }, + { + "epoch": 0.02067841219334944, + "grad_norm": 7.788326740264893, + "learning_rate": 1.9866516349127852e-05, + "loss": 3.4651, + "step": 3290 + }, + { + "epoch": 0.02074126451004655, + "grad_norm": 6.855205059051514, + "learning_rate": 1.98660972481832e-05, + "loss": 3.0607, + "step": 3300 + }, + { + "epoch": 0.02080411682674366, + "grad_norm": 7.5319671630859375, + "learning_rate": 1.9865678147238546e-05, + "loss": 3.4578, + "step": 3310 + }, + { + "epoch": 0.020866969143440772, + "grad_norm": 8.263099670410156, + "learning_rate": 1.9865259046293893e-05, + "loss": 3.4431, + "step": 3320 + }, + { + "epoch": 0.020929821460137884, + "grad_norm": 7.396290302276611, + "learning_rate": 1.986483994534924e-05, + "loss": 3.3876, + "step": 3330 + }, + { + "epoch": 0.020992673776834992, + "grad_norm": 8.584406852722168, + "learning_rate": 1.9864420844404584e-05, + "loss": 3.088, + "step": 3340 + }, + { + "epoch": 0.021055526093532104, + "grad_norm": 8.801658630371094, + "learning_rate": 1.986400174345993e-05, + "loss": 3.147, + "step": 3350 + }, + { + "epoch": 0.021118378410229215, + "grad_norm": 8.548176765441895, + "learning_rate": 1.9863582642515278e-05, + "loss": 3.2649, + "step": 3360 + }, + { + "epoch": 0.021181230726926327, + "grad_norm": 7.954073905944824, + "learning_rate": 1.9863163541570625e-05, + "loss": 3.2076, + "step": 3370 + }, + { + "epoch": 0.021244083043623435, + "grad_norm": 8.710152626037598, + "learning_rate": 1.9862744440625972e-05, + "loss": 3.4097, + "step": 3380 + }, + { + "epoch": 0.021306935360320547, + "grad_norm": 8.620429992675781, + "learning_rate": 1.9862325339681316e-05, + "loss": 3.5115, + "step": 3390 + }, + { + "epoch": 0.02136978767701766, + "grad_norm": 8.825028419494629, + "learning_rate": 1.9861906238736663e-05, + "loss": 3.4718, + "step": 3400 + }, + { + "epoch": 0.02143263999371477, + "grad_norm": 7.917201995849609, + "learning_rate": 
1.986148713779201e-05, + "loss": 3.3384, + "step": 3410 + }, + { + "epoch": 0.021495492310411878, + "grad_norm": 7.338304042816162, + "learning_rate": 1.9861068036847357e-05, + "loss": 3.0676, + "step": 3420 + }, + { + "epoch": 0.02155834462710899, + "grad_norm": 7.712886810302734, + "learning_rate": 1.98606489359027e-05, + "loss": 3.3085, + "step": 3430 + }, + { + "epoch": 0.0216211969438061, + "grad_norm": 8.4638032913208, + "learning_rate": 1.9860229834958048e-05, + "loss": 3.3793, + "step": 3440 + }, + { + "epoch": 0.02168404926050321, + "grad_norm": 8.804676055908203, + "learning_rate": 1.9859810734013395e-05, + "loss": 3.5193, + "step": 3450 + }, + { + "epoch": 0.02174690157720032, + "grad_norm": 7.220158100128174, + "learning_rate": 1.9859391633068742e-05, + "loss": 3.3284, + "step": 3460 + }, + { + "epoch": 0.021809753893897433, + "grad_norm": 6.993019104003906, + "learning_rate": 1.985897253212409e-05, + "loss": 3.3708, + "step": 3470 + }, + { + "epoch": 0.021872606210594545, + "grad_norm": 8.048100471496582, + "learning_rate": 1.9858553431179436e-05, + "loss": 3.5168, + "step": 3480 + }, + { + "epoch": 0.021935458527291653, + "grad_norm": 7.993709564208984, + "learning_rate": 1.9858134330234783e-05, + "loss": 3.2929, + "step": 3490 + }, + { + "epoch": 0.021998310843988764, + "grad_norm": 7.402252674102783, + "learning_rate": 1.9857715229290127e-05, + "loss": 3.438, + "step": 3500 + }, + { + "epoch": 0.022061163160685876, + "grad_norm": 8.188456535339355, + "learning_rate": 1.9857296128345474e-05, + "loss": 3.5718, + "step": 3510 + }, + { + "epoch": 0.022124015477382988, + "grad_norm": 8.500411987304688, + "learning_rate": 1.985687702740082e-05, + "loss": 3.2535, + "step": 3520 + }, + { + "epoch": 0.022186867794080096, + "grad_norm": 7.373854160308838, + "learning_rate": 1.985645792645617e-05, + "loss": 3.3569, + "step": 3530 + }, + { + "epoch": 0.022249720110777207, + "grad_norm": 9.229218482971191, + "learning_rate": 1.9856038825511515e-05, + "loss": 
3.4243, + "step": 3540 + }, + { + "epoch": 0.02231257242747432, + "grad_norm": 8.59708309173584, + "learning_rate": 1.9855619724566862e-05, + "loss": 3.354, + "step": 3550 + }, + { + "epoch": 0.02237542474417143, + "grad_norm": 7.8422417640686035, + "learning_rate": 1.985520062362221e-05, + "loss": 3.236, + "step": 3560 + }, + { + "epoch": 0.02243827706086854, + "grad_norm": 6.979317665100098, + "learning_rate": 1.9854781522677553e-05, + "loss": 3.2229, + "step": 3570 + }, + { + "epoch": 0.02250112937756565, + "grad_norm": 8.221169471740723, + "learning_rate": 1.98543624217329e-05, + "loss": 3.315, + "step": 3580 + }, + { + "epoch": 0.022563981694262762, + "grad_norm": 9.011247634887695, + "learning_rate": 1.9853943320788247e-05, + "loss": 3.3895, + "step": 3590 + }, + { + "epoch": 0.022626834010959874, + "grad_norm": 7.5295915603637695, + "learning_rate": 1.9853524219843594e-05, + "loss": 3.3018, + "step": 3600 + }, + { + "epoch": 0.022689686327656982, + "grad_norm": 8.486215591430664, + "learning_rate": 1.9853105118898938e-05, + "loss": 3.2546, + "step": 3610 + }, + { + "epoch": 0.022752538644354094, + "grad_norm": 6.862631320953369, + "learning_rate": 1.9852686017954285e-05, + "loss": 3.2158, + "step": 3620 + }, + { + "epoch": 0.022815390961051205, + "grad_norm": 10.961071968078613, + "learning_rate": 1.9852266917009632e-05, + "loss": 3.5264, + "step": 3630 + }, + { + "epoch": 0.022878243277748317, + "grad_norm": 7.940339088439941, + "learning_rate": 1.985184781606498e-05, + "loss": 3.3097, + "step": 3640 + }, + { + "epoch": 0.022941095594445425, + "grad_norm": 8.660905838012695, + "learning_rate": 1.9851428715120323e-05, + "loss": 3.2254, + "step": 3650 + }, + { + "epoch": 0.023003947911142537, + "grad_norm": 7.902162075042725, + "learning_rate": 1.985100961417567e-05, + "loss": 3.3875, + "step": 3660 + }, + { + "epoch": 0.02306680022783965, + "grad_norm": 8.279126167297363, + "learning_rate": 1.9850590513231017e-05, + "loss": 3.3775, + "step": 3670 + }, + { + 
"epoch": 0.02312965254453676, + "grad_norm": 7.69015645980835, + "learning_rate": 1.9850171412286364e-05, + "loss": 3.1918, + "step": 3680 + }, + { + "epoch": 0.023192504861233868, + "grad_norm": 14.095693588256836, + "learning_rate": 1.984975231134171e-05, + "loss": 3.1605, + "step": 3690 + }, + { + "epoch": 0.02325535717793098, + "grad_norm": 8.890153884887695, + "learning_rate": 1.984933321039706e-05, + "loss": 3.3755, + "step": 3700 + }, + { + "epoch": 0.02331820949462809, + "grad_norm": 8.48962116241455, + "learning_rate": 1.9848914109452405e-05, + "loss": 3.3708, + "step": 3710 + }, + { + "epoch": 0.023381061811325203, + "grad_norm": 8.391972541809082, + "learning_rate": 1.9848495008507753e-05, + "loss": 3.4301, + "step": 3720 + }, + { + "epoch": 0.02344391412802231, + "grad_norm": 5.6870951652526855, + "learning_rate": 1.98480759075631e-05, + "loss": 3.048, + "step": 3730 + }, + { + "epoch": 0.023506766444719423, + "grad_norm": 7.735255241394043, + "learning_rate": 1.9847656806618443e-05, + "loss": 3.1884, + "step": 3740 + }, + { + "epoch": 0.023569618761416535, + "grad_norm": 116.45984649658203, + "learning_rate": 1.984723770567379e-05, + "loss": 3.4161, + "step": 3750 + }, + { + "epoch": 0.023632471078113646, + "grad_norm": 8.070536613464355, + "learning_rate": 1.9846818604729137e-05, + "loss": 3.0678, + "step": 3760 + }, + { + "epoch": 0.023695323394810754, + "grad_norm": 7.883828163146973, + "learning_rate": 1.9846399503784484e-05, + "loss": 3.2295, + "step": 3770 + }, + { + "epoch": 0.023758175711507866, + "grad_norm": 9.798192977905273, + "learning_rate": 1.984598040283983e-05, + "loss": 3.2163, + "step": 3780 + }, + { + "epoch": 0.023821028028204978, + "grad_norm": 7.705380439758301, + "learning_rate": 1.9845561301895175e-05, + "loss": 3.5051, + "step": 3790 + }, + { + "epoch": 0.02388388034490209, + "grad_norm": 8.782146453857422, + "learning_rate": 1.9845142200950522e-05, + "loss": 3.1344, + "step": 3800 + }, + { + "epoch": 0.023946732661599197, + 
"grad_norm": 8.779561042785645, + "learning_rate": 1.984472310000587e-05, + "loss": 3.0366, + "step": 3810 + }, + { + "epoch": 0.02400958497829631, + "grad_norm": 8.433015823364258, + "learning_rate": 1.9844303999061216e-05, + "loss": 3.3192, + "step": 3820 + }, + { + "epoch": 0.02407243729499342, + "grad_norm": 8.250164031982422, + "learning_rate": 1.984388489811656e-05, + "loss": 3.0157, + "step": 3830 + }, + { + "epoch": 0.024135289611690532, + "grad_norm": 7.675574779510498, + "learning_rate": 1.9843465797171907e-05, + "loss": 3.3224, + "step": 3840 + }, + { + "epoch": 0.02419814192838764, + "grad_norm": 7.564833641052246, + "learning_rate": 1.9843046696227254e-05, + "loss": 3.3004, + "step": 3850 + }, + { + "epoch": 0.024260994245084752, + "grad_norm": 7.947770595550537, + "learning_rate": 1.98426275952826e-05, + "loss": 3.1889, + "step": 3860 + }, + { + "epoch": 0.024323846561781864, + "grad_norm": 7.688364028930664, + "learning_rate": 1.984220849433795e-05, + "loss": 3.1756, + "step": 3870 + }, + { + "epoch": 0.024386698878478975, + "grad_norm": 8.064852714538574, + "learning_rate": 1.9841789393393292e-05, + "loss": 3.2235, + "step": 3880 + }, + { + "epoch": 0.024449551195176084, + "grad_norm": 8.496567726135254, + "learning_rate": 1.984137029244864e-05, + "loss": 3.2349, + "step": 3890 + }, + { + "epoch": 0.024512403511873195, + "grad_norm": 7.16607141494751, + "learning_rate": 1.9840951191503986e-05, + "loss": 2.9673, + "step": 3900 + }, + { + "epoch": 0.024575255828570307, + "grad_norm": 8.018267631530762, + "learning_rate": 1.9840532090559333e-05, + "loss": 3.3146, + "step": 3910 + }, + { + "epoch": 0.02463810814526742, + "grad_norm": 7.622980117797852, + "learning_rate": 1.984011298961468e-05, + "loss": 3.2143, + "step": 3920 + }, + { + "epoch": 0.024700960461964527, + "grad_norm": 9.034892082214355, + "learning_rate": 1.9839693888670027e-05, + "loss": 3.1585, + "step": 3930 + }, + { + "epoch": 0.02476381277866164, + "grad_norm": 8.77389144897461, + 
"learning_rate": 1.9839274787725375e-05, + "loss": 3.362, + "step": 3940 + }, + { + "epoch": 0.02482666509535875, + "grad_norm": 8.3350191116333, + "learning_rate": 1.983885568678072e-05, + "loss": 2.8545, + "step": 3950 + }, + { + "epoch": 0.02488951741205586, + "grad_norm": 8.177921295166016, + "learning_rate": 1.9838436585836065e-05, + "loss": 3.0485, + "step": 3960 + }, + { + "epoch": 0.02495236972875297, + "grad_norm": 7.447956562042236, + "learning_rate": 1.9838017484891412e-05, + "loss": 3.2264, + "step": 3970 + }, + { + "epoch": 0.02501522204545008, + "grad_norm": 9.212813377380371, + "learning_rate": 1.983759838394676e-05, + "loss": 3.167, + "step": 3980 + }, + { + "epoch": 0.025078074362147193, + "grad_norm": 7.669730186462402, + "learning_rate": 1.9837179283002106e-05, + "loss": 3.1319, + "step": 3990 + }, + { + "epoch": 0.025140926678844305, + "grad_norm": 9.09615707397461, + "learning_rate": 1.9836760182057454e-05, + "loss": 3.2663, + "step": 4000 + }, + { + "epoch": 0.025203778995541413, + "grad_norm": 8.853797912597656, + "learning_rate": 1.9836341081112797e-05, + "loss": 3.4463, + "step": 4010 + }, + { + "epoch": 0.025266631312238524, + "grad_norm": 10.12424087524414, + "learning_rate": 1.9835921980168144e-05, + "loss": 3.236, + "step": 4020 + }, + { + "epoch": 0.025329483628935636, + "grad_norm": 7.712778568267822, + "learning_rate": 1.983550287922349e-05, + "loss": 3.281, + "step": 4030 + }, + { + "epoch": 0.025392335945632748, + "grad_norm": 7.615508556365967, + "learning_rate": 1.983508377827884e-05, + "loss": 3.1491, + "step": 4040 + }, + { + "epoch": 0.025455188262329856, + "grad_norm": 8.383688926696777, + "learning_rate": 1.9834664677334182e-05, + "loss": 3.1655, + "step": 4050 + }, + { + "epoch": 0.025518040579026968, + "grad_norm": 6.64720344543457, + "learning_rate": 1.983424557638953e-05, + "loss": 3.2549, + "step": 4060 + }, + { + "epoch": 0.02558089289572408, + "grad_norm": 8.25935173034668, + "learning_rate": 1.9833826475444876e-05, + 
"loss": 3.1786, + "step": 4070 + }, + { + "epoch": 0.02564374521242119, + "grad_norm": 7.64882755279541, + "learning_rate": 1.9833407374500223e-05, + "loss": 3.1456, + "step": 4080 + }, + { + "epoch": 0.0257065975291183, + "grad_norm": 7.684487819671631, + "learning_rate": 1.983298827355557e-05, + "loss": 3.1528, + "step": 4090 + }, + { + "epoch": 0.02576944984581541, + "grad_norm": 7.812594413757324, + "learning_rate": 1.9832569172610917e-05, + "loss": 3.2666, + "step": 4100 + }, + { + "epoch": 0.025832302162512522, + "grad_norm": 7.969986915588379, + "learning_rate": 1.9832150071666265e-05, + "loss": 3.2326, + "step": 4110 + }, + { + "epoch": 0.02589515447920963, + "grad_norm": 7.349579334259033, + "learning_rate": 1.9831730970721608e-05, + "loss": 3.0932, + "step": 4120 + }, + { + "epoch": 0.025958006795906742, + "grad_norm": 8.839384078979492, + "learning_rate": 1.9831311869776955e-05, + "loss": 3.2192, + "step": 4130 + }, + { + "epoch": 0.026020859112603854, + "grad_norm": 7.78971004486084, + "learning_rate": 1.9830892768832302e-05, + "loss": 3.154, + "step": 4140 + }, + { + "epoch": 0.026083711429300965, + "grad_norm": 7.024759769439697, + "learning_rate": 1.983047366788765e-05, + "loss": 3.1585, + "step": 4150 + }, + { + "epoch": 0.026146563745998073, + "grad_norm": 8.18656063079834, + "learning_rate": 1.9830054566942997e-05, + "loss": 3.2396, + "step": 4160 + }, + { + "epoch": 0.026209416062695185, + "grad_norm": 7.245763778686523, + "learning_rate": 1.9829635465998344e-05, + "loss": 3.1889, + "step": 4170 + }, + { + "epoch": 0.026272268379392297, + "grad_norm": 9.72729206085205, + "learning_rate": 1.982921636505369e-05, + "loss": 3.2638, + "step": 4180 + }, + { + "epoch": 0.02633512069608941, + "grad_norm": 7.4755401611328125, + "learning_rate": 1.9828797264109034e-05, + "loss": 3.2889, + "step": 4190 + }, + { + "epoch": 0.026397973012786517, + "grad_norm": 8.020759582519531, + "learning_rate": 1.982837816316438e-05, + "loss": 3.0979, + "step": 4200 + }, + 
{ + "epoch": 0.026460825329483628, + "grad_norm": 8.661406517028809, + "learning_rate": 1.982795906221973e-05, + "loss": 3.1325, + "step": 4210 + }, + { + "epoch": 0.02652367764618074, + "grad_norm": 8.634723663330078, + "learning_rate": 1.9827539961275076e-05, + "loss": 3.4406, + "step": 4220 + }, + { + "epoch": 0.02658652996287785, + "grad_norm": 8.003168106079102, + "learning_rate": 1.982712086033042e-05, + "loss": 3.0698, + "step": 4230 + }, + { + "epoch": 0.02664938227957496, + "grad_norm": 7.931961536407471, + "learning_rate": 1.9826701759385766e-05, + "loss": 3.2705, + "step": 4240 + }, + { + "epoch": 0.02671223459627207, + "grad_norm": 9.693182945251465, + "learning_rate": 1.9826282658441113e-05, + "loss": 3.346, + "step": 4250 + }, + { + "epoch": 0.026775086912969183, + "grad_norm": 7.78812837600708, + "learning_rate": 1.982586355749646e-05, + "loss": 3.1529, + "step": 4260 + }, + { + "epoch": 0.026837939229666295, + "grad_norm": 9.524435043334961, + "learning_rate": 1.9825444456551804e-05, + "loss": 3.1564, + "step": 4270 + }, + { + "epoch": 0.026900791546363403, + "grad_norm": 7.616755485534668, + "learning_rate": 1.982502535560715e-05, + "loss": 3.0974, + "step": 4280 + }, + { + "epoch": 0.026963643863060514, + "grad_norm": 6.952274799346924, + "learning_rate": 1.9824606254662498e-05, + "loss": 2.9486, + "step": 4290 + }, + { + "epoch": 0.027026496179757626, + "grad_norm": 9.390430450439453, + "learning_rate": 1.9824187153717845e-05, + "loss": 3.1375, + "step": 4300 + }, + { + "epoch": 0.027089348496454738, + "grad_norm": 8.931705474853516, + "learning_rate": 1.9823768052773192e-05, + "loss": 3.0853, + "step": 4310 + }, + { + "epoch": 0.027152200813151846, + "grad_norm": 9.245275497436523, + "learning_rate": 1.982334895182854e-05, + "loss": 3.0416, + "step": 4320 + }, + { + "epoch": 0.027215053129848957, + "grad_norm": 6.867707252502441, + "learning_rate": 1.9822929850883887e-05, + "loss": 2.9614, + "step": 4330 + }, + { + "epoch": 0.02727790544654607, 
+ "grad_norm": 7.120062351226807, + "learning_rate": 1.9822510749939234e-05, + "loss": 3.0628, + "step": 4340 + }, + { + "epoch": 0.02734075776324318, + "grad_norm": 6.389726638793945, + "learning_rate": 1.982209164899458e-05, + "loss": 3.1971, + "step": 4350 + }, + { + "epoch": 0.02740361007994029, + "grad_norm": 9.57553482055664, + "learning_rate": 1.9821672548049924e-05, + "loss": 3.0961, + "step": 4360 + }, + { + "epoch": 0.0274664623966374, + "grad_norm": 7.604666233062744, + "learning_rate": 1.982125344710527e-05, + "loss": 2.9903, + "step": 4370 + }, + { + "epoch": 0.027529314713334512, + "grad_norm": 7.997584342956543, + "learning_rate": 1.982083434616062e-05, + "loss": 3.1923, + "step": 4380 + }, + { + "epoch": 0.027592167030031624, + "grad_norm": 8.558812141418457, + "learning_rate": 1.9820415245215966e-05, + "loss": 3.3072, + "step": 4390 + }, + { + "epoch": 0.027655019346728732, + "grad_norm": 8.559096336364746, + "learning_rate": 1.9819996144271313e-05, + "loss": 2.8927, + "step": 4400 + }, + { + "epoch": 0.027717871663425844, + "grad_norm": 8.305462837219238, + "learning_rate": 1.9819577043326656e-05, + "loss": 3.3332, + "step": 4410 + }, + { + "epoch": 0.027780723980122955, + "grad_norm": 7.516127586364746, + "learning_rate": 1.9819157942382003e-05, + "loss": 3.2293, + "step": 4420 + }, + { + "epoch": 0.027843576296820067, + "grad_norm": 13.237881660461426, + "learning_rate": 1.981873884143735e-05, + "loss": 3.0727, + "step": 4430 + }, + { + "epoch": 0.027906428613517175, + "grad_norm": 7.675593376159668, + "learning_rate": 1.9818319740492698e-05, + "loss": 2.9193, + "step": 4440 + }, + { + "epoch": 0.027969280930214287, + "grad_norm": 7.40706205368042, + "learning_rate": 1.981790063954804e-05, + "loss": 3.0294, + "step": 4450 + }, + { + "epoch": 0.0280321332469114, + "grad_norm": 7.931587219238281, + "learning_rate": 1.981748153860339e-05, + "loss": 3.2119, + "step": 4460 + }, + { + "epoch": 0.02809498556360851, + "grad_norm": 9.247722625732422, + 
"learning_rate": 1.9817062437658735e-05, + "loss": 3.2771, + "step": 4470 + }, + { + "epoch": 0.028157837880305618, + "grad_norm": 8.768036842346191, + "learning_rate": 1.9816643336714082e-05, + "loss": 3.1498, + "step": 4480 + }, + { + "epoch": 0.02822069019700273, + "grad_norm": 7.700189113616943, + "learning_rate": 1.981622423576943e-05, + "loss": 3.3009, + "step": 4490 + }, + { + "epoch": 0.02828354251369984, + "grad_norm": 7.002710819244385, + "learning_rate": 1.9815805134824773e-05, + "loss": 2.8461, + "step": 4500 + }, + { + "epoch": 0.028346394830396953, + "grad_norm": 7.769290447235107, + "learning_rate": 1.981538603388012e-05, + "loss": 3.0789, + "step": 4510 + }, + { + "epoch": 0.02840924714709406, + "grad_norm": 7.786672115325928, + "learning_rate": 1.9814966932935467e-05, + "loss": 3.2034, + "step": 4520 + }, + { + "epoch": 0.028472099463791173, + "grad_norm": 9.238251686096191, + "learning_rate": 1.9814547831990814e-05, + "loss": 3.0802, + "step": 4530 + }, + { + "epoch": 0.028534951780488284, + "grad_norm": 9.059394836425781, + "learning_rate": 1.981412873104616e-05, + "loss": 3.2412, + "step": 4540 + }, + { + "epoch": 0.028597804097185396, + "grad_norm": 7.74805212020874, + "learning_rate": 1.981370963010151e-05, + "loss": 2.9245, + "step": 4550 + }, + { + "epoch": 0.028660656413882504, + "grad_norm": 7.382835388183594, + "learning_rate": 1.9813290529156856e-05, + "loss": 3.2211, + "step": 4560 + }, + { + "epoch": 0.028723508730579616, + "grad_norm": 13.836868286132812, + "learning_rate": 1.9812871428212203e-05, + "loss": 3.1112, + "step": 4570 + }, + { + "epoch": 0.028786361047276728, + "grad_norm": 6.769961357116699, + "learning_rate": 1.9812452327267546e-05, + "loss": 3.098, + "step": 4580 + }, + { + "epoch": 0.02884921336397384, + "grad_norm": 7.997913360595703, + "learning_rate": 1.9812033226322893e-05, + "loss": 2.8804, + "step": 4590 + }, + { + "epoch": 0.028912065680670947, + "grad_norm": 9.096884727478027, + "learning_rate": 
1.981161412537824e-05, + "loss": 3.136, + "step": 4600 + }, + { + "epoch": 0.02897491799736806, + "grad_norm": 7.878539562225342, + "learning_rate": 1.9811195024433588e-05, + "loss": 3.083, + "step": 4610 + }, + { + "epoch": 0.02903777031406517, + "grad_norm": 7.577316761016846, + "learning_rate": 1.9810775923488935e-05, + "loss": 2.9114, + "step": 4620 + }, + { + "epoch": 0.029100622630762282, + "grad_norm": 7.151461601257324, + "learning_rate": 1.981035682254428e-05, + "loss": 3.0634, + "step": 4630 + }, + { + "epoch": 0.02916347494745939, + "grad_norm": 8.938619613647461, + "learning_rate": 1.9809937721599625e-05, + "loss": 3.0557, + "step": 4640 + }, + { + "epoch": 0.029226327264156502, + "grad_norm": 7.312988758087158, + "learning_rate": 1.9809518620654972e-05, + "loss": 3.2038, + "step": 4650 + }, + { + "epoch": 0.029289179580853614, + "grad_norm": 8.200956344604492, + "learning_rate": 1.980909951971032e-05, + "loss": 3.1253, + "step": 4660 + }, + { + "epoch": 0.029352031897550725, + "grad_norm": 8.429890632629395, + "learning_rate": 1.9808680418765663e-05, + "loss": 2.8572, + "step": 4670 + }, + { + "epoch": 0.029414884214247834, + "grad_norm": 7.834331035614014, + "learning_rate": 1.980826131782101e-05, + "loss": 3.2663, + "step": 4680 + }, + { + "epoch": 0.029477736530944945, + "grad_norm": 8.754341125488281, + "learning_rate": 1.9807842216876357e-05, + "loss": 3.2611, + "step": 4690 + }, + { + "epoch": 0.029540588847642057, + "grad_norm": 8.42916488647461, + "learning_rate": 1.9807423115931704e-05, + "loss": 3.2332, + "step": 4700 + }, + { + "epoch": 0.02960344116433917, + "grad_norm": 7.737362384796143, + "learning_rate": 1.980700401498705e-05, + "loss": 2.9819, + "step": 4710 + }, + { + "epoch": 0.029666293481036277, + "grad_norm": 9.505520820617676, + "learning_rate": 1.98065849140424e-05, + "loss": 3.0618, + "step": 4720 + }, + { + "epoch": 0.029729145797733388, + "grad_norm": 9.08653450012207, + "learning_rate": 1.9806165813097746e-05, + "loss": 
3.1651, + "step": 4730 + }, + { + "epoch": 0.0297919981144305, + "grad_norm": 7.863708972930908, + "learning_rate": 1.9805746712153093e-05, + "loss": 3.077, + "step": 4740 + }, + { + "epoch": 0.02985485043112761, + "grad_norm": 8.24854850769043, + "learning_rate": 1.9805327611208436e-05, + "loss": 3.1126, + "step": 4750 + }, + { + "epoch": 0.02991770274782472, + "grad_norm": 8.799391746520996, + "learning_rate": 1.9804908510263783e-05, + "loss": 2.934, + "step": 4760 + }, + { + "epoch": 0.02998055506452183, + "grad_norm": 9.366531372070312, + "learning_rate": 1.980448940931913e-05, + "loss": 2.8343, + "step": 4770 + }, + { + "epoch": 0.030043407381218943, + "grad_norm": 8.352874755859375, + "learning_rate": 1.9804070308374478e-05, + "loss": 3.1882, + "step": 4780 + }, + { + "epoch": 0.030106259697916055, + "grad_norm": 8.972452163696289, + "learning_rate": 1.9803651207429825e-05, + "loss": 3.0625, + "step": 4790 + }, + { + "epoch": 0.030169112014613163, + "grad_norm": 7.474132061004639, + "learning_rate": 1.9803232106485172e-05, + "loss": 3.1211, + "step": 4800 + }, + { + "epoch": 0.030231964331310274, + "grad_norm": 9.15546703338623, + "learning_rate": 1.9802813005540515e-05, + "loss": 3.0774, + "step": 4810 + }, + { + "epoch": 0.030294816648007386, + "grad_norm": 8.463102340698242, + "learning_rate": 1.9802393904595863e-05, + "loss": 3.0315, + "step": 4820 + }, + { + "epoch": 0.030357668964704494, + "grad_norm": 7.438838005065918, + "learning_rate": 1.980197480365121e-05, + "loss": 3.0363, + "step": 4830 + }, + { + "epoch": 0.030420521281401606, + "grad_norm": 7.952330112457275, + "learning_rate": 1.9801555702706557e-05, + "loss": 3.078, + "step": 4840 + }, + { + "epoch": 0.030483373598098717, + "grad_norm": 7.463561534881592, + "learning_rate": 1.98011366017619e-05, + "loss": 2.9217, + "step": 4850 + }, + { + "epoch": 0.03054622591479583, + "grad_norm": 9.11038875579834, + "learning_rate": 1.9800717500817247e-05, + "loss": 3.4038, + "step": 4860 + }, + { + 
"epoch": 0.030609078231492937, + "grad_norm": 7.781106472015381, + "learning_rate": 1.9800298399872594e-05, + "loss": 3.0778, + "step": 4870 + }, + { + "epoch": 0.03067193054819005, + "grad_norm": 7.971752643585205, + "learning_rate": 1.979987929892794e-05, + "loss": 3.1256, + "step": 4880 + }, + { + "epoch": 0.03073478286488716, + "grad_norm": 9.645397186279297, + "learning_rate": 1.9799460197983285e-05, + "loss": 2.9327, + "step": 4890 + }, + { + "epoch": 0.030797635181584272, + "grad_norm": 8.911731719970703, + "learning_rate": 1.9799041097038632e-05, + "loss": 3.0361, + "step": 4900 + }, + { + "epoch": 0.03086048749828138, + "grad_norm": 8.497078895568848, + "learning_rate": 1.979862199609398e-05, + "loss": 3.0149, + "step": 4910 + }, + { + "epoch": 0.030923339814978492, + "grad_norm": 7.9162068367004395, + "learning_rate": 1.9798202895149326e-05, + "loss": 2.9479, + "step": 4920 + }, + { + "epoch": 0.030986192131675604, + "grad_norm": 7.7327094078063965, + "learning_rate": 1.9797783794204674e-05, + "loss": 3.0199, + "step": 4930 + }, + { + "epoch": 0.031049044448372715, + "grad_norm": 8.18338394165039, + "learning_rate": 1.979736469326002e-05, + "loss": 2.9839, + "step": 4940 + }, + { + "epoch": 0.031111896765069823, + "grad_norm": 7.837664604187012, + "learning_rate": 1.9796945592315368e-05, + "loss": 3.004, + "step": 4950 + }, + { + "epoch": 0.031174749081766935, + "grad_norm": 7.595458984375, + "learning_rate": 1.9796526491370715e-05, + "loss": 3.0458, + "step": 4960 + }, + { + "epoch": 0.031237601398464047, + "grad_norm": 7.489948749542236, + "learning_rate": 1.9796107390426062e-05, + "loss": 2.8061, + "step": 4970 + }, + { + "epoch": 0.03130045371516116, + "grad_norm": 7.755764484405518, + "learning_rate": 1.9795688289481405e-05, + "loss": 2.921, + "step": 4980 + }, + { + "epoch": 0.03136330603185827, + "grad_norm": 9.916472434997559, + "learning_rate": 1.9795269188536753e-05, + "loss": 3.1144, + "step": 4990 + }, + { + "epoch": 0.03142615834855538, + 
"grad_norm": 7.7621259689331055, + "learning_rate": 1.97948500875921e-05, + "loss": 2.9962, + "step": 5000 + }, + { + "epoch": 0.03148901066525249, + "grad_norm": 8.081377983093262, + "learning_rate": 1.9794430986647447e-05, + "loss": 2.9858, + "step": 5010 + }, + { + "epoch": 0.0315518629819496, + "grad_norm": 7.946218013763428, + "learning_rate": 1.9794011885702794e-05, + "loss": 2.994, + "step": 5020 + }, + { + "epoch": 0.03161471529864671, + "grad_norm": 7.75227689743042, + "learning_rate": 1.9793592784758137e-05, + "loss": 2.8066, + "step": 5030 + }, + { + "epoch": 0.03167756761534382, + "grad_norm": 8.829666137695312, + "learning_rate": 1.9793173683813485e-05, + "loss": 2.92, + "step": 5040 + }, + { + "epoch": 0.03174041993204093, + "grad_norm": 7.233504295349121, + "learning_rate": 1.979275458286883e-05, + "loss": 3.2379, + "step": 5050 + }, + { + "epoch": 0.031803272248738045, + "grad_norm": 8.083481788635254, + "learning_rate": 1.979233548192418e-05, + "loss": 3.1046, + "step": 5060 + }, + { + "epoch": 0.03186612456543515, + "grad_norm": 8.806325912475586, + "learning_rate": 1.9791916380979522e-05, + "loss": 3.0145, + "step": 5070 + }, + { + "epoch": 0.03192897688213227, + "grad_norm": 7.285245418548584, + "learning_rate": 1.979149728003487e-05, + "loss": 2.8619, + "step": 5080 + }, + { + "epoch": 0.031991829198829376, + "grad_norm": 8.357592582702637, + "learning_rate": 1.9791078179090216e-05, + "loss": 3.1808, + "step": 5090 + }, + { + "epoch": 0.032054681515526484, + "grad_norm": 7.444748401641846, + "learning_rate": 1.9790659078145564e-05, + "loss": 2.8626, + "step": 5100 + }, + { + "epoch": 0.0321175338322236, + "grad_norm": 6.956993579864502, + "learning_rate": 1.979023997720091e-05, + "loss": 2.7885, + "step": 5110 + }, + { + "epoch": 0.03218038614892071, + "grad_norm": 8.073221206665039, + "learning_rate": 1.9789820876256258e-05, + "loss": 3.0592, + "step": 5120 + }, + { + "epoch": 0.032243238465617816, + "grad_norm": 7.597723484039307, + 
"learning_rate": 1.97894017753116e-05, + "loss": 3.0989, + "step": 5130 + }, + { + "epoch": 0.03230609078231493, + "grad_norm": 7.634385585784912, + "learning_rate": 1.978898267436695e-05, + "loss": 3.2343, + "step": 5140 + }, + { + "epoch": 0.03236894309901204, + "grad_norm": 7.463425636291504, + "learning_rate": 1.9788563573422296e-05, + "loss": 3.1204, + "step": 5150 + }, + { + "epoch": 0.032431795415709154, + "grad_norm": 9.218866348266602, + "learning_rate": 1.9788144472477643e-05, + "loss": 3.0007, + "step": 5160 + }, + { + "epoch": 0.03249464773240626, + "grad_norm": 6.456634044647217, + "learning_rate": 1.978772537153299e-05, + "loss": 3.0349, + "step": 5170 + }, + { + "epoch": 0.03255750004910337, + "grad_norm": 7.182300567626953, + "learning_rate": 1.9787306270588337e-05, + "loss": 3.2196, + "step": 5180 + }, + { + "epoch": 0.032620352365800485, + "grad_norm": 7.616100311279297, + "learning_rate": 1.9786887169643684e-05, + "loss": 3.185, + "step": 5190 + }, + { + "epoch": 0.032683204682497594, + "grad_norm": 8.156831741333008, + "learning_rate": 1.9786468068699027e-05, + "loss": 2.9434, + "step": 5200 + }, + { + "epoch": 0.0327460569991947, + "grad_norm": 7.594676971435547, + "learning_rate": 1.9786048967754375e-05, + "loss": 2.8468, + "step": 5210 + }, + { + "epoch": 0.03280890931589182, + "grad_norm": 14.522555351257324, + "learning_rate": 1.978562986680972e-05, + "loss": 2.9002, + "step": 5220 + }, + { + "epoch": 0.032871761632588925, + "grad_norm": 10.881830215454102, + "learning_rate": 1.978521076586507e-05, + "loss": 3.1985, + "step": 5230 + }, + { + "epoch": 0.03293461394928604, + "grad_norm": 8.674522399902344, + "learning_rate": 1.9784791664920416e-05, + "loss": 3.1654, + "step": 5240 + }, + { + "epoch": 0.03299746626598315, + "grad_norm": 8.216035842895508, + "learning_rate": 1.978437256397576e-05, + "loss": 2.9681, + "step": 5250 + }, + { + "epoch": 0.033060318582680256, + "grad_norm": 6.300205707550049, + "learning_rate": 
1.9783953463031107e-05, + "loss": 2.9274, + "step": 5260 + }, + { + "epoch": 0.03312317089937737, + "grad_norm": 13.085122108459473, + "learning_rate": 1.9783534362086454e-05, + "loss": 3.1333, + "step": 5270 + }, + { + "epoch": 0.03318602321607448, + "grad_norm": 8.12608528137207, + "learning_rate": 1.97831152611418e-05, + "loss": 3.1074, + "step": 5280 + }, + { + "epoch": 0.03324887553277159, + "grad_norm": 7.285276412963867, + "learning_rate": 1.9782696160197144e-05, + "loss": 2.9279, + "step": 5290 + }, + { + "epoch": 0.0333117278494687, + "grad_norm": 7.87554931640625, + "learning_rate": 1.978227705925249e-05, + "loss": 2.7172, + "step": 5300 + }, + { + "epoch": 0.03337458016616581, + "grad_norm": 6.538553714752197, + "learning_rate": 1.978185795830784e-05, + "loss": 2.8172, + "step": 5310 + }, + { + "epoch": 0.033437432482862926, + "grad_norm": 6.931741237640381, + "learning_rate": 1.9781438857363186e-05, + "loss": 2.8979, + "step": 5320 + }, + { + "epoch": 0.033500284799560034, + "grad_norm": 8.080859184265137, + "learning_rate": 1.9781019756418533e-05, + "loss": 3.2293, + "step": 5330 + }, + { + "epoch": 0.03356313711625714, + "grad_norm": 9.293947219848633, + "learning_rate": 1.978060065547388e-05, + "loss": 2.9521, + "step": 5340 + }, + { + "epoch": 0.03362598943295426, + "grad_norm": 8.021860122680664, + "learning_rate": 1.9780181554529227e-05, + "loss": 2.9557, + "step": 5350 + }, + { + "epoch": 0.033688841749651366, + "grad_norm": 6.971328258514404, + "learning_rate": 1.9779762453584574e-05, + "loss": 2.8615, + "step": 5360 + }, + { + "epoch": 0.033751694066348474, + "grad_norm": 7.943089962005615, + "learning_rate": 1.977934335263992e-05, + "loss": 2.9748, + "step": 5370 + }, + { + "epoch": 0.03381454638304559, + "grad_norm": 8.65058708190918, + "learning_rate": 1.9778924251695265e-05, + "loss": 3.0009, + "step": 5380 + }, + { + "epoch": 0.0338773986997427, + "grad_norm": 8.905320167541504, + "learning_rate": 1.977850515075061e-05, + "loss": 3.1791, + 
"step": 5390 + }, + { + "epoch": 0.03394025101643981, + "grad_norm": 7.219740390777588, + "learning_rate": 1.977808604980596e-05, + "loss": 2.8528, + "step": 5400 + }, + { + "epoch": 0.03400310333313692, + "grad_norm": 7.684092044830322, + "learning_rate": 1.9777666948861306e-05, + "loss": 2.7724, + "step": 5410 + }, + { + "epoch": 0.03406595564983403, + "grad_norm": 7.816575527191162, + "learning_rate": 1.9777247847916653e-05, + "loss": 2.9181, + "step": 5420 + }, + { + "epoch": 0.034128807966531144, + "grad_norm": 7.647109508514404, + "learning_rate": 1.9776828746971997e-05, + "loss": 3.147, + "step": 5430 + }, + { + "epoch": 0.03419166028322825, + "grad_norm": 7.7166643142700195, + "learning_rate": 1.9776409646027344e-05, + "loss": 2.8961, + "step": 5440 + }, + { + "epoch": 0.03425451259992536, + "grad_norm": 7.806896686553955, + "learning_rate": 1.977599054508269e-05, + "loss": 2.9634, + "step": 5450 + }, + { + "epoch": 0.034317364916622475, + "grad_norm": 7.769691467285156, + "learning_rate": 1.9775571444138038e-05, + "loss": 3.0611, + "step": 5460 + }, + { + "epoch": 0.034380217233319584, + "grad_norm": 7.387162685394287, + "learning_rate": 1.977515234319338e-05, + "loss": 3.0066, + "step": 5470 + }, + { + "epoch": 0.03444306955001669, + "grad_norm": 8.624832153320312, + "learning_rate": 1.977473324224873e-05, + "loss": 2.8516, + "step": 5480 + }, + { + "epoch": 0.03450592186671381, + "grad_norm": 7.3755998611450195, + "learning_rate": 1.9774314141304076e-05, + "loss": 2.9962, + "step": 5490 + }, + { + "epoch": 0.034568774183410915, + "grad_norm": 7.315758228302002, + "learning_rate": 1.9773895040359423e-05, + "loss": 3.3318, + "step": 5500 + }, + { + "epoch": 0.03463162650010803, + "grad_norm": 8.75410270690918, + "learning_rate": 1.9773475939414766e-05, + "loss": 3.013, + "step": 5510 + }, + { + "epoch": 0.03469447881680514, + "grad_norm": 8.508171081542969, + "learning_rate": 1.9773056838470113e-05, + "loss": 2.7933, + "step": 5520 + }, + { + "epoch": 
0.034757331133502246, + "grad_norm": 16.92315101623535, + "learning_rate": 1.977263773752546e-05, + "loss": 2.9606, + "step": 5530 + }, + { + "epoch": 0.03482018345019936, + "grad_norm": 7.559053421020508, + "learning_rate": 1.9772218636580808e-05, + "loss": 2.928, + "step": 5540 + }, + { + "epoch": 0.03488303576689647, + "grad_norm": 8.169791221618652, + "learning_rate": 1.9771799535636155e-05, + "loss": 3.0318, + "step": 5550 + }, + { + "epoch": 0.03494588808359358, + "grad_norm": 8.387129783630371, + "learning_rate": 1.9771380434691502e-05, + "loss": 3.1214, + "step": 5560 + }, + { + "epoch": 0.03500874040029069, + "grad_norm": 7.538905143737793, + "learning_rate": 1.977096133374685e-05, + "loss": 3.1572, + "step": 5570 + }, + { + "epoch": 0.0350715927169878, + "grad_norm": 8.610246658325195, + "learning_rate": 1.9770542232802196e-05, + "loss": 2.86, + "step": 5580 + }, + { + "epoch": 0.035134445033684916, + "grad_norm": 10.131224632263184, + "learning_rate": 1.9770123131857543e-05, + "loss": 3.0039, + "step": 5590 + }, + { + "epoch": 0.035197297350382024, + "grad_norm": 14.25268840789795, + "learning_rate": 1.9769704030912887e-05, + "loss": 2.9156, + "step": 5600 + }, + { + "epoch": 0.03526014966707913, + "grad_norm": 8.070310592651367, + "learning_rate": 1.9769284929968234e-05, + "loss": 2.9185, + "step": 5610 + }, + { + "epoch": 0.03532300198377625, + "grad_norm": 8.09715747833252, + "learning_rate": 1.976886582902358e-05, + "loss": 2.9834, + "step": 5620 + }, + { + "epoch": 0.035385854300473356, + "grad_norm": 9.936697006225586, + "learning_rate": 1.9768446728078928e-05, + "loss": 3.1071, + "step": 5630 + }, + { + "epoch": 0.035448706617170464, + "grad_norm": 7.963923931121826, + "learning_rate": 1.9768027627134275e-05, + "loss": 3.1901, + "step": 5640 + }, + { + "epoch": 0.03551155893386758, + "grad_norm": 10.36141300201416, + "learning_rate": 1.976760852618962e-05, + "loss": 2.9536, + "step": 5650 + }, + { + "epoch": 0.03557441125056469, + "grad_norm": 
8.136571884155273, + "learning_rate": 1.9767189425244966e-05, + "loss": 3.0211, + "step": 5660 + }, + { + "epoch": 0.0356372635672618, + "grad_norm": 7.600170612335205, + "learning_rate": 1.9766770324300313e-05, + "loss": 2.8099, + "step": 5670 + }, + { + "epoch": 0.03570011588395891, + "grad_norm": 7.338781356811523, + "learning_rate": 1.976635122335566e-05, + "loss": 2.9765, + "step": 5680 + }, + { + "epoch": 0.03576296820065602, + "grad_norm": 6.763370513916016, + "learning_rate": 1.9765932122411003e-05, + "loss": 2.9867, + "step": 5690 + }, + { + "epoch": 0.035825820517353134, + "grad_norm": 7.72622537612915, + "learning_rate": 1.976551302146635e-05, + "loss": 2.9673, + "step": 5700 + }, + { + "epoch": 0.03588867283405024, + "grad_norm": 7.801615238189697, + "learning_rate": 1.9765093920521698e-05, + "loss": 2.9499, + "step": 5710 + }, + { + "epoch": 0.03595152515074735, + "grad_norm": 9.447830200195312, + "learning_rate": 1.9764674819577045e-05, + "loss": 3.1475, + "step": 5720 + }, + { + "epoch": 0.036014377467444465, + "grad_norm": 9.913958549499512, + "learning_rate": 1.9764255718632392e-05, + "loss": 2.7094, + "step": 5730 + }, + { + "epoch": 0.03607722978414157, + "grad_norm": 7.018682479858398, + "learning_rate": 1.976383661768774e-05, + "loss": 2.7891, + "step": 5740 + }, + { + "epoch": 0.03614008210083869, + "grad_norm": 8.090155601501465, + "learning_rate": 1.9763417516743082e-05, + "loss": 2.8387, + "step": 5750 + }, + { + "epoch": 0.0362029344175358, + "grad_norm": 11.208479881286621, + "learning_rate": 1.976299841579843e-05, + "loss": 2.8956, + "step": 5760 + }, + { + "epoch": 0.036265786734232905, + "grad_norm": 8.381065368652344, + "learning_rate": 1.9762579314853777e-05, + "loss": 2.8762, + "step": 5770 + }, + { + "epoch": 0.03632863905093002, + "grad_norm": 8.11691951751709, + "learning_rate": 1.9762160213909124e-05, + "loss": 2.7947, + "step": 5780 + }, + { + "epoch": 0.03639149136762713, + "grad_norm": 9.803716659545898, + "learning_rate": 
1.976174111296447e-05, + "loss": 2.8678, + "step": 5790 + }, + { + "epoch": 0.036454343684324236, + "grad_norm": 7.850063323974609, + "learning_rate": 1.9761322012019818e-05, + "loss": 2.9399, + "step": 5800 + }, + { + "epoch": 0.03651719600102135, + "grad_norm": 7.630870342254639, + "learning_rate": 1.9760902911075165e-05, + "loss": 3.0078, + "step": 5810 + }, + { + "epoch": 0.03658004831771846, + "grad_norm": 7.013731956481934, + "learning_rate": 1.9760483810130512e-05, + "loss": 2.9047, + "step": 5820 + }, + { + "epoch": 0.036642900634415575, + "grad_norm": 12.404043197631836, + "learning_rate": 1.9760064709185856e-05, + "loss": 2.8254, + "step": 5830 + }, + { + "epoch": 0.03670575295111268, + "grad_norm": 6.9944305419921875, + "learning_rate": 1.9759645608241203e-05, + "loss": 2.8782, + "step": 5840 + }, + { + "epoch": 0.03676860526780979, + "grad_norm": 10.313700675964355, + "learning_rate": 1.975922650729655e-05, + "loss": 2.9288, + "step": 5850 + }, + { + "epoch": 0.036831457584506906, + "grad_norm": 8.944605827331543, + "learning_rate": 1.9758807406351897e-05, + "loss": 2.9773, + "step": 5860 + }, + { + "epoch": 0.036894309901204014, + "grad_norm": 7.414758205413818, + "learning_rate": 1.975838830540724e-05, + "loss": 2.7341, + "step": 5870 + }, + { + "epoch": 0.03695716221790112, + "grad_norm": 9.151510238647461, + "learning_rate": 1.9757969204462588e-05, + "loss": 2.9997, + "step": 5880 + }, + { + "epoch": 0.03702001453459824, + "grad_norm": 6.836916923522949, + "learning_rate": 1.9757550103517935e-05, + "loss": 2.9573, + "step": 5890 + }, + { + "epoch": 0.037082866851295346, + "grad_norm": 6.474551677703857, + "learning_rate": 1.9757131002573282e-05, + "loss": 2.939, + "step": 5900 + }, + { + "epoch": 0.03714571916799246, + "grad_norm": 8.964326858520508, + "learning_rate": 1.9756711901628625e-05, + "loss": 3.0834, + "step": 5910 + }, + { + "epoch": 0.03720857148468957, + "grad_norm": 9.188521385192871, + "learning_rate": 1.9756292800683973e-05, + 
"loss": 2.7889, + "step": 5920 + }, + { + "epoch": 0.03727142380138668, + "grad_norm": 8.61336612701416, + "learning_rate": 1.975587369973932e-05, + "loss": 3.0299, + "step": 5930 + }, + { + "epoch": 0.03733427611808379, + "grad_norm": 11.761985778808594, + "learning_rate": 1.9755454598794667e-05, + "loss": 3.0475, + "step": 5940 + }, + { + "epoch": 0.0373971284347809, + "grad_norm": 8.325447082519531, + "learning_rate": 1.9755035497850014e-05, + "loss": 2.8828, + "step": 5950 + }, + { + "epoch": 0.03745998075147801, + "grad_norm": 8.411840438842773, + "learning_rate": 1.975461639690536e-05, + "loss": 3.0071, + "step": 5960 + }, + { + "epoch": 0.037522833068175124, + "grad_norm": 7.479046821594238, + "learning_rate": 1.9754197295960708e-05, + "loss": 2.8171, + "step": 5970 + }, + { + "epoch": 0.03758568538487223, + "grad_norm": 6.711784839630127, + "learning_rate": 1.9753778195016055e-05, + "loss": 2.7996, + "step": 5980 + }, + { + "epoch": 0.03764853770156935, + "grad_norm": 8.660781860351562, + "learning_rate": 1.9753359094071402e-05, + "loss": 3.0605, + "step": 5990 + }, + { + "epoch": 0.037711390018266455, + "grad_norm": 8.534005165100098, + "learning_rate": 1.9752939993126746e-05, + "loss": 2.683, + "step": 6000 + }, + { + "epoch": 0.03777424233496356, + "grad_norm": 7.058311939239502, + "learning_rate": 1.9752520892182093e-05, + "loss": 3.0076, + "step": 6010 + }, + { + "epoch": 0.03783709465166068, + "grad_norm": 6.711240291595459, + "learning_rate": 1.975210179123744e-05, + "loss": 3.0649, + "step": 6020 + }, + { + "epoch": 0.03789994696835779, + "grad_norm": 7.963628768920898, + "learning_rate": 1.9751682690292787e-05, + "loss": 2.7924, + "step": 6030 + }, + { + "epoch": 0.037962799285054895, + "grad_norm": 15.460878372192383, + "learning_rate": 1.9751263589348134e-05, + "loss": 2.5997, + "step": 6040 + }, + { + "epoch": 0.03802565160175201, + "grad_norm": 8.393654823303223, + "learning_rate": 1.9750844488403478e-05, + "loss": 2.8854, + "step": 6050 + }, + 
{ + "epoch": 0.03808850391844912, + "grad_norm": 7.014850616455078, + "learning_rate": 1.9750425387458825e-05, + "loss": 2.8847, + "step": 6060 + }, + { + "epoch": 0.03815135623514623, + "grad_norm": 7.304200172424316, + "learning_rate": 1.9750006286514172e-05, + "loss": 2.8667, + "step": 6070 + }, + { + "epoch": 0.03821420855184334, + "grad_norm": 7.77520751953125, + "learning_rate": 1.974958718556952e-05, + "loss": 2.8748, + "step": 6080 + }, + { + "epoch": 0.03827706086854045, + "grad_norm": 8.249287605285645, + "learning_rate": 1.9749168084624863e-05, + "loss": 2.9645, + "step": 6090 + }, + { + "epoch": 0.038339913185237565, + "grad_norm": 8.533134460449219, + "learning_rate": 1.974874898368021e-05, + "loss": 3.1404, + "step": 6100 + }, + { + "epoch": 0.03840276550193467, + "grad_norm": 7.188451766967773, + "learning_rate": 1.9748329882735557e-05, + "loss": 2.7063, + "step": 6110 + }, + { + "epoch": 0.03846561781863178, + "grad_norm": 10.097341537475586, + "learning_rate": 1.9747910781790904e-05, + "loss": 2.6763, + "step": 6120 + }, + { + "epoch": 0.038528470135328896, + "grad_norm": 9.750001907348633, + "learning_rate": 1.974749168084625e-05, + "loss": 2.8141, + "step": 6130 + }, + { + "epoch": 0.038591322452026004, + "grad_norm": 8.507477760314941, + "learning_rate": 1.9747072579901595e-05, + "loss": 3.0148, + "step": 6140 + }, + { + "epoch": 0.03865417476872312, + "grad_norm": 6.048313617706299, + "learning_rate": 1.974665347895694e-05, + "loss": 2.7566, + "step": 6150 + }, + { + "epoch": 0.03871702708542023, + "grad_norm": 7.9719014167785645, + "learning_rate": 1.974623437801229e-05, + "loss": 2.7615, + "step": 6160 + }, + { + "epoch": 0.038779879402117336, + "grad_norm": 16.9856014251709, + "learning_rate": 1.9745815277067636e-05, + "loss": 2.8321, + "step": 6170 + }, + { + "epoch": 0.03884273171881445, + "grad_norm": 7.8329973220825195, + "learning_rate": 1.9745396176122983e-05, + "loss": 2.9907, + "step": 6180 + }, + { + "epoch": 0.03890558403551156, + 
"grad_norm": 8.186614036560059, + "learning_rate": 1.974497707517833e-05, + "loss": 3.0901, + "step": 6190 + }, + { + "epoch": 0.03896843635220867, + "grad_norm": 7.800524711608887, + "learning_rate": 1.9744557974233677e-05, + "loss": 3.0279, + "step": 6200 + }, + { + "epoch": 0.03903128866890578, + "grad_norm": 8.399930000305176, + "learning_rate": 1.9744138873289024e-05, + "loss": 2.6055, + "step": 6210 + }, + { + "epoch": 0.03909414098560289, + "grad_norm": 7.567289352416992, + "learning_rate": 1.9743719772344368e-05, + "loss": 2.8454, + "step": 6220 + }, + { + "epoch": 0.0391569933023, + "grad_norm": 8.097586631774902, + "learning_rate": 1.9743300671399715e-05, + "loss": 2.8724, + "step": 6230 + }, + { + "epoch": 0.039219845618997114, + "grad_norm": 7.968715190887451, + "learning_rate": 1.9742881570455062e-05, + "loss": 2.8206, + "step": 6240 + }, + { + "epoch": 0.03928269793569422, + "grad_norm": 8.068397521972656, + "learning_rate": 1.974246246951041e-05, + "loss": 2.7349, + "step": 6250 + }, + { + "epoch": 0.03934555025239134, + "grad_norm": 7.101130962371826, + "learning_rate": 1.9742043368565756e-05, + "loss": 2.6308, + "step": 6260 + }, + { + "epoch": 0.039408402569088445, + "grad_norm": 7.49091100692749, + "learning_rate": 1.97416242676211e-05, + "loss": 2.7669, + "step": 6270 + }, + { + "epoch": 0.03947125488578555, + "grad_norm": 7.830532550811768, + "learning_rate": 1.9741205166676447e-05, + "loss": 2.7484, + "step": 6280 + }, + { + "epoch": 0.03953410720248267, + "grad_norm": 7.557278633117676, + "learning_rate": 1.9740786065731794e-05, + "loss": 2.8416, + "step": 6290 + }, + { + "epoch": 0.03959695951917978, + "grad_norm": 7.9041972160339355, + "learning_rate": 1.974036696478714e-05, + "loss": 2.781, + "step": 6300 + }, + { + "epoch": 0.039659811835876885, + "grad_norm": 8.118491172790527, + "learning_rate": 1.9739947863842485e-05, + "loss": 3.0058, + "step": 6310 + }, + { + "epoch": 0.039722664152574, + "grad_norm": 8.89059066772461, + 
"learning_rate": 1.973952876289783e-05, + "loss": 2.9524, + "step": 6320 + }, + { + "epoch": 0.03978551646927111, + "grad_norm": 7.545521259307861, + "learning_rate": 1.973910966195318e-05, + "loss": 2.9124, + "step": 6330 + }, + { + "epoch": 0.03984836878596822, + "grad_norm": 8.568503379821777, + "learning_rate": 1.9738690561008526e-05, + "loss": 3.0594, + "step": 6340 + }, + { + "epoch": 0.03991122110266533, + "grad_norm": 8.055715560913086, + "learning_rate": 1.9738271460063873e-05, + "loss": 3.1005, + "step": 6350 + }, + { + "epoch": 0.03997407341936244, + "grad_norm": 8.731293678283691, + "learning_rate": 1.973785235911922e-05, + "loss": 2.8984, + "step": 6360 + }, + { + "epoch": 0.040036925736059555, + "grad_norm": 7.474778652191162, + "learning_rate": 1.9737433258174567e-05, + "loss": 2.8651, + "step": 6370 + }, + { + "epoch": 0.04009977805275666, + "grad_norm": 7.777046203613281, + "learning_rate": 1.973701415722991e-05, + "loss": 2.7705, + "step": 6380 + }, + { + "epoch": 0.04016263036945377, + "grad_norm": 6.773117542266846, + "learning_rate": 1.9736595056285258e-05, + "loss": 2.8959, + "step": 6390 + }, + { + "epoch": 0.040225482686150886, + "grad_norm": 8.405336380004883, + "learning_rate": 1.9736175955340605e-05, + "loss": 2.6969, + "step": 6400 + }, + { + "epoch": 0.040288335002847994, + "grad_norm": 9.41283893585205, + "learning_rate": 1.9735756854395952e-05, + "loss": 2.9477, + "step": 6410 + }, + { + "epoch": 0.04035118731954511, + "grad_norm": 8.614728927612305, + "learning_rate": 1.97353377534513e-05, + "loss": 2.8038, + "step": 6420 + }, + { + "epoch": 0.04041403963624222, + "grad_norm": 7.485757827758789, + "learning_rate": 1.9734918652506646e-05, + "loss": 2.7371, + "step": 6430 + }, + { + "epoch": 0.040476891952939326, + "grad_norm": 8.51486587524414, + "learning_rate": 1.9734499551561993e-05, + "loss": 2.721, + "step": 6440 + }, + { + "epoch": 0.04053974426963644, + "grad_norm": 8.288865089416504, + "learning_rate": 1.9734080450617337e-05, 
+ "loss": 2.745, + "step": 6450 + }, + { + "epoch": 0.04060259658633355, + "grad_norm": 7.884392261505127, + "learning_rate": 1.9733661349672684e-05, + "loss": 2.7664, + "step": 6460 + }, + { + "epoch": 0.04066544890303066, + "grad_norm": 7.100053787231445, + "learning_rate": 1.973324224872803e-05, + "loss": 2.7882, + "step": 6470 + }, + { + "epoch": 0.04072830121972777, + "grad_norm": 7.000625133514404, + "learning_rate": 1.9732823147783378e-05, + "loss": 2.6478, + "step": 6480 + }, + { + "epoch": 0.04079115353642488, + "grad_norm": 8.798921585083008, + "learning_rate": 1.973240404683872e-05, + "loss": 2.8204, + "step": 6490 + }, + { + "epoch": 0.040854005853121995, + "grad_norm": 7.540419101715088, + "learning_rate": 1.973198494589407e-05, + "loss": 2.6759, + "step": 6500 + }, + { + "epoch": 0.040916858169819104, + "grad_norm": 7.443111419677734, + "learning_rate": 1.9731565844949416e-05, + "loss": 2.7538, + "step": 6510 + }, + { + "epoch": 0.04097971048651621, + "grad_norm": 8.110397338867188, + "learning_rate": 1.9731146744004763e-05, + "loss": 2.8167, + "step": 6520 + }, + { + "epoch": 0.04104256280321333, + "grad_norm": 7.762920379638672, + "learning_rate": 1.9730727643060107e-05, + "loss": 2.7781, + "step": 6530 + }, + { + "epoch": 0.041105415119910435, + "grad_norm": 7.832665920257568, + "learning_rate": 1.9730308542115454e-05, + "loss": 2.8305, + "step": 6540 + }, + { + "epoch": 0.04116826743660754, + "grad_norm": 7.673941612243652, + "learning_rate": 1.97298894411708e-05, + "loss": 2.8912, + "step": 6550 + }, + { + "epoch": 0.04123111975330466, + "grad_norm": 9.414925575256348, + "learning_rate": 1.9729470340226148e-05, + "loss": 2.7999, + "step": 6560 + }, + { + "epoch": 0.041293972070001767, + "grad_norm": 10.081149101257324, + "learning_rate": 1.9729051239281495e-05, + "loss": 2.8094, + "step": 6570 + }, + { + "epoch": 0.04135682438669888, + "grad_norm": 7.751121997833252, + "learning_rate": 1.9728632138336842e-05, + "loss": 2.7884, + "step": 6580 + }, 
+ { + "epoch": 0.04141967670339599, + "grad_norm": 11.314135551452637, + "learning_rate": 1.972821303739219e-05, + "loss": 2.7703, + "step": 6590 + }, + { + "epoch": 0.0414825290200931, + "grad_norm": 8.257302284240723, + "learning_rate": 1.9727793936447536e-05, + "loss": 3.097, + "step": 6600 + }, + { + "epoch": 0.04154538133679021, + "grad_norm": 7.9447526931762695, + "learning_rate": 1.9727374835502883e-05, + "loss": 2.7905, + "step": 6610 + }, + { + "epoch": 0.04160823365348732, + "grad_norm": 6.873098850250244, + "learning_rate": 1.9726955734558227e-05, + "loss": 2.7661, + "step": 6620 + }, + { + "epoch": 0.04167108597018443, + "grad_norm": 8.159378051757812, + "learning_rate": 1.9726536633613574e-05, + "loss": 3.0937, + "step": 6630 + }, + { + "epoch": 0.041733938286881544, + "grad_norm": 8.810033798217773, + "learning_rate": 1.972611753266892e-05, + "loss": 2.7267, + "step": 6640 + }, + { + "epoch": 0.04179679060357865, + "grad_norm": 7.456930160522461, + "learning_rate": 1.9725698431724268e-05, + "loss": 2.8697, + "step": 6650 + }, + { + "epoch": 0.04185964292027577, + "grad_norm": 13.624564170837402, + "learning_rate": 1.9725279330779615e-05, + "loss": 2.6359, + "step": 6660 + }, + { + "epoch": 0.041922495236972876, + "grad_norm": 7.865520477294922, + "learning_rate": 1.972486022983496e-05, + "loss": 2.6366, + "step": 6670 + }, + { + "epoch": 0.041985347553669984, + "grad_norm": 9.472454071044922, + "learning_rate": 1.9724441128890306e-05, + "loss": 2.9113, + "step": 6680 + }, + { + "epoch": 0.0420481998703671, + "grad_norm": 8.41500473022461, + "learning_rate": 1.9724022027945653e-05, + "loss": 3.1977, + "step": 6690 + }, + { + "epoch": 0.04211105218706421, + "grad_norm": 7.459606170654297, + "learning_rate": 1.9723602927001e-05, + "loss": 2.5954, + "step": 6700 + }, + { + "epoch": 0.042173904503761316, + "grad_norm": 8.650262832641602, + "learning_rate": 1.9723183826056344e-05, + "loss": 2.937, + "step": 6710 + }, + { + "epoch": 0.04223675682045843, + 
"grad_norm": 8.585243225097656, + "learning_rate": 1.972276472511169e-05, + "loss": 2.7615, + "step": 6720 + }, + { + "epoch": 0.04229960913715554, + "grad_norm": 13.335293769836426, + "learning_rate": 1.9722345624167038e-05, + "loss": 3.1172, + "step": 6730 + }, + { + "epoch": 0.042362461453852654, + "grad_norm": 6.974452018737793, + "learning_rate": 1.9721926523222385e-05, + "loss": 3.0229, + "step": 6740 + }, + { + "epoch": 0.04242531377054976, + "grad_norm": 8.071279525756836, + "learning_rate": 1.9721507422277732e-05, + "loss": 2.7496, + "step": 6750 + }, + { + "epoch": 0.04248816608724687, + "grad_norm": 9.10824966430664, + "learning_rate": 1.9721088321333076e-05, + "loss": 2.8403, + "step": 6760 + }, + { + "epoch": 0.042551018403943985, + "grad_norm": 8.561415672302246, + "learning_rate": 1.9720669220388423e-05, + "loss": 2.7773, + "step": 6770 + }, + { + "epoch": 0.042613870720641094, + "grad_norm": 7.2878499031066895, + "learning_rate": 1.972025011944377e-05, + "loss": 2.5423, + "step": 6780 + }, + { + "epoch": 0.0426767230373382, + "grad_norm": 8.255075454711914, + "learning_rate": 1.9719831018499117e-05, + "loss": 3.0136, + "step": 6790 + }, + { + "epoch": 0.04273957535403532, + "grad_norm": 7.35914421081543, + "learning_rate": 1.9719411917554464e-05, + "loss": 2.9261, + "step": 6800 + }, + { + "epoch": 0.042802427670732425, + "grad_norm": 7.392853736877441, + "learning_rate": 1.971899281660981e-05, + "loss": 2.7858, + "step": 6810 + }, + { + "epoch": 0.04286527998742954, + "grad_norm": 7.571976661682129, + "learning_rate": 1.9718573715665158e-05, + "loss": 3.0002, + "step": 6820 + }, + { + "epoch": 0.04292813230412665, + "grad_norm": 10.860123634338379, + "learning_rate": 1.9718154614720505e-05, + "loss": 3.0203, + "step": 6830 + }, + { + "epoch": 0.042990984620823756, + "grad_norm": 8.156268119812012, + "learning_rate": 1.971773551377585e-05, + "loss": 2.9101, + "step": 6840 + }, + { + "epoch": 0.04305383693752087, + "grad_norm": 8.31237506866455, + 
"learning_rate": 1.9717316412831196e-05, + "loss": 2.754, + "step": 6850 + }, + { + "epoch": 0.04311668925421798, + "grad_norm": 8.099552154541016, + "learning_rate": 1.9716897311886543e-05, + "loss": 2.919, + "step": 6860 + }, + { + "epoch": 0.04317954157091509, + "grad_norm": 7.635723114013672, + "learning_rate": 1.971647821094189e-05, + "loss": 2.7609, + "step": 6870 + }, + { + "epoch": 0.0432423938876122, + "grad_norm": 7.842891216278076, + "learning_rate": 1.9716059109997237e-05, + "loss": 2.8499, + "step": 6880 + }, + { + "epoch": 0.04330524620430931, + "grad_norm": 7.430498123168945, + "learning_rate": 1.971564000905258e-05, + "loss": 2.773, + "step": 6890 + }, + { + "epoch": 0.04336809852100642, + "grad_norm": 7.651177883148193, + "learning_rate": 1.9715220908107928e-05, + "loss": 2.7329, + "step": 6900 + }, + { + "epoch": 0.043430950837703534, + "grad_norm": 7.382655620574951, + "learning_rate": 1.9714801807163275e-05, + "loss": 2.8792, + "step": 6910 + }, + { + "epoch": 0.04349380315440064, + "grad_norm": 8.936134338378906, + "learning_rate": 1.9714382706218622e-05, + "loss": 2.8317, + "step": 6920 + }, + { + "epoch": 0.04355665547109776, + "grad_norm": 7.522771835327148, + "learning_rate": 1.9713963605273966e-05, + "loss": 2.6853, + "step": 6930 + }, + { + "epoch": 0.043619507787794866, + "grad_norm": 8.205595016479492, + "learning_rate": 1.9713544504329313e-05, + "loss": 2.8547, + "step": 6940 + }, + { + "epoch": 0.043682360104491974, + "grad_norm": 8.565117835998535, + "learning_rate": 1.971312540338466e-05, + "loss": 2.6827, + "step": 6950 + }, + { + "epoch": 0.04374521242118909, + "grad_norm": 7.382494926452637, + "learning_rate": 1.9712706302440007e-05, + "loss": 2.7841, + "step": 6960 + }, + { + "epoch": 0.0438080647378862, + "grad_norm": 7.452759265899658, + "learning_rate": 1.9712287201495354e-05, + "loss": 2.8491, + "step": 6970 + }, + { + "epoch": 0.043870917054583305, + "grad_norm": 6.809290885925293, + "learning_rate": 1.97118681005507e-05, + 
"loss": 2.7978, + "step": 6980 + }, + { + "epoch": 0.04393376937128042, + "grad_norm": 8.27755069732666, + "learning_rate": 1.9711448999606048e-05, + "loss": 2.9641, + "step": 6990 + }, + { + "epoch": 0.04399662168797753, + "grad_norm": 7.654524803161621, + "learning_rate": 1.9711029898661395e-05, + "loss": 2.7442, + "step": 7000 + }, + { + "epoch": 0.044059474004674644, + "grad_norm": 7.790019989013672, + "learning_rate": 1.971061079771674e-05, + "loss": 2.8155, + "step": 7010 + }, + { + "epoch": 0.04412232632137175, + "grad_norm": 7.8720502853393555, + "learning_rate": 1.9710191696772086e-05, + "loss": 2.6249, + "step": 7020 + }, + { + "epoch": 0.04418517863806886, + "grad_norm": 8.328658103942871, + "learning_rate": 1.9709772595827433e-05, + "loss": 2.7292, + "step": 7030 + }, + { + "epoch": 0.044248030954765975, + "grad_norm": 9.934185981750488, + "learning_rate": 1.970935349488278e-05, + "loss": 2.8282, + "step": 7040 + }, + { + "epoch": 0.044310883271463083, + "grad_norm": 7.164620876312256, + "learning_rate": 1.9708934393938127e-05, + "loss": 2.702, + "step": 7050 + }, + { + "epoch": 0.04437373558816019, + "grad_norm": 7.628801345825195, + "learning_rate": 1.9708515292993474e-05, + "loss": 2.8875, + "step": 7060 + }, + { + "epoch": 0.04443658790485731, + "grad_norm": 7.553658485412598, + "learning_rate": 1.9708096192048818e-05, + "loss": 2.5208, + "step": 7070 + }, + { + "epoch": 0.044499440221554415, + "grad_norm": 8.54419994354248, + "learning_rate": 1.9707677091104165e-05, + "loss": 2.8091, + "step": 7080 + }, + { + "epoch": 0.04456229253825153, + "grad_norm": 7.7056660652160645, + "learning_rate": 1.9707257990159512e-05, + "loss": 2.925, + "step": 7090 + }, + { + "epoch": 0.04462514485494864, + "grad_norm": 8.290834426879883, + "learning_rate": 1.970683888921486e-05, + "loss": 2.7995, + "step": 7100 + }, + { + "epoch": 0.044687997171645746, + "grad_norm": 7.718125820159912, + "learning_rate": 1.9706419788270203e-05, + "loss": 2.6879, + "step": 7110 + }, 
+ { + "epoch": 0.04475084948834286, + "grad_norm": 7.2181830406188965, + "learning_rate": 1.970600068732555e-05, + "loss": 2.8546, + "step": 7120 + }, + { + "epoch": 0.04481370180503997, + "grad_norm": 6.926623821258545, + "learning_rate": 1.9705581586380897e-05, + "loss": 2.8165, + "step": 7130 + }, + { + "epoch": 0.04487655412173708, + "grad_norm": 8.519497871398926, + "learning_rate": 1.9705162485436244e-05, + "loss": 2.9327, + "step": 7140 + }, + { + "epoch": 0.04493940643843419, + "grad_norm": 7.977126121520996, + "learning_rate": 1.9704743384491588e-05, + "loss": 2.9805, + "step": 7150 + }, + { + "epoch": 0.0450022587551313, + "grad_norm": 8.655324935913086, + "learning_rate": 1.9704324283546935e-05, + "loss": 2.6832, + "step": 7160 + }, + { + "epoch": 0.045065111071828416, + "grad_norm": 8.400687217712402, + "learning_rate": 1.9703905182602282e-05, + "loss": 2.7897, + "step": 7170 + }, + { + "epoch": 0.045127963388525524, + "grad_norm": 7.366981029510498, + "learning_rate": 1.970348608165763e-05, + "loss": 3.0097, + "step": 7180 + }, + { + "epoch": 0.04519081570522263, + "grad_norm": 7.5406599044799805, + "learning_rate": 1.9703066980712976e-05, + "loss": 2.7499, + "step": 7190 + }, + { + "epoch": 0.04525366802191975, + "grad_norm": 7.338527202606201, + "learning_rate": 1.9702647879768323e-05, + "loss": 2.733, + "step": 7200 + }, + { + "epoch": 0.045316520338616856, + "grad_norm": 8.345789909362793, + "learning_rate": 1.970222877882367e-05, + "loss": 2.6885, + "step": 7210 + }, + { + "epoch": 0.045379372655313964, + "grad_norm": 7.962245464324951, + "learning_rate": 1.9701809677879017e-05, + "loss": 2.5881, + "step": 7220 + }, + { + "epoch": 0.04544222497201108, + "grad_norm": 8.405911445617676, + "learning_rate": 1.9701390576934364e-05, + "loss": 2.7834, + "step": 7230 + }, + { + "epoch": 0.04550507728870819, + "grad_norm": 8.099193572998047, + "learning_rate": 1.9700971475989708e-05, + "loss": 2.8454, + "step": 7240 + }, + { + "epoch": 0.0455679296054053, 
+ "grad_norm": 9.434566497802734, + "learning_rate": 1.9700552375045055e-05, + "loss": 3.0116, + "step": 7250 + }, + { + "epoch": 0.04563078192210241, + "grad_norm": 8.67699909210205, + "learning_rate": 1.9700133274100402e-05, + "loss": 2.8667, + "step": 7260 + }, + { + "epoch": 0.04569363423879952, + "grad_norm": 7.456284999847412, + "learning_rate": 1.969971417315575e-05, + "loss": 2.5807, + "step": 7270 + }, + { + "epoch": 0.045756486555496634, + "grad_norm": 7.954057693481445, + "learning_rate": 1.9699295072211096e-05, + "loss": 2.9401, + "step": 7280 + }, + { + "epoch": 0.04581933887219374, + "grad_norm": 7.707198143005371, + "learning_rate": 1.969887597126644e-05, + "loss": 2.916, + "step": 7290 + }, + { + "epoch": 0.04588219118889085, + "grad_norm": 9.013371467590332, + "learning_rate": 1.9698456870321787e-05, + "loss": 2.8672, + "step": 7300 + }, + { + "epoch": 0.045945043505587965, + "grad_norm": 6.506833076477051, + "learning_rate": 1.9698037769377134e-05, + "loss": 2.6158, + "step": 7310 + }, + { + "epoch": 0.04600789582228507, + "grad_norm": 7.845186233520508, + "learning_rate": 1.969761866843248e-05, + "loss": 2.6699, + "step": 7320 + }, + { + "epoch": 0.04607074813898219, + "grad_norm": 7.635332107543945, + "learning_rate": 1.9697199567487825e-05, + "loss": 2.9555, + "step": 7330 + }, + { + "epoch": 0.0461336004556793, + "grad_norm": 10.10507869720459, + "learning_rate": 1.9696780466543172e-05, + "loss": 2.4269, + "step": 7340 + }, + { + "epoch": 0.046196452772376405, + "grad_norm": 8.401609420776367, + "learning_rate": 1.969636136559852e-05, + "loss": 2.8419, + "step": 7350 + }, + { + "epoch": 0.04625930508907352, + "grad_norm": 8.491729736328125, + "learning_rate": 1.9695942264653866e-05, + "loss": 2.5032, + "step": 7360 + }, + { + "epoch": 0.04632215740577063, + "grad_norm": 7.182466983795166, + "learning_rate": 1.9695523163709213e-05, + "loss": 2.8185, + "step": 7370 + }, + { + "epoch": 0.046385009722467736, + "grad_norm": 8.073323249816895, + 
"learning_rate": 1.969510406276456e-05, + "loss": 2.8739, + "step": 7380 + }, + { + "epoch": 0.04644786203916485, + "grad_norm": 8.310892105102539, + "learning_rate": 1.9694684961819904e-05, + "loss": 2.8783, + "step": 7390 + }, + { + "epoch": 0.04651071435586196, + "grad_norm": 8.089179039001465, + "learning_rate": 1.969426586087525e-05, + "loss": 2.7613, + "step": 7400 + }, + { + "epoch": 0.046573566672559075, + "grad_norm": 9.599477767944336, + "learning_rate": 1.9693846759930598e-05, + "loss": 2.6879, + "step": 7410 + }, + { + "epoch": 0.04663641898925618, + "grad_norm": 8.713695526123047, + "learning_rate": 1.9693427658985945e-05, + "loss": 2.6981, + "step": 7420 + }, + { + "epoch": 0.04669927130595329, + "grad_norm": 7.749399662017822, + "learning_rate": 1.9693008558041292e-05, + "loss": 2.771, + "step": 7430 + }, + { + "epoch": 0.046762123622650406, + "grad_norm": 9.416159629821777, + "learning_rate": 1.969258945709664e-05, + "loss": 2.8693, + "step": 7440 + }, + { + "epoch": 0.046824975939347514, + "grad_norm": 6.918641567230225, + "learning_rate": 1.9692170356151986e-05, + "loss": 2.7029, + "step": 7450 + }, + { + "epoch": 0.04688782825604462, + "grad_norm": 6.363226890563965, + "learning_rate": 1.969175125520733e-05, + "loss": 2.9845, + "step": 7460 + }, + { + "epoch": 0.04695068057274174, + "grad_norm": 7.930229663848877, + "learning_rate": 1.9691332154262677e-05, + "loss": 2.5707, + "step": 7470 + }, + { + "epoch": 0.047013532889438846, + "grad_norm": 9.078071594238281, + "learning_rate": 1.9690913053318024e-05, + "loss": 2.7976, + "step": 7480 + }, + { + "epoch": 0.04707638520613596, + "grad_norm": 9.267046928405762, + "learning_rate": 1.969049395237337e-05, + "loss": 2.8006, + "step": 7490 + }, + { + "epoch": 0.04713923752283307, + "grad_norm": 7.660119533538818, + "learning_rate": 1.9690074851428718e-05, + "loss": 2.9633, + "step": 7500 + }, + { + "epoch": 0.04720208983953018, + "grad_norm": 7.17922830581665, + "learning_rate": 
1.9689655750484062e-05, + "loss": 2.645, + "step": 7510 + }, + { + "epoch": 0.04726494215622729, + "grad_norm": 9.018447875976562, + "learning_rate": 1.968923664953941e-05, + "loss": 2.8149, + "step": 7520 + }, + { + "epoch": 0.0473277944729244, + "grad_norm": 8.614439010620117, + "learning_rate": 1.9688817548594756e-05, + "loss": 2.7819, + "step": 7530 + }, + { + "epoch": 0.04739064678962151, + "grad_norm": 8.659832954406738, + "learning_rate": 1.9688398447650103e-05, + "loss": 2.8259, + "step": 7540 + }, + { + "epoch": 0.047453499106318624, + "grad_norm": 7.812225818634033, + "learning_rate": 1.9687979346705447e-05, + "loss": 2.7513, + "step": 7550 + }, + { + "epoch": 0.04751635142301573, + "grad_norm": 7.860219955444336, + "learning_rate": 1.9687560245760794e-05, + "loss": 2.7221, + "step": 7560 + }, + { + "epoch": 0.04757920373971284, + "grad_norm": 7.393378257751465, + "learning_rate": 1.968714114481614e-05, + "loss": 2.9303, + "step": 7570 + }, + { + "epoch": 0.047642056056409955, + "grad_norm": 11.162703514099121, + "learning_rate": 1.9686722043871488e-05, + "loss": 2.6346, + "step": 7580 + }, + { + "epoch": 0.04770490837310706, + "grad_norm": 6.968651294708252, + "learning_rate": 1.9686302942926835e-05, + "loss": 2.6713, + "step": 7590 + }, + { + "epoch": 0.04776776068980418, + "grad_norm": 7.753033638000488, + "learning_rate": 1.9685883841982182e-05, + "loss": 2.6281, + "step": 7600 + }, + { + "epoch": 0.04783061300650129, + "grad_norm": 9.735809326171875, + "learning_rate": 1.968546474103753e-05, + "loss": 2.6198, + "step": 7610 + }, + { + "epoch": 0.047893465323198395, + "grad_norm": 9.786466598510742, + "learning_rate": 1.9685045640092876e-05, + "loss": 2.7877, + "step": 7620 + }, + { + "epoch": 0.04795631763989551, + "grad_norm": 7.7668843269348145, + "learning_rate": 1.968462653914822e-05, + "loss": 2.732, + "step": 7630 + }, + { + "epoch": 0.04801916995659262, + "grad_norm": 7.759322166442871, + "learning_rate": 1.9684207438203567e-05, + "loss": 
2.5917, + "step": 7640 + }, + { + "epoch": 0.048082022273289726, + "grad_norm": 8.343338966369629, + "learning_rate": 1.9683788337258914e-05, + "loss": 2.7432, + "step": 7650 + }, + { + "epoch": 0.04814487458998684, + "grad_norm": 8.134322166442871, + "learning_rate": 1.968336923631426e-05, + "loss": 2.638, + "step": 7660 + }, + { + "epoch": 0.04820772690668395, + "grad_norm": 7.430497169494629, + "learning_rate": 1.9682950135369608e-05, + "loss": 2.5861, + "step": 7670 + }, + { + "epoch": 0.048270579223381065, + "grad_norm": 7.22769021987915, + "learning_rate": 1.9682531034424955e-05, + "loss": 2.7202, + "step": 7680 + }, + { + "epoch": 0.04833343154007817, + "grad_norm": 6.811624050140381, + "learning_rate": 1.96821119334803e-05, + "loss": 2.6732, + "step": 7690 + }, + { + "epoch": 0.04839628385677528, + "grad_norm": 6.873176097869873, + "learning_rate": 1.9681692832535646e-05, + "loss": 2.7445, + "step": 7700 + }, + { + "epoch": 0.048459136173472396, + "grad_norm": 7.949793338775635, + "learning_rate": 1.9681273731590993e-05, + "loss": 2.8871, + "step": 7710 + }, + { + "epoch": 0.048521988490169504, + "grad_norm": 10.053329467773438, + "learning_rate": 1.968085463064634e-05, + "loss": 2.6914, + "step": 7720 + }, + { + "epoch": 0.04858484080686661, + "grad_norm": 6.918039798736572, + "learning_rate": 1.9680435529701684e-05, + "loss": 2.6115, + "step": 7730 + }, + { + "epoch": 0.04864769312356373, + "grad_norm": 7.523407936096191, + "learning_rate": 1.968001642875703e-05, + "loss": 2.588, + "step": 7740 + }, + { + "epoch": 0.048710545440260836, + "grad_norm": 7.9065260887146, + "learning_rate": 1.9679597327812378e-05, + "loss": 2.8345, + "step": 7750 + }, + { + "epoch": 0.04877339775695795, + "grad_norm": 7.3336896896362305, + "learning_rate": 1.9679178226867725e-05, + "loss": 2.7611, + "step": 7760 + }, + { + "epoch": 0.04883625007365506, + "grad_norm": 8.261215209960938, + "learning_rate": 1.967875912592307e-05, + "loss": 2.8819, + "step": 7770 + }, + { + 
"epoch": 0.04889910239035217, + "grad_norm": 9.53640365600586, + "learning_rate": 1.9678340024978416e-05, + "loss": 2.8616, + "step": 7780 + }, + { + "epoch": 0.04896195470704928, + "grad_norm": 6.995221138000488, + "learning_rate": 1.9677920924033763e-05, + "loss": 2.5584, + "step": 7790 + }, + { + "epoch": 0.04902480702374639, + "grad_norm": 8.18701171875, + "learning_rate": 1.967750182308911e-05, + "loss": 2.7876, + "step": 7800 + }, + { + "epoch": 0.0490876593404435, + "grad_norm": 8.947796821594238, + "learning_rate": 1.9677082722144457e-05, + "loss": 2.6307, + "step": 7810 + }, + { + "epoch": 0.049150511657140614, + "grad_norm": 9.869112968444824, + "learning_rate": 1.9676663621199804e-05, + "loss": 2.4254, + "step": 7820 + }, + { + "epoch": 0.04921336397383772, + "grad_norm": 8.49588394165039, + "learning_rate": 1.967624452025515e-05, + "loss": 2.7566, + "step": 7830 + }, + { + "epoch": 0.04927621629053484, + "grad_norm": 8.207067489624023, + "learning_rate": 1.9675825419310498e-05, + "loss": 2.9028, + "step": 7840 + }, + { + "epoch": 0.049339068607231945, + "grad_norm": 7.481714248657227, + "learning_rate": 1.9675406318365845e-05, + "loss": 2.7137, + "step": 7850 + }, + { + "epoch": 0.04940192092392905, + "grad_norm": 9.47549819946289, + "learning_rate": 1.967498721742119e-05, + "loss": 2.5556, + "step": 7860 + }, + { + "epoch": 0.04946477324062617, + "grad_norm": 7.08083438873291, + "learning_rate": 1.9674568116476536e-05, + "loss": 2.774, + "step": 7870 + }, + { + "epoch": 0.04952762555732328, + "grad_norm": 10.015816688537598, + "learning_rate": 1.9674149015531883e-05, + "loss": 2.7492, + "step": 7880 + }, + { + "epoch": 0.049590477874020385, + "grad_norm": 8.47620964050293, + "learning_rate": 1.967372991458723e-05, + "loss": 2.5963, + "step": 7890 + }, + { + "epoch": 0.0496533301907175, + "grad_norm": 8.212635040283203, + "learning_rate": 1.9673310813642577e-05, + "loss": 2.5873, + "step": 7900 + }, + { + "epoch": 0.04971618250741461, + "grad_norm": 
6.361338138580322, + "learning_rate": 1.967289171269792e-05, + "loss": 2.6024, + "step": 7910 + }, + { + "epoch": 0.04977903482411172, + "grad_norm": 7.361743450164795, + "learning_rate": 1.9672472611753268e-05, + "loss": 2.4698, + "step": 7920 + }, + { + "epoch": 0.04984188714080883, + "grad_norm": 9.043204307556152, + "learning_rate": 1.9672053510808615e-05, + "loss": 2.8117, + "step": 7930 + }, + { + "epoch": 0.04990473945750594, + "grad_norm": 6.981082916259766, + "learning_rate": 1.9671634409863962e-05, + "loss": 2.5884, + "step": 7940 + }, + { + "epoch": 0.049967591774203055, + "grad_norm": 7.328317642211914, + "learning_rate": 1.9671215308919306e-05, + "loss": 2.7115, + "step": 7950 + }, + { + "epoch": 0.05003044409090016, + "grad_norm": 7.6271653175354, + "learning_rate": 1.9670796207974653e-05, + "loss": 2.7678, + "step": 7960 + }, + { + "epoch": 0.05009329640759727, + "grad_norm": 7.664426803588867, + "learning_rate": 1.967037710703e-05, + "loss": 2.5816, + "step": 7970 + }, + { + "epoch": 0.050156148724294386, + "grad_norm": 9.592137336730957, + "learning_rate": 1.9669958006085347e-05, + "loss": 2.7122, + "step": 7980 + }, + { + "epoch": 0.050219001040991494, + "grad_norm": 9.07362174987793, + "learning_rate": 1.9669538905140694e-05, + "loss": 2.9154, + "step": 7990 + }, + { + "epoch": 0.05028185335768861, + "grad_norm": 10.417045593261719, + "learning_rate": 1.966911980419604e-05, + "loss": 2.6955, + "step": 8000 + }, + { + "epoch": 0.05034470567438572, + "grad_norm": 7.155338287353516, + "learning_rate": 1.9668700703251385e-05, + "loss": 2.7882, + "step": 8010 + }, + { + "epoch": 0.050407557991082826, + "grad_norm": 7.589554309844971, + "learning_rate": 1.9668281602306732e-05, + "loss": 2.5784, + "step": 8020 + }, + { + "epoch": 0.05047041030777994, + "grad_norm": 8.70826244354248, + "learning_rate": 1.966786250136208e-05, + "loss": 2.8466, + "step": 8030 + }, + { + "epoch": 0.05053326262447705, + "grad_norm": 6.847433090209961, + "learning_rate": 
1.9667443400417426e-05, + "loss": 2.7549, + "step": 8040 + }, + { + "epoch": 0.05059611494117416, + "grad_norm": 7.40858268737793, + "learning_rate": 1.9667024299472773e-05, + "loss": 2.7873, + "step": 8050 + }, + { + "epoch": 0.05065896725787127, + "grad_norm": 8.859169006347656, + "learning_rate": 1.966660519852812e-05, + "loss": 2.6523, + "step": 8060 + }, + { + "epoch": 0.05072181957456838, + "grad_norm": 7.8665361404418945, + "learning_rate": 1.9666186097583467e-05, + "loss": 2.8225, + "step": 8070 + }, + { + "epoch": 0.050784671891265495, + "grad_norm": 8.269477844238281, + "learning_rate": 1.966576699663881e-05, + "loss": 2.4463, + "step": 8080 + }, + { + "epoch": 0.050847524207962604, + "grad_norm": 7.107751846313477, + "learning_rate": 1.9665347895694158e-05, + "loss": 2.6323, + "step": 8090 + }, + { + "epoch": 0.05091037652465971, + "grad_norm": 7.459254741668701, + "learning_rate": 1.9664928794749505e-05, + "loss": 2.6698, + "step": 8100 + }, + { + "epoch": 0.05097322884135683, + "grad_norm": 7.243574142456055, + "learning_rate": 1.9664509693804852e-05, + "loss": 2.7532, + "step": 8110 + }, + { + "epoch": 0.051036081158053935, + "grad_norm": 8.207475662231445, + "learning_rate": 1.96640905928602e-05, + "loss": 2.8283, + "step": 8120 + }, + { + "epoch": 0.05109893347475104, + "grad_norm": 7.7275872230529785, + "learning_rate": 1.9663671491915543e-05, + "loss": 2.5482, + "step": 8130 + }, + { + "epoch": 0.05116178579144816, + "grad_norm": 8.615901947021484, + "learning_rate": 1.966325239097089e-05, + "loss": 2.6718, + "step": 8140 + }, + { + "epoch": 0.051224638108145266, + "grad_norm": 7.622805595397949, + "learning_rate": 1.9662833290026237e-05, + "loss": 2.6635, + "step": 8150 + }, + { + "epoch": 0.05128749042484238, + "grad_norm": 8.606935501098633, + "learning_rate": 1.9662414189081584e-05, + "loss": 2.6948, + "step": 8160 + }, + { + "epoch": 0.05135034274153949, + "grad_norm": 7.750791549682617, + "learning_rate": 1.9661995088136928e-05, + "loss": 
2.8639, + "step": 8170 + }, + { + "epoch": 0.0514131950582366, + "grad_norm": 7.692723274230957, + "learning_rate": 1.9661617897286743e-05, + "loss": 2.6692, + "step": 8180 + }, + { + "epoch": 0.05147604737493371, + "grad_norm": 8.044435501098633, + "learning_rate": 1.966119879634209e-05, + "loss": 2.6322, + "step": 8190 + }, + { + "epoch": 0.05153889969163082, + "grad_norm": 6.482088088989258, + "learning_rate": 1.9660779695397437e-05, + "loss": 2.6293, + "step": 8200 + }, + { + "epoch": 0.05160175200832793, + "grad_norm": 8.842883110046387, + "learning_rate": 1.966036059445278e-05, + "loss": 2.6414, + "step": 8210 + }, + { + "epoch": 0.051664604325025044, + "grad_norm": 7.571231365203857, + "learning_rate": 1.9659941493508128e-05, + "loss": 2.7317, + "step": 8220 + }, + { + "epoch": 0.05172745664172215, + "grad_norm": 7.618865966796875, + "learning_rate": 1.9659522392563475e-05, + "loss": 2.7609, + "step": 8230 + }, + { + "epoch": 0.05179030895841926, + "grad_norm": 8.84247875213623, + "learning_rate": 1.965910329161882e-05, + "loss": 2.8383, + "step": 8240 + }, + { + "epoch": 0.051853161275116376, + "grad_norm": 9.494882583618164, + "learning_rate": 1.9658684190674165e-05, + "loss": 2.7591, + "step": 8250 + }, + { + "epoch": 0.051916013591813484, + "grad_norm": 8.066009521484375, + "learning_rate": 1.9658265089729512e-05, + "loss": 2.6235, + "step": 8260 + }, + { + "epoch": 0.0519788659085106, + "grad_norm": 7.616729736328125, + "learning_rate": 1.965784598878486e-05, + "loss": 2.4859, + "step": 8270 + }, + { + "epoch": 0.05204171822520771, + "grad_norm": 7.720513343811035, + "learning_rate": 1.9657426887840207e-05, + "loss": 2.8178, + "step": 8280 + }, + { + "epoch": 0.052104570541904816, + "grad_norm": 7.933021068572998, + "learning_rate": 1.9657007786895554e-05, + "loss": 2.5814, + "step": 8290 + }, + { + "epoch": 0.05216742285860193, + "grad_norm": 8.000345230102539, + "learning_rate": 1.96565886859509e-05, + "loss": 2.6746, + "step": 8300 + }, + { + 
"epoch": 0.05223027517529904, + "grad_norm": 6.778539180755615, + "learning_rate": 1.9656169585006248e-05, + "loss": 2.5875, + "step": 8310 + }, + { + "epoch": 0.05229312749199615, + "grad_norm": 8.313685417175293, + "learning_rate": 1.965575048406159e-05, + "loss": 2.4838, + "step": 8320 + }, + { + "epoch": 0.05235597980869326, + "grad_norm": 8.954083442687988, + "learning_rate": 1.965533138311694e-05, + "loss": 3.0361, + "step": 8330 + }, + { + "epoch": 0.05241883212539037, + "grad_norm": 10.441940307617188, + "learning_rate": 1.9654912282172286e-05, + "loss": 2.766, + "step": 8340 + }, + { + "epoch": 0.052481684442087485, + "grad_norm": 7.779235363006592, + "learning_rate": 1.9654493181227633e-05, + "loss": 2.4505, + "step": 8350 + }, + { + "epoch": 0.052544536758784594, + "grad_norm": 7.268058776855469, + "learning_rate": 1.965407408028298e-05, + "loss": 2.7261, + "step": 8360 + }, + { + "epoch": 0.0526073890754817, + "grad_norm": 6.707265377044678, + "learning_rate": 1.9653654979338327e-05, + "loss": 2.7951, + "step": 8370 + }, + { + "epoch": 0.05267024139217882, + "grad_norm": 8.21595573425293, + "learning_rate": 1.965323587839367e-05, + "loss": 2.5005, + "step": 8380 + }, + { + "epoch": 0.052733093708875925, + "grad_norm": 7.598992824554443, + "learning_rate": 1.9652816777449018e-05, + "loss": 2.8568, + "step": 8390 + }, + { + "epoch": 0.05279594602557303, + "grad_norm": 8.286802291870117, + "learning_rate": 1.9652397676504365e-05, + "loss": 2.7548, + "step": 8400 + }, + { + "epoch": 0.05285879834227015, + "grad_norm": 8.704188346862793, + "learning_rate": 1.965197857555971e-05, + "loss": 2.6038, + "step": 8410 + }, + { + "epoch": 0.052921650658967256, + "grad_norm": 8.282207489013672, + "learning_rate": 1.965155947461506e-05, + "loss": 2.7913, + "step": 8420 + }, + { + "epoch": 0.05298450297566437, + "grad_norm": 8.401204109191895, + "learning_rate": 1.9651140373670402e-05, + "loss": 3.0444, + "step": 8430 + }, + { + "epoch": 0.05304735529236148, + 
"grad_norm": 8.609389305114746, + "learning_rate": 1.965072127272575e-05, + "loss": 2.7141, + "step": 8440 + }, + { + "epoch": 0.05311020760905859, + "grad_norm": 7.735413074493408, + "learning_rate": 1.9650302171781097e-05, + "loss": 2.7262, + "step": 8450 + }, + { + "epoch": 0.0531730599257557, + "grad_norm": 18.663652420043945, + "learning_rate": 1.9649883070836444e-05, + "loss": 2.6677, + "step": 8460 + }, + { + "epoch": 0.05323591224245281, + "grad_norm": 8.87226676940918, + "learning_rate": 1.9649463969891787e-05, + "loss": 2.4734, + "step": 8470 + }, + { + "epoch": 0.05329876455914992, + "grad_norm": 7.327389717102051, + "learning_rate": 1.9649044868947134e-05, + "loss": 2.765, + "step": 8480 + }, + { + "epoch": 0.053361616875847034, + "grad_norm": 8.151284217834473, + "learning_rate": 1.964862576800248e-05, + "loss": 2.6378, + "step": 8490 + }, + { + "epoch": 0.05342446919254414, + "grad_norm": 7.761752128601074, + "learning_rate": 1.964820666705783e-05, + "loss": 2.7119, + "step": 8500 + }, + { + "epoch": 0.05348732150924126, + "grad_norm": 7.645976543426514, + "learning_rate": 1.9647787566113176e-05, + "loss": 2.7731, + "step": 8510 + }, + { + "epoch": 0.053550173825938366, + "grad_norm": 6.9228434562683105, + "learning_rate": 1.9647410375262987e-05, + "loss": 2.8945, + "step": 8520 + }, + { + "epoch": 0.053613026142635474, + "grad_norm": 6.676364421844482, + "learning_rate": 1.9646991274318334e-05, + "loss": 2.7391, + "step": 8530 + }, + { + "epoch": 0.05367587845933259, + "grad_norm": 7.566768169403076, + "learning_rate": 1.964657217337368e-05, + "loss": 2.7572, + "step": 8540 + }, + { + "epoch": 0.0537387307760297, + "grad_norm": 7.791513919830322, + "learning_rate": 1.9646153072429025e-05, + "loss": 2.655, + "step": 8550 + }, + { + "epoch": 0.053801583092726805, + "grad_norm": 7.675039291381836, + "learning_rate": 1.9645733971484372e-05, + "loss": 2.7216, + "step": 8560 + }, + { + "epoch": 0.05386443540942392, + "grad_norm": 7.082376480102539, + 
"learning_rate": 1.964531487053972e-05, + "loss": 2.4595, + "step": 8570 + }, + { + "epoch": 0.05392728772612103, + "grad_norm": 9.083993911743164, + "learning_rate": 1.9644895769595066e-05, + "loss": 2.5619, + "step": 8580 + }, + { + "epoch": 0.053990140042818144, + "grad_norm": 7.382872104644775, + "learning_rate": 1.9644476668650413e-05, + "loss": 2.6891, + "step": 8590 + }, + { + "epoch": 0.05405299235951525, + "grad_norm": 7.136111736297607, + "learning_rate": 1.964405756770576e-05, + "loss": 2.6099, + "step": 8600 + }, + { + "epoch": 0.05411584467621236, + "grad_norm": 7.041515350341797, + "learning_rate": 1.9643638466761107e-05, + "loss": 2.5007, + "step": 8610 + }, + { + "epoch": 0.054178696992909475, + "grad_norm": 7.351996898651123, + "learning_rate": 1.9643219365816454e-05, + "loss": 2.7503, + "step": 8620 + }, + { + "epoch": 0.05424154930960658, + "grad_norm": 6.633028984069824, + "learning_rate": 1.96428002648718e-05, + "loss": 2.7683, + "step": 8630 + }, + { + "epoch": 0.05430440162630369, + "grad_norm": 7.365866661071777, + "learning_rate": 1.9642381163927145e-05, + "loss": 2.6287, + "step": 8640 + }, + { + "epoch": 0.05436725394300081, + "grad_norm": 7.213832855224609, + "learning_rate": 1.9641962062982492e-05, + "loss": 2.5954, + "step": 8650 + }, + { + "epoch": 0.054430106259697915, + "grad_norm": 6.928434371948242, + "learning_rate": 1.964154296203784e-05, + "loss": 2.4292, + "step": 8660 + }, + { + "epoch": 0.05449295857639503, + "grad_norm": 8.020259857177734, + "learning_rate": 1.9641123861093186e-05, + "loss": 2.8059, + "step": 8670 + }, + { + "epoch": 0.05455581089309214, + "grad_norm": 7.945987701416016, + "learning_rate": 1.964070476014853e-05, + "loss": 2.7137, + "step": 8680 + }, + { + "epoch": 0.054618663209789246, + "grad_norm": 8.240311622619629, + "learning_rate": 1.9640285659203877e-05, + "loss": 2.5354, + "step": 8690 + }, + { + "epoch": 0.05468151552648636, + "grad_norm": 10.10504150390625, + "learning_rate": 
1.9639866558259224e-05, + "loss": 2.6713, + "step": 8700 + }, + { + "epoch": 0.05474436784318347, + "grad_norm": 7.855795383453369, + "learning_rate": 1.963944745731457e-05, + "loss": 2.6301, + "step": 8710 + }, + { + "epoch": 0.05480722015988058, + "grad_norm": 7.5793867111206055, + "learning_rate": 1.9639028356369915e-05, + "loss": 2.7625, + "step": 8720 + }, + { + "epoch": 0.05487007247657769, + "grad_norm": 7.741368293762207, + "learning_rate": 1.9638609255425262e-05, + "loss": 2.5894, + "step": 8730 + }, + { + "epoch": 0.0549329247932748, + "grad_norm": 6.912710189819336, + "learning_rate": 1.963819015448061e-05, + "loss": 2.7917, + "step": 8740 + }, + { + "epoch": 0.054995777109971916, + "grad_norm": 7.872913837432861, + "learning_rate": 1.9637771053535956e-05, + "loss": 2.6818, + "step": 8750 + }, + { + "epoch": 0.055058629426669024, + "grad_norm": 7.7948408126831055, + "learning_rate": 1.9637351952591303e-05, + "loss": 2.4908, + "step": 8760 + }, + { + "epoch": 0.05512148174336613, + "grad_norm": 9.372628211975098, + "learning_rate": 1.9636932851646647e-05, + "loss": 2.6611, + "step": 8770 + }, + { + "epoch": 0.05518433406006325, + "grad_norm": 8.036674499511719, + "learning_rate": 1.9636513750701994e-05, + "loss": 2.751, + "step": 8780 + }, + { + "epoch": 0.055247186376760356, + "grad_norm": 8.101851463317871, + "learning_rate": 1.963609464975734e-05, + "loss": 2.7709, + "step": 8790 + }, + { + "epoch": 0.055310038693457464, + "grad_norm": 8.577589988708496, + "learning_rate": 1.9635675548812688e-05, + "loss": 2.4593, + "step": 8800 + }, + { + "epoch": 0.05537289101015458, + "grad_norm": 7.897840976715088, + "learning_rate": 1.9635256447868035e-05, + "loss": 2.8371, + "step": 8810 + }, + { + "epoch": 0.05543574332685169, + "grad_norm": 8.486788749694824, + "learning_rate": 1.9634837346923382e-05, + "loss": 2.7736, + "step": 8820 + }, + { + "epoch": 0.0554985956435488, + "grad_norm": 7.370955467224121, + "learning_rate": 1.963441824597873e-05, + "loss": 
2.7316, + "step": 8830 + }, + { + "epoch": 0.05556144796024591, + "grad_norm": 7.446674346923828, + "learning_rate": 1.9633999145034076e-05, + "loss": 2.719, + "step": 8840 + }, + { + "epoch": 0.05562430027694302, + "grad_norm": 7.9408135414123535, + "learning_rate": 1.9633580044089423e-05, + "loss": 2.7738, + "step": 8850 + }, + { + "epoch": 0.055687152593640134, + "grad_norm": 7.8129496574401855, + "learning_rate": 1.9633160943144767e-05, + "loss": 2.5191, + "step": 8860 + }, + { + "epoch": 0.05575000491033724, + "grad_norm": 7.682240009307861, + "learning_rate": 1.9632741842200114e-05, + "loss": 2.6405, + "step": 8870 + }, + { + "epoch": 0.05581285722703435, + "grad_norm": 7.207090854644775, + "learning_rate": 1.963232274125546e-05, + "loss": 2.5648, + "step": 8880 + }, + { + "epoch": 0.055875709543731465, + "grad_norm": 8.125288009643555, + "learning_rate": 1.9631903640310808e-05, + "loss": 2.6379, + "step": 8890 + }, + { + "epoch": 0.05593856186042857, + "grad_norm": 8.328934669494629, + "learning_rate": 1.9631484539366152e-05, + "loss": 2.8457, + "step": 8900 + }, + { + "epoch": 0.05600141417712568, + "grad_norm": 7.958409786224365, + "learning_rate": 1.96310654384215e-05, + "loss": 2.8154, + "step": 8910 + }, + { + "epoch": 0.0560642664938228, + "grad_norm": 8.580458641052246, + "learning_rate": 1.9630646337476846e-05, + "loss": 2.9781, + "step": 8920 + }, + { + "epoch": 0.056127118810519905, + "grad_norm": 8.746109008789062, + "learning_rate": 1.9630227236532193e-05, + "loss": 2.6343, + "step": 8930 + }, + { + "epoch": 0.05618997112721702, + "grad_norm": 7.417159557342529, + "learning_rate": 1.962980813558754e-05, + "loss": 2.5163, + "step": 8940 + }, + { + "epoch": 0.05625282344391413, + "grad_norm": 6.750892639160156, + "learning_rate": 1.9629389034642884e-05, + "loss": 2.5631, + "step": 8950 + }, + { + "epoch": 0.056315675760611236, + "grad_norm": 7.203453063964844, + "learning_rate": 1.962896993369823e-05, + "loss": 2.7475, + "step": 8960 + }, + { + 
"epoch": 0.05637852807730835, + "grad_norm": 8.132242202758789, + "learning_rate": 1.9628550832753578e-05, + "loss": 2.6143, + "step": 8970 + }, + { + "epoch": 0.05644138039400546, + "grad_norm": 8.89110279083252, + "learning_rate": 1.9628131731808925e-05, + "loss": 2.7637, + "step": 8980 + }, + { + "epoch": 0.05650423271070257, + "grad_norm": 8.574644088745117, + "learning_rate": 1.9627712630864272e-05, + "loss": 2.6649, + "step": 8990 + }, + { + "epoch": 0.05656708502739968, + "grad_norm": 7.429235935211182, + "learning_rate": 1.962729352991962e-05, + "loss": 2.4796, + "step": 9000 + }, + { + "epoch": 0.05662993734409679, + "grad_norm": 7.8866400718688965, + "learning_rate": 1.9626874428974966e-05, + "loss": 2.7222, + "step": 9010 + }, + { + "epoch": 0.056692789660793906, + "grad_norm": 7.169637680053711, + "learning_rate": 1.962645532803031e-05, + "loss": 2.6579, + "step": 9020 + }, + { + "epoch": 0.056755641977491014, + "grad_norm": 8.332289695739746, + "learning_rate": 1.9626036227085657e-05, + "loss": 2.6769, + "step": 9030 + }, + { + "epoch": 0.05681849429418812, + "grad_norm": 6.445282936096191, + "learning_rate": 1.9625617126141004e-05, + "loss": 2.5097, + "step": 9040 + }, + { + "epoch": 0.05688134661088524, + "grad_norm": 8.111881256103516, + "learning_rate": 1.962519802519635e-05, + "loss": 2.6252, + "step": 9050 + }, + { + "epoch": 0.056944198927582346, + "grad_norm": 6.78681755065918, + "learning_rate": 1.9624778924251698e-05, + "loss": 2.4517, + "step": 9060 + }, + { + "epoch": 0.057007051244279454, + "grad_norm": 7.44216251373291, + "learning_rate": 1.9624359823307045e-05, + "loss": 2.8127, + "step": 9070 + }, + { + "epoch": 0.05706990356097657, + "grad_norm": 8.825942039489746, + "learning_rate": 1.962394072236239e-05, + "loss": 2.8195, + "step": 9080 + }, + { + "epoch": 0.05713275587767368, + "grad_norm": 6.7636566162109375, + "learning_rate": 1.9623521621417736e-05, + "loss": 2.5583, + "step": 9090 + }, + { + "epoch": 0.05719560819437079, + 
"grad_norm": 7.559068202972412, + "learning_rate": 1.9623102520473083e-05, + "loss": 2.6505, + "step": 9100 + }, + { + "epoch": 0.0572584605110679, + "grad_norm": 8.663436889648438, + "learning_rate": 1.962268341952843e-05, + "loss": 2.6025, + "step": 9110 + }, + { + "epoch": 0.05732131282776501, + "grad_norm": 7.314866065979004, + "learning_rate": 1.9622264318583774e-05, + "loss": 2.9561, + "step": 9120 + }, + { + "epoch": 0.057384165144462124, + "grad_norm": 7.5395612716674805, + "learning_rate": 1.962184521763912e-05, + "loss": 2.4377, + "step": 9130 + }, + { + "epoch": 0.05744701746115923, + "grad_norm": 9.004176139831543, + "learning_rate": 1.9621426116694468e-05, + "loss": 2.4806, + "step": 9140 + }, + { + "epoch": 0.05750986977785634, + "grad_norm": 17.26418113708496, + "learning_rate": 1.9621007015749815e-05, + "loss": 2.5967, + "step": 9150 + }, + { + "epoch": 0.057572722094553455, + "grad_norm": 7.469278812408447, + "learning_rate": 1.9620587914805162e-05, + "loss": 2.8039, + "step": 9160 + }, + { + "epoch": 0.05763557441125056, + "grad_norm": 6.7289628982543945, + "learning_rate": 1.9620168813860506e-05, + "loss": 2.5404, + "step": 9170 + }, + { + "epoch": 0.05769842672794768, + "grad_norm": 8.37357234954834, + "learning_rate": 1.9619749712915853e-05, + "loss": 2.5832, + "step": 9180 + }, + { + "epoch": 0.05776127904464479, + "grad_norm": 9.232333183288574, + "learning_rate": 1.96193306119712e-05, + "loss": 2.3753, + "step": 9190 + }, + { + "epoch": 0.057824131361341895, + "grad_norm": 9.12719440460205, + "learning_rate": 1.9618911511026547e-05, + "loss": 2.7676, + "step": 9200 + }, + { + "epoch": 0.05788698367803901, + "grad_norm": 6.722346305847168, + "learning_rate": 1.9618492410081894e-05, + "loss": 2.6162, + "step": 9210 + }, + { + "epoch": 0.05794983599473612, + "grad_norm": 8.384921073913574, + "learning_rate": 1.961807330913724e-05, + "loss": 2.6299, + "step": 9220 + }, + { + "epoch": 0.058012688311433226, + "grad_norm": 7.978445529937744, + 
"learning_rate": 1.961765420819259e-05, + "loss": 2.6206, + "step": 9230 + }, + { + "epoch": 0.05807554062813034, + "grad_norm": 7.925615310668945, + "learning_rate": 1.9617235107247935e-05, + "loss": 2.758, + "step": 9240 + }, + { + "epoch": 0.05813839294482745, + "grad_norm": 7.987235069274902, + "learning_rate": 1.9616816006303282e-05, + "loss": 2.4806, + "step": 9250 + }, + { + "epoch": 0.058201245261524565, + "grad_norm": 9.439668655395508, + "learning_rate": 1.9616396905358626e-05, + "loss": 2.55, + "step": 9260 + }, + { + "epoch": 0.05826409757822167, + "grad_norm": 8.0459566116333, + "learning_rate": 1.9615977804413973e-05, + "loss": 2.8098, + "step": 9270 + }, + { + "epoch": 0.05832694989491878, + "grad_norm": 7.324197769165039, + "learning_rate": 1.961555870346932e-05, + "loss": 2.5795, + "step": 9280 + }, + { + "epoch": 0.058389802211615896, + "grad_norm": 7.621894359588623, + "learning_rate": 1.9615139602524667e-05, + "loss": 2.6315, + "step": 9290 + }, + { + "epoch": 0.058452654528313004, + "grad_norm": 6.299142360687256, + "learning_rate": 1.961472050158001e-05, + "loss": 2.6556, + "step": 9300 + }, + { + "epoch": 0.05851550684501011, + "grad_norm": 7.16338586807251, + "learning_rate": 1.9614301400635358e-05, + "loss": 2.5728, + "step": 9310 + }, + { + "epoch": 0.05857835916170723, + "grad_norm": 7.851929187774658, + "learning_rate": 1.9613882299690705e-05, + "loss": 2.4467, + "step": 9320 + }, + { + "epoch": 0.058641211478404336, + "grad_norm": 9.934613227844238, + "learning_rate": 1.9613463198746052e-05, + "loss": 2.7476, + "step": 9330 + }, + { + "epoch": 0.05870406379510145, + "grad_norm": 8.11226749420166, + "learning_rate": 1.9613044097801396e-05, + "loss": 2.5031, + "step": 9340 + }, + { + "epoch": 0.05876691611179856, + "grad_norm": 8.113167762756348, + "learning_rate": 1.9612624996856743e-05, + "loss": 2.5387, + "step": 9350 + }, + { + "epoch": 0.05882976842849567, + "grad_norm": 44.10908126831055, + "learning_rate": 1.961220589591209e-05, + 
"loss": 2.4559, + "step": 9360 + }, + { + "epoch": 0.05889262074519278, + "grad_norm": 7.4416890144348145, + "learning_rate": 1.9611786794967437e-05, + "loss": 2.4803, + "step": 9370 + }, + { + "epoch": 0.05895547306188989, + "grad_norm": 9.109253883361816, + "learning_rate": 1.9611367694022784e-05, + "loss": 2.7369, + "step": 9380 + }, + { + "epoch": 0.059018325378587, + "grad_norm": 7.512287616729736, + "learning_rate": 1.9610948593078128e-05, + "loss": 2.5443, + "step": 9390 + }, + { + "epoch": 0.059081177695284114, + "grad_norm": 8.12739372253418, + "learning_rate": 1.9610529492133475e-05, + "loss": 2.6543, + "step": 9400 + }, + { + "epoch": 0.05914403001198122, + "grad_norm": 7.4665398597717285, + "learning_rate": 1.9610110391188822e-05, + "loss": 2.6158, + "step": 9410 + }, + { + "epoch": 0.05920688232867834, + "grad_norm": 8.125940322875977, + "learning_rate": 1.960969129024417e-05, + "loss": 2.7101, + "step": 9420 + }, + { + "epoch": 0.059269734645375445, + "grad_norm": 7.124801158905029, + "learning_rate": 1.9609272189299516e-05, + "loss": 2.7561, + "step": 9430 + }, + { + "epoch": 0.05933258696207255, + "grad_norm": 7.844360828399658, + "learning_rate": 1.9608853088354863e-05, + "loss": 2.7748, + "step": 9440 + }, + { + "epoch": 0.05939543927876967, + "grad_norm": 9.075909614562988, + "learning_rate": 1.960843398741021e-05, + "loss": 2.6617, + "step": 9450 + }, + { + "epoch": 0.059458291595466777, + "grad_norm": 8.1566801071167, + "learning_rate": 1.9608014886465557e-05, + "loss": 2.6068, + "step": 9460 + }, + { + "epoch": 0.059521143912163885, + "grad_norm": 5.545306205749512, + "learning_rate": 1.9607595785520904e-05, + "loss": 2.6708, + "step": 9470 + }, + { + "epoch": 0.059583996228861, + "grad_norm": 8.167916297912598, + "learning_rate": 1.9607176684576248e-05, + "loss": 2.518, + "step": 9480 + }, + { + "epoch": 0.05964684854555811, + "grad_norm": 6.242193222045898, + "learning_rate": 1.9606757583631595e-05, + "loss": 2.3471, + "step": 9490 + }, + { 
+ "epoch": 0.05970970086225522, + "grad_norm": 8.109650611877441, + "learning_rate": 1.9606338482686942e-05, + "loss": 2.4626, + "step": 9500 + }, + { + "epoch": 0.05977255317895233, + "grad_norm": 7.323617458343506, + "learning_rate": 1.960591938174229e-05, + "loss": 2.4797, + "step": 9510 + }, + { + "epoch": 0.05983540549564944, + "grad_norm": 7.509581089019775, + "learning_rate": 1.9605500280797633e-05, + "loss": 2.5529, + "step": 9520 + }, + { + "epoch": 0.059898257812346554, + "grad_norm": 8.43581771850586, + "learning_rate": 1.960508117985298e-05, + "loss": 2.3881, + "step": 9530 + }, + { + "epoch": 0.05996111012904366, + "grad_norm": 6.916092395782471, + "learning_rate": 1.9604662078908327e-05, + "loss": 2.6903, + "step": 9540 + }, + { + "epoch": 0.06002396244574077, + "grad_norm": 8.527880668640137, + "learning_rate": 1.9604242977963674e-05, + "loss": 2.8299, + "step": 9550 + }, + { + "epoch": 0.060086814762437886, + "grad_norm": 6.470539093017578, + "learning_rate": 1.960382387701902e-05, + "loss": 2.5176, + "step": 9560 + }, + { + "epoch": 0.060149667079134994, + "grad_norm": 7.453949451446533, + "learning_rate": 1.9603404776074365e-05, + "loss": 2.4548, + "step": 9570 + }, + { + "epoch": 0.06021251939583211, + "grad_norm": 7.623648643493652, + "learning_rate": 1.9602985675129712e-05, + "loss": 2.5387, + "step": 9580 + }, + { + "epoch": 0.06027537171252922, + "grad_norm": 8.512686729431152, + "learning_rate": 1.960256657418506e-05, + "loss": 2.602, + "step": 9590 + }, + { + "epoch": 0.060338224029226326, + "grad_norm": 7.6814751625061035, + "learning_rate": 1.9602147473240406e-05, + "loss": 2.6069, + "step": 9600 + }, + { + "epoch": 0.06040107634592344, + "grad_norm": 7.552587509155273, + "learning_rate": 1.9601728372295753e-05, + "loss": 2.5909, + "step": 9610 + }, + { + "epoch": 0.06046392866262055, + "grad_norm": 7.380543231964111, + "learning_rate": 1.96013092713511e-05, + "loss": 2.5966, + "step": 9620 + }, + { + "epoch": 0.06052678097931766, + 
"grad_norm": 8.672584533691406, + "learning_rate": 1.9600890170406447e-05, + "loss": 2.503, + "step": 9630 + }, + { + "epoch": 0.06058963329601477, + "grad_norm": 9.565302848815918, + "learning_rate": 1.960047106946179e-05, + "loss": 2.6518, + "step": 9640 + }, + { + "epoch": 0.06065248561271188, + "grad_norm": 6.482096195220947, + "learning_rate": 1.9600051968517138e-05, + "loss": 2.6328, + "step": 9650 + }, + { + "epoch": 0.06071533792940899, + "grad_norm": 11.39160442352295, + "learning_rate": 1.9599632867572485e-05, + "loss": 2.7147, + "step": 9660 + }, + { + "epoch": 0.060778190246106104, + "grad_norm": 7.652275085449219, + "learning_rate": 1.9599213766627832e-05, + "loss": 2.3302, + "step": 9670 + }, + { + "epoch": 0.06084104256280321, + "grad_norm": 14.219861030578613, + "learning_rate": 1.959879466568318e-05, + "loss": 2.614, + "step": 9680 + }, + { + "epoch": 0.06090389487950033, + "grad_norm": 7.919884204864502, + "learning_rate": 1.9598375564738526e-05, + "loss": 2.5997, + "step": 9690 + }, + { + "epoch": 0.060966747196197435, + "grad_norm": 7.179359436035156, + "learning_rate": 1.959795646379387e-05, + "loss": 2.4926, + "step": 9700 + }, + { + "epoch": 0.06102959951289454, + "grad_norm": 8.82780647277832, + "learning_rate": 1.9597537362849217e-05, + "loss": 2.7258, + "step": 9710 + }, + { + "epoch": 0.06109245182959166, + "grad_norm": 7.858294486999512, + "learning_rate": 1.9597118261904564e-05, + "loss": 2.5357, + "step": 9720 + }, + { + "epoch": 0.061155304146288766, + "grad_norm": 7.4505109786987305, + "learning_rate": 1.959669916095991e-05, + "loss": 2.5321, + "step": 9730 + }, + { + "epoch": 0.061218156462985875, + "grad_norm": 7.89233922958374, + "learning_rate": 1.9596280060015255e-05, + "loss": 2.4345, + "step": 9740 + }, + { + "epoch": 0.06128100877968299, + "grad_norm": 8.051054000854492, + "learning_rate": 1.9595860959070602e-05, + "loss": 2.5858, + "step": 9750 + }, + { + "epoch": 0.0613438610963801, + "grad_norm": 7.296353816986084, + 
"learning_rate": 1.959544185812595e-05, + "loss": 2.4151, + "step": 9760 + }, + { + "epoch": 0.06140671341307721, + "grad_norm": 8.718103408813477, + "learning_rate": 1.9595022757181296e-05, + "loss": 2.8083, + "step": 9770 + }, + { + "epoch": 0.06146956572977432, + "grad_norm": 7.9979472160339355, + "learning_rate": 1.9594603656236643e-05, + "loss": 2.6403, + "step": 9780 + }, + { + "epoch": 0.06153241804647143, + "grad_norm": 6.973628997802734, + "learning_rate": 1.9594184555291987e-05, + "loss": 2.498, + "step": 9790 + }, + { + "epoch": 0.061595270363168544, + "grad_norm": 8.49692440032959, + "learning_rate": 1.9593765454347334e-05, + "loss": 2.5516, + "step": 9800 + }, + { + "epoch": 0.06165812267986565, + "grad_norm": 7.654516220092773, + "learning_rate": 1.959334635340268e-05, + "loss": 2.5383, + "step": 9810 + }, + { + "epoch": 0.06172097499656276, + "grad_norm": 7.838619709014893, + "learning_rate": 1.9592927252458028e-05, + "loss": 2.5105, + "step": 9820 + }, + { + "epoch": 0.061783827313259876, + "grad_norm": 7.077419757843018, + "learning_rate": 1.9592508151513375e-05, + "loss": 2.5268, + "step": 9830 + }, + { + "epoch": 0.061846679629956984, + "grad_norm": 8.740164756774902, + "learning_rate": 1.9592089050568722e-05, + "loss": 2.5884, + "step": 9840 + }, + { + "epoch": 0.0619095319466541, + "grad_norm": 8.284232139587402, + "learning_rate": 1.959166994962407e-05, + "loss": 2.5533, + "step": 9850 + }, + { + "epoch": 0.06197238426335121, + "grad_norm": 7.148123264312744, + "learning_rate": 1.9591250848679417e-05, + "loss": 2.5367, + "step": 9860 + }, + { + "epoch": 0.062035236580048315, + "grad_norm": 7.701465129852295, + "learning_rate": 1.9590831747734764e-05, + "loss": 2.6559, + "step": 9870 + }, + { + "epoch": 0.06209808889674543, + "grad_norm": 6.957675457000732, + "learning_rate": 1.9590412646790107e-05, + "loss": 2.6157, + "step": 9880 + }, + { + "epoch": 0.06216094121344254, + "grad_norm": 8.172834396362305, + "learning_rate": 
1.9589993545845454e-05, + "loss": 2.4962, + "step": 9890 + }, + { + "epoch": 0.06222379353013965, + "grad_norm": 7.706208229064941, + "learning_rate": 1.95895744449008e-05, + "loss": 2.4572, + "step": 9900 + }, + { + "epoch": 0.06228664584683676, + "grad_norm": 11.208701133728027, + "learning_rate": 1.958915534395615e-05, + "loss": 2.4993, + "step": 9910 + }, + { + "epoch": 0.06234949816353387, + "grad_norm": 7.338189125061035, + "learning_rate": 1.9588736243011492e-05, + "loss": 2.4891, + "step": 9920 + }, + { + "epoch": 0.062412350480230985, + "grad_norm": 8.864147186279297, + "learning_rate": 1.958831714206684e-05, + "loss": 2.4935, + "step": 9930 + }, + { + "epoch": 0.062475202796928093, + "grad_norm": 8.186270713806152, + "learning_rate": 1.9587898041122186e-05, + "loss": 2.6259, + "step": 9940 + }, + { + "epoch": 0.0625380551136252, + "grad_norm": 7.757992267608643, + "learning_rate": 1.9587478940177533e-05, + "loss": 2.6162, + "step": 9950 + }, + { + "epoch": 0.06260090743032232, + "grad_norm": 7.206307411193848, + "learning_rate": 1.9587059839232877e-05, + "loss": 2.6705, + "step": 9960 + }, + { + "epoch": 0.06266375974701943, + "grad_norm": 7.583564758300781, + "learning_rate": 1.9586640738288224e-05, + "loss": 2.5045, + "step": 9970 + }, + { + "epoch": 0.06272661206371653, + "grad_norm": 7.870649814605713, + "learning_rate": 1.958622163734357e-05, + "loss": 2.3944, + "step": 9980 + }, + { + "epoch": 0.06278946438041365, + "grad_norm": 6.977264881134033, + "learning_rate": 1.9585802536398918e-05, + "loss": 2.4585, + "step": 9990 + }, + { + "epoch": 0.06285231669711076, + "grad_norm": 7.438137054443359, + "learning_rate": 1.9585383435454265e-05, + "loss": 2.6557, + "step": 10000 + }, + { + "epoch": 0.06291516901380786, + "grad_norm": 8.076875686645508, + "learning_rate": 1.9584964334509612e-05, + "loss": 2.5705, + "step": 10010 + }, + { + "epoch": 0.06297802133050498, + "grad_norm": 7.514438629150391, + "learning_rate": 1.9584545233564956e-05, + "loss": 
2.5158, + "step": 10020 + }, + { + "epoch": 0.0630408736472021, + "grad_norm": 7.07708740234375, + "learning_rate": 1.9584126132620303e-05, + "loss": 2.5866, + "step": 10030 + }, + { + "epoch": 0.0631037259638992, + "grad_norm": 8.048712730407715, + "learning_rate": 1.958370703167565e-05, + "loss": 2.6047, + "step": 10040 + }, + { + "epoch": 0.06316657828059631, + "grad_norm": 7.391304016113281, + "learning_rate": 1.9583287930730997e-05, + "loss": 2.3468, + "step": 10050 + }, + { + "epoch": 0.06322943059729343, + "grad_norm": 6.838722229003906, + "learning_rate": 1.9582868829786344e-05, + "loss": 2.2845, + "step": 10060 + }, + { + "epoch": 0.06329228291399053, + "grad_norm": 8.18930721282959, + "learning_rate": 1.958244972884169e-05, + "loss": 2.4832, + "step": 10070 + }, + { + "epoch": 0.06335513523068764, + "grad_norm": 6.990052223205566, + "learning_rate": 1.958203062789704e-05, + "loss": 2.7226, + "step": 10080 + }, + { + "epoch": 0.06341798754738476, + "grad_norm": 7.360417366027832, + "learning_rate": 1.9581611526952386e-05, + "loss": 2.5438, + "step": 10090 + }, + { + "epoch": 0.06348083986408186, + "grad_norm": 8.4058837890625, + "learning_rate": 1.958119242600773e-05, + "loss": 2.7104, + "step": 10100 + }, + { + "epoch": 0.06354369218077897, + "grad_norm": 7.125432014465332, + "learning_rate": 1.9580773325063076e-05, + "loss": 2.3911, + "step": 10110 + }, + { + "epoch": 0.06360654449747609, + "grad_norm": 7.1509222984313965, + "learning_rate": 1.9580354224118423e-05, + "loss": 2.4099, + "step": 10120 + }, + { + "epoch": 0.0636693968141732, + "grad_norm": 7.346128463745117, + "learning_rate": 1.957993512317377e-05, + "loss": 2.7052, + "step": 10130 + }, + { + "epoch": 0.0637322491308703, + "grad_norm": 7.011810302734375, + "learning_rate": 1.9579516022229114e-05, + "loss": 2.3179, + "step": 10140 + }, + { + "epoch": 0.06379510144756742, + "grad_norm": 6.529143333435059, + "learning_rate": 1.957909692128446e-05, + "loss": 2.4885, + "step": 10150 + }, + { + 
"epoch": 0.06385795376426454, + "grad_norm": 7.633592128753662, + "learning_rate": 1.9578677820339808e-05, + "loss": 2.7063, + "step": 10160 + }, + { + "epoch": 0.06392080608096164, + "grad_norm": 8.110146522521973, + "learning_rate": 1.9578258719395155e-05, + "loss": 2.4585, + "step": 10170 + }, + { + "epoch": 0.06398365839765875, + "grad_norm": 7.855094909667969, + "learning_rate": 1.9577839618450502e-05, + "loss": 2.3638, + "step": 10180 + }, + { + "epoch": 0.06404651071435587, + "grad_norm": 7.637715816497803, + "learning_rate": 1.9577420517505846e-05, + "loss": 2.6611, + "step": 10190 + }, + { + "epoch": 0.06410936303105297, + "grad_norm": 7.207457542419434, + "learning_rate": 1.9577001416561193e-05, + "loss": 2.5074, + "step": 10200 + }, + { + "epoch": 0.06417221534775008, + "grad_norm": 7.327394008636475, + "learning_rate": 1.957658231561654e-05, + "loss": 2.6406, + "step": 10210 + }, + { + "epoch": 0.0642350676644472, + "grad_norm": 6.885451316833496, + "learning_rate": 1.9576163214671887e-05, + "loss": 2.7146, + "step": 10220 + }, + { + "epoch": 0.0642979199811443, + "grad_norm": 7.120704174041748, + "learning_rate": 1.9575744113727234e-05, + "loss": 2.2612, + "step": 10230 + }, + { + "epoch": 0.06436077229784141, + "grad_norm": 7.489481449127197, + "learning_rate": 1.957532501278258e-05, + "loss": 2.4659, + "step": 10240 + }, + { + "epoch": 0.06442362461453853, + "grad_norm": 7.456684589385986, + "learning_rate": 1.957490591183793e-05, + "loss": 2.3529, + "step": 10250 + }, + { + "epoch": 0.06448647693123563, + "grad_norm": 8.414656639099121, + "learning_rate": 1.9574486810893276e-05, + "loss": 2.4778, + "step": 10260 + }, + { + "epoch": 0.06454932924793275, + "grad_norm": 6.8723626136779785, + "learning_rate": 1.957406770994862e-05, + "loss": 2.4454, + "step": 10270 + }, + { + "epoch": 0.06461218156462986, + "grad_norm": 20.84834861755371, + "learning_rate": 1.9573648609003966e-05, + "loss": 2.6826, + "step": 10280 + }, + { + "epoch": 
0.06467503388132696, + "grad_norm": 7.154892921447754, + "learning_rate": 1.9573229508059313e-05, + "loss": 2.4339, + "step": 10290 + }, + { + "epoch": 0.06473788619802408, + "grad_norm": 6.664883136749268, + "learning_rate": 1.957281040711466e-05, + "loss": 2.3187, + "step": 10300 + }, + { + "epoch": 0.06480073851472119, + "grad_norm": 8.236434936523438, + "learning_rate": 1.9572391306170008e-05, + "loss": 2.4366, + "step": 10310 + }, + { + "epoch": 0.06486359083141831, + "grad_norm": 8.697257995605469, + "learning_rate": 1.957197220522535e-05, + "loss": 2.8649, + "step": 10320 + }, + { + "epoch": 0.06492644314811541, + "grad_norm": 6.582972526550293, + "learning_rate": 1.95715531042807e-05, + "loss": 2.6505, + "step": 10330 + }, + { + "epoch": 0.06498929546481252, + "grad_norm": 8.965617179870605, + "learning_rate": 1.9571134003336045e-05, + "loss": 2.5702, + "step": 10340 + }, + { + "epoch": 0.06505214778150964, + "grad_norm": 7.132918357849121, + "learning_rate": 1.9570714902391392e-05, + "loss": 2.6687, + "step": 10350 + }, + { + "epoch": 0.06511500009820674, + "grad_norm": 10.190888404846191, + "learning_rate": 1.9570295801446736e-05, + "loss": 2.4893, + "step": 10360 + }, + { + "epoch": 0.06517785241490386, + "grad_norm": 8.878976821899414, + "learning_rate": 1.9569876700502083e-05, + "loss": 2.8093, + "step": 10370 + }, + { + "epoch": 0.06524070473160097, + "grad_norm": 7.512330055236816, + "learning_rate": 1.956945759955743e-05, + "loss": 2.6352, + "step": 10380 + }, + { + "epoch": 0.06530355704829807, + "grad_norm": 7.537259578704834, + "learning_rate": 1.9569038498612777e-05, + "loss": 2.5183, + "step": 10390 + }, + { + "epoch": 0.06536640936499519, + "grad_norm": 6.685320854187012, + "learning_rate": 1.9568619397668124e-05, + "loss": 2.457, + "step": 10400 + }, + { + "epoch": 0.0654292616816923, + "grad_norm": 7.5906195640563965, + "learning_rate": 1.9568200296723468e-05, + "loss": 2.6065, + "step": 10410 + }, + { + "epoch": 0.0654921139983894, + 
"grad_norm": 6.875448703765869, + "learning_rate": 1.9567781195778815e-05, + "loss": 2.7156, + "step": 10420 + }, + { + "epoch": 0.06555496631508652, + "grad_norm": 7.813136100769043, + "learning_rate": 1.9567362094834162e-05, + "loss": 2.7358, + "step": 10430 + }, + { + "epoch": 0.06561781863178363, + "grad_norm": 7.258267879486084, + "learning_rate": 1.956694299388951e-05, + "loss": 2.4653, + "step": 10440 + }, + { + "epoch": 0.06568067094848073, + "grad_norm": 7.039821624755859, + "learning_rate": 1.9566523892944856e-05, + "loss": 2.2635, + "step": 10450 + }, + { + "epoch": 0.06574352326517785, + "grad_norm": 8.447144508361816, + "learning_rate": 1.9566104792000203e-05, + "loss": 2.3831, + "step": 10460 + }, + { + "epoch": 0.06580637558187497, + "grad_norm": 7.895917892456055, + "learning_rate": 1.956568569105555e-05, + "loss": 2.5763, + "step": 10470 + }, + { + "epoch": 0.06586922789857208, + "grad_norm": 7.778244495391846, + "learning_rate": 1.9565266590110898e-05, + "loss": 2.4349, + "step": 10480 + }, + { + "epoch": 0.06593208021526918, + "grad_norm": 7.504886627197266, + "learning_rate": 1.9564847489166245e-05, + "loss": 2.5061, + "step": 10490 + }, + { + "epoch": 0.0659949325319663, + "grad_norm": 11.029016494750977, + "learning_rate": 1.956442838822159e-05, + "loss": 2.4106, + "step": 10500 + }, + { + "epoch": 0.06605778484866341, + "grad_norm": 8.39664363861084, + "learning_rate": 1.9564009287276935e-05, + "loss": 2.7254, + "step": 10510 + }, + { + "epoch": 0.06612063716536051, + "grad_norm": 8.360917091369629, + "learning_rate": 1.9563590186332283e-05, + "loss": 2.2116, + "step": 10520 + }, + { + "epoch": 0.06618348948205763, + "grad_norm": 6.493826866149902, + "learning_rate": 1.956317108538763e-05, + "loss": 2.791, + "step": 10530 + }, + { + "epoch": 0.06624634179875474, + "grad_norm": 5.820472240447998, + "learning_rate": 1.9562751984442973e-05, + "loss": 2.3287, + "step": 10540 + }, + { + "epoch": 0.06630919411545184, + "grad_norm": 
8.29838752746582, + "learning_rate": 1.956233288349832e-05, + "loss": 2.6331, + "step": 10550 + }, + { + "epoch": 0.06637204643214896, + "grad_norm": 6.8962082862854, + "learning_rate": 1.9561913782553667e-05, + "loss": 2.5689, + "step": 10560 + }, + { + "epoch": 0.06643489874884607, + "grad_norm": 8.794946670532227, + "learning_rate": 1.9561494681609014e-05, + "loss": 2.8286, + "step": 10570 + }, + { + "epoch": 0.06649775106554318, + "grad_norm": 7.809574604034424, + "learning_rate": 1.956107558066436e-05, + "loss": 2.3104, + "step": 10580 + }, + { + "epoch": 0.06656060338224029, + "grad_norm": 6.886219024658203, + "learning_rate": 1.9560656479719705e-05, + "loss": 2.42, + "step": 10590 + }, + { + "epoch": 0.0666234556989374, + "grad_norm": 9.957528114318848, + "learning_rate": 1.9560237378775052e-05, + "loss": 2.5591, + "step": 10600 + }, + { + "epoch": 0.06668630801563451, + "grad_norm": 8.68337345123291, + "learning_rate": 1.95598182778304e-05, + "loss": 2.5122, + "step": 10610 + }, + { + "epoch": 0.06674916033233162, + "grad_norm": 7.782831192016602, + "learning_rate": 1.9559399176885746e-05, + "loss": 2.5188, + "step": 10620 + }, + { + "epoch": 0.06681201264902874, + "grad_norm": 8.128057479858398, + "learning_rate": 1.9558980075941094e-05, + "loss": 2.4072, + "step": 10630 + }, + { + "epoch": 0.06687486496572585, + "grad_norm": 7.294857025146484, + "learning_rate": 1.955856097499644e-05, + "loss": 2.6447, + "step": 10640 + }, + { + "epoch": 0.06693771728242295, + "grad_norm": 7.196445941925049, + "learning_rate": 1.9558141874051784e-05, + "loss": 2.4874, + "step": 10650 + }, + { + "epoch": 0.06700056959912007, + "grad_norm": 9.046404838562012, + "learning_rate": 1.955772277310713e-05, + "loss": 2.4762, + "step": 10660 + }, + { + "epoch": 0.06706342191581718, + "grad_norm": 7.872631072998047, + "learning_rate": 1.955730367216248e-05, + "loss": 2.7471, + "step": 10670 + }, + { + "epoch": 0.06712627423251429, + "grad_norm": 7.524047374725342, + "learning_rate": 
1.9556884571217825e-05, + "loss": 2.3539, + "step": 10680 + }, + { + "epoch": 0.0671891265492114, + "grad_norm": 8.810508728027344, + "learning_rate": 1.9556465470273173e-05, + "loss": 2.4916, + "step": 10690 + }, + { + "epoch": 0.06725197886590852, + "grad_norm": 8.720710754394531, + "learning_rate": 1.955604636932852e-05, + "loss": 2.7151, + "step": 10700 + }, + { + "epoch": 0.06731483118260562, + "grad_norm": 7.8059587478637695, + "learning_rate": 1.9555627268383867e-05, + "loss": 2.5237, + "step": 10710 + }, + { + "epoch": 0.06737768349930273, + "grad_norm": 9.27662467956543, + "learning_rate": 1.955520816743921e-05, + "loss": 2.6769, + "step": 10720 + }, + { + "epoch": 0.06744053581599985, + "grad_norm": 9.518790245056152, + "learning_rate": 1.9554789066494557e-05, + "loss": 2.7134, + "step": 10730 + }, + { + "epoch": 0.06750338813269695, + "grad_norm": 8.18859577178955, + "learning_rate": 1.9554369965549905e-05, + "loss": 2.7251, + "step": 10740 + }, + { + "epoch": 0.06756624044939406, + "grad_norm": 8.43634033203125, + "learning_rate": 1.955395086460525e-05, + "loss": 2.7782, + "step": 10750 + }, + { + "epoch": 0.06762909276609118, + "grad_norm": 6.717535495758057, + "learning_rate": 1.9553531763660595e-05, + "loss": 2.6271, + "step": 10760 + }, + { + "epoch": 0.06769194508278828, + "grad_norm": 8.40380573272705, + "learning_rate": 1.9553112662715942e-05, + "loss": 2.4589, + "step": 10770 + }, + { + "epoch": 0.0677547973994854, + "grad_norm": 7.56313419342041, + "learning_rate": 1.955269356177129e-05, + "loss": 2.4065, + "step": 10780 + }, + { + "epoch": 0.06781764971618251, + "grad_norm": 8.096832275390625, + "learning_rate": 1.9552274460826636e-05, + "loss": 2.3685, + "step": 10790 + }, + { + "epoch": 0.06788050203287962, + "grad_norm": 8.289287567138672, + "learning_rate": 1.9551855359881984e-05, + "loss": 2.3128, + "step": 10800 + }, + { + "epoch": 0.06794335434957673, + "grad_norm": 6.763031482696533, + "learning_rate": 1.9551436258937327e-05, + "loss": 
2.438, + "step": 10810 + }, + { + "epoch": 0.06800620666627384, + "grad_norm": 9.077466011047363, + "learning_rate": 1.9551017157992674e-05, + "loss": 2.5176, + "step": 10820 + }, + { + "epoch": 0.06806905898297096, + "grad_norm": 9.284290313720703, + "learning_rate": 1.955059805704802e-05, + "loss": 2.4859, + "step": 10830 + }, + { + "epoch": 0.06813191129966806, + "grad_norm": 9.314769744873047, + "learning_rate": 1.955017895610337e-05, + "loss": 2.4891, + "step": 10840 + }, + { + "epoch": 0.06819476361636517, + "grad_norm": 8.047820091247559, + "learning_rate": 1.9549759855158716e-05, + "loss": 2.4733, + "step": 10850 + }, + { + "epoch": 0.06825761593306229, + "grad_norm": 9.353954315185547, + "learning_rate": 1.9549340754214063e-05, + "loss": 2.4225, + "step": 10860 + }, + { + "epoch": 0.06832046824975939, + "grad_norm": 8.54472541809082, + "learning_rate": 1.954892165326941e-05, + "loss": 2.7112, + "step": 10870 + }, + { + "epoch": 0.0683833205664565, + "grad_norm": 7.9946160316467285, + "learning_rate": 1.9548502552324757e-05, + "loss": 2.4115, + "step": 10880 + }, + { + "epoch": 0.06844617288315362, + "grad_norm": 6.7684831619262695, + "learning_rate": 1.9548083451380104e-05, + "loss": 2.5891, + "step": 10890 + }, + { + "epoch": 0.06850902519985072, + "grad_norm": 8.099825859069824, + "learning_rate": 1.9547664350435447e-05, + "loss": 2.3393, + "step": 10900 + }, + { + "epoch": 0.06857187751654784, + "grad_norm": 9.216423034667969, + "learning_rate": 1.9547245249490795e-05, + "loss": 2.8297, + "step": 10910 + }, + { + "epoch": 0.06863472983324495, + "grad_norm": 8.851109504699707, + "learning_rate": 1.954682614854614e-05, + "loss": 2.2189, + "step": 10920 + }, + { + "epoch": 0.06869758214994205, + "grad_norm": 7.840999603271484, + "learning_rate": 1.954640704760149e-05, + "loss": 2.4758, + "step": 10930 + }, + { + "epoch": 0.06876043446663917, + "grad_norm": 7.600790023803711, + "learning_rate": 1.9545987946656832e-05, + "loss": 2.4172, + "step": 10940 + }, 
+ { + "epoch": 0.06882328678333628, + "grad_norm": 9.331750869750977, + "learning_rate": 1.954556884571218e-05, + "loss": 2.4865, + "step": 10950 + }, + { + "epoch": 0.06888613910003338, + "grad_norm": 7.492789268493652, + "learning_rate": 1.9545149744767527e-05, + "loss": 2.5539, + "step": 10960 + }, + { + "epoch": 0.0689489914167305, + "grad_norm": 7.5510430335998535, + "learning_rate": 1.9544730643822874e-05, + "loss": 2.5407, + "step": 10970 + }, + { + "epoch": 0.06901184373342761, + "grad_norm": 7.0119757652282715, + "learning_rate": 1.9544311542878217e-05, + "loss": 2.5054, + "step": 10980 + }, + { + "epoch": 0.06907469605012473, + "grad_norm": 8.019808769226074, + "learning_rate": 1.9543892441933564e-05, + "loss": 2.4257, + "step": 10990 + }, + { + "epoch": 0.06913754836682183, + "grad_norm": 8.645013809204102, + "learning_rate": 1.954347334098891e-05, + "loss": 2.5276, + "step": 11000 + }, + { + "epoch": 0.06920040068351895, + "grad_norm": 6.927749156951904, + "learning_rate": 1.954305424004426e-05, + "loss": 2.7266, + "step": 11010 + }, + { + "epoch": 0.06926325300021606, + "grad_norm": 7.718427658081055, + "learning_rate": 1.9542635139099606e-05, + "loss": 2.2747, + "step": 11020 + }, + { + "epoch": 0.06932610531691316, + "grad_norm": 6.9200968742370605, + "learning_rate": 1.954221603815495e-05, + "loss": 2.4955, + "step": 11030 + }, + { + "epoch": 0.06938895763361028, + "grad_norm": 7.879162311553955, + "learning_rate": 1.9541796937210296e-05, + "loss": 2.4849, + "step": 11040 + }, + { + "epoch": 0.06945180995030739, + "grad_norm": 7.392335414886475, + "learning_rate": 1.9541377836265643e-05, + "loss": 2.4761, + "step": 11050 + }, + { + "epoch": 0.06951466226700449, + "grad_norm": 8.46662425994873, + "learning_rate": 1.954095873532099e-05, + "loss": 2.5295, + "step": 11060 + }, + { + "epoch": 0.06957751458370161, + "grad_norm": 7.7564287185668945, + "learning_rate": 1.9540539634376338e-05, + "loss": 2.4851, + "step": 11070 + }, + { + "epoch": 
0.06964036690039872, + "grad_norm": 8.08362865447998, + "learning_rate": 1.9540120533431685e-05, + "loss": 2.2313, + "step": 11080 + }, + { + "epoch": 0.06970321921709582, + "grad_norm": 8.638923645019531, + "learning_rate": 1.953970143248703e-05, + "loss": 2.3547, + "step": 11090 + }, + { + "epoch": 0.06976607153379294, + "grad_norm": 7.311507701873779, + "learning_rate": 1.953928233154238e-05, + "loss": 2.4118, + "step": 11100 + }, + { + "epoch": 0.06982892385049005, + "grad_norm": 6.914936542510986, + "learning_rate": 1.9538863230597726e-05, + "loss": 2.4252, + "step": 11110 + }, + { + "epoch": 0.06989177616718716, + "grad_norm": 7.176113605499268, + "learning_rate": 1.953844412965307e-05, + "loss": 2.6392, + "step": 11120 + }, + { + "epoch": 0.06995462848388427, + "grad_norm": 8.129477500915527, + "learning_rate": 1.9538025028708417e-05, + "loss": 2.396, + "step": 11130 + }, + { + "epoch": 0.07001748080058139, + "grad_norm": 8.857206344604492, + "learning_rate": 1.9537605927763764e-05, + "loss": 2.6338, + "step": 11140 + }, + { + "epoch": 0.0700803331172785, + "grad_norm": 8.38614559173584, + "learning_rate": 1.953718682681911e-05, + "loss": 2.4761, + "step": 11150 + }, + { + "epoch": 0.0701431854339756, + "grad_norm": 7.537355422973633, + "learning_rate": 1.9536767725874454e-05, + "loss": 2.39, + "step": 11160 + }, + { + "epoch": 0.07020603775067272, + "grad_norm": 7.1328840255737305, + "learning_rate": 1.95363486249298e-05, + "loss": 2.361, + "step": 11170 + }, + { + "epoch": 0.07026889006736983, + "grad_norm": 7.255983829498291, + "learning_rate": 1.953592952398515e-05, + "loss": 2.4905, + "step": 11180 + }, + { + "epoch": 0.07033174238406693, + "grad_norm": 8.002547264099121, + "learning_rate": 1.9535510423040496e-05, + "loss": 2.3523, + "step": 11190 + }, + { + "epoch": 0.07039459470076405, + "grad_norm": 7.435315132141113, + "learning_rate": 1.9535091322095843e-05, + "loss": 2.5576, + "step": 11200 + }, + { + "epoch": 0.07045744701746116, + "grad_norm": 
6.7415056228637695, + "learning_rate": 1.9534672221151186e-05, + "loss": 2.4429, + "step": 11210 + }, + { + "epoch": 0.07052029933415827, + "grad_norm": 8.618391036987305, + "learning_rate": 1.9534253120206533e-05, + "loss": 2.4735, + "step": 11220 + }, + { + "epoch": 0.07058315165085538, + "grad_norm": 7.767177581787109, + "learning_rate": 1.953383401926188e-05, + "loss": 2.9142, + "step": 11230 + }, + { + "epoch": 0.0706460039675525, + "grad_norm": 7.603353023529053, + "learning_rate": 1.9533414918317228e-05, + "loss": 2.3744, + "step": 11240 + }, + { + "epoch": 0.0707088562842496, + "grad_norm": 7.479285717010498, + "learning_rate": 1.9532995817372575e-05, + "loss": 2.2115, + "step": 11250 + }, + { + "epoch": 0.07077170860094671, + "grad_norm": 7.70100736618042, + "learning_rate": 1.953257671642792e-05, + "loss": 2.5127, + "step": 11260 + }, + { + "epoch": 0.07083456091764383, + "grad_norm": 7.1571736335754395, + "learning_rate": 1.9532157615483265e-05, + "loss": 2.4352, + "step": 11270 + }, + { + "epoch": 0.07089741323434093, + "grad_norm": 7.968157768249512, + "learning_rate": 1.9531738514538612e-05, + "loss": 2.7268, + "step": 11280 + }, + { + "epoch": 0.07096026555103804, + "grad_norm": 7.105053901672363, + "learning_rate": 1.953131941359396e-05, + "loss": 2.439, + "step": 11290 + }, + { + "epoch": 0.07102311786773516, + "grad_norm": 6.176323413848877, + "learning_rate": 1.9530900312649307e-05, + "loss": 2.3334, + "step": 11300 + }, + { + "epoch": 0.07108597018443227, + "grad_norm": 7.531483173370361, + "learning_rate": 1.9530481211704654e-05, + "loss": 2.5347, + "step": 11310 + }, + { + "epoch": 0.07114882250112937, + "grad_norm": 8.726207733154297, + "learning_rate": 1.953006211076e-05, + "loss": 2.492, + "step": 11320 + }, + { + "epoch": 0.07121167481782649, + "grad_norm": 7.908196926116943, + "learning_rate": 1.9529643009815348e-05, + "loss": 2.6788, + "step": 11330 + }, + { + "epoch": 0.0712745271345236, + "grad_norm": 7.728970527648926, + 
"learning_rate": 1.952922390887069e-05, + "loss": 2.3624, + "step": 11340 + }, + { + "epoch": 0.0713373794512207, + "grad_norm": 7.621332168579102, + "learning_rate": 1.952880480792604e-05, + "loss": 2.54, + "step": 11350 + }, + { + "epoch": 0.07140023176791782, + "grad_norm": 6.608644008636475, + "learning_rate": 1.9528385706981386e-05, + "loss": 2.5584, + "step": 11360 + }, + { + "epoch": 0.07146308408461494, + "grad_norm": 6.979635715484619, + "learning_rate": 1.9527966606036733e-05, + "loss": 2.1742, + "step": 11370 + }, + { + "epoch": 0.07152593640131204, + "grad_norm": 7.518513202667236, + "learning_rate": 1.9527547505092076e-05, + "loss": 2.5554, + "step": 11380 + }, + { + "epoch": 0.07158878871800915, + "grad_norm": 7.282397747039795, + "learning_rate": 1.9527128404147423e-05, + "loss": 2.4447, + "step": 11390 + }, + { + "epoch": 0.07165164103470627, + "grad_norm": 7.44035005569458, + "learning_rate": 1.952670930320277e-05, + "loss": 2.2029, + "step": 11400 + }, + { + "epoch": 0.07171449335140337, + "grad_norm": 7.8835859298706055, + "learning_rate": 1.9526290202258118e-05, + "loss": 2.3582, + "step": 11410 + }, + { + "epoch": 0.07177734566810048, + "grad_norm": 9.01338005065918, + "learning_rate": 1.9525871101313465e-05, + "loss": 2.5426, + "step": 11420 + }, + { + "epoch": 0.0718401979847976, + "grad_norm": 7.738584041595459, + "learning_rate": 1.952545200036881e-05, + "loss": 2.2553, + "step": 11430 + }, + { + "epoch": 0.0719030503014947, + "grad_norm": 7.0059309005737305, + "learning_rate": 1.9525032899424155e-05, + "loss": 2.4217, + "step": 11440 + }, + { + "epoch": 0.07196590261819182, + "grad_norm": 7.1120524406433105, + "learning_rate": 1.9524613798479502e-05, + "loss": 2.4836, + "step": 11450 + }, + { + "epoch": 0.07202875493488893, + "grad_norm": 7.4890851974487305, + "learning_rate": 1.952419469753485e-05, + "loss": 2.4163, + "step": 11460 + }, + { + "epoch": 0.07209160725158605, + "grad_norm": 7.924356460571289, + "learning_rate": 
1.9523775596590197e-05, + "loss": 2.7207, + "step": 11470 + }, + { + "epoch": 0.07215445956828315, + "grad_norm": 8.791280746459961, + "learning_rate": 1.9523356495645544e-05, + "loss": 2.3955, + "step": 11480 + }, + { + "epoch": 0.07221731188498026, + "grad_norm": 7.378311634063721, + "learning_rate": 1.952293739470089e-05, + "loss": 2.4784, + "step": 11490 + }, + { + "epoch": 0.07228016420167738, + "grad_norm": 9.543916702270508, + "learning_rate": 1.9522518293756238e-05, + "loss": 2.6184, + "step": 11500 + }, + { + "epoch": 0.07234301651837448, + "grad_norm": 7.561647891998291, + "learning_rate": 1.9522099192811585e-05, + "loss": 2.5923, + "step": 11510 + }, + { + "epoch": 0.0724058688350716, + "grad_norm": 7.390979290008545, + "learning_rate": 1.952168009186693e-05, + "loss": 2.4147, + "step": 11520 + }, + { + "epoch": 0.07246872115176871, + "grad_norm": 7.260756015777588, + "learning_rate": 1.9521260990922276e-05, + "loss": 2.3274, + "step": 11530 + }, + { + "epoch": 0.07253157346846581, + "grad_norm": 7.111080646514893, + "learning_rate": 1.9520841889977623e-05, + "loss": 2.488, + "step": 11540 + }, + { + "epoch": 0.07259442578516292, + "grad_norm": 8.549944877624512, + "learning_rate": 1.952042278903297e-05, + "loss": 2.5328, + "step": 11550 + }, + { + "epoch": 0.07265727810186004, + "grad_norm": 8.285893440246582, + "learning_rate": 1.9520003688088313e-05, + "loss": 2.5, + "step": 11560 + }, + { + "epoch": 0.07272013041855714, + "grad_norm": 7.941225528717041, + "learning_rate": 1.951958458714366e-05, + "loss": 2.5202, + "step": 11570 + }, + { + "epoch": 0.07278298273525426, + "grad_norm": 7.045867443084717, + "learning_rate": 1.9519165486199008e-05, + "loss": 2.541, + "step": 11580 + }, + { + "epoch": 0.07284583505195137, + "grad_norm": 7.211911678314209, + "learning_rate": 1.9518746385254355e-05, + "loss": 2.2981, + "step": 11590 + }, + { + "epoch": 0.07290868736864847, + "grad_norm": 6.470776557922363, + "learning_rate": 1.95183272843097e-05, + "loss": 
2.5156, + "step": 11600 + }, + { + "epoch": 0.07297153968534559, + "grad_norm": 7.176722049713135, + "learning_rate": 1.9517908183365045e-05, + "loss": 2.6306, + "step": 11610 + }, + { + "epoch": 0.0730343920020427, + "grad_norm": 10.153510093688965, + "learning_rate": 1.9517489082420393e-05, + "loss": 2.476, + "step": 11620 + }, + { + "epoch": 0.07309724431873982, + "grad_norm": 9.634344100952148, + "learning_rate": 1.951706998147574e-05, + "loss": 2.3988, + "step": 11630 + }, + { + "epoch": 0.07316009663543692, + "grad_norm": 7.575964450836182, + "learning_rate": 1.9516650880531087e-05, + "loss": 2.5493, + "step": 11640 + }, + { + "epoch": 0.07322294895213403, + "grad_norm": 7.2847065925598145, + "learning_rate": 1.951623177958643e-05, + "loss": 2.6312, + "step": 11650 + }, + { + "epoch": 0.07328580126883115, + "grad_norm": 7.564513683319092, + "learning_rate": 1.9515812678641777e-05, + "loss": 2.3137, + "step": 11660 + }, + { + "epoch": 0.07334865358552825, + "grad_norm": 7.918158531188965, + "learning_rate": 1.9515393577697124e-05, + "loss": 2.3743, + "step": 11670 + }, + { + "epoch": 0.07341150590222537, + "grad_norm": 6.573338508605957, + "learning_rate": 1.9515016386846936e-05, + "loss": 2.692, + "step": 11680 + }, + { + "epoch": 0.07347435821892248, + "grad_norm": 7.603901386260986, + "learning_rate": 1.9514597285902283e-05, + "loss": 2.4422, + "step": 11690 + }, + { + "epoch": 0.07353721053561958, + "grad_norm": 8.727775573730469, + "learning_rate": 1.951417818495763e-05, + "loss": 2.4256, + "step": 11700 + }, + { + "epoch": 0.0736000628523167, + "grad_norm": 7.944945812225342, + "learning_rate": 1.9513759084012977e-05, + "loss": 2.5131, + "step": 11710 + }, + { + "epoch": 0.07366291516901381, + "grad_norm": 7.6638288497924805, + "learning_rate": 1.951333998306832e-05, + "loss": 2.6274, + "step": 11720 + }, + { + "epoch": 0.07372576748571091, + "grad_norm": 7.521015644073486, + "learning_rate": 1.9512920882123668e-05, + "loss": 2.1635, + "step": 11730 + }, 
+ { + "epoch": 0.07378861980240803, + "grad_norm": 9.866744995117188, + "learning_rate": 1.9512501781179015e-05, + "loss": 2.3831, + "step": 11740 + }, + { + "epoch": 0.07385147211910514, + "grad_norm": 8.70056438446045, + "learning_rate": 1.9512082680234362e-05, + "loss": 2.407, + "step": 11750 + }, + { + "epoch": 0.07391432443580224, + "grad_norm": 8.276822090148926, + "learning_rate": 1.951166357928971e-05, + "loss": 2.3133, + "step": 11760 + }, + { + "epoch": 0.07397717675249936, + "grad_norm": 7.402984619140625, + "learning_rate": 1.9511244478345056e-05, + "loss": 2.5265, + "step": 11770 + }, + { + "epoch": 0.07404002906919648, + "grad_norm": 8.126667022705078, + "learning_rate": 1.9510825377400403e-05, + "loss": 2.3826, + "step": 11780 + }, + { + "epoch": 0.07410288138589358, + "grad_norm": 7.4878950119018555, + "learning_rate": 1.951040627645575e-05, + "loss": 2.4999, + "step": 11790 + }, + { + "epoch": 0.07416573370259069, + "grad_norm": 8.355165481567383, + "learning_rate": 1.9509987175511097e-05, + "loss": 2.5496, + "step": 11800 + }, + { + "epoch": 0.0742285860192878, + "grad_norm": 6.998531818389893, + "learning_rate": 1.950956807456644e-05, + "loss": 2.5726, + "step": 11810 + }, + { + "epoch": 0.07429143833598492, + "grad_norm": 8.8850736618042, + "learning_rate": 1.9509148973621788e-05, + "loss": 2.758, + "step": 11820 + }, + { + "epoch": 0.07435429065268202, + "grad_norm": 7.972660064697266, + "learning_rate": 1.9508729872677135e-05, + "loss": 2.6275, + "step": 11830 + }, + { + "epoch": 0.07441714296937914, + "grad_norm": 7.867787837982178, + "learning_rate": 1.9508310771732482e-05, + "loss": 2.3476, + "step": 11840 + }, + { + "epoch": 0.07447999528607625, + "grad_norm": 8.236400604248047, + "learning_rate": 1.950789167078783e-05, + "loss": 2.4277, + "step": 11850 + }, + { + "epoch": 0.07454284760277335, + "grad_norm": 7.272027492523193, + "learning_rate": 1.9507472569843173e-05, + "loss": 2.5366, + "step": 11860 + }, + { + "epoch": 
0.07460569991947047, + "grad_norm": 7.932986736297607, + "learning_rate": 1.950705346889852e-05, + "loss": 2.5213, + "step": 11870 + }, + { + "epoch": 0.07466855223616758, + "grad_norm": 6.761380672454834, + "learning_rate": 1.9506634367953867e-05, + "loss": 2.4531, + "step": 11880 + }, + { + "epoch": 0.07473140455286469, + "grad_norm": 9.38737678527832, + "learning_rate": 1.9506215267009214e-05, + "loss": 2.0328, + "step": 11890 + }, + { + "epoch": 0.0747942568695618, + "grad_norm": 11.342555046081543, + "learning_rate": 1.9505796166064558e-05, + "loss": 2.4436, + "step": 11900 + }, + { + "epoch": 0.07485710918625892, + "grad_norm": 8.043527603149414, + "learning_rate": 1.9505377065119905e-05, + "loss": 2.5713, + "step": 11910 + }, + { + "epoch": 0.07491996150295602, + "grad_norm": 7.346526622772217, + "learning_rate": 1.9504957964175252e-05, + "loss": 2.4017, + "step": 11920 + }, + { + "epoch": 0.07498281381965313, + "grad_norm": 8.646933555603027, + "learning_rate": 1.95045388632306e-05, + "loss": 2.5347, + "step": 11930 + }, + { + "epoch": 0.07504566613635025, + "grad_norm": 7.251510143280029, + "learning_rate": 1.9504119762285946e-05, + "loss": 2.4801, + "step": 11940 + }, + { + "epoch": 0.07510851845304735, + "grad_norm": 9.1826810836792, + "learning_rate": 1.9503700661341293e-05, + "loss": 2.499, + "step": 11950 + }, + { + "epoch": 0.07517137076974446, + "grad_norm": 7.73813009262085, + "learning_rate": 1.950328156039664e-05, + "loss": 2.5228, + "step": 11960 + }, + { + "epoch": 0.07523422308644158, + "grad_norm": 7.395781517028809, + "learning_rate": 1.9502862459451984e-05, + "loss": 2.5345, + "step": 11970 + }, + { + "epoch": 0.0752970754031387, + "grad_norm": 8.236130714416504, + "learning_rate": 1.950244335850733e-05, + "loss": 2.3011, + "step": 11980 + }, + { + "epoch": 0.0753599277198358, + "grad_norm": 7.84546422958374, + "learning_rate": 1.9502024257562678e-05, + "loss": 2.3883, + "step": 11990 + }, + { + "epoch": 0.07542278003653291, + "grad_norm": 
7.490150451660156, + "learning_rate": 1.9501605156618025e-05, + "loss": 2.4918, + "step": 12000 + }, + { + "epoch": 0.07548563235323003, + "grad_norm": 7.3955488204956055, + "learning_rate": 1.9501186055673372e-05, + "loss": 2.6569, + "step": 12010 + }, + { + "epoch": 0.07554848466992713, + "grad_norm": 7.579392433166504, + "learning_rate": 1.950076695472872e-05, + "loss": 2.526, + "step": 12020 + }, + { + "epoch": 0.07561133698662424, + "grad_norm": 7.453882217407227, + "learning_rate": 1.9500347853784063e-05, + "loss": 2.2517, + "step": 12030 + }, + { + "epoch": 0.07567418930332136, + "grad_norm": 7.354152202606201, + "learning_rate": 1.949992875283941e-05, + "loss": 2.5754, + "step": 12040 + }, + { + "epoch": 0.07573704162001846, + "grad_norm": 7.82139253616333, + "learning_rate": 1.9499509651894757e-05, + "loss": 2.2389, + "step": 12050 + }, + { + "epoch": 0.07579989393671557, + "grad_norm": 7.103209495544434, + "learning_rate": 1.9499090550950104e-05, + "loss": 2.323, + "step": 12060 + }, + { + "epoch": 0.07586274625341269, + "grad_norm": 7.013645648956299, + "learning_rate": 1.949867145000545e-05, + "loss": 2.4986, + "step": 12070 + }, + { + "epoch": 0.07592559857010979, + "grad_norm": 6.946286678314209, + "learning_rate": 1.9498252349060795e-05, + "loss": 2.5583, + "step": 12080 + }, + { + "epoch": 0.0759884508868069, + "grad_norm": 7.744342803955078, + "learning_rate": 1.9497833248116142e-05, + "loss": 2.4402, + "step": 12090 + }, + { + "epoch": 0.07605130320350402, + "grad_norm": 9.439397811889648, + "learning_rate": 1.949741414717149e-05, + "loss": 2.6846, + "step": 12100 + }, + { + "epoch": 0.07611415552020112, + "grad_norm": 7.520416736602783, + "learning_rate": 1.9496995046226836e-05, + "loss": 2.4189, + "step": 12110 + }, + { + "epoch": 0.07617700783689824, + "grad_norm": 7.313348770141602, + "learning_rate": 1.949657594528218e-05, + "loss": 2.332, + "step": 12120 + }, + { + "epoch": 0.07623986015359535, + "grad_norm": 8.317850112915039, + 
"learning_rate": 1.9496156844337527e-05, + "loss": 2.45, + "step": 12130 + }, + { + "epoch": 0.07630271247029247, + "grad_norm": 7.855832099914551, + "learning_rate": 1.9495737743392874e-05, + "loss": 2.3219, + "step": 12140 + }, + { + "epoch": 0.07636556478698957, + "grad_norm": 7.824488639831543, + "learning_rate": 1.949531864244822e-05, + "loss": 2.5782, + "step": 12150 + }, + { + "epoch": 0.07642841710368668, + "grad_norm": 6.932237148284912, + "learning_rate": 1.9494899541503568e-05, + "loss": 2.3706, + "step": 12160 + }, + { + "epoch": 0.0764912694203838, + "grad_norm": 8.48770523071289, + "learning_rate": 1.9494480440558915e-05, + "loss": 2.335, + "step": 12170 + }, + { + "epoch": 0.0765541217370809, + "grad_norm": 7.064522743225098, + "learning_rate": 1.9494061339614262e-05, + "loss": 2.1833, + "step": 12180 + }, + { + "epoch": 0.07661697405377801, + "grad_norm": 8.988115310668945, + "learning_rate": 1.949364223866961e-05, + "loss": 2.6044, + "step": 12190 + }, + { + "epoch": 0.07667982637047513, + "grad_norm": 7.177382469177246, + "learning_rate": 1.9493223137724956e-05, + "loss": 2.3806, + "step": 12200 + }, + { + "epoch": 0.07674267868717223, + "grad_norm": 11.10888385772705, + "learning_rate": 1.94928040367803e-05, + "loss": 2.4232, + "step": 12210 + }, + { + "epoch": 0.07680553100386935, + "grad_norm": 7.233067035675049, + "learning_rate": 1.9492384935835647e-05, + "loss": 2.2771, + "step": 12220 + }, + { + "epoch": 0.07686838332056646, + "grad_norm": 7.85338830947876, + "learning_rate": 1.9491965834890994e-05, + "loss": 2.4547, + "step": 12230 + }, + { + "epoch": 0.07693123563726356, + "grad_norm": 8.474621772766113, + "learning_rate": 1.949154673394634e-05, + "loss": 2.4507, + "step": 12240 + }, + { + "epoch": 0.07699408795396068, + "grad_norm": 7.005478858947754, + "learning_rate": 1.949112763300169e-05, + "loss": 2.4807, + "step": 12250 + }, + { + "epoch": 0.07705694027065779, + "grad_norm": 16.7135009765625, + "learning_rate": 
1.9490708532057032e-05, + "loss": 2.4233, + "step": 12260 + }, + { + "epoch": 0.0771197925873549, + "grad_norm": 7.83366060256958, + "learning_rate": 1.949028943111238e-05, + "loss": 2.4988, + "step": 12270 + }, + { + "epoch": 0.07718264490405201, + "grad_norm": 7.76155424118042, + "learning_rate": 1.9489870330167726e-05, + "loss": 2.6238, + "step": 12280 + }, + { + "epoch": 0.07724549722074912, + "grad_norm": 6.5174760818481445, + "learning_rate": 1.9489451229223073e-05, + "loss": 2.3573, + "step": 12290 + }, + { + "epoch": 0.07730834953744624, + "grad_norm": 8.594646453857422, + "learning_rate": 1.9489032128278417e-05, + "loss": 2.5484, + "step": 12300 + }, + { + "epoch": 0.07737120185414334, + "grad_norm": 8.022442817687988, + "learning_rate": 1.9488613027333764e-05, + "loss": 2.4529, + "step": 12310 + }, + { + "epoch": 0.07743405417084046, + "grad_norm": 7.432999610900879, + "learning_rate": 1.948819392638911e-05, + "loss": 2.2009, + "step": 12320 + }, + { + "epoch": 0.07749690648753757, + "grad_norm": 8.464859008789062, + "learning_rate": 1.9487774825444458e-05, + "loss": 2.3583, + "step": 12330 + }, + { + "epoch": 0.07755975880423467, + "grad_norm": 7.359055519104004, + "learning_rate": 1.9487355724499805e-05, + "loss": 2.3175, + "step": 12340 + }, + { + "epoch": 0.07762261112093179, + "grad_norm": 8.959524154663086, + "learning_rate": 1.948693662355515e-05, + "loss": 2.4113, + "step": 12350 + }, + { + "epoch": 0.0776854634376289, + "grad_norm": 7.489354133605957, + "learning_rate": 1.9486517522610496e-05, + "loss": 2.5092, + "step": 12360 + }, + { + "epoch": 0.077748315754326, + "grad_norm": 8.70345401763916, + "learning_rate": 1.9486098421665843e-05, + "loss": 2.2939, + "step": 12370 + }, + { + "epoch": 0.07781116807102312, + "grad_norm": 6.111259937286377, + "learning_rate": 1.948567932072119e-05, + "loss": 2.3819, + "step": 12380 + }, + { + "epoch": 0.07787402038772023, + "grad_norm": 7.629851341247559, + "learning_rate": 1.9485260219776537e-05, + "loss": 
2.3942, + "step": 12390 + }, + { + "epoch": 0.07793687270441733, + "grad_norm": 9.003440856933594, + "learning_rate": 1.9484841118831884e-05, + "loss": 2.5155, + "step": 12400 + }, + { + "epoch": 0.07799972502111445, + "grad_norm": 6.7371039390563965, + "learning_rate": 1.948442201788723e-05, + "loss": 2.3328, + "step": 12410 + }, + { + "epoch": 0.07806257733781156, + "grad_norm": 8.926176071166992, + "learning_rate": 1.948400291694258e-05, + "loss": 2.606, + "step": 12420 + }, + { + "epoch": 0.07812542965450867, + "grad_norm": 7.441720962524414, + "learning_rate": 1.9483583815997922e-05, + "loss": 2.458, + "step": 12430 + }, + { + "epoch": 0.07818828197120578, + "grad_norm": 7.5572662353515625, + "learning_rate": 1.948316471505327e-05, + "loss": 2.6717, + "step": 12440 + }, + { + "epoch": 0.0782511342879029, + "grad_norm": 8.235877990722656, + "learning_rate": 1.9482745614108616e-05, + "loss": 2.5934, + "step": 12450 + }, + { + "epoch": 0.0783139866046, + "grad_norm": 8.707640647888184, + "learning_rate": 1.9482326513163963e-05, + "loss": 2.49, + "step": 12460 + }, + { + "epoch": 0.07837683892129711, + "grad_norm": 8.33491039276123, + "learning_rate": 1.948190741221931e-05, + "loss": 2.4845, + "step": 12470 + }, + { + "epoch": 0.07843969123799423, + "grad_norm": 7.912849426269531, + "learning_rate": 1.9481488311274654e-05, + "loss": 2.2593, + "step": 12480 + }, + { + "epoch": 0.07850254355469134, + "grad_norm": 7.304262638092041, + "learning_rate": 1.948106921033e-05, + "loss": 2.1579, + "step": 12490 + }, + { + "epoch": 0.07856539587138844, + "grad_norm": 7.650024890899658, + "learning_rate": 1.9480650109385348e-05, + "loss": 2.4837, + "step": 12500 + }, + { + "epoch": 0.07862824818808556, + "grad_norm": 9.099596977233887, + "learning_rate": 1.9480231008440695e-05, + "loss": 2.302, + "step": 12510 + }, + { + "epoch": 0.07869110050478267, + "grad_norm": 7.692058563232422, + "learning_rate": 1.947981190749604e-05, + "loss": 2.5345, + "step": 12520 + }, + { + 
"epoch": 0.07875395282147978, + "grad_norm": 8.309165000915527, + "learning_rate": 1.9479392806551386e-05, + "loss": 2.4045, + "step": 12530 + }, + { + "epoch": 0.07881680513817689, + "grad_norm": 7.551580905914307, + "learning_rate": 1.9478973705606733e-05, + "loss": 2.2348, + "step": 12540 + }, + { + "epoch": 0.078879657454874, + "grad_norm": 7.662278175354004, + "learning_rate": 1.947855460466208e-05, + "loss": 2.4571, + "step": 12550 + }, + { + "epoch": 0.0789425097715711, + "grad_norm": 8.074088096618652, + "learning_rate": 1.9478135503717427e-05, + "loss": 2.2866, + "step": 12560 + }, + { + "epoch": 0.07900536208826822, + "grad_norm": 7.268442153930664, + "learning_rate": 1.9477716402772774e-05, + "loss": 2.3656, + "step": 12570 + }, + { + "epoch": 0.07906821440496534, + "grad_norm": 7.288553237915039, + "learning_rate": 1.947729730182812e-05, + "loss": 2.6178, + "step": 12580 + }, + { + "epoch": 0.07913106672166244, + "grad_norm": 8.39034366607666, + "learning_rate": 1.9476878200883465e-05, + "loss": 2.4084, + "step": 12590 + }, + { + "epoch": 0.07919391903835955, + "grad_norm": 7.558828353881836, + "learning_rate": 1.9476459099938812e-05, + "loss": 2.4615, + "step": 12600 + }, + { + "epoch": 0.07925677135505667, + "grad_norm": 6.886292934417725, + "learning_rate": 1.947603999899416e-05, + "loss": 2.3437, + "step": 12610 + }, + { + "epoch": 0.07931962367175377, + "grad_norm": 9.1254243850708, + "learning_rate": 1.9475620898049506e-05, + "loss": 2.3196, + "step": 12620 + }, + { + "epoch": 0.07938247598845088, + "grad_norm": 7.684484958648682, + "learning_rate": 1.9475201797104853e-05, + "loss": 2.2172, + "step": 12630 + }, + { + "epoch": 0.079445328305148, + "grad_norm": 6.235585689544678, + "learning_rate": 1.94747826961602e-05, + "loss": 2.1569, + "step": 12640 + }, + { + "epoch": 0.07950818062184511, + "grad_norm": 7.675627708435059, + "learning_rate": 1.9474363595215547e-05, + "loss": 2.414, + "step": 12650 + }, + { + "epoch": 0.07957103293854222, + 
"grad_norm": 7.568105697631836, + "learning_rate": 1.947394449427089e-05, + "loss": 2.2094, + "step": 12660 + }, + { + "epoch": 0.07963388525523933, + "grad_norm": 8.476950645446777, + "learning_rate": 1.9473525393326238e-05, + "loss": 2.4608, + "step": 12670 + }, + { + "epoch": 0.07969673757193645, + "grad_norm": 8.452275276184082, + "learning_rate": 1.9473106292381585e-05, + "loss": 2.3468, + "step": 12680 + }, + { + "epoch": 0.07975958988863355, + "grad_norm": 7.434908866882324, + "learning_rate": 1.9472687191436932e-05, + "loss": 2.3847, + "step": 12690 + }, + { + "epoch": 0.07982244220533066, + "grad_norm": 8.017882347106934, + "learning_rate": 1.9472268090492276e-05, + "loss": 2.2582, + "step": 12700 + }, + { + "epoch": 0.07988529452202778, + "grad_norm": 7.4637837409973145, + "learning_rate": 1.9471848989547623e-05, + "loss": 2.3885, + "step": 12710 + }, + { + "epoch": 0.07994814683872488, + "grad_norm": 8.98967456817627, + "learning_rate": 1.947142988860297e-05, + "loss": 2.3662, + "step": 12720 + }, + { + "epoch": 0.080010999155422, + "grad_norm": 8.459840774536133, + "learning_rate": 1.9471010787658317e-05, + "loss": 2.2906, + "step": 12730 + }, + { + "epoch": 0.08007385147211911, + "grad_norm": 7.0828986167907715, + "learning_rate": 1.947059168671366e-05, + "loss": 2.3257, + "step": 12740 + }, + { + "epoch": 0.08013670378881621, + "grad_norm": 9.346085548400879, + "learning_rate": 1.9470172585769008e-05, + "loss": 2.5332, + "step": 12750 + }, + { + "epoch": 0.08019955610551333, + "grad_norm": 10.998466491699219, + "learning_rate": 1.9469753484824355e-05, + "loss": 2.3698, + "step": 12760 + }, + { + "epoch": 0.08026240842221044, + "grad_norm": 8.243062019348145, + "learning_rate": 1.9469334383879702e-05, + "loss": 2.4414, + "step": 12770 + }, + { + "epoch": 0.08032526073890754, + "grad_norm": 8.127972602844238, + "learning_rate": 1.946891528293505e-05, + "loss": 2.3582, + "step": 12780 + }, + { + "epoch": 0.08038811305560466, + "grad_norm": 
8.575108528137207, + "learning_rate": 1.9468496181990396e-05, + "loss": 2.2728, + "step": 12790 + }, + { + "epoch": 0.08045096537230177, + "grad_norm": 7.837752342224121, + "learning_rate": 1.9468077081045743e-05, + "loss": 2.1827, + "step": 12800 + }, + { + "epoch": 0.08051381768899889, + "grad_norm": 7.2447190284729, + "learning_rate": 1.946765798010109e-05, + "loss": 2.3687, + "step": 12810 + }, + { + "epoch": 0.08057667000569599, + "grad_norm": 7.327709674835205, + "learning_rate": 1.9467238879156438e-05, + "loss": 2.5204, + "step": 12820 + }, + { + "epoch": 0.0806395223223931, + "grad_norm": 7.249586582183838, + "learning_rate": 1.946681977821178e-05, + "loss": 2.5019, + "step": 12830 + }, + { + "epoch": 0.08070237463909022, + "grad_norm": 7.0609211921691895, + "learning_rate": 1.9466400677267128e-05, + "loss": 2.4995, + "step": 12840 + }, + { + "epoch": 0.08076522695578732, + "grad_norm": 6.717433929443359, + "learning_rate": 1.9465981576322475e-05, + "loss": 2.5919, + "step": 12850 + }, + { + "epoch": 0.08082807927248443, + "grad_norm": 6.879463195800781, + "learning_rate": 1.9465562475377822e-05, + "loss": 2.3503, + "step": 12860 + }, + { + "epoch": 0.08089093158918155, + "grad_norm": 9.135047912597656, + "learning_rate": 1.946514337443317e-05, + "loss": 2.5435, + "step": 12870 + }, + { + "epoch": 0.08095378390587865, + "grad_norm": 7.627810478210449, + "learning_rate": 1.9464724273488513e-05, + "loss": 2.495, + "step": 12880 + }, + { + "epoch": 0.08101663622257577, + "grad_norm": 7.3780341148376465, + "learning_rate": 1.946430517254386e-05, + "loss": 2.3788, + "step": 12890 + }, + { + "epoch": 0.08107948853927288, + "grad_norm": 7.583935260772705, + "learning_rate": 1.9463886071599207e-05, + "loss": 2.7461, + "step": 12900 + }, + { + "epoch": 0.08114234085596998, + "grad_norm": 7.74154806137085, + "learning_rate": 1.9463466970654554e-05, + "loss": 2.3903, + "step": 12910 + }, + { + "epoch": 0.0812051931726671, + "grad_norm": 7.6371073722839355, + 
"learning_rate": 1.9463047869709898e-05, + "loss": 2.5167, + "step": 12920 + }, + { + "epoch": 0.08126804548936421, + "grad_norm": 8.727256774902344, + "learning_rate": 1.9462628768765245e-05, + "loss": 2.1948, + "step": 12930 + }, + { + "epoch": 0.08133089780606131, + "grad_norm": 7.339801788330078, + "learning_rate": 1.9462209667820592e-05, + "loss": 2.3406, + "step": 12940 + }, + { + "epoch": 0.08139375012275843, + "grad_norm": 7.481237888336182, + "learning_rate": 1.946179056687594e-05, + "loss": 2.1842, + "step": 12950 + }, + { + "epoch": 0.08145660243945554, + "grad_norm": 8.10486125946045, + "learning_rate": 1.9461371465931286e-05, + "loss": 2.2285, + "step": 12960 + }, + { + "epoch": 0.08151945475615266, + "grad_norm": 6.663724899291992, + "learning_rate": 1.946095236498663e-05, + "loss": 2.3499, + "step": 12970 + }, + { + "epoch": 0.08158230707284976, + "grad_norm": 7.401335716247559, + "learning_rate": 1.9460533264041977e-05, + "loss": 2.3825, + "step": 12980 + }, + { + "epoch": 0.08164515938954688, + "grad_norm": 7.259987831115723, + "learning_rate": 1.9460114163097324e-05, + "loss": 2.2446, + "step": 12990 + }, + { + "epoch": 0.08170801170624399, + "grad_norm": 8.05159854888916, + "learning_rate": 1.945969506215267e-05, + "loss": 2.4418, + "step": 13000 + }, + { + "epoch": 0.08177086402294109, + "grad_norm": 7.465734004974365, + "learning_rate": 1.9459275961208018e-05, + "loss": 2.5513, + "step": 13010 + }, + { + "epoch": 0.08183371633963821, + "grad_norm": 6.613677501678467, + "learning_rate": 1.9458856860263365e-05, + "loss": 2.445, + "step": 13020 + }, + { + "epoch": 0.08189656865633532, + "grad_norm": 7.202596664428711, + "learning_rate": 1.9458437759318712e-05, + "loss": 2.4428, + "step": 13030 + }, + { + "epoch": 0.08195942097303242, + "grad_norm": 8.372621536254883, + "learning_rate": 1.945801865837406e-05, + "loss": 2.4775, + "step": 13040 + }, + { + "epoch": 0.08202227328972954, + "grad_norm": 7.697512149810791, + "learning_rate": 
1.9457599557429403e-05, + "loss": 2.4491, + "step": 13050 + }, + { + "epoch": 0.08208512560642665, + "grad_norm": 7.001658916473389, + "learning_rate": 1.945718045648475e-05, + "loss": 2.3397, + "step": 13060 + }, + { + "epoch": 0.08214797792312376, + "grad_norm": 7.712573051452637, + "learning_rate": 1.9456761355540097e-05, + "loss": 2.2824, + "step": 13070 + }, + { + "epoch": 0.08221083023982087, + "grad_norm": 7.388295650482178, + "learning_rate": 1.9456342254595444e-05, + "loss": 2.4061, + "step": 13080 + }, + { + "epoch": 0.08227368255651799, + "grad_norm": 6.870701313018799, + "learning_rate": 1.945592315365079e-05, + "loss": 2.4606, + "step": 13090 + }, + { + "epoch": 0.08233653487321509, + "grad_norm": 6.018819808959961, + "learning_rate": 1.9455504052706135e-05, + "loss": 2.3073, + "step": 13100 + }, + { + "epoch": 0.0823993871899122, + "grad_norm": 7.054746627807617, + "learning_rate": 1.9455084951761482e-05, + "loss": 2.3553, + "step": 13110 + }, + { + "epoch": 0.08246223950660932, + "grad_norm": 6.665430545806885, + "learning_rate": 1.945466585081683e-05, + "loss": 2.4216, + "step": 13120 + }, + { + "epoch": 0.08252509182330642, + "grad_norm": 6.984927177429199, + "learning_rate": 1.9454246749872176e-05, + "loss": 2.3272, + "step": 13130 + }, + { + "epoch": 0.08258794414000353, + "grad_norm": 7.989890098571777, + "learning_rate": 1.945382764892752e-05, + "loss": 2.5116, + "step": 13140 + }, + { + "epoch": 0.08265079645670065, + "grad_norm": 7.233949184417725, + "learning_rate": 1.9453408547982867e-05, + "loss": 2.227, + "step": 13150 + }, + { + "epoch": 0.08271364877339776, + "grad_norm": 8.56431770324707, + "learning_rate": 1.9452989447038214e-05, + "loss": 2.6707, + "step": 13160 + }, + { + "epoch": 0.08277650109009486, + "grad_norm": 8.269586563110352, + "learning_rate": 1.945257034609356e-05, + "loss": 2.2733, + "step": 13170 + }, + { + "epoch": 0.08283935340679198, + "grad_norm": 9.309479713439941, + "learning_rate": 1.945215124514891e-05, + 
"loss": 2.6278, + "step": 13180 + }, + { + "epoch": 0.0829022057234891, + "grad_norm": 8.378772735595703, + "learning_rate": 1.9451732144204255e-05, + "loss": 2.234, + "step": 13190 + }, + { + "epoch": 0.0829650580401862, + "grad_norm": 8.369424819946289, + "learning_rate": 1.9451313043259602e-05, + "loss": 2.4723, + "step": 13200 + }, + { + "epoch": 0.08302791035688331, + "grad_norm": 7.560756683349609, + "learning_rate": 1.945089394231495e-05, + "loss": 2.0368, + "step": 13210 + }, + { + "epoch": 0.08309076267358043, + "grad_norm": 7.919924736022949, + "learning_rate": 1.9450474841370293e-05, + "loss": 2.1961, + "step": 13220 + }, + { + "epoch": 0.08315361499027753, + "grad_norm": 7.153075218200684, + "learning_rate": 1.945005574042564e-05, + "loss": 2.3935, + "step": 13230 + }, + { + "epoch": 0.08321646730697464, + "grad_norm": 9.653682708740234, + "learning_rate": 1.9449636639480987e-05, + "loss": 2.4894, + "step": 13240 + }, + { + "epoch": 0.08327931962367176, + "grad_norm": 8.509584426879883, + "learning_rate": 1.9449217538536334e-05, + "loss": 2.3599, + "step": 13250 + }, + { + "epoch": 0.08334217194036886, + "grad_norm": 7.730416297912598, + "learning_rate": 1.944879843759168e-05, + "loss": 2.4551, + "step": 13260 + }, + { + "epoch": 0.08340502425706597, + "grad_norm": 7.178668975830078, + "learning_rate": 1.944837933664703e-05, + "loss": 2.3364, + "step": 13270 + }, + { + "epoch": 0.08346787657376309, + "grad_norm": 7.232913494110107, + "learning_rate": 1.9447960235702372e-05, + "loss": 2.1589, + "step": 13280 + }, + { + "epoch": 0.08353072889046019, + "grad_norm": 7.251108169555664, + "learning_rate": 1.944754113475772e-05, + "loss": 2.3956, + "step": 13290 + }, + { + "epoch": 0.0835935812071573, + "grad_norm": 7.310342788696289, + "learning_rate": 1.9447122033813066e-05, + "loss": 2.5985, + "step": 13300 + }, + { + "epoch": 0.08365643352385442, + "grad_norm": 7.670332908630371, + "learning_rate": 1.9446702932868413e-05, + "loss": 2.1638, + "step": 13310 
+ }, + { + "epoch": 0.08371928584055154, + "grad_norm": 7.718730449676514, + "learning_rate": 1.9446283831923757e-05, + "loss": 2.4241, + "step": 13320 + }, + { + "epoch": 0.08378213815724864, + "grad_norm": 7.897666931152344, + "learning_rate": 1.9445864730979104e-05, + "loss": 2.4343, + "step": 13330 + }, + { + "epoch": 0.08384499047394575, + "grad_norm": 7.8425798416137695, + "learning_rate": 1.944544563003445e-05, + "loss": 2.4654, + "step": 13340 + }, + { + "epoch": 0.08390784279064287, + "grad_norm": 6.980411052703857, + "learning_rate": 1.94450265290898e-05, + "loss": 2.4179, + "step": 13350 + }, + { + "epoch": 0.08397069510733997, + "grad_norm": 7.646371364593506, + "learning_rate": 1.9444607428145142e-05, + "loss": 2.0922, + "step": 13360 + }, + { + "epoch": 0.08403354742403708, + "grad_norm": 7.7674384117126465, + "learning_rate": 1.944418832720049e-05, + "loss": 2.4974, + "step": 13370 + }, + { + "epoch": 0.0840963997407342, + "grad_norm": 7.743281841278076, + "learning_rate": 1.9443769226255836e-05, + "loss": 2.3573, + "step": 13380 + }, + { + "epoch": 0.0841592520574313, + "grad_norm": 7.876406669616699, + "learning_rate": 1.9443350125311183e-05, + "loss": 2.3071, + "step": 13390 + }, + { + "epoch": 0.08422210437412841, + "grad_norm": 7.47829532623291, + "learning_rate": 1.944293102436653e-05, + "loss": 2.5412, + "step": 13400 + }, + { + "epoch": 0.08428495669082553, + "grad_norm": 7.951406478881836, + "learning_rate": 1.9442511923421877e-05, + "loss": 2.5606, + "step": 13410 + }, + { + "epoch": 0.08434780900752263, + "grad_norm": 8.410359382629395, + "learning_rate": 1.9442092822477224e-05, + "loss": 2.3735, + "step": 13420 + }, + { + "epoch": 0.08441066132421975, + "grad_norm": 7.944238662719727, + "learning_rate": 1.944167372153257e-05, + "loss": 2.411, + "step": 13430 + }, + { + "epoch": 0.08447351364091686, + "grad_norm": 7.693236351013184, + "learning_rate": 1.944125462058792e-05, + "loss": 2.4179, + "step": 13440 + }, + { + "epoch": 
0.08453636595761396, + "grad_norm": 9.336854934692383, + "learning_rate": 1.9440835519643262e-05, + "loss": 2.4799, + "step": 13450 + }, + { + "epoch": 0.08459921827431108, + "grad_norm": 7.201939582824707, + "learning_rate": 1.944041641869861e-05, + "loss": 2.2801, + "step": 13460 + }, + { + "epoch": 0.08466207059100819, + "grad_norm": 7.295503616333008, + "learning_rate": 1.9439997317753956e-05, + "loss": 2.4658, + "step": 13470 + }, + { + "epoch": 0.08472492290770531, + "grad_norm": 8.042071342468262, + "learning_rate": 1.9439578216809304e-05, + "loss": 2.3253, + "step": 13480 + }, + { + "epoch": 0.08478777522440241, + "grad_norm": 8.716230392456055, + "learning_rate": 1.943915911586465e-05, + "loss": 2.4409, + "step": 13490 + }, + { + "epoch": 0.08485062754109952, + "grad_norm": 8.266105651855469, + "learning_rate": 1.9438740014919994e-05, + "loss": 2.157, + "step": 13500 + }, + { + "epoch": 0.08491347985779664, + "grad_norm": 8.517080307006836, + "learning_rate": 1.943832091397534e-05, + "loss": 2.4678, + "step": 13510 + }, + { + "epoch": 0.08497633217449374, + "grad_norm": 7.580370903015137, + "learning_rate": 1.943790181303069e-05, + "loss": 2.2805, + "step": 13520 + }, + { + "epoch": 0.08503918449119086, + "grad_norm": 7.174275875091553, + "learning_rate": 1.9437482712086035e-05, + "loss": 2.3805, + "step": 13530 + }, + { + "epoch": 0.08510203680788797, + "grad_norm": 8.051619529724121, + "learning_rate": 1.943706361114138e-05, + "loss": 2.5313, + "step": 13540 + }, + { + "epoch": 0.08516488912458507, + "grad_norm": 8.353168487548828, + "learning_rate": 1.9436644510196726e-05, + "loss": 2.5692, + "step": 13550 + }, + { + "epoch": 0.08522774144128219, + "grad_norm": 8.41823673248291, + "learning_rate": 1.9436225409252073e-05, + "loss": 2.2947, + "step": 13560 + }, + { + "epoch": 0.0852905937579793, + "grad_norm": 9.146581649780273, + "learning_rate": 1.943580630830742e-05, + "loss": 2.696, + "step": 13570 + }, + { + "epoch": 0.0853534460746764, + 
"grad_norm": 7.7751874923706055, + "learning_rate": 1.9435387207362767e-05, + "loss": 2.2336, + "step": 13580 + }, + { + "epoch": 0.08541629839137352, + "grad_norm": 7.192497730255127, + "learning_rate": 1.9434968106418115e-05, + "loss": 2.3596, + "step": 13590 + }, + { + "epoch": 0.08547915070807063, + "grad_norm": 7.996962547302246, + "learning_rate": 1.9434549005473458e-05, + "loss": 2.318, + "step": 13600 + }, + { + "epoch": 0.08554200302476773, + "grad_norm": 7.894214153289795, + "learning_rate": 1.9434129904528805e-05, + "loss": 2.323, + "step": 13610 + }, + { + "epoch": 0.08560485534146485, + "grad_norm": 7.202610015869141, + "learning_rate": 1.9433710803584152e-05, + "loss": 2.3279, + "step": 13620 + }, + { + "epoch": 0.08566770765816197, + "grad_norm": 7.913158416748047, + "learning_rate": 1.94332917026395e-05, + "loss": 2.3408, + "step": 13630 + }, + { + "epoch": 0.08573055997485908, + "grad_norm": 7.302917003631592, + "learning_rate": 1.9432872601694846e-05, + "loss": 2.175, + "step": 13640 + }, + { + "epoch": 0.08579341229155618, + "grad_norm": 7.996654510498047, + "learning_rate": 1.9432453500750194e-05, + "loss": 2.175, + "step": 13650 + }, + { + "epoch": 0.0858562646082533, + "grad_norm": 7.731688022613525, + "learning_rate": 1.943203439980554e-05, + "loss": 2.555, + "step": 13660 + }, + { + "epoch": 0.08591911692495041, + "grad_norm": 7.67500638961792, + "learning_rate": 1.9431615298860884e-05, + "loss": 2.279, + "step": 13670 + }, + { + "epoch": 0.08598196924164751, + "grad_norm": 7.5716023445129395, + "learning_rate": 1.943119619791623e-05, + "loss": 2.3501, + "step": 13680 + }, + { + "epoch": 0.08604482155834463, + "grad_norm": 7.677670955657959, + "learning_rate": 1.943077709697158e-05, + "loss": 2.2414, + "step": 13690 + }, + { + "epoch": 0.08610767387504174, + "grad_norm": 6.14946985244751, + "learning_rate": 1.9430357996026926e-05, + "loss": 2.3285, + "step": 13700 + }, + { + "epoch": 0.08617052619173884, + "grad_norm": 6.973321914672852, + 
"learning_rate": 1.9429938895082273e-05, + "loss": 2.2303, + "step": 13710 + }, + { + "epoch": 0.08623337850843596, + "grad_norm": 7.703900337219238, + "learning_rate": 1.9429519794137616e-05, + "loss": 2.6025, + "step": 13720 + }, + { + "epoch": 0.08629623082513307, + "grad_norm": 7.670722961425781, + "learning_rate": 1.9429100693192963e-05, + "loss": 2.2004, + "step": 13730 + }, + { + "epoch": 0.08635908314183018, + "grad_norm": 7.5491766929626465, + "learning_rate": 1.942868159224831e-05, + "loss": 2.5119, + "step": 13740 + }, + { + "epoch": 0.08642193545852729, + "grad_norm": 8.034729957580566, + "learning_rate": 1.9428262491303657e-05, + "loss": 2.4391, + "step": 13750 + }, + { + "epoch": 0.0864847877752244, + "grad_norm": 6.830660820007324, + "learning_rate": 1.9427843390359e-05, + "loss": 2.1871, + "step": 13760 + }, + { + "epoch": 0.08654764009192151, + "grad_norm": 7.351603031158447, + "learning_rate": 1.9427424289414348e-05, + "loss": 2.322, + "step": 13770 + }, + { + "epoch": 0.08661049240861862, + "grad_norm": 7.406009674072266, + "learning_rate": 1.9427005188469695e-05, + "loss": 2.57, + "step": 13780 + }, + { + "epoch": 0.08667334472531574, + "grad_norm": 7.952855587005615, + "learning_rate": 1.9426586087525042e-05, + "loss": 2.223, + "step": 13790 + }, + { + "epoch": 0.08673619704201284, + "grad_norm": 7.72921085357666, + "learning_rate": 1.942616698658039e-05, + "loss": 2.291, + "step": 13800 + }, + { + "epoch": 0.08679904935870995, + "grad_norm": 8.208253860473633, + "learning_rate": 1.9425747885635737e-05, + "loss": 2.4412, + "step": 13810 + }, + { + "epoch": 0.08686190167540707, + "grad_norm": 7.6025800704956055, + "learning_rate": 1.9425328784691084e-05, + "loss": 2.3367, + "step": 13820 + }, + { + "epoch": 0.08692475399210418, + "grad_norm": 8.467233657836914, + "learning_rate": 1.942490968374643e-05, + "loss": 2.5404, + "step": 13830 + }, + { + "epoch": 0.08698760630880129, + "grad_norm": 8.328033447265625, + "learning_rate": 
1.9424490582801774e-05, + "loss": 2.2398, + "step": 13840 + }, + { + "epoch": 0.0870504586254984, + "grad_norm": 7.479990482330322, + "learning_rate": 1.942407148185712e-05, + "loss": 2.2371, + "step": 13850 + }, + { + "epoch": 0.08711331094219552, + "grad_norm": 6.685925483703613, + "learning_rate": 1.942365238091247e-05, + "loss": 2.1465, + "step": 13860 + }, + { + "epoch": 0.08717616325889262, + "grad_norm": 6.884483814239502, + "learning_rate": 1.9423233279967816e-05, + "loss": 2.3396, + "step": 13870 + }, + { + "epoch": 0.08723901557558973, + "grad_norm": 7.625536918640137, + "learning_rate": 1.9422814179023163e-05, + "loss": 2.5991, + "step": 13880 + }, + { + "epoch": 0.08730186789228685, + "grad_norm": 6.874094009399414, + "learning_rate": 1.942239507807851e-05, + "loss": 2.3927, + "step": 13890 + }, + { + "epoch": 0.08736472020898395, + "grad_norm": 7.947897911071777, + "learning_rate": 1.9421975977133853e-05, + "loss": 2.4413, + "step": 13900 + }, + { + "epoch": 0.08742757252568106, + "grad_norm": 7.6935715675354, + "learning_rate": 1.94215568761892e-05, + "loss": 2.6484, + "step": 13910 + }, + { + "epoch": 0.08749042484237818, + "grad_norm": 7.301044464111328, + "learning_rate": 1.9421137775244548e-05, + "loss": 2.5653, + "step": 13920 + }, + { + "epoch": 0.08755327715907528, + "grad_norm": 7.384383201599121, + "learning_rate": 1.9420718674299895e-05, + "loss": 2.3183, + "step": 13930 + }, + { + "epoch": 0.0876161294757724, + "grad_norm": 7.303337097167969, + "learning_rate": 1.9420299573355238e-05, + "loss": 2.2904, + "step": 13940 + }, + { + "epoch": 0.08767898179246951, + "grad_norm": 8.333943367004395, + "learning_rate": 1.9419880472410585e-05, + "loss": 2.2082, + "step": 13950 + }, + { + "epoch": 0.08774183410916661, + "grad_norm": 7.08447790145874, + "learning_rate": 1.9419461371465932e-05, + "loss": 2.3771, + "step": 13960 + }, + { + "epoch": 0.08780468642586373, + "grad_norm": 7.231597900390625, + "learning_rate": 1.941904227052128e-05, + "loss": 
2.4866, + "step": 13970 + }, + { + "epoch": 0.08786753874256084, + "grad_norm": 6.747964859008789, + "learning_rate": 1.9418623169576623e-05, + "loss": 2.399, + "step": 13980 + }, + { + "epoch": 0.08793039105925796, + "grad_norm": 7.162939548492432, + "learning_rate": 1.941820406863197e-05, + "loss": 2.3983, + "step": 13990 + }, + { + "epoch": 0.08799324337595506, + "grad_norm": 6.220515727996826, + "learning_rate": 1.9417784967687317e-05, + "loss": 2.3261, + "step": 14000 + }, + { + "epoch": 0.08805609569265217, + "grad_norm": 7.695446968078613, + "learning_rate": 1.9417365866742664e-05, + "loss": 2.49, + "step": 14010 + }, + { + "epoch": 0.08811894800934929, + "grad_norm": 8.09644889831543, + "learning_rate": 1.941694676579801e-05, + "loss": 2.3648, + "step": 14020 + }, + { + "epoch": 0.08818180032604639, + "grad_norm": 6.877342224121094, + "learning_rate": 1.941652766485336e-05, + "loss": 2.088, + "step": 14030 + }, + { + "epoch": 0.0882446526427435, + "grad_norm": 6.628556251525879, + "learning_rate": 1.9416108563908706e-05, + "loss": 2.3511, + "step": 14040 + }, + { + "epoch": 0.08830750495944062, + "grad_norm": 7.792478084564209, + "learning_rate": 1.9415689462964053e-05, + "loss": 2.3154, + "step": 14050 + }, + { + "epoch": 0.08837035727613772, + "grad_norm": 7.314945697784424, + "learning_rate": 1.94152703620194e-05, + "loss": 2.6679, + "step": 14060 + }, + { + "epoch": 0.08843320959283484, + "grad_norm": 6.940366268157959, + "learning_rate": 1.9414851261074743e-05, + "loss": 2.3944, + "step": 14070 + }, + { + "epoch": 0.08849606190953195, + "grad_norm": 5.507546901702881, + "learning_rate": 1.941443216013009e-05, + "loss": 1.9258, + "step": 14080 + }, + { + "epoch": 0.08855891422622905, + "grad_norm": 7.643945217132568, + "learning_rate": 1.9414013059185438e-05, + "loss": 2.4071, + "step": 14090 + }, + { + "epoch": 0.08862176654292617, + "grad_norm": 7.493567943572998, + "learning_rate": 1.9413593958240785e-05, + "loss": 2.4666, + "step": 14100 + }, + { + 
"epoch": 0.08868461885962328, + "grad_norm": 9.023538589477539, + "learning_rate": 1.941317485729613e-05, + "loss": 2.173, + "step": 14110 + }, + { + "epoch": 0.08874747117632038, + "grad_norm": 7.049037456512451, + "learning_rate": 1.9412755756351475e-05, + "loss": 2.4321, + "step": 14120 + }, + { + "epoch": 0.0888103234930175, + "grad_norm": 7.310028076171875, + "learning_rate": 1.9412336655406822e-05, + "loss": 2.2373, + "step": 14130 + }, + { + "epoch": 0.08887317580971461, + "grad_norm": 8.413848876953125, + "learning_rate": 1.941191755446217e-05, + "loss": 2.3431, + "step": 14140 + }, + { + "epoch": 0.08893602812641173, + "grad_norm": 8.686189651489258, + "learning_rate": 1.9411498453517517e-05, + "loss": 2.4907, + "step": 14150 + }, + { + "epoch": 0.08899888044310883, + "grad_norm": 7.3697967529296875, + "learning_rate": 1.941107935257286e-05, + "loss": 2.3139, + "step": 14160 + }, + { + "epoch": 0.08906173275980594, + "grad_norm": 7.831506729125977, + "learning_rate": 1.9410660251628207e-05, + "loss": 2.5202, + "step": 14170 + }, + { + "epoch": 0.08912458507650306, + "grad_norm": 8.068471908569336, + "learning_rate": 1.9410241150683554e-05, + "loss": 2.4672, + "step": 14180 + }, + { + "epoch": 0.08918743739320016, + "grad_norm": 8.44082260131836, + "learning_rate": 1.94098220497389e-05, + "loss": 2.358, + "step": 14190 + }, + { + "epoch": 0.08925028970989728, + "grad_norm": 7.093486309051514, + "learning_rate": 1.940940294879425e-05, + "loss": 2.453, + "step": 14200 + }, + { + "epoch": 0.08931314202659439, + "grad_norm": 8.337098121643066, + "learning_rate": 1.9408983847849596e-05, + "loss": 2.4123, + "step": 14210 + }, + { + "epoch": 0.08937599434329149, + "grad_norm": 6.9383087158203125, + "learning_rate": 1.940856474690494e-05, + "loss": 2.3185, + "step": 14220 + }, + { + "epoch": 0.08943884665998861, + "grad_norm": 7.915830135345459, + "learning_rate": 1.9408145645960286e-05, + "loss": 2.6617, + "step": 14230 + }, + { + "epoch": 0.08950169897668572, + 
"grad_norm": 8.545634269714355, + "learning_rate": 1.9407726545015633e-05, + "loss": 2.4342, + "step": 14240 + }, + { + "epoch": 0.08956455129338282, + "grad_norm": 8.853870391845703, + "learning_rate": 1.940730744407098e-05, + "loss": 2.4135, + "step": 14250 + }, + { + "epoch": 0.08962740361007994, + "grad_norm": 8.391120910644531, + "learning_rate": 1.9406888343126328e-05, + "loss": 2.3313, + "step": 14260 + }, + { + "epoch": 0.08969025592677705, + "grad_norm": 7.608756065368652, + "learning_rate": 1.9406469242181675e-05, + "loss": 2.2267, + "step": 14270 + }, + { + "epoch": 0.08975310824347416, + "grad_norm": 7.681011199951172, + "learning_rate": 1.9406050141237022e-05, + "loss": 2.5206, + "step": 14280 + }, + { + "epoch": 0.08981596056017127, + "grad_norm": 7.436078071594238, + "learning_rate": 1.9405631040292365e-05, + "loss": 2.4975, + "step": 14290 + }, + { + "epoch": 0.08987881287686839, + "grad_norm": 6.741333484649658, + "learning_rate": 1.9405211939347712e-05, + "loss": 2.4191, + "step": 14300 + }, + { + "epoch": 0.0899416651935655, + "grad_norm": 7.723337173461914, + "learning_rate": 1.940479283840306e-05, + "loss": 2.4078, + "step": 14310 + }, + { + "epoch": 0.0900045175102626, + "grad_norm": 6.964266300201416, + "learning_rate": 1.9404373737458407e-05, + "loss": 2.3508, + "step": 14320 + }, + { + "epoch": 0.09006736982695972, + "grad_norm": 8.015843391418457, + "learning_rate": 1.9403954636513754e-05, + "loss": 2.3937, + "step": 14330 + }, + { + "epoch": 0.09013022214365683, + "grad_norm": 7.15214729309082, + "learning_rate": 1.9403535535569097e-05, + "loss": 2.3037, + "step": 14340 + }, + { + "epoch": 0.09019307446035393, + "grad_norm": 6.9445648193359375, + "learning_rate": 1.9403116434624444e-05, + "loss": 2.343, + "step": 14350 + }, + { + "epoch": 0.09025592677705105, + "grad_norm": 6.637543201446533, + "learning_rate": 1.940269733367979e-05, + "loss": 2.4491, + "step": 14360 + }, + { + "epoch": 0.09031877909374816, + "grad_norm": 
7.8735551834106445, + "learning_rate": 1.940227823273514e-05, + "loss": 2.3134, + "step": 14370 + }, + { + "epoch": 0.09038163141044527, + "grad_norm": 7.348316669464111, + "learning_rate": 1.9401859131790482e-05, + "loss": 2.5632, + "step": 14380 + }, + { + "epoch": 0.09044448372714238, + "grad_norm": 7.872707366943359, + "learning_rate": 1.940144003084583e-05, + "loss": 2.2116, + "step": 14390 + }, + { + "epoch": 0.0905073360438395, + "grad_norm": 7.157912254333496, + "learning_rate": 1.9401020929901176e-05, + "loss": 2.2396, + "step": 14400 + }, + { + "epoch": 0.0905701883605366, + "grad_norm": 7.548715591430664, + "learning_rate": 1.9400601828956523e-05, + "loss": 2.608, + "step": 14410 + }, + { + "epoch": 0.09063304067723371, + "grad_norm": 7.2458014488220215, + "learning_rate": 1.940018272801187e-05, + "loss": 2.3841, + "step": 14420 + }, + { + "epoch": 0.09069589299393083, + "grad_norm": 7.721854209899902, + "learning_rate": 1.9399763627067218e-05, + "loss": 2.2875, + "step": 14430 + }, + { + "epoch": 0.09075874531062793, + "grad_norm": 7.985480785369873, + "learning_rate": 1.9399344526122565e-05, + "loss": 2.4722, + "step": 14440 + }, + { + "epoch": 0.09082159762732504, + "grad_norm": 7.973264694213867, + "learning_rate": 1.9398925425177912e-05, + "loss": 2.2483, + "step": 14450 + }, + { + "epoch": 0.09088444994402216, + "grad_norm": 7.601576805114746, + "learning_rate": 1.939850632423326e-05, + "loss": 2.1748, + "step": 14460 + }, + { + "epoch": 0.09094730226071926, + "grad_norm": 7.965937614440918, + "learning_rate": 1.9398087223288603e-05, + "loss": 2.4475, + "step": 14470 + }, + { + "epoch": 0.09101015457741637, + "grad_norm": 7.92208194732666, + "learning_rate": 1.939766812234395e-05, + "loss": 2.2445, + "step": 14480 + }, + { + "epoch": 0.09107300689411349, + "grad_norm": 7.090691566467285, + "learning_rate": 1.9397249021399297e-05, + "loss": 2.4417, + "step": 14490 + }, + { + "epoch": 0.0911358592108106, + "grad_norm": 7.450453281402588, + 
"learning_rate": 1.9396829920454644e-05, + "loss": 2.2301, + "step": 14500 + }, + { + "epoch": 0.0911987115275077, + "grad_norm": 8.080689430236816, + "learning_rate": 1.939641081950999e-05, + "loss": 2.2355, + "step": 14510 + }, + { + "epoch": 0.09126156384420482, + "grad_norm": 8.256315231323242, + "learning_rate": 1.9395991718565334e-05, + "loss": 2.2679, + "step": 14520 + }, + { + "epoch": 0.09132441616090194, + "grad_norm": 8.130385398864746, + "learning_rate": 1.939557261762068e-05, + "loss": 2.4544, + "step": 14530 + }, + { + "epoch": 0.09138726847759904, + "grad_norm": 8.67445182800293, + "learning_rate": 1.939515351667603e-05, + "loss": 2.3577, + "step": 14540 + }, + { + "epoch": 0.09145012079429615, + "grad_norm": 7.793080806732178, + "learning_rate": 1.9394734415731376e-05, + "loss": 2.298, + "step": 14550 + }, + { + "epoch": 0.09151297311099327, + "grad_norm": 9.277105331420898, + "learning_rate": 1.939431531478672e-05, + "loss": 2.3302, + "step": 14560 + }, + { + "epoch": 0.09157582542769037, + "grad_norm": 7.196310043334961, + "learning_rate": 1.9393896213842066e-05, + "loss": 2.3304, + "step": 14570 + }, + { + "epoch": 0.09163867774438748, + "grad_norm": 6.744382381439209, + "learning_rate": 1.9393477112897414e-05, + "loss": 2.3452, + "step": 14580 + }, + { + "epoch": 0.0917015300610846, + "grad_norm": 7.582704067230225, + "learning_rate": 1.939305801195276e-05, + "loss": 2.5666, + "step": 14590 + }, + { + "epoch": 0.0917643823777817, + "grad_norm": 10.222574234008789, + "learning_rate": 1.9392638911008104e-05, + "loss": 2.466, + "step": 14600 + }, + { + "epoch": 0.09182723469447882, + "grad_norm": 7.608969688415527, + "learning_rate": 1.939221981006345e-05, + "loss": 2.2171, + "step": 14610 + }, + { + "epoch": 0.09189008701117593, + "grad_norm": 8.202703475952148, + "learning_rate": 1.93918007091188e-05, + "loss": 2.4784, + "step": 14620 + }, + { + "epoch": 0.09195293932787303, + "grad_norm": 7.415685176849365, + "learning_rate": 
1.9391381608174145e-05, + "loss": 2.1183, + "step": 14630 + }, + { + "epoch": 0.09201579164457015, + "grad_norm": 8.944314002990723, + "learning_rate": 1.9390962507229493e-05, + "loss": 2.3361, + "step": 14640 + }, + { + "epoch": 0.09207864396126726, + "grad_norm": 6.944596767425537, + "learning_rate": 1.939054340628484e-05, + "loss": 2.5046, + "step": 14650 + }, + { + "epoch": 0.09214149627796438, + "grad_norm": 7.5702128410339355, + "learning_rate": 1.9390124305340187e-05, + "loss": 2.1109, + "step": 14660 + }, + { + "epoch": 0.09220434859466148, + "grad_norm": 8.582987785339355, + "learning_rate": 1.9389705204395534e-05, + "loss": 2.3905, + "step": 14670 + }, + { + "epoch": 0.0922672009113586, + "grad_norm": 8.8607177734375, + "learning_rate": 1.938928610345088e-05, + "loss": 2.3118, + "step": 14680 + }, + { + "epoch": 0.09233005322805571, + "grad_norm": 8.889832496643066, + "learning_rate": 1.9388867002506225e-05, + "loss": 2.511, + "step": 14690 + }, + { + "epoch": 0.09239290554475281, + "grad_norm": 7.495224475860596, + "learning_rate": 1.938844790156157e-05, + "loss": 2.4922, + "step": 14700 + }, + { + "epoch": 0.09245575786144992, + "grad_norm": 7.616922855377197, + "learning_rate": 1.938802880061692e-05, + "loss": 2.3211, + "step": 14710 + }, + { + "epoch": 0.09251861017814704, + "grad_norm": 9.176539421081543, + "learning_rate": 1.9387609699672266e-05, + "loss": 2.4175, + "step": 14720 + }, + { + "epoch": 0.09258146249484414, + "grad_norm": 8.710726737976074, + "learning_rate": 1.9387190598727613e-05, + "loss": 2.34, + "step": 14730 + }, + { + "epoch": 0.09264431481154126, + "grad_norm": 7.268640995025635, + "learning_rate": 1.9386771497782956e-05, + "loss": 2.5475, + "step": 14740 + }, + { + "epoch": 0.09270716712823837, + "grad_norm": 8.120370864868164, + "learning_rate": 1.9386352396838304e-05, + "loss": 2.25, + "step": 14750 + }, + { + "epoch": 0.09277001944493547, + "grad_norm": 8.488015174865723, + "learning_rate": 1.938593329589365e-05, + "loss": 
2.2392, + "step": 14760 + }, + { + "epoch": 0.09283287176163259, + "grad_norm": 7.086648464202881, + "learning_rate": 1.9385514194948998e-05, + "loss": 2.5415, + "step": 14770 + }, + { + "epoch": 0.0928957240783297, + "grad_norm": 7.34189510345459, + "learning_rate": 1.938509509400434e-05, + "loss": 2.1996, + "step": 14780 + }, + { + "epoch": 0.0929585763950268, + "grad_norm": 7.8050079345703125, + "learning_rate": 1.938467599305969e-05, + "loss": 2.3005, + "step": 14790 + }, + { + "epoch": 0.09302142871172392, + "grad_norm": 9.377252578735352, + "learning_rate": 1.9384256892115036e-05, + "loss": 2.3329, + "step": 14800 + }, + { + "epoch": 0.09308428102842103, + "grad_norm": 8.01342487335205, + "learning_rate": 1.9383837791170383e-05, + "loss": 2.3466, + "step": 14810 + }, + { + "epoch": 0.09314713334511815, + "grad_norm": 6.556551933288574, + "learning_rate": 1.938341869022573e-05, + "loss": 2.1089, + "step": 14820 + }, + { + "epoch": 0.09320998566181525, + "grad_norm": 7.6242146492004395, + "learning_rate": 1.9382999589281077e-05, + "loss": 2.5015, + "step": 14830 + }, + { + "epoch": 0.09327283797851237, + "grad_norm": 6.905308246612549, + "learning_rate": 1.9382580488336424e-05, + "loss": 2.6101, + "step": 14840 + }, + { + "epoch": 0.09333569029520948, + "grad_norm": 8.105611801147461, + "learning_rate": 1.9382161387391767e-05, + "loss": 2.3652, + "step": 14850 + }, + { + "epoch": 0.09339854261190658, + "grad_norm": 8.25787353515625, + "learning_rate": 1.9381742286447115e-05, + "loss": 2.1882, + "step": 14860 + }, + { + "epoch": 0.0934613949286037, + "grad_norm": 7.428998947143555, + "learning_rate": 1.938132318550246e-05, + "loss": 2.31, + "step": 14870 + }, + { + "epoch": 0.09352424724530081, + "grad_norm": 6.790313720703125, + "learning_rate": 1.938090408455781e-05, + "loss": 2.3148, + "step": 14880 + }, + { + "epoch": 0.09358709956199791, + "grad_norm": 7.349525451660156, + "learning_rate": 1.9380484983613156e-05, + "loss": 2.418, + "step": 14890 + }, + { + 
"epoch": 0.09364995187869503, + "grad_norm": 8.276445388793945, + "learning_rate": 1.9380065882668503e-05, + "loss": 2.3691, + "step": 14900 + }, + { + "epoch": 0.09371280419539214, + "grad_norm": 7.776458740234375, + "learning_rate": 1.9379646781723847e-05, + "loss": 2.4167, + "step": 14910 + }, + { + "epoch": 0.09377565651208924, + "grad_norm": 7.6309990882873535, + "learning_rate": 1.9379227680779194e-05, + "loss": 2.3009, + "step": 14920 + }, + { + "epoch": 0.09383850882878636, + "grad_norm": 7.005548477172852, + "learning_rate": 1.937880857983454e-05, + "loss": 2.1872, + "step": 14930 + }, + { + "epoch": 0.09390136114548348, + "grad_norm": 8.230241775512695, + "learning_rate": 1.9378389478889888e-05, + "loss": 2.2369, + "step": 14940 + }, + { + "epoch": 0.09396421346218058, + "grad_norm": 8.217141151428223, + "learning_rate": 1.9377970377945235e-05, + "loss": 2.2704, + "step": 14950 + }, + { + "epoch": 0.09402706577887769, + "grad_norm": 6.3729047775268555, + "learning_rate": 1.937755127700058e-05, + "loss": 2.0222, + "step": 14960 + }, + { + "epoch": 0.0940899180955748, + "grad_norm": 9.567404747009277, + "learning_rate": 1.9377132176055926e-05, + "loss": 2.2782, + "step": 14970 + }, + { + "epoch": 0.09415277041227192, + "grad_norm": 7.393969535827637, + "learning_rate": 1.9376713075111273e-05, + "loss": 2.3824, + "step": 14980 + }, + { + "epoch": 0.09421562272896902, + "grad_norm": 8.631592750549316, + "learning_rate": 1.937629397416662e-05, + "loss": 2.2788, + "step": 14990 + }, + { + "epoch": 0.09427847504566614, + "grad_norm": 8.257186889648438, + "learning_rate": 1.9375874873221963e-05, + "loss": 2.2076, + "step": 15000 + }, + { + "epoch": 0.09434132736236325, + "grad_norm": 7.280744552612305, + "learning_rate": 1.937545577227731e-05, + "loss": 2.2362, + "step": 15010 + }, + { + "epoch": 0.09440417967906035, + "grad_norm": 7.483618259429932, + "learning_rate": 1.9375036671332658e-05, + "loss": 2.3, + "step": 15020 + }, + { + "epoch": 0.09446703199575747, 
+ "grad_norm": 10.175114631652832, + "learning_rate": 1.9374617570388005e-05, + "loss": 2.1979, + "step": 15030 + }, + { + "epoch": 0.09452988431245458, + "grad_norm": 8.764018058776855, + "learning_rate": 1.937419846944335e-05, + "loss": 2.4274, + "step": 15040 + }, + { + "epoch": 0.09459273662915169, + "grad_norm": 7.181217193603516, + "learning_rate": 1.93737793684987e-05, + "loss": 2.2674, + "step": 15050 + }, + { + "epoch": 0.0946555889458488, + "grad_norm": 7.179269790649414, + "learning_rate": 1.9373360267554046e-05, + "loss": 2.5718, + "step": 15060 + }, + { + "epoch": 0.09471844126254592, + "grad_norm": 6.558637619018555, + "learning_rate": 1.9372941166609393e-05, + "loss": 2.3464, + "step": 15070 + }, + { + "epoch": 0.09478129357924302, + "grad_norm": 8.1760835647583, + "learning_rate": 1.937252206566474e-05, + "loss": 2.2597, + "step": 15080 + }, + { + "epoch": 0.09484414589594013, + "grad_norm": 7.907341957092285, + "learning_rate": 1.9372102964720084e-05, + "loss": 2.3224, + "step": 15090 + }, + { + "epoch": 0.09490699821263725, + "grad_norm": 7.5614190101623535, + "learning_rate": 1.937168386377543e-05, + "loss": 2.3524, + "step": 15100 + }, + { + "epoch": 0.09496985052933435, + "grad_norm": 8.202812194824219, + "learning_rate": 1.9371264762830778e-05, + "loss": 2.4304, + "step": 15110 + }, + { + "epoch": 0.09503270284603146, + "grad_norm": 7.19970178604126, + "learning_rate": 1.9370845661886125e-05, + "loss": 2.3235, + "step": 15120 + }, + { + "epoch": 0.09509555516272858, + "grad_norm": 6.639101505279541, + "learning_rate": 1.9370426560941472e-05, + "loss": 2.2978, + "step": 15130 + }, + { + "epoch": 0.09515840747942568, + "grad_norm": 6.895039081573486, + "learning_rate": 1.9370007459996816e-05, + "loss": 2.4561, + "step": 15140 + }, + { + "epoch": 0.0952212597961228, + "grad_norm": 8.347858428955078, + "learning_rate": 1.9369588359052163e-05, + "loss": 2.0922, + "step": 15150 + }, + { + "epoch": 0.09528411211281991, + "grad_norm": 
7.56237268447876, + "learning_rate": 1.936916925810751e-05, + "loss": 2.2262, + "step": 15160 + }, + { + "epoch": 0.09534696442951703, + "grad_norm": 7.407289981842041, + "learning_rate": 1.9368750157162857e-05, + "loss": 2.5011, + "step": 15170 + }, + { + "epoch": 0.09540981674621413, + "grad_norm": 7.594213008880615, + "learning_rate": 1.93683310562182e-05, + "loss": 2.1702, + "step": 15180 + }, + { + "epoch": 0.09547266906291124, + "grad_norm": 8.105140686035156, + "learning_rate": 1.9367911955273548e-05, + "loss": 2.4739, + "step": 15190 + }, + { + "epoch": 0.09553552137960836, + "grad_norm": 8.515286445617676, + "learning_rate": 1.9367492854328895e-05, + "loss": 2.4776, + "step": 15200 + }, + { + "epoch": 0.09559837369630546, + "grad_norm": 8.611160278320312, + "learning_rate": 1.936707375338424e-05, + "loss": 2.2731, + "step": 15210 + }, + { + "epoch": 0.09566122601300257, + "grad_norm": 9.178365707397461, + "learning_rate": 1.936665465243959e-05, + "loss": 2.6187, + "step": 15220 + }, + { + "epoch": 0.09572407832969969, + "grad_norm": 8.396846771240234, + "learning_rate": 1.9366235551494932e-05, + "loss": 2.2943, + "step": 15230 + }, + { + "epoch": 0.09578693064639679, + "grad_norm": 6.751037120819092, + "learning_rate": 1.936581645055028e-05, + "loss": 2.1195, + "step": 15240 + }, + { + "epoch": 0.0958497829630939, + "grad_norm": 6.924027919769287, + "learning_rate": 1.9365397349605627e-05, + "loss": 2.2707, + "step": 15250 + }, + { + "epoch": 0.09591263527979102, + "grad_norm": 7.081989288330078, + "learning_rate": 1.9364978248660974e-05, + "loss": 2.3677, + "step": 15260 + }, + { + "epoch": 0.09597548759648812, + "grad_norm": 9.71423053741455, + "learning_rate": 1.936455914771632e-05, + "loss": 2.2383, + "step": 15270 + }, + { + "epoch": 0.09603833991318524, + "grad_norm": 8.501422882080078, + "learning_rate": 1.9364140046771668e-05, + "loss": 2.304, + "step": 15280 + }, + { + "epoch": 0.09610119222988235, + "grad_norm": 7.34773063659668, + 
"learning_rate": 1.9363720945827015e-05, + "loss": 2.3234, + "step": 15290 + }, + { + "epoch": 0.09616404454657945, + "grad_norm": 7.565731525421143, + "learning_rate": 1.9363301844882362e-05, + "loss": 2.3826, + "step": 15300 + }, + { + "epoch": 0.09622689686327657, + "grad_norm": 7.900417804718018, + "learning_rate": 1.9362882743937706e-05, + "loss": 2.1959, + "step": 15310 + }, + { + "epoch": 0.09628974917997368, + "grad_norm": 7.132443428039551, + "learning_rate": 1.9362463642993053e-05, + "loss": 2.0233, + "step": 15320 + }, + { + "epoch": 0.0963526014966708, + "grad_norm": 8.974869728088379, + "learning_rate": 1.93620445420484e-05, + "loss": 2.2347, + "step": 15330 + }, + { + "epoch": 0.0964154538133679, + "grad_norm": 6.8141188621521, + "learning_rate": 1.9361625441103747e-05, + "loss": 2.3844, + "step": 15340 + }, + { + "epoch": 0.09647830613006501, + "grad_norm": 7.523212432861328, + "learning_rate": 1.9361206340159094e-05, + "loss": 2.4004, + "step": 15350 + }, + { + "epoch": 0.09654115844676213, + "grad_norm": 8.011374473571777, + "learning_rate": 1.9360787239214438e-05, + "loss": 2.2032, + "step": 15360 + }, + { + "epoch": 0.09660401076345923, + "grad_norm": 7.110884666442871, + "learning_rate": 1.9360368138269785e-05, + "loss": 2.2476, + "step": 15370 + }, + { + "epoch": 0.09666686308015635, + "grad_norm": 7.431973934173584, + "learning_rate": 1.9359949037325132e-05, + "loss": 2.2006, + "step": 15380 + }, + { + "epoch": 0.09672971539685346, + "grad_norm": 7.817060470581055, + "learning_rate": 1.935952993638048e-05, + "loss": 2.4115, + "step": 15390 + }, + { + "epoch": 0.09679256771355056, + "grad_norm": 7.787459373474121, + "learning_rate": 1.9359110835435822e-05, + "loss": 2.3382, + "step": 15400 + }, + { + "epoch": 0.09685542003024768, + "grad_norm": 6.05074405670166, + "learning_rate": 1.935869173449117e-05, + "loss": 2.1408, + "step": 15410 + }, + { + "epoch": 0.09691827234694479, + "grad_norm": 8.650323867797852, + "learning_rate": 
1.9358272633546517e-05, + "loss": 2.4014, + "step": 15420 + }, + { + "epoch": 0.0969811246636419, + "grad_norm": 8.57004451751709, + "learning_rate": 1.9357853532601864e-05, + "loss": 2.2718, + "step": 15430 + }, + { + "epoch": 0.09704397698033901, + "grad_norm": 8.429915428161621, + "learning_rate": 1.935743443165721e-05, + "loss": 2.2648, + "step": 15440 + }, + { + "epoch": 0.09710682929703612, + "grad_norm": 7.5032148361206055, + "learning_rate": 1.9357015330712558e-05, + "loss": 2.3131, + "step": 15450 + }, + { + "epoch": 0.09716968161373322, + "grad_norm": 8.546192169189453, + "learning_rate": 1.9356596229767905e-05, + "loss": 2.3559, + "step": 15460 + }, + { + "epoch": 0.09723253393043034, + "grad_norm": 8.149224281311035, + "learning_rate": 1.9356177128823252e-05, + "loss": 2.4356, + "step": 15470 + }, + { + "epoch": 0.09729538624712745, + "grad_norm": 8.380104064941406, + "learning_rate": 1.9355758027878596e-05, + "loss": 2.2616, + "step": 15480 + }, + { + "epoch": 0.09735823856382457, + "grad_norm": 7.752674102783203, + "learning_rate": 1.9355338926933943e-05, + "loss": 2.2147, + "step": 15490 + }, + { + "epoch": 0.09742109088052167, + "grad_norm": 7.410440444946289, + "learning_rate": 1.935491982598929e-05, + "loss": 2.0358, + "step": 15500 + }, + { + "epoch": 0.09748394319721879, + "grad_norm": 9.173617362976074, + "learning_rate": 1.9354500725044637e-05, + "loss": 2.7082, + "step": 15510 + }, + { + "epoch": 0.0975467955139159, + "grad_norm": 7.036073684692383, + "learning_rate": 1.9354081624099984e-05, + "loss": 2.2567, + "step": 15520 + }, + { + "epoch": 0.097609647830613, + "grad_norm": 8.036812782287598, + "learning_rate": 1.9353662523155328e-05, + "loss": 2.2407, + "step": 15530 + }, + { + "epoch": 0.09767250014731012, + "grad_norm": 8.202893257141113, + "learning_rate": 1.9353243422210675e-05, + "loss": 2.1459, + "step": 15540 + }, + { + "epoch": 0.09773535246400723, + "grad_norm": 8.069165229797363, + "learning_rate": 1.9352824321266022e-05, + 
"loss": 2.4264, + "step": 15550 + }, + { + "epoch": 0.09779820478070433, + "grad_norm": 7.467092990875244, + "learning_rate": 1.935240522032137e-05, + "loss": 2.2151, + "step": 15560 + }, + { + "epoch": 0.09786105709740145, + "grad_norm": 7.783108711242676, + "learning_rate": 1.9351986119376716e-05, + "loss": 2.2198, + "step": 15570 + }, + { + "epoch": 0.09792390941409856, + "grad_norm": 8.802495002746582, + "learning_rate": 1.935156701843206e-05, + "loss": 2.4794, + "step": 15580 + }, + { + "epoch": 0.09798676173079567, + "grad_norm": 8.029830932617188, + "learning_rate": 1.9351147917487407e-05, + "loss": 2.4849, + "step": 15590 + }, + { + "epoch": 0.09804961404749278, + "grad_norm": 6.598846435546875, + "learning_rate": 1.9350728816542754e-05, + "loss": 2.238, + "step": 15600 + }, + { + "epoch": 0.0981124663641899, + "grad_norm": 7.8369879722595215, + "learning_rate": 1.93503097155981e-05, + "loss": 2.3688, + "step": 15610 + }, + { + "epoch": 0.098175318680887, + "grad_norm": 9.714192390441895, + "learning_rate": 1.9349890614653444e-05, + "loss": 2.3551, + "step": 15620 + }, + { + "epoch": 0.09823817099758411, + "grad_norm": 7.944300651550293, + "learning_rate": 1.934947151370879e-05, + "loss": 2.2602, + "step": 15630 + }, + { + "epoch": 0.09830102331428123, + "grad_norm": 6.619028091430664, + "learning_rate": 1.934905241276414e-05, + "loss": 2.215, + "step": 15640 + }, + { + "epoch": 0.09836387563097834, + "grad_norm": 8.038568496704102, + "learning_rate": 1.9348633311819486e-05, + "loss": 2.3041, + "step": 15650 + }, + { + "epoch": 0.09842672794767544, + "grad_norm": 8.816732406616211, + "learning_rate": 1.9348214210874833e-05, + "loss": 2.4387, + "step": 15660 + }, + { + "epoch": 0.09848958026437256, + "grad_norm": 7.7710490226745605, + "learning_rate": 1.934779510993018e-05, + "loss": 2.2581, + "step": 15670 + }, + { + "epoch": 0.09855243258106967, + "grad_norm": 7.502630233764648, + "learning_rate": 1.9347376008985527e-05, + "loss": 2.2776, + "step": 15680 + 
}, + { + "epoch": 0.09861528489776678, + "grad_norm": 7.6552276611328125, + "learning_rate": 1.9346956908040874e-05, + "loss": 2.0136, + "step": 15690 + }, + { + "epoch": 0.09867813721446389, + "grad_norm": 7.770573616027832, + "learning_rate": 1.934653780709622e-05, + "loss": 2.2089, + "step": 15700 + }, + { + "epoch": 0.098740989531161, + "grad_norm": 7.82880973815918, + "learning_rate": 1.9346118706151565e-05, + "loss": 2.4368, + "step": 15710 + }, + { + "epoch": 0.0988038418478581, + "grad_norm": 6.880098342895508, + "learning_rate": 1.9345699605206912e-05, + "loss": 2.0097, + "step": 15720 + }, + { + "epoch": 0.09886669416455522, + "grad_norm": 7.838752269744873, + "learning_rate": 1.934528050426226e-05, + "loss": 2.1871, + "step": 15730 + }, + { + "epoch": 0.09892954648125234, + "grad_norm": 6.527927875518799, + "learning_rate": 1.9344861403317606e-05, + "loss": 2.0773, + "step": 15740 + }, + { + "epoch": 0.09899239879794944, + "grad_norm": 7.516666412353516, + "learning_rate": 1.9344442302372953e-05, + "loss": 2.4226, + "step": 15750 + }, + { + "epoch": 0.09905525111464655, + "grad_norm": 7.703370571136475, + "learning_rate": 1.9344023201428297e-05, + "loss": 2.1255, + "step": 15760 + }, + { + "epoch": 0.09911810343134367, + "grad_norm": 7.8158440589904785, + "learning_rate": 1.9343604100483644e-05, + "loss": 2.2532, + "step": 15770 + }, + { + "epoch": 0.09918095574804077, + "grad_norm": 7.7812089920043945, + "learning_rate": 1.934318499953899e-05, + "loss": 2.3821, + "step": 15780 + }, + { + "epoch": 0.09924380806473788, + "grad_norm": 6.937283992767334, + "learning_rate": 1.9342765898594338e-05, + "loss": 2.2079, + "step": 15790 + }, + { + "epoch": 0.099306660381435, + "grad_norm": 7.606636047363281, + "learning_rate": 1.934234679764968e-05, + "loss": 2.2399, + "step": 15800 + }, + { + "epoch": 0.0993695126981321, + "grad_norm": 7.299315452575684, + "learning_rate": 1.934192769670503e-05, + "loss": 2.1152, + "step": 15810 + }, + { + "epoch": 
0.09943236501482922, + "grad_norm": 6.596746921539307, + "learning_rate": 1.9341508595760376e-05, + "loss": 2.1512, + "step": 15820 + }, + { + "epoch": 0.09949521733152633, + "grad_norm": 5.329288482666016, + "learning_rate": 1.9341089494815723e-05, + "loss": 2.0065, + "step": 15830 + }, + { + "epoch": 0.09955806964822345, + "grad_norm": 9.746723175048828, + "learning_rate": 1.934067039387107e-05, + "loss": 2.4375, + "step": 15840 + }, + { + "epoch": 0.09962092196492055, + "grad_norm": 8.963370323181152, + "learning_rate": 1.9340251292926414e-05, + "loss": 2.4851, + "step": 15850 + }, + { + "epoch": 0.09968377428161766, + "grad_norm": 6.951449394226074, + "learning_rate": 1.933983219198176e-05, + "loss": 2.5821, + "step": 15860 + }, + { + "epoch": 0.09974662659831478, + "grad_norm": 8.524133682250977, + "learning_rate": 1.9339413091037108e-05, + "loss": 2.4172, + "step": 15870 + }, + { + "epoch": 0.09980947891501188, + "grad_norm": 8.179025650024414, + "learning_rate": 1.9338993990092455e-05, + "loss": 2.2555, + "step": 15880 + }, + { + "epoch": 0.099872331231709, + "grad_norm": 9.563819885253906, + "learning_rate": 1.9338574889147802e-05, + "loss": 2.3097, + "step": 15890 + }, + { + "epoch": 0.09993518354840611, + "grad_norm": 7.194939136505127, + "learning_rate": 1.933815578820315e-05, + "loss": 2.4191, + "step": 15900 + }, + { + "epoch": 0.09999803586510321, + "grad_norm": 7.345890522003174, + "learning_rate": 1.9337736687258496e-05, + "loss": 2.4248, + "step": 15910 + }, + { + "epoch": 0.10006088818180033, + "grad_norm": 7.661227226257324, + "learning_rate": 1.9337317586313843e-05, + "loss": 2.3021, + "step": 15920 + }, + { + "epoch": 0.10012374049849744, + "grad_norm": 7.1798415184021, + "learning_rate": 1.9336898485369187e-05, + "loss": 2.1467, + "step": 15930 + }, + { + "epoch": 0.10018659281519454, + "grad_norm": 6.5734992027282715, + "learning_rate": 1.9336479384424534e-05, + "loss": 2.2988, + "step": 15940 + }, + { + "epoch": 0.10024944513189166, + 
"grad_norm": 7.714196681976318, + "learning_rate": 1.933606028347988e-05, + "loss": 2.3623, + "step": 15950 + }, + { + "epoch": 0.10031229744858877, + "grad_norm": 7.4762701988220215, + "learning_rate": 1.9335641182535228e-05, + "loss": 2.2166, + "step": 15960 + }, + { + "epoch": 0.10037514976528587, + "grad_norm": 8.928500175476074, + "learning_rate": 1.9335222081590575e-05, + "loss": 2.2144, + "step": 15970 + }, + { + "epoch": 0.10043800208198299, + "grad_norm": 8.641223907470703, + "learning_rate": 1.933480298064592e-05, + "loss": 2.1497, + "step": 15980 + }, + { + "epoch": 0.1005008543986801, + "grad_norm": 7.675530433654785, + "learning_rate": 1.9334383879701266e-05, + "loss": 2.3305, + "step": 15990 + }, + { + "epoch": 0.10056370671537722, + "grad_norm": 6.492575645446777, + "learning_rate": 1.9333964778756613e-05, + "loss": 2.1332, + "step": 16000 + }, + { + "epoch": 0.10062655903207432, + "grad_norm": 7.188532829284668, + "learning_rate": 1.933354567781196e-05, + "loss": 2.1893, + "step": 16010 + }, + { + "epoch": 0.10068941134877143, + "grad_norm": 8.255434036254883, + "learning_rate": 1.9333126576867304e-05, + "loss": 2.3866, + "step": 16020 + }, + { + "epoch": 0.10075226366546855, + "grad_norm": 6.916716575622559, + "learning_rate": 1.933270747592265e-05, + "loss": 2.3914, + "step": 16030 + }, + { + "epoch": 0.10081511598216565, + "grad_norm": 8.140775680541992, + "learning_rate": 1.9332288374977998e-05, + "loss": 2.3892, + "step": 16040 + }, + { + "epoch": 0.10087796829886277, + "grad_norm": 8.864714622497559, + "learning_rate": 1.9331869274033345e-05, + "loss": 2.478, + "step": 16050 + }, + { + "epoch": 0.10094082061555988, + "grad_norm": 8.247222900390625, + "learning_rate": 1.9331450173088692e-05, + "loss": 2.2977, + "step": 16060 + }, + { + "epoch": 0.10100367293225698, + "grad_norm": 7.6356306076049805, + "learning_rate": 1.933103107214404e-05, + "loss": 2.2705, + "step": 16070 + }, + { + "epoch": 0.1010665252489541, + "grad_norm": 
8.452744483947754, + "learning_rate": 1.9330611971199386e-05, + "loss": 2.1171, + "step": 16080 + }, + { + "epoch": 0.10112937756565121, + "grad_norm": 7.3948869705200195, + "learning_rate": 1.9330192870254733e-05, + "loss": 2.2588, + "step": 16090 + }, + { + "epoch": 0.10119222988234831, + "grad_norm": 7.8918280601501465, + "learning_rate": 1.9329773769310077e-05, + "loss": 2.0473, + "step": 16100 + }, + { + "epoch": 0.10125508219904543, + "grad_norm": 7.246849060058594, + "learning_rate": 1.9329354668365424e-05, + "loss": 2.4762, + "step": 16110 + }, + { + "epoch": 0.10131793451574254, + "grad_norm": 12.090804100036621, + "learning_rate": 1.932893556742077e-05, + "loss": 2.3217, + "step": 16120 + }, + { + "epoch": 0.10138078683243965, + "grad_norm": 6.78463077545166, + "learning_rate": 1.9328516466476118e-05, + "loss": 2.2876, + "step": 16130 + }, + { + "epoch": 0.10144363914913676, + "grad_norm": 7.757300853729248, + "learning_rate": 1.9328097365531465e-05, + "loss": 2.2388, + "step": 16140 + }, + { + "epoch": 0.10150649146583388, + "grad_norm": 7.6502861976623535, + "learning_rate": 1.9327678264586812e-05, + "loss": 2.1883, + "step": 16150 + }, + { + "epoch": 0.10156934378253099, + "grad_norm": 7.231910705566406, + "learning_rate": 1.9327259163642156e-05, + "loss": 2.3303, + "step": 16160 + }, + { + "epoch": 0.10163219609922809, + "grad_norm": 6.886990070343018, + "learning_rate": 1.9326840062697503e-05, + "loss": 2.3253, + "step": 16170 + }, + { + "epoch": 0.10169504841592521, + "grad_norm": 7.37526273727417, + "learning_rate": 1.932642096175285e-05, + "loss": 2.3775, + "step": 16180 + }, + { + "epoch": 0.10175790073262232, + "grad_norm": 7.015676498413086, + "learning_rate": 1.9326001860808197e-05, + "loss": 2.4584, + "step": 16190 + }, + { + "epoch": 0.10182075304931942, + "grad_norm": 8.19909381866455, + "learning_rate": 1.932558275986354e-05, + "loss": 2.3381, + "step": 16200 + }, + { + "epoch": 0.10188360536601654, + "grad_norm": 7.600981712341309, + 
"learning_rate": 1.9325163658918888e-05, + "loss": 2.3598, + "step": 16210 + }, + { + "epoch": 0.10194645768271365, + "grad_norm": 7.685189723968506, + "learning_rate": 1.9324744557974235e-05, + "loss": 2.1358, + "step": 16220 + }, + { + "epoch": 0.10200930999941075, + "grad_norm": 6.872974872589111, + "learning_rate": 1.9324325457029582e-05, + "loss": 2.0541, + "step": 16230 + }, + { + "epoch": 0.10207216231610787, + "grad_norm": 8.120587348937988, + "learning_rate": 1.9323906356084926e-05, + "loss": 2.3747, + "step": 16240 + }, + { + "epoch": 0.10213501463280499, + "grad_norm": 7.509024143218994, + "learning_rate": 1.9323487255140273e-05, + "loss": 2.3145, + "step": 16250 + }, + { + "epoch": 0.10219786694950209, + "grad_norm": 8.789929389953613, + "learning_rate": 1.932306815419562e-05, + "loss": 2.0984, + "step": 16260 + }, + { + "epoch": 0.1022607192661992, + "grad_norm": 6.380049705505371, + "learning_rate": 1.9322649053250967e-05, + "loss": 2.3583, + "step": 16270 + }, + { + "epoch": 0.10232357158289632, + "grad_norm": 8.88233757019043, + "learning_rate": 1.9322229952306314e-05, + "loss": 2.6463, + "step": 16280 + }, + { + "epoch": 0.10238642389959342, + "grad_norm": 6.933871269226074, + "learning_rate": 1.932181085136166e-05, + "loss": 2.2953, + "step": 16290 + }, + { + "epoch": 0.10244927621629053, + "grad_norm": 7.950907230377197, + "learning_rate": 1.9321391750417008e-05, + "loss": 2.3062, + "step": 16300 + }, + { + "epoch": 0.10251212853298765, + "grad_norm": 6.583611011505127, + "learning_rate": 1.9320972649472355e-05, + "loss": 2.2439, + "step": 16310 + }, + { + "epoch": 0.10257498084968476, + "grad_norm": 7.193686485290527, + "learning_rate": 1.9320553548527702e-05, + "loss": 2.1577, + "step": 16320 + }, + { + "epoch": 0.10263783316638186, + "grad_norm": 9.45664119720459, + "learning_rate": 1.9320134447583046e-05, + "loss": 2.372, + "step": 16330 + }, + { + "epoch": 0.10270068548307898, + "grad_norm": 7.178765773773193, + "learning_rate": 
1.9319715346638393e-05, + "loss": 2.2334, + "step": 16340 + }, + { + "epoch": 0.1027635377997761, + "grad_norm": 9.265113830566406, + "learning_rate": 1.931929624569374e-05, + "loss": 2.3851, + "step": 16350 + }, + { + "epoch": 0.1028263901164732, + "grad_norm": 10.580037117004395, + "learning_rate": 1.9318877144749087e-05, + "loss": 2.2099, + "step": 16360 + }, + { + "epoch": 0.10288924243317031, + "grad_norm": 8.77173137664795, + "learning_rate": 1.9318458043804434e-05, + "loss": 2.0985, + "step": 16370 + }, + { + "epoch": 0.10295209474986743, + "grad_norm": 7.568657875061035, + "learning_rate": 1.9318038942859778e-05, + "loss": 2.412, + "step": 16380 + }, + { + "epoch": 0.10301494706656453, + "grad_norm": 7.412895202636719, + "learning_rate": 1.9317619841915125e-05, + "loss": 2.4029, + "step": 16390 + }, + { + "epoch": 0.10307779938326164, + "grad_norm": 8.491913795471191, + "learning_rate": 1.9317200740970472e-05, + "loss": 2.325, + "step": 16400 + }, + { + "epoch": 0.10314065169995876, + "grad_norm": 7.789159774780273, + "learning_rate": 1.931678164002582e-05, + "loss": 2.3369, + "step": 16410 + }, + { + "epoch": 0.10320350401665586, + "grad_norm": 6.9769744873046875, + "learning_rate": 1.9316362539081163e-05, + "loss": 2.2403, + "step": 16420 + }, + { + "epoch": 0.10326635633335297, + "grad_norm": 8.47653865814209, + "learning_rate": 1.931594343813651e-05, + "loss": 2.3198, + "step": 16430 + }, + { + "epoch": 0.10332920865005009, + "grad_norm": 7.21108865737915, + "learning_rate": 1.9315524337191857e-05, + "loss": 2.3815, + "step": 16440 + }, + { + "epoch": 0.10339206096674719, + "grad_norm": 13.637954711914062, + "learning_rate": 1.9315105236247204e-05, + "loss": 2.2583, + "step": 16450 + }, + { + "epoch": 0.1034549132834443, + "grad_norm": 8.142091751098633, + "learning_rate": 1.931468613530255e-05, + "loss": 2.3985, + "step": 16460 + }, + { + "epoch": 0.10351776560014142, + "grad_norm": 6.6919074058532715, + "learning_rate": 1.9314267034357898e-05, + 
"loss": 2.1173, + "step": 16470 + }, + { + "epoch": 0.10358061791683852, + "grad_norm": 8.20840835571289, + "learning_rate": 1.9313847933413242e-05, + "loss": 2.4074, + "step": 16480 + }, + { + "epoch": 0.10364347023353564, + "grad_norm": 7.9486470222473145, + "learning_rate": 1.931342883246859e-05, + "loss": 2.4469, + "step": 16490 + }, + { + "epoch": 0.10370632255023275, + "grad_norm": 7.732548713684082, + "learning_rate": 1.9313009731523936e-05, + "loss": 2.5064, + "step": 16500 + }, + { + "epoch": 0.10376917486692987, + "grad_norm": 8.321471214294434, + "learning_rate": 1.9312590630579283e-05, + "loss": 2.2244, + "step": 16510 + }, + { + "epoch": 0.10383202718362697, + "grad_norm": 8.02713394165039, + "learning_rate": 1.931217152963463e-05, + "loss": 2.2076, + "step": 16520 + }, + { + "epoch": 0.10389487950032408, + "grad_norm": 8.101451873779297, + "learning_rate": 1.9311752428689977e-05, + "loss": 2.1815, + "step": 16530 + }, + { + "epoch": 0.1039577318170212, + "grad_norm": 7.51225471496582, + "learning_rate": 1.9311333327745324e-05, + "loss": 2.2129, + "step": 16540 + }, + { + "epoch": 0.1040205841337183, + "grad_norm": 7.396722316741943, + "learning_rate": 1.9310914226800668e-05, + "loss": 2.2444, + "step": 16550 + }, + { + "epoch": 0.10408343645041541, + "grad_norm": 7.46852970123291, + "learning_rate": 1.9310495125856015e-05, + "loss": 2.3093, + "step": 16560 + }, + { + "epoch": 0.10414628876711253, + "grad_norm": 6.899946689605713, + "learning_rate": 1.9310076024911362e-05, + "loss": 2.2093, + "step": 16570 + }, + { + "epoch": 0.10420914108380963, + "grad_norm": 7.019610404968262, + "learning_rate": 1.930965692396671e-05, + "loss": 2.0416, + "step": 16580 + }, + { + "epoch": 0.10427199340050675, + "grad_norm": 8.696752548217773, + "learning_rate": 1.9309237823022056e-05, + "loss": 2.3048, + "step": 16590 + }, + { + "epoch": 0.10433484571720386, + "grad_norm": 7.059429168701172, + "learning_rate": 1.93088187220774e-05, + "loss": 2.2203, + "step": 16600 + 
}, + { + "epoch": 0.10439769803390096, + "grad_norm": 7.339666366577148, + "learning_rate": 1.9308399621132747e-05, + "loss": 2.3686, + "step": 16610 + }, + { + "epoch": 0.10446055035059808, + "grad_norm": 8.099876403808594, + "learning_rate": 1.9307980520188094e-05, + "loss": 2.082, + "step": 16620 + }, + { + "epoch": 0.10452340266729519, + "grad_norm": 6.657029628753662, + "learning_rate": 1.930756141924344e-05, + "loss": 2.135, + "step": 16630 + }, + { + "epoch": 0.1045862549839923, + "grad_norm": 7.453880310058594, + "learning_rate": 1.9307142318298785e-05, + "loss": 2.3185, + "step": 16640 + }, + { + "epoch": 0.10464910730068941, + "grad_norm": 8.553909301757812, + "learning_rate": 1.9306723217354132e-05, + "loss": 2.0539, + "step": 16650 + }, + { + "epoch": 0.10471195961738652, + "grad_norm": 7.258327960968018, + "learning_rate": 1.930630411640948e-05, + "loss": 2.0804, + "step": 16660 + }, + { + "epoch": 0.10477481193408364, + "grad_norm": 7.425248146057129, + "learning_rate": 1.9305885015464826e-05, + "loss": 2.2616, + "step": 16670 + }, + { + "epoch": 0.10483766425078074, + "grad_norm": 8.037734985351562, + "learning_rate": 1.9305465914520173e-05, + "loss": 2.3576, + "step": 16680 + }, + { + "epoch": 0.10490051656747786, + "grad_norm": 7.662077903747559, + "learning_rate": 1.930504681357552e-05, + "loss": 2.0718, + "step": 16690 + }, + { + "epoch": 0.10496336888417497, + "grad_norm": 6.908252716064453, + "learning_rate": 1.9304627712630867e-05, + "loss": 2.2498, + "step": 16700 + }, + { + "epoch": 0.10502622120087207, + "grad_norm": 7.3731770515441895, + "learning_rate": 1.9304208611686214e-05, + "loss": 2.378, + "step": 16710 + }, + { + "epoch": 0.10508907351756919, + "grad_norm": 9.528003692626953, + "learning_rate": 1.930378951074156e-05, + "loss": 2.0206, + "step": 16720 + }, + { + "epoch": 0.1051519258342663, + "grad_norm": 7.476037979125977, + "learning_rate": 1.9303370409796905e-05, + "loss": 2.1773, + "step": 16730 + }, + { + "epoch": 
0.1052147781509634, + "grad_norm": 8.378963470458984, + "learning_rate": 1.9302951308852252e-05, + "loss": 2.3907, + "step": 16740 + }, + { + "epoch": 0.10527763046766052, + "grad_norm": 7.383098125457764, + "learning_rate": 1.93025322079076e-05, + "loss": 2.3107, + "step": 16750 + }, + { + "epoch": 0.10534048278435763, + "grad_norm": 7.901841163635254, + "learning_rate": 1.9302113106962946e-05, + "loss": 2.1333, + "step": 16760 + }, + { + "epoch": 0.10540333510105473, + "grad_norm": 6.856114387512207, + "learning_rate": 1.9301694006018293e-05, + "loss": 2.195, + "step": 16770 + }, + { + "epoch": 0.10546618741775185, + "grad_norm": 7.679387092590332, + "learning_rate": 1.9301274905073637e-05, + "loss": 2.2593, + "step": 16780 + }, + { + "epoch": 0.10552903973444897, + "grad_norm": 8.764408111572266, + "learning_rate": 1.9300855804128984e-05, + "loss": 2.2052, + "step": 16790 + }, + { + "epoch": 0.10559189205114607, + "grad_norm": 6.764782905578613, + "learning_rate": 1.930043670318433e-05, + "loss": 2.3095, + "step": 16800 + }, + { + "epoch": 0.10565474436784318, + "grad_norm": 7.6861701011657715, + "learning_rate": 1.9300017602239678e-05, + "loss": 2.1621, + "step": 16810 + }, + { + "epoch": 0.1057175966845403, + "grad_norm": 8.404021263122559, + "learning_rate": 1.9299598501295022e-05, + "loss": 2.2331, + "step": 16820 + }, + { + "epoch": 0.10578044900123741, + "grad_norm": 6.845357418060303, + "learning_rate": 1.929917940035037e-05, + "loss": 2.2765, + "step": 16830 + }, + { + "epoch": 0.10584330131793451, + "grad_norm": 6.098022937774658, + "learning_rate": 1.9298760299405716e-05, + "loss": 2.1941, + "step": 16840 + }, + { + "epoch": 0.10590615363463163, + "grad_norm": 6.772059440612793, + "learning_rate": 1.9298341198461063e-05, + "loss": 2.172, + "step": 16850 + }, + { + "epoch": 0.10596900595132874, + "grad_norm": 8.620196342468262, + "learning_rate": 1.9297922097516407e-05, + "loss": 2.3347, + "step": 16860 + }, + { + "epoch": 0.10603185826802584, + 
"grad_norm": 8.322911262512207, + "learning_rate": 1.9297502996571754e-05, + "loss": 2.3929, + "step": 16870 + }, + { + "epoch": 0.10609471058472296, + "grad_norm": 7.095172882080078, + "learning_rate": 1.92970838956271e-05, + "loss": 2.0793, + "step": 16880 + }, + { + "epoch": 0.10615756290142007, + "grad_norm": 8.138224601745605, + "learning_rate": 1.9296664794682448e-05, + "loss": 2.2524, + "step": 16890 + }, + { + "epoch": 0.10622041521811718, + "grad_norm": 8.498547554016113, + "learning_rate": 1.9296245693737795e-05, + "loss": 2.3374, + "step": 16900 + }, + { + "epoch": 0.10628326753481429, + "grad_norm": 7.647117614746094, + "learning_rate": 1.9295826592793142e-05, + "loss": 2.2798, + "step": 16910 + }, + { + "epoch": 0.1063461198515114, + "grad_norm": 6.767284393310547, + "learning_rate": 1.929540749184849e-05, + "loss": 2.3106, + "step": 16920 + }, + { + "epoch": 0.10640897216820851, + "grad_norm": 7.401424407958984, + "learning_rate": 1.9294988390903836e-05, + "loss": 2.2486, + "step": 16930 + }, + { + "epoch": 0.10647182448490562, + "grad_norm": 9.371999740600586, + "learning_rate": 1.9294569289959183e-05, + "loss": 2.3394, + "step": 16940 + }, + { + "epoch": 0.10653467680160274, + "grad_norm": 7.710360050201416, + "learning_rate": 1.9294150189014527e-05, + "loss": 2.1607, + "step": 16950 + }, + { + "epoch": 0.10659752911829984, + "grad_norm": 8.567865371704102, + "learning_rate": 1.9293731088069874e-05, + "loss": 2.3452, + "step": 16960 + }, + { + "epoch": 0.10666038143499695, + "grad_norm": 7.319023609161377, + "learning_rate": 1.929331198712522e-05, + "loss": 2.2159, + "step": 16970 + }, + { + "epoch": 0.10672323375169407, + "grad_norm": 7.316112041473389, + "learning_rate": 1.9292892886180568e-05, + "loss": 2.4197, + "step": 16980 + }, + { + "epoch": 0.10678608606839118, + "grad_norm": 8.065226554870605, + "learning_rate": 1.9292473785235915e-05, + "loss": 2.2532, + "step": 16990 + }, + { + "epoch": 0.10684893838508829, + "grad_norm": 
7.7862868309021, + "learning_rate": 1.929205468429126e-05, + "loss": 2.3215, + "step": 17000 + }, + { + "epoch": 0.1069117907017854, + "grad_norm": 7.383972644805908, + "learning_rate": 1.9291635583346606e-05, + "loss": 2.2977, + "step": 17010 + }, + { + "epoch": 0.10697464301848252, + "grad_norm": 7.82750129699707, + "learning_rate": 1.9291216482401953e-05, + "loss": 2.4872, + "step": 17020 + }, + { + "epoch": 0.10703749533517962, + "grad_norm": 7.056968688964844, + "learning_rate": 1.92907973814573e-05, + "loss": 2.2487, + "step": 17030 + }, + { + "epoch": 0.10710034765187673, + "grad_norm": 6.971473693847656, + "learning_rate": 1.9290378280512644e-05, + "loss": 2.1742, + "step": 17040 + }, + { + "epoch": 0.10716319996857385, + "grad_norm": 7.3401570320129395, + "learning_rate": 1.928995917956799e-05, + "loss": 2.1839, + "step": 17050 + }, + { + "epoch": 0.10722605228527095, + "grad_norm": 8.624346733093262, + "learning_rate": 1.9289540078623338e-05, + "loss": 2.3994, + "step": 17060 + }, + { + "epoch": 0.10728890460196806, + "grad_norm": 8.388733863830566, + "learning_rate": 1.9289120977678685e-05, + "loss": 2.0978, + "step": 17070 + }, + { + "epoch": 0.10735175691866518, + "grad_norm": 8.195749282836914, + "learning_rate": 1.9288701876734032e-05, + "loss": 2.0869, + "step": 17080 + }, + { + "epoch": 0.10741460923536228, + "grad_norm": 8.285856246948242, + "learning_rate": 1.928828277578938e-05, + "loss": 2.1606, + "step": 17090 + }, + { + "epoch": 0.1074774615520594, + "grad_norm": 8.132835388183594, + "learning_rate": 1.9287863674844726e-05, + "loss": 2.3997, + "step": 17100 + }, + { + "epoch": 0.10754031386875651, + "grad_norm": 9.392642974853516, + "learning_rate": 1.928744457390007e-05, + "loss": 2.3096, + "step": 17110 + }, + { + "epoch": 0.10760316618545361, + "grad_norm": 8.301399230957031, + "learning_rate": 1.9287025472955417e-05, + "loss": 2.3675, + "step": 17120 + }, + { + "epoch": 0.10766601850215073, + "grad_norm": 6.58356237411499, + 
"learning_rate": 1.9286606372010764e-05, + "loss": 2.1104, + "step": 17130 + }, + { + "epoch": 0.10772887081884784, + "grad_norm": 7.440036773681641, + "learning_rate": 1.928618727106611e-05, + "loss": 2.1014, + "step": 17140 + }, + { + "epoch": 0.10779172313554494, + "grad_norm": 7.675351142883301, + "learning_rate": 1.9285768170121458e-05, + "loss": 2.3695, + "step": 17150 + }, + { + "epoch": 0.10785457545224206, + "grad_norm": 5.981052875518799, + "learning_rate": 1.9285349069176805e-05, + "loss": 2.316, + "step": 17160 + }, + { + "epoch": 0.10791742776893917, + "grad_norm": 8.150433540344238, + "learning_rate": 1.928492996823215e-05, + "loss": 2.1734, + "step": 17170 + }, + { + "epoch": 0.10798028008563629, + "grad_norm": 7.965524196624756, + "learning_rate": 1.9284510867287496e-05, + "loss": 2.4759, + "step": 17180 + }, + { + "epoch": 0.10804313240233339, + "grad_norm": 8.376654624938965, + "learning_rate": 1.9284091766342843e-05, + "loss": 2.1809, + "step": 17190 + }, + { + "epoch": 0.1081059847190305, + "grad_norm": 8.441359519958496, + "learning_rate": 1.928367266539819e-05, + "loss": 2.0916, + "step": 17200 + }, + { + "epoch": 0.10816883703572762, + "grad_norm": 7.79421329498291, + "learning_rate": 1.9283253564453537e-05, + "loss": 2.4733, + "step": 17210 + }, + { + "epoch": 0.10823168935242472, + "grad_norm": 6.398135662078857, + "learning_rate": 1.928283446350888e-05, + "loss": 2.3177, + "step": 17220 + }, + { + "epoch": 0.10829454166912184, + "grad_norm": 7.340041637420654, + "learning_rate": 1.9282415362564228e-05, + "loss": 2.2512, + "step": 17230 + }, + { + "epoch": 0.10835739398581895, + "grad_norm": 8.736074447631836, + "learning_rate": 1.9281996261619575e-05, + "loss": 2.1867, + "step": 17240 + }, + { + "epoch": 0.10842024630251605, + "grad_norm": 7.77802848815918, + "learning_rate": 1.9281577160674922e-05, + "loss": 2.3079, + "step": 17250 + }, + { + "epoch": 0.10848309861921317, + "grad_norm": 7.42343282699585, + "learning_rate": 
1.9281158059730266e-05, + "loss": 2.3138, + "step": 17260 + }, + { + "epoch": 0.10854595093591028, + "grad_norm": 6.85483980178833, + "learning_rate": 1.9280738958785613e-05, + "loss": 2.1433, + "step": 17270 + }, + { + "epoch": 0.10860880325260738, + "grad_norm": 9.421246528625488, + "learning_rate": 1.928031985784096e-05, + "loss": 2.4897, + "step": 17280 + }, + { + "epoch": 0.1086716555693045, + "grad_norm": 6.760537624359131, + "learning_rate": 1.9279900756896307e-05, + "loss": 2.2983, + "step": 17290 + }, + { + "epoch": 0.10873450788600161, + "grad_norm": 6.944523334503174, + "learning_rate": 1.927952356604612e-05, + "loss": 2.3495, + "step": 17300 + }, + { + "epoch": 0.10879736020269871, + "grad_norm": 8.142277717590332, + "learning_rate": 1.9279104465101465e-05, + "loss": 2.32, + "step": 17310 + }, + { + "epoch": 0.10886021251939583, + "grad_norm": 6.816464900970459, + "learning_rate": 1.9278685364156813e-05, + "loss": 2.0693, + "step": 17320 + }, + { + "epoch": 0.10892306483609294, + "grad_norm": 8.537376403808594, + "learning_rate": 1.927826626321216e-05, + "loss": 2.2503, + "step": 17330 + }, + { + "epoch": 0.10898591715279006, + "grad_norm": 7.559505462646484, + "learning_rate": 1.9277847162267503e-05, + "loss": 2.2101, + "step": 17340 + }, + { + "epoch": 0.10904876946948716, + "grad_norm": 7.39756441116333, + "learning_rate": 1.927742806132285e-05, + "loss": 2.3663, + "step": 17350 + }, + { + "epoch": 0.10911162178618428, + "grad_norm": 7.510336399078369, + "learning_rate": 1.9277008960378197e-05, + "loss": 2.6044, + "step": 17360 + }, + { + "epoch": 0.10917447410288139, + "grad_norm": 7.362528324127197, + "learning_rate": 1.9276589859433544e-05, + "loss": 2.3271, + "step": 17370 + }, + { + "epoch": 0.10923732641957849, + "grad_norm": 7.193731784820557, + "learning_rate": 1.927617075848889e-05, + "loss": 2.1653, + "step": 17380 + }, + { + "epoch": 0.10930017873627561, + "grad_norm": 7.314977169036865, + "learning_rate": 1.927575165754424e-05, + "loss": 
2.1461, + "step": 17390 + }, + { + "epoch": 0.10936303105297272, + "grad_norm": 7.490072727203369, + "learning_rate": 1.9275332556599586e-05, + "loss": 2.3455, + "step": 17400 + }, + { + "epoch": 0.10942588336966982, + "grad_norm": 9.052772521972656, + "learning_rate": 1.9274913455654933e-05, + "loss": 2.2784, + "step": 17410 + }, + { + "epoch": 0.10948873568636694, + "grad_norm": 7.401103973388672, + "learning_rate": 1.9274494354710276e-05, + "loss": 2.0925, + "step": 17420 + }, + { + "epoch": 0.10955158800306405, + "grad_norm": 7.619738578796387, + "learning_rate": 1.9274075253765624e-05, + "loss": 2.2382, + "step": 17430 + }, + { + "epoch": 0.10961444031976116, + "grad_norm": 7.593260765075684, + "learning_rate": 1.927365615282097e-05, + "loss": 2.3528, + "step": 17440 + }, + { + "epoch": 0.10967729263645827, + "grad_norm": 7.171609878540039, + "learning_rate": 1.9273237051876318e-05, + "loss": 2.1605, + "step": 17450 + }, + { + "epoch": 0.10974014495315539, + "grad_norm": 6.895904541015625, + "learning_rate": 1.9272817950931665e-05, + "loss": 2.1842, + "step": 17460 + }, + { + "epoch": 0.10980299726985249, + "grad_norm": 7.595503330230713, + "learning_rate": 1.927239884998701e-05, + "loss": 2.0926, + "step": 17470 + }, + { + "epoch": 0.1098658495865496, + "grad_norm": 7.256607532501221, + "learning_rate": 1.9271979749042355e-05, + "loss": 2.3511, + "step": 17480 + }, + { + "epoch": 0.10992870190324672, + "grad_norm": 8.06808853149414, + "learning_rate": 1.9271560648097703e-05, + "loss": 2.2478, + "step": 17490 + }, + { + "epoch": 0.10999155421994383, + "grad_norm": 6.585019588470459, + "learning_rate": 1.927114154715305e-05, + "loss": 2.321, + "step": 17500 + }, + { + "epoch": 0.11005440653664093, + "grad_norm": 7.640449523925781, + "learning_rate": 1.9270722446208397e-05, + "loss": 2.1923, + "step": 17510 + }, + { + "epoch": 0.11011725885333805, + "grad_norm": 7.136289119720459, + "learning_rate": 1.927030334526374e-05, + "loss": 2.3107, + "step": 17520 + }, + 
{ + "epoch": 0.11018011117003516, + "grad_norm": 7.288252353668213, + "learning_rate": 1.9269884244319087e-05, + "loss": 2.3086, + "step": 17530 + }, + { + "epoch": 0.11024296348673226, + "grad_norm": 6.95576810836792, + "learning_rate": 1.9269465143374435e-05, + "loss": 2.1987, + "step": 17540 + }, + { + "epoch": 0.11030581580342938, + "grad_norm": 8.266757011413574, + "learning_rate": 1.926904604242978e-05, + "loss": 2.2896, + "step": 17550 + }, + { + "epoch": 0.1103686681201265, + "grad_norm": 8.554996490478516, + "learning_rate": 1.9268626941485125e-05, + "loss": 2.1805, + "step": 17560 + }, + { + "epoch": 0.1104315204368236, + "grad_norm": 8.602093696594238, + "learning_rate": 1.9268207840540472e-05, + "loss": 2.4654, + "step": 17570 + }, + { + "epoch": 0.11049437275352071, + "grad_norm": 8.329663276672363, + "learning_rate": 1.926778873959582e-05, + "loss": 2.4862, + "step": 17580 + }, + { + "epoch": 0.11055722507021783, + "grad_norm": 8.581228256225586, + "learning_rate": 1.9267369638651166e-05, + "loss": 2.1252, + "step": 17590 + }, + { + "epoch": 0.11062007738691493, + "grad_norm": 7.5395402908325195, + "learning_rate": 1.9266950537706514e-05, + "loss": 2.3789, + "step": 17600 + }, + { + "epoch": 0.11068292970361204, + "grad_norm": 6.943304538726807, + "learning_rate": 1.926653143676186e-05, + "loss": 2.0377, + "step": 17610 + }, + { + "epoch": 0.11074578202030916, + "grad_norm": 8.480267524719238, + "learning_rate": 1.9266112335817208e-05, + "loss": 2.3763, + "step": 17620 + }, + { + "epoch": 0.11080863433700626, + "grad_norm": 9.318236351013184, + "learning_rate": 1.9265693234872555e-05, + "loss": 2.322, + "step": 17630 + }, + { + "epoch": 0.11087148665370337, + "grad_norm": 6.9914069175720215, + "learning_rate": 1.9265274133927902e-05, + "loss": 2.0475, + "step": 17640 + }, + { + "epoch": 0.11093433897040049, + "grad_norm": 6.4933695793151855, + "learning_rate": 1.9264855032983246e-05, + "loss": 2.1868, + "step": 17650 + }, + { + "epoch": 
0.1109971912870976, + "grad_norm": 8.123563766479492, + "learning_rate": 1.9264435932038593e-05, + "loss": 2.2155, + "step": 17660 + }, + { + "epoch": 0.1110600436037947, + "grad_norm": 8.205390930175781, + "learning_rate": 1.926401683109394e-05, + "loss": 2.1776, + "step": 17670 + }, + { + "epoch": 0.11112289592049182, + "grad_norm": 7.595110893249512, + "learning_rate": 1.9263597730149287e-05, + "loss": 2.2943, + "step": 17680 + }, + { + "epoch": 0.11118574823718894, + "grad_norm": 6.7248101234436035, + "learning_rate": 1.926317862920463e-05, + "loss": 2.2173, + "step": 17690 + }, + { + "epoch": 0.11124860055388604, + "grad_norm": 7.3498430252075195, + "learning_rate": 1.9262759528259977e-05, + "loss": 2.1011, + "step": 17700 + }, + { + "epoch": 0.11131145287058315, + "grad_norm": 6.804111003875732, + "learning_rate": 1.9262340427315325e-05, + "loss": 2.2512, + "step": 17710 + }, + { + "epoch": 0.11137430518728027, + "grad_norm": 7.69357967376709, + "learning_rate": 1.926192132637067e-05, + "loss": 2.2153, + "step": 17720 + }, + { + "epoch": 0.11143715750397737, + "grad_norm": 7.761533737182617, + "learning_rate": 1.926150222542602e-05, + "loss": 2.111, + "step": 17730 + }, + { + "epoch": 0.11150000982067448, + "grad_norm": 9.239151954650879, + "learning_rate": 1.9261083124481362e-05, + "loss": 2.2681, + "step": 17740 + }, + { + "epoch": 0.1115628621373716, + "grad_norm": 8.001991271972656, + "learning_rate": 1.926066402353671e-05, + "loss": 2.2142, + "step": 17750 + }, + { + "epoch": 0.1116257144540687, + "grad_norm": 7.723743915557861, + "learning_rate": 1.9260244922592057e-05, + "loss": 2.1683, + "step": 17760 + }, + { + "epoch": 0.11168856677076582, + "grad_norm": 7.701489448547363, + "learning_rate": 1.9259825821647404e-05, + "loss": 2.0378, + "step": 17770 + }, + { + "epoch": 0.11175141908746293, + "grad_norm": 7.910447597503662, + "learning_rate": 1.925940672070275e-05, + "loss": 2.2301, + "step": 17780 + }, + { + "epoch": 0.11181427140416003, + 
"grad_norm": 7.99406099319458, + "learning_rate": 1.9258987619758098e-05, + "loss": 2.0902, + "step": 17790 + }, + { + "epoch": 0.11187712372085715, + "grad_norm": 7.329009532928467, + "learning_rate": 1.925856851881344e-05, + "loss": 2.1012, + "step": 17800 + }, + { + "epoch": 0.11193997603755426, + "grad_norm": 7.231492042541504, + "learning_rate": 1.925814941786879e-05, + "loss": 1.9498, + "step": 17810 + }, + { + "epoch": 0.11200282835425136, + "grad_norm": 7.1054487228393555, + "learning_rate": 1.9257730316924136e-05, + "loss": 2.0591, + "step": 17820 + }, + { + "epoch": 0.11206568067094848, + "grad_norm": 8.44486141204834, + "learning_rate": 1.9257311215979483e-05, + "loss": 2.1928, + "step": 17830 + }, + { + "epoch": 0.1121285329876456, + "grad_norm": 6.6756696701049805, + "learning_rate": 1.925689211503483e-05, + "loss": 2.1967, + "step": 17840 + }, + { + "epoch": 0.11219138530434271, + "grad_norm": 8.86054515838623, + "learning_rate": 1.9256473014090177e-05, + "loss": 2.177, + "step": 17850 + }, + { + "epoch": 0.11225423762103981, + "grad_norm": 7.436457633972168, + "learning_rate": 1.9256053913145524e-05, + "loss": 2.3267, + "step": 17860 + }, + { + "epoch": 0.11231708993773692, + "grad_norm": 7.809746265411377, + "learning_rate": 1.9255634812200868e-05, + "loss": 2.2719, + "step": 17870 + }, + { + "epoch": 0.11237994225443404, + "grad_norm": 6.934885025024414, + "learning_rate": 1.9255215711256215e-05, + "loss": 2.2318, + "step": 17880 + }, + { + "epoch": 0.11244279457113114, + "grad_norm": 8.142309188842773, + "learning_rate": 1.925479661031156e-05, + "loss": 2.2587, + "step": 17890 + }, + { + "epoch": 0.11250564688782826, + "grad_norm": 7.852748870849609, + "learning_rate": 1.925437750936691e-05, + "loss": 2.035, + "step": 17900 + }, + { + "epoch": 0.11256849920452537, + "grad_norm": 7.492555141448975, + "learning_rate": 1.9253958408422252e-05, + "loss": 2.2448, + "step": 17910 + }, + { + "epoch": 0.11263135152122247, + "grad_norm": 7.685120105743408, 
+ "learning_rate": 1.92535393074776e-05, + "loss": 2.199, + "step": 17920 + }, + { + "epoch": 0.11269420383791959, + "grad_norm": 7.95084810256958, + "learning_rate": 1.9253120206532947e-05, + "loss": 2.1024, + "step": 17930 + }, + { + "epoch": 0.1127570561546167, + "grad_norm": 6.954649448394775, + "learning_rate": 1.9252701105588294e-05, + "loss": 2.0585, + "step": 17940 + }, + { + "epoch": 0.1128199084713138, + "grad_norm": 6.373837947845459, + "learning_rate": 1.925228200464364e-05, + "loss": 2.1546, + "step": 17950 + }, + { + "epoch": 0.11288276078801092, + "grad_norm": 6.765232086181641, + "learning_rate": 1.9251862903698984e-05, + "loss": 2.4648, + "step": 17960 + }, + { + "epoch": 0.11294561310470803, + "grad_norm": 6.509195804595947, + "learning_rate": 1.925144380275433e-05, + "loss": 2.0974, + "step": 17970 + }, + { + "epoch": 0.11300846542140514, + "grad_norm": 6.901458263397217, + "learning_rate": 1.925102470180968e-05, + "loss": 2.0435, + "step": 17980 + }, + { + "epoch": 0.11307131773810225, + "grad_norm": 7.925265789031982, + "learning_rate": 1.9250605600865026e-05, + "loss": 2.3093, + "step": 17990 + }, + { + "epoch": 0.11313417005479937, + "grad_norm": 7.479783058166504, + "learning_rate": 1.9250186499920373e-05, + "loss": 2.1798, + "step": 18000 + }, + { + "epoch": 0.11319702237149648, + "grad_norm": 7.755842208862305, + "learning_rate": 1.924976739897572e-05, + "loss": 2.3681, + "step": 18010 + }, + { + "epoch": 0.11325987468819358, + "grad_norm": 7.0234222412109375, + "learning_rate": 1.9249348298031067e-05, + "loss": 2.3122, + "step": 18020 + }, + { + "epoch": 0.1133227270048907, + "grad_norm": 7.856392860412598, + "learning_rate": 1.9248929197086414e-05, + "loss": 2.1876, + "step": 18030 + }, + { + "epoch": 0.11338557932158781, + "grad_norm": 7.414135456085205, + "learning_rate": 1.924851009614176e-05, + "loss": 2.4818, + "step": 18040 + }, + { + "epoch": 0.11344843163828491, + "grad_norm": 7.981344223022461, + "learning_rate": 
1.9248090995197105e-05, + "loss": 2.1543, + "step": 18050 + }, + { + "epoch": 0.11351128395498203, + "grad_norm": 7.49269962310791, + "learning_rate": 1.924767189425245e-05, + "loss": 2.3239, + "step": 18060 + }, + { + "epoch": 0.11357413627167914, + "grad_norm": 7.305643558502197, + "learning_rate": 1.92472527933078e-05, + "loss": 2.1973, + "step": 18070 + }, + { + "epoch": 0.11363698858837624, + "grad_norm": 8.535238265991211, + "learning_rate": 1.9246833692363146e-05, + "loss": 2.2497, + "step": 18080 + }, + { + "epoch": 0.11369984090507336, + "grad_norm": 7.401052474975586, + "learning_rate": 1.924641459141849e-05, + "loss": 2.1625, + "step": 18090 + }, + { + "epoch": 0.11376269322177048, + "grad_norm": 8.050284385681152, + "learning_rate": 1.9245995490473837e-05, + "loss": 2.0429, + "step": 18100 + }, + { + "epoch": 0.11382554553846758, + "grad_norm": 7.0560526847839355, + "learning_rate": 1.9245576389529184e-05, + "loss": 2.1123, + "step": 18110 + }, + { + "epoch": 0.11388839785516469, + "grad_norm": 6.865621566772461, + "learning_rate": 1.924515728858453e-05, + "loss": 2.1157, + "step": 18120 + }, + { + "epoch": 0.1139512501718618, + "grad_norm": 7.461267471313477, + "learning_rate": 1.9244738187639878e-05, + "loss": 2.3348, + "step": 18130 + }, + { + "epoch": 0.11401410248855891, + "grad_norm": 8.42668342590332, + "learning_rate": 1.924431908669522e-05, + "loss": 2.3355, + "step": 18140 + }, + { + "epoch": 0.11407695480525602, + "grad_norm": 8.142030715942383, + "learning_rate": 1.924389998575057e-05, + "loss": 2.0568, + "step": 18150 + }, + { + "epoch": 0.11413980712195314, + "grad_norm": 7.327850818634033, + "learning_rate": 1.9243480884805916e-05, + "loss": 2.2661, + "step": 18160 + }, + { + "epoch": 0.11420265943865025, + "grad_norm": 7.3329925537109375, + "learning_rate": 1.9243061783861263e-05, + "loss": 2.4782, + "step": 18170 + }, + { + "epoch": 0.11426551175534735, + "grad_norm": 8.312920570373535, + "learning_rate": 1.9242642682916606e-05, + 
"loss": 2.6378, + "step": 18180 + }, + { + "epoch": 0.11432836407204447, + "grad_norm": 12.4722900390625, + "learning_rate": 1.9242223581971953e-05, + "loss": 2.1063, + "step": 18190 + }, + { + "epoch": 0.11439121638874158, + "grad_norm": 7.168670654296875, + "learning_rate": 1.92418044810273e-05, + "loss": 2.182, + "step": 18200 + }, + { + "epoch": 0.11445406870543869, + "grad_norm": 8.269729614257812, + "learning_rate": 1.9241385380082648e-05, + "loss": 2.2661, + "step": 18210 + }, + { + "epoch": 0.1145169210221358, + "grad_norm": 7.075867176055908, + "learning_rate": 1.9240966279137995e-05, + "loss": 2.243, + "step": 18220 + }, + { + "epoch": 0.11457977333883292, + "grad_norm": 9.278741836547852, + "learning_rate": 1.9240547178193342e-05, + "loss": 2.0635, + "step": 18230 + }, + { + "epoch": 0.11464262565553002, + "grad_norm": 8.0344820022583, + "learning_rate": 1.924012807724869e-05, + "loss": 2.2158, + "step": 18240 + }, + { + "epoch": 0.11470547797222713, + "grad_norm": 8.224825859069824, + "learning_rate": 1.9239708976304036e-05, + "loss": 2.2555, + "step": 18250 + }, + { + "epoch": 0.11476833028892425, + "grad_norm": 6.739234924316406, + "learning_rate": 1.9239289875359383e-05, + "loss": 1.9965, + "step": 18260 + }, + { + "epoch": 0.11483118260562135, + "grad_norm": 8.875475883483887, + "learning_rate": 1.9238870774414727e-05, + "loss": 2.1516, + "step": 18270 + }, + { + "epoch": 0.11489403492231846, + "grad_norm": 7.0612897872924805, + "learning_rate": 1.9238451673470074e-05, + "loss": 2.1763, + "step": 18280 + }, + { + "epoch": 0.11495688723901558, + "grad_norm": 7.863101959228516, + "learning_rate": 1.923803257252542e-05, + "loss": 2.2357, + "step": 18290 + }, + { + "epoch": 0.11501973955571268, + "grad_norm": 7.017388820648193, + "learning_rate": 1.9237613471580768e-05, + "loss": 2.2272, + "step": 18300 + }, + { + "epoch": 0.1150825918724098, + "grad_norm": 9.14671802520752, + "learning_rate": 1.923719437063611e-05, + "loss": 2.2746, + "step": 18310 + 
}, + { + "epoch": 0.11514544418910691, + "grad_norm": 8.308667182922363, + "learning_rate": 1.923677526969146e-05, + "loss": 2.016, + "step": 18320 + }, + { + "epoch": 0.11520829650580403, + "grad_norm": 6.846327304840088, + "learning_rate": 1.9236356168746806e-05, + "loss": 2.1404, + "step": 18330 + }, + { + "epoch": 0.11527114882250113, + "grad_norm": 8.261704444885254, + "learning_rate": 1.9235937067802153e-05, + "loss": 2.3775, + "step": 18340 + }, + { + "epoch": 0.11533400113919824, + "grad_norm": 7.210214614868164, + "learning_rate": 1.92355179668575e-05, + "loss": 2.2635, + "step": 18350 + }, + { + "epoch": 0.11539685345589536, + "grad_norm": 7.188510417938232, + "learning_rate": 1.9235098865912843e-05, + "loss": 2.4214, + "step": 18360 + }, + { + "epoch": 0.11545970577259246, + "grad_norm": 7.944930553436279, + "learning_rate": 1.923467976496819e-05, + "loss": 2.1332, + "step": 18370 + }, + { + "epoch": 0.11552255808928957, + "grad_norm": 8.27840805053711, + "learning_rate": 1.9234260664023538e-05, + "loss": 2.1003, + "step": 18380 + }, + { + "epoch": 0.11558541040598669, + "grad_norm": 6.786845684051514, + "learning_rate": 1.9233841563078885e-05, + "loss": 2.1307, + "step": 18390 + }, + { + "epoch": 0.11564826272268379, + "grad_norm": 7.634956359863281, + "learning_rate": 1.9233422462134232e-05, + "loss": 2.1637, + "step": 18400 + }, + { + "epoch": 0.1157111150393809, + "grad_norm": 7.450354099273682, + "learning_rate": 1.923300336118958e-05, + "loss": 2.2814, + "step": 18410 + }, + { + "epoch": 0.11577396735607802, + "grad_norm": 8.3748779296875, + "learning_rate": 1.9232584260244923e-05, + "loss": 2.5221, + "step": 18420 + }, + { + "epoch": 0.11583681967277512, + "grad_norm": 7.9941558837890625, + "learning_rate": 1.923216515930027e-05, + "loss": 2.0243, + "step": 18430 + }, + { + "epoch": 0.11589967198947224, + "grad_norm": 8.738197326660156, + "learning_rate": 1.9231746058355617e-05, + "loss": 2.0117, + "step": 18440 + }, + { + "epoch": 
0.11596252430616935, + "grad_norm": 7.011739253997803, + "learning_rate": 1.9231326957410964e-05, + "loss": 2.012, + "step": 18450 + }, + { + "epoch": 0.11602537662286645, + "grad_norm": 7.023013114929199, + "learning_rate": 1.923090785646631e-05, + "loss": 2.0715, + "step": 18460 + }, + { + "epoch": 0.11608822893956357, + "grad_norm": 7.268269062042236, + "learning_rate": 1.9230488755521658e-05, + "loss": 2.1794, + "step": 18470 + }, + { + "epoch": 0.11615108125626068, + "grad_norm": 9.136162757873535, + "learning_rate": 1.9230069654577005e-05, + "loss": 2.2741, + "step": 18480 + }, + { + "epoch": 0.11621393357295778, + "grad_norm": 7.9868621826171875, + "learning_rate": 1.922965055363235e-05, + "loss": 2.1819, + "step": 18490 + }, + { + "epoch": 0.1162767858896549, + "grad_norm": 7.73309326171875, + "learning_rate": 1.9229231452687696e-05, + "loss": 2.4263, + "step": 18500 + }, + { + "epoch": 0.11633963820635201, + "grad_norm": 7.7979350090026855, + "learning_rate": 1.9228812351743043e-05, + "loss": 2.4283, + "step": 18510 + }, + { + "epoch": 0.11640249052304913, + "grad_norm": 6.465146541595459, + "learning_rate": 1.922839325079839e-05, + "loss": 1.9808, + "step": 18520 + }, + { + "epoch": 0.11646534283974623, + "grad_norm": 8.1764554977417, + "learning_rate": 1.9227974149853737e-05, + "loss": 2.1467, + "step": 18530 + }, + { + "epoch": 0.11652819515644335, + "grad_norm": 7.2224273681640625, + "learning_rate": 1.922755504890908e-05, + "loss": 2.1513, + "step": 18540 + }, + { + "epoch": 0.11659104747314046, + "grad_norm": 8.304801940917969, + "learning_rate": 1.9227135947964428e-05, + "loss": 2.0873, + "step": 18550 + }, + { + "epoch": 0.11665389978983756, + "grad_norm": 7.494878768920898, + "learning_rate": 1.9226716847019775e-05, + "loss": 2.1896, + "step": 18560 + }, + { + "epoch": 0.11671675210653468, + "grad_norm": 6.850111484527588, + "learning_rate": 1.9226297746075122e-05, + "loss": 1.9097, + "step": 18570 + }, + { + "epoch": 0.11677960442323179, + 
"grad_norm": 6.223687171936035, + "learning_rate": 1.9225878645130465e-05, + "loss": 2.0755, + "step": 18580 + }, + { + "epoch": 0.1168424567399289, + "grad_norm": 6.9537200927734375, + "learning_rate": 1.9225459544185813e-05, + "loss": 2.3503, + "step": 18590 + }, + { + "epoch": 0.11690530905662601, + "grad_norm": 8.668498039245605, + "learning_rate": 1.922504044324116e-05, + "loss": 2.2197, + "step": 18600 + }, + { + "epoch": 0.11696816137332312, + "grad_norm": 8.086195945739746, + "learning_rate": 1.9224621342296507e-05, + "loss": 2.2773, + "step": 18610 + }, + { + "epoch": 0.11703101369002022, + "grad_norm": 9.474557876586914, + "learning_rate": 1.9224202241351854e-05, + "loss": 2.3185, + "step": 18620 + }, + { + "epoch": 0.11709386600671734, + "grad_norm": 9.221257209777832, + "learning_rate": 1.92237831404072e-05, + "loss": 2.3129, + "step": 18630 + }, + { + "epoch": 0.11715671832341445, + "grad_norm": 7.471948146820068, + "learning_rate": 1.9223364039462548e-05, + "loss": 2.0476, + "step": 18640 + }, + { + "epoch": 0.11721957064011156, + "grad_norm": 8.019757270812988, + "learning_rate": 1.9222944938517895e-05, + "loss": 2.161, + "step": 18650 + }, + { + "epoch": 0.11728242295680867, + "grad_norm": 7.667059421539307, + "learning_rate": 1.9222525837573242e-05, + "loss": 2.1936, + "step": 18660 + }, + { + "epoch": 0.11734527527350579, + "grad_norm": 8.694701194763184, + "learning_rate": 1.9222106736628586e-05, + "loss": 2.4176, + "step": 18670 + }, + { + "epoch": 0.1174081275902029, + "grad_norm": 7.578640937805176, + "learning_rate": 1.9221687635683933e-05, + "loss": 1.9401, + "step": 18680 + }, + { + "epoch": 0.1174709799069, + "grad_norm": 5.771047592163086, + "learning_rate": 1.922126853473928e-05, + "loss": 1.9679, + "step": 18690 + }, + { + "epoch": 0.11753383222359712, + "grad_norm": 7.293246269226074, + "learning_rate": 1.9220849433794627e-05, + "loss": 2.3527, + "step": 18700 + }, + { + "epoch": 0.11759668454029423, + "grad_norm": 7.9927873611450195, 
+ "learning_rate": 1.922043033284997e-05, + "loss": 2.2599, + "step": 18710 + }, + { + "epoch": 0.11765953685699133, + "grad_norm": 7.356894493103027, + "learning_rate": 1.9220011231905318e-05, + "loss": 2.3567, + "step": 18720 + }, + { + "epoch": 0.11772238917368845, + "grad_norm": 6.663243770599365, + "learning_rate": 1.9219592130960665e-05, + "loss": 2.0537, + "step": 18730 + }, + { + "epoch": 0.11778524149038556, + "grad_norm": 6.652327537536621, + "learning_rate": 1.9219173030016012e-05, + "loss": 2.2665, + "step": 18740 + }, + { + "epoch": 0.11784809380708267, + "grad_norm": 7.32041597366333, + "learning_rate": 1.921875392907136e-05, + "loss": 2.1582, + "step": 18750 + }, + { + "epoch": 0.11791094612377978, + "grad_norm": 7.6705451011657715, + "learning_rate": 1.9218334828126703e-05, + "loss": 2.3782, + "step": 18760 + }, + { + "epoch": 0.1179737984404769, + "grad_norm": 8.588958740234375, + "learning_rate": 1.921791572718205e-05, + "loss": 2.1552, + "step": 18770 + }, + { + "epoch": 0.118036650757174, + "grad_norm": 8.211400032043457, + "learning_rate": 1.9217496626237397e-05, + "loss": 2.1711, + "step": 18780 + }, + { + "epoch": 0.11809950307387111, + "grad_norm": 6.705261707305908, + "learning_rate": 1.9217077525292744e-05, + "loss": 2.1008, + "step": 18790 + }, + { + "epoch": 0.11816235539056823, + "grad_norm": 8.6095609664917, + "learning_rate": 1.9216658424348087e-05, + "loss": 2.2753, + "step": 18800 + }, + { + "epoch": 0.11822520770726533, + "grad_norm": 6.959962844848633, + "learning_rate": 1.9216239323403435e-05, + "loss": 2.0305, + "step": 18810 + }, + { + "epoch": 0.11828806002396244, + "grad_norm": 8.471665382385254, + "learning_rate": 1.921582022245878e-05, + "loss": 2.1081, + "step": 18820 + }, + { + "epoch": 0.11835091234065956, + "grad_norm": 8.152358055114746, + "learning_rate": 1.921540112151413e-05, + "loss": 2.3344, + "step": 18830 + }, + { + "epoch": 0.11841376465735667, + "grad_norm": 6.483791828155518, + "learning_rate": 
1.9214982020569476e-05, + "loss": 2.1609, + "step": 18840 + }, + { + "epoch": 0.11847661697405378, + "grad_norm": 7.693551540374756, + "learning_rate": 1.9214562919624823e-05, + "loss": 2.0621, + "step": 18850 + }, + { + "epoch": 0.11853946929075089, + "grad_norm": 7.567829608917236, + "learning_rate": 1.921414381868017e-05, + "loss": 2.169, + "step": 18860 + }, + { + "epoch": 0.118602321607448, + "grad_norm": 6.230541706085205, + "learning_rate": 1.9213724717735517e-05, + "loss": 2.0434, + "step": 18870 + }, + { + "epoch": 0.1186651739241451, + "grad_norm": 7.5944976806640625, + "learning_rate": 1.9213305616790864e-05, + "loss": 2.2748, + "step": 18880 + }, + { + "epoch": 0.11872802624084222, + "grad_norm": 9.224638938903809, + "learning_rate": 1.9212886515846208e-05, + "loss": 2.1078, + "step": 18890 + }, + { + "epoch": 0.11879087855753934, + "grad_norm": 7.6371750831604, + "learning_rate": 1.9212467414901555e-05, + "loss": 2.1176, + "step": 18900 + }, + { + "epoch": 0.11885373087423644, + "grad_norm": 8.238876342773438, + "learning_rate": 1.9212048313956902e-05, + "loss": 2.1418, + "step": 18910 + }, + { + "epoch": 0.11891658319093355, + "grad_norm": 9.216188430786133, + "learning_rate": 1.921162921301225e-05, + "loss": 2.2832, + "step": 18920 + }, + { + "epoch": 0.11897943550763067, + "grad_norm": 7.281813621520996, + "learning_rate": 1.9211210112067593e-05, + "loss": 2.1489, + "step": 18930 + }, + { + "epoch": 0.11904228782432777, + "grad_norm": 7.572481632232666, + "learning_rate": 1.921079101112294e-05, + "loss": 2.4332, + "step": 18940 + }, + { + "epoch": 0.11910514014102488, + "grad_norm": 7.893809795379639, + "learning_rate": 1.9210371910178287e-05, + "loss": 2.4696, + "step": 18950 + }, + { + "epoch": 0.119167992457722, + "grad_norm": 7.050575256347656, + "learning_rate": 1.9209952809233634e-05, + "loss": 2.162, + "step": 18960 + }, + { + "epoch": 0.1192308447744191, + "grad_norm": 8.660941123962402, + "learning_rate": 1.920953370828898e-05, + "loss": 
2.413, + "step": 18970 + }, + { + "epoch": 0.11929369709111622, + "grad_norm": 8.012182235717773, + "learning_rate": 1.9209114607344325e-05, + "loss": 2.3525, + "step": 18980 + }, + { + "epoch": 0.11935654940781333, + "grad_norm": 7.362180233001709, + "learning_rate": 1.920869550639967e-05, + "loss": 2.3946, + "step": 18990 + }, + { + "epoch": 0.11941940172451045, + "grad_norm": 7.550148010253906, + "learning_rate": 1.920827640545502e-05, + "loss": 2.1124, + "step": 19000 + }, + { + "epoch": 0.11948225404120755, + "grad_norm": 7.317694664001465, + "learning_rate": 1.9207857304510366e-05, + "loss": 2.3389, + "step": 19010 + }, + { + "epoch": 0.11954510635790466, + "grad_norm": 9.328289031982422, + "learning_rate": 1.9207438203565713e-05, + "loss": 2.1496, + "step": 19020 + }, + { + "epoch": 0.11960795867460178, + "grad_norm": 8.249361038208008, + "learning_rate": 1.920701910262106e-05, + "loss": 2.2233, + "step": 19030 + }, + { + "epoch": 0.11967081099129888, + "grad_norm": 7.633471965789795, + "learning_rate": 1.9206600001676407e-05, + "loss": 2.2853, + "step": 19040 + }, + { + "epoch": 0.119733663307996, + "grad_norm": 9.351219177246094, + "learning_rate": 1.920618090073175e-05, + "loss": 2.2397, + "step": 19050 + }, + { + "epoch": 0.11979651562469311, + "grad_norm": 14.492135047912598, + "learning_rate": 1.9205761799787098e-05, + "loss": 2.0129, + "step": 19060 + }, + { + "epoch": 0.11985936794139021, + "grad_norm": 7.919447898864746, + "learning_rate": 1.9205342698842445e-05, + "loss": 2.1271, + "step": 19070 + }, + { + "epoch": 0.11992222025808733, + "grad_norm": 7.8114142417907715, + "learning_rate": 1.9204923597897792e-05, + "loss": 2.0951, + "step": 19080 + }, + { + "epoch": 0.11998507257478444, + "grad_norm": 7.433046817779541, + "learning_rate": 1.920450449695314e-05, + "loss": 1.9276, + "step": 19090 + }, + { + "epoch": 0.12004792489148154, + "grad_norm": 7.817944526672363, + "learning_rate": 1.9204085396008486e-05, + "loss": 2.0837, + "step": 19100 + }, 
+ { + "epoch": 0.12011077720817866, + "grad_norm": 7.712526321411133, + "learning_rate": 1.920366629506383e-05, + "loss": 2.1003, + "step": 19110 + }, + { + "epoch": 0.12017362952487577, + "grad_norm": 7.097597599029541, + "learning_rate": 1.9203247194119177e-05, + "loss": 2.238, + "step": 19120 + }, + { + "epoch": 0.12023648184157287, + "grad_norm": 8.20934009552002, + "learning_rate": 1.9202828093174524e-05, + "loss": 2.1962, + "step": 19130 + }, + { + "epoch": 0.12029933415826999, + "grad_norm": 9.129846572875977, + "learning_rate": 1.920240899222987e-05, + "loss": 2.2505, + "step": 19140 + }, + { + "epoch": 0.1203621864749671, + "grad_norm": 7.770500183105469, + "learning_rate": 1.9201989891285218e-05, + "loss": 1.9644, + "step": 19150 + }, + { + "epoch": 0.12042503879166422, + "grad_norm": 8.032506942749023, + "learning_rate": 1.920157079034056e-05, + "loss": 2.3757, + "step": 19160 + }, + { + "epoch": 0.12048789110836132, + "grad_norm": 7.227913856506348, + "learning_rate": 1.920115168939591e-05, + "loss": 2.269, + "step": 19170 + }, + { + "epoch": 0.12055074342505843, + "grad_norm": 9.362936973571777, + "learning_rate": 1.9200732588451256e-05, + "loss": 2.0911, + "step": 19180 + }, + { + "epoch": 0.12061359574175555, + "grad_norm": 9.266836166381836, + "learning_rate": 1.9200313487506603e-05, + "loss": 2.0499, + "step": 19190 + }, + { + "epoch": 0.12067644805845265, + "grad_norm": 8.412091255187988, + "learning_rate": 1.9199894386561947e-05, + "loss": 2.3619, + "step": 19200 + }, + { + "epoch": 0.12073930037514977, + "grad_norm": 7.589006423950195, + "learning_rate": 1.9199475285617294e-05, + "loss": 2.4516, + "step": 19210 + }, + { + "epoch": 0.12080215269184688, + "grad_norm": 8.433143615722656, + "learning_rate": 1.919905618467264e-05, + "loss": 2.1688, + "step": 19220 + }, + { + "epoch": 0.12086500500854398, + "grad_norm": 6.455695152282715, + "learning_rate": 1.9198637083727988e-05, + "loss": 2.293, + "step": 19230 + }, + { + "epoch": 
0.1209278573252411, + "grad_norm": 8.669682502746582, + "learning_rate": 1.9198217982783335e-05, + "loss": 2.0135, + "step": 19240 + }, + { + "epoch": 0.12099070964193821, + "grad_norm": 6.809682369232178, + "learning_rate": 1.9197798881838682e-05, + "loss": 2.1588, + "step": 19250 + }, + { + "epoch": 0.12105356195863531, + "grad_norm": 7.939022064208984, + "learning_rate": 1.919737978089403e-05, + "loss": 2.4309, + "step": 19260 + }, + { + "epoch": 0.12111641427533243, + "grad_norm": 8.199952125549316, + "learning_rate": 1.9196960679949376e-05, + "loss": 2.3256, + "step": 19270 + }, + { + "epoch": 0.12117926659202954, + "grad_norm": 7.057774543762207, + "learning_rate": 1.9196541579004723e-05, + "loss": 2.1621, + "step": 19280 + }, + { + "epoch": 0.12124211890872665, + "grad_norm": 6.977524280548096, + "learning_rate": 1.9196122478060067e-05, + "loss": 2.2381, + "step": 19290 + }, + { + "epoch": 0.12130497122542376, + "grad_norm": 7.7442216873168945, + "learning_rate": 1.9195703377115414e-05, + "loss": 2.3605, + "step": 19300 + }, + { + "epoch": 0.12136782354212088, + "grad_norm": 8.420924186706543, + "learning_rate": 1.919528427617076e-05, + "loss": 2.2136, + "step": 19310 + }, + { + "epoch": 0.12143067585881798, + "grad_norm": 6.575891971588135, + "learning_rate": 1.9194865175226108e-05, + "loss": 2.122, + "step": 19320 + }, + { + "epoch": 0.12149352817551509, + "grad_norm": 8.827194213867188, + "learning_rate": 1.9194446074281452e-05, + "loss": 2.2611, + "step": 19330 + }, + { + "epoch": 0.12155638049221221, + "grad_norm": 6.7283525466918945, + "learning_rate": 1.91940269733368e-05, + "loss": 2.1378, + "step": 19340 + }, + { + "epoch": 0.12161923280890932, + "grad_norm": 7.744928359985352, + "learning_rate": 1.9193607872392146e-05, + "loss": 2.1025, + "step": 19350 + }, + { + "epoch": 0.12168208512560642, + "grad_norm": 7.292446136474609, + "learning_rate": 1.9193188771447493e-05, + "loss": 2.0431, + "step": 19360 + }, + { + "epoch": 0.12174493744230354, + 
"grad_norm": 8.067106246948242, + "learning_rate": 1.919276967050284e-05, + "loss": 2.0828, + "step": 19370 + }, + { + "epoch": 0.12180778975900065, + "grad_norm": 7.035076141357422, + "learning_rate": 1.9192350569558184e-05, + "loss": 2.1966, + "step": 19380 + }, + { + "epoch": 0.12187064207569775, + "grad_norm": 8.168293952941895, + "learning_rate": 1.919193146861353e-05, + "loss": 2.3357, + "step": 19390 + }, + { + "epoch": 0.12193349439239487, + "grad_norm": 7.839327335357666, + "learning_rate": 1.9191512367668878e-05, + "loss": 2.0244, + "step": 19400 + }, + { + "epoch": 0.12199634670909199, + "grad_norm": 7.90061092376709, + "learning_rate": 1.9191093266724225e-05, + "loss": 2.1978, + "step": 19410 + }, + { + "epoch": 0.12205919902578909, + "grad_norm": 7.999143600463867, + "learning_rate": 1.9190674165779572e-05, + "loss": 2.2847, + "step": 19420 + }, + { + "epoch": 0.1221220513424862, + "grad_norm": 8.394352912902832, + "learning_rate": 1.9190255064834916e-05, + "loss": 2.2628, + "step": 19430 + }, + { + "epoch": 0.12218490365918332, + "grad_norm": 7.913904190063477, + "learning_rate": 1.9189835963890263e-05, + "loss": 2.064, + "step": 19440 + }, + { + "epoch": 0.12224775597588042, + "grad_norm": 7.781804084777832, + "learning_rate": 1.918941686294561e-05, + "loss": 2.0578, + "step": 19450 + }, + { + "epoch": 0.12231060829257753, + "grad_norm": 8.752440452575684, + "learning_rate": 1.9188997762000957e-05, + "loss": 2.3629, + "step": 19460 + }, + { + "epoch": 0.12237346060927465, + "grad_norm": 6.962963104248047, + "learning_rate": 1.9188578661056304e-05, + "loss": 2.1903, + "step": 19470 + }, + { + "epoch": 0.12243631292597175, + "grad_norm": 7.6390862464904785, + "learning_rate": 1.918815956011165e-05, + "loss": 2.0745, + "step": 19480 + }, + { + "epoch": 0.12249916524266886, + "grad_norm": 7.179372310638428, + "learning_rate": 1.9187740459166998e-05, + "loss": 2.3104, + "step": 19490 + }, + { + "epoch": 0.12256201755936598, + "grad_norm": 
7.717180252075195, + "learning_rate": 1.9187321358222345e-05, + "loss": 2.0472, + "step": 19500 + }, + { + "epoch": 0.1226248698760631, + "grad_norm": 7.581270217895508, + "learning_rate": 1.918690225727769e-05, + "loss": 2.2385, + "step": 19510 + }, + { + "epoch": 0.1226877221927602, + "grad_norm": 7.664039611816406, + "learning_rate": 1.9186483156333036e-05, + "loss": 2.1226, + "step": 19520 + }, + { + "epoch": 0.12275057450945731, + "grad_norm": 9.040067672729492, + "learning_rate": 1.9186105965482847e-05, + "loss": 2.1236, + "step": 19530 + }, + { + "epoch": 0.12281342682615443, + "grad_norm": 7.814372539520264, + "learning_rate": 1.9185686864538194e-05, + "loss": 1.9381, + "step": 19540 + }, + { + "epoch": 0.12287627914285153, + "grad_norm": 8.149017333984375, + "learning_rate": 1.918526776359354e-05, + "loss": 2.1876, + "step": 19550 + }, + { + "epoch": 0.12293913145954864, + "grad_norm": 7.6349310874938965, + "learning_rate": 1.918484866264889e-05, + "loss": 2.2091, + "step": 19560 + }, + { + "epoch": 0.12300198377624576, + "grad_norm": 8.281112670898438, + "learning_rate": 1.9184429561704236e-05, + "loss": 2.1029, + "step": 19570 + }, + { + "epoch": 0.12306483609294286, + "grad_norm": 7.725566387176514, + "learning_rate": 1.9184010460759583e-05, + "loss": 2.0583, + "step": 19580 + }, + { + "epoch": 0.12312768840963997, + "grad_norm": 8.579814910888672, + "learning_rate": 1.9183591359814926e-05, + "loss": 2.4456, + "step": 19590 + }, + { + "epoch": 0.12319054072633709, + "grad_norm": 9.688976287841797, + "learning_rate": 1.9183172258870273e-05, + "loss": 2.2219, + "step": 19600 + }, + { + "epoch": 0.12325339304303419, + "grad_norm": 6.544898986816406, + "learning_rate": 1.918275315792562e-05, + "loss": 2.0028, + "step": 19610 + }, + { + "epoch": 0.1233162453597313, + "grad_norm": 8.4871244430542, + "learning_rate": 1.9182334056980967e-05, + "loss": 2.2076, + "step": 19620 + }, + { + "epoch": 0.12337909767642842, + "grad_norm": 6.6899518966674805, + 
"learning_rate": 1.918191495603631e-05, + "loss": 2.4654, + "step": 19630 + }, + { + "epoch": 0.12344194999312552, + "grad_norm": 6.7486982345581055, + "learning_rate": 1.9181495855091658e-05, + "loss": 2.347, + "step": 19640 + }, + { + "epoch": 0.12350480230982264, + "grad_norm": 8.075026512145996, + "learning_rate": 1.9181076754147005e-05, + "loss": 2.0383, + "step": 19650 + }, + { + "epoch": 0.12356765462651975, + "grad_norm": 8.204153060913086, + "learning_rate": 1.9180657653202352e-05, + "loss": 2.2389, + "step": 19660 + }, + { + "epoch": 0.12363050694321687, + "grad_norm": 7.5068230628967285, + "learning_rate": 1.9180238552257696e-05, + "loss": 1.9144, + "step": 19670 + }, + { + "epoch": 0.12369335925991397, + "grad_norm": 7.916260242462158, + "learning_rate": 1.9179819451313043e-05, + "loss": 2.095, + "step": 19680 + }, + { + "epoch": 0.12375621157661108, + "grad_norm": 7.666872501373291, + "learning_rate": 1.917940035036839e-05, + "loss": 2.2128, + "step": 19690 + }, + { + "epoch": 0.1238190638933082, + "grad_norm": 7.435920238494873, + "learning_rate": 1.9178981249423737e-05, + "loss": 2.0676, + "step": 19700 + }, + { + "epoch": 0.1238819162100053, + "grad_norm": 6.545831680297852, + "learning_rate": 1.9178562148479084e-05, + "loss": 2.1508, + "step": 19710 + }, + { + "epoch": 0.12394476852670241, + "grad_norm": 7.308215618133545, + "learning_rate": 1.917814304753443e-05, + "loss": 2.1635, + "step": 19720 + }, + { + "epoch": 0.12400762084339953, + "grad_norm": 7.002355098724365, + "learning_rate": 1.917772394658978e-05, + "loss": 2.0521, + "step": 19730 + }, + { + "epoch": 0.12407047316009663, + "grad_norm": 7.835935592651367, + "learning_rate": 1.9177304845645122e-05, + "loss": 2.0145, + "step": 19740 + }, + { + "epoch": 0.12413332547679375, + "grad_norm": 7.6545586585998535, + "learning_rate": 1.917688574470047e-05, + "loss": 2.123, + "step": 19750 + }, + { + "epoch": 0.12419617779349086, + "grad_norm": 8.699694633483887, + "learning_rate": 
1.9176466643755816e-05, + "loss": 2.1843, + "step": 19760 + }, + { + "epoch": 0.12425903011018796, + "grad_norm": 7.266550064086914, + "learning_rate": 1.9176047542811163e-05, + "loss": 2.3133, + "step": 19770 + }, + { + "epoch": 0.12432188242688508, + "grad_norm": 5.948155879974365, + "learning_rate": 1.917562844186651e-05, + "loss": 2.1415, + "step": 19780 + }, + { + "epoch": 0.12438473474358219, + "grad_norm": 8.048948287963867, + "learning_rate": 1.9175209340921858e-05, + "loss": 2.0368, + "step": 19790 + }, + { + "epoch": 0.1244475870602793, + "grad_norm": 7.991379261016846, + "learning_rate": 1.9174790239977205e-05, + "loss": 2.1586, + "step": 19800 + }, + { + "epoch": 0.12451043937697641, + "grad_norm": 6.790831565856934, + "learning_rate": 1.9174371139032548e-05, + "loss": 1.9571, + "step": 19810 + }, + { + "epoch": 0.12457329169367352, + "grad_norm": 8.452643394470215, + "learning_rate": 1.9173952038087895e-05, + "loss": 1.9298, + "step": 19820 + }, + { + "epoch": 0.12463614401037064, + "grad_norm": 8.183903694152832, + "learning_rate": 1.9173532937143242e-05, + "loss": 2.3263, + "step": 19830 + }, + { + "epoch": 0.12469899632706774, + "grad_norm": 7.026977062225342, + "learning_rate": 1.917311383619859e-05, + "loss": 2.0561, + "step": 19840 + }, + { + "epoch": 0.12476184864376486, + "grad_norm": 7.824635982513428, + "learning_rate": 1.9172694735253933e-05, + "loss": 2.0271, + "step": 19850 + }, + { + "epoch": 0.12482470096046197, + "grad_norm": 7.062274932861328, + "learning_rate": 1.917227563430928e-05, + "loss": 2.2787, + "step": 19860 + }, + { + "epoch": 0.12488755327715907, + "grad_norm": 7.3444294929504395, + "learning_rate": 1.9171856533364627e-05, + "loss": 2.2993, + "step": 19870 + }, + { + "epoch": 0.12495040559385619, + "grad_norm": 7.4288740158081055, + "learning_rate": 1.9171437432419974e-05, + "loss": 1.8584, + "step": 19880 + }, + { + "epoch": 0.1250132579105533, + "grad_norm": 7.470634937286377, + "learning_rate": 1.917101833147532e-05, + 
"loss": 2.0709, + "step": 19890 + }, + { + "epoch": 0.1250761102272504, + "grad_norm": 7.890097618103027, + "learning_rate": 1.9170599230530665e-05, + "loss": 2.0735, + "step": 19900 + }, + { + "epoch": 0.12513896254394752, + "grad_norm": 8.124094009399414, + "learning_rate": 1.9170180129586012e-05, + "loss": 2.2584, + "step": 19910 + }, + { + "epoch": 0.12520181486064463, + "grad_norm": 7.2056474685668945, + "learning_rate": 1.916976102864136e-05, + "loss": 2.0531, + "step": 19920 + }, + { + "epoch": 0.12526466717734175, + "grad_norm": 7.099191188812256, + "learning_rate": 1.9169341927696706e-05, + "loss": 2.2185, + "step": 19930 + }, + { + "epoch": 0.12532751949403886, + "grad_norm": 8.294858932495117, + "learning_rate": 1.9168922826752053e-05, + "loss": 2.173, + "step": 19940 + }, + { + "epoch": 0.12539037181073595, + "grad_norm": 8.263566017150879, + "learning_rate": 1.91685037258074e-05, + "loss": 2.2138, + "step": 19950 + }, + { + "epoch": 0.12545322412743307, + "grad_norm": 7.2847418785095215, + "learning_rate": 1.9168084624862748e-05, + "loss": 2.2492, + "step": 19960 + }, + { + "epoch": 0.12551607644413018, + "grad_norm": 7.45059061050415, + "learning_rate": 1.9167665523918095e-05, + "loss": 2.3967, + "step": 19970 + }, + { + "epoch": 0.1255789287608273, + "grad_norm": 8.305978775024414, + "learning_rate": 1.916724642297344e-05, + "loss": 2.075, + "step": 19980 + }, + { + "epoch": 0.1256417810775244, + "grad_norm": 7.666145324707031, + "learning_rate": 1.9166827322028785e-05, + "loss": 2.3947, + "step": 19990 + }, + { + "epoch": 0.12570463339422153, + "grad_norm": 6.912258148193359, + "learning_rate": 1.9166408221084132e-05, + "loss": 2.2309, + "step": 20000 + }, + { + "epoch": 0.12576748571091861, + "grad_norm": 7.005916595458984, + "learning_rate": 1.916598912013948e-05, + "loss": 2.3093, + "step": 20010 + }, + { + "epoch": 0.12583033802761573, + "grad_norm": 7.088340759277344, + "learning_rate": 1.9165570019194827e-05, + "loss": 2.3978, + "step": 20020 
+ }, + { + "epoch": 0.12589319034431284, + "grad_norm": 7.318819046020508, + "learning_rate": 1.916515091825017e-05, + "loss": 1.9451, + "step": 20030 + }, + { + "epoch": 0.12595604266100996, + "grad_norm": 7.074946880340576, + "learning_rate": 1.9164731817305517e-05, + "loss": 2.2586, + "step": 20040 + }, + { + "epoch": 0.12601889497770707, + "grad_norm": 6.7742533683776855, + "learning_rate": 1.9164312716360864e-05, + "loss": 2.0808, + "step": 20050 + }, + { + "epoch": 0.1260817472944042, + "grad_norm": 8.263649940490723, + "learning_rate": 1.916389361541621e-05, + "loss": 2.1158, + "step": 20060 + }, + { + "epoch": 0.12614459961110128, + "grad_norm": 7.028926372528076, + "learning_rate": 1.9163474514471555e-05, + "loss": 2.1163, + "step": 20070 + }, + { + "epoch": 0.1262074519277984, + "grad_norm": 6.812427043914795, + "learning_rate": 1.9163055413526902e-05, + "loss": 1.9023, + "step": 20080 + }, + { + "epoch": 0.1262703042444955, + "grad_norm": 14.464795112609863, + "learning_rate": 1.916263631258225e-05, + "loss": 2.2503, + "step": 20090 + }, + { + "epoch": 0.12633315656119262, + "grad_norm": 7.922114372253418, + "learning_rate": 1.9162217211637596e-05, + "loss": 2.1397, + "step": 20100 + }, + { + "epoch": 0.12639600887788974, + "grad_norm": 6.583799362182617, + "learning_rate": 1.9161798110692943e-05, + "loss": 1.8107, + "step": 20110 + }, + { + "epoch": 0.12645886119458685, + "grad_norm": 6.8187432289123535, + "learning_rate": 1.9161379009748287e-05, + "loss": 2.1158, + "step": 20120 + }, + { + "epoch": 0.12652171351128397, + "grad_norm": 8.50075912475586, + "learning_rate": 1.9160959908803634e-05, + "loss": 2.341, + "step": 20130 + }, + { + "epoch": 0.12658456582798105, + "grad_norm": 8.930004119873047, + "learning_rate": 1.916054080785898e-05, + "loss": 2.3068, + "step": 20140 + }, + { + "epoch": 0.12664741814467817, + "grad_norm": 7.269128799438477, + "learning_rate": 1.916012170691433e-05, + "loss": 1.9076, + "step": 20150 + }, + { + "epoch": 
0.12671027046137529, + "grad_norm": 7.562623977661133, + "learning_rate": 1.9159702605969675e-05, + "loss": 2.26, + "step": 20160 + }, + { + "epoch": 0.1267731227780724, + "grad_norm": 9.525056838989258, + "learning_rate": 1.9159283505025022e-05, + "loss": 2.3676, + "step": 20170 + }, + { + "epoch": 0.12683597509476952, + "grad_norm": 7.904518127441406, + "learning_rate": 1.915886440408037e-05, + "loss": 2.291, + "step": 20180 + }, + { + "epoch": 0.12689882741146663, + "grad_norm": 8.256309509277344, + "learning_rate": 1.9158445303135717e-05, + "loss": 2.0052, + "step": 20190 + }, + { + "epoch": 0.12696167972816372, + "grad_norm": 8.02798080444336, + "learning_rate": 1.9158026202191064e-05, + "loss": 2.4553, + "step": 20200 + }, + { + "epoch": 0.12702453204486083, + "grad_norm": 7.898191928863525, + "learning_rate": 1.9157607101246407e-05, + "loss": 2.126, + "step": 20210 + }, + { + "epoch": 0.12708738436155795, + "grad_norm": 6.945441246032715, + "learning_rate": 1.9157188000301754e-05, + "loss": 2.0418, + "step": 20220 + }, + { + "epoch": 0.12715023667825506, + "grad_norm": 6.388503074645996, + "learning_rate": 1.91567688993571e-05, + "loss": 2.0764, + "step": 20230 + }, + { + "epoch": 0.12721308899495218, + "grad_norm": 8.389837265014648, + "learning_rate": 1.915634979841245e-05, + "loss": 2.1635, + "step": 20240 + }, + { + "epoch": 0.1272759413116493, + "grad_norm": 7.304172992706299, + "learning_rate": 1.9155930697467792e-05, + "loss": 2.0955, + "step": 20250 + }, + { + "epoch": 0.1273387936283464, + "grad_norm": 7.5525031089782715, + "learning_rate": 1.915551159652314e-05, + "loss": 2.2257, + "step": 20260 + }, + { + "epoch": 0.1274016459450435, + "grad_norm": 7.935617446899414, + "learning_rate": 1.9155092495578486e-05, + "loss": 2.252, + "step": 20270 + }, + { + "epoch": 0.1274644982617406, + "grad_norm": 7.920311450958252, + "learning_rate": 1.9154673394633833e-05, + "loss": 2.1841, + "step": 20280 + }, + { + "epoch": 0.12752735057843773, + "grad_norm": 
7.139548301696777, + "learning_rate": 1.9154254293689177e-05, + "loss": 2.0093, + "step": 20290 + }, + { + "epoch": 0.12759020289513484, + "grad_norm": 9.367383003234863, + "learning_rate": 1.9153835192744524e-05, + "loss": 2.1182, + "step": 20300 + }, + { + "epoch": 0.12765305521183196, + "grad_norm": 7.5193328857421875, + "learning_rate": 1.915341609179987e-05, + "loss": 2.3039, + "step": 20310 + }, + { + "epoch": 0.12771590752852907, + "grad_norm": 7.702613353729248, + "learning_rate": 1.915299699085522e-05, + "loss": 2.3208, + "step": 20320 + }, + { + "epoch": 0.12777875984522616, + "grad_norm": 7.226729393005371, + "learning_rate": 1.9152577889910565e-05, + "loss": 2.3608, + "step": 20330 + }, + { + "epoch": 0.12784161216192327, + "grad_norm": 8.596226692199707, + "learning_rate": 1.9152158788965913e-05, + "loss": 2.5536, + "step": 20340 + }, + { + "epoch": 0.1279044644786204, + "grad_norm": 7.296021938323975, + "learning_rate": 1.915173968802126e-05, + "loss": 2.3818, + "step": 20350 + }, + { + "epoch": 0.1279673167953175, + "grad_norm": 7.413475036621094, + "learning_rate": 1.9151320587076607e-05, + "loss": 2.1525, + "step": 20360 + }, + { + "epoch": 0.12803016911201462, + "grad_norm": 7.288099765777588, + "learning_rate": 1.915090148613195e-05, + "loss": 2.1861, + "step": 20370 + }, + { + "epoch": 0.12809302142871173, + "grad_norm": 8.350249290466309, + "learning_rate": 1.9150482385187297e-05, + "loss": 2.0005, + "step": 20380 + }, + { + "epoch": 0.12815587374540882, + "grad_norm": 7.46809720993042, + "learning_rate": 1.9150063284242644e-05, + "loss": 2.027, + "step": 20390 + }, + { + "epoch": 0.12821872606210594, + "grad_norm": 7.459334850311279, + "learning_rate": 1.914964418329799e-05, + "loss": 1.9684, + "step": 20400 + }, + { + "epoch": 0.12828157837880305, + "grad_norm": 7.063770771026611, + "learning_rate": 1.914922508235334e-05, + "loss": 2.1614, + "step": 20410 + }, + { + "epoch": 0.12834443069550017, + "grad_norm": 7.142075061798096, + 
"learning_rate": 1.9148805981408686e-05, + "loss": 2.2335, + "step": 20420 + }, + { + "epoch": 0.12840728301219728, + "grad_norm": 7.43182897567749, + "learning_rate": 1.914838688046403e-05, + "loss": 2.0091, + "step": 20430 + }, + { + "epoch": 0.1284701353288944, + "grad_norm": 7.314360618591309, + "learning_rate": 1.9147967779519376e-05, + "loss": 2.1786, + "step": 20440 + }, + { + "epoch": 0.1285329876455915, + "grad_norm": 8.008227348327637, + "learning_rate": 1.9147548678574724e-05, + "loss": 2.0012, + "step": 20450 + }, + { + "epoch": 0.1285958399622886, + "grad_norm": 9.939228057861328, + "learning_rate": 1.914712957763007e-05, + "loss": 2.0807, + "step": 20460 + }, + { + "epoch": 0.12865869227898571, + "grad_norm": 6.3646697998046875, + "learning_rate": 1.9146710476685414e-05, + "loss": 2.0031, + "step": 20470 + }, + { + "epoch": 0.12872154459568283, + "grad_norm": 7.245133876800537, + "learning_rate": 1.914629137574076e-05, + "loss": 2.2448, + "step": 20480 + }, + { + "epoch": 0.12878439691237994, + "grad_norm": 6.556634902954102, + "learning_rate": 1.914587227479611e-05, + "loss": 2.0929, + "step": 20490 + }, + { + "epoch": 0.12884724922907706, + "grad_norm": 7.852427005767822, + "learning_rate": 1.9145453173851455e-05, + "loss": 2.4566, + "step": 20500 + }, + { + "epoch": 0.12891010154577418, + "grad_norm": 6.728714466094971, + "learning_rate": 1.9145034072906803e-05, + "loss": 2.282, + "step": 20510 + }, + { + "epoch": 0.12897295386247126, + "grad_norm": 6.429920673370361, + "learning_rate": 1.9144614971962146e-05, + "loss": 2.0266, + "step": 20520 + }, + { + "epoch": 0.12903580617916838, + "grad_norm": 7.419079780578613, + "learning_rate": 1.9144195871017493e-05, + "loss": 2.3467, + "step": 20530 + }, + { + "epoch": 0.1290986584958655, + "grad_norm": 7.593814373016357, + "learning_rate": 1.914377677007284e-05, + "loss": 2.2039, + "step": 20540 + }, + { + "epoch": 0.1291615108125626, + "grad_norm": 34.75748062133789, + "learning_rate": 
1.9143357669128187e-05, + "loss": 2.3577, + "step": 20550 + }, + { + "epoch": 0.12922436312925972, + "grad_norm": 8.49825668334961, + "learning_rate": 1.9142938568183535e-05, + "loss": 2.0563, + "step": 20560 + }, + { + "epoch": 0.12928721544595684, + "grad_norm": 7.967658996582031, + "learning_rate": 1.914251946723888e-05, + "loss": 2.0841, + "step": 20570 + }, + { + "epoch": 0.12935006776265393, + "grad_norm": 8.534242630004883, + "learning_rate": 1.914210036629423e-05, + "loss": 2.2224, + "step": 20580 + }, + { + "epoch": 0.12941292007935104, + "grad_norm": 7.604470252990723, + "learning_rate": 1.9141681265349576e-05, + "loss": 2.0527, + "step": 20590 + }, + { + "epoch": 0.12947577239604816, + "grad_norm": 8.015196800231934, + "learning_rate": 1.9141262164404923e-05, + "loss": 2.1757, + "step": 20600 + }, + { + "epoch": 0.12953862471274527, + "grad_norm": 7.4464111328125, + "learning_rate": 1.9140843063460266e-05, + "loss": 2.1135, + "step": 20610 + }, + { + "epoch": 0.12960147702944239, + "grad_norm": 6.638156414031982, + "learning_rate": 1.9140423962515614e-05, + "loss": 2.0033, + "step": 20620 + }, + { + "epoch": 0.1296643293461395, + "grad_norm": 7.73036527633667, + "learning_rate": 1.914000486157096e-05, + "loss": 2.0447, + "step": 20630 + }, + { + "epoch": 0.12972718166283662, + "grad_norm": 6.8829216957092285, + "learning_rate": 1.9139585760626308e-05, + "loss": 2.1127, + "step": 20640 + }, + { + "epoch": 0.1297900339795337, + "grad_norm": 7.297530174255371, + "learning_rate": 1.913916665968165e-05, + "loss": 2.2805, + "step": 20650 + }, + { + "epoch": 0.12985288629623082, + "grad_norm": 9.506532669067383, + "learning_rate": 1.9138747558737e-05, + "loss": 2.3277, + "step": 20660 + }, + { + "epoch": 0.12991573861292793, + "grad_norm": 7.570408821105957, + "learning_rate": 1.9138328457792346e-05, + "loss": 2.0331, + "step": 20670 + }, + { + "epoch": 0.12997859092962505, + "grad_norm": 6.746794700622559, + "learning_rate": 1.9137909356847693e-05, + "loss": 
1.9404, + "step": 20680 + }, + { + "epoch": 0.13004144324632216, + "grad_norm": 7.1945109367370605, + "learning_rate": 1.9137490255903036e-05, + "loss": 2.2295, + "step": 20690 + }, + { + "epoch": 0.13010429556301928, + "grad_norm": 8.07593059539795, + "learning_rate": 1.9137071154958383e-05, + "loss": 2.2345, + "step": 20700 + }, + { + "epoch": 0.13016714787971637, + "grad_norm": 7.70260763168335, + "learning_rate": 1.913665205401373e-05, + "loss": 2.3744, + "step": 20710 + }, + { + "epoch": 0.13023000019641348, + "grad_norm": 7.805847644805908, + "learning_rate": 1.9136232953069077e-05, + "loss": 2.2995, + "step": 20720 + }, + { + "epoch": 0.1302928525131106, + "grad_norm": 7.288343906402588, + "learning_rate": 1.9135813852124425e-05, + "loss": 2.2592, + "step": 20730 + }, + { + "epoch": 0.1303557048298077, + "grad_norm": 8.316977500915527, + "learning_rate": 1.913539475117977e-05, + "loss": 2.5339, + "step": 20740 + }, + { + "epoch": 0.13041855714650483, + "grad_norm": 7.602510452270508, + "learning_rate": 1.9134975650235115e-05, + "loss": 2.0968, + "step": 20750 + }, + { + "epoch": 0.13048140946320194, + "grad_norm": 7.784472465515137, + "learning_rate": 1.9134556549290462e-05, + "loss": 1.9854, + "step": 20760 + }, + { + "epoch": 0.13054426177989906, + "grad_norm": 6.507996559143066, + "learning_rate": 1.913413744834581e-05, + "loss": 2.0867, + "step": 20770 + }, + { + "epoch": 0.13060711409659614, + "grad_norm": 8.400556564331055, + "learning_rate": 1.9133718347401157e-05, + "loss": 2.1331, + "step": 20780 + }, + { + "epoch": 0.13066996641329326, + "grad_norm": 7.46922492980957, + "learning_rate": 1.9133299246456504e-05, + "loss": 2.0639, + "step": 20790 + }, + { + "epoch": 0.13073281872999037, + "grad_norm": 6.9902215003967285, + "learning_rate": 1.913288014551185e-05, + "loss": 2.1429, + "step": 20800 + }, + { + "epoch": 0.1307956710466875, + "grad_norm": 6.405163288116455, + "learning_rate": 1.9132461044567198e-05, + "loss": 2.2893, + "step": 20810 + }, + 
{ + "epoch": 0.1308585233633846, + "grad_norm": 9.020237922668457, + "learning_rate": 1.9132041943622545e-05, + "loss": 2.1135, + "step": 20820 + }, + { + "epoch": 0.13092137568008172, + "grad_norm": 8.635034561157227, + "learning_rate": 1.913162284267789e-05, + "loss": 2.1556, + "step": 20830 + }, + { + "epoch": 0.1309842279967788, + "grad_norm": 8.830340385437012, + "learning_rate": 1.9131203741733236e-05, + "loss": 2.3283, + "step": 20840 + }, + { + "epoch": 0.13104708031347592, + "grad_norm": 8.042695045471191, + "learning_rate": 1.9130784640788583e-05, + "loss": 2.0542, + "step": 20850 + }, + { + "epoch": 0.13110993263017304, + "grad_norm": 8.38254451751709, + "learning_rate": 1.913036553984393e-05, + "loss": 2.2536, + "step": 20860 + }, + { + "epoch": 0.13117278494687015, + "grad_norm": 7.26369571685791, + "learning_rate": 1.9129946438899273e-05, + "loss": 2.4977, + "step": 20870 + }, + { + "epoch": 0.13123563726356727, + "grad_norm": 7.492801666259766, + "learning_rate": 1.912952733795462e-05, + "loss": 2.1608, + "step": 20880 + }, + { + "epoch": 0.13129848958026438, + "grad_norm": 8.494919776916504, + "learning_rate": 1.9129108237009968e-05, + "loss": 2.2264, + "step": 20890 + }, + { + "epoch": 0.13136134189696147, + "grad_norm": 6.754851818084717, + "learning_rate": 1.9128689136065315e-05, + "loss": 2.052, + "step": 20900 + }, + { + "epoch": 0.13142419421365859, + "grad_norm": 7.529036521911621, + "learning_rate": 1.912827003512066e-05, + "loss": 2.1838, + "step": 20910 + }, + { + "epoch": 0.1314870465303557, + "grad_norm": 7.61317253112793, + "learning_rate": 1.9127850934176005e-05, + "loss": 2.3485, + "step": 20920 + }, + { + "epoch": 0.13154989884705282, + "grad_norm": 7.339181423187256, + "learning_rate": 1.9127431833231352e-05, + "loss": 2.0842, + "step": 20930 + }, + { + "epoch": 0.13161275116374993, + "grad_norm": 7.108869552612305, + "learning_rate": 1.91270127322867e-05, + "loss": 2.0695, + "step": 20940 + }, + { + "epoch": 0.13167560348044705, + 
"grad_norm": 7.704329490661621, + "learning_rate": 1.9126593631342047e-05, + "loss": 2.1022, + "step": 20950 + }, + { + "epoch": 0.13173845579714416, + "grad_norm": 9.51086711883545, + "learning_rate": 1.9126174530397394e-05, + "loss": 1.9662, + "step": 20960 + }, + { + "epoch": 0.13180130811384125, + "grad_norm": 7.567604064941406, + "learning_rate": 1.912575542945274e-05, + "loss": 2.1206, + "step": 20970 + }, + { + "epoch": 0.13186416043053836, + "grad_norm": 7.458498001098633, + "learning_rate": 1.9125336328508088e-05, + "loss": 1.9615, + "step": 20980 + }, + { + "epoch": 0.13192701274723548, + "grad_norm": 7.41848087310791, + "learning_rate": 1.9124917227563435e-05, + "loss": 1.9544, + "step": 20990 + }, + { + "epoch": 0.1319898650639326, + "grad_norm": 7.613551139831543, + "learning_rate": 1.912449812661878e-05, + "loss": 1.8618, + "step": 21000 + }, + { + "epoch": 0.1320527173806297, + "grad_norm": 7.473199844360352, + "learning_rate": 1.9124079025674126e-05, + "loss": 2.0437, + "step": 21010 + }, + { + "epoch": 0.13211556969732682, + "grad_norm": 6.8990478515625, + "learning_rate": 1.9123659924729473e-05, + "loss": 2.159, + "step": 21020 + }, + { + "epoch": 0.1321784220140239, + "grad_norm": 8.175252914428711, + "learning_rate": 1.912324082378482e-05, + "loss": 1.9783, + "step": 21030 + }, + { + "epoch": 0.13224127433072103, + "grad_norm": 6.731335639953613, + "learning_rate": 1.9122821722840167e-05, + "loss": 1.9808, + "step": 21040 + }, + { + "epoch": 0.13230412664741814, + "grad_norm": 8.126704216003418, + "learning_rate": 1.912240262189551e-05, + "loss": 1.9968, + "step": 21050 + }, + { + "epoch": 0.13236697896411526, + "grad_norm": 8.616935729980469, + "learning_rate": 1.9121983520950858e-05, + "loss": 1.9534, + "step": 21060 + }, + { + "epoch": 0.13242983128081237, + "grad_norm": 8.366382598876953, + "learning_rate": 1.9121564420006205e-05, + "loss": 2.3624, + "step": 21070 + }, + { + "epoch": 0.1324926835975095, + "grad_norm": 6.051831245422363, + 
"learning_rate": 1.9121145319061552e-05, + "loss": 1.7983, + "step": 21080 + }, + { + "epoch": 0.1325555359142066, + "grad_norm": 6.883232593536377, + "learning_rate": 1.9120726218116895e-05, + "loss": 2.0166, + "step": 21090 + }, + { + "epoch": 0.1326183882309037, + "grad_norm": 8.676856994628906, + "learning_rate": 1.9120307117172242e-05, + "loss": 2.0505, + "step": 21100 + }, + { + "epoch": 0.1326812405476008, + "grad_norm": 7.5355987548828125, + "learning_rate": 1.911988801622759e-05, + "loss": 2.1162, + "step": 21110 + }, + { + "epoch": 0.13274409286429792, + "grad_norm": 7.496070384979248, + "learning_rate": 1.9119468915282937e-05, + "loss": 2.1305, + "step": 21120 + }, + { + "epoch": 0.13280694518099503, + "grad_norm": 8.2671537399292, + "learning_rate": 1.9119049814338284e-05, + "loss": 2.2587, + "step": 21130 + }, + { + "epoch": 0.13286979749769215, + "grad_norm": 7.576817512512207, + "learning_rate": 1.9118630713393627e-05, + "loss": 2.0564, + "step": 21140 + }, + { + "epoch": 0.13293264981438926, + "grad_norm": 7.535472869873047, + "learning_rate": 1.9118211612448974e-05, + "loss": 2.0817, + "step": 21150 + }, + { + "epoch": 0.13299550213108635, + "grad_norm": 8.097841262817383, + "learning_rate": 1.911779251150432e-05, + "loss": 2.2972, + "step": 21160 + }, + { + "epoch": 0.13305835444778347, + "grad_norm": 22.04595947265625, + "learning_rate": 1.911737341055967e-05, + "loss": 2.1941, + "step": 21170 + }, + { + "epoch": 0.13312120676448058, + "grad_norm": 7.512133598327637, + "learning_rate": 1.9116954309615016e-05, + "loss": 2.1939, + "step": 21180 + }, + { + "epoch": 0.1331840590811777, + "grad_norm": 7.298905849456787, + "learning_rate": 1.9116535208670363e-05, + "loss": 2.2007, + "step": 21190 + }, + { + "epoch": 0.1332469113978748, + "grad_norm": 6.7467474937438965, + "learning_rate": 1.911611610772571e-05, + "loss": 2.2847, + "step": 21200 + }, + { + "epoch": 0.13330976371457193, + "grad_norm": 7.777745246887207, + "learning_rate": 
1.9115697006781057e-05, + "loss": 2.1764, + "step": 21210 + }, + { + "epoch": 0.13337261603126901, + "grad_norm": 7.369444370269775, + "learning_rate": 1.9115277905836404e-05, + "loss": 2.0733, + "step": 21220 + }, + { + "epoch": 0.13343546834796613, + "grad_norm": 7.144137382507324, + "learning_rate": 1.9114858804891748e-05, + "loss": 1.9866, + "step": 21230 + }, + { + "epoch": 0.13349832066466324, + "grad_norm": 8.029589653015137, + "learning_rate": 1.9114439703947095e-05, + "loss": 2.1095, + "step": 21240 + }, + { + "epoch": 0.13356117298136036, + "grad_norm": 7.28410530090332, + "learning_rate": 1.9114020603002442e-05, + "loss": 2.0821, + "step": 21250 + }, + { + "epoch": 0.13362402529805747, + "grad_norm": 7.584111213684082, + "learning_rate": 1.911360150205779e-05, + "loss": 2.043, + "step": 21260 + }, + { + "epoch": 0.1336868776147546, + "grad_norm": 8.032008171081543, + "learning_rate": 1.9113182401113132e-05, + "loss": 2.1231, + "step": 21270 + }, + { + "epoch": 0.1337497299314517, + "grad_norm": 8.244526863098145, + "learning_rate": 1.911276330016848e-05, + "loss": 2.183, + "step": 21280 + }, + { + "epoch": 0.1338125822481488, + "grad_norm": 7.711684703826904, + "learning_rate": 1.9112344199223827e-05, + "loss": 1.9712, + "step": 21290 + }, + { + "epoch": 0.1338754345648459, + "grad_norm": 7.845302104949951, + "learning_rate": 1.9111925098279174e-05, + "loss": 2.0885, + "step": 21300 + }, + { + "epoch": 0.13393828688154302, + "grad_norm": 7.101912498474121, + "learning_rate": 1.9111505997334517e-05, + "loss": 1.847, + "step": 21310 + }, + { + "epoch": 0.13400113919824014, + "grad_norm": 7.40846586227417, + "learning_rate": 1.9111086896389864e-05, + "loss": 2.432, + "step": 21320 + }, + { + "epoch": 0.13406399151493725, + "grad_norm": 6.370995044708252, + "learning_rate": 1.911066779544521e-05, + "loss": 1.9739, + "step": 21330 + }, + { + "epoch": 0.13412684383163437, + "grad_norm": 6.674936294555664, + "learning_rate": 1.911024869450056e-05, + "loss": 
2.1724, + "step": 21340 + }, + { + "epoch": 0.13418969614833146, + "grad_norm": 6.612419605255127, + "learning_rate": 1.9109829593555906e-05, + "loss": 2.0961, + "step": 21350 + }, + { + "epoch": 0.13425254846502857, + "grad_norm": 6.799704551696777, + "learning_rate": 1.9109410492611253e-05, + "loss": 1.9552, + "step": 21360 + }, + { + "epoch": 0.13431540078172569, + "grad_norm": 8.12215805053711, + "learning_rate": 1.9108991391666596e-05, + "loss": 2.1582, + "step": 21370 + }, + { + "epoch": 0.1343782530984228, + "grad_norm": 8.388773918151855, + "learning_rate": 1.9108572290721944e-05, + "loss": 2.2236, + "step": 21380 + }, + { + "epoch": 0.13444110541511992, + "grad_norm": 7.834544658660889, + "learning_rate": 1.910815318977729e-05, + "loss": 2.1054, + "step": 21390 + }, + { + "epoch": 0.13450395773181703, + "grad_norm": 8.504358291625977, + "learning_rate": 1.9107734088832638e-05, + "loss": 2.1659, + "step": 21400 + }, + { + "epoch": 0.13456681004851412, + "grad_norm": 8.723959922790527, + "learning_rate": 1.9107314987887985e-05, + "loss": 1.9973, + "step": 21410 + }, + { + "epoch": 0.13462966236521123, + "grad_norm": 19.787212371826172, + "learning_rate": 1.9106895886943332e-05, + "loss": 2.2176, + "step": 21420 + }, + { + "epoch": 0.13469251468190835, + "grad_norm": 7.724793910980225, + "learning_rate": 1.910647678599868e-05, + "loss": 1.9911, + "step": 21430 + }, + { + "epoch": 0.13475536699860546, + "grad_norm": 7.064187526702881, + "learning_rate": 1.9106057685054026e-05, + "loss": 2.1318, + "step": 21440 + }, + { + "epoch": 0.13481821931530258, + "grad_norm": 8.108419418334961, + "learning_rate": 1.910563858410937e-05, + "loss": 2.2516, + "step": 21450 + }, + { + "epoch": 0.1348810716319997, + "grad_norm": 8.397812843322754, + "learning_rate": 1.9105219483164717e-05, + "loss": 2.0927, + "step": 21460 + }, + { + "epoch": 0.1349439239486968, + "grad_norm": 6.9287567138671875, + "learning_rate": 1.9104800382220064e-05, + "loss": 2.0847, + "step": 21470 + }, 
+ { + "epoch": 0.1350067762653939, + "grad_norm": 14.46806526184082, + "learning_rate": 1.910438128127541e-05, + "loss": 2.0837, + "step": 21480 + }, + { + "epoch": 0.135069628582091, + "grad_norm": 8.349653244018555, + "learning_rate": 1.9103962180330755e-05, + "loss": 2.1776, + "step": 21490 + }, + { + "epoch": 0.13513248089878813, + "grad_norm": 7.929886817932129, + "learning_rate": 1.91035430793861e-05, + "loss": 2.2667, + "step": 21500 + }, + { + "epoch": 0.13519533321548524, + "grad_norm": 6.784626007080078, + "learning_rate": 1.910312397844145e-05, + "loss": 2.1803, + "step": 21510 + }, + { + "epoch": 0.13525818553218236, + "grad_norm": 7.63728141784668, + "learning_rate": 1.9102704877496796e-05, + "loss": 2.2395, + "step": 21520 + }, + { + "epoch": 0.13532103784887947, + "grad_norm": 7.283819675445557, + "learning_rate": 1.9102285776552143e-05, + "loss": 2.3257, + "step": 21530 + }, + { + "epoch": 0.13538389016557656, + "grad_norm": 6.891645431518555, + "learning_rate": 1.9101866675607486e-05, + "loss": 2.1525, + "step": 21540 + }, + { + "epoch": 0.13544674248227367, + "grad_norm": 7.524374008178711, + "learning_rate": 1.9101447574662834e-05, + "loss": 1.987, + "step": 21550 + }, + { + "epoch": 0.1355095947989708, + "grad_norm": 7.3678364753723145, + "learning_rate": 1.910102847371818e-05, + "loss": 2.0044, + "step": 21560 + }, + { + "epoch": 0.1355724471156679, + "grad_norm": 7.025672912597656, + "learning_rate": 1.9100609372773528e-05, + "loss": 1.9908, + "step": 21570 + }, + { + "epoch": 0.13563529943236502, + "grad_norm": 6.5765204429626465, + "learning_rate": 1.9100190271828875e-05, + "loss": 1.9447, + "step": 21580 + }, + { + "epoch": 0.13569815174906213, + "grad_norm": 8.267107009887695, + "learning_rate": 1.9099771170884222e-05, + "loss": 2.3063, + "step": 21590 + }, + { + "epoch": 0.13576100406575925, + "grad_norm": 7.098850250244141, + "learning_rate": 1.909935206993957e-05, + "loss": 2.1153, + "step": 21600 + }, + { + "epoch": 
0.13582385638245634, + "grad_norm": 7.305449962615967, + "learning_rate": 1.9098932968994916e-05, + "loss": 2.1471, + "step": 21610 + }, + { + "epoch": 0.13588670869915345, + "grad_norm": 7.206968307495117, + "learning_rate": 1.909851386805026e-05, + "loss": 2.2529, + "step": 21620 + }, + { + "epoch": 0.13594956101585057, + "grad_norm": 7.390930652618408, + "learning_rate": 1.9098094767105607e-05, + "loss": 2.094, + "step": 21630 + }, + { + "epoch": 0.13601241333254768, + "grad_norm": 7.941120624542236, + "learning_rate": 1.9097675666160954e-05, + "loss": 2.1407, + "step": 21640 + }, + { + "epoch": 0.1360752656492448, + "grad_norm": 7.612168788909912, + "learning_rate": 1.90972565652163e-05, + "loss": 2.1267, + "step": 21650 + }, + { + "epoch": 0.1361381179659419, + "grad_norm": 8.423344612121582, + "learning_rate": 1.9096837464271648e-05, + "loss": 1.9587, + "step": 21660 + }, + { + "epoch": 0.136200970282639, + "grad_norm": 6.990922451019287, + "learning_rate": 1.909641836332699e-05, + "loss": 2.2546, + "step": 21670 + }, + { + "epoch": 0.13626382259933612, + "grad_norm": 10.280035972595215, + "learning_rate": 1.909599926238234e-05, + "loss": 2.3914, + "step": 21680 + }, + { + "epoch": 0.13632667491603323, + "grad_norm": 7.910796642303467, + "learning_rate": 1.9095580161437686e-05, + "loss": 2.1273, + "step": 21690 + }, + { + "epoch": 0.13638952723273035, + "grad_norm": 7.267086982727051, + "learning_rate": 1.9095161060493033e-05, + "loss": 2.0764, + "step": 21700 + }, + { + "epoch": 0.13645237954942746, + "grad_norm": 7.074910640716553, + "learning_rate": 1.9094783869642844e-05, + "loss": 2.1828, + "step": 21710 + }, + { + "epoch": 0.13651523186612458, + "grad_norm": 8.144148826599121, + "learning_rate": 1.909436476869819e-05, + "loss": 2.1668, + "step": 21720 + }, + { + "epoch": 0.13657808418282166, + "grad_norm": 7.03750467300415, + "learning_rate": 1.909394566775354e-05, + "loss": 2.1083, + "step": 21730 + }, + { + "epoch": 0.13664093649951878, + "grad_norm": 
7.418277263641357, + "learning_rate": 1.9093526566808882e-05, + "loss": 2.2161, + "step": 21740 + }, + { + "epoch": 0.1367037888162159, + "grad_norm": 10.619608879089355, + "learning_rate": 1.909310746586423e-05, + "loss": 2.0044, + "step": 21750 + }, + { + "epoch": 0.136766641132913, + "grad_norm": 7.455464839935303, + "learning_rate": 1.9092688364919576e-05, + "loss": 2.2266, + "step": 21760 + }, + { + "epoch": 0.13682949344961012, + "grad_norm": 9.03001594543457, + "learning_rate": 1.9092269263974923e-05, + "loss": 2.2673, + "step": 21770 + }, + { + "epoch": 0.13689234576630724, + "grad_norm": 7.140875339508057, + "learning_rate": 1.909185016303027e-05, + "loss": 2.1292, + "step": 21780 + }, + { + "epoch": 0.13695519808300435, + "grad_norm": 7.704232692718506, + "learning_rate": 1.9091431062085614e-05, + "loss": 2.3453, + "step": 21790 + }, + { + "epoch": 0.13701805039970144, + "grad_norm": 7.573914051055908, + "learning_rate": 1.909101196114096e-05, + "loss": 2.0991, + "step": 21800 + }, + { + "epoch": 0.13708090271639856, + "grad_norm": 8.799951553344727, + "learning_rate": 1.9090592860196308e-05, + "loss": 2.3587, + "step": 21810 + }, + { + "epoch": 0.13714375503309567, + "grad_norm": 6.665711402893066, + "learning_rate": 1.9090173759251655e-05, + "loss": 2.3124, + "step": 21820 + }, + { + "epoch": 0.1372066073497928, + "grad_norm": 7.785305976867676, + "learning_rate": 1.9089754658307e-05, + "loss": 2.323, + "step": 21830 + }, + { + "epoch": 0.1372694596664899, + "grad_norm": 7.521659851074219, + "learning_rate": 1.9089335557362346e-05, + "loss": 2.1861, + "step": 21840 + }, + { + "epoch": 0.13733231198318702, + "grad_norm": 8.873821258544922, + "learning_rate": 1.9088916456417693e-05, + "loss": 2.3054, + "step": 21850 + }, + { + "epoch": 0.1373951642998841, + "grad_norm": 9.21068286895752, + "learning_rate": 1.908849735547304e-05, + "loss": 2.167, + "step": 21860 + }, + { + "epoch": 0.13745801661658122, + "grad_norm": 7.585288047790527, + "learning_rate": 
1.9088078254528387e-05, + "loss": 2.0737, + "step": 21870 + }, + { + "epoch": 0.13752086893327833, + "grad_norm": 8.20755672454834, + "learning_rate": 1.9087659153583734e-05, + "loss": 2.1882, + "step": 21880 + }, + { + "epoch": 0.13758372124997545, + "grad_norm": 7.105890274047852, + "learning_rate": 1.908724005263908e-05, + "loss": 2.1434, + "step": 21890 + }, + { + "epoch": 0.13764657356667256, + "grad_norm": 6.78985071182251, + "learning_rate": 1.908682095169443e-05, + "loss": 2.3267, + "step": 21900 + }, + { + "epoch": 0.13770942588336968, + "grad_norm": 8.069618225097656, + "learning_rate": 1.9086401850749775e-05, + "loss": 2.0262, + "step": 21910 + }, + { + "epoch": 0.13777227820006677, + "grad_norm": 7.240670680999756, + "learning_rate": 1.908598274980512e-05, + "loss": 2.1662, + "step": 21920 + }, + { + "epoch": 0.13783513051676388, + "grad_norm": 7.4934868812561035, + "learning_rate": 1.9085563648860466e-05, + "loss": 2.055, + "step": 21930 + }, + { + "epoch": 0.137897982833461, + "grad_norm": 8.327574729919434, + "learning_rate": 1.9085144547915813e-05, + "loss": 2.1307, + "step": 21940 + }, + { + "epoch": 0.1379608351501581, + "grad_norm": 7.624167442321777, + "learning_rate": 1.908472544697116e-05, + "loss": 2.144, + "step": 21950 + }, + { + "epoch": 0.13802368746685523, + "grad_norm": 8.745718955993652, + "learning_rate": 1.9084306346026507e-05, + "loss": 2.1541, + "step": 21960 + }, + { + "epoch": 0.13808653978355234, + "grad_norm": 7.69420051574707, + "learning_rate": 1.908388724508185e-05, + "loss": 2.1109, + "step": 21970 + }, + { + "epoch": 0.13814939210024946, + "grad_norm": 7.407751560211182, + "learning_rate": 1.9083468144137198e-05, + "loss": 2.2494, + "step": 21980 + }, + { + "epoch": 0.13821224441694654, + "grad_norm": 7.729306697845459, + "learning_rate": 1.9083049043192545e-05, + "loss": 2.1493, + "step": 21990 + }, + { + "epoch": 0.13827509673364366, + "grad_norm": 7.324053764343262, + "learning_rate": 1.9082629942247892e-05, + "loss": 
2.0412, + "step": 22000 + }, + { + "epoch": 0.13833794905034077, + "grad_norm": 6.690671443939209, + "learning_rate": 1.9082210841303236e-05, + "loss": 1.9636, + "step": 22010 + }, + { + "epoch": 0.1384008013670379, + "grad_norm": 6.156836986541748, + "learning_rate": 1.9081791740358583e-05, + "loss": 2.1582, + "step": 22020 + }, + { + "epoch": 0.138463653683735, + "grad_norm": 7.035289287567139, + "learning_rate": 1.908137263941393e-05, + "loss": 2.1814, + "step": 22030 + }, + { + "epoch": 0.13852650600043212, + "grad_norm": 7.178378582000732, + "learning_rate": 1.9080953538469277e-05, + "loss": 2.0645, + "step": 22040 + }, + { + "epoch": 0.1385893583171292, + "grad_norm": 7.521155834197998, + "learning_rate": 1.9080534437524624e-05, + "loss": 2.1186, + "step": 22050 + }, + { + "epoch": 0.13865221063382632, + "grad_norm": 7.570018291473389, + "learning_rate": 1.908011533657997e-05, + "loss": 2.1362, + "step": 22060 + }, + { + "epoch": 0.13871506295052344, + "grad_norm": 7.197385787963867, + "learning_rate": 1.9079696235635315e-05, + "loss": 2.1849, + "step": 22070 + }, + { + "epoch": 0.13877791526722055, + "grad_norm": 7.9512939453125, + "learning_rate": 1.9079277134690662e-05, + "loss": 2.4368, + "step": 22080 + }, + { + "epoch": 0.13884076758391767, + "grad_norm": 8.198320388793945, + "learning_rate": 1.907885803374601e-05, + "loss": 2.2081, + "step": 22090 + }, + { + "epoch": 0.13890361990061478, + "grad_norm": 8.728532791137695, + "learning_rate": 1.9078438932801356e-05, + "loss": 2.3773, + "step": 22100 + }, + { + "epoch": 0.1389664722173119, + "grad_norm": 7.970332145690918, + "learning_rate": 1.9078019831856703e-05, + "loss": 2.1445, + "step": 22110 + }, + { + "epoch": 0.13902932453400899, + "grad_norm": 7.041906833648682, + "learning_rate": 1.907760073091205e-05, + "loss": 2.1213, + "step": 22120 + }, + { + "epoch": 0.1390921768507061, + "grad_norm": 6.955906867980957, + "learning_rate": 1.9077181629967397e-05, + "loss": 2.0388, + "step": 22130 + }, + { + 
"epoch": 0.13915502916740322, + "grad_norm": 8.771379470825195, + "learning_rate": 1.907676252902274e-05, + "loss": 2.1978, + "step": 22140 + }, + { + "epoch": 0.13921788148410033, + "grad_norm": 7.178952693939209, + "learning_rate": 1.9076343428078088e-05, + "loss": 2.0043, + "step": 22150 + }, + { + "epoch": 0.13928073380079745, + "grad_norm": 7.710979461669922, + "learning_rate": 1.9075924327133435e-05, + "loss": 2.138, + "step": 22160 + }, + { + "epoch": 0.13934358611749456, + "grad_norm": 7.282996654510498, + "learning_rate": 1.9075505226188782e-05, + "loss": 2.1759, + "step": 22170 + }, + { + "epoch": 0.13940643843419165, + "grad_norm": 6.640257358551025, + "learning_rate": 1.907508612524413e-05, + "loss": 1.8781, + "step": 22180 + }, + { + "epoch": 0.13946929075088876, + "grad_norm": 7.761207580566406, + "learning_rate": 1.9074667024299473e-05, + "loss": 2.1495, + "step": 22190 + }, + { + "epoch": 0.13953214306758588, + "grad_norm": 7.394071102142334, + "learning_rate": 1.907424792335482e-05, + "loss": 2.0641, + "step": 22200 + }, + { + "epoch": 0.139594995384283, + "grad_norm": 6.498964309692383, + "learning_rate": 1.9073828822410167e-05, + "loss": 2.0303, + "step": 22210 + }, + { + "epoch": 0.1396578477009801, + "grad_norm": 7.536453723907471, + "learning_rate": 1.9073409721465514e-05, + "loss": 2.2705, + "step": 22220 + }, + { + "epoch": 0.13972070001767722, + "grad_norm": 7.092509746551514, + "learning_rate": 1.9072990620520858e-05, + "loss": 2.0013, + "step": 22230 + }, + { + "epoch": 0.1397835523343743, + "grad_norm": 8.55628776550293, + "learning_rate": 1.9072571519576205e-05, + "loss": 2.3906, + "step": 22240 + }, + { + "epoch": 0.13984640465107143, + "grad_norm": 8.0609130859375, + "learning_rate": 1.9072152418631552e-05, + "loss": 2.0266, + "step": 22250 + }, + { + "epoch": 0.13990925696776854, + "grad_norm": 7.505898475646973, + "learning_rate": 1.90717333176869e-05, + "loss": 2.1944, + "step": 22260 + }, + { + "epoch": 0.13997210928446566, + 
"grad_norm": 8.852445602416992, + "learning_rate": 1.9071314216742246e-05, + "loss": 2.0867, + "step": 22270 + }, + { + "epoch": 0.14003496160116277, + "grad_norm": 7.316661357879639, + "learning_rate": 1.9070895115797593e-05, + "loss": 2.1679, + "step": 22280 + }, + { + "epoch": 0.1400978139178599, + "grad_norm": 8.005709648132324, + "learning_rate": 1.907047601485294e-05, + "loss": 2.1251, + "step": 22290 + }, + { + "epoch": 0.140160666234557, + "grad_norm": 7.422688961029053, + "learning_rate": 1.9070056913908287e-05, + "loss": 1.8722, + "step": 22300 + }, + { + "epoch": 0.1402235185512541, + "grad_norm": 6.957630634307861, + "learning_rate": 1.906963781296363e-05, + "loss": 2.3425, + "step": 22310 + }, + { + "epoch": 0.1402863708679512, + "grad_norm": 7.596073627471924, + "learning_rate": 1.9069218712018978e-05, + "loss": 2.2905, + "step": 22320 + }, + { + "epoch": 0.14034922318464832, + "grad_norm": 7.504247188568115, + "learning_rate": 1.9068799611074325e-05, + "loss": 2.1704, + "step": 22330 + }, + { + "epoch": 0.14041207550134543, + "grad_norm": 6.982486248016357, + "learning_rate": 1.9068380510129672e-05, + "loss": 2.0466, + "step": 22340 + }, + { + "epoch": 0.14047492781804255, + "grad_norm": 6.621249675750732, + "learning_rate": 1.906796140918502e-05, + "loss": 2.1644, + "step": 22350 + }, + { + "epoch": 0.14053778013473966, + "grad_norm": 7.738133430480957, + "learning_rate": 1.9067542308240363e-05, + "loss": 2.1581, + "step": 22360 + }, + { + "epoch": 0.14060063245143675, + "grad_norm": 7.586179733276367, + "learning_rate": 1.906712320729571e-05, + "loss": 2.0927, + "step": 22370 + }, + { + "epoch": 0.14066348476813387, + "grad_norm": 6.744780540466309, + "learning_rate": 1.9066704106351057e-05, + "loss": 2.3744, + "step": 22380 + }, + { + "epoch": 0.14072633708483098, + "grad_norm": 6.437538146972656, + "learning_rate": 1.9066285005406404e-05, + "loss": 2.295, + "step": 22390 + }, + { + "epoch": 0.1407891894015281, + "grad_norm": 7.903862953186035, + 
"learning_rate": 1.906586590446175e-05, + "loss": 2.1277, + "step": 22400 + }, + { + "epoch": 0.1408520417182252, + "grad_norm": 7.29001522064209, + "learning_rate": 1.9065446803517095e-05, + "loss": 2.2617, + "step": 22410 + }, + { + "epoch": 0.14091489403492233, + "grad_norm": 7.974186897277832, + "learning_rate": 1.9065027702572442e-05, + "loss": 2.3483, + "step": 22420 + }, + { + "epoch": 0.14097774635161944, + "grad_norm": 7.985724449157715, + "learning_rate": 1.906460860162779e-05, + "loss": 2.0671, + "step": 22430 + }, + { + "epoch": 0.14104059866831653, + "grad_norm": 7.703798294067383, + "learning_rate": 1.9064189500683136e-05, + "loss": 2.0304, + "step": 22440 + }, + { + "epoch": 0.14110345098501365, + "grad_norm": 7.514264106750488, + "learning_rate": 1.906377039973848e-05, + "loss": 2.1393, + "step": 22450 + }, + { + "epoch": 0.14116630330171076, + "grad_norm": 8.436440467834473, + "learning_rate": 1.9063351298793827e-05, + "loss": 1.9612, + "step": 22460 + }, + { + "epoch": 0.14122915561840788, + "grad_norm": 7.111056327819824, + "learning_rate": 1.9062932197849174e-05, + "loss": 2.0796, + "step": 22470 + }, + { + "epoch": 0.141292007935105, + "grad_norm": 8.205460548400879, + "learning_rate": 1.906251309690452e-05, + "loss": 2.1031, + "step": 22480 + }, + { + "epoch": 0.1413548602518021, + "grad_norm": 7.536001682281494, + "learning_rate": 1.9062093995959868e-05, + "loss": 2.1979, + "step": 22490 + }, + { + "epoch": 0.1414177125684992, + "grad_norm": 8.101511001586914, + "learning_rate": 1.9061674895015215e-05, + "loss": 1.928, + "step": 22500 + }, + { + "epoch": 0.1414805648851963, + "grad_norm": 7.2906599044799805, + "learning_rate": 1.9061255794070562e-05, + "loss": 2.2956, + "step": 22510 + }, + { + "epoch": 0.14154341720189342, + "grad_norm": 7.893942832946777, + "learning_rate": 1.906083669312591e-05, + "loss": 2.133, + "step": 22520 + }, + { + "epoch": 0.14160626951859054, + "grad_norm": 11.342302322387695, + "learning_rate": 
1.9060417592181257e-05, + "loss": 2.1614, + "step": 22530 + }, + { + "epoch": 0.14166912183528765, + "grad_norm": 8.061858177185059, + "learning_rate": 1.90599984912366e-05, + "loss": 2.1909, + "step": 22540 + }, + { + "epoch": 0.14173197415198477, + "grad_norm": 7.32312536239624, + "learning_rate": 1.9059579390291947e-05, + "loss": 2.1426, + "step": 22550 + }, + { + "epoch": 0.14179482646868186, + "grad_norm": 8.220121383666992, + "learning_rate": 1.9059160289347294e-05, + "loss": 2.295, + "step": 22560 + }, + { + "epoch": 0.14185767878537897, + "grad_norm": 7.3067402839660645, + "learning_rate": 1.905874118840264e-05, + "loss": 2.003, + "step": 22570 + }, + { + "epoch": 0.1419205311020761, + "grad_norm": 7.1083879470825195, + "learning_rate": 1.905832208745799e-05, + "loss": 1.9329, + "step": 22580 + }, + { + "epoch": 0.1419833834187732, + "grad_norm": 7.069380283355713, + "learning_rate": 1.9057902986513332e-05, + "loss": 2.2562, + "step": 22590 + }, + { + "epoch": 0.14204623573547032, + "grad_norm": 7.174941539764404, + "learning_rate": 1.905748388556868e-05, + "loss": 2.1896, + "step": 22600 + }, + { + "epoch": 0.14210908805216743, + "grad_norm": 7.403316497802734, + "learning_rate": 1.9057064784624026e-05, + "loss": 2.0871, + "step": 22610 + }, + { + "epoch": 0.14217194036886455, + "grad_norm": 7.457070827484131, + "learning_rate": 1.9056645683679373e-05, + "loss": 1.8901, + "step": 22620 + }, + { + "epoch": 0.14223479268556163, + "grad_norm": 7.98372745513916, + "learning_rate": 1.9056226582734717e-05, + "loss": 2.2116, + "step": 22630 + }, + { + "epoch": 0.14229764500225875, + "grad_norm": 7.929397106170654, + "learning_rate": 1.9055807481790064e-05, + "loss": 2.1043, + "step": 22640 + }, + { + "epoch": 0.14236049731895586, + "grad_norm": 8.145434379577637, + "learning_rate": 1.905538838084541e-05, + "loss": 1.8081, + "step": 22650 + }, + { + "epoch": 0.14242334963565298, + "grad_norm": 7.806945323944092, + "learning_rate": 1.9054969279900758e-05, + "loss": 
2.3025, + "step": 22660 + }, + { + "epoch": 0.1424862019523501, + "grad_norm": 10.782485008239746, + "learning_rate": 1.9054550178956105e-05, + "loss": 2.2739, + "step": 22670 + }, + { + "epoch": 0.1425490542690472, + "grad_norm": 8.331992149353027, + "learning_rate": 1.9054131078011452e-05, + "loss": 2.1624, + "step": 22680 + }, + { + "epoch": 0.1426119065857443, + "grad_norm": 7.473433494567871, + "learning_rate": 1.9053711977066796e-05, + "loss": 1.9709, + "step": 22690 + }, + { + "epoch": 0.1426747589024414, + "grad_norm": 7.280951499938965, + "learning_rate": 1.9053292876122143e-05, + "loss": 2.1562, + "step": 22700 + }, + { + "epoch": 0.14273761121913853, + "grad_norm": 8.735506057739258, + "learning_rate": 1.905287377517749e-05, + "loss": 2.4145, + "step": 22710 + }, + { + "epoch": 0.14280046353583564, + "grad_norm": 8.281670570373535, + "learning_rate": 1.9052454674232837e-05, + "loss": 1.9165, + "step": 22720 + }, + { + "epoch": 0.14286331585253276, + "grad_norm": 8.478446006774902, + "learning_rate": 1.9052035573288184e-05, + "loss": 2.1552, + "step": 22730 + }, + { + "epoch": 0.14292616816922987, + "grad_norm": 7.284202575683594, + "learning_rate": 1.905161647234353e-05, + "loss": 2.4096, + "step": 22740 + }, + { + "epoch": 0.14298902048592696, + "grad_norm": 6.107362747192383, + "learning_rate": 1.905119737139888e-05, + "loss": 2.1548, + "step": 22750 + }, + { + "epoch": 0.14305187280262407, + "grad_norm": 8.200136184692383, + "learning_rate": 1.9050778270454222e-05, + "loss": 2.0889, + "step": 22760 + }, + { + "epoch": 0.1431147251193212, + "grad_norm": 7.132558345794678, + "learning_rate": 1.905035916950957e-05, + "loss": 2.0952, + "step": 22770 + }, + { + "epoch": 0.1431775774360183, + "grad_norm": 7.06957483291626, + "learning_rate": 1.9049940068564916e-05, + "loss": 2.2101, + "step": 22780 + }, + { + "epoch": 0.14324042975271542, + "grad_norm": 10.873089790344238, + "learning_rate": 1.9049520967620263e-05, + "loss": 2.2968, + "step": 22790 + }, + { 
+ "epoch": 0.14330328206941254, + "grad_norm": 7.581949710845947, + "learning_rate": 1.904910186667561e-05, + "loss": 2.0683, + "step": 22800 + }, + { + "epoch": 0.14336613438610965, + "grad_norm": 7.665589809417725, + "learning_rate": 1.9048682765730954e-05, + "loss": 2.1991, + "step": 22810 + }, + { + "epoch": 0.14342898670280674, + "grad_norm": 8.55495834350586, + "learning_rate": 1.90482636647863e-05, + "loss": 2.1726, + "step": 22820 + }, + { + "epoch": 0.14349183901950385, + "grad_norm": 6.944456100463867, + "learning_rate": 1.904784456384165e-05, + "loss": 1.994, + "step": 22830 + }, + { + "epoch": 0.14355469133620097, + "grad_norm": 6.976284027099609, + "learning_rate": 1.9047425462896995e-05, + "loss": 2.3131, + "step": 22840 + }, + { + "epoch": 0.14361754365289808, + "grad_norm": 7.741434097290039, + "learning_rate": 1.904700636195234e-05, + "loss": 1.9898, + "step": 22850 + }, + { + "epoch": 0.1436803959695952, + "grad_norm": 8.721227645874023, + "learning_rate": 1.9046587261007686e-05, + "loss": 2.1514, + "step": 22860 + }, + { + "epoch": 0.1437432482862923, + "grad_norm": 7.248230934143066, + "learning_rate": 1.90462100701575e-05, + "loss": 2.0825, + "step": 22870 + }, + { + "epoch": 0.1438061006029894, + "grad_norm": 6.804014682769775, + "learning_rate": 1.9045790969212845e-05, + "loss": 2.1672, + "step": 22880 + }, + { + "epoch": 0.14386895291968652, + "grad_norm": 8.10350513458252, + "learning_rate": 1.904537186826819e-05, + "loss": 2.2151, + "step": 22890 + }, + { + "epoch": 0.14393180523638363, + "grad_norm": 7.515980243682861, + "learning_rate": 1.904495276732354e-05, + "loss": 2.1118, + "step": 22900 + }, + { + "epoch": 0.14399465755308075, + "grad_norm": 6.625450134277344, + "learning_rate": 1.9044533666378886e-05, + "loss": 2.0872, + "step": 22910 + }, + { + "epoch": 0.14405750986977786, + "grad_norm": 7.152846336364746, + "learning_rate": 1.9044114565434233e-05, + "loss": 2.2226, + "step": 22920 + }, + { + "epoch": 0.14412036218647498, + 
"grad_norm": 7.193029403686523, + "learning_rate": 1.9043695464489577e-05, + "loss": 2.1758, + "step": 22930 + }, + { + "epoch": 0.1441832145031721, + "grad_norm": 8.163533210754395, + "learning_rate": 1.9043276363544924e-05, + "loss": 2.1661, + "step": 22940 + }, + { + "epoch": 0.14424606681986918, + "grad_norm": 7.087532997131348, + "learning_rate": 1.904285726260027e-05, + "loss": 1.9239, + "step": 22950 + }, + { + "epoch": 0.1443089191365663, + "grad_norm": 6.445407390594482, + "learning_rate": 1.9042438161655618e-05, + "loss": 2.0338, + "step": 22960 + }, + { + "epoch": 0.1443717714532634, + "grad_norm": 6.491453647613525, + "learning_rate": 1.9042019060710965e-05, + "loss": 2.0514, + "step": 22970 + }, + { + "epoch": 0.14443462376996052, + "grad_norm": 6.91152811050415, + "learning_rate": 1.9041599959766312e-05, + "loss": 2.1338, + "step": 22980 + }, + { + "epoch": 0.14449747608665764, + "grad_norm": 8.279658317565918, + "learning_rate": 1.904118085882166e-05, + "loss": 2.2453, + "step": 22990 + }, + { + "epoch": 0.14456032840335475, + "grad_norm": 7.245723247528076, + "learning_rate": 1.9040761757877003e-05, + "loss": 2.1084, + "step": 23000 + }, + { + "epoch": 0.14462318072005184, + "grad_norm": 6.900940895080566, + "learning_rate": 1.904034265693235e-05, + "loss": 2.2148, + "step": 23010 + }, + { + "epoch": 0.14468603303674896, + "grad_norm": 7.883230686187744, + "learning_rate": 1.9039923555987697e-05, + "loss": 2.032, + "step": 23020 + }, + { + "epoch": 0.14474888535344607, + "grad_norm": 8.119175910949707, + "learning_rate": 1.9039504455043044e-05, + "loss": 2.1719, + "step": 23030 + }, + { + "epoch": 0.1448117376701432, + "grad_norm": 8.134607315063477, + "learning_rate": 1.903908535409839e-05, + "loss": 2.0942, + "step": 23040 + }, + { + "epoch": 0.1448745899868403, + "grad_norm": 6.855745792388916, + "learning_rate": 1.9038666253153738e-05, + "loss": 2.1209, + "step": 23050 + }, + { + "epoch": 0.14493744230353742, + "grad_norm": 7.171293258666992, + 
"learning_rate": 1.903824715220908e-05, + "loss": 2.2055, + "step": 23060 + }, + { + "epoch": 0.1450002946202345, + "grad_norm": 7.572606563568115, + "learning_rate": 1.903782805126443e-05, + "loss": 2.1366, + "step": 23070 + }, + { + "epoch": 0.14506314693693162, + "grad_norm": 7.0616374015808105, + "learning_rate": 1.9037408950319776e-05, + "loss": 2.0039, + "step": 23080 + }, + { + "epoch": 0.14512599925362873, + "grad_norm": 7.565642833709717, + "learning_rate": 1.9036989849375123e-05, + "loss": 2.2619, + "step": 23090 + }, + { + "epoch": 0.14518885157032585, + "grad_norm": 7.184761047363281, + "learning_rate": 1.903657074843047e-05, + "loss": 2.0735, + "step": 23100 + }, + { + "epoch": 0.14525170388702296, + "grad_norm": 8.689106941223145, + "learning_rate": 1.9036151647485814e-05, + "loss": 2.0887, + "step": 23110 + }, + { + "epoch": 0.14531455620372008, + "grad_norm": 7.586963176727295, + "learning_rate": 1.903573254654116e-05, + "loss": 2.0515, + "step": 23120 + }, + { + "epoch": 0.1453774085204172, + "grad_norm": 9.940788269042969, + "learning_rate": 1.9035313445596508e-05, + "loss": 1.9153, + "step": 23130 + }, + { + "epoch": 0.14544026083711428, + "grad_norm": 7.923069000244141, + "learning_rate": 1.9034894344651855e-05, + "loss": 2.1228, + "step": 23140 + }, + { + "epoch": 0.1455031131538114, + "grad_norm": 15.715959548950195, + "learning_rate": 1.90344752437072e-05, + "loss": 2.0652, + "step": 23150 + }, + { + "epoch": 0.1455659654705085, + "grad_norm": 6.555418491363525, + "learning_rate": 1.9034056142762546e-05, + "loss": 2.32, + "step": 23160 + }, + { + "epoch": 0.14562881778720563, + "grad_norm": 7.920915126800537, + "learning_rate": 1.9033637041817893e-05, + "loss": 2.1331, + "step": 23170 + }, + { + "epoch": 0.14569167010390274, + "grad_norm": 5.9372687339782715, + "learning_rate": 1.903321794087324e-05, + "loss": 1.8449, + "step": 23180 + }, + { + "epoch": 0.14575452242059986, + "grad_norm": 7.384964466094971, + "learning_rate": 
1.9032798839928587e-05, + "loss": 2.1567, + "step": 23190 + }, + { + "epoch": 0.14581737473729695, + "grad_norm": 6.332118034362793, + "learning_rate": 1.9032379738983934e-05, + "loss": 2.0163, + "step": 23200 + }, + { + "epoch": 0.14588022705399406, + "grad_norm": 8.333724021911621, + "learning_rate": 1.903196063803928e-05, + "loss": 2.067, + "step": 23210 + }, + { + "epoch": 0.14594307937069118, + "grad_norm": 7.829019069671631, + "learning_rate": 1.9031541537094628e-05, + "loss": 2.1369, + "step": 23220 + }, + { + "epoch": 0.1460059316873883, + "grad_norm": 7.6123366355896, + "learning_rate": 1.9031122436149975e-05, + "loss": 2.229, + "step": 23230 + }, + { + "epoch": 0.1460687840040854, + "grad_norm": 7.171878337860107, + "learning_rate": 1.903070333520532e-05, + "loss": 1.9208, + "step": 23240 + }, + { + "epoch": 0.14613163632078252, + "grad_norm": 6.105739116668701, + "learning_rate": 1.9030284234260666e-05, + "loss": 2.0472, + "step": 23250 + }, + { + "epoch": 0.14619448863747964, + "grad_norm": 6.974096775054932, + "learning_rate": 1.9029865133316013e-05, + "loss": 2.0482, + "step": 23260 + }, + { + "epoch": 0.14625734095417672, + "grad_norm": 7.583916664123535, + "learning_rate": 1.902944603237136e-05, + "loss": 2.2159, + "step": 23270 + }, + { + "epoch": 0.14632019327087384, + "grad_norm": 7.2792205810546875, + "learning_rate": 1.9029026931426704e-05, + "loss": 2.2024, + "step": 23280 + }, + { + "epoch": 0.14638304558757095, + "grad_norm": 6.52831506729126, + "learning_rate": 1.902860783048205e-05, + "loss": 2.045, + "step": 23290 + }, + { + "epoch": 0.14644589790426807, + "grad_norm": 8.369816780090332, + "learning_rate": 1.9028188729537398e-05, + "loss": 2.0575, + "step": 23300 + }, + { + "epoch": 0.14650875022096518, + "grad_norm": 7.134170055389404, + "learning_rate": 1.9027769628592745e-05, + "loss": 2.0503, + "step": 23310 + }, + { + "epoch": 0.1465716025376623, + "grad_norm": 6.758161544799805, + "learning_rate": 1.9027350527648092e-05, + "loss": 
2.1567, + "step": 23320 + }, + { + "epoch": 0.1466344548543594, + "grad_norm": 10.140778541564941, + "learning_rate": 1.9026931426703436e-05, + "loss": 2.1435, + "step": 23330 + }, + { + "epoch": 0.1466973071710565, + "grad_norm": 8.284818649291992, + "learning_rate": 1.9026512325758783e-05, + "loss": 1.9622, + "step": 23340 + }, + { + "epoch": 0.14676015948775362, + "grad_norm": 8.317974090576172, + "learning_rate": 1.902609322481413e-05, + "loss": 2.3861, + "step": 23350 + }, + { + "epoch": 0.14682301180445073, + "grad_norm": 7.516535758972168, + "learning_rate": 1.9025674123869477e-05, + "loss": 2.2544, + "step": 23360 + }, + { + "epoch": 0.14688586412114785, + "grad_norm": 6.966447353363037, + "learning_rate": 1.9025255022924824e-05, + "loss": 2.0672, + "step": 23370 + }, + { + "epoch": 0.14694871643784496, + "grad_norm": 7.023961544036865, + "learning_rate": 1.9024835921980168e-05, + "loss": 2.0731, + "step": 23380 + }, + { + "epoch": 0.14701156875454205, + "grad_norm": 8.957491874694824, + "learning_rate": 1.9024416821035515e-05, + "loss": 2.3124, + "step": 23390 + }, + { + "epoch": 0.14707442107123916, + "grad_norm": 7.384146690368652, + "learning_rate": 1.9023997720090862e-05, + "loss": 1.913, + "step": 23400 + }, + { + "epoch": 0.14713727338793628, + "grad_norm": 8.35905933380127, + "learning_rate": 1.902357861914621e-05, + "loss": 2.0068, + "step": 23410 + }, + { + "epoch": 0.1472001257046334, + "grad_norm": 7.860772132873535, + "learning_rate": 1.9023159518201556e-05, + "loss": 2.1883, + "step": 23420 + }, + { + "epoch": 0.1472629780213305, + "grad_norm": 7.34670352935791, + "learning_rate": 1.9022740417256903e-05, + "loss": 1.8775, + "step": 23430 + }, + { + "epoch": 0.14732583033802762, + "grad_norm": 7.937277793884277, + "learning_rate": 1.902232131631225e-05, + "loss": 2.1031, + "step": 23440 + }, + { + "epoch": 0.14738868265472474, + "grad_norm": 8.50955581665039, + "learning_rate": 1.9021902215367597e-05, + "loss": 2.2568, + "step": 23450 + }, + { 
+ "epoch": 0.14745153497142183, + "grad_norm": 7.741550922393799, + "learning_rate": 1.902148311442294e-05, + "loss": 2.2492, + "step": 23460 + }, + { + "epoch": 0.14751438728811894, + "grad_norm": 8.376154899597168, + "learning_rate": 1.9021064013478288e-05, + "loss": 2.088, + "step": 23470 + }, + { + "epoch": 0.14757723960481606, + "grad_norm": 7.21097993850708, + "learning_rate": 1.9020644912533635e-05, + "loss": 2.2054, + "step": 23480 + }, + { + "epoch": 0.14764009192151317, + "grad_norm": 9.768784523010254, + "learning_rate": 1.9020225811588982e-05, + "loss": 2.2019, + "step": 23490 + }, + { + "epoch": 0.1477029442382103, + "grad_norm": 6.948235034942627, + "learning_rate": 1.9019806710644326e-05, + "loss": 1.9864, + "step": 23500 + }, + { + "epoch": 0.1477657965549074, + "grad_norm": 9.397354125976562, + "learning_rate": 1.9019387609699673e-05, + "loss": 1.9141, + "step": 23510 + }, + { + "epoch": 0.1478286488716045, + "grad_norm": 6.551173686981201, + "learning_rate": 1.901896850875502e-05, + "loss": 1.9257, + "step": 23520 + }, + { + "epoch": 0.1478915011883016, + "grad_norm": 9.810015678405762, + "learning_rate": 1.9018549407810367e-05, + "loss": 2.2414, + "step": 23530 + }, + { + "epoch": 0.14795435350499872, + "grad_norm": 6.3327717781066895, + "learning_rate": 1.9018130306865714e-05, + "loss": 1.9833, + "step": 23540 + }, + { + "epoch": 0.14801720582169584, + "grad_norm": 7.804057598114014, + "learning_rate": 1.9017711205921058e-05, + "loss": 1.9943, + "step": 23550 + }, + { + "epoch": 0.14808005813839295, + "grad_norm": 8.205534934997559, + "learning_rate": 1.9017292104976405e-05, + "loss": 2.1566, + "step": 23560 + }, + { + "epoch": 0.14814291045509007, + "grad_norm": 7.657957553863525, + "learning_rate": 1.9016873004031752e-05, + "loss": 1.9966, + "step": 23570 + }, + { + "epoch": 0.14820576277178715, + "grad_norm": 7.589319705963135, + "learning_rate": 1.90164539030871e-05, + "loss": 2.0899, + "step": 23580 + }, + { + "epoch": 0.14826861508848427, 
+ "grad_norm": 8.065515518188477, + "learning_rate": 1.9016034802142446e-05, + "loss": 2.1794, + "step": 23590 + }, + { + "epoch": 0.14833146740518138, + "grad_norm": 7.099700927734375, + "learning_rate": 1.9015615701197793e-05, + "loss": 1.9405, + "step": 23600 + }, + { + "epoch": 0.1483943197218785, + "grad_norm": 6.308840274810791, + "learning_rate": 1.901519660025314e-05, + "loss": 1.9217, + "step": 23610 + }, + { + "epoch": 0.1484571720385756, + "grad_norm": 7.382903575897217, + "learning_rate": 1.9014777499308487e-05, + "loss": 2.0211, + "step": 23620 + }, + { + "epoch": 0.14852002435527273, + "grad_norm": 8.541641235351562, + "learning_rate": 1.901435839836383e-05, + "loss": 2.2192, + "step": 23630 + }, + { + "epoch": 0.14858287667196984, + "grad_norm": 7.6181793212890625, + "learning_rate": 1.9013939297419178e-05, + "loss": 1.9691, + "step": 23640 + }, + { + "epoch": 0.14864572898866693, + "grad_norm": 8.119380950927734, + "learning_rate": 1.9013520196474525e-05, + "loss": 1.9171, + "step": 23650 + }, + { + "epoch": 0.14870858130536405, + "grad_norm": 7.697994709014893, + "learning_rate": 1.9013101095529872e-05, + "loss": 2.0111, + "step": 23660 + }, + { + "epoch": 0.14877143362206116, + "grad_norm": 8.19538402557373, + "learning_rate": 1.901268199458522e-05, + "loss": 2.153, + "step": 23670 + }, + { + "epoch": 0.14883428593875828, + "grad_norm": 7.4682111740112305, + "learning_rate": 1.9012262893640563e-05, + "loss": 1.9928, + "step": 23680 + }, + { + "epoch": 0.1488971382554554, + "grad_norm": 8.932218551635742, + "learning_rate": 1.901184379269591e-05, + "loss": 2.2439, + "step": 23690 + }, + { + "epoch": 0.1489599905721525, + "grad_norm": 7.902842044830322, + "learning_rate": 1.9011424691751257e-05, + "loss": 2.2822, + "step": 23700 + }, + { + "epoch": 0.1490228428888496, + "grad_norm": 7.800632476806641, + "learning_rate": 1.9011005590806604e-05, + "loss": 2.2285, + "step": 23710 + }, + { + "epoch": 0.1490856952055467, + "grad_norm": 7.971667766571045, 
+ "learning_rate": 1.901058648986195e-05, + "loss": 1.9143, + "step": 23720 + }, + { + "epoch": 0.14914854752224382, + "grad_norm": 8.899446487426758, + "learning_rate": 1.9010167388917295e-05, + "loss": 1.9402, + "step": 23730 + }, + { + "epoch": 0.14921139983894094, + "grad_norm": 7.046481609344482, + "learning_rate": 1.9009748287972642e-05, + "loss": 1.8834, + "step": 23740 + }, + { + "epoch": 0.14927425215563805, + "grad_norm": 7.612890720367432, + "learning_rate": 1.900932918702799e-05, + "loss": 2.0692, + "step": 23750 + }, + { + "epoch": 0.14933710447233517, + "grad_norm": 8.558869361877441, + "learning_rate": 1.9008910086083336e-05, + "loss": 2.1888, + "step": 23760 + }, + { + "epoch": 0.14939995678903228, + "grad_norm": 7.5883331298828125, + "learning_rate": 1.900849098513868e-05, + "loss": 1.8002, + "step": 23770 + }, + { + "epoch": 0.14946280910572937, + "grad_norm": 7.729646682739258, + "learning_rate": 1.9008071884194027e-05, + "loss": 2.121, + "step": 23780 + }, + { + "epoch": 0.1495256614224265, + "grad_norm": 7.262862205505371, + "learning_rate": 1.9007652783249374e-05, + "loss": 1.982, + "step": 23790 + }, + { + "epoch": 0.1495885137391236, + "grad_norm": 7.4409356117248535, + "learning_rate": 1.900723368230472e-05, + "loss": 2.0201, + "step": 23800 + }, + { + "epoch": 0.14965136605582072, + "grad_norm": 6.645078182220459, + "learning_rate": 1.9006814581360068e-05, + "loss": 2.1479, + "step": 23810 + }, + { + "epoch": 0.14971421837251783, + "grad_norm": 6.964990615844727, + "learning_rate": 1.9006395480415415e-05, + "loss": 2.1139, + "step": 23820 + }, + { + "epoch": 0.14977707068921495, + "grad_norm": 6.36972713470459, + "learning_rate": 1.9005976379470762e-05, + "loss": 1.907, + "step": 23830 + }, + { + "epoch": 0.14983992300591203, + "grad_norm": 7.909121513366699, + "learning_rate": 1.900555727852611e-05, + "loss": 1.9887, + "step": 23840 + }, + { + "epoch": 0.14990277532260915, + "grad_norm": 7.761707782745361, + "learning_rate": 
1.9005138177581456e-05, + "loss": 1.901, + "step": 23850 + }, + { + "epoch": 0.14996562763930626, + "grad_norm": 7.285144329071045, + "learning_rate": 1.90047190766368e-05, + "loss": 2.0067, + "step": 23860 + }, + { + "epoch": 0.15002847995600338, + "grad_norm": 7.631621360778809, + "learning_rate": 1.9004299975692147e-05, + "loss": 2.2314, + "step": 23870 + }, + { + "epoch": 0.1500913322727005, + "grad_norm": 8.568719863891602, + "learning_rate": 1.9003880874747494e-05, + "loss": 1.9857, + "step": 23880 + }, + { + "epoch": 0.1501541845893976, + "grad_norm": 7.1593546867370605, + "learning_rate": 1.900346177380284e-05, + "loss": 2.0381, + "step": 23890 + }, + { + "epoch": 0.1502170369060947, + "grad_norm": 7.844557285308838, + "learning_rate": 1.9003042672858185e-05, + "loss": 2.0089, + "step": 23900 + }, + { + "epoch": 0.1502798892227918, + "grad_norm": 6.168196201324463, + "learning_rate": 1.9002623571913532e-05, + "loss": 1.8796, + "step": 23910 + }, + { + "epoch": 0.15034274153948893, + "grad_norm": 7.286600112915039, + "learning_rate": 1.900220447096888e-05, + "loss": 2.1071, + "step": 23920 + }, + { + "epoch": 0.15040559385618604, + "grad_norm": 6.133782863616943, + "learning_rate": 1.9001785370024226e-05, + "loss": 2.1286, + "step": 23930 + }, + { + "epoch": 0.15046844617288316, + "grad_norm": 7.266432762145996, + "learning_rate": 1.9001366269079573e-05, + "loss": 2.0999, + "step": 23940 + }, + { + "epoch": 0.15053129848958027, + "grad_norm": 8.122286796569824, + "learning_rate": 1.9000947168134917e-05, + "loss": 2.0707, + "step": 23950 + }, + { + "epoch": 0.1505941508062774, + "grad_norm": 8.332359313964844, + "learning_rate": 1.9000528067190264e-05, + "loss": 2.2856, + "step": 23960 + }, + { + "epoch": 0.15065700312297448, + "grad_norm": 7.63896369934082, + "learning_rate": 1.900010896624561e-05, + "loss": 2.1809, + "step": 23970 + }, + { + "epoch": 0.1507198554396716, + "grad_norm": 7.593400001525879, + "learning_rate": 1.8999689865300958e-05, + "loss": 
1.9636, + "step": 23980 + }, + { + "epoch": 0.1507827077563687, + "grad_norm": 7.072235584259033, + "learning_rate": 1.8999270764356305e-05, + "loss": 2.3203, + "step": 23990 + }, + { + "epoch": 0.15084556007306582, + "grad_norm": 7.944361209869385, + "learning_rate": 1.8998851663411652e-05, + "loss": 2.1457, + "step": 24000 + }, + { + "epoch": 0.15090841238976294, + "grad_norm": 7.352777481079102, + "learning_rate": 1.8998432562466996e-05, + "loss": 1.9976, + "step": 24010 + }, + { + "epoch": 0.15097126470646005, + "grad_norm": 9.210273742675781, + "learning_rate": 1.8998013461522343e-05, + "loss": 1.9532, + "step": 24020 + }, + { + "epoch": 0.15103411702315714, + "grad_norm": 7.580131530761719, + "learning_rate": 1.899759436057769e-05, + "loss": 2.2155, + "step": 24030 + }, + { + "epoch": 0.15109696933985425, + "grad_norm": 7.389312744140625, + "learning_rate": 1.8997175259633037e-05, + "loss": 2.3535, + "step": 24040 + }, + { + "epoch": 0.15115982165655137, + "grad_norm": 7.961337089538574, + "learning_rate": 1.8996756158688384e-05, + "loss": 2.0597, + "step": 24050 + }, + { + "epoch": 0.15122267397324848, + "grad_norm": 6.754824161529541, + "learning_rate": 1.899633705774373e-05, + "loss": 2.0535, + "step": 24060 + }, + { + "epoch": 0.1512855262899456, + "grad_norm": 6.829716682434082, + "learning_rate": 1.8995917956799078e-05, + "loss": 1.9852, + "step": 24070 + }, + { + "epoch": 0.15134837860664271, + "grad_norm": 8.027902603149414, + "learning_rate": 1.8995498855854422e-05, + "loss": 1.9553, + "step": 24080 + }, + { + "epoch": 0.1514112309233398, + "grad_norm": 7.129761219024658, + "learning_rate": 1.899507975490977e-05, + "loss": 1.9923, + "step": 24090 + }, + { + "epoch": 0.15147408324003692, + "grad_norm": 7.457757472991943, + "learning_rate": 1.8994660653965116e-05, + "loss": 2.1197, + "step": 24100 + }, + { + "epoch": 0.15153693555673403, + "grad_norm": 7.700310230255127, + "learning_rate": 1.8994241553020463e-05, + "loss": 1.9089, + "step": 24110 + }, 
+ { + "epoch": 0.15159978787343115, + "grad_norm": 7.856703281402588, + "learning_rate": 1.8993822452075807e-05, + "loss": 2.2076, + "step": 24120 + }, + { + "epoch": 0.15166264019012826, + "grad_norm": 7.630815029144287, + "learning_rate": 1.8993403351131154e-05, + "loss": 2.0582, + "step": 24130 + }, + { + "epoch": 0.15172549250682538, + "grad_norm": 7.412759780883789, + "learning_rate": 1.89929842501865e-05, + "loss": 2.1895, + "step": 24140 + }, + { + "epoch": 0.1517883448235225, + "grad_norm": 7.12389612197876, + "learning_rate": 1.8992565149241848e-05, + "loss": 2.2506, + "step": 24150 + }, + { + "epoch": 0.15185119714021958, + "grad_norm": 7.502507209777832, + "learning_rate": 1.8992146048297195e-05, + "loss": 2.0753, + "step": 24160 + }, + { + "epoch": 0.1519140494569167, + "grad_norm": 7.414200305938721, + "learning_rate": 1.899172694735254e-05, + "loss": 1.9224, + "step": 24170 + }, + { + "epoch": 0.1519769017736138, + "grad_norm": 7.306906223297119, + "learning_rate": 1.8991307846407886e-05, + "loss": 2.0083, + "step": 24180 + }, + { + "epoch": 0.15203975409031092, + "grad_norm": 7.0531134605407715, + "learning_rate": 1.8990888745463233e-05, + "loss": 1.91, + "step": 24190 + }, + { + "epoch": 0.15210260640700804, + "grad_norm": 7.230875492095947, + "learning_rate": 1.899046964451858e-05, + "loss": 2.0589, + "step": 24200 + }, + { + "epoch": 0.15216545872370515, + "grad_norm": 7.221288681030273, + "learning_rate": 1.8990050543573927e-05, + "loss": 2.1291, + "step": 24210 + }, + { + "epoch": 0.15222831104040224, + "grad_norm": 8.530590057373047, + "learning_rate": 1.8989631442629274e-05, + "loss": 2.1829, + "step": 24220 + }, + { + "epoch": 0.15229116335709936, + "grad_norm": 7.478540897369385, + "learning_rate": 1.898921234168462e-05, + "loss": 1.9379, + "step": 24230 + }, + { + "epoch": 0.15235401567379647, + "grad_norm": 7.320249080657959, + "learning_rate": 1.8988793240739968e-05, + "loss": 2.2113, + "step": 24240 + }, + { + "epoch": 
0.1524168679904936, + "grad_norm": 7.91867733001709, + "learning_rate": 1.8988374139795315e-05, + "loss": 2.217, + "step": 24250 + }, + { + "epoch": 0.1524797203071907, + "grad_norm": 7.058121204376221, + "learning_rate": 1.898795503885066e-05, + "loss": 2.0749, + "step": 24260 + }, + { + "epoch": 0.15254257262388782, + "grad_norm": 6.411896228790283, + "learning_rate": 1.8987535937906006e-05, + "loss": 2.1421, + "step": 24270 + }, + { + "epoch": 0.15260542494058493, + "grad_norm": 7.097553253173828, + "learning_rate": 1.8987116836961353e-05, + "loss": 2.0138, + "step": 24280 + }, + { + "epoch": 0.15266827725728202, + "grad_norm": 10.559577941894531, + "learning_rate": 1.89866977360167e-05, + "loss": 2.0342, + "step": 24290 + }, + { + "epoch": 0.15273112957397914, + "grad_norm": 7.3536458015441895, + "learning_rate": 1.8986278635072044e-05, + "loss": 2.0605, + "step": 24300 + }, + { + "epoch": 0.15279398189067625, + "grad_norm": 6.632030963897705, + "learning_rate": 1.898585953412739e-05, + "loss": 2.2258, + "step": 24310 + }, + { + "epoch": 0.15285683420737337, + "grad_norm": 7.509122848510742, + "learning_rate": 1.8985440433182738e-05, + "loss": 2.0435, + "step": 24320 + }, + { + "epoch": 0.15291968652407048, + "grad_norm": 8.334511756896973, + "learning_rate": 1.8985021332238085e-05, + "loss": 2.0767, + "step": 24330 + }, + { + "epoch": 0.1529825388407676, + "grad_norm": 7.24440860748291, + "learning_rate": 1.8984602231293432e-05, + "loss": 2.1765, + "step": 24340 + }, + { + "epoch": 0.15304539115746468, + "grad_norm": 7.586240768432617, + "learning_rate": 1.8984183130348776e-05, + "loss": 2.113, + "step": 24350 + }, + { + "epoch": 0.1531082434741618, + "grad_norm": 7.222861289978027, + "learning_rate": 1.8983764029404123e-05, + "loss": 1.922, + "step": 24360 + }, + { + "epoch": 0.1531710957908589, + "grad_norm": 6.647038459777832, + "learning_rate": 1.898334492845947e-05, + "loss": 2.0281, + "step": 24370 + }, + { + "epoch": 0.15323394810755603, + "grad_norm": 
7.2084527015686035, + "learning_rate": 1.8982925827514817e-05, + "loss": 2.0836, + "step": 24380 + }, + { + "epoch": 0.15329680042425314, + "grad_norm": 8.349883079528809, + "learning_rate": 1.898250672657016e-05, + "loss": 2.1746, + "step": 24390 + }, + { + "epoch": 0.15335965274095026, + "grad_norm": 7.082326889038086, + "learning_rate": 1.8982087625625508e-05, + "loss": 2.065, + "step": 24400 + }, + { + "epoch": 0.15342250505764735, + "grad_norm": 8.142436981201172, + "learning_rate": 1.8981668524680855e-05, + "loss": 2.0993, + "step": 24410 + }, + { + "epoch": 0.15348535737434446, + "grad_norm": 7.3239922523498535, + "learning_rate": 1.8981249423736202e-05, + "loss": 1.9832, + "step": 24420 + }, + { + "epoch": 0.15354820969104158, + "grad_norm": 8.010440826416016, + "learning_rate": 1.898083032279155e-05, + "loss": 2.214, + "step": 24430 + }, + { + "epoch": 0.1536110620077387, + "grad_norm": 6.904417514801025, + "learning_rate": 1.8980411221846896e-05, + "loss": 2.1383, + "step": 24440 + }, + { + "epoch": 0.1536739143244358, + "grad_norm": 6.834875583648682, + "learning_rate": 1.8979992120902243e-05, + "loss": 2.1952, + "step": 24450 + }, + { + "epoch": 0.15373676664113292, + "grad_norm": 6.857359409332275, + "learning_rate": 1.897957301995759e-05, + "loss": 2.0035, + "step": 24460 + }, + { + "epoch": 0.15379961895783004, + "grad_norm": 8.004855155944824, + "learning_rate": 1.8979153919012937e-05, + "loss": 2.2526, + "step": 24470 + }, + { + "epoch": 0.15386247127452712, + "grad_norm": 7.017568111419678, + "learning_rate": 1.897873481806828e-05, + "loss": 1.8192, + "step": 24480 + }, + { + "epoch": 0.15392532359122424, + "grad_norm": 7.01463508605957, + "learning_rate": 1.8978315717123628e-05, + "loss": 2.3115, + "step": 24490 + }, + { + "epoch": 0.15398817590792135, + "grad_norm": 7.071869373321533, + "learning_rate": 1.8977896616178975e-05, + "loss": 2.2191, + "step": 24500 + }, + { + "epoch": 0.15405102822461847, + "grad_norm": 7.331428050994873, + 
"learning_rate": 1.8977477515234322e-05, + "loss": 2.1485, + "step": 24510 + }, + { + "epoch": 0.15411388054131558, + "grad_norm": 7.987422466278076, + "learning_rate": 1.8977058414289666e-05, + "loss": 2.0528, + "step": 24520 + }, + { + "epoch": 0.1541767328580127, + "grad_norm": 7.370058536529541, + "learning_rate": 1.8976639313345013e-05, + "loss": 2.0887, + "step": 24530 + }, + { + "epoch": 0.1542395851747098, + "grad_norm": 7.986361026763916, + "learning_rate": 1.897622021240036e-05, + "loss": 2.2906, + "step": 24540 + }, + { + "epoch": 0.1543024374914069, + "grad_norm": 8.180559158325195, + "learning_rate": 1.8975801111455707e-05, + "loss": 2.2125, + "step": 24550 + }, + { + "epoch": 0.15436528980810402, + "grad_norm": 7.562689304351807, + "learning_rate": 1.8975382010511054e-05, + "loss": 2.2225, + "step": 24560 + }, + { + "epoch": 0.15442814212480113, + "grad_norm": 7.62192964553833, + "learning_rate": 1.8974962909566398e-05, + "loss": 2.1318, + "step": 24570 + }, + { + "epoch": 0.15449099444149825, + "grad_norm": 7.785002708435059, + "learning_rate": 1.8974543808621745e-05, + "loss": 2.1521, + "step": 24580 + }, + { + "epoch": 0.15455384675819536, + "grad_norm": 6.82577657699585, + "learning_rate": 1.8974124707677092e-05, + "loss": 2.0823, + "step": 24590 + }, + { + "epoch": 0.15461669907489248, + "grad_norm": 7.56740140914917, + "learning_rate": 1.897370560673244e-05, + "loss": 2.2076, + "step": 24600 + }, + { + "epoch": 0.15467955139158956, + "grad_norm": 6.195840358734131, + "learning_rate": 1.8973286505787786e-05, + "loss": 2.0197, + "step": 24610 + }, + { + "epoch": 0.15474240370828668, + "grad_norm": 7.166738033294678, + "learning_rate": 1.8972867404843133e-05, + "loss": 2.1333, + "step": 24620 + }, + { + "epoch": 0.1548052560249838, + "grad_norm": 8.683135032653809, + "learning_rate": 1.897244830389848e-05, + "loss": 2.0418, + "step": 24630 + }, + { + "epoch": 0.1548681083416809, + "grad_norm": 8.308981895446777, + "learning_rate": 
1.8972029202953824e-05, + "loss": 2.1947, + "step": 24640 + }, + { + "epoch": 0.15493096065837803, + "grad_norm": 7.782462120056152, + "learning_rate": 1.897161010200917e-05, + "loss": 2.0108, + "step": 24650 + }, + { + "epoch": 0.15499381297507514, + "grad_norm": 7.015844345092773, + "learning_rate": 1.8971191001064518e-05, + "loss": 2.0842, + "step": 24660 + }, + { + "epoch": 0.15505666529177223, + "grad_norm": 8.82336139678955, + "learning_rate": 1.8970771900119865e-05, + "loss": 1.9346, + "step": 24670 + }, + { + "epoch": 0.15511951760846934, + "grad_norm": 8.218207359313965, + "learning_rate": 1.8970352799175212e-05, + "loss": 2.0771, + "step": 24680 + }, + { + "epoch": 0.15518236992516646, + "grad_norm": 7.350327968597412, + "learning_rate": 1.896993369823056e-05, + "loss": 2.0874, + "step": 24690 + }, + { + "epoch": 0.15524522224186357, + "grad_norm": 7.691098213195801, + "learning_rate": 1.8969514597285903e-05, + "loss": 1.8805, + "step": 24700 + }, + { + "epoch": 0.1553080745585607, + "grad_norm": 7.976955890655518, + "learning_rate": 1.896909549634125e-05, + "loss": 2.0366, + "step": 24710 + }, + { + "epoch": 0.1553709268752578, + "grad_norm": 6.466163635253906, + "learning_rate": 1.8968676395396597e-05, + "loss": 2.0855, + "step": 24720 + }, + { + "epoch": 0.1554337791919549, + "grad_norm": 6.638965606689453, + "learning_rate": 1.8968257294451944e-05, + "loss": 2.2471, + "step": 24730 + }, + { + "epoch": 0.155496631508652, + "grad_norm": 6.676800727844238, + "learning_rate": 1.8967838193507288e-05, + "loss": 2.2198, + "step": 24740 + }, + { + "epoch": 0.15555948382534912, + "grad_norm": 7.223644256591797, + "learning_rate": 1.8967419092562635e-05, + "loss": 2.1861, + "step": 24750 + }, + { + "epoch": 0.15562233614204624, + "grad_norm": 7.7987589836120605, + "learning_rate": 1.8966999991617982e-05, + "loss": 2.1355, + "step": 24760 + }, + { + "epoch": 0.15568518845874335, + "grad_norm": 8.250946998596191, + "learning_rate": 1.896658089067333e-05, + 
"loss": 2.149, + "step": 24770 + }, + { + "epoch": 0.15574804077544047, + "grad_norm": 8.638188362121582, + "learning_rate": 1.8966161789728676e-05, + "loss": 2.0937, + "step": 24780 + }, + { + "epoch": 0.15581089309213758, + "grad_norm": 7.171464443206787, + "learning_rate": 1.896574268878402e-05, + "loss": 2.1431, + "step": 24790 + }, + { + "epoch": 0.15587374540883467, + "grad_norm": 8.648698806762695, + "learning_rate": 1.8965323587839367e-05, + "loss": 2.1714, + "step": 24800 + }, + { + "epoch": 0.15593659772553178, + "grad_norm": 7.166804313659668, + "learning_rate": 1.8964904486894714e-05, + "loss": 2.1407, + "step": 24810 + }, + { + "epoch": 0.1559994500422289, + "grad_norm": 7.397407531738281, + "learning_rate": 1.896448538595006e-05, + "loss": 2.0526, + "step": 24820 + }, + { + "epoch": 0.15606230235892601, + "grad_norm": 7.774376392364502, + "learning_rate": 1.8964066285005408e-05, + "loss": 1.9338, + "step": 24830 + }, + { + "epoch": 0.15612515467562313, + "grad_norm": 7.6225457191467285, + "learning_rate": 1.8963647184060755e-05, + "loss": 1.7409, + "step": 24840 + }, + { + "epoch": 0.15618800699232024, + "grad_norm": 11.696316719055176, + "learning_rate": 1.8963228083116102e-05, + "loss": 2.2644, + "step": 24850 + }, + { + "epoch": 0.15625085930901733, + "grad_norm": 8.621893882751465, + "learning_rate": 1.896280898217145e-05, + "loss": 2.0853, + "step": 24860 + }, + { + "epoch": 0.15631371162571445, + "grad_norm": 9.359789848327637, + "learning_rate": 1.8962389881226796e-05, + "loss": 2.2944, + "step": 24870 + }, + { + "epoch": 0.15637656394241156, + "grad_norm": 7.055229187011719, + "learning_rate": 1.896197078028214e-05, + "loss": 1.7894, + "step": 24880 + }, + { + "epoch": 0.15643941625910868, + "grad_norm": 7.7220258712768555, + "learning_rate": 1.8961551679337487e-05, + "loss": 1.9412, + "step": 24890 + }, + { + "epoch": 0.1565022685758058, + "grad_norm": 8.135756492614746, + "learning_rate": 1.8961132578392834e-05, + "loss": 1.9745, + "step": 
24900 + }, + { + "epoch": 0.1565651208925029, + "grad_norm": 7.712810039520264, + "learning_rate": 1.896071347744818e-05, + "loss": 2.1804, + "step": 24910 + }, + { + "epoch": 0.1566279732092, + "grad_norm": 7.599367618560791, + "learning_rate": 1.8960294376503525e-05, + "loss": 2.1436, + "step": 24920 + }, + { + "epoch": 0.1566908255258971, + "grad_norm": 5.9962687492370605, + "learning_rate": 1.8959875275558872e-05, + "loss": 2.0119, + "step": 24930 + }, + { + "epoch": 0.15675367784259422, + "grad_norm": 8.699943542480469, + "learning_rate": 1.895945617461422e-05, + "loss": 1.9338, + "step": 24940 + }, + { + "epoch": 0.15681653015929134, + "grad_norm": 8.431902885437012, + "learning_rate": 1.8959037073669566e-05, + "loss": 2.1374, + "step": 24950 + }, + { + "epoch": 0.15687938247598845, + "grad_norm": 7.762973785400391, + "learning_rate": 1.8958617972724913e-05, + "loss": 2.0149, + "step": 24960 + }, + { + "epoch": 0.15694223479268557, + "grad_norm": 6.656715393066406, + "learning_rate": 1.8958198871780257e-05, + "loss": 2.2427, + "step": 24970 + }, + { + "epoch": 0.15700508710938268, + "grad_norm": 7.184601306915283, + "learning_rate": 1.8957779770835604e-05, + "loss": 2.0844, + "step": 24980 + }, + { + "epoch": 0.15706793942607977, + "grad_norm": 7.309114933013916, + "learning_rate": 1.895736066989095e-05, + "loss": 1.9691, + "step": 24990 + }, + { + "epoch": 0.1571307917427769, + "grad_norm": 7.920795917510986, + "learning_rate": 1.8956941568946298e-05, + "loss": 2.2264, + "step": 25000 + }, + { + "epoch": 0.157193644059474, + "grad_norm": 7.16301965713501, + "learning_rate": 1.8956522468001642e-05, + "loss": 2.1716, + "step": 25010 + }, + { + "epoch": 0.15725649637617112, + "grad_norm": 8.722789764404297, + "learning_rate": 1.895610336705699e-05, + "loss": 2.012, + "step": 25020 + }, + { + "epoch": 0.15731934869286823, + "grad_norm": 7.967345237731934, + "learning_rate": 1.8955684266112336e-05, + "loss": 2.0114, + "step": 25030 + }, + { + "epoch": 
0.15738220100956535, + "grad_norm": 8.800018310546875, + "learning_rate": 1.8955265165167683e-05, + "loss": 1.9146, + "step": 25040 + }, + { + "epoch": 0.15744505332626244, + "grad_norm": 8.132065773010254, + "learning_rate": 1.895484606422303e-05, + "loss": 1.8465, + "step": 25050 + }, + { + "epoch": 0.15750790564295955, + "grad_norm": 7.073143005371094, + "learning_rate": 1.8954426963278377e-05, + "loss": 2.1489, + "step": 25060 + }, + { + "epoch": 0.15757075795965667, + "grad_norm": 7.072580814361572, + "learning_rate": 1.8954007862333724e-05, + "loss": 2.2444, + "step": 25070 + }, + { + "epoch": 0.15763361027635378, + "grad_norm": 6.578721523284912, + "learning_rate": 1.895358876138907e-05, + "loss": 2.2326, + "step": 25080 + }, + { + "epoch": 0.1576964625930509, + "grad_norm": 7.9277024269104, + "learning_rate": 1.895316966044442e-05, + "loss": 2.2134, + "step": 25090 + }, + { + "epoch": 0.157759314909748, + "grad_norm": 8.125829696655273, + "learning_rate": 1.8952750559499762e-05, + "loss": 2.216, + "step": 25100 + }, + { + "epoch": 0.15782216722644513, + "grad_norm": 13.079134941101074, + "learning_rate": 1.895233145855511e-05, + "loss": 2.1439, + "step": 25110 + }, + { + "epoch": 0.1578850195431422, + "grad_norm": 7.656450271606445, + "learning_rate": 1.8951912357610456e-05, + "loss": 2.097, + "step": 25120 + }, + { + "epoch": 0.15794787185983933, + "grad_norm": 6.691035270690918, + "learning_rate": 1.8951493256665803e-05, + "loss": 1.9697, + "step": 25130 + }, + { + "epoch": 0.15801072417653644, + "grad_norm": 7.895198345184326, + "learning_rate": 1.8951074155721147e-05, + "loss": 2.3009, + "step": 25140 + }, + { + "epoch": 0.15807357649323356, + "grad_norm": 8.710088729858398, + "learning_rate": 1.8950655054776494e-05, + "loss": 2.0514, + "step": 25150 + }, + { + "epoch": 0.15813642880993067, + "grad_norm": 7.562583923339844, + "learning_rate": 1.895023595383184e-05, + "loss": 2.0919, + "step": 25160 + }, + { + "epoch": 0.1581992811266278, + "grad_norm": 
7.830014705657959, + "learning_rate": 1.8949816852887188e-05, + "loss": 2.3568, + "step": 25170 + }, + { + "epoch": 0.15826213344332488, + "grad_norm": 6.225578784942627, + "learning_rate": 1.8949397751942535e-05, + "loss": 2.197, + "step": 25180 + }, + { + "epoch": 0.158324985760022, + "grad_norm": 9.283886909484863, + "learning_rate": 1.894897865099788e-05, + "loss": 2.2077, + "step": 25190 + }, + { + "epoch": 0.1583878380767191, + "grad_norm": 7.862932205200195, + "learning_rate": 1.8948559550053226e-05, + "loss": 2.0121, + "step": 25200 + }, + { + "epoch": 0.15845069039341622, + "grad_norm": 7.057699680328369, + "learning_rate": 1.8948140449108573e-05, + "loss": 2.1178, + "step": 25210 + }, + { + "epoch": 0.15851354271011334, + "grad_norm": 6.624111652374268, + "learning_rate": 1.894772134816392e-05, + "loss": 2.1101, + "step": 25220 + }, + { + "epoch": 0.15857639502681045, + "grad_norm": 7.000601768493652, + "learning_rate": 1.8947302247219267e-05, + "loss": 2.126, + "step": 25230 + }, + { + "epoch": 0.15863924734350754, + "grad_norm": 7.766676902770996, + "learning_rate": 1.8946883146274614e-05, + "loss": 2.0739, + "step": 25240 + }, + { + "epoch": 0.15870209966020465, + "grad_norm": 7.2908034324646, + "learning_rate": 1.894646404532996e-05, + "loss": 1.8524, + "step": 25250 + }, + { + "epoch": 0.15876495197690177, + "grad_norm": 6.436234951019287, + "learning_rate": 1.8946044944385305e-05, + "loss": 1.7905, + "step": 25260 + }, + { + "epoch": 0.15882780429359888, + "grad_norm": 7.536283493041992, + "learning_rate": 1.8945625843440652e-05, + "loss": 2.101, + "step": 25270 + }, + { + "epoch": 0.158890656610296, + "grad_norm": 7.62872838973999, + "learning_rate": 1.8945206742496e-05, + "loss": 2.0949, + "step": 25280 + }, + { + "epoch": 0.15895350892699311, + "grad_norm": 6.854900360107422, + "learning_rate": 1.8944787641551346e-05, + "loss": 2.0687, + "step": 25290 + }, + { + "epoch": 0.15901636124369023, + "grad_norm": 7.434235572814941, + "learning_rate": 
1.8944368540606693e-05, + "loss": 2.0897, + "step": 25300 + }, + { + "epoch": 0.15907921356038732, + "grad_norm": 6.923051357269287, + "learning_rate": 1.894394943966204e-05, + "loss": 1.8295, + "step": 25310 + }, + { + "epoch": 0.15914206587708443, + "grad_norm": 8.220632553100586, + "learning_rate": 1.8943530338717384e-05, + "loss": 2.24, + "step": 25320 + }, + { + "epoch": 0.15920491819378155, + "grad_norm": 7.645471096038818, + "learning_rate": 1.894311123777273e-05, + "loss": 2.0898, + "step": 25330 + }, + { + "epoch": 0.15926777051047866, + "grad_norm": 8.202953338623047, + "learning_rate": 1.8942692136828078e-05, + "loss": 2.1, + "step": 25340 + }, + { + "epoch": 0.15933062282717578, + "grad_norm": 6.860551834106445, + "learning_rate": 1.8942273035883425e-05, + "loss": 2.0844, + "step": 25350 + }, + { + "epoch": 0.1593934751438729, + "grad_norm": 6.949239730834961, + "learning_rate": 1.8941853934938772e-05, + "loss": 2.1234, + "step": 25360 + }, + { + "epoch": 0.15945632746056998, + "grad_norm": 6.777586460113525, + "learning_rate": 1.8941434833994116e-05, + "loss": 1.9905, + "step": 25370 + }, + { + "epoch": 0.1595191797772671, + "grad_norm": 7.5527825355529785, + "learning_rate": 1.8941015733049463e-05, + "loss": 2.065, + "step": 25380 + }, + { + "epoch": 0.1595820320939642, + "grad_norm": 7.536026954650879, + "learning_rate": 1.894059663210481e-05, + "loss": 1.9825, + "step": 25390 + }, + { + "epoch": 0.15964488441066133, + "grad_norm": 7.107115745544434, + "learning_rate": 1.8940177531160157e-05, + "loss": 1.9168, + "step": 25400 + }, + { + "epoch": 0.15970773672735844, + "grad_norm": 7.15897274017334, + "learning_rate": 1.89397584302155e-05, + "loss": 1.8647, + "step": 25410 + }, + { + "epoch": 0.15977058904405556, + "grad_norm": 6.944474220275879, + "learning_rate": 1.8939339329270848e-05, + "loss": 2.1414, + "step": 25420 + }, + { + "epoch": 0.15983344136075264, + "grad_norm": 8.514678955078125, + "learning_rate": 1.8938920228326195e-05, + "loss": 
2.1249, + "step": 25430 + }, + { + "epoch": 0.15989629367744976, + "grad_norm": 9.144061088562012, + "learning_rate": 1.8938501127381542e-05, + "loss": 2.322, + "step": 25440 + }, + { + "epoch": 0.15995914599414687, + "grad_norm": 7.6061692237854, + "learning_rate": 1.893808202643689e-05, + "loss": 2.0253, + "step": 25450 + }, + { + "epoch": 0.160021998310844, + "grad_norm": 7.343978404998779, + "learning_rate": 1.8937662925492236e-05, + "loss": 2.1732, + "step": 25460 + }, + { + "epoch": 0.1600848506275411, + "grad_norm": 6.9757490158081055, + "learning_rate": 1.8937243824547583e-05, + "loss": 2.1448, + "step": 25470 + }, + { + "epoch": 0.16014770294423822, + "grad_norm": 7.120965003967285, + "learning_rate": 1.893682472360293e-05, + "loss": 1.9994, + "step": 25480 + }, + { + "epoch": 0.16021055526093533, + "grad_norm": 7.839221954345703, + "learning_rate": 1.8936405622658278e-05, + "loss": 2.1946, + "step": 25490 + }, + { + "epoch": 0.16027340757763242, + "grad_norm": 7.733880519866943, + "learning_rate": 1.893598652171362e-05, + "loss": 2.0072, + "step": 25500 + }, + { + "epoch": 0.16033625989432954, + "grad_norm": 9.782451629638672, + "learning_rate": 1.8935567420768968e-05, + "loss": 2.1506, + "step": 25510 + }, + { + "epoch": 0.16039911221102665, + "grad_norm": 6.857192039489746, + "learning_rate": 1.8935148319824315e-05, + "loss": 1.9458, + "step": 25520 + }, + { + "epoch": 0.16046196452772377, + "grad_norm": 7.340935707092285, + "learning_rate": 1.8934729218879662e-05, + "loss": 2.0559, + "step": 25530 + }, + { + "epoch": 0.16052481684442088, + "grad_norm": 7.422693252563477, + "learning_rate": 1.8934310117935006e-05, + "loss": 2.192, + "step": 25540 + }, + { + "epoch": 0.160587669161118, + "grad_norm": 9.500493049621582, + "learning_rate": 1.8933891016990353e-05, + "loss": 2.1425, + "step": 25550 + }, + { + "epoch": 0.16065052147781508, + "grad_norm": 7.396688938140869, + "learning_rate": 1.89334719160457e-05, + "loss": 2.3758, + "step": 25560 + }, + { + 
"epoch": 0.1607133737945122, + "grad_norm": 6.810682773590088, + "learning_rate": 1.8933052815101047e-05, + "loss": 1.9584, + "step": 25570 + }, + { + "epoch": 0.1607762261112093, + "grad_norm": 7.831878662109375, + "learning_rate": 1.8932633714156394e-05, + "loss": 2.265, + "step": 25580 + }, + { + "epoch": 0.16083907842790643, + "grad_norm": 9.585444450378418, + "learning_rate": 1.8932214613211738e-05, + "loss": 2.083, + "step": 25590 + }, + { + "epoch": 0.16090193074460354, + "grad_norm": 6.930378437042236, + "learning_rate": 1.8931795512267085e-05, + "loss": 2.0618, + "step": 25600 + }, + { + "epoch": 0.16096478306130066, + "grad_norm": 8.438226699829102, + "learning_rate": 1.8931376411322432e-05, + "loss": 1.9928, + "step": 25610 + }, + { + "epoch": 0.16102763537799777, + "grad_norm": 6.970348834991455, + "learning_rate": 1.893095731037778e-05, + "loss": 1.9087, + "step": 25620 + }, + { + "epoch": 0.16109048769469486, + "grad_norm": 8.59546184539795, + "learning_rate": 1.8930538209433126e-05, + "loss": 2.2538, + "step": 25630 + }, + { + "epoch": 0.16115334001139198, + "grad_norm": 6.681682586669922, + "learning_rate": 1.893011910848847e-05, + "loss": 2.2343, + "step": 25640 + }, + { + "epoch": 0.1612161923280891, + "grad_norm": 7.520928382873535, + "learning_rate": 1.8929700007543817e-05, + "loss": 2.2066, + "step": 25650 + }, + { + "epoch": 0.1612790446447862, + "grad_norm": 7.324418544769287, + "learning_rate": 1.8929280906599164e-05, + "loss": 2.0636, + "step": 25660 + }, + { + "epoch": 0.16134189696148332, + "grad_norm": 8.397371292114258, + "learning_rate": 1.892886180565451e-05, + "loss": 2.1226, + "step": 25670 + }, + { + "epoch": 0.16140474927818044, + "grad_norm": 7.918264865875244, + "learning_rate": 1.892844270470986e-05, + "loss": 2.085, + "step": 25680 + }, + { + "epoch": 0.16146760159487752, + "grad_norm": 7.92132568359375, + "learning_rate": 1.8928023603765205e-05, + "loss": 2.0146, + "step": 25690 + }, + { + "epoch": 0.16153045391157464, + 
"grad_norm": 8.547807693481445, + "learning_rate": 1.8927604502820552e-05, + "loss": 1.9715, + "step": 25700 + }, + { + "epoch": 0.16159330622827175, + "grad_norm": 7.091030597686768, + "learning_rate": 1.89271854018759e-05, + "loss": 2.2613, + "step": 25710 + }, + { + "epoch": 0.16165615854496887, + "grad_norm": 6.6840081214904785, + "learning_rate": 1.8926766300931243e-05, + "loss": 2.1982, + "step": 25720 + }, + { + "epoch": 0.16171901086166598, + "grad_norm": 7.225265979766846, + "learning_rate": 1.892634719998659e-05, + "loss": 2.0174, + "step": 25730 + }, + { + "epoch": 0.1617818631783631, + "grad_norm": 7.222609043121338, + "learning_rate": 1.8925928099041937e-05, + "loss": 1.9866, + "step": 25740 + }, + { + "epoch": 0.1618447154950602, + "grad_norm": 7.990882396697998, + "learning_rate": 1.8925508998097284e-05, + "loss": 2.0287, + "step": 25750 + }, + { + "epoch": 0.1619075678117573, + "grad_norm": 6.329669952392578, + "learning_rate": 1.8925089897152628e-05, + "loss": 2.1078, + "step": 25760 + }, + { + "epoch": 0.16197042012845442, + "grad_norm": 8.103155136108398, + "learning_rate": 1.8924670796207975e-05, + "loss": 2.0957, + "step": 25770 + }, + { + "epoch": 0.16203327244515153, + "grad_norm": 8.01485538482666, + "learning_rate": 1.8924251695263322e-05, + "loss": 2.0673, + "step": 25780 + }, + { + "epoch": 0.16209612476184865, + "grad_norm": 6.672882080078125, + "learning_rate": 1.892383259431867e-05, + "loss": 2.2367, + "step": 25790 + }, + { + "epoch": 0.16215897707854576, + "grad_norm": 7.1393561363220215, + "learning_rate": 1.8923413493374016e-05, + "loss": 2.2425, + "step": 25800 + }, + { + "epoch": 0.16222182939524288, + "grad_norm": 7.2413458824157715, + "learning_rate": 1.892299439242936e-05, + "loss": 1.9738, + "step": 25810 + }, + { + "epoch": 0.16228468171193997, + "grad_norm": 7.2407121658325195, + "learning_rate": 1.8922575291484707e-05, + "loss": 2.1412, + "step": 25820 + }, + { + "epoch": 0.16234753402863708, + "grad_norm": 
8.259404182434082, + "learning_rate": 1.8922156190540054e-05, + "loss": 2.3482, + "step": 25830 + }, + { + "epoch": 0.1624103863453342, + "grad_norm": 7.328073501586914, + "learning_rate": 1.89217370895954e-05, + "loss": 2.0186, + "step": 25840 + }, + { + "epoch": 0.1624732386620313, + "grad_norm": 7.726652145385742, + "learning_rate": 1.892131798865075e-05, + "loss": 2.1059, + "step": 25850 + }, + { + "epoch": 0.16253609097872843, + "grad_norm": 7.808199882507324, + "learning_rate": 1.8920898887706095e-05, + "loss": 2.226, + "step": 25860 + }, + { + "epoch": 0.16259894329542554, + "grad_norm": 7.739694118499756, + "learning_rate": 1.8920479786761442e-05, + "loss": 2.3565, + "step": 25870 + }, + { + "epoch": 0.16266179561212263, + "grad_norm": 8.303359031677246, + "learning_rate": 1.892006068581679e-05, + "loss": 2.2056, + "step": 25880 + }, + { + "epoch": 0.16272464792881974, + "grad_norm": 9.105769157409668, + "learning_rate": 1.8919641584872133e-05, + "loss": 1.9957, + "step": 25890 + }, + { + "epoch": 0.16278750024551686, + "grad_norm": 7.577997207641602, + "learning_rate": 1.891922248392748e-05, + "loss": 2.0289, + "step": 25900 + }, + { + "epoch": 0.16285035256221397, + "grad_norm": 8.940987586975098, + "learning_rate": 1.8918803382982827e-05, + "loss": 2.0189, + "step": 25910 + }, + { + "epoch": 0.1629132048789111, + "grad_norm": 7.030361175537109, + "learning_rate": 1.8918384282038174e-05, + "loss": 2.2427, + "step": 25920 + }, + { + "epoch": 0.1629760571956082, + "grad_norm": 7.728364944458008, + "learning_rate": 1.891796518109352e-05, + "loss": 2.1514, + "step": 25930 + }, + { + "epoch": 0.16303890951230532, + "grad_norm": 6.962830543518066, + "learning_rate": 1.8917546080148865e-05, + "loss": 2.1062, + "step": 25940 + }, + { + "epoch": 0.1631017618290024, + "grad_norm": 7.344377517700195, + "learning_rate": 1.8917126979204212e-05, + "loss": 1.8316, + "step": 25950 + }, + { + "epoch": 0.16316461414569952, + "grad_norm": 7.470015048980713, + 
"learning_rate": 1.891670787825956e-05, + "loss": 1.8542, + "step": 25960 + }, + { + "epoch": 0.16322746646239664, + "grad_norm": 7.419188022613525, + "learning_rate": 1.8916288777314906e-05, + "loss": 2.1597, + "step": 25970 + }, + { + "epoch": 0.16329031877909375, + "grad_norm": 8.001127243041992, + "learning_rate": 1.8915869676370253e-05, + "loss": 1.8469, + "step": 25980 + }, + { + "epoch": 0.16335317109579087, + "grad_norm": 7.438764572143555, + "learning_rate": 1.8915450575425597e-05, + "loss": 2.0946, + "step": 25990 + }, + { + "epoch": 0.16341602341248798, + "grad_norm": 7.521275997161865, + "learning_rate": 1.8915031474480944e-05, + "loss": 2.2205, + "step": 26000 + }, + { + "epoch": 0.16347887572918507, + "grad_norm": 7.863826751708984, + "learning_rate": 1.891461237353629e-05, + "loss": 1.8943, + "step": 26010 + }, + { + "epoch": 0.16354172804588218, + "grad_norm": 8.256706237792969, + "learning_rate": 1.891419327259164e-05, + "loss": 2.2367, + "step": 26020 + }, + { + "epoch": 0.1636045803625793, + "grad_norm": 6.641922950744629, + "learning_rate": 1.8913774171646982e-05, + "loss": 2.0092, + "step": 26030 + }, + { + "epoch": 0.16366743267927641, + "grad_norm": 7.038200378417969, + "learning_rate": 1.891335507070233e-05, + "loss": 2.1011, + "step": 26040 + }, + { + "epoch": 0.16373028499597353, + "grad_norm": 7.9291090965271, + "learning_rate": 1.8912935969757676e-05, + "loss": 2.2102, + "step": 26050 + }, + { + "epoch": 0.16379313731267064, + "grad_norm": 7.497622489929199, + "learning_rate": 1.8912516868813023e-05, + "loss": 2.0191, + "step": 26060 + }, + { + "epoch": 0.16385598962936773, + "grad_norm": 8.030279159545898, + "learning_rate": 1.891209776786837e-05, + "loss": 1.9894, + "step": 26070 + }, + { + "epoch": 0.16391884194606485, + "grad_norm": 7.592596054077148, + "learning_rate": 1.8911678666923717e-05, + "loss": 2.0592, + "step": 26080 + }, + { + "epoch": 0.16398169426276196, + "grad_norm": 7.6316304206848145, + "learning_rate": 
1.8911259565979064e-05, + "loss": 2.0823, + "step": 26090 + }, + { + "epoch": 0.16404454657945908, + "grad_norm": 7.14187479019165, + "learning_rate": 1.891084046503441e-05, + "loss": 1.6933, + "step": 26100 + }, + { + "epoch": 0.1641073988961562, + "grad_norm": 6.482819080352783, + "learning_rate": 1.891042136408976e-05, + "loss": 2.0323, + "step": 26110 + }, + { + "epoch": 0.1641702512128533, + "grad_norm": 8.437029838562012, + "learning_rate": 1.8910002263145102e-05, + "loss": 1.9024, + "step": 26120 + }, + { + "epoch": 0.16423310352955042, + "grad_norm": 7.370238780975342, + "learning_rate": 1.890958316220045e-05, + "loss": 2.0056, + "step": 26130 + }, + { + "epoch": 0.1642959558462475, + "grad_norm": 7.479917526245117, + "learning_rate": 1.8909164061255796e-05, + "loss": 2.052, + "step": 26140 + }, + { + "epoch": 0.16435880816294463, + "grad_norm": 8.088809967041016, + "learning_rate": 1.8908744960311144e-05, + "loss": 1.9173, + "step": 26150 + }, + { + "epoch": 0.16442166047964174, + "grad_norm": 7.7105560302734375, + "learning_rate": 1.8908325859366487e-05, + "loss": 2.1858, + "step": 26160 + }, + { + "epoch": 0.16448451279633886, + "grad_norm": 7.6701836585998535, + "learning_rate": 1.8907906758421834e-05, + "loss": 1.9759, + "step": 26170 + }, + { + "epoch": 0.16454736511303597, + "grad_norm": 7.496591091156006, + "learning_rate": 1.890748765747718e-05, + "loss": 2.0881, + "step": 26180 + }, + { + "epoch": 0.16461021742973309, + "grad_norm": 8.055353164672852, + "learning_rate": 1.890706855653253e-05, + "loss": 2.1669, + "step": 26190 + }, + { + "epoch": 0.16467306974643017, + "grad_norm": 7.290207386016846, + "learning_rate": 1.8906649455587875e-05, + "loss": 2.2242, + "step": 26200 + }, + { + "epoch": 0.1647359220631273, + "grad_norm": 7.497858047485352, + "learning_rate": 1.890623035464322e-05, + "loss": 1.9517, + "step": 26210 + }, + { + "epoch": 0.1647987743798244, + "grad_norm": 9.148297309875488, + "learning_rate": 1.8905811253698566e-05, + "loss": 
2.1669, + "step": 26220 + }, + { + "epoch": 0.16486162669652152, + "grad_norm": 7.774691104888916, + "learning_rate": 1.8905392152753913e-05, + "loss": 2.1396, + "step": 26230 + }, + { + "epoch": 0.16492447901321863, + "grad_norm": 7.218595504760742, + "learning_rate": 1.890497305180926e-05, + "loss": 2.0969, + "step": 26240 + }, + { + "epoch": 0.16498733132991575, + "grad_norm": 7.237490653991699, + "learning_rate": 1.8904553950864607e-05, + "loss": 1.8568, + "step": 26250 + }, + { + "epoch": 0.16505018364661284, + "grad_norm": 7.246071815490723, + "learning_rate": 1.8904134849919955e-05, + "loss": 2.1812, + "step": 26260 + }, + { + "epoch": 0.16511303596330995, + "grad_norm": 8.438283920288086, + "learning_rate": 1.8903715748975298e-05, + "loss": 2.1437, + "step": 26270 + }, + { + "epoch": 0.16517588828000707, + "grad_norm": 6.827730655670166, + "learning_rate": 1.8903296648030645e-05, + "loss": 1.963, + "step": 26280 + }, + { + "epoch": 0.16523874059670418, + "grad_norm": 8.639034271240234, + "learning_rate": 1.8902877547085992e-05, + "loss": 2.0042, + "step": 26290 + }, + { + "epoch": 0.1653015929134013, + "grad_norm": 6.844017028808594, + "learning_rate": 1.890245844614134e-05, + "loss": 2.1669, + "step": 26300 + }, + { + "epoch": 0.1653644452300984, + "grad_norm": 6.324168682098389, + "learning_rate": 1.8902039345196686e-05, + "loss": 2.0545, + "step": 26310 + }, + { + "epoch": 0.16542729754679553, + "grad_norm": 6.322104454040527, + "learning_rate": 1.8901620244252034e-05, + "loss": 1.9711, + "step": 26320 + }, + { + "epoch": 0.1654901498634926, + "grad_norm": 12.182509422302246, + "learning_rate": 1.890120114330738e-05, + "loss": 1.9883, + "step": 26330 + }, + { + "epoch": 0.16555300218018973, + "grad_norm": 7.797282695770264, + "learning_rate": 1.8900782042362724e-05, + "loss": 2.1898, + "step": 26340 + }, + { + "epoch": 0.16561585449688684, + "grad_norm": 7.0130615234375, + "learning_rate": 1.890036294141807e-05, + "loss": 1.9196, + "step": 26350 + }, + { 
+ "epoch": 0.16567870681358396, + "grad_norm": 6.654932498931885, + "learning_rate": 1.889994384047342e-05, + "loss": 2.0265, + "step": 26360 + }, + { + "epoch": 0.16574155913028107, + "grad_norm": 8.605035781860352, + "learning_rate": 1.8899524739528766e-05, + "loss": 2.2541, + "step": 26370 + }, + { + "epoch": 0.1658044114469782, + "grad_norm": 6.103670120239258, + "learning_rate": 1.889910563858411e-05, + "loss": 2.1181, + "step": 26380 + }, + { + "epoch": 0.16586726376367528, + "grad_norm": 7.237303733825684, + "learning_rate": 1.8898686537639456e-05, + "loss": 2.0054, + "step": 26390 + }, + { + "epoch": 0.1659301160803724, + "grad_norm": 7.135437488555908, + "learning_rate": 1.8898267436694803e-05, + "loss": 1.9511, + "step": 26400 + }, + { + "epoch": 0.1659929683970695, + "grad_norm": 7.38407039642334, + "learning_rate": 1.889784833575015e-05, + "loss": 1.8589, + "step": 26410 + }, + { + "epoch": 0.16605582071376662, + "grad_norm": 6.739563465118408, + "learning_rate": 1.8897429234805497e-05, + "loss": 2.1611, + "step": 26420 + }, + { + "epoch": 0.16611867303046374, + "grad_norm": 8.108920097351074, + "learning_rate": 1.889701013386084e-05, + "loss": 2.0602, + "step": 26430 + }, + { + "epoch": 0.16618152534716085, + "grad_norm": 8.454718589782715, + "learning_rate": 1.8896591032916188e-05, + "loss": 2.1088, + "step": 26440 + }, + { + "epoch": 0.16624437766385797, + "grad_norm": 6.543862819671631, + "learning_rate": 1.8896171931971535e-05, + "loss": 2.1595, + "step": 26450 + }, + { + "epoch": 0.16630722998055505, + "grad_norm": 6.936466217041016, + "learning_rate": 1.8895752831026882e-05, + "loss": 1.9975, + "step": 26460 + }, + { + "epoch": 0.16637008229725217, + "grad_norm": 8.221345901489258, + "learning_rate": 1.889533373008223e-05, + "loss": 2.0402, + "step": 26470 + }, + { + "epoch": 0.16643293461394928, + "grad_norm": 8.026082038879395, + "learning_rate": 1.8894914629137577e-05, + "loss": 2.0939, + "step": 26480 + }, + { + "epoch": 0.1664957869306464, + 
"grad_norm": 7.128113269805908, + "learning_rate": 1.8894495528192924e-05, + "loss": 1.9315, + "step": 26490 + }, + { + "epoch": 0.16655863924734352, + "grad_norm": 7.8659491539001465, + "learning_rate": 1.889407642724827e-05, + "loss": 2.0228, + "step": 26500 + }, + { + "epoch": 0.16662149156404063, + "grad_norm": 6.884822368621826, + "learning_rate": 1.8893657326303618e-05, + "loss": 2.2244, + "step": 26510 + }, + { + "epoch": 0.16668434388073772, + "grad_norm": 7.679152488708496, + "learning_rate": 1.889323822535896e-05, + "loss": 2.0107, + "step": 26520 + }, + { + "epoch": 0.16674719619743483, + "grad_norm": 7.619312763214111, + "learning_rate": 1.889281912441431e-05, + "loss": 1.9638, + "step": 26530 + }, + { + "epoch": 0.16681004851413195, + "grad_norm": 7.260880947113037, + "learning_rate": 1.8892400023469656e-05, + "loss": 2.0733, + "step": 26540 + }, + { + "epoch": 0.16687290083082906, + "grad_norm": 7.044139385223389, + "learning_rate": 1.8891980922525003e-05, + "loss": 2.1626, + "step": 26550 + }, + { + "epoch": 0.16693575314752618, + "grad_norm": 7.697073936462402, + "learning_rate": 1.8891561821580346e-05, + "loss": 1.9792, + "step": 26560 + }, + { + "epoch": 0.1669986054642233, + "grad_norm": 7.447133541107178, + "learning_rate": 1.8891142720635693e-05, + "loss": 2.0448, + "step": 26570 + }, + { + "epoch": 0.16706145778092038, + "grad_norm": 7.185406684875488, + "learning_rate": 1.889072361969104e-05, + "loss": 2.2215, + "step": 26580 + }, + { + "epoch": 0.1671243100976175, + "grad_norm": 8.131444931030273, + "learning_rate": 1.8890304518746388e-05, + "loss": 2.1474, + "step": 26590 + }, + { + "epoch": 0.1671871624143146, + "grad_norm": 6.428308010101318, + "learning_rate": 1.8889885417801735e-05, + "loss": 2.1216, + "step": 26600 + }, + { + "epoch": 0.16725001473101173, + "grad_norm": 7.927680969238281, + "learning_rate": 1.8889466316857078e-05, + "loss": 2.0213, + "step": 26610 + }, + { + "epoch": 0.16731286704770884, + "grad_norm": 
7.277192115783691, + "learning_rate": 1.8889047215912425e-05, + "loss": 1.8976, + "step": 26620 + }, + { + "epoch": 0.16737571936440596, + "grad_norm": 7.823034763336182, + "learning_rate": 1.8888628114967772e-05, + "loss": 1.9142, + "step": 26630 + }, + { + "epoch": 0.16743857168110307, + "grad_norm": 7.691032886505127, + "learning_rate": 1.888820901402312e-05, + "loss": 2.1413, + "step": 26640 + }, + { + "epoch": 0.16750142399780016, + "grad_norm": 7.83949613571167, + "learning_rate": 1.8887789913078463e-05, + "loss": 2.1934, + "step": 26650 + }, + { + "epoch": 0.16756427631449727, + "grad_norm": 7.6085944175720215, + "learning_rate": 1.888737081213381e-05, + "loss": 2.0881, + "step": 26660 + }, + { + "epoch": 0.1676271286311944, + "grad_norm": 7.7130446434021, + "learning_rate": 1.8886951711189157e-05, + "loss": 2.0636, + "step": 26670 + }, + { + "epoch": 0.1676899809478915, + "grad_norm": 6.7432026863098145, + "learning_rate": 1.8886532610244504e-05, + "loss": 1.8504, + "step": 26680 + }, + { + "epoch": 0.16775283326458862, + "grad_norm": 7.776494026184082, + "learning_rate": 1.888611350929985e-05, + "loss": 2.0823, + "step": 26690 + }, + { + "epoch": 0.16781568558128573, + "grad_norm": 6.37518835067749, + "learning_rate": 1.88856944083552e-05, + "loss": 2.0266, + "step": 26700 + }, + { + "epoch": 0.16787853789798282, + "grad_norm": 7.716270923614502, + "learning_rate": 1.8885275307410546e-05, + "loss": 2.1018, + "step": 26710 + }, + { + "epoch": 0.16794139021467994, + "grad_norm": 7.457879543304443, + "learning_rate": 1.8884856206465893e-05, + "loss": 2.296, + "step": 26720 + }, + { + "epoch": 0.16800424253137705, + "grad_norm": 7.101696014404297, + "learning_rate": 1.888443710552124e-05, + "loss": 2.1638, + "step": 26730 + }, + { + "epoch": 0.16806709484807417, + "grad_norm": 7.180225372314453, + "learning_rate": 1.8884018004576583e-05, + "loss": 1.7523, + "step": 26740 + }, + { + "epoch": 0.16812994716477128, + "grad_norm": 6.898594379425049, + 
"learning_rate": 1.888359890363193e-05, + "loss": 2.0864, + "step": 26750 + }, + { + "epoch": 0.1681927994814684, + "grad_norm": 7.147555351257324, + "learning_rate": 1.8883179802687278e-05, + "loss": 1.9752, + "step": 26760 + }, + { + "epoch": 0.16825565179816548, + "grad_norm": 7.030953884124756, + "learning_rate": 1.8882760701742625e-05, + "loss": 2.0757, + "step": 26770 + }, + { + "epoch": 0.1683185041148626, + "grad_norm": 8.195552825927734, + "learning_rate": 1.888234160079797e-05, + "loss": 1.9025, + "step": 26780 + }, + { + "epoch": 0.16838135643155971, + "grad_norm": 6.486485004425049, + "learning_rate": 1.8881922499853315e-05, + "loss": 2.0026, + "step": 26790 + }, + { + "epoch": 0.16844420874825683, + "grad_norm": 7.766326427459717, + "learning_rate": 1.8881503398908662e-05, + "loss": 2.0227, + "step": 26800 + }, + { + "epoch": 0.16850706106495394, + "grad_norm": 7.248443603515625, + "learning_rate": 1.888108429796401e-05, + "loss": 1.962, + "step": 26810 + }, + { + "epoch": 0.16856991338165106, + "grad_norm": 7.111790180206299, + "learning_rate": 1.8880665197019357e-05, + "loss": 2.094, + "step": 26820 + }, + { + "epoch": 0.16863276569834817, + "grad_norm": 7.6839118003845215, + "learning_rate": 1.88802460960747e-05, + "loss": 2.2033, + "step": 26830 + }, + { + "epoch": 0.16869561801504526, + "grad_norm": 8.051858901977539, + "learning_rate": 1.8879826995130047e-05, + "loss": 2.1157, + "step": 26840 + }, + { + "epoch": 0.16875847033174238, + "grad_norm": 8.392078399658203, + "learning_rate": 1.8879407894185394e-05, + "loss": 2.0272, + "step": 26850 + }, + { + "epoch": 0.1688213226484395, + "grad_norm": 7.104002475738525, + "learning_rate": 1.887898879324074e-05, + "loss": 2.0796, + "step": 26860 + }, + { + "epoch": 0.1688841749651366, + "grad_norm": 7.312976360321045, + "learning_rate": 1.887856969229609e-05, + "loss": 2.0547, + "step": 26870 + }, + { + "epoch": 0.16894702728183372, + "grad_norm": 6.258914947509766, + "learning_rate": 
1.8878150591351436e-05, + "loss": 2.2227, + "step": 26880 + }, + { + "epoch": 0.16900987959853084, + "grad_norm": 7.952404499053955, + "learning_rate": 1.887773149040678e-05, + "loss": 2.2863, + "step": 26890 + }, + { + "epoch": 0.16907273191522793, + "grad_norm": 7.923120975494385, + "learning_rate": 1.8877312389462126e-05, + "loss": 1.9709, + "step": 26900 + }, + { + "epoch": 0.16913558423192504, + "grad_norm": 6.05596399307251, + "learning_rate": 1.8876893288517473e-05, + "loss": 2.0636, + "step": 26910 + }, + { + "epoch": 0.16919843654862216, + "grad_norm": 7.628379821777344, + "learning_rate": 1.887647418757282e-05, + "loss": 2.0983, + "step": 26920 + }, + { + "epoch": 0.16926128886531927, + "grad_norm": 7.406306743621826, + "learning_rate": 1.8876055086628168e-05, + "loss": 1.9923, + "step": 26930 + }, + { + "epoch": 0.16932414118201639, + "grad_norm": 7.642539024353027, + "learning_rate": 1.8875635985683515e-05, + "loss": 2.0245, + "step": 26940 + }, + { + "epoch": 0.1693869934987135, + "grad_norm": 7.147662162780762, + "learning_rate": 1.8875216884738862e-05, + "loss": 1.8431, + "step": 26950 + }, + { + "epoch": 0.16944984581541062, + "grad_norm": 7.125799179077148, + "learning_rate": 1.8874797783794205e-05, + "loss": 2.195, + "step": 26960 + }, + { + "epoch": 0.1695126981321077, + "grad_norm": 8.474396705627441, + "learning_rate": 1.8874378682849552e-05, + "loss": 2.0423, + "step": 26970 + }, + { + "epoch": 0.16957555044880482, + "grad_norm": 7.183737754821777, + "learning_rate": 1.88739595819049e-05, + "loss": 1.7227, + "step": 26980 + }, + { + "epoch": 0.16963840276550193, + "grad_norm": 7.125051975250244, + "learning_rate": 1.8873540480960247e-05, + "loss": 2.0083, + "step": 26990 + }, + { + "epoch": 0.16970125508219905, + "grad_norm": 7.695666790008545, + "learning_rate": 1.887312138001559e-05, + "loss": 2.1497, + "step": 27000 + }, + { + "epoch": 0.16976410739889616, + "grad_norm": 7.176787376403809, + "learning_rate": 1.8872702279070937e-05, + 
"loss": 2.3338, + "step": 27010 + }, + { + "epoch": 0.16982695971559328, + "grad_norm": 7.642787933349609, + "learning_rate": 1.8872283178126284e-05, + "loss": 2.1084, + "step": 27020 + }, + { + "epoch": 0.16988981203229037, + "grad_norm": 7.733269691467285, + "learning_rate": 1.887186407718163e-05, + "loss": 2.1504, + "step": 27030 + }, + { + "epoch": 0.16995266434898748, + "grad_norm": 7.00707483291626, + "learning_rate": 1.887144497623698e-05, + "loss": 2.192, + "step": 27040 + }, + { + "epoch": 0.1700155166656846, + "grad_norm": 6.551406383514404, + "learning_rate": 1.8871025875292322e-05, + "loss": 1.9258, + "step": 27050 + }, + { + "epoch": 0.1700783689823817, + "grad_norm": 8.196138381958008, + "learning_rate": 1.887060677434767e-05, + "loss": 1.7894, + "step": 27060 + }, + { + "epoch": 0.17014122129907883, + "grad_norm": 7.6168107986450195, + "learning_rate": 1.8870187673403016e-05, + "loss": 2.0308, + "step": 27070 + }, + { + "epoch": 0.17020407361577594, + "grad_norm": 7.582335948944092, + "learning_rate": 1.8869768572458363e-05, + "loss": 2.0958, + "step": 27080 + }, + { + "epoch": 0.17026692593247303, + "grad_norm": 7.733460903167725, + "learning_rate": 1.886934947151371e-05, + "loss": 2.0958, + "step": 27090 + }, + { + "epoch": 0.17032977824917014, + "grad_norm": 7.207364082336426, + "learning_rate": 1.8868930370569058e-05, + "loss": 2.0868, + "step": 27100 + }, + { + "epoch": 0.17039263056586726, + "grad_norm": 7.0060296058654785, + "learning_rate": 1.8868511269624405e-05, + "loss": 1.9792, + "step": 27110 + }, + { + "epoch": 0.17045548288256437, + "grad_norm": 6.219759464263916, + "learning_rate": 1.8868092168679752e-05, + "loss": 2.0264, + "step": 27120 + }, + { + "epoch": 0.1705183351992615, + "grad_norm": 7.041900634765625, + "learning_rate": 1.88676730677351e-05, + "loss": 2.0412, + "step": 27130 + }, + { + "epoch": 0.1705811875159586, + "grad_norm": 7.1881890296936035, + "learning_rate": 1.8867253966790443e-05, + "loss": 1.8791, + "step": 27140 
+ }, + { + "epoch": 0.17064403983265572, + "grad_norm": 7.2067341804504395, + "learning_rate": 1.886683486584579e-05, + "loss": 1.9835, + "step": 27150 + }, + { + "epoch": 0.1707068921493528, + "grad_norm": 6.838316440582275, + "learning_rate": 1.8866415764901137e-05, + "loss": 1.8088, + "step": 27160 + }, + { + "epoch": 0.17076974446604992, + "grad_norm": 6.98081111907959, + "learning_rate": 1.8865996663956484e-05, + "loss": 2.108, + "step": 27170 + }, + { + "epoch": 0.17083259678274704, + "grad_norm": 7.362522125244141, + "learning_rate": 1.8865577563011827e-05, + "loss": 1.9587, + "step": 27180 + }, + { + "epoch": 0.17089544909944415, + "grad_norm": 7.864029407501221, + "learning_rate": 1.8865158462067174e-05, + "loss": 2.0782, + "step": 27190 + }, + { + "epoch": 0.17095830141614127, + "grad_norm": 6.886123180389404, + "learning_rate": 1.886473936112252e-05, + "loss": 1.8335, + "step": 27200 + }, + { + "epoch": 0.17102115373283838, + "grad_norm": 6.821227550506592, + "learning_rate": 1.886432026017787e-05, + "loss": 2.0717, + "step": 27210 + }, + { + "epoch": 0.17108400604953547, + "grad_norm": 7.68241548538208, + "learning_rate": 1.8863901159233216e-05, + "loss": 2.064, + "step": 27220 + }, + { + "epoch": 0.17114685836623258, + "grad_norm": 6.796754837036133, + "learning_rate": 1.886348205828856e-05, + "loss": 2.0502, + "step": 27230 + }, + { + "epoch": 0.1712097106829297, + "grad_norm": 6.333973407745361, + "learning_rate": 1.8863062957343906e-05, + "loss": 1.869, + "step": 27240 + }, + { + "epoch": 0.17127256299962682, + "grad_norm": 9.181356430053711, + "learning_rate": 1.8862643856399254e-05, + "loss": 2.0765, + "step": 27250 + }, + { + "epoch": 0.17133541531632393, + "grad_norm": 6.63329553604126, + "learning_rate": 1.88622247554546e-05, + "loss": 1.8724, + "step": 27260 + }, + { + "epoch": 0.17139826763302105, + "grad_norm": 7.677600860595703, + "learning_rate": 1.8861805654509944e-05, + "loss": 1.8641, + "step": 27270 + }, + { + "epoch": 
0.17146111994971816, + "grad_norm": 8.482681274414062, + "learning_rate": 1.886138655356529e-05, + "loss": 1.818, + "step": 27280 + }, + { + "epoch": 0.17152397226641525, + "grad_norm": 9.265092849731445, + "learning_rate": 1.886096745262064e-05, + "loss": 2.2261, + "step": 27290 + }, + { + "epoch": 0.17158682458311236, + "grad_norm": 8.26068115234375, + "learning_rate": 1.8860548351675985e-05, + "loss": 1.9023, + "step": 27300 + }, + { + "epoch": 0.17164967689980948, + "grad_norm": 7.694910526275635, + "learning_rate": 1.8860129250731333e-05, + "loss": 2.0018, + "step": 27310 + }, + { + "epoch": 0.1717125292165066, + "grad_norm": 7.138001918792725, + "learning_rate": 1.885971014978668e-05, + "loss": 2.173, + "step": 27320 + }, + { + "epoch": 0.1717753815332037, + "grad_norm": 6.843593597412109, + "learning_rate": 1.8859291048842027e-05, + "loss": 1.8752, + "step": 27330 + }, + { + "epoch": 0.17183823384990082, + "grad_norm": 8.503833770751953, + "learning_rate": 1.8858871947897374e-05, + "loss": 1.8912, + "step": 27340 + }, + { + "epoch": 0.1719010861665979, + "grad_norm": 7.459292888641357, + "learning_rate": 1.885845284695272e-05, + "loss": 1.9018, + "step": 27350 + }, + { + "epoch": 0.17196393848329503, + "grad_norm": 6.216436862945557, + "learning_rate": 1.8858033746008065e-05, + "loss": 1.8553, + "step": 27360 + }, + { + "epoch": 0.17202679079999214, + "grad_norm": 8.26440715789795, + "learning_rate": 1.885761464506341e-05, + "loss": 2.0397, + "step": 27370 + }, + { + "epoch": 0.17208964311668926, + "grad_norm": 6.738312721252441, + "learning_rate": 1.885719554411876e-05, + "loss": 2.0152, + "step": 27380 + }, + { + "epoch": 0.17215249543338637, + "grad_norm": 6.979240894317627, + "learning_rate": 1.8856776443174106e-05, + "loss": 1.9694, + "step": 27390 + }, + { + "epoch": 0.1722153477500835, + "grad_norm": 7.805268287658691, + "learning_rate": 1.885635734222945e-05, + "loss": 1.9635, + "step": 27400 + }, + { + "epoch": 0.17227820006678057, + "grad_norm": 
7.3537917137146, + "learning_rate": 1.8855938241284796e-05, + "loss": 2.1664, + "step": 27410 + }, + { + "epoch": 0.1723410523834777, + "grad_norm": 7.4962615966796875, + "learning_rate": 1.8855519140340144e-05, + "loss": 2.1061, + "step": 27420 + }, + { + "epoch": 0.1724039047001748, + "grad_norm": 7.050327777862549, + "learning_rate": 1.885510003939549e-05, + "loss": 2.2767, + "step": 27430 + }, + { + "epoch": 0.17246675701687192, + "grad_norm": 7.685523986816406, + "learning_rate": 1.8854680938450838e-05, + "loss": 1.9987, + "step": 27440 + }, + { + "epoch": 0.17252960933356903, + "grad_norm": 7.283895015716553, + "learning_rate": 1.885426183750618e-05, + "loss": 1.8152, + "step": 27450 + }, + { + "epoch": 0.17259246165026615, + "grad_norm": 7.353915691375732, + "learning_rate": 1.885384273656153e-05, + "loss": 2.2302, + "step": 27460 + }, + { + "epoch": 0.17265531396696326, + "grad_norm": 7.832383155822754, + "learning_rate": 1.8853423635616876e-05, + "loss": 2.0384, + "step": 27470 + }, + { + "epoch": 0.17271816628366035, + "grad_norm": 8.037903785705566, + "learning_rate": 1.8853004534672223e-05, + "loss": 2.2001, + "step": 27480 + }, + { + "epoch": 0.17278101860035747, + "grad_norm": 8.03400993347168, + "learning_rate": 1.885258543372757e-05, + "loss": 1.7838, + "step": 27490 + }, + { + "epoch": 0.17284387091705458, + "grad_norm": 7.769232273101807, + "learning_rate": 1.8852166332782917e-05, + "loss": 1.9368, + "step": 27500 + }, + { + "epoch": 0.1729067232337517, + "grad_norm": 8.408327102661133, + "learning_rate": 1.8851747231838264e-05, + "loss": 2.1784, + "step": 27510 + }, + { + "epoch": 0.1729695755504488, + "grad_norm": 6.302871227264404, + "learning_rate": 1.8851328130893607e-05, + "loss": 2.1831, + "step": 27520 + }, + { + "epoch": 0.17303242786714593, + "grad_norm": 7.579148769378662, + "learning_rate": 1.8850909029948955e-05, + "loss": 1.984, + "step": 27530 + }, + { + "epoch": 0.17309528018384301, + "grad_norm": 5.984960556030273, + 
"learning_rate": 1.88504899290043e-05, + "loss": 2.1541, + "step": 27540 + }, + { + "epoch": 0.17315813250054013, + "grad_norm": 6.045041561126709, + "learning_rate": 1.885007082805965e-05, + "loss": 1.9877, + "step": 27550 + }, + { + "epoch": 0.17322098481723724, + "grad_norm": 7.484976768493652, + "learning_rate": 1.8849651727114996e-05, + "loss": 2.097, + "step": 27560 + }, + { + "epoch": 0.17328383713393436, + "grad_norm": 8.815256118774414, + "learning_rate": 1.8849232626170343e-05, + "loss": 1.9681, + "step": 27570 + }, + { + "epoch": 0.17334668945063147, + "grad_norm": 6.937033653259277, + "learning_rate": 1.8848813525225687e-05, + "loss": 1.8963, + "step": 27580 + }, + { + "epoch": 0.1734095417673286, + "grad_norm": 7.550961494445801, + "learning_rate": 1.8848394424281034e-05, + "loss": 1.8807, + "step": 27590 + }, + { + "epoch": 0.17347239408402568, + "grad_norm": 7.170886993408203, + "learning_rate": 1.884797532333638e-05, + "loss": 1.9588, + "step": 27600 + }, + { + "epoch": 0.1735352464007228, + "grad_norm": 8.13257884979248, + "learning_rate": 1.8847556222391728e-05, + "loss": 2.0812, + "step": 27610 + }, + { + "epoch": 0.1735980987174199, + "grad_norm": 6.796542167663574, + "learning_rate": 1.884713712144707e-05, + "loss": 1.788, + "step": 27620 + }, + { + "epoch": 0.17366095103411702, + "grad_norm": 6.174264430999756, + "learning_rate": 1.884671802050242e-05, + "loss": 1.8799, + "step": 27630 + }, + { + "epoch": 0.17372380335081414, + "grad_norm": 6.56820821762085, + "learning_rate": 1.8846298919557766e-05, + "loss": 1.9007, + "step": 27640 + }, + { + "epoch": 0.17378665566751125, + "grad_norm": 8.850946426391602, + "learning_rate": 1.8845879818613113e-05, + "loss": 2.0807, + "step": 27650 + }, + { + "epoch": 0.17384950798420837, + "grad_norm": 8.081398963928223, + "learning_rate": 1.884546071766846e-05, + "loss": 1.9101, + "step": 27660 + }, + { + "epoch": 0.17391236030090546, + "grad_norm": 7.104781150817871, + "learning_rate": 
1.8845041616723803e-05, + "loss": 1.8105, + "step": 27670 + }, + { + "epoch": 0.17397521261760257, + "grad_norm": 6.3523173332214355, + "learning_rate": 1.884462251577915e-05, + "loss": 1.9426, + "step": 27680 + }, + { + "epoch": 0.17403806493429969, + "grad_norm": 6.756746768951416, + "learning_rate": 1.8844203414834498e-05, + "loss": 2.2213, + "step": 27690 + }, + { + "epoch": 0.1741009172509968, + "grad_norm": 7.961772441864014, + "learning_rate": 1.8843784313889845e-05, + "loss": 1.9764, + "step": 27700 + }, + { + "epoch": 0.17416376956769392, + "grad_norm": 7.900453090667725, + "learning_rate": 1.884336521294519e-05, + "loss": 2.0987, + "step": 27710 + }, + { + "epoch": 0.17422662188439103, + "grad_norm": 8.148445129394531, + "learning_rate": 1.884294611200054e-05, + "loss": 1.9654, + "step": 27720 + }, + { + "epoch": 0.17428947420108812, + "grad_norm": 7.783229351043701, + "learning_rate": 1.8842527011055886e-05, + "loss": 1.9184, + "step": 27730 + }, + { + "epoch": 0.17435232651778523, + "grad_norm": 8.24295711517334, + "learning_rate": 1.8842107910111233e-05, + "loss": 2.1139, + "step": 27740 + }, + { + "epoch": 0.17441517883448235, + "grad_norm": 8.228842735290527, + "learning_rate": 1.884168880916658e-05, + "loss": 2.1052, + "step": 27750 + }, + { + "epoch": 0.17447803115117946, + "grad_norm": 6.867830276489258, + "learning_rate": 1.8841269708221924e-05, + "loss": 2.0018, + "step": 27760 + }, + { + "epoch": 0.17454088346787658, + "grad_norm": 6.347137928009033, + "learning_rate": 1.884085060727727e-05, + "loss": 1.9588, + "step": 27770 + }, + { + "epoch": 0.1746037357845737, + "grad_norm": 7.055315017700195, + "learning_rate": 1.8840431506332618e-05, + "loss": 2.227, + "step": 27780 + }, + { + "epoch": 0.1746665881012708, + "grad_norm": 7.518227577209473, + "learning_rate": 1.8840012405387965e-05, + "loss": 1.93, + "step": 27790 + }, + { + "epoch": 0.1747294404179679, + "grad_norm": 8.064926147460938, + "learning_rate": 1.883959330444331e-05, + "loss": 
2.0228, + "step": 27800 + }, + { + "epoch": 0.174792292734665, + "grad_norm": 8.549365043640137, + "learning_rate": 1.8839174203498656e-05, + "loss": 1.9405, + "step": 27810 + }, + { + "epoch": 0.17485514505136213, + "grad_norm": 7.1288228034973145, + "learning_rate": 1.8838755102554003e-05, + "loss": 2.3457, + "step": 27820 + }, + { + "epoch": 0.17491799736805924, + "grad_norm": 6.547165870666504, + "learning_rate": 1.883833600160935e-05, + "loss": 2.0101, + "step": 27830 + }, + { + "epoch": 0.17498084968475636, + "grad_norm": 7.813448905944824, + "learning_rate": 1.8837916900664697e-05, + "loss": 1.825, + "step": 27840 + }, + { + "epoch": 0.17504370200145347, + "grad_norm": 6.4091410636901855, + "learning_rate": 1.883749779972004e-05, + "loss": 1.8868, + "step": 27850 + }, + { + "epoch": 0.17510655431815056, + "grad_norm": 8.775810241699219, + "learning_rate": 1.8837078698775388e-05, + "loss": 1.9993, + "step": 27860 + }, + { + "epoch": 0.17516940663484767, + "grad_norm": 7.959566593170166, + "learning_rate": 1.8836659597830735e-05, + "loss": 2.2209, + "step": 27870 + }, + { + "epoch": 0.1752322589515448, + "grad_norm": 7.217938423156738, + "learning_rate": 1.8836240496886082e-05, + "loss": 2.1552, + "step": 27880 + }, + { + "epoch": 0.1752951112682419, + "grad_norm": 7.476831912994385, + "learning_rate": 1.883582139594143e-05, + "loss": 2.0758, + "step": 27890 + }, + { + "epoch": 0.17535796358493902, + "grad_norm": 6.8728461265563965, + "learning_rate": 1.8835402294996772e-05, + "loss": 2.0728, + "step": 27900 + }, + { + "epoch": 0.17542081590163613, + "grad_norm": 7.740730285644531, + "learning_rate": 1.883498319405212e-05, + "loss": 1.8849, + "step": 27910 + }, + { + "epoch": 0.17548366821833322, + "grad_norm": 6.934227466583252, + "learning_rate": 1.8834564093107467e-05, + "loss": 1.8524, + "step": 27920 + }, + { + "epoch": 0.17554652053503034, + "grad_norm": 6.788956165313721, + "learning_rate": 1.8834144992162814e-05, + "loss": 1.92, + "step": 27930 + }, + 
{ + "epoch": 0.17560937285172745, + "grad_norm": 8.07392692565918, + "learning_rate": 1.883372589121816e-05, + "loss": 2.0643, + "step": 27940 + }, + { + "epoch": 0.17567222516842457, + "grad_norm": 8.197890281677246, + "learning_rate": 1.8833306790273508e-05, + "loss": 2.2463, + "step": 27950 + }, + { + "epoch": 0.17573507748512168, + "grad_norm": 7.5556721687316895, + "learning_rate": 1.8832887689328855e-05, + "loss": 2.1021, + "step": 27960 + }, + { + "epoch": 0.1757979298018188, + "grad_norm": 6.970726013183594, + "learning_rate": 1.8832468588384202e-05, + "loss": 1.8875, + "step": 27970 + }, + { + "epoch": 0.1758607821185159, + "grad_norm": 7.748169422149658, + "learning_rate": 1.8832049487439546e-05, + "loss": 1.8909, + "step": 27980 + }, + { + "epoch": 0.175923634435213, + "grad_norm": 7.20039176940918, + "learning_rate": 1.8831630386494893e-05, + "loss": 1.8259, + "step": 27990 + }, + { + "epoch": 0.17598648675191012, + "grad_norm": 8.26569938659668, + "learning_rate": 1.883121128555024e-05, + "loss": 2.378, + "step": 28000 + }, + { + "epoch": 0.17604933906860723, + "grad_norm": 7.183000087738037, + "learning_rate": 1.8830792184605587e-05, + "loss": 2.3396, + "step": 28010 + }, + { + "epoch": 0.17611219138530435, + "grad_norm": 5.683530330657959, + "learning_rate": 1.883037308366093e-05, + "loss": 1.9981, + "step": 28020 + }, + { + "epoch": 0.17617504370200146, + "grad_norm": 7.313313961029053, + "learning_rate": 1.8829953982716278e-05, + "loss": 2.0859, + "step": 28030 + }, + { + "epoch": 0.17623789601869858, + "grad_norm": 7.710696220397949, + "learning_rate": 1.8829534881771625e-05, + "loss": 2.0126, + "step": 28040 + }, + { + "epoch": 0.17630074833539566, + "grad_norm": 7.8255510330200195, + "learning_rate": 1.8829115780826972e-05, + "loss": 1.8825, + "step": 28050 + }, + { + "epoch": 0.17636360065209278, + "grad_norm": 8.450091361999512, + "learning_rate": 1.882869667988232e-05, + "loss": 2.1833, + "step": 28060 + }, + { + "epoch": 0.1764264529687899, 
+ "grad_norm": 8.356134414672852, + "learning_rate": 1.8828277578937662e-05, + "loss": 1.9364, + "step": 28070 + }, + { + "epoch": 0.176489305285487, + "grad_norm": 6.771615505218506, + "learning_rate": 1.882785847799301e-05, + "loss": 1.9951, + "step": 28080 + }, + { + "epoch": 0.17655215760218412, + "grad_norm": 7.926447868347168, + "learning_rate": 1.8827439377048357e-05, + "loss": 2.0846, + "step": 28090 + }, + { + "epoch": 0.17661500991888124, + "grad_norm": 7.876333236694336, + "learning_rate": 1.8827020276103704e-05, + "loss": 1.9776, + "step": 28100 + }, + { + "epoch": 0.17667786223557833, + "grad_norm": 6.883592128753662, + "learning_rate": 1.882660117515905e-05, + "loss": 2.2083, + "step": 28110 + }, + { + "epoch": 0.17674071455227544, + "grad_norm": 6.218436241149902, + "learning_rate": 1.8826182074214398e-05, + "loss": 1.9894, + "step": 28120 + }, + { + "epoch": 0.17680356686897256, + "grad_norm": 8.508634567260742, + "learning_rate": 1.8825762973269745e-05, + "loss": 2.1307, + "step": 28130 + }, + { + "epoch": 0.17686641918566967, + "grad_norm": 7.165008068084717, + "learning_rate": 1.8825343872325092e-05, + "loss": 2.0333, + "step": 28140 + }, + { + "epoch": 0.1769292715023668, + "grad_norm": 7.859193325042725, + "learning_rate": 1.8824924771380436e-05, + "loss": 1.9486, + "step": 28150 + }, + { + "epoch": 0.1769921238190639, + "grad_norm": 6.977528095245361, + "learning_rate": 1.8824505670435783e-05, + "loss": 1.9967, + "step": 28160 + }, + { + "epoch": 0.17705497613576102, + "grad_norm": 6.745163440704346, + "learning_rate": 1.882408656949113e-05, + "loss": 1.9882, + "step": 28170 + }, + { + "epoch": 0.1771178284524581, + "grad_norm": 7.4453840255737305, + "learning_rate": 1.8823667468546477e-05, + "loss": 2.1332, + "step": 28180 + }, + { + "epoch": 0.17718068076915522, + "grad_norm": 7.185011863708496, + "learning_rate": 1.8823248367601824e-05, + "loss": 1.9738, + "step": 28190 + }, + { + "epoch": 0.17724353308585233, + "grad_norm": 
7.705361843109131, + "learning_rate": 1.8822829266657168e-05, + "loss": 2.3986, + "step": 28200 + }, + { + "epoch": 0.17730638540254945, + "grad_norm": 7.561318874359131, + "learning_rate": 1.8822410165712515e-05, + "loss": 2.006, + "step": 28210 + }, + { + "epoch": 0.17736923771924656, + "grad_norm": 6.81564998626709, + "learning_rate": 1.8821991064767862e-05, + "loss": 1.6773, + "step": 28220 + }, + { + "epoch": 0.17743209003594368, + "grad_norm": 7.152491092681885, + "learning_rate": 1.882157196382321e-05, + "loss": 2.0266, + "step": 28230 + }, + { + "epoch": 0.17749494235264077, + "grad_norm": 7.514481067657471, + "learning_rate": 1.8821152862878553e-05, + "loss": 1.9871, + "step": 28240 + }, + { + "epoch": 0.17755779466933788, + "grad_norm": 7.656121253967285, + "learning_rate": 1.88207337619339e-05, + "loss": 2.1298, + "step": 28250 + }, + { + "epoch": 0.177620646986035, + "grad_norm": 7.641325950622559, + "learning_rate": 1.8820314660989247e-05, + "loss": 2.1423, + "step": 28260 + }, + { + "epoch": 0.1776834993027321, + "grad_norm": 8.13333511352539, + "learning_rate": 1.8819895560044594e-05, + "loss": 1.9955, + "step": 28270 + }, + { + "epoch": 0.17774635161942923, + "grad_norm": 6.73756217956543, + "learning_rate": 1.881947645909994e-05, + "loss": 2.2309, + "step": 28280 + }, + { + "epoch": 0.17780920393612634, + "grad_norm": 7.742393970489502, + "learning_rate": 1.8819057358155284e-05, + "loss": 2.0561, + "step": 28290 + }, + { + "epoch": 0.17787205625282346, + "grad_norm": 6.713929653167725, + "learning_rate": 1.881863825721063e-05, + "loss": 1.931, + "step": 28300 + }, + { + "epoch": 0.17793490856952054, + "grad_norm": 6.656432151794434, + "learning_rate": 1.881821915626598e-05, + "loss": 1.9597, + "step": 28310 + }, + { + "epoch": 0.17799776088621766, + "grad_norm": 7.4837646484375, + "learning_rate": 1.8817800055321326e-05, + "loss": 2.1406, + "step": 28320 + }, + { + "epoch": 0.17806061320291477, + "grad_norm": 6.651392459869385, + "learning_rate": 
1.8817380954376673e-05, + "loss": 1.9187, + "step": 28330 + }, + { + "epoch": 0.1781234655196119, + "grad_norm": 7.317150592803955, + "learning_rate": 1.881696185343202e-05, + "loss": 2.0177, + "step": 28340 + }, + { + "epoch": 0.178186317836309, + "grad_norm": 7.599892616271973, + "learning_rate": 1.8816542752487367e-05, + "loss": 1.8601, + "step": 28350 + }, + { + "epoch": 0.17824917015300612, + "grad_norm": 6.383946895599365, + "learning_rate": 1.8816123651542714e-05, + "loss": 1.9067, + "step": 28360 + }, + { + "epoch": 0.1783120224697032, + "grad_norm": 8.048141479492188, + "learning_rate": 1.881570455059806e-05, + "loss": 1.9985, + "step": 28370 + }, + { + "epoch": 0.17837487478640032, + "grad_norm": 8.132674217224121, + "learning_rate": 1.8815285449653405e-05, + "loss": 1.9002, + "step": 28380 + }, + { + "epoch": 0.17843772710309744, + "grad_norm": 7.169011116027832, + "learning_rate": 1.8814866348708752e-05, + "loss": 2.0158, + "step": 28390 + }, + { + "epoch": 0.17850057941979455, + "grad_norm": 8.223552703857422, + "learning_rate": 1.88144472477641e-05, + "loss": 2.1555, + "step": 28400 + }, + { + "epoch": 0.17856343173649167, + "grad_norm": 8.827797889709473, + "learning_rate": 1.8814028146819446e-05, + "loss": 2.0891, + "step": 28410 + }, + { + "epoch": 0.17862628405318878, + "grad_norm": 7.081093788146973, + "learning_rate": 1.881360904587479e-05, + "loss": 1.8826, + "step": 28420 + }, + { + "epoch": 0.17868913636988587, + "grad_norm": 6.889060020446777, + "learning_rate": 1.8813189944930137e-05, + "loss": 2.0148, + "step": 28430 + }, + { + "epoch": 0.17875198868658299, + "grad_norm": 7.420534610748291, + "learning_rate": 1.8812770843985484e-05, + "loss": 2.0423, + "step": 28440 + }, + { + "epoch": 0.1788148410032801, + "grad_norm": 8.297712326049805, + "learning_rate": 1.881235174304083e-05, + "loss": 1.9754, + "step": 28450 + }, + { + "epoch": 0.17887769331997722, + "grad_norm": 7.373516082763672, + "learning_rate": 1.8811932642096178e-05, + "loss": 
1.8087, + "step": 28460 + }, + { + "epoch": 0.17894054563667433, + "grad_norm": 8.368450164794922, + "learning_rate": 1.881151354115152e-05, + "loss": 2.0844, + "step": 28470 + }, + { + "epoch": 0.17900339795337145, + "grad_norm": 7.975512504577637, + "learning_rate": 1.881109444020687e-05, + "loss": 1.9639, + "step": 28480 + }, + { + "epoch": 0.17906625027006856, + "grad_norm": 8.503561019897461, + "learning_rate": 1.8810675339262216e-05, + "loss": 2.1627, + "step": 28490 + }, + { + "epoch": 0.17912910258676565, + "grad_norm": 6.963150978088379, + "learning_rate": 1.8810256238317563e-05, + "loss": 1.9203, + "step": 28500 + }, + { + "epoch": 0.17919195490346276, + "grad_norm": 7.963650703430176, + "learning_rate": 1.880983713737291e-05, + "loss": 2.1543, + "step": 28510 + }, + { + "epoch": 0.17925480722015988, + "grad_norm": 7.235218048095703, + "learning_rate": 1.8809418036428254e-05, + "loss": 1.9921, + "step": 28520 + }, + { + "epoch": 0.179317659536857, + "grad_norm": 7.330077648162842, + "learning_rate": 1.88089989354836e-05, + "loss": 2.1142, + "step": 28530 + }, + { + "epoch": 0.1793805118535541, + "grad_norm": 7.96385383605957, + "learning_rate": 1.8808579834538948e-05, + "loss": 1.9681, + "step": 28540 + }, + { + "epoch": 0.17944336417025122, + "grad_norm": 7.287219524383545, + "learning_rate": 1.8808160733594295e-05, + "loss": 2.0364, + "step": 28550 + }, + { + "epoch": 0.1795062164869483, + "grad_norm": 7.122207164764404, + "learning_rate": 1.8807741632649642e-05, + "loss": 1.966, + "step": 28560 + }, + { + "epoch": 0.17956906880364543, + "grad_norm": 7.921074867248535, + "learning_rate": 1.880732253170499e-05, + "loss": 1.8209, + "step": 28570 + }, + { + "epoch": 0.17963192112034254, + "grad_norm": 7.608434677124023, + "learning_rate": 1.8806903430760336e-05, + "loss": 1.9363, + "step": 28580 + }, + { + "epoch": 0.17969477343703966, + "grad_norm": 11.472769737243652, + "learning_rate": 1.8806484329815683e-05, + "loss": 1.9104, + "step": 28590 + }, + { + 
"epoch": 0.17975762575373677, + "grad_norm": 7.228365421295166, + "learning_rate": 1.8806065228871027e-05, + "loss": 2.1976, + "step": 28600 + }, + { + "epoch": 0.1798204780704339, + "grad_norm": 8.290234565734863, + "learning_rate": 1.8805646127926374e-05, + "loss": 2.2106, + "step": 28610 + }, + { + "epoch": 0.179883330387131, + "grad_norm": 6.307222843170166, + "learning_rate": 1.880522702698172e-05, + "loss": 1.8733, + "step": 28620 + }, + { + "epoch": 0.1799461827038281, + "grad_norm": 7.063374042510986, + "learning_rate": 1.8804807926037068e-05, + "loss": 1.9629, + "step": 28630 + }, + { + "epoch": 0.1800090350205252, + "grad_norm": 8.154483795166016, + "learning_rate": 1.880438882509241e-05, + "loss": 2.1459, + "step": 28640 + }, + { + "epoch": 0.18007188733722232, + "grad_norm": 6.897121906280518, + "learning_rate": 1.880396972414776e-05, + "loss": 1.8751, + "step": 28650 + }, + { + "epoch": 0.18013473965391943, + "grad_norm": 6.4750142097473145, + "learning_rate": 1.8803550623203106e-05, + "loss": 1.8537, + "step": 28660 + }, + { + "epoch": 0.18019759197061655, + "grad_norm": 7.589195728302002, + "learning_rate": 1.8803131522258453e-05, + "loss": 1.8881, + "step": 28670 + }, + { + "epoch": 0.18026044428731366, + "grad_norm": 7.621940612792969, + "learning_rate": 1.88027124213138e-05, + "loss": 2.2198, + "step": 28680 + }, + { + "epoch": 0.18032329660401075, + "grad_norm": 7.115721225738525, + "learning_rate": 1.8802293320369144e-05, + "loss": 1.9749, + "step": 28690 + }, + { + "epoch": 0.18038614892070787, + "grad_norm": 7.395238876342773, + "learning_rate": 1.880187421942449e-05, + "loss": 2.0467, + "step": 28700 + }, + { + "epoch": 0.18044900123740498, + "grad_norm": 6.3072285652160645, + "learning_rate": 1.8801455118479838e-05, + "loss": 1.7886, + "step": 28710 + }, + { + "epoch": 0.1805118535541021, + "grad_norm": 6.7934184074401855, + "learning_rate": 1.8801036017535185e-05, + "loss": 2.2724, + "step": 28720 + }, + { + "epoch": 0.1805747058707992, + 
"grad_norm": 7.178077220916748, + "learning_rate": 1.8800616916590532e-05, + "loss": 2.0401, + "step": 28730 + }, + { + "epoch": 0.18063755818749633, + "grad_norm": 7.876676082611084, + "learning_rate": 1.880019781564588e-05, + "loss": 1.8102, + "step": 28740 + }, + { + "epoch": 0.18070041050419341, + "grad_norm": 7.284195899963379, + "learning_rate": 1.8799778714701226e-05, + "loss": 1.9114, + "step": 28750 + }, + { + "epoch": 0.18076326282089053, + "grad_norm": 6.8634257316589355, + "learning_rate": 1.8799359613756573e-05, + "loss": 2.0179, + "step": 28760 + }, + { + "epoch": 0.18082611513758765, + "grad_norm": 7.710941314697266, + "learning_rate": 1.8798940512811917e-05, + "loss": 2.0999, + "step": 28770 + }, + { + "epoch": 0.18088896745428476, + "grad_norm": 7.6825852394104, + "learning_rate": 1.8798521411867264e-05, + "loss": 1.9196, + "step": 28780 + }, + { + "epoch": 0.18095181977098188, + "grad_norm": 7.826768398284912, + "learning_rate": 1.879810231092261e-05, + "loss": 2.1616, + "step": 28790 + }, + { + "epoch": 0.181014672087679, + "grad_norm": 6.824447154998779, + "learning_rate": 1.8797683209977958e-05, + "loss": 1.9437, + "step": 28800 + }, + { + "epoch": 0.1810775244043761, + "grad_norm": 5.521284103393555, + "learning_rate": 1.8797264109033305e-05, + "loss": 1.8503, + "step": 28810 + }, + { + "epoch": 0.1811403767210732, + "grad_norm": 7.975641250610352, + "learning_rate": 1.879684500808865e-05, + "loss": 1.9168, + "step": 28820 + }, + { + "epoch": 0.1812032290377703, + "grad_norm": 7.012667179107666, + "learning_rate": 1.8796425907143996e-05, + "loss": 1.9218, + "step": 28830 + }, + { + "epoch": 0.18126608135446742, + "grad_norm": 8.359773635864258, + "learning_rate": 1.8796006806199343e-05, + "loss": 2.1473, + "step": 28840 + }, + { + "epoch": 0.18132893367116454, + "grad_norm": 7.304141044616699, + "learning_rate": 1.879558770525469e-05, + "loss": 2.2035, + "step": 28850 + }, + { + "epoch": 0.18139178598786165, + "grad_norm": 6.638857841491699, + 
"learning_rate": 1.8795168604310037e-05, + "loss": 1.9289, + "step": 28860 + }, + { + "epoch": 0.18145463830455877, + "grad_norm": 7.668281555175781, + "learning_rate": 1.879474950336538e-05, + "loss": 2.037, + "step": 28870 + }, + { + "epoch": 0.18151749062125586, + "grad_norm": 6.877059459686279, + "learning_rate": 1.8794330402420728e-05, + "loss": 1.8873, + "step": 28880 + }, + { + "epoch": 0.18158034293795297, + "grad_norm": 6.669254302978516, + "learning_rate": 1.8793911301476075e-05, + "loss": 1.9665, + "step": 28890 + }, + { + "epoch": 0.18164319525465009, + "grad_norm": 7.703857898712158, + "learning_rate": 1.8793492200531422e-05, + "loss": 1.8277, + "step": 28900 + }, + { + "epoch": 0.1817060475713472, + "grad_norm": 7.366462707519531, + "learning_rate": 1.8793073099586766e-05, + "loss": 1.9984, + "step": 28910 + }, + { + "epoch": 0.18176889988804432, + "grad_norm": 7.7693867683410645, + "learning_rate": 1.8792653998642113e-05, + "loss": 2.0154, + "step": 28920 + }, + { + "epoch": 0.18183175220474143, + "grad_norm": 7.505848407745361, + "learning_rate": 1.879223489769746e-05, + "loss": 1.8839, + "step": 28930 + }, + { + "epoch": 0.18189460452143852, + "grad_norm": 7.814448356628418, + "learning_rate": 1.8791815796752807e-05, + "loss": 1.8928, + "step": 28940 + }, + { + "epoch": 0.18195745683813563, + "grad_norm": 7.27949333190918, + "learning_rate": 1.8791396695808154e-05, + "loss": 2.1309, + "step": 28950 + }, + { + "epoch": 0.18202030915483275, + "grad_norm": 6.9486165046691895, + "learning_rate": 1.87909775948635e-05, + "loss": 2.0047, + "step": 28960 + }, + { + "epoch": 0.18208316147152986, + "grad_norm": 7.7865447998046875, + "learning_rate": 1.8790558493918848e-05, + "loss": 2.0098, + "step": 28970 + }, + { + "epoch": 0.18214601378822698, + "grad_norm": 7.254465103149414, + "learning_rate": 1.8790139392974195e-05, + "loss": 2.1038, + "step": 28980 + }, + { + "epoch": 0.1822088661049241, + "grad_norm": 7.637696266174316, + "learning_rate": 
1.8789720292029542e-05, + "loss": 2.1296, + "step": 28990 + }, + { + "epoch": 0.1822717184216212, + "grad_norm": 8.04129409790039, + "learning_rate": 1.8789301191084886e-05, + "loss": 2.2248, + "step": 29000 + }, + { + "epoch": 0.1823345707383183, + "grad_norm": 7.1449408531188965, + "learning_rate": 1.8788882090140233e-05, + "loss": 1.9733, + "step": 29010 + }, + { + "epoch": 0.1823974230550154, + "grad_norm": 7.499587059020996, + "learning_rate": 1.878846298919558e-05, + "loss": 2.1853, + "step": 29020 + }, + { + "epoch": 0.18246027537171253, + "grad_norm": 9.368136405944824, + "learning_rate": 1.8788043888250927e-05, + "loss": 2.1305, + "step": 29030 + }, + { + "epoch": 0.18252312768840964, + "grad_norm": 7.33623743057251, + "learning_rate": 1.878762478730627e-05, + "loss": 1.9875, + "step": 29040 + }, + { + "epoch": 0.18258598000510676, + "grad_norm": 8.26385498046875, + "learning_rate": 1.8787205686361618e-05, + "loss": 2.1754, + "step": 29050 + }, + { + "epoch": 0.18264883232180387, + "grad_norm": 6.485950946807861, + "learning_rate": 1.8786786585416965e-05, + "loss": 2.0015, + "step": 29060 + }, + { + "epoch": 0.18271168463850096, + "grad_norm": 6.6667962074279785, + "learning_rate": 1.8786367484472312e-05, + "loss": 1.9353, + "step": 29070 + }, + { + "epoch": 0.18277453695519807, + "grad_norm": 6.05470609664917, + "learning_rate": 1.878594838352766e-05, + "loss": 1.7624, + "step": 29080 + }, + { + "epoch": 0.1828373892718952, + "grad_norm": 7.437668800354004, + "learning_rate": 1.8785529282583003e-05, + "loss": 2.1268, + "step": 29090 + }, + { + "epoch": 0.1829002415885923, + "grad_norm": 7.7610297203063965, + "learning_rate": 1.878511018163835e-05, + "loss": 1.9864, + "step": 29100 + }, + { + "epoch": 0.18296309390528942, + "grad_norm": 6.294562339782715, + "learning_rate": 1.8784691080693697e-05, + "loss": 2.1718, + "step": 29110 + }, + { + "epoch": 0.18302594622198654, + "grad_norm": 7.2774553298950195, + "learning_rate": 1.8784271979749044e-05, + 
"loss": 2.0012, + "step": 29120 + }, + { + "epoch": 0.18308879853868365, + "grad_norm": 6.683416366577148, + "learning_rate": 1.878385287880439e-05, + "loss": 1.9754, + "step": 29130 + }, + { + "epoch": 0.18315165085538074, + "grad_norm": 6.475184917449951, + "learning_rate": 1.8783433777859738e-05, + "loss": 1.9697, + "step": 29140 + }, + { + "epoch": 0.18321450317207785, + "grad_norm": 6.563970565795898, + "learning_rate": 1.8783014676915082e-05, + "loss": 2.2486, + "step": 29150 + }, + { + "epoch": 0.18327735548877497, + "grad_norm": 8.39940357208252, + "learning_rate": 1.878259557597043e-05, + "loss": 1.8987, + "step": 29160 + }, + { + "epoch": 0.18334020780547208, + "grad_norm": 7.151874542236328, + "learning_rate": 1.8782176475025776e-05, + "loss": 1.8675, + "step": 29170 + }, + { + "epoch": 0.1834030601221692, + "grad_norm": 6.557036399841309, + "learning_rate": 1.8781757374081123e-05, + "loss": 2.0887, + "step": 29180 + }, + { + "epoch": 0.1834659124388663, + "grad_norm": 7.522086143493652, + "learning_rate": 1.878133827313647e-05, + "loss": 1.9528, + "step": 29190 + }, + { + "epoch": 0.1835287647555634, + "grad_norm": 6.171652793884277, + "learning_rate": 1.8780919172191817e-05, + "loss": 2.0084, + "step": 29200 + }, + { + "epoch": 0.18359161707226052, + "grad_norm": 6.9197096824646, + "learning_rate": 1.8780500071247164e-05, + "loss": 1.9259, + "step": 29210 + }, + { + "epoch": 0.18365446938895763, + "grad_norm": 7.602723598480225, + "learning_rate": 1.8780080970302508e-05, + "loss": 2.2391, + "step": 29220 + }, + { + "epoch": 0.18371732170565475, + "grad_norm": 6.543840408325195, + "learning_rate": 1.8779661869357855e-05, + "loss": 1.9864, + "step": 29230 + }, + { + "epoch": 0.18378017402235186, + "grad_norm": 6.674700736999512, + "learning_rate": 1.8779242768413202e-05, + "loss": 2.1489, + "step": 29240 + }, + { + "epoch": 0.18384302633904898, + "grad_norm": 6.689058780670166, + "learning_rate": 1.877882366746855e-05, + "loss": 1.9458, + "step": 29250 + 
}, + { + "epoch": 0.18390587865574606, + "grad_norm": 7.193964004516602, + "learning_rate": 1.8778404566523893e-05, + "loss": 1.8953, + "step": 29260 + }, + { + "epoch": 0.18396873097244318, + "grad_norm": 7.274568557739258, + "learning_rate": 1.877798546557924e-05, + "loss": 1.9315, + "step": 29270 + }, + { + "epoch": 0.1840315832891403, + "grad_norm": 7.066225528717041, + "learning_rate": 1.8777566364634587e-05, + "loss": 1.8457, + "step": 29280 + }, + { + "epoch": 0.1840944356058374, + "grad_norm": 8.32391357421875, + "learning_rate": 1.8777147263689934e-05, + "loss": 1.8756, + "step": 29290 + }, + { + "epoch": 0.18415728792253452, + "grad_norm": 7.687808513641357, + "learning_rate": 1.877672816274528e-05, + "loss": 2.0536, + "step": 29300 + }, + { + "epoch": 0.18422014023923164, + "grad_norm": 8.26240062713623, + "learning_rate": 1.8776309061800625e-05, + "loss": 1.9104, + "step": 29310 + }, + { + "epoch": 0.18428299255592875, + "grad_norm": 8.49111557006836, + "learning_rate": 1.8775889960855972e-05, + "loss": 1.8923, + "step": 29320 + }, + { + "epoch": 0.18434584487262584, + "grad_norm": 7.489538192749023, + "learning_rate": 1.877547085991132e-05, + "loss": 2.1269, + "step": 29330 + }, + { + "epoch": 0.18440869718932296, + "grad_norm": 6.954035758972168, + "learning_rate": 1.8775051758966666e-05, + "loss": 2.0588, + "step": 29340 + }, + { + "epoch": 0.18447154950602007, + "grad_norm": 7.260134696960449, + "learning_rate": 1.8774632658022013e-05, + "loss": 1.9555, + "step": 29350 + }, + { + "epoch": 0.1845344018227172, + "grad_norm": 7.261524200439453, + "learning_rate": 1.877421355707736e-05, + "loss": 2.1734, + "step": 29360 + }, + { + "epoch": 0.1845972541394143, + "grad_norm": 6.00492525100708, + "learning_rate": 1.8773794456132707e-05, + "loss": 2.1805, + "step": 29370 + }, + { + "epoch": 0.18466010645611142, + "grad_norm": 6.522037029266357, + "learning_rate": 1.8773375355188054e-05, + "loss": 1.8158, + "step": 29380 + }, + { + "epoch": 
0.1847229587728085, + "grad_norm": 7.0802412033081055, + "learning_rate": 1.87729562542434e-05, + "loss": 1.8112, + "step": 29390 + }, + { + "epoch": 0.18478581108950562, + "grad_norm": 7.415716648101807, + "learning_rate": 1.8772537153298745e-05, + "loss": 1.7868, + "step": 29400 + }, + { + "epoch": 0.18484866340620273, + "grad_norm": 5.86637020111084, + "learning_rate": 1.8772118052354092e-05, + "loss": 1.8952, + "step": 29410 + }, + { + "epoch": 0.18491151572289985, + "grad_norm": 7.00277042388916, + "learning_rate": 1.877169895140944e-05, + "loss": 2.0638, + "step": 29420 + }, + { + "epoch": 0.18497436803959696, + "grad_norm": 7.230973720550537, + "learning_rate": 1.8771279850464786e-05, + "loss": 1.8949, + "step": 29430 + }, + { + "epoch": 0.18503722035629408, + "grad_norm": 6.861478328704834, + "learning_rate": 1.8770902659614598e-05, + "loss": 1.9096, + "step": 29440 + }, + { + "epoch": 0.18510007267299117, + "grad_norm": 6.255519390106201, + "learning_rate": 1.8770483558669945e-05, + "loss": 2.0701, + "step": 29450 + }, + { + "epoch": 0.18516292498968828, + "grad_norm": 7.750514030456543, + "learning_rate": 1.8770064457725288e-05, + "loss": 2.1037, + "step": 29460 + }, + { + "epoch": 0.1852257773063854, + "grad_norm": 8.064445495605469, + "learning_rate": 1.8769645356780635e-05, + "loss": 1.9983, + "step": 29470 + }, + { + "epoch": 0.1852886296230825, + "grad_norm": 6.737326622009277, + "learning_rate": 1.8769226255835982e-05, + "loss": 1.9377, + "step": 29480 + }, + { + "epoch": 0.18535148193977963, + "grad_norm": 7.76080322265625, + "learning_rate": 1.876880715489133e-05, + "loss": 1.7995, + "step": 29490 + }, + { + "epoch": 0.18541433425647674, + "grad_norm": 7.405367851257324, + "learning_rate": 1.8768388053946677e-05, + "loss": 1.8267, + "step": 29500 + }, + { + "epoch": 0.18547718657317386, + "grad_norm": 6.397325038909912, + "learning_rate": 1.8767968953002024e-05, + "loss": 2.0647, + "step": 29510 + }, + { + "epoch": 0.18554003888987095, + 
"grad_norm": 8.42994499206543, + "learning_rate": 1.8767549852057367e-05, + "loss": 2.3467, + "step": 29520 + }, + { + "epoch": 0.18560289120656806, + "grad_norm": 7.365906715393066, + "learning_rate": 1.8767130751112714e-05, + "loss": 1.6478, + "step": 29530 + }, + { + "epoch": 0.18566574352326518, + "grad_norm": 6.7625298500061035, + "learning_rate": 1.876671165016806e-05, + "loss": 1.7483, + "step": 29540 + }, + { + "epoch": 0.1857285958399623, + "grad_norm": 6.54100227355957, + "learning_rate": 1.876629254922341e-05, + "loss": 1.8306, + "step": 29550 + }, + { + "epoch": 0.1857914481566594, + "grad_norm": 7.321144104003906, + "learning_rate": 1.8765873448278752e-05, + "loss": 1.8764, + "step": 29560 + }, + { + "epoch": 0.18585430047335652, + "grad_norm": 7.2385478019714355, + "learning_rate": 1.87654543473341e-05, + "loss": 1.7689, + "step": 29570 + }, + { + "epoch": 0.1859171527900536, + "grad_norm": 7.6455302238464355, + "learning_rate": 1.8765035246389446e-05, + "loss": 2.1768, + "step": 29580 + }, + { + "epoch": 0.18598000510675072, + "grad_norm": 6.5208635330200195, + "learning_rate": 1.8764616145444793e-05, + "loss": 1.8892, + "step": 29590 + }, + { + "epoch": 0.18604285742344784, + "grad_norm": 8.060873031616211, + "learning_rate": 1.876419704450014e-05, + "loss": 2.0558, + "step": 29600 + }, + { + "epoch": 0.18610570974014495, + "grad_norm": 7.863399028778076, + "learning_rate": 1.8763777943555484e-05, + "loss": 1.8187, + "step": 29610 + }, + { + "epoch": 0.18616856205684207, + "grad_norm": 6.913415431976318, + "learning_rate": 1.876335884261083e-05, + "loss": 1.8612, + "step": 29620 + }, + { + "epoch": 0.18623141437353918, + "grad_norm": 7.467434883117676, + "learning_rate": 1.8762939741666178e-05, + "loss": 2.0697, + "step": 29630 + }, + { + "epoch": 0.1862942666902363, + "grad_norm": 7.388409614562988, + "learning_rate": 1.8762520640721525e-05, + "loss": 2.0388, + "step": 29640 + }, + { + "epoch": 0.18635711900693339, + "grad_norm": 6.596724987030029, 
+ "learning_rate": 1.8762101539776872e-05, + "loss": 1.8998, + "step": 29650 + }, + { + "epoch": 0.1864199713236305, + "grad_norm": 7.31245756149292, + "learning_rate": 1.876168243883222e-05, + "loss": 2.0896, + "step": 29660 + }, + { + "epoch": 0.18648282364032762, + "grad_norm": 7.265267848968506, + "learning_rate": 1.8761263337887567e-05, + "loss": 1.9591, + "step": 29670 + }, + { + "epoch": 0.18654567595702473, + "grad_norm": 7.891168594360352, + "learning_rate": 1.8760844236942914e-05, + "loss": 1.8877, + "step": 29680 + }, + { + "epoch": 0.18660852827372185, + "grad_norm": 6.974050998687744, + "learning_rate": 1.8760425135998257e-05, + "loss": 2.061, + "step": 29690 + }, + { + "epoch": 0.18667138059041896, + "grad_norm": 6.938966751098633, + "learning_rate": 1.8760006035053604e-05, + "loss": 1.9556, + "step": 29700 + }, + { + "epoch": 0.18673423290711605, + "grad_norm": 6.422309398651123, + "learning_rate": 1.875958693410895e-05, + "loss": 2.0012, + "step": 29710 + }, + { + "epoch": 0.18679708522381316, + "grad_norm": 8.46394157409668, + "learning_rate": 1.87591678331643e-05, + "loss": 1.9021, + "step": 29720 + }, + { + "epoch": 0.18685993754051028, + "grad_norm": 8.600244522094727, + "learning_rate": 1.8758748732219646e-05, + "loss": 2.0819, + "step": 29730 + }, + { + "epoch": 0.1869227898572074, + "grad_norm": 7.660400390625, + "learning_rate": 1.875832963127499e-05, + "loss": 1.9857, + "step": 29740 + }, + { + "epoch": 0.1869856421739045, + "grad_norm": 6.847285270690918, + "learning_rate": 1.8757910530330336e-05, + "loss": 2.1663, + "step": 29750 + }, + { + "epoch": 0.18704849449060162, + "grad_norm": 7.508523941040039, + "learning_rate": 1.8757491429385683e-05, + "loss": 2.0756, + "step": 29760 + }, + { + "epoch": 0.1871113468072987, + "grad_norm": 7.233463287353516, + "learning_rate": 1.875707232844103e-05, + "loss": 1.9206, + "step": 29770 + }, + { + "epoch": 0.18717419912399583, + "grad_norm": 7.623024940490723, + "learning_rate": 
1.8756653227496374e-05, + "loss": 1.8882, + "step": 29780 + }, + { + "epoch": 0.18723705144069294, + "grad_norm": 8.130066871643066, + "learning_rate": 1.875623412655172e-05, + "loss": 2.2309, + "step": 29790 + }, + { + "epoch": 0.18729990375739006, + "grad_norm": 6.728233337402344, + "learning_rate": 1.875581502560707e-05, + "loss": 2.1045, + "step": 29800 + }, + { + "epoch": 0.18736275607408717, + "grad_norm": 7.936640739440918, + "learning_rate": 1.8755395924662415e-05, + "loss": 1.9853, + "step": 29810 + }, + { + "epoch": 0.1874256083907843, + "grad_norm": 5.2671098709106445, + "learning_rate": 1.8754976823717762e-05, + "loss": 2.1318, + "step": 29820 + }, + { + "epoch": 0.1874884607074814, + "grad_norm": 7.63880729675293, + "learning_rate": 1.875455772277311e-05, + "loss": 1.9968, + "step": 29830 + }, + { + "epoch": 0.1875513130241785, + "grad_norm": 8.06202507019043, + "learning_rate": 1.8754138621828453e-05, + "loss": 2.0014, + "step": 29840 + }, + { + "epoch": 0.1876141653408756, + "grad_norm": 6.8607611656188965, + "learning_rate": 1.87537195208838e-05, + "loss": 1.8825, + "step": 29850 + }, + { + "epoch": 0.18767701765757272, + "grad_norm": 7.449906349182129, + "learning_rate": 1.8753300419939147e-05, + "loss": 2.1468, + "step": 29860 + }, + { + "epoch": 0.18773986997426984, + "grad_norm": 8.055960655212402, + "learning_rate": 1.8752881318994494e-05, + "loss": 2.0776, + "step": 29870 + }, + { + "epoch": 0.18780272229096695, + "grad_norm": 8.381570816040039, + "learning_rate": 1.875246221804984e-05, + "loss": 2.2393, + "step": 29880 + }, + { + "epoch": 0.18786557460766407, + "grad_norm": 8.259137153625488, + "learning_rate": 1.875204311710519e-05, + "loss": 2.2421, + "step": 29890 + }, + { + "epoch": 0.18792842692436115, + "grad_norm": 5.870490550994873, + "learning_rate": 1.8751624016160536e-05, + "loss": 1.7767, + "step": 29900 + }, + { + "epoch": 0.18799127924105827, + "grad_norm": 7.300480842590332, + "learning_rate": 1.8751204915215883e-05, + "loss": 
2.0607, + "step": 29910 + }, + { + "epoch": 0.18805413155775538, + "grad_norm": 6.586701393127441, + "learning_rate": 1.8750785814271226e-05, + "loss": 2.0254, + "step": 29920 + }, + { + "epoch": 0.1881169838744525, + "grad_norm": 6.941963195800781, + "learning_rate": 1.8750366713326573e-05, + "loss": 2.0021, + "step": 29930 + }, + { + "epoch": 0.1881798361911496, + "grad_norm": 7.193849086761475, + "learning_rate": 1.874994761238192e-05, + "loss": 2.2742, + "step": 29940 + }, + { + "epoch": 0.18824268850784673, + "grad_norm": 7.205645561218262, + "learning_rate": 1.8749528511437268e-05, + "loss": 2.007, + "step": 29950 + }, + { + "epoch": 0.18830554082454384, + "grad_norm": 6.672652721405029, + "learning_rate": 1.874910941049261e-05, + "loss": 1.967, + "step": 29960 + }, + { + "epoch": 0.18836839314124093, + "grad_norm": 8.637862205505371, + "learning_rate": 1.874869030954796e-05, + "loss": 2.0933, + "step": 29970 + }, + { + "epoch": 0.18843124545793805, + "grad_norm": 6.464426517486572, + "learning_rate": 1.8748271208603305e-05, + "loss": 1.8952, + "step": 29980 + }, + { + "epoch": 0.18849409777463516, + "grad_norm": 7.936492919921875, + "learning_rate": 1.8747852107658653e-05, + "loss": 1.8801, + "step": 29990 + }, + { + "epoch": 0.18855695009133228, + "grad_norm": 7.535575866699219, + "learning_rate": 1.8747433006713996e-05, + "loss": 1.9741, + "step": 30000 + }, + { + "epoch": 0.1886198024080294, + "grad_norm": 7.745148181915283, + "learning_rate": 1.8747013905769343e-05, + "loss": 2.1146, + "step": 30010 + }, + { + "epoch": 0.1886826547247265, + "grad_norm": 6.761495113372803, + "learning_rate": 1.874659480482469e-05, + "loss": 2.0053, + "step": 30020 + }, + { + "epoch": 0.1887455070414236, + "grad_norm": 7.184195518493652, + "learning_rate": 1.8746175703880037e-05, + "loss": 1.9576, + "step": 30030 + }, + { + "epoch": 0.1888083593581207, + "grad_norm": 6.510901927947998, + "learning_rate": 1.8745756602935384e-05, + "loss": 2.0378, + "step": 30040 + }, + { + 
"epoch": 0.18887121167481782, + "grad_norm": 6.986710548400879, + "learning_rate": 1.874533750199073e-05, + "loss": 1.7362, + "step": 30050 + }, + { + "epoch": 0.18893406399151494, + "grad_norm": 6.512456893920898, + "learning_rate": 1.874491840104608e-05, + "loss": 1.8793, + "step": 30060 + }, + { + "epoch": 0.18899691630821205, + "grad_norm": 7.3014702796936035, + "learning_rate": 1.8744499300101426e-05, + "loss": 2.1635, + "step": 30070 + }, + { + "epoch": 0.18905976862490917, + "grad_norm": 6.359899997711182, + "learning_rate": 1.8744080199156773e-05, + "loss": 2.0233, + "step": 30080 + }, + { + "epoch": 0.18912262094160626, + "grad_norm": 7.0193915367126465, + "learning_rate": 1.8743661098212116e-05, + "loss": 1.965, + "step": 30090 + }, + { + "epoch": 0.18918547325830337, + "grad_norm": 7.0321455001831055, + "learning_rate": 1.8743241997267464e-05, + "loss": 2.0563, + "step": 30100 + }, + { + "epoch": 0.1892483255750005, + "grad_norm": 7.861650466918945, + "learning_rate": 1.874282289632281e-05, + "loss": 2.0758, + "step": 30110 + }, + { + "epoch": 0.1893111778916976, + "grad_norm": 6.646553039550781, + "learning_rate": 1.8742403795378158e-05, + "loss": 1.9899, + "step": 30120 + }, + { + "epoch": 0.18937403020839472, + "grad_norm": 7.474400043487549, + "learning_rate": 1.8741984694433505e-05, + "loss": 2.0311, + "step": 30130 + }, + { + "epoch": 0.18943688252509183, + "grad_norm": 7.869038105010986, + "learning_rate": 1.874156559348885e-05, + "loss": 2.3259, + "step": 30140 + }, + { + "epoch": 0.18949973484178895, + "grad_norm": 7.051912784576416, + "learning_rate": 1.8741146492544195e-05, + "loss": 1.9394, + "step": 30150 + }, + { + "epoch": 0.18956258715848603, + "grad_norm": 7.221027851104736, + "learning_rate": 1.8740727391599543e-05, + "loss": 2.0958, + "step": 30160 + }, + { + "epoch": 0.18962543947518315, + "grad_norm": 7.719078540802002, + "learning_rate": 1.874030829065489e-05, + "loss": 1.7209, + "step": 30170 + }, + { + "epoch": 
0.18968829179188026, + "grad_norm": 7.397263526916504, + "learning_rate": 1.8739889189710233e-05, + "loss": 1.7915, + "step": 30180 + }, + { + "epoch": 0.18975114410857738, + "grad_norm": 7.989697456359863, + "learning_rate": 1.873947008876558e-05, + "loss": 1.9291, + "step": 30190 + }, + { + "epoch": 0.1898139964252745, + "grad_norm": 8.301801681518555, + "learning_rate": 1.8739050987820927e-05, + "loss": 2.3006, + "step": 30200 + }, + { + "epoch": 0.1898768487419716, + "grad_norm": 6.9124040603637695, + "learning_rate": 1.8738631886876275e-05, + "loss": 1.7534, + "step": 30210 + }, + { + "epoch": 0.1899397010586687, + "grad_norm": 8.110513687133789, + "learning_rate": 1.873821278593162e-05, + "loss": 2.0729, + "step": 30220 + }, + { + "epoch": 0.1900025533753658, + "grad_norm": 7.219425678253174, + "learning_rate": 1.8737793684986965e-05, + "loss": 1.7849, + "step": 30230 + }, + { + "epoch": 0.19006540569206293, + "grad_norm": 8.310271263122559, + "learning_rate": 1.8737374584042312e-05, + "loss": 1.9025, + "step": 30240 + }, + { + "epoch": 0.19012825800876004, + "grad_norm": 7.594659328460693, + "learning_rate": 1.873695548309766e-05, + "loss": 2.0431, + "step": 30250 + }, + { + "epoch": 0.19019111032545716, + "grad_norm": 7.479565143585205, + "learning_rate": 1.8736536382153006e-05, + "loss": 2.0942, + "step": 30260 + }, + { + "epoch": 0.19025396264215427, + "grad_norm": 7.137730121612549, + "learning_rate": 1.8736117281208354e-05, + "loss": 1.7541, + "step": 30270 + }, + { + "epoch": 0.19031681495885136, + "grad_norm": 8.164888381958008, + "learning_rate": 1.87356981802637e-05, + "loss": 2.0119, + "step": 30280 + }, + { + "epoch": 0.19037966727554848, + "grad_norm": 7.230342388153076, + "learning_rate": 1.8735279079319048e-05, + "loss": 2.0207, + "step": 30290 + }, + { + "epoch": 0.1904425195922456, + "grad_norm": 7.518143653869629, + "learning_rate": 1.8734859978374395e-05, + "loss": 2.2041, + "step": 30300 + }, + { + "epoch": 0.1905053719089427, + 
"grad_norm": 7.932175636291504, + "learning_rate": 1.873444087742974e-05, + "loss": 2.164, + "step": 30310 + }, + { + "epoch": 0.19056822422563982, + "grad_norm": 6.18129825592041, + "learning_rate": 1.8734021776485086e-05, + "loss": 1.8517, + "step": 30320 + }, + { + "epoch": 0.19063107654233694, + "grad_norm": 7.166076183319092, + "learning_rate": 1.8733602675540433e-05, + "loss": 2.0263, + "step": 30330 + }, + { + "epoch": 0.19069392885903405, + "grad_norm": 8.349908828735352, + "learning_rate": 1.873318357459578e-05, + "loss": 2.2155, + "step": 30340 + }, + { + "epoch": 0.19075678117573114, + "grad_norm": 6.5167059898376465, + "learning_rate": 1.8732764473651127e-05, + "loss": 1.8075, + "step": 30350 + }, + { + "epoch": 0.19081963349242825, + "grad_norm": 7.859280586242676, + "learning_rate": 1.873234537270647e-05, + "loss": 1.9008, + "step": 30360 + }, + { + "epoch": 0.19088248580912537, + "grad_norm": 6.828162670135498, + "learning_rate": 1.8731926271761817e-05, + "loss": 1.8431, + "step": 30370 + }, + { + "epoch": 0.19094533812582248, + "grad_norm": 6.228019714355469, + "learning_rate": 1.8731507170817165e-05, + "loss": 1.849, + "step": 30380 + }, + { + "epoch": 0.1910081904425196, + "grad_norm": 6.685621738433838, + "learning_rate": 1.873108806987251e-05, + "loss": 1.9236, + "step": 30390 + }, + { + "epoch": 0.1910710427592167, + "grad_norm": 6.393117427825928, + "learning_rate": 1.8730668968927855e-05, + "loss": 1.905, + "step": 30400 + }, + { + "epoch": 0.1911338950759138, + "grad_norm": 6.564292907714844, + "learning_rate": 1.8730249867983202e-05, + "loss": 2.0825, + "step": 30410 + }, + { + "epoch": 0.19119674739261092, + "grad_norm": 7.268558502197266, + "learning_rate": 1.872983076703855e-05, + "loss": 1.9467, + "step": 30420 + }, + { + "epoch": 0.19125959970930803, + "grad_norm": 7.2220892906188965, + "learning_rate": 1.8729411666093897e-05, + "loss": 1.932, + "step": 30430 + }, + { + "epoch": 0.19132245202600515, + "grad_norm": 6.725642681121826, + 
"learning_rate": 1.8728992565149244e-05, + "loss": 2.0276, + "step": 30440 + }, + { + "epoch": 0.19138530434270226, + "grad_norm": 6.740306377410889, + "learning_rate": 1.872857346420459e-05, + "loss": 1.8869, + "step": 30450 + }, + { + "epoch": 0.19144815665939938, + "grad_norm": 7.151871681213379, + "learning_rate": 1.8728154363259938e-05, + "loss": 1.9544, + "step": 30460 + }, + { + "epoch": 0.1915110089760965, + "grad_norm": 6.8912811279296875, + "learning_rate": 1.872773526231528e-05, + "loss": 1.8118, + "step": 30470 + }, + { + "epoch": 0.19157386129279358, + "grad_norm": 6.042466163635254, + "learning_rate": 1.872731616137063e-05, + "loss": 1.938, + "step": 30480 + }, + { + "epoch": 0.1916367136094907, + "grad_norm": 6.601611137390137, + "learning_rate": 1.8726897060425976e-05, + "loss": 2.017, + "step": 30490 + }, + { + "epoch": 0.1916995659261878, + "grad_norm": 6.856364727020264, + "learning_rate": 1.8726477959481323e-05, + "loss": 1.7202, + "step": 30500 + }, + { + "epoch": 0.19176241824288492, + "grad_norm": 6.876039981842041, + "learning_rate": 1.872605885853667e-05, + "loss": 2.1173, + "step": 30510 + }, + { + "epoch": 0.19182527055958204, + "grad_norm": 6.808557987213135, + "learning_rate": 1.8725639757592017e-05, + "loss": 1.9516, + "step": 30520 + }, + { + "epoch": 0.19188812287627915, + "grad_norm": 7.098701477050781, + "learning_rate": 1.8725220656647364e-05, + "loss": 2.0891, + "step": 30530 + }, + { + "epoch": 0.19195097519297624, + "grad_norm": 7.035343170166016, + "learning_rate": 1.8724801555702708e-05, + "loss": 2.0032, + "step": 30540 + }, + { + "epoch": 0.19201382750967336, + "grad_norm": 7.604888439178467, + "learning_rate": 1.8724382454758055e-05, + "loss": 2.0704, + "step": 30550 + }, + { + "epoch": 0.19207667982637047, + "grad_norm": 7.634496688842773, + "learning_rate": 1.87239633538134e-05, + "loss": 2.1229, + "step": 30560 + }, + { + "epoch": 0.1921395321430676, + "grad_norm": 7.305230617523193, + "learning_rate": 
1.872354425286875e-05, + "loss": 1.7444, + "step": 30570 + }, + { + "epoch": 0.1922023844597647, + "grad_norm": 6.693951606750488, + "learning_rate": 1.8723125151924092e-05, + "loss": 1.7571, + "step": 30580 + }, + { + "epoch": 0.19226523677646182, + "grad_norm": 5.700252056121826, + "learning_rate": 1.872270605097944e-05, + "loss": 1.9711, + "step": 30590 + }, + { + "epoch": 0.1923280890931589, + "grad_norm": 7.504659652709961, + "learning_rate": 1.8722286950034787e-05, + "loss": 1.9572, + "step": 30600 + }, + { + "epoch": 0.19239094140985602, + "grad_norm": 7.629385471343994, + "learning_rate": 1.8721867849090134e-05, + "loss": 1.9407, + "step": 30610 + }, + { + "epoch": 0.19245379372655314, + "grad_norm": 7.406287670135498, + "learning_rate": 1.8721448748145477e-05, + "loss": 2.2253, + "step": 30620 + }, + { + "epoch": 0.19251664604325025, + "grad_norm": 8.016263961791992, + "learning_rate": 1.8721029647200824e-05, + "loss": 2.0097, + "step": 30630 + }, + { + "epoch": 0.19257949835994737, + "grad_norm": 7.708703994750977, + "learning_rate": 1.872061054625617e-05, + "loss": 1.9363, + "step": 30640 + }, + { + "epoch": 0.19264235067664448, + "grad_norm": 7.221123218536377, + "learning_rate": 1.872019144531152e-05, + "loss": 1.8846, + "step": 30650 + }, + { + "epoch": 0.1927052029933416, + "grad_norm": 7.941370010375977, + "learning_rate": 1.8719772344366866e-05, + "loss": 1.9737, + "step": 30660 + }, + { + "epoch": 0.19276805531003868, + "grad_norm": 8.912504196166992, + "learning_rate": 1.8719353243422213e-05, + "loss": 2.1479, + "step": 30670 + }, + { + "epoch": 0.1928309076267358, + "grad_norm": 8.278250694274902, + "learning_rate": 1.871893414247756e-05, + "loss": 1.6379, + "step": 30680 + }, + { + "epoch": 0.1928937599434329, + "grad_norm": 6.728115558624268, + "learning_rate": 1.8718515041532907e-05, + "loss": 1.8149, + "step": 30690 + }, + { + "epoch": 0.19295661226013003, + "grad_norm": 7.390202522277832, + "learning_rate": 1.8718095940588254e-05, + "loss": 
1.9503, + "step": 30700 + }, + { + "epoch": 0.19301946457682714, + "grad_norm": 7.695835113525391, + "learning_rate": 1.8717676839643598e-05, + "loss": 1.9788, + "step": 30710 + }, + { + "epoch": 0.19308231689352426, + "grad_norm": 8.717729568481445, + "learning_rate": 1.8717257738698945e-05, + "loss": 2.011, + "step": 30720 + }, + { + "epoch": 0.19314516921022135, + "grad_norm": 8.436739921569824, + "learning_rate": 1.8716838637754292e-05, + "loss": 2.0215, + "step": 30730 + }, + { + "epoch": 0.19320802152691846, + "grad_norm": 8.262494087219238, + "learning_rate": 1.871641953680964e-05, + "loss": 2.0514, + "step": 30740 + }, + { + "epoch": 0.19327087384361558, + "grad_norm": 8.385856628417969, + "learning_rate": 1.8716000435864986e-05, + "loss": 1.7285, + "step": 30750 + }, + { + "epoch": 0.1933337261603127, + "grad_norm": 7.661288738250732, + "learning_rate": 1.871558133492033e-05, + "loss": 1.9245, + "step": 30760 + }, + { + "epoch": 0.1933965784770098, + "grad_norm": 9.048144340515137, + "learning_rate": 1.8715162233975677e-05, + "loss": 2.4136, + "step": 30770 + }, + { + "epoch": 0.19345943079370692, + "grad_norm": 7.569735050201416, + "learning_rate": 1.8714743133031024e-05, + "loss": 2.2554, + "step": 30780 + }, + { + "epoch": 0.19352228311040404, + "grad_norm": 8.289266586303711, + "learning_rate": 1.871432403208637e-05, + "loss": 2.0044, + "step": 30790 + }, + { + "epoch": 0.19358513542710112, + "grad_norm": 7.837000846862793, + "learning_rate": 1.8713904931141714e-05, + "loss": 1.9743, + "step": 30800 + }, + { + "epoch": 0.19364798774379824, + "grad_norm": 6.99252986907959, + "learning_rate": 1.871348583019706e-05, + "loss": 1.9266, + "step": 30810 + }, + { + "epoch": 0.19371084006049535, + "grad_norm": 7.3852643966674805, + "learning_rate": 1.871306672925241e-05, + "loss": 2.1344, + "step": 30820 + }, + { + "epoch": 0.19377369237719247, + "grad_norm": 7.649234294891357, + "learning_rate": 1.8712647628307756e-05, + "loss": 2.159, + "step": 30830 + }, + { 
+ "epoch": 0.19383654469388958, + "grad_norm": 7.783021450042725, + "learning_rate": 1.8712228527363103e-05, + "loss": 2.0225, + "step": 30840 + }, + { + "epoch": 0.1938993970105867, + "grad_norm": 6.159892559051514, + "learning_rate": 1.8711809426418446e-05, + "loss": 1.733, + "step": 30850 + }, + { + "epoch": 0.1939622493272838, + "grad_norm": 7.179442882537842, + "learning_rate": 1.8711390325473793e-05, + "loss": 2.1753, + "step": 30860 + }, + { + "epoch": 0.1940251016439809, + "grad_norm": 8.023426055908203, + "learning_rate": 1.871097122452914e-05, + "loss": 2.1041, + "step": 30870 + }, + { + "epoch": 0.19408795396067802, + "grad_norm": 8.841620445251465, + "learning_rate": 1.8710552123584488e-05, + "loss": 1.9144, + "step": 30880 + }, + { + "epoch": 0.19415080627737513, + "grad_norm": 9.015290260314941, + "learning_rate": 1.8710133022639835e-05, + "loss": 1.9328, + "step": 30890 + }, + { + "epoch": 0.19421365859407225, + "grad_norm": 8.53672981262207, + "learning_rate": 1.8709713921695182e-05, + "loss": 1.9781, + "step": 30900 + }, + { + "epoch": 0.19427651091076936, + "grad_norm": 6.604968547821045, + "learning_rate": 1.870929482075053e-05, + "loss": 2.0874, + "step": 30910 + }, + { + "epoch": 0.19433936322746645, + "grad_norm": 7.135388374328613, + "learning_rate": 1.8708875719805876e-05, + "loss": 1.961, + "step": 30920 + }, + { + "epoch": 0.19440221554416356, + "grad_norm": 6.9580864906311035, + "learning_rate": 1.8708456618861223e-05, + "loss": 1.8935, + "step": 30930 + }, + { + "epoch": 0.19446506786086068, + "grad_norm": 6.016077041625977, + "learning_rate": 1.8708037517916567e-05, + "loss": 1.8778, + "step": 30940 + }, + { + "epoch": 0.1945279201775578, + "grad_norm": 7.97602653503418, + "learning_rate": 1.8707618416971914e-05, + "loss": 2.0686, + "step": 30950 + }, + { + "epoch": 0.1945907724942549, + "grad_norm": 6.80618953704834, + "learning_rate": 1.870719931602726e-05, + "loss": 1.8221, + "step": 30960 + }, + { + "epoch": 0.19465362481095203, + 
"grad_norm": 7.454229831695557, + "learning_rate": 1.8706780215082608e-05, + "loss": 2.0162, + "step": 30970 + }, + { + "epoch": 0.19471647712764914, + "grad_norm": 7.484661102294922, + "learning_rate": 1.870636111413795e-05, + "loss": 1.9677, + "step": 30980 + }, + { + "epoch": 0.19477932944434623, + "grad_norm": 6.804109573364258, + "learning_rate": 1.87059420131933e-05, + "loss": 1.9224, + "step": 30990 + }, + { + "epoch": 0.19484218176104334, + "grad_norm": 7.103456497192383, + "learning_rate": 1.8705522912248646e-05, + "loss": 2.0525, + "step": 31000 + }, + { + "epoch": 0.19490503407774046, + "grad_norm": 7.709608554840088, + "learning_rate": 1.8705103811303993e-05, + "loss": 1.9993, + "step": 31010 + }, + { + "epoch": 0.19496788639443757, + "grad_norm": 7.108093738555908, + "learning_rate": 1.8704684710359336e-05, + "loss": 2.0137, + "step": 31020 + }, + { + "epoch": 0.1950307387111347, + "grad_norm": 7.940584182739258, + "learning_rate": 1.8704265609414683e-05, + "loss": 2.1052, + "step": 31030 + }, + { + "epoch": 0.1950935910278318, + "grad_norm": 8.082210540771484, + "learning_rate": 1.870384650847003e-05, + "loss": 2.0226, + "step": 31040 + }, + { + "epoch": 0.1951564433445289, + "grad_norm": 6.77839469909668, + "learning_rate": 1.8703427407525378e-05, + "loss": 1.7747, + "step": 31050 + }, + { + "epoch": 0.195219295661226, + "grad_norm": 7.5021162033081055, + "learning_rate": 1.8703008306580725e-05, + "loss": 1.8515, + "step": 31060 + }, + { + "epoch": 0.19528214797792312, + "grad_norm": 6.902249336242676, + "learning_rate": 1.8702589205636072e-05, + "loss": 1.9328, + "step": 31070 + }, + { + "epoch": 0.19534500029462024, + "grad_norm": 6.502012252807617, + "learning_rate": 1.870217010469142e-05, + "loss": 1.7997, + "step": 31080 + }, + { + "epoch": 0.19540785261131735, + "grad_norm": 6.4549665451049805, + "learning_rate": 1.8701751003746766e-05, + "loss": 2.0455, + "step": 31090 + }, + { + "epoch": 0.19547070492801447, + "grad_norm": 6.250760555267334, 
+ "learning_rate": 1.870133190280211e-05, + "loss": 1.9771, + "step": 31100 + }, + { + "epoch": 0.19553355724471155, + "grad_norm": 6.72709321975708, + "learning_rate": 1.8700912801857457e-05, + "loss": 2.032, + "step": 31110 + }, + { + "epoch": 0.19559640956140867, + "grad_norm": 7.2245097160339355, + "learning_rate": 1.8700493700912804e-05, + "loss": 1.8392, + "step": 31120 + }, + { + "epoch": 0.19565926187810578, + "grad_norm": 7.101731777191162, + "learning_rate": 1.870007459996815e-05, + "loss": 1.9772, + "step": 31130 + }, + { + "epoch": 0.1957221141948029, + "grad_norm": 6.784846305847168, + "learning_rate": 1.8699655499023498e-05, + "loss": 1.9331, + "step": 31140 + }, + { + "epoch": 0.1957849665115, + "grad_norm": 9.258913040161133, + "learning_rate": 1.8699236398078845e-05, + "loss": 1.8773, + "step": 31150 + }, + { + "epoch": 0.19584781882819713, + "grad_norm": 7.051684379577637, + "learning_rate": 1.869881729713419e-05, + "loss": 1.9141, + "step": 31160 + }, + { + "epoch": 0.19591067114489424, + "grad_norm": 7.824609756469727, + "learning_rate": 1.8698398196189536e-05, + "loss": 1.8682, + "step": 31170 + }, + { + "epoch": 0.19597352346159133, + "grad_norm": 9.092242240905762, + "learning_rate": 1.8697979095244883e-05, + "loss": 2.0567, + "step": 31180 + }, + { + "epoch": 0.19603637577828845, + "grad_norm": 6.69179630279541, + "learning_rate": 1.869755999430023e-05, + "loss": 1.7153, + "step": 31190 + }, + { + "epoch": 0.19609922809498556, + "grad_norm": 7.820713996887207, + "learning_rate": 1.8697140893355574e-05, + "loss": 1.951, + "step": 31200 + }, + { + "epoch": 0.19616208041168268, + "grad_norm": 7.920715808868408, + "learning_rate": 1.869672179241092e-05, + "loss": 1.9306, + "step": 31210 + }, + { + "epoch": 0.1962249327283798, + "grad_norm": 7.537846565246582, + "learning_rate": 1.8696302691466268e-05, + "loss": 1.9301, + "step": 31220 + }, + { + "epoch": 0.1962877850450769, + "grad_norm": 7.648664951324463, + "learning_rate": 
1.8695883590521615e-05, + "loss": 2.0416, + "step": 31230 + }, + { + "epoch": 0.196350637361774, + "grad_norm": 6.697866439819336, + "learning_rate": 1.8695464489576962e-05, + "loss": 1.9286, + "step": 31240 + }, + { + "epoch": 0.1964134896784711, + "grad_norm": 6.970717906951904, + "learning_rate": 1.8695045388632305e-05, + "loss": 1.8984, + "step": 31250 + }, + { + "epoch": 0.19647634199516822, + "grad_norm": 6.391444206237793, + "learning_rate": 1.8694626287687653e-05, + "loss": 1.7854, + "step": 31260 + }, + { + "epoch": 0.19653919431186534, + "grad_norm": 7.58518123626709, + "learning_rate": 1.8694207186743e-05, + "loss": 2.1818, + "step": 31270 + }, + { + "epoch": 0.19660204662856245, + "grad_norm": 6.825311183929443, + "learning_rate": 1.8693788085798347e-05, + "loss": 2.0447, + "step": 31280 + }, + { + "epoch": 0.19666489894525957, + "grad_norm": 7.749687671661377, + "learning_rate": 1.8693368984853694e-05, + "loss": 1.8898, + "step": 31290 + }, + { + "epoch": 0.19672775126195668, + "grad_norm": 7.068374156951904, + "learning_rate": 1.869294988390904e-05, + "loss": 2.1863, + "step": 31300 + }, + { + "epoch": 0.19679060357865377, + "grad_norm": 7.546702861785889, + "learning_rate": 1.8692530782964388e-05, + "loss": 2.037, + "step": 31310 + }, + { + "epoch": 0.1968534558953509, + "grad_norm": 7.583136558532715, + "learning_rate": 1.8692111682019735e-05, + "loss": 2.0095, + "step": 31320 + }, + { + "epoch": 0.196916308212048, + "grad_norm": 7.856510639190674, + "learning_rate": 1.869169258107508e-05, + "loss": 2.0096, + "step": 31330 + }, + { + "epoch": 0.19697916052874512, + "grad_norm": 7.463872909545898, + "learning_rate": 1.8691273480130426e-05, + "loss": 2.0048, + "step": 31340 + }, + { + "epoch": 0.19704201284544223, + "grad_norm": 7.2952799797058105, + "learning_rate": 1.8690854379185773e-05, + "loss": 1.908, + "step": 31350 + }, + { + "epoch": 0.19710486516213935, + "grad_norm": 7.742048740386963, + "learning_rate": 1.869043527824112e-05, + "loss": 
1.9118, + "step": 31360 + }, + { + "epoch": 0.19716771747883644, + "grad_norm": 7.366936683654785, + "learning_rate": 1.8690016177296467e-05, + "loss": 1.9242, + "step": 31370 + }, + { + "epoch": 0.19723056979553355, + "grad_norm": 6.955266952514648, + "learning_rate": 1.868959707635181e-05, + "loss": 2.0451, + "step": 31380 + }, + { + "epoch": 0.19729342211223067, + "grad_norm": 7.954512596130371, + "learning_rate": 1.8689177975407158e-05, + "loss": 2.0263, + "step": 31390 + }, + { + "epoch": 0.19735627442892778, + "grad_norm": 7.566021919250488, + "learning_rate": 1.8688758874462505e-05, + "loss": 2.0368, + "step": 31400 + }, + { + "epoch": 0.1974191267456249, + "grad_norm": 6.972746849060059, + "learning_rate": 1.8688339773517852e-05, + "loss": 2.0718, + "step": 31410 + }, + { + "epoch": 0.197481979062322, + "grad_norm": 6.899226188659668, + "learning_rate": 1.8687920672573196e-05, + "loss": 1.8342, + "step": 31420 + }, + { + "epoch": 0.1975448313790191, + "grad_norm": 6.51491117477417, + "learning_rate": 1.8687501571628543e-05, + "loss": 2.0807, + "step": 31430 + }, + { + "epoch": 0.1976076836957162, + "grad_norm": 7.504712104797363, + "learning_rate": 1.868708247068389e-05, + "loss": 2.0655, + "step": 31440 + }, + { + "epoch": 0.19767053601241333, + "grad_norm": 7.929646015167236, + "learning_rate": 1.8686663369739237e-05, + "loss": 2.0732, + "step": 31450 + }, + { + "epoch": 0.19773338832911044, + "grad_norm": 7.556617736816406, + "learning_rate": 1.8686244268794584e-05, + "loss": 1.8426, + "step": 31460 + }, + { + "epoch": 0.19779624064580756, + "grad_norm": 6.628917217254639, + "learning_rate": 1.8685825167849927e-05, + "loss": 2.037, + "step": 31470 + }, + { + "epoch": 0.19785909296250467, + "grad_norm": 6.637345790863037, + "learning_rate": 1.8685406066905275e-05, + "loss": 2.1008, + "step": 31480 + }, + { + "epoch": 0.1979219452792018, + "grad_norm": 7.68168830871582, + "learning_rate": 1.868498696596062e-05, + "loss": 1.6905, + "step": 31490 + }, + { + 
"epoch": 0.19798479759589888, + "grad_norm": 7.9139180183410645, + "learning_rate": 1.868456786501597e-05, + "loss": 2.0851, + "step": 31500 + }, + { + "epoch": 0.198047649912596, + "grad_norm": 6.979345798492432, + "learning_rate": 1.8684148764071316e-05, + "loss": 1.8896, + "step": 31510 + }, + { + "epoch": 0.1981105022292931, + "grad_norm": 7.564640998840332, + "learning_rate": 1.8683729663126663e-05, + "loss": 1.8157, + "step": 31520 + }, + { + "epoch": 0.19817335454599022, + "grad_norm": 5.947103977203369, + "learning_rate": 1.868331056218201e-05, + "loss": 1.8437, + "step": 31530 + }, + { + "epoch": 0.19823620686268734, + "grad_norm": 7.795741558074951, + "learning_rate": 1.8682891461237357e-05, + "loss": 2.0519, + "step": 31540 + }, + { + "epoch": 0.19829905917938445, + "grad_norm": 7.159440040588379, + "learning_rate": 1.8682472360292704e-05, + "loss": 2.1248, + "step": 31550 + }, + { + "epoch": 0.19836191149608154, + "grad_norm": 6.944345474243164, + "learning_rate": 1.8682053259348048e-05, + "loss": 2.1194, + "step": 31560 + }, + { + "epoch": 0.19842476381277865, + "grad_norm": 7.7796196937561035, + "learning_rate": 1.8681634158403395e-05, + "loss": 1.8926, + "step": 31570 + }, + { + "epoch": 0.19848761612947577, + "grad_norm": 7.708437919616699, + "learning_rate": 1.8681215057458742e-05, + "loss": 1.8538, + "step": 31580 + }, + { + "epoch": 0.19855046844617288, + "grad_norm": 6.7379279136657715, + "learning_rate": 1.868079595651409e-05, + "loss": 2.2006, + "step": 31590 + }, + { + "epoch": 0.19861332076287, + "grad_norm": 6.8227458000183105, + "learning_rate": 1.8680376855569433e-05, + "loss": 2.0633, + "step": 31600 + }, + { + "epoch": 0.19867617307956711, + "grad_norm": 8.552653312683105, + "learning_rate": 1.867995775462478e-05, + "loss": 2.2047, + "step": 31610 + }, + { + "epoch": 0.1987390253962642, + "grad_norm": 6.670979976654053, + "learning_rate": 1.8679538653680127e-05, + "loss": 2.1558, + "step": 31620 + }, + { + "epoch": 0.19880187771296132, 
+ "grad_norm": 6.225050449371338, + "learning_rate": 1.8679119552735474e-05, + "loss": 2.0291, + "step": 31630 + }, + { + "epoch": 0.19886473002965843, + "grad_norm": 7.140894412994385, + "learning_rate": 1.8678700451790818e-05, + "loss": 2.24, + "step": 31640 + }, + { + "epoch": 0.19892758234635555, + "grad_norm": 7.125330448150635, + "learning_rate": 1.8678281350846165e-05, + "loss": 1.9717, + "step": 31650 + }, + { + "epoch": 0.19899043466305266, + "grad_norm": 8.001860618591309, + "learning_rate": 1.867786224990151e-05, + "loss": 2.2029, + "step": 31660 + }, + { + "epoch": 0.19905328697974978, + "grad_norm": 7.351769924163818, + "learning_rate": 1.867744314895686e-05, + "loss": 1.6518, + "step": 31670 + }, + { + "epoch": 0.1991161392964469, + "grad_norm": 8.122381210327148, + "learning_rate": 1.8677024048012206e-05, + "loss": 1.7484, + "step": 31680 + }, + { + "epoch": 0.19917899161314398, + "grad_norm": 5.550985813140869, + "learning_rate": 1.8676604947067553e-05, + "loss": 1.7822, + "step": 31690 + }, + { + "epoch": 0.1992418439298411, + "grad_norm": 7.294111728668213, + "learning_rate": 1.86761858461229e-05, + "loss": 1.7865, + "step": 31700 + }, + { + "epoch": 0.1993046962465382, + "grad_norm": 7.205888271331787, + "learning_rate": 1.8675766745178247e-05, + "loss": 2.0595, + "step": 31710 + }, + { + "epoch": 0.19936754856323533, + "grad_norm": 7.846662998199463, + "learning_rate": 1.867534764423359e-05, + "loss": 1.9511, + "step": 31720 + }, + { + "epoch": 0.19943040087993244, + "grad_norm": 8.158760070800781, + "learning_rate": 1.8674928543288938e-05, + "loss": 2.076, + "step": 31730 + }, + { + "epoch": 0.19949325319662956, + "grad_norm": 6.844155788421631, + "learning_rate": 1.8674509442344285e-05, + "loss": 1.8929, + "step": 31740 + }, + { + "epoch": 0.19955610551332664, + "grad_norm": 7.20194149017334, + "learning_rate": 1.8674090341399632e-05, + "loss": 2.0575, + "step": 31750 + }, + { + "epoch": 0.19961895783002376, + "grad_norm": 7.942824840545654, + 
"learning_rate": 1.867367124045498e-05, + "loss": 2.0269, + "step": 31760 + }, + { + "epoch": 0.19968181014672087, + "grad_norm": 7.399482250213623, + "learning_rate": 1.8673252139510326e-05, + "loss": 2.122, + "step": 31770 + }, + { + "epoch": 0.199744662463418, + "grad_norm": 6.742488861083984, + "learning_rate": 1.867283303856567e-05, + "loss": 2.0259, + "step": 31780 + }, + { + "epoch": 0.1998075147801151, + "grad_norm": 9.902754783630371, + "learning_rate": 1.8672413937621017e-05, + "loss": 1.7763, + "step": 31790 + }, + { + "epoch": 0.19987036709681222, + "grad_norm": 6.923020839691162, + "learning_rate": 1.8671994836676364e-05, + "loss": 1.9856, + "step": 31800 + }, + { + "epoch": 0.19993321941350933, + "grad_norm": 7.055446147918701, + "learning_rate": 1.867157573573171e-05, + "loss": 2.0138, + "step": 31810 + }, + { + "epoch": 0.19999607173020642, + "grad_norm": 7.5703229904174805, + "learning_rate": 1.8671156634787055e-05, + "loss": 2.0022, + "step": 31820 + }, + { + "epoch": 0.20005892404690354, + "grad_norm": 7.893503665924072, + "learning_rate": 1.8670737533842402e-05, + "loss": 2.0686, + "step": 31830 + }, + { + "epoch": 0.20012177636360065, + "grad_norm": 6.54017972946167, + "learning_rate": 1.867031843289775e-05, + "loss": 1.9186, + "step": 31840 + }, + { + "epoch": 0.20018462868029777, + "grad_norm": 8.596176147460938, + "learning_rate": 1.8669899331953096e-05, + "loss": 1.7146, + "step": 31850 + }, + { + "epoch": 0.20024748099699488, + "grad_norm": 9.219547271728516, + "learning_rate": 1.8669480231008443e-05, + "loss": 1.9238, + "step": 31860 + }, + { + "epoch": 0.200310333313692, + "grad_norm": 7.179672718048096, + "learning_rate": 1.8669061130063787e-05, + "loss": 2.1072, + "step": 31870 + }, + { + "epoch": 0.20037318563038908, + "grad_norm": 6.458518028259277, + "learning_rate": 1.8668642029119134e-05, + "loss": 1.8323, + "step": 31880 + }, + { + "epoch": 0.2004360379470862, + "grad_norm": 8.046995162963867, + "learning_rate": 
1.866822292817448e-05, + "loss": 1.8013, + "step": 31890 + }, + { + "epoch": 0.2004988902637833, + "grad_norm": 7.660754203796387, + "learning_rate": 1.8667803827229828e-05, + "loss": 2.1948, + "step": 31900 + }, + { + "epoch": 0.20056174258048043, + "grad_norm": 7.255456447601318, + "learning_rate": 1.8667384726285175e-05, + "loss": 1.7914, + "step": 31910 + }, + { + "epoch": 0.20062459489717754, + "grad_norm": 6.695418834686279, + "learning_rate": 1.8666965625340522e-05, + "loss": 1.8412, + "step": 31920 + }, + { + "epoch": 0.20068744721387466, + "grad_norm": 7.198409557342529, + "learning_rate": 1.866654652439587e-05, + "loss": 2.0424, + "step": 31930 + }, + { + "epoch": 0.20075029953057175, + "grad_norm": 6.992066860198975, + "learning_rate": 1.8666127423451216e-05, + "loss": 2.2153, + "step": 31940 + }, + { + "epoch": 0.20081315184726886, + "grad_norm": 6.353859901428223, + "learning_rate": 1.866570832250656e-05, + "loss": 1.888, + "step": 31950 + }, + { + "epoch": 0.20087600416396598, + "grad_norm": 8.30443000793457, + "learning_rate": 1.8665289221561907e-05, + "loss": 2.1721, + "step": 31960 + }, + { + "epoch": 0.2009388564806631, + "grad_norm": 7.140757083892822, + "learning_rate": 1.8664870120617254e-05, + "loss": 2.3043, + "step": 31970 + }, + { + "epoch": 0.2010017087973602, + "grad_norm": 7.410743713378906, + "learning_rate": 1.86644510196726e-05, + "loss": 1.8216, + "step": 31980 + }, + { + "epoch": 0.20106456111405732, + "grad_norm": 7.6606316566467285, + "learning_rate": 1.8664031918727948e-05, + "loss": 1.9886, + "step": 31990 + }, + { + "epoch": 0.20112741343075444, + "grad_norm": 6.549091815948486, + "learning_rate": 1.8663612817783292e-05, + "loss": 1.8681, + "step": 32000 + }, + { + "epoch": 0.20119026574745152, + "grad_norm": 6.990193843841553, + "learning_rate": 1.866319371683864e-05, + "loss": 2.0551, + "step": 32010 + }, + { + "epoch": 0.20125311806414864, + "grad_norm": 6.669229984283447, + "learning_rate": 1.8662774615893986e-05, + "loss": 
1.7833, + "step": 32020 + }, + { + "epoch": 0.20131597038084575, + "grad_norm": 8.114689826965332, + "learning_rate": 1.8662355514949333e-05, + "loss": 2.0699, + "step": 32030 + }, + { + "epoch": 0.20137882269754287, + "grad_norm": 7.928976535797119, + "learning_rate": 1.8661936414004677e-05, + "loss": 2.0345, + "step": 32040 + }, + { + "epoch": 0.20144167501423998, + "grad_norm": 6.947659492492676, + "learning_rate": 1.8661517313060024e-05, + "loss": 1.9629, + "step": 32050 + }, + { + "epoch": 0.2015045273309371, + "grad_norm": 8.57833480834961, + "learning_rate": 1.866109821211537e-05, + "loss": 2.1507, + "step": 32060 + }, + { + "epoch": 0.2015673796476342, + "grad_norm": 6.773591995239258, + "learning_rate": 1.8660679111170718e-05, + "loss": 1.7984, + "step": 32070 + }, + { + "epoch": 0.2016302319643313, + "grad_norm": 7.336679935455322, + "learning_rate": 1.8660260010226065e-05, + "loss": 2.0215, + "step": 32080 + }, + { + "epoch": 0.20169308428102842, + "grad_norm": 8.003287315368652, + "learning_rate": 1.8659840909281412e-05, + "loss": 2.0209, + "step": 32090 + }, + { + "epoch": 0.20175593659772553, + "grad_norm": 6.318203926086426, + "learning_rate": 1.8659421808336756e-05, + "loss": 1.9984, + "step": 32100 + }, + { + "epoch": 0.20181878891442265, + "grad_norm": 8.037556648254395, + "learning_rate": 1.8659002707392103e-05, + "loss": 2.1062, + "step": 32110 + }, + { + "epoch": 0.20188164123111976, + "grad_norm": 7.300333023071289, + "learning_rate": 1.865858360644745e-05, + "loss": 1.8028, + "step": 32120 + }, + { + "epoch": 0.20194449354781688, + "grad_norm": 5.501397132873535, + "learning_rate": 1.8658164505502797e-05, + "loss": 1.978, + "step": 32130 + }, + { + "epoch": 0.20200734586451397, + "grad_norm": 6.930439472198486, + "learning_rate": 1.8657745404558144e-05, + "loss": 1.9132, + "step": 32140 + }, + { + "epoch": 0.20207019818121108, + "grad_norm": 6.926006317138672, + "learning_rate": 1.865732630361349e-05, + "loss": 1.8571, + "step": 32150 + }, + 
{ + "epoch": 0.2021330504979082, + "grad_norm": 7.173760890960693, + "learning_rate": 1.8656907202668838e-05, + "loss": 1.9771, + "step": 32160 + }, + { + "epoch": 0.2021959028146053, + "grad_norm": 7.056827545166016, + "learning_rate": 1.8656488101724185e-05, + "loss": 2.0286, + "step": 32170 + }, + { + "epoch": 0.20225875513130243, + "grad_norm": 7.20012092590332, + "learning_rate": 1.865606900077953e-05, + "loss": 1.8052, + "step": 32180 + }, + { + "epoch": 0.20232160744799954, + "grad_norm": 7.294288158416748, + "learning_rate": 1.8655649899834876e-05, + "loss": 1.9413, + "step": 32190 + }, + { + "epoch": 0.20238445976469663, + "grad_norm": 6.476958274841309, + "learning_rate": 1.8655230798890223e-05, + "loss": 1.9641, + "step": 32200 + }, + { + "epoch": 0.20244731208139374, + "grad_norm": 6.717261791229248, + "learning_rate": 1.865481169794557e-05, + "loss": 1.9349, + "step": 32210 + }, + { + "epoch": 0.20251016439809086, + "grad_norm": 6.926117897033691, + "learning_rate": 1.8654392597000914e-05, + "loss": 1.822, + "step": 32220 + }, + { + "epoch": 0.20257301671478797, + "grad_norm": 6.997591495513916, + "learning_rate": 1.865397349605626e-05, + "loss": 2.0666, + "step": 32230 + }, + { + "epoch": 0.2026358690314851, + "grad_norm": 7.3034563064575195, + "learning_rate": 1.8653554395111608e-05, + "loss": 2.0412, + "step": 32240 + }, + { + "epoch": 0.2026987213481822, + "grad_norm": 7.561244487762451, + "learning_rate": 1.8653135294166955e-05, + "loss": 1.9237, + "step": 32250 + }, + { + "epoch": 0.2027615736648793, + "grad_norm": 7.470678806304932, + "learning_rate": 1.86527161932223e-05, + "loss": 1.834, + "step": 32260 + }, + { + "epoch": 0.2028244259815764, + "grad_norm": 7.097418785095215, + "learning_rate": 1.8652297092277646e-05, + "loss": 1.9299, + "step": 32270 + }, + { + "epoch": 0.20288727829827352, + "grad_norm": 6.99167537689209, + "learning_rate": 1.8651877991332993e-05, + "loss": 2.2147, + "step": 32280 + }, + { + "epoch": 0.20295013061497064, + 
"grad_norm": 6.0950927734375, + "learning_rate": 1.865145889038834e-05, + "loss": 1.6959, + "step": 32290 + }, + { + "epoch": 0.20301298293166775, + "grad_norm": 7.768097400665283, + "learning_rate": 1.8651039789443687e-05, + "loss": 1.9643, + "step": 32300 + }, + { + "epoch": 0.20307583524836487, + "grad_norm": 7.126907825469971, + "learning_rate": 1.8650620688499034e-05, + "loss": 1.663, + "step": 32310 + }, + { + "epoch": 0.20313868756506198, + "grad_norm": 6.074197769165039, + "learning_rate": 1.865020158755438e-05, + "loss": 2.1272, + "step": 32320 + }, + { + "epoch": 0.20320153988175907, + "grad_norm": 6.936085224151611, + "learning_rate": 1.8649782486609728e-05, + "loss": 1.8462, + "step": 32330 + }, + { + "epoch": 0.20326439219845618, + "grad_norm": 6.845409393310547, + "learning_rate": 1.8649363385665075e-05, + "loss": 1.8656, + "step": 32340 + }, + { + "epoch": 0.2033272445151533, + "grad_norm": 7.36346960067749, + "learning_rate": 1.864894428472042e-05, + "loss": 1.9648, + "step": 32350 + }, + { + "epoch": 0.20339009683185041, + "grad_norm": 7.719435214996338, + "learning_rate": 1.8648525183775766e-05, + "loss": 1.8929, + "step": 32360 + }, + { + "epoch": 0.20345294914854753, + "grad_norm": 8.386730194091797, + "learning_rate": 1.8648106082831113e-05, + "loss": 1.757, + "step": 32370 + }, + { + "epoch": 0.20351580146524464, + "grad_norm": 7.338146686553955, + "learning_rate": 1.864768698188646e-05, + "loss": 1.9952, + "step": 32380 + }, + { + "epoch": 0.20357865378194173, + "grad_norm": 6.615225315093994, + "learning_rate": 1.8647267880941807e-05, + "loss": 1.9445, + "step": 32390 + }, + { + "epoch": 0.20364150609863885, + "grad_norm": 7.198486804962158, + "learning_rate": 1.864684877999715e-05, + "loss": 2.0266, + "step": 32400 + }, + { + "epoch": 0.20370435841533596, + "grad_norm": 7.512393474578857, + "learning_rate": 1.8646429679052498e-05, + "loss": 1.9268, + "step": 32410 + }, + { + "epoch": 0.20376721073203308, + "grad_norm": 7.614518165588379, + 
"learning_rate": 1.8646010578107845e-05, + "loss": 2.1856, + "step": 32420 + }, + { + "epoch": 0.2038300630487302, + "grad_norm": 7.497756481170654, + "learning_rate": 1.8645591477163192e-05, + "loss": 2.0897, + "step": 32430 + }, + { + "epoch": 0.2038929153654273, + "grad_norm": 6.641429901123047, + "learning_rate": 1.8645172376218536e-05, + "loss": 2.0392, + "step": 32440 + }, + { + "epoch": 0.2039557676821244, + "grad_norm": 8.049214363098145, + "learning_rate": 1.8644753275273883e-05, + "loss": 1.8764, + "step": 32450 + }, + { + "epoch": 0.2040186199988215, + "grad_norm": 8.235671997070312, + "learning_rate": 1.864433417432923e-05, + "loss": 2.1654, + "step": 32460 + }, + { + "epoch": 0.20408147231551862, + "grad_norm": 6.430967807769775, + "learning_rate": 1.8643915073384577e-05, + "loss": 1.9691, + "step": 32470 + }, + { + "epoch": 0.20414432463221574, + "grad_norm": 8.980220794677734, + "learning_rate": 1.8643495972439924e-05, + "loss": 2.0066, + "step": 32480 + }, + { + "epoch": 0.20420717694891286, + "grad_norm": 9.148171424865723, + "learning_rate": 1.8643076871495268e-05, + "loss": 1.9771, + "step": 32490 + }, + { + "epoch": 0.20427002926560997, + "grad_norm": 8.299166679382324, + "learning_rate": 1.8642657770550615e-05, + "loss": 1.8559, + "step": 32500 + }, + { + "epoch": 0.20433288158230709, + "grad_norm": 8.3163480758667, + "learning_rate": 1.8642238669605962e-05, + "loss": 1.9074, + "step": 32510 + }, + { + "epoch": 0.20439573389900417, + "grad_norm": 7.030807018280029, + "learning_rate": 1.864181956866131e-05, + "loss": 2.0593, + "step": 32520 + }, + { + "epoch": 0.2044585862157013, + "grad_norm": 6.964199542999268, + "learning_rate": 1.8641400467716656e-05, + "loss": 2.0386, + "step": 32530 + }, + { + "epoch": 0.2045214385323984, + "grad_norm": 6.957630157470703, + "learning_rate": 1.8640981366772003e-05, + "loss": 1.9482, + "step": 32540 + }, + { + "epoch": 0.20458429084909552, + "grad_norm": 6.892584323883057, + "learning_rate": 
1.864056226582735e-05, + "loss": 1.9165, + "step": 32550 + }, + { + "epoch": 0.20464714316579263, + "grad_norm": 7.2235918045043945, + "learning_rate": 1.8640143164882697e-05, + "loss": 1.947, + "step": 32560 + }, + { + "epoch": 0.20470999548248975, + "grad_norm": 7.948803901672363, + "learning_rate": 1.863972406393804e-05, + "loss": 2.0304, + "step": 32570 + }, + { + "epoch": 0.20477284779918684, + "grad_norm": 6.61076021194458, + "learning_rate": 1.8639304962993388e-05, + "loss": 2.0487, + "step": 32580 + }, + { + "epoch": 0.20483570011588395, + "grad_norm": 8.155477523803711, + "learning_rate": 1.8638885862048735e-05, + "loss": 1.8815, + "step": 32590 + }, + { + "epoch": 0.20489855243258107, + "grad_norm": 6.9196248054504395, + "learning_rate": 1.8638466761104082e-05, + "loss": 1.7916, + "step": 32600 + }, + { + "epoch": 0.20496140474927818, + "grad_norm": 7.408926963806152, + "learning_rate": 1.863804766015943e-05, + "loss": 2.0736, + "step": 32610 + }, + { + "epoch": 0.2050242570659753, + "grad_norm": 8.11421012878418, + "learning_rate": 1.8637628559214773e-05, + "loss": 1.8672, + "step": 32620 + }, + { + "epoch": 0.2050871093826724, + "grad_norm": 7.909786224365234, + "learning_rate": 1.863720945827012e-05, + "loss": 2.0961, + "step": 32630 + }, + { + "epoch": 0.20514996169936953, + "grad_norm": 8.090932846069336, + "learning_rate": 1.8636790357325467e-05, + "loss": 2.0442, + "step": 32640 + }, + { + "epoch": 0.2052128140160666, + "grad_norm": 6.865467548370361, + "learning_rate": 1.8636371256380814e-05, + "loss": 1.7117, + "step": 32650 + }, + { + "epoch": 0.20527566633276373, + "grad_norm": 7.839059829711914, + "learning_rate": 1.8635952155436158e-05, + "loss": 1.9982, + "step": 32660 + }, + { + "epoch": 0.20533851864946084, + "grad_norm": 5.283694267272949, + "learning_rate": 1.8635533054491505e-05, + "loss": 1.9163, + "step": 32670 + }, + { + "epoch": 0.20540137096615796, + "grad_norm": 7.552374362945557, + "learning_rate": 1.8635113953546852e-05, + 
"loss": 1.8389, + "step": 32680 + }, + { + "epoch": 0.20546422328285507, + "grad_norm": 7.677777290344238, + "learning_rate": 1.86346948526022e-05, + "loss": 2.1014, + "step": 32690 + }, + { + "epoch": 0.2055270755995522, + "grad_norm": 7.4425201416015625, + "learning_rate": 1.8634275751657546e-05, + "loss": 1.9164, + "step": 32700 + }, + { + "epoch": 0.20558992791624928, + "grad_norm": 6.851995468139648, + "learning_rate": 1.8633856650712893e-05, + "loss": 1.8816, + "step": 32710 + }, + { + "epoch": 0.2056527802329464, + "grad_norm": 7.684861660003662, + "learning_rate": 1.863343754976824e-05, + "loss": 2.1476, + "step": 32720 + }, + { + "epoch": 0.2057156325496435, + "grad_norm": 6.646461486816406, + "learning_rate": 1.8633018448823584e-05, + "loss": 1.8131, + "step": 32730 + }, + { + "epoch": 0.20577848486634062, + "grad_norm": 7.502947807312012, + "learning_rate": 1.863259934787893e-05, + "loss": 2.1382, + "step": 32740 + }, + { + "epoch": 0.20584133718303774, + "grad_norm": 7.392009258270264, + "learning_rate": 1.8632180246934278e-05, + "loss": 2.0775, + "step": 32750 + }, + { + "epoch": 0.20590418949973485, + "grad_norm": 7.370657920837402, + "learning_rate": 1.8631761145989625e-05, + "loss": 1.902, + "step": 32760 + }, + { + "epoch": 0.20596704181643194, + "grad_norm": 7.597288608551025, + "learning_rate": 1.8631342045044972e-05, + "loss": 1.9322, + "step": 32770 + }, + { + "epoch": 0.20602989413312905, + "grad_norm": 6.6630353927612305, + "learning_rate": 1.863092294410032e-05, + "loss": 1.9984, + "step": 32780 + }, + { + "epoch": 0.20609274644982617, + "grad_norm": 6.317451477050781, + "learning_rate": 1.8630503843155666e-05, + "loss": 2.0104, + "step": 32790 + }, + { + "epoch": 0.20615559876652328, + "grad_norm": 6.763668060302734, + "learning_rate": 1.863008474221101e-05, + "loss": 1.9379, + "step": 32800 + }, + { + "epoch": 0.2062184510832204, + "grad_norm": 8.344666481018066, + "learning_rate": 1.8629665641266357e-05, + "loss": 2.0772, + "step": 32810 
+ }, + { + "epoch": 0.20628130339991751, + "grad_norm": 6.620004653930664, + "learning_rate": 1.8629246540321704e-05, + "loss": 1.8502, + "step": 32820 + }, + { + "epoch": 0.20634415571661463, + "grad_norm": 7.310399055480957, + "learning_rate": 1.862882743937705e-05, + "loss": 1.946, + "step": 32830 + }, + { + "epoch": 0.20640700803331172, + "grad_norm": 7.351963996887207, + "learning_rate": 1.8628408338432395e-05, + "loss": 1.835, + "step": 32840 + }, + { + "epoch": 0.20646986035000883, + "grad_norm": 6.5745439529418945, + "learning_rate": 1.8627989237487742e-05, + "loss": 1.9042, + "step": 32850 + }, + { + "epoch": 0.20653271266670595, + "grad_norm": 7.718234062194824, + "learning_rate": 1.862757013654309e-05, + "loss": 1.7437, + "step": 32860 + }, + { + "epoch": 0.20659556498340306, + "grad_norm": 6.668702125549316, + "learning_rate": 1.8627151035598436e-05, + "loss": 1.966, + "step": 32870 + }, + { + "epoch": 0.20665841730010018, + "grad_norm": 7.5997490882873535, + "learning_rate": 1.862673193465378e-05, + "loss": 1.7494, + "step": 32880 + }, + { + "epoch": 0.2067212696167973, + "grad_norm": 8.042951583862305, + "learning_rate": 1.8626312833709127e-05, + "loss": 1.977, + "step": 32890 + }, + { + "epoch": 0.20678412193349438, + "grad_norm": 8.362555503845215, + "learning_rate": 1.8625893732764474e-05, + "loss": 1.9018, + "step": 32900 + }, + { + "epoch": 0.2068469742501915, + "grad_norm": Infinity, + "learning_rate": 1.862547463181982e-05, + "loss": 2.0126, + "step": 32910 + }, + { + "epoch": 0.2069098265668886, + "grad_norm": 6.688395977020264, + "learning_rate": 1.8625097440969632e-05, + "loss": 1.9023, + "step": 32920 + }, + { + "epoch": 0.20697267888358573, + "grad_norm": 7.301969528198242, + "learning_rate": 1.862467834002498e-05, + "loss": 1.9322, + "step": 32930 + }, + { + "epoch": 0.20703553120028284, + "grad_norm": 6.88472843170166, + "learning_rate": 1.8624259239080326e-05, + "loss": 1.8666, + "step": 32940 + }, + { + "epoch": 0.20709838351697996, + 
"grad_norm": 8.703797340393066, + "learning_rate": 1.8623840138135674e-05, + "loss": 2.0496, + "step": 32950 + }, + { + "epoch": 0.20716123583367704, + "grad_norm": 6.187541484832764, + "learning_rate": 1.8623421037191017e-05, + "loss": 1.9576, + "step": 32960 + }, + { + "epoch": 0.20722408815037416, + "grad_norm": 8.077150344848633, + "learning_rate": 1.8623001936246364e-05, + "loss": 1.8276, + "step": 32970 + }, + { + "epoch": 0.20728694046707127, + "grad_norm": 7.203952789306641, + "learning_rate": 1.862258283530171e-05, + "loss": 2.1624, + "step": 32980 + }, + { + "epoch": 0.2073497927837684, + "grad_norm": 6.861067771911621, + "learning_rate": 1.862216373435706e-05, + "loss": 1.8687, + "step": 32990 + }, + { + "epoch": 0.2074126451004655, + "grad_norm": 6.995094299316406, + "learning_rate": 1.8621744633412405e-05, + "loss": 1.8891, + "step": 33000 + }, + { + "epoch": 0.20747549741716262, + "grad_norm": 7.380050182342529, + "learning_rate": 1.8621325532467753e-05, + "loss": 1.8304, + "step": 33010 + }, + { + "epoch": 0.20753834973385973, + "grad_norm": 7.955836772918701, + "learning_rate": 1.86209064315231e-05, + "loss": 2.1125, + "step": 33020 + }, + { + "epoch": 0.20760120205055682, + "grad_norm": 7.0294084548950195, + "learning_rate": 1.8620487330578447e-05, + "loss": 2.0791, + "step": 33030 + }, + { + "epoch": 0.20766405436725394, + "grad_norm": 7.854935169219971, + "learning_rate": 1.862006822963379e-05, + "loss": 1.9195, + "step": 33040 + }, + { + "epoch": 0.20772690668395105, + "grad_norm": 7.9961161613464355, + "learning_rate": 1.8619649128689137e-05, + "loss": 1.699, + "step": 33050 + }, + { + "epoch": 0.20778975900064817, + "grad_norm": 7.443821430206299, + "learning_rate": 1.8619230027744485e-05, + "loss": 1.9154, + "step": 33060 + }, + { + "epoch": 0.20785261131734528, + "grad_norm": 8.965798377990723, + "learning_rate": 1.861881092679983e-05, + "loss": 1.9549, + "step": 33070 + }, + { + "epoch": 0.2079154636340424, + "grad_norm": 7.658891677856445, 
+ "learning_rate": 1.861839182585518e-05, + "loss": 1.9491, + "step": 33080 + }, + { + "epoch": 0.20797831595073948, + "grad_norm": 8.56626033782959, + "learning_rate": 1.8617972724910522e-05, + "loss": 1.8583, + "step": 33090 + }, + { + "epoch": 0.2080411682674366, + "grad_norm": 7.63130521774292, + "learning_rate": 1.861755362396587e-05, + "loss": 2.1472, + "step": 33100 + }, + { + "epoch": 0.20810402058413371, + "grad_norm": 7.68650484085083, + "learning_rate": 1.8617134523021216e-05, + "loss": 1.8979, + "step": 33110 + }, + { + "epoch": 0.20816687290083083, + "grad_norm": 6.086266040802002, + "learning_rate": 1.8616715422076564e-05, + "loss": 1.8511, + "step": 33120 + }, + { + "epoch": 0.20822972521752794, + "grad_norm": 6.9972243309021, + "learning_rate": 1.861629632113191e-05, + "loss": 1.9685, + "step": 33130 + }, + { + "epoch": 0.20829257753422506, + "grad_norm": 6.733551979064941, + "learning_rate": 1.8615877220187254e-05, + "loss": 1.9713, + "step": 33140 + }, + { + "epoch": 0.20835542985092217, + "grad_norm": 6.972572326660156, + "learning_rate": 1.86154581192426e-05, + "loss": 1.917, + "step": 33150 + }, + { + "epoch": 0.20841828216761926, + "grad_norm": 6.9418625831604, + "learning_rate": 1.861503901829795e-05, + "loss": 1.9291, + "step": 33160 + }, + { + "epoch": 0.20848113448431638, + "grad_norm": 7.004095077514648, + "learning_rate": 1.8614619917353296e-05, + "loss": 1.8306, + "step": 33170 + }, + { + "epoch": 0.2085439868010135, + "grad_norm": 6.511467933654785, + "learning_rate": 1.861420081640864e-05, + "loss": 1.9946, + "step": 33180 + }, + { + "epoch": 0.2086068391177106, + "grad_norm": 8.272610664367676, + "learning_rate": 1.8613781715463986e-05, + "loss": 1.9819, + "step": 33190 + }, + { + "epoch": 0.20866969143440772, + "grad_norm": 6.825449466705322, + "learning_rate": 1.8613362614519333e-05, + "loss": 1.8908, + "step": 33200 + }, + { + "epoch": 0.20873254375110484, + "grad_norm": 8.018668174743652, + "learning_rate": 1.861294351357468e-05, 
+ "loss": 2.0171, + "step": 33210 + }, + { + "epoch": 0.20879539606780192, + "grad_norm": 7.659886360168457, + "learning_rate": 1.8612524412630027e-05, + "loss": 2.2547, + "step": 33220 + }, + { + "epoch": 0.20885824838449904, + "grad_norm": 7.284714698791504, + "learning_rate": 1.8612105311685375e-05, + "loss": 1.7837, + "step": 33230 + }, + { + "epoch": 0.20892110070119616, + "grad_norm": 7.291626930236816, + "learning_rate": 1.861168621074072e-05, + "loss": 1.972, + "step": 33240 + }, + { + "epoch": 0.20898395301789327, + "grad_norm": 6.9453606605529785, + "learning_rate": 1.861126710979607e-05, + "loss": 2.0272, + "step": 33250 + }, + { + "epoch": 0.20904680533459039, + "grad_norm": 9.36789321899414, + "learning_rate": 1.8610848008851416e-05, + "loss": 2.0488, + "step": 33260 + }, + { + "epoch": 0.2091096576512875, + "grad_norm": 6.998178482055664, + "learning_rate": 1.861042890790676e-05, + "loss": 1.9161, + "step": 33270 + }, + { + "epoch": 0.2091725099679846, + "grad_norm": 7.868252754211426, + "learning_rate": 1.8610009806962107e-05, + "loss": 1.996, + "step": 33280 + }, + { + "epoch": 0.2092353622846817, + "grad_norm": 7.053635597229004, + "learning_rate": 1.8609590706017454e-05, + "loss": 2.0046, + "step": 33290 + }, + { + "epoch": 0.20929821460137882, + "grad_norm": 6.25361967086792, + "learning_rate": 1.86091716050728e-05, + "loss": 1.9745, + "step": 33300 + }, + { + "epoch": 0.20936106691807593, + "grad_norm": 6.859872817993164, + "learning_rate": 1.8608752504128148e-05, + "loss": 1.9112, + "step": 33310 + }, + { + "epoch": 0.20942391923477305, + "grad_norm": 6.452934265136719, + "learning_rate": 1.860833340318349e-05, + "loss": 1.8774, + "step": 33320 + }, + { + "epoch": 0.20948677155147016, + "grad_norm": 7.296616077423096, + "learning_rate": 1.860791430223884e-05, + "loss": 1.9381, + "step": 33330 + }, + { + "epoch": 0.20954962386816728, + "grad_norm": 6.141557216644287, + "learning_rate": 1.8607495201294186e-05, + "loss": 1.8099, + "step": 33340 + 
}, + { + "epoch": 0.20961247618486437, + "grad_norm": 7.579769611358643, + "learning_rate": 1.8607076100349533e-05, + "loss": 2.1197, + "step": 33350 + }, + { + "epoch": 0.20967532850156148, + "grad_norm": 8.49437141418457, + "learning_rate": 1.8606656999404876e-05, + "loss": 2.0408, + "step": 33360 + }, + { + "epoch": 0.2097381808182586, + "grad_norm": 7.307910442352295, + "learning_rate": 1.8606237898460223e-05, + "loss": 1.7301, + "step": 33370 + }, + { + "epoch": 0.2098010331349557, + "grad_norm": 7.528223991394043, + "learning_rate": 1.860581879751557e-05, + "loss": 2.0548, + "step": 33380 + }, + { + "epoch": 0.20986388545165283, + "grad_norm": 7.4743428230285645, + "learning_rate": 1.8605399696570918e-05, + "loss": 1.9875, + "step": 33390 + }, + { + "epoch": 0.20992673776834994, + "grad_norm": 6.497557640075684, + "learning_rate": 1.8604980595626265e-05, + "loss": 2.0149, + "step": 33400 + }, + { + "epoch": 0.20998959008504703, + "grad_norm": 6.687356948852539, + "learning_rate": 1.860456149468161e-05, + "loss": 2.0166, + "step": 33410 + }, + { + "epoch": 0.21005244240174414, + "grad_norm": 7.749118328094482, + "learning_rate": 1.8604142393736955e-05, + "loss": 1.9113, + "step": 33420 + }, + { + "epoch": 0.21011529471844126, + "grad_norm": 7.288553237915039, + "learning_rate": 1.8603723292792302e-05, + "loss": 2.0303, + "step": 33430 + }, + { + "epoch": 0.21017814703513837, + "grad_norm": 7.221781253814697, + "learning_rate": 1.860330419184765e-05, + "loss": 1.6279, + "step": 33440 + }, + { + "epoch": 0.2102409993518355, + "grad_norm": 6.4051666259765625, + "learning_rate": 1.8602885090902997e-05, + "loss": 1.9534, + "step": 33450 + }, + { + "epoch": 0.2103038516685326, + "grad_norm": 6.147698402404785, + "learning_rate": 1.8602465989958344e-05, + "loss": 1.9419, + "step": 33460 + }, + { + "epoch": 0.21036670398522972, + "grad_norm": 6.287750720977783, + "learning_rate": 1.860204688901369e-05, + "loss": 1.9551, + "step": 33470 + }, + { + "epoch": 
0.2104295563019268, + "grad_norm": 6.478549957275391, + "learning_rate": 1.8601627788069038e-05, + "loss": 1.8279, + "step": 33480 + }, + { + "epoch": 0.21049240861862392, + "grad_norm": 8.444985389709473, + "learning_rate": 1.860120868712438e-05, + "loss": 2.0476, + "step": 33490 + }, + { + "epoch": 0.21055526093532104, + "grad_norm": 8.121697425842285, + "learning_rate": 1.860078958617973e-05, + "loss": 2.0606, + "step": 33500 + }, + { + "epoch": 0.21061811325201815, + "grad_norm": 6.9500017166137695, + "learning_rate": 1.8600370485235076e-05, + "loss": 1.9861, + "step": 33510 + }, + { + "epoch": 0.21068096556871527, + "grad_norm": 5.910980701446533, + "learning_rate": 1.8599951384290423e-05, + "loss": 1.8387, + "step": 33520 + }, + { + "epoch": 0.21074381788541238, + "grad_norm": 7.609782695770264, + "learning_rate": 1.859953228334577e-05, + "loss": 1.9676, + "step": 33530 + }, + { + "epoch": 0.21080667020210947, + "grad_norm": 6.835140705108643, + "learning_rate": 1.8599113182401113e-05, + "loss": 2.1067, + "step": 33540 + }, + { + "epoch": 0.21086952251880658, + "grad_norm": 7.364989757537842, + "learning_rate": 1.859869408145646e-05, + "loss": 1.9079, + "step": 33550 + }, + { + "epoch": 0.2109323748355037, + "grad_norm": 8.439874649047852, + "learning_rate": 1.8598274980511808e-05, + "loss": 1.9671, + "step": 33560 + }, + { + "epoch": 0.21099522715220081, + "grad_norm": 8.540244102478027, + "learning_rate": 1.8597855879567155e-05, + "loss": 2.2535, + "step": 33570 + }, + { + "epoch": 0.21105807946889793, + "grad_norm": 7.351819038391113, + "learning_rate": 1.8597436778622498e-05, + "loss": 1.9451, + "step": 33580 + }, + { + "epoch": 0.21112093178559505, + "grad_norm": 7.363847255706787, + "learning_rate": 1.8597017677677845e-05, + "loss": 1.8018, + "step": 33590 + }, + { + "epoch": 0.21118378410229213, + "grad_norm": 7.224581718444824, + "learning_rate": 1.8596598576733192e-05, + "loss": 1.91, + "step": 33600 + }, + { + "epoch": 0.21124663641898925, + 
"grad_norm": 8.13434886932373, + "learning_rate": 1.859617947578854e-05, + "loss": 2.0783, + "step": 33610 + }, + { + "epoch": 0.21130948873568636, + "grad_norm": 7.822972774505615, + "learning_rate": 1.859580228493835e-05, + "loss": 1.9776, + "step": 33620 + }, + { + "epoch": 0.21137234105238348, + "grad_norm": 6.807519435882568, + "learning_rate": 1.8595383183993698e-05, + "loss": 1.8579, + "step": 33630 + }, + { + "epoch": 0.2114351933690806, + "grad_norm": 7.029597282409668, + "learning_rate": 1.8594964083049045e-05, + "loss": 2.0714, + "step": 33640 + }, + { + "epoch": 0.2114980456857777, + "grad_norm": 7.00266695022583, + "learning_rate": 1.8594544982104392e-05, + "loss": 1.8764, + "step": 33650 + }, + { + "epoch": 0.21156089800247482, + "grad_norm": 7.116758823394775, + "learning_rate": 1.8594125881159736e-05, + "loss": 2.0121, + "step": 33660 + }, + { + "epoch": 0.2116237503191719, + "grad_norm": 7.175611972808838, + "learning_rate": 1.8593706780215083e-05, + "loss": 1.7897, + "step": 33670 + }, + { + "epoch": 0.21168660263586903, + "grad_norm": 6.896945953369141, + "learning_rate": 1.859328767927043e-05, + "loss": 2.0325, + "step": 33680 + }, + { + "epoch": 0.21174945495256614, + "grad_norm": 7.818874359130859, + "learning_rate": 1.8592868578325777e-05, + "loss": 1.9826, + "step": 33690 + }, + { + "epoch": 0.21181230726926326, + "grad_norm": 7.196053504943848, + "learning_rate": 1.8592449477381124e-05, + "loss": 1.6508, + "step": 33700 + }, + { + "epoch": 0.21187515958596037, + "grad_norm": 6.747186660766602, + "learning_rate": 1.859203037643647e-05, + "loss": 1.8931, + "step": 33710 + }, + { + "epoch": 0.21193801190265749, + "grad_norm": 7.24590539932251, + "learning_rate": 1.8591611275491818e-05, + "loss": 1.9535, + "step": 33720 + }, + { + "epoch": 0.21200086421935457, + "grad_norm": 8.471415519714355, + "learning_rate": 1.8591192174547162e-05, + "loss": 2.0143, + "step": 33730 + }, + { + "epoch": 0.2120637165360517, + "grad_norm": 7.297245502471924, + 
"learning_rate": 1.859077307360251e-05, + "loss": 1.9066, + "step": 33740 + }, + { + "epoch": 0.2121265688527488, + "grad_norm": 8.15941047668457, + "learning_rate": 1.8590353972657856e-05, + "loss": 1.81, + "step": 33750 + }, + { + "epoch": 0.21218942116944592, + "grad_norm": 7.2019877433776855, + "learning_rate": 1.8589934871713203e-05, + "loss": 1.9632, + "step": 33760 + }, + { + "epoch": 0.21225227348614303, + "grad_norm": 6.810883045196533, + "learning_rate": 1.858951577076855e-05, + "loss": 1.796, + "step": 33770 + }, + { + "epoch": 0.21231512580284015, + "grad_norm": 7.2735419273376465, + "learning_rate": 1.8589096669823897e-05, + "loss": 1.993, + "step": 33780 + }, + { + "epoch": 0.21237797811953724, + "grad_norm": 7.423776149749756, + "learning_rate": 1.858867756887924e-05, + "loss": 1.9816, + "step": 33790 + }, + { + "epoch": 0.21244083043623435, + "grad_norm": 6.887326240539551, + "learning_rate": 1.8588258467934588e-05, + "loss": 2.0905, + "step": 33800 + }, + { + "epoch": 0.21250368275293147, + "grad_norm": 8.197897911071777, + "learning_rate": 1.8587839366989935e-05, + "loss": 2.0615, + "step": 33810 + }, + { + "epoch": 0.21256653506962858, + "grad_norm": 6.670204162597656, + "learning_rate": 1.8587420266045282e-05, + "loss": 1.9871, + "step": 33820 + }, + { + "epoch": 0.2126293873863257, + "grad_norm": 7.086750507354736, + "learning_rate": 1.8587001165100626e-05, + "loss": 1.922, + "step": 33830 + }, + { + "epoch": 0.2126922397030228, + "grad_norm": 7.694448471069336, + "learning_rate": 1.8586582064155973e-05, + "loss": 2.1166, + "step": 33840 + }, + { + "epoch": 0.21275509201971993, + "grad_norm": 7.550271034240723, + "learning_rate": 1.858616296321132e-05, + "loss": 1.84, + "step": 33850 + }, + { + "epoch": 0.21281794433641701, + "grad_norm": 6.961797714233398, + "learning_rate": 1.8585743862266667e-05, + "loss": 2.1871, + "step": 33860 + }, + { + "epoch": 0.21288079665311413, + "grad_norm": 7.044167995452881, + "learning_rate": 
1.8585324761322014e-05, + "loss": 1.7825, + "step": 33870 + }, + { + "epoch": 0.21294364896981124, + "grad_norm": 7.101032733917236, + "learning_rate": 1.8584905660377358e-05, + "loss": 1.9029, + "step": 33880 + }, + { + "epoch": 0.21300650128650836, + "grad_norm": 7.548702716827393, + "learning_rate": 1.8584486559432705e-05, + "loss": 1.9279, + "step": 33890 + }, + { + "epoch": 0.21306935360320547, + "grad_norm": 8.239489555358887, + "learning_rate": 1.8584067458488052e-05, + "loss": 2.0406, + "step": 33900 + }, + { + "epoch": 0.2131322059199026, + "grad_norm": 7.909656524658203, + "learning_rate": 1.85836483575434e-05, + "loss": 2.0369, + "step": 33910 + }, + { + "epoch": 0.21319505823659968, + "grad_norm": 7.576388835906982, + "learning_rate": 1.8583229256598746e-05, + "loss": 1.8615, + "step": 33920 + }, + { + "epoch": 0.2132579105532968, + "grad_norm": 7.289960861206055, + "learning_rate": 1.8582810155654093e-05, + "loss": 2.0094, + "step": 33930 + }, + { + "epoch": 0.2133207628699939, + "grad_norm": 7.018068313598633, + "learning_rate": 1.858239105470944e-05, + "loss": 2.1906, + "step": 33940 + }, + { + "epoch": 0.21338361518669102, + "grad_norm": 7.7681684494018555, + "learning_rate": 1.8581971953764787e-05, + "loss": 2.4025, + "step": 33950 + }, + { + "epoch": 0.21344646750338814, + "grad_norm": 7.197399139404297, + "learning_rate": 1.8581552852820134e-05, + "loss": 2.0322, + "step": 33960 + }, + { + "epoch": 0.21350931982008525, + "grad_norm": 7.588764190673828, + "learning_rate": 1.8581133751875478e-05, + "loss": 2.1111, + "step": 33970 + }, + { + "epoch": 0.21357217213678237, + "grad_norm": 9.010973930358887, + "learning_rate": 1.8580714650930825e-05, + "loss": 2.1089, + "step": 33980 + }, + { + "epoch": 0.21363502445347946, + "grad_norm": 7.639834403991699, + "learning_rate": 1.8580295549986172e-05, + "loss": 1.7106, + "step": 33990 + }, + { + "epoch": 0.21369787677017657, + "grad_norm": 7.014494895935059, + "learning_rate": 1.857987644904152e-05, + 
"loss": 2.1511, + "step": 34000 + }, + { + "epoch": 0.21376072908687369, + "grad_norm": 13.64177417755127, + "learning_rate": 1.8579457348096863e-05, + "loss": 2.2471, + "step": 34010 + }, + { + "epoch": 0.2138235814035708, + "grad_norm": 8.525516510009766, + "learning_rate": 1.857903824715221e-05, + "loss": 1.927, + "step": 34020 + }, + { + "epoch": 0.21388643372026792, + "grad_norm": 7.092137336730957, + "learning_rate": 1.8578619146207557e-05, + "loss": 2.0672, + "step": 34030 + }, + { + "epoch": 0.21394928603696503, + "grad_norm": 7.667873382568359, + "learning_rate": 1.8578200045262904e-05, + "loss": 1.8891, + "step": 34040 + }, + { + "epoch": 0.21401213835366212, + "grad_norm": 8.19531536102295, + "learning_rate": 1.857778094431825e-05, + "loss": 1.9668, + "step": 34050 + }, + { + "epoch": 0.21407499067035923, + "grad_norm": 7.556863784790039, + "learning_rate": 1.8577361843373595e-05, + "loss": 2.002, + "step": 34060 + }, + { + "epoch": 0.21413784298705635, + "grad_norm": 6.630819320678711, + "learning_rate": 1.8576942742428942e-05, + "loss": 1.8466, + "step": 34070 + }, + { + "epoch": 0.21420069530375346, + "grad_norm": 8.189940452575684, + "learning_rate": 1.857652364148429e-05, + "loss": 1.9341, + "step": 34080 + }, + { + "epoch": 0.21426354762045058, + "grad_norm": 7.160456657409668, + "learning_rate": 1.8576104540539636e-05, + "loss": 2.0784, + "step": 34090 + }, + { + "epoch": 0.2143263999371477, + "grad_norm": 6.931938171386719, + "learning_rate": 1.8575685439594983e-05, + "loss": 1.8851, + "step": 34100 + }, + { + "epoch": 0.21438925225384478, + "grad_norm": 7.42091178894043, + "learning_rate": 1.8575266338650327e-05, + "loss": 1.9543, + "step": 34110 + }, + { + "epoch": 0.2144521045705419, + "grad_norm": 5.8673272132873535, + "learning_rate": 1.8574847237705674e-05, + "loss": 1.9578, + "step": 34120 + }, + { + "epoch": 0.214514956887239, + "grad_norm": 7.70461893081665, + "learning_rate": 1.857442813676102e-05, + "loss": 1.9633, + "step": 34130 + }, 
+ { + "epoch": 0.21457780920393613, + "grad_norm": 7.143349647521973, + "learning_rate": 1.8574009035816368e-05, + "loss": 1.9404, + "step": 34140 + }, + { + "epoch": 0.21464066152063324, + "grad_norm": 6.253964900970459, + "learning_rate": 1.8573589934871715e-05, + "loss": 1.9488, + "step": 34150 + }, + { + "epoch": 0.21470351383733036, + "grad_norm": 7.41900110244751, + "learning_rate": 1.8573170833927062e-05, + "loss": 2.109, + "step": 34160 + }, + { + "epoch": 0.21476636615402747, + "grad_norm": 8.520472526550293, + "learning_rate": 1.857275173298241e-05, + "loss": 1.9835, + "step": 34170 + }, + { + "epoch": 0.21482921847072456, + "grad_norm": 7.361577987670898, + "learning_rate": 1.8572332632037756e-05, + "loss": 1.9218, + "step": 34180 + }, + { + "epoch": 0.21489207078742167, + "grad_norm": 6.775546073913574, + "learning_rate": 1.85719135310931e-05, + "loss": 1.7766, + "step": 34190 + }, + { + "epoch": 0.2149549231041188, + "grad_norm": 6.618929386138916, + "learning_rate": 1.8571494430148447e-05, + "loss": 1.8218, + "step": 34200 + }, + { + "epoch": 0.2150177754208159, + "grad_norm": 8.674612045288086, + "learning_rate": 1.8571075329203794e-05, + "loss": 1.9593, + "step": 34210 + }, + { + "epoch": 0.21508062773751302, + "grad_norm": 7.391805648803711, + "learning_rate": 1.857065622825914e-05, + "loss": 2.0312, + "step": 34220 + }, + { + "epoch": 0.21514348005421013, + "grad_norm": 7.1152753829956055, + "learning_rate": 1.8570237127314485e-05, + "loss": 2.0281, + "step": 34230 + }, + { + "epoch": 0.21520633237090722, + "grad_norm": 8.81369400024414, + "learning_rate": 1.8569818026369832e-05, + "loss": 2.0559, + "step": 34240 + }, + { + "epoch": 0.21526918468760434, + "grad_norm": 6.1095452308654785, + "learning_rate": 1.856939892542518e-05, + "loss": 1.7486, + "step": 34250 + }, + { + "epoch": 0.21533203700430145, + "grad_norm": 7.259755611419678, + "learning_rate": 1.8568979824480526e-05, + "loss": 2.0501, + "step": 34260 + }, + { + "epoch": 
0.21539488932099857, + "grad_norm": 7.357776165008545, + "learning_rate": 1.8568560723535873e-05, + "loss": 1.6965, + "step": 34270 + }, + { + "epoch": 0.21545774163769568, + "grad_norm": 7.37483549118042, + "learning_rate": 1.8568141622591217e-05, + "loss": 2.0616, + "step": 34280 + }, + { + "epoch": 0.2155205939543928, + "grad_norm": 7.865124702453613, + "learning_rate": 1.8567722521646564e-05, + "loss": 2.0083, + "step": 34290 + }, + { + "epoch": 0.21558344627108988, + "grad_norm": 6.377746105194092, + "learning_rate": 1.856730342070191e-05, + "loss": 1.9817, + "step": 34300 + }, + { + "epoch": 0.215646298587787, + "grad_norm": 8.451074600219727, + "learning_rate": 1.8566884319757258e-05, + "loss": 2.0513, + "step": 34310 + }, + { + "epoch": 0.21570915090448411, + "grad_norm": 6.891283988952637, + "learning_rate": 1.8566465218812605e-05, + "loss": 1.7818, + "step": 34320 + }, + { + "epoch": 0.21577200322118123, + "grad_norm": 7.912236213684082, + "learning_rate": 1.8566046117867952e-05, + "loss": 2.0117, + "step": 34330 + }, + { + "epoch": 0.21583485553787835, + "grad_norm": 6.531398296356201, + "learning_rate": 1.85656270169233e-05, + "loss": 2.1723, + "step": 34340 + }, + { + "epoch": 0.21589770785457546, + "grad_norm": 7.56461238861084, + "learning_rate": 1.8565207915978646e-05, + "loss": 2.1443, + "step": 34350 + }, + { + "epoch": 0.21596056017127258, + "grad_norm": 7.823632717132568, + "learning_rate": 1.856478881503399e-05, + "loss": 1.9112, + "step": 34360 + }, + { + "epoch": 0.21602341248796966, + "grad_norm": 8.01508903503418, + "learning_rate": 1.8564369714089337e-05, + "loss": 2.1299, + "step": 34370 + }, + { + "epoch": 0.21608626480466678, + "grad_norm": 6.519814968109131, + "learning_rate": 1.8563950613144684e-05, + "loss": 1.9654, + "step": 34380 + }, + { + "epoch": 0.2161491171213639, + "grad_norm": 7.177895545959473, + "learning_rate": 1.856353151220003e-05, + "loss": 2.0657, + "step": 34390 + }, + { + "epoch": 0.216211969438061, + "grad_norm": 
6.885359287261963, + "learning_rate": 1.856311241125538e-05, + "loss": 1.7852, + "step": 34400 + }, + { + "epoch": 0.21627482175475812, + "grad_norm": 9.090568542480469, + "learning_rate": 1.8562693310310722e-05, + "loss": 1.7628, + "step": 34410 + }, + { + "epoch": 0.21633767407145524, + "grad_norm": 8.141672134399414, + "learning_rate": 1.856227420936607e-05, + "loss": 1.9475, + "step": 34420 + }, + { + "epoch": 0.21640052638815233, + "grad_norm": 7.501651287078857, + "learning_rate": 1.8561855108421416e-05, + "loss": 1.9273, + "step": 34430 + }, + { + "epoch": 0.21646337870484944, + "grad_norm": 6.813697814941406, + "learning_rate": 1.8561436007476763e-05, + "loss": 1.9541, + "step": 34440 + }, + { + "epoch": 0.21652623102154656, + "grad_norm": 6.917619228363037, + "learning_rate": 1.8561016906532107e-05, + "loss": 1.9514, + "step": 34450 + }, + { + "epoch": 0.21658908333824367, + "grad_norm": 7.056396961212158, + "learning_rate": 1.8560597805587454e-05, + "loss": 2.0344, + "step": 34460 + }, + { + "epoch": 0.21665193565494079, + "grad_norm": 7.068763256072998, + "learning_rate": 1.85601787046428e-05, + "loss": 2.0627, + "step": 34470 + }, + { + "epoch": 0.2167147879716379, + "grad_norm": 6.844529151916504, + "learning_rate": 1.8559759603698148e-05, + "loss": 1.9777, + "step": 34480 + }, + { + "epoch": 0.21677764028833502, + "grad_norm": 6.759030342102051, + "learning_rate": 1.8559340502753495e-05, + "loss": 1.8708, + "step": 34490 + }, + { + "epoch": 0.2168404926050321, + "grad_norm": 6.698564052581787, + "learning_rate": 1.855892140180884e-05, + "loss": 2.0901, + "step": 34500 + }, + { + "epoch": 0.21690334492172922, + "grad_norm": 7.497093677520752, + "learning_rate": 1.8558502300864186e-05, + "loss": 1.7706, + "step": 34510 + }, + { + "epoch": 0.21696619723842633, + "grad_norm": 8.183006286621094, + "learning_rate": 1.8558083199919533e-05, + "loss": 2.0137, + "step": 34520 + }, + { + "epoch": 0.21702904955512345, + "grad_norm": 7.996163845062256, + 
"learning_rate": 1.855766409897488e-05, + "loss": 2.0093, + "step": 34530 + }, + { + "epoch": 0.21709190187182056, + "grad_norm": 7.119990348815918, + "learning_rate": 1.8557244998030227e-05, + "loss": 1.8239, + "step": 34540 + }, + { + "epoch": 0.21715475418851768, + "grad_norm": 8.420408248901367, + "learning_rate": 1.8556825897085574e-05, + "loss": 2.3014, + "step": 34550 + }, + { + "epoch": 0.21721760650521477, + "grad_norm": 7.02096700668335, + "learning_rate": 1.855640679614092e-05, + "loss": 1.923, + "step": 34560 + }, + { + "epoch": 0.21728045882191188, + "grad_norm": 6.849606990814209, + "learning_rate": 1.855598769519627e-05, + "loss": 2.2225, + "step": 34570 + }, + { + "epoch": 0.217343311138609, + "grad_norm": 7.89677619934082, + "learning_rate": 1.8555568594251615e-05, + "loss": 1.5972, + "step": 34580 + }, + { + "epoch": 0.2174061634553061, + "grad_norm": 7.193415641784668, + "learning_rate": 1.855514949330696e-05, + "loss": 1.9054, + "step": 34590 + }, + { + "epoch": 0.21746901577200323, + "grad_norm": 7.3271331787109375, + "learning_rate": 1.8554730392362306e-05, + "loss": 1.9486, + "step": 34600 + }, + { + "epoch": 0.21753186808870034, + "grad_norm": 7.489126205444336, + "learning_rate": 1.8554311291417653e-05, + "loss": 1.8873, + "step": 34610 + }, + { + "epoch": 0.21759472040539743, + "grad_norm": 6.797194957733154, + "learning_rate": 1.8553892190473e-05, + "loss": 1.9083, + "step": 34620 + }, + { + "epoch": 0.21765757272209454, + "grad_norm": 7.961541175842285, + "learning_rate": 1.8553473089528344e-05, + "loss": 1.6529, + "step": 34630 + }, + { + "epoch": 0.21772042503879166, + "grad_norm": 7.54440450668335, + "learning_rate": 1.855305398858369e-05, + "loss": 1.8152, + "step": 34640 + }, + { + "epoch": 0.21778327735548877, + "grad_norm": 7.466855049133301, + "learning_rate": 1.8552634887639038e-05, + "loss": 1.6849, + "step": 34650 + }, + { + "epoch": 0.2178461296721859, + "grad_norm": 7.031246185302734, + "learning_rate": 
1.8552215786694385e-05, + "loss": 1.9987, + "step": 34660 + }, + { + "epoch": 0.217908981988883, + "grad_norm": 8.001404762268066, + "learning_rate": 1.8551796685749732e-05, + "loss": 2.2657, + "step": 34670 + }, + { + "epoch": 0.21797183430558012, + "grad_norm": 6.206151962280273, + "learning_rate": 1.8551377584805076e-05, + "loss": 2.0952, + "step": 34680 + }, + { + "epoch": 0.2180346866222772, + "grad_norm": 8.160574913024902, + "learning_rate": 1.8550958483860423e-05, + "loss": 2.24, + "step": 34690 + }, + { + "epoch": 0.21809753893897432, + "grad_norm": 8.026134490966797, + "learning_rate": 1.855053938291577e-05, + "loss": 1.9693, + "step": 34700 + }, + { + "epoch": 0.21816039125567144, + "grad_norm": 7.086154460906982, + "learning_rate": 1.8550120281971117e-05, + "loss": 1.8228, + "step": 34710 + }, + { + "epoch": 0.21822324357236855, + "grad_norm": 7.1722731590271, + "learning_rate": 1.8549701181026464e-05, + "loss": 1.6558, + "step": 34720 + }, + { + "epoch": 0.21828609588906567, + "grad_norm": 7.947413444519043, + "learning_rate": 1.854928208008181e-05, + "loss": 1.6155, + "step": 34730 + }, + { + "epoch": 0.21834894820576278, + "grad_norm": 8.06889533996582, + "learning_rate": 1.8548862979137155e-05, + "loss": 2.0601, + "step": 34740 + }, + { + "epoch": 0.21841180052245987, + "grad_norm": 8.035139083862305, + "learning_rate": 1.8548443878192502e-05, + "loss": 1.9296, + "step": 34750 + }, + { + "epoch": 0.21847465283915699, + "grad_norm": 6.365030765533447, + "learning_rate": 1.854802477724785e-05, + "loss": 1.8284, + "step": 34760 + }, + { + "epoch": 0.2185375051558541, + "grad_norm": 7.066539764404297, + "learning_rate": 1.8547605676303196e-05, + "loss": 1.9459, + "step": 34770 + }, + { + "epoch": 0.21860035747255122, + "grad_norm": 7.134952068328857, + "learning_rate": 1.8547186575358543e-05, + "loss": 1.9494, + "step": 34780 + }, + { + "epoch": 0.21866320978924833, + "grad_norm": 6.721660137176514, + "learning_rate": 1.854676747441389e-05, + "loss": 
1.9896, + "step": 34790 + }, + { + "epoch": 0.21872606210594545, + "grad_norm": 5.704010009765625, + "learning_rate": 1.8546348373469237e-05, + "loss": 2.0603, + "step": 34800 + }, + { + "epoch": 0.21878891442264256, + "grad_norm": 6.107640266418457, + "learning_rate": 1.8545971182619045e-05, + "loss": 1.844, + "step": 34810 + }, + { + "epoch": 0.21885176673933965, + "grad_norm": 6.949453353881836, + "learning_rate": 1.8545552081674392e-05, + "loss": 1.992, + "step": 34820 + }, + { + "epoch": 0.21891461905603676, + "grad_norm": 7.728182315826416, + "learning_rate": 1.854513298072974e-05, + "loss": 1.9564, + "step": 34830 + }, + { + "epoch": 0.21897747137273388, + "grad_norm": 6.855501174926758, + "learning_rate": 1.8544713879785087e-05, + "loss": 2.0618, + "step": 34840 + }, + { + "epoch": 0.219040323689431, + "grad_norm": 7.998497009277344, + "learning_rate": 1.8544294778840434e-05, + "loss": 2.0028, + "step": 34850 + }, + { + "epoch": 0.2191031760061281, + "grad_norm": 7.709017753601074, + "learning_rate": 1.854387567789578e-05, + "loss": 2.2238, + "step": 34860 + }, + { + "epoch": 0.21916602832282522, + "grad_norm": 7.136343955993652, + "learning_rate": 1.8543456576951128e-05, + "loss": 1.8661, + "step": 34870 + }, + { + "epoch": 0.2192288806395223, + "grad_norm": 6.129396915435791, + "learning_rate": 1.8543037476006475e-05, + "loss": 1.8171, + "step": 34880 + }, + { + "epoch": 0.21929173295621943, + "grad_norm": 7.82442569732666, + "learning_rate": 1.854261837506182e-05, + "loss": 2.0803, + "step": 34890 + }, + { + "epoch": 0.21935458527291654, + "grad_norm": 7.109223365783691, + "learning_rate": 1.8542199274117166e-05, + "loss": 1.7726, + "step": 34900 + }, + { + "epoch": 0.21941743758961366, + "grad_norm": 6.688714981079102, + "learning_rate": 1.8541780173172513e-05, + "loss": 1.9677, + "step": 34910 + }, + { + "epoch": 0.21948028990631077, + "grad_norm": 7.616767406463623, + "learning_rate": 1.854136107222786e-05, + "loss": 1.9761, + "step": 34920 + }, + { + 
"epoch": 0.2195431422230079, + "grad_norm": 8.643089294433594, + "learning_rate": 1.8540941971283203e-05, + "loss": 1.9013, + "step": 34930 + }, + { + "epoch": 0.21960599453970497, + "grad_norm": 7.0558576583862305, + "learning_rate": 1.854052287033855e-05, + "loss": 2.1458, + "step": 34940 + }, + { + "epoch": 0.2196688468564021, + "grad_norm": 8.11750316619873, + "learning_rate": 1.8540103769393898e-05, + "loss": 1.89, + "step": 34950 + }, + { + "epoch": 0.2197316991730992, + "grad_norm": 7.2882513999938965, + "learning_rate": 1.8539684668449245e-05, + "loss": 1.97, + "step": 34960 + }, + { + "epoch": 0.21979455148979632, + "grad_norm": 6.838789939880371, + "learning_rate": 1.853926556750459e-05, + "loss": 1.9848, + "step": 34970 + }, + { + "epoch": 0.21985740380649343, + "grad_norm": 8.134601593017578, + "learning_rate": 1.8538846466559935e-05, + "loss": 1.9338, + "step": 34980 + }, + { + "epoch": 0.21992025612319055, + "grad_norm": 7.3437089920043945, + "learning_rate": 1.8538427365615282e-05, + "loss": 1.9343, + "step": 34990 + }, + { + "epoch": 0.21998310843988766, + "grad_norm": 7.4846343994140625, + "learning_rate": 1.853800826467063e-05, + "loss": 2.0585, + "step": 35000 + }, + { + "epoch": 0.22004596075658475, + "grad_norm": 5.733299732208252, + "learning_rate": 1.8537589163725977e-05, + "loss": 1.5824, + "step": 35010 + }, + { + "epoch": 0.22010881307328187, + "grad_norm": 6.123286724090576, + "learning_rate": 1.8537170062781324e-05, + "loss": 1.7044, + "step": 35020 + }, + { + "epoch": 0.22017166538997898, + "grad_norm": 7.225106239318848, + "learning_rate": 1.853675096183667e-05, + "loss": 1.985, + "step": 35030 + }, + { + "epoch": 0.2202345177066761, + "grad_norm": 7.095724582672119, + "learning_rate": 1.8536331860892018e-05, + "loss": 1.9784, + "step": 35040 + }, + { + "epoch": 0.2202973700233732, + "grad_norm": 5.690850734710693, + "learning_rate": 1.853591275994736e-05, + "loss": 1.8355, + "step": 35050 + }, + { + "epoch": 0.22036022234007033, + 
"grad_norm": 6.794473171234131, + "learning_rate": 1.853549365900271e-05, + "loss": 1.9682, + "step": 35060 + }, + { + "epoch": 0.22042307465676741, + "grad_norm": 7.523730754852295, + "learning_rate": 1.8535074558058056e-05, + "loss": 1.9303, + "step": 35070 + }, + { + "epoch": 0.22048592697346453, + "grad_norm": 6.781824111938477, + "learning_rate": 1.8534655457113403e-05, + "loss": 2.0273, + "step": 35080 + }, + { + "epoch": 0.22054877929016165, + "grad_norm": 7.648756980895996, + "learning_rate": 1.853423635616875e-05, + "loss": 2.0898, + "step": 35090 + }, + { + "epoch": 0.22061163160685876, + "grad_norm": 6.996502876281738, + "learning_rate": 1.8533817255224097e-05, + "loss": 1.8881, + "step": 35100 + }, + { + "epoch": 0.22067448392355588, + "grad_norm": 7.366175174713135, + "learning_rate": 1.853339815427944e-05, + "loss": 1.9159, + "step": 35110 + }, + { + "epoch": 0.220737336240253, + "grad_norm": 7.124420166015625, + "learning_rate": 1.8532979053334788e-05, + "loss": 2.0173, + "step": 35120 + }, + { + "epoch": 0.22080018855695008, + "grad_norm": 9.098976135253906, + "learning_rate": 1.8532559952390135e-05, + "loss": 1.9402, + "step": 35130 + }, + { + "epoch": 0.2208630408736472, + "grad_norm": 7.208345890045166, + "learning_rate": 1.8532140851445482e-05, + "loss": 2.1235, + "step": 35140 + }, + { + "epoch": 0.2209258931903443, + "grad_norm": 7.569570064544678, + "learning_rate": 1.8531721750500825e-05, + "loss": 2.0295, + "step": 35150 + }, + { + "epoch": 0.22098874550704142, + "grad_norm": 8.353118896484375, + "learning_rate": 1.8531302649556173e-05, + "loss": 2.1179, + "step": 35160 + }, + { + "epoch": 0.22105159782373854, + "grad_norm": 6.591300010681152, + "learning_rate": 1.853088354861152e-05, + "loss": 1.82, + "step": 35170 + }, + { + "epoch": 0.22111445014043565, + "grad_norm": 7.18334436416626, + "learning_rate": 1.8530464447666867e-05, + "loss": 2.1257, + "step": 35180 + }, + { + "epoch": 0.22117730245713277, + "grad_norm": 7.796203136444092, + 
"learning_rate": 1.8530045346722214e-05, + "loss": 1.9239, + "step": 35190 + }, + { + "epoch": 0.22124015477382986, + "grad_norm": 7.146561145782471, + "learning_rate": 1.8529626245777557e-05, + "loss": 1.7704, + "step": 35200 + }, + { + "epoch": 0.22130300709052697, + "grad_norm": 6.450527191162109, + "learning_rate": 1.8529207144832904e-05, + "loss": 2.0196, + "step": 35210 + }, + { + "epoch": 0.22136585940722409, + "grad_norm": 6.391997814178467, + "learning_rate": 1.852878804388825e-05, + "loss": 1.9764, + "step": 35220 + }, + { + "epoch": 0.2214287117239212, + "grad_norm": 7.410685062408447, + "learning_rate": 1.85283689429436e-05, + "loss": 1.9668, + "step": 35230 + }, + { + "epoch": 0.22149156404061832, + "grad_norm": 8.92043685913086, + "learning_rate": 1.8527949841998946e-05, + "loss": 2.041, + "step": 35240 + }, + { + "epoch": 0.22155441635731543, + "grad_norm": 6.214332580566406, + "learning_rate": 1.8527530741054293e-05, + "loss": 1.902, + "step": 35250 + }, + { + "epoch": 0.22161726867401252, + "grad_norm": 7.188347339630127, + "learning_rate": 1.852711164010964e-05, + "loss": 1.828, + "step": 35260 + }, + { + "epoch": 0.22168012099070963, + "grad_norm": 7.721948623657227, + "learning_rate": 1.8526692539164987e-05, + "loss": 2.0143, + "step": 35270 + }, + { + "epoch": 0.22174297330740675, + "grad_norm": 7.235950946807861, + "learning_rate": 1.852627343822033e-05, + "loss": 1.6793, + "step": 35280 + }, + { + "epoch": 0.22180582562410386, + "grad_norm": 7.141629219055176, + "learning_rate": 1.8525854337275678e-05, + "loss": 1.9044, + "step": 35290 + }, + { + "epoch": 0.22186867794080098, + "grad_norm": 6.771084308624268, + "learning_rate": 1.8525435236331025e-05, + "loss": 2.1811, + "step": 35300 + }, + { + "epoch": 0.2219315302574981, + "grad_norm": 7.240036487579346, + "learning_rate": 1.8525016135386372e-05, + "loss": 1.8158, + "step": 35310 + }, + { + "epoch": 0.2219943825741952, + "grad_norm": 6.458792209625244, + "learning_rate": 
1.852459703444172e-05, + "loss": 1.9552, + "step": 35320 + }, + { + "epoch": 0.2220572348908923, + "grad_norm": 6.815674304962158, + "learning_rate": 1.8524177933497063e-05, + "loss": 1.8711, + "step": 35330 + }, + { + "epoch": 0.2221200872075894, + "grad_norm": 6.916085243225098, + "learning_rate": 1.852375883255241e-05, + "loss": 2.02, + "step": 35340 + }, + { + "epoch": 0.22218293952428653, + "grad_norm": 6.218405723571777, + "learning_rate": 1.8523339731607757e-05, + "loss": 1.8892, + "step": 35350 + }, + { + "epoch": 0.22224579184098364, + "grad_norm": 6.3282294273376465, + "learning_rate": 1.8522920630663104e-05, + "loss": 1.8681, + "step": 35360 + }, + { + "epoch": 0.22230864415768076, + "grad_norm": 5.904386043548584, + "learning_rate": 1.8522501529718447e-05, + "loss": 1.9586, + "step": 35370 + }, + { + "epoch": 0.22237149647437787, + "grad_norm": 6.8369526863098145, + "learning_rate": 1.8522082428773795e-05, + "loss": 1.9303, + "step": 35380 + }, + { + "epoch": 0.22243434879107496, + "grad_norm": 8.529288291931152, + "learning_rate": 1.852166332782914e-05, + "loss": 2.1147, + "step": 35390 + }, + { + "epoch": 0.22249720110777207, + "grad_norm": 13.044854164123535, + "learning_rate": 1.852124422688449e-05, + "loss": 1.7855, + "step": 35400 + }, + { + "epoch": 0.2225600534244692, + "grad_norm": 7.8360819816589355, + "learning_rate": 1.8520825125939836e-05, + "loss": 2.1567, + "step": 35410 + }, + { + "epoch": 0.2226229057411663, + "grad_norm": 7.3442769050598145, + "learning_rate": 1.8520406024995183e-05, + "loss": 1.9528, + "step": 35420 + }, + { + "epoch": 0.22268575805786342, + "grad_norm": 6.816009521484375, + "learning_rate": 1.8519986924050526e-05, + "loss": 1.9875, + "step": 35430 + }, + { + "epoch": 0.22274861037456054, + "grad_norm": 6.637674331665039, + "learning_rate": 1.8519567823105874e-05, + "loss": 1.9868, + "step": 35440 + }, + { + "epoch": 0.22281146269125762, + "grad_norm": 7.885603904724121, + "learning_rate": 1.851914872216122e-05, + 
"loss": 2.0622, + "step": 35450 + }, + { + "epoch": 0.22287431500795474, + "grad_norm": 7.169187068939209, + "learning_rate": 1.8518729621216568e-05, + "loss": 1.741, + "step": 35460 + }, + { + "epoch": 0.22293716732465185, + "grad_norm": 7.062591075897217, + "learning_rate": 1.8518310520271915e-05, + "loss": 1.9444, + "step": 35470 + }, + { + "epoch": 0.22300001964134897, + "grad_norm": 8.210655212402344, + "learning_rate": 1.8517891419327262e-05, + "loss": 1.9655, + "step": 35480 + }, + { + "epoch": 0.22306287195804608, + "grad_norm": 7.064944267272949, + "learning_rate": 1.851747231838261e-05, + "loss": 2.0158, + "step": 35490 + }, + { + "epoch": 0.2231257242747432, + "grad_norm": 6.961498737335205, + "learning_rate": 1.8517053217437956e-05, + "loss": 1.8474, + "step": 35500 + }, + { + "epoch": 0.2231885765914403, + "grad_norm": 7.126535892486572, + "learning_rate": 1.85166341164933e-05, + "loss": 1.9145, + "step": 35510 + }, + { + "epoch": 0.2232514289081374, + "grad_norm": 6.700169563293457, + "learning_rate": 1.8516215015548647e-05, + "loss": 1.9152, + "step": 35520 + }, + { + "epoch": 0.22331428122483452, + "grad_norm": 7.212548732757568, + "learning_rate": 1.8515795914603994e-05, + "loss": 1.967, + "step": 35530 + }, + { + "epoch": 0.22337713354153163, + "grad_norm": 6.833872318267822, + "learning_rate": 1.851537681365934e-05, + "loss": 1.7896, + "step": 35540 + }, + { + "epoch": 0.22343998585822875, + "grad_norm": 7.667410373687744, + "learning_rate": 1.8514957712714685e-05, + "loss": 1.9779, + "step": 35550 + }, + { + "epoch": 0.22350283817492586, + "grad_norm": 7.731854438781738, + "learning_rate": 1.851453861177003e-05, + "loss": 2.0346, + "step": 35560 + }, + { + "epoch": 0.22356569049162298, + "grad_norm": 7.555546283721924, + "learning_rate": 1.851411951082538e-05, + "loss": 2.0954, + "step": 35570 + }, + { + "epoch": 0.22362854280832006, + "grad_norm": 7.541079044342041, + "learning_rate": 1.8513700409880726e-05, + "loss": 2.0232, + "step": 35580 + 
}, + { + "epoch": 0.22369139512501718, + "grad_norm": 7.672801971435547, + "learning_rate": 1.851328130893607e-05, + "loss": 1.8418, + "step": 35590 + }, + { + "epoch": 0.2237542474417143, + "grad_norm": 8.663089752197266, + "learning_rate": 1.8512862207991417e-05, + "loss": 1.9483, + "step": 35600 + }, + { + "epoch": 0.2238170997584114, + "grad_norm": 7.020554542541504, + "learning_rate": 1.8512443107046764e-05, + "loss": 2.0585, + "step": 35610 + }, + { + "epoch": 0.22387995207510852, + "grad_norm": 7.544107437133789, + "learning_rate": 1.851202400610211e-05, + "loss": 1.9529, + "step": 35620 + }, + { + "epoch": 0.22394280439180564, + "grad_norm": 7.0736470222473145, + "learning_rate": 1.8511604905157458e-05, + "loss": 1.7787, + "step": 35630 + }, + { + "epoch": 0.22400565670850273, + "grad_norm": 8.328640937805176, + "learning_rate": 1.8511185804212805e-05, + "loss": 1.8952, + "step": 35640 + }, + { + "epoch": 0.22406850902519984, + "grad_norm": 7.583932876586914, + "learning_rate": 1.8510766703268152e-05, + "loss": 2.076, + "step": 35650 + }, + { + "epoch": 0.22413136134189696, + "grad_norm": 7.296407699584961, + "learning_rate": 1.85103476023235e-05, + "loss": 1.7793, + "step": 35660 + }, + { + "epoch": 0.22419421365859407, + "grad_norm": 6.540462017059326, + "learning_rate": 1.8509928501378846e-05, + "loss": 1.9372, + "step": 35670 + }, + { + "epoch": 0.2242570659752912, + "grad_norm": 7.858813285827637, + "learning_rate": 1.850950940043419e-05, + "loss": 1.6556, + "step": 35680 + }, + { + "epoch": 0.2243199182919883, + "grad_norm": 6.697934150695801, + "learning_rate": 1.8509090299489537e-05, + "loss": 1.7858, + "step": 35690 + }, + { + "epoch": 0.22438277060868542, + "grad_norm": 7.115276336669922, + "learning_rate": 1.8508671198544884e-05, + "loss": 1.9851, + "step": 35700 + }, + { + "epoch": 0.2244456229253825, + "grad_norm": 7.1068925857543945, + "learning_rate": 1.850825209760023e-05, + "loss": 1.9345, + "step": 35710 + }, + { + "epoch": 
0.22450847524207962, + "grad_norm": 7.588796615600586, + "learning_rate": 1.8507832996655578e-05, + "loss": 2.0009, + "step": 35720 + }, + { + "epoch": 0.22457132755877673, + "grad_norm": 6.471503257751465, + "learning_rate": 1.850741389571092e-05, + "loss": 2.0809, + "step": 35730 + }, + { + "epoch": 0.22463417987547385, + "grad_norm": 8.151644706726074, + "learning_rate": 1.850699479476627e-05, + "loss": 2.0383, + "step": 35740 + }, + { + "epoch": 0.22469703219217096, + "grad_norm": 6.93273401260376, + "learning_rate": 1.8506575693821616e-05, + "loss": 2.0041, + "step": 35750 + }, + { + "epoch": 0.22475988450886808, + "grad_norm": 7.2442121505737305, + "learning_rate": 1.8506156592876963e-05, + "loss": 1.6638, + "step": 35760 + }, + { + "epoch": 0.22482273682556517, + "grad_norm": 6.275068759918213, + "learning_rate": 1.8505737491932307e-05, + "loss": 1.8618, + "step": 35770 + }, + { + "epoch": 0.22488558914226228, + "grad_norm": 8.405787467956543, + "learning_rate": 1.8505318390987654e-05, + "loss": 2.202, + "step": 35780 + }, + { + "epoch": 0.2249484414589594, + "grad_norm": 7.45995569229126, + "learning_rate": 1.8504899290043e-05, + "loss": 2.0527, + "step": 35790 + }, + { + "epoch": 0.2250112937756565, + "grad_norm": 7.501440048217773, + "learning_rate": 1.8504480189098348e-05, + "loss": 1.8311, + "step": 35800 + }, + { + "epoch": 0.22507414609235363, + "grad_norm": 6.156557083129883, + "learning_rate": 1.8504061088153695e-05, + "loss": 2.0608, + "step": 35810 + }, + { + "epoch": 0.22513699840905074, + "grad_norm": 7.16656494140625, + "learning_rate": 1.850364198720904e-05, + "loss": 1.8149, + "step": 35820 + }, + { + "epoch": 0.22519985072574786, + "grad_norm": 8.1950101852417, + "learning_rate": 1.8503222886264386e-05, + "loss": 1.8656, + "step": 35830 + }, + { + "epoch": 0.22526270304244495, + "grad_norm": 7.542734146118164, + "learning_rate": 1.8502803785319733e-05, + "loss": 2.0309, + "step": 35840 + }, + { + "epoch": 0.22532555535914206, + "grad_norm": 
8.013267517089844, + "learning_rate": 1.850238468437508e-05, + "loss": 1.7096, + "step": 35850 + }, + { + "epoch": 0.22538840767583918, + "grad_norm": 7.8642964363098145, + "learning_rate": 1.8501965583430427e-05, + "loss": 2.0083, + "step": 35860 + }, + { + "epoch": 0.2254512599925363, + "grad_norm": 7.795157432556152, + "learning_rate": 1.8501546482485774e-05, + "loss": 1.8551, + "step": 35870 + }, + { + "epoch": 0.2255141123092334, + "grad_norm": 9.358570098876953, + "learning_rate": 1.850112738154112e-05, + "loss": 1.9385, + "step": 35880 + }, + { + "epoch": 0.22557696462593052, + "grad_norm": 7.4309587478637695, + "learning_rate": 1.8500708280596468e-05, + "loss": 1.6916, + "step": 35890 + }, + { + "epoch": 0.2256398169426276, + "grad_norm": 7.713372230529785, + "learning_rate": 1.8500289179651812e-05, + "loss": 1.9155, + "step": 35900 + }, + { + "epoch": 0.22570266925932472, + "grad_norm": 6.559614181518555, + "learning_rate": 1.849987007870716e-05, + "loss": 1.9792, + "step": 35910 + }, + { + "epoch": 0.22576552157602184, + "grad_norm": 8.431838035583496, + "learning_rate": 1.8499450977762506e-05, + "loss": 2.0648, + "step": 35920 + }, + { + "epoch": 0.22582837389271895, + "grad_norm": 7.000290393829346, + "learning_rate": 1.8499031876817853e-05, + "loss": 1.8901, + "step": 35930 + }, + { + "epoch": 0.22589122620941607, + "grad_norm": 6.437477111816406, + "learning_rate": 1.84986127758732e-05, + "loss": 1.8744, + "step": 35940 + }, + { + "epoch": 0.22595407852611318, + "grad_norm": 5.661660194396973, + "learning_rate": 1.8498193674928544e-05, + "loss": 1.9295, + "step": 35950 + }, + { + "epoch": 0.22601693084281027, + "grad_norm": 7.3603692054748535, + "learning_rate": 1.849777457398389e-05, + "loss": 1.9294, + "step": 35960 + }, + { + "epoch": 0.22607978315950739, + "grad_norm": 8.71002197265625, + "learning_rate": 1.8497355473039238e-05, + "loss": 1.8093, + "step": 35970 + }, + { + "epoch": 0.2261426354762045, + "grad_norm": 7.771989345550537, + 
"learning_rate": 1.8496936372094585e-05, + "loss": 1.8833, + "step": 35980 + }, + { + "epoch": 0.22620548779290162, + "grad_norm": 6.397435665130615, + "learning_rate": 1.849651727114993e-05, + "loss": 1.7469, + "step": 35990 + }, + { + "epoch": 0.22626834010959873, + "grad_norm": 7.078105449676514, + "learning_rate": 1.8496098170205276e-05, + "loss": 1.825, + "step": 36000 + }, + { + "epoch": 0.22633119242629585, + "grad_norm": 7.667077541351318, + "learning_rate": 1.8495679069260623e-05, + "loss": 1.7933, + "step": 36010 + }, + { + "epoch": 0.22639404474299296, + "grad_norm": 7.291276931762695, + "learning_rate": 1.849525996831597e-05, + "loss": 2.0556, + "step": 36020 + }, + { + "epoch": 0.22645689705969005, + "grad_norm": 7.426200866699219, + "learning_rate": 1.8494840867371317e-05, + "loss": 1.8302, + "step": 36030 + }, + { + "epoch": 0.22651974937638716, + "grad_norm": 6.769180774688721, + "learning_rate": 1.8494421766426664e-05, + "loss": 1.9561, + "step": 36040 + }, + { + "epoch": 0.22658260169308428, + "grad_norm": 7.960482597351074, + "learning_rate": 1.8494002665482008e-05, + "loss": 1.7625, + "step": 36050 + }, + { + "epoch": 0.2266454540097814, + "grad_norm": 7.0758585929870605, + "learning_rate": 1.8493583564537355e-05, + "loss": 1.8897, + "step": 36060 + }, + { + "epoch": 0.2267083063264785, + "grad_norm": 6.825809955596924, + "learning_rate": 1.8493164463592702e-05, + "loss": 1.9526, + "step": 36070 + }, + { + "epoch": 0.22677115864317562, + "grad_norm": 6.6412858963012695, + "learning_rate": 1.849274536264805e-05, + "loss": 1.9598, + "step": 36080 + }, + { + "epoch": 0.2268340109598727, + "grad_norm": 7.068365097045898, + "learning_rate": 1.8492326261703396e-05, + "loss": 2.061, + "step": 36090 + }, + { + "epoch": 0.22689686327656983, + "grad_norm": 8.627281188964844, + "learning_rate": 1.8491907160758743e-05, + "loss": 1.8702, + "step": 36100 + }, + { + "epoch": 0.22695971559326694, + "grad_norm": 6.969089984893799, + "learning_rate": 
1.849148805981409e-05, + "loss": 2.0275, + "step": 36110 + }, + { + "epoch": 0.22702256790996406, + "grad_norm": 7.00883674621582, + "learning_rate": 1.8491068958869437e-05, + "loss": 2.0406, + "step": 36120 + }, + { + "epoch": 0.22708542022666117, + "grad_norm": 7.159546375274658, + "learning_rate": 1.849064985792478e-05, + "loss": 2.1287, + "step": 36130 + }, + { + "epoch": 0.2271482725433583, + "grad_norm": 6.845968723297119, + "learning_rate": 1.8490230756980128e-05, + "loss": 1.9461, + "step": 36140 + }, + { + "epoch": 0.2272111248600554, + "grad_norm": 7.5593414306640625, + "learning_rate": 1.8489811656035475e-05, + "loss": 2.1837, + "step": 36150 + }, + { + "epoch": 0.2272739771767525, + "grad_norm": 7.399803638458252, + "learning_rate": 1.8489392555090822e-05, + "loss": 1.9431, + "step": 36160 + }, + { + "epoch": 0.2273368294934496, + "grad_norm": 6.044048309326172, + "learning_rate": 1.8488973454146166e-05, + "loss": 2.0285, + "step": 36170 + }, + { + "epoch": 0.22739968181014672, + "grad_norm": 7.432627201080322, + "learning_rate": 1.8488554353201513e-05, + "loss": 2.0044, + "step": 36180 + }, + { + "epoch": 0.22746253412684383, + "grad_norm": 7.868930816650391, + "learning_rate": 1.848813525225686e-05, + "loss": 1.8236, + "step": 36190 + }, + { + "epoch": 0.22752538644354095, + "grad_norm": 7.376712322235107, + "learning_rate": 1.8487716151312207e-05, + "loss": 1.7301, + "step": 36200 + }, + { + "epoch": 0.22758823876023807, + "grad_norm": 7.406872749328613, + "learning_rate": 1.848729705036755e-05, + "loss": 1.9849, + "step": 36210 + }, + { + "epoch": 0.22765109107693515, + "grad_norm": 8.414949417114258, + "learning_rate": 1.8486877949422898e-05, + "loss": 1.8838, + "step": 36220 + }, + { + "epoch": 0.22771394339363227, + "grad_norm": 8.242228507995605, + "learning_rate": 1.8486458848478245e-05, + "loss": 1.8699, + "step": 36230 + }, + { + "epoch": 0.22777679571032938, + "grad_norm": 7.094605922698975, + "learning_rate": 1.8486039747533592e-05, + 
"loss": 2.0294, + "step": 36240 + }, + { + "epoch": 0.2278396480270265, + "grad_norm": 7.137697219848633, + "learning_rate": 1.848562064658894e-05, + "loss": 1.9057, + "step": 36250 + }, + { + "epoch": 0.2279025003437236, + "grad_norm": 7.533381462097168, + "learning_rate": 1.8485201545644286e-05, + "loss": 1.7952, + "step": 36260 + }, + { + "epoch": 0.22796535266042073, + "grad_norm": 6.198008060455322, + "learning_rate": 1.8484782444699633e-05, + "loss": 1.9044, + "step": 36270 + }, + { + "epoch": 0.22802820497711782, + "grad_norm": 9.231335639953613, + "learning_rate": 1.848436334375498e-05, + "loss": 2.181, + "step": 36280 + }, + { + "epoch": 0.22809105729381493, + "grad_norm": 8.630999565124512, + "learning_rate": 1.8483944242810327e-05, + "loss": 1.7809, + "step": 36290 + }, + { + "epoch": 0.22815390961051205, + "grad_norm": 6.130497455596924, + "learning_rate": 1.848352514186567e-05, + "loss": 1.9128, + "step": 36300 + }, + { + "epoch": 0.22821676192720916, + "grad_norm": 6.748734951019287, + "learning_rate": 1.8483106040921018e-05, + "loss": 1.9731, + "step": 36310 + }, + { + "epoch": 0.22827961424390628, + "grad_norm": 6.8579182624816895, + "learning_rate": 1.8482686939976365e-05, + "loss": 1.895, + "step": 36320 + }, + { + "epoch": 0.2283424665606034, + "grad_norm": 7.391290187835693, + "learning_rate": 1.8482267839031712e-05, + "loss": 1.9956, + "step": 36330 + }, + { + "epoch": 0.2284053188773005, + "grad_norm": 7.041627407073975, + "learning_rate": 1.848184873808706e-05, + "loss": 1.8528, + "step": 36340 + }, + { + "epoch": 0.2284681711939976, + "grad_norm": 7.496146202087402, + "learning_rate": 1.8481429637142403e-05, + "loss": 1.9317, + "step": 36350 + }, + { + "epoch": 0.2285310235106947, + "grad_norm": 7.350832939147949, + "learning_rate": 1.848101053619775e-05, + "loss": 1.9906, + "step": 36360 + }, + { + "epoch": 0.22859387582739182, + "grad_norm": 7.2522664070129395, + "learning_rate": 1.8480591435253097e-05, + "loss": 1.7981, + "step": 36370 + 
}, + { + "epoch": 0.22865672814408894, + "grad_norm": 6.560575008392334, + "learning_rate": 1.8480172334308444e-05, + "loss": 2.0547, + "step": 36380 + }, + { + "epoch": 0.22871958046078605, + "grad_norm": 7.1260552406311035, + "learning_rate": 1.8479753233363788e-05, + "loss": 1.8038, + "step": 36390 + }, + { + "epoch": 0.22878243277748317, + "grad_norm": 6.739349842071533, + "learning_rate": 1.8479334132419135e-05, + "loss": 1.7947, + "step": 36400 + }, + { + "epoch": 0.22884528509418026, + "grad_norm": 6.752826690673828, + "learning_rate": 1.8478915031474482e-05, + "loss": 1.8501, + "step": 36410 + }, + { + "epoch": 0.22890813741087737, + "grad_norm": 7.900572776794434, + "learning_rate": 1.847849593052983e-05, + "loss": 2.0682, + "step": 36420 + }, + { + "epoch": 0.2289709897275745, + "grad_norm": 8.176858901977539, + "learning_rate": 1.8478076829585176e-05, + "loss": 1.8969, + "step": 36430 + }, + { + "epoch": 0.2290338420442716, + "grad_norm": 6.831380844116211, + "learning_rate": 1.847765772864052e-05, + "loss": 1.9836, + "step": 36440 + }, + { + "epoch": 0.22909669436096872, + "grad_norm": 7.1293721199035645, + "learning_rate": 1.8477238627695867e-05, + "loss": 2.1393, + "step": 36450 + }, + { + "epoch": 0.22915954667766583, + "grad_norm": 7.110170364379883, + "learning_rate": 1.8476819526751214e-05, + "loss": 1.9506, + "step": 36460 + }, + { + "epoch": 0.22922239899436292, + "grad_norm": 7.697797775268555, + "learning_rate": 1.847640042580656e-05, + "loss": 1.7845, + "step": 36470 + }, + { + "epoch": 0.22928525131106003, + "grad_norm": 7.563226699829102, + "learning_rate": 1.8475981324861908e-05, + "loss": 1.9659, + "step": 36480 + }, + { + "epoch": 0.22934810362775715, + "grad_norm": 7.161571025848389, + "learning_rate": 1.8475562223917255e-05, + "loss": 1.7976, + "step": 36490 + }, + { + "epoch": 0.22941095594445426, + "grad_norm": 7.61132287979126, + "learning_rate": 1.8475143122972602e-05, + "loss": 1.8065, + "step": 36500 + }, + { + "epoch": 
0.22947380826115138, + "grad_norm": 7.758613109588623, + "learning_rate": 1.847472402202795e-05, + "loss": 1.8325, + "step": 36510 + }, + { + "epoch": 0.2295366605778485, + "grad_norm": 6.902079105377197, + "learning_rate": 1.8474304921083293e-05, + "loss": 1.9519, + "step": 36520 + }, + { + "epoch": 0.2295995128945456, + "grad_norm": 6.420543193817139, + "learning_rate": 1.847388582013864e-05, + "loss": 1.8842, + "step": 36530 + }, + { + "epoch": 0.2296623652112427, + "grad_norm": 6.8175272941589355, + "learning_rate": 1.8473466719193987e-05, + "loss": 1.9314, + "step": 36540 + }, + { + "epoch": 0.2297252175279398, + "grad_norm": 7.08127498626709, + "learning_rate": 1.8473047618249334e-05, + "loss": 1.9251, + "step": 36550 + }, + { + "epoch": 0.22978806984463693, + "grad_norm": 6.201793670654297, + "learning_rate": 1.847262851730468e-05, + "loss": 1.926, + "step": 36560 + }, + { + "epoch": 0.22985092216133404, + "grad_norm": 7.628470420837402, + "learning_rate": 1.8472209416360025e-05, + "loss": 1.9377, + "step": 36570 + }, + { + "epoch": 0.22991377447803116, + "grad_norm": 7.624232769012451, + "learning_rate": 1.8471790315415372e-05, + "loss": 1.9868, + "step": 36580 + }, + { + "epoch": 0.22997662679472827, + "grad_norm": 7.706589698791504, + "learning_rate": 1.847137121447072e-05, + "loss": 1.8757, + "step": 36590 + }, + { + "epoch": 0.23003947911142536, + "grad_norm": 8.891164779663086, + "learning_rate": 1.8470952113526066e-05, + "loss": 1.9888, + "step": 36600 + }, + { + "epoch": 0.23010233142812248, + "grad_norm": 6.154812335968018, + "learning_rate": 1.847053301258141e-05, + "loss": 1.8775, + "step": 36610 + }, + { + "epoch": 0.2301651837448196, + "grad_norm": 8.748318672180176, + "learning_rate": 1.8470113911636757e-05, + "loss": 1.9606, + "step": 36620 + }, + { + "epoch": 0.2302280360615167, + "grad_norm": 7.380129814147949, + "learning_rate": 1.8469694810692104e-05, + "loss": 1.8137, + "step": 36630 + }, + { + "epoch": 0.23029088837821382, + "grad_norm": 
6.104862213134766, + "learning_rate": 1.846927570974745e-05, + "loss": 1.7568, + "step": 36640 + }, + { + "epoch": 0.23035374069491094, + "grad_norm": 8.323019027709961, + "learning_rate": 1.8468856608802798e-05, + "loss": 1.7345, + "step": 36650 + }, + { + "epoch": 0.23041659301160805, + "grad_norm": 7.859176158905029, + "learning_rate": 1.8468437507858145e-05, + "loss": 1.8129, + "step": 36660 + }, + { + "epoch": 0.23047944532830514, + "grad_norm": 8.44453239440918, + "learning_rate": 1.8468018406913492e-05, + "loss": 1.9672, + "step": 36670 + }, + { + "epoch": 0.23054229764500225, + "grad_norm": 7.817448616027832, + "learning_rate": 1.8467599305968836e-05, + "loss": 1.8993, + "step": 36680 + }, + { + "epoch": 0.23060514996169937, + "grad_norm": 7.425734043121338, + "learning_rate": 1.8467180205024183e-05, + "loss": 1.9744, + "step": 36690 + }, + { + "epoch": 0.23066800227839648, + "grad_norm": 6.993030548095703, + "learning_rate": 1.846676110407953e-05, + "loss": 1.9328, + "step": 36700 + }, + { + "epoch": 0.2307308545950936, + "grad_norm": 6.675727367401123, + "learning_rate": 1.8466342003134877e-05, + "loss": 1.9184, + "step": 36710 + }, + { + "epoch": 0.2307937069117907, + "grad_norm": 7.259601593017578, + "learning_rate": 1.8465922902190224e-05, + "loss": 2.027, + "step": 36720 + }, + { + "epoch": 0.2308565592284878, + "grad_norm": 6.463089942932129, + "learning_rate": 1.846550380124557e-05, + "loss": 1.9571, + "step": 36730 + }, + { + "epoch": 0.23091941154518492, + "grad_norm": 6.317759990692139, + "learning_rate": 1.8465084700300918e-05, + "loss": 1.838, + "step": 36740 + }, + { + "epoch": 0.23098226386188203, + "grad_norm": 7.501256942749023, + "learning_rate": 1.8464665599356262e-05, + "loss": 1.8231, + "step": 36750 + }, + { + "epoch": 0.23104511617857915, + "grad_norm": 7.5587663650512695, + "learning_rate": 1.846424649841161e-05, + "loss": 1.8397, + "step": 36760 + }, + { + "epoch": 0.23110796849527626, + "grad_norm": 6.539580345153809, + 
"learning_rate": 1.8463827397466956e-05, + "loss": 2.1636, + "step": 36770 + }, + { + "epoch": 0.23117082081197338, + "grad_norm": 7.328402042388916, + "learning_rate": 1.8463408296522303e-05, + "loss": 1.6553, + "step": 36780 + }, + { + "epoch": 0.23123367312867046, + "grad_norm": 7.860400676727295, + "learning_rate": 1.8462989195577647e-05, + "loss": 2.0599, + "step": 36790 + }, + { + "epoch": 0.23129652544536758, + "grad_norm": 6.228035926818848, + "learning_rate": 1.8462570094632994e-05, + "loss": 1.9108, + "step": 36800 + }, + { + "epoch": 0.2313593777620647, + "grad_norm": 8.228126525878906, + "learning_rate": 1.846215099368834e-05, + "loss": 1.9351, + "step": 36810 + }, + { + "epoch": 0.2314222300787618, + "grad_norm": 6.4865617752075195, + "learning_rate": 1.8461731892743688e-05, + "loss": 1.9241, + "step": 36820 + }, + { + "epoch": 0.23148508239545892, + "grad_norm": 8.462693214416504, + "learning_rate": 1.846131279179903e-05, + "loss": 1.7073, + "step": 36830 + }, + { + "epoch": 0.23154793471215604, + "grad_norm": 8.767376899719238, + "learning_rate": 1.846089369085438e-05, + "loss": 2.2247, + "step": 36840 + }, + { + "epoch": 0.23161078702885315, + "grad_norm": 7.113800525665283, + "learning_rate": 1.8460474589909726e-05, + "loss": 1.9196, + "step": 36850 + }, + { + "epoch": 0.23167363934555024, + "grad_norm": 7.881669044494629, + "learning_rate": 1.8460055488965073e-05, + "loss": 1.8683, + "step": 36860 + }, + { + "epoch": 0.23173649166224736, + "grad_norm": 7.283566951751709, + "learning_rate": 1.845963638802042e-05, + "loss": 1.9212, + "step": 36870 + }, + { + "epoch": 0.23179934397894447, + "grad_norm": 6.925536155700684, + "learning_rate": 1.8459217287075767e-05, + "loss": 1.8082, + "step": 36880 + }, + { + "epoch": 0.2318621962956416, + "grad_norm": 7.803654193878174, + "learning_rate": 1.8458798186131114e-05, + "loss": 1.9035, + "step": 36890 + }, + { + "epoch": 0.2319250486123387, + "grad_norm": 5.596521377563477, + "learning_rate": 
1.845837908518646e-05, + "loss": 1.7774, + "step": 36900 + }, + { + "epoch": 0.23198790092903582, + "grad_norm": 6.094423770904541, + "learning_rate": 1.8457959984241808e-05, + "loss": 1.9895, + "step": 36910 + }, + { + "epoch": 0.2320507532457329, + "grad_norm": 7.609260559082031, + "learning_rate": 1.8457540883297152e-05, + "loss": 2.0385, + "step": 36920 + }, + { + "epoch": 0.23211360556243002, + "grad_norm": 6.813177585601807, + "learning_rate": 1.84571217823525e-05, + "loss": 1.9822, + "step": 36930 + }, + { + "epoch": 0.23217645787912713, + "grad_norm": 6.039997577667236, + "learning_rate": 1.8456702681407846e-05, + "loss": 1.9013, + "step": 36940 + }, + { + "epoch": 0.23223931019582425, + "grad_norm": 7.389283180236816, + "learning_rate": 1.8456283580463193e-05, + "loss": 2.094, + "step": 36950 + }, + { + "epoch": 0.23230216251252137, + "grad_norm": 7.88203763961792, + "learning_rate": 1.845586447951854e-05, + "loss": 2.1993, + "step": 36960 + }, + { + "epoch": 0.23236501482921848, + "grad_norm": 6.947795867919922, + "learning_rate": 1.8455445378573884e-05, + "loss": 2.0832, + "step": 36970 + }, + { + "epoch": 0.23242786714591557, + "grad_norm": 6.959676265716553, + "learning_rate": 1.845502627762923e-05, + "loss": 1.9911, + "step": 36980 + }, + { + "epoch": 0.23249071946261268, + "grad_norm": 7.610995292663574, + "learning_rate": 1.8454607176684578e-05, + "loss": 1.7864, + "step": 36990 + }, + { + "epoch": 0.2325535717793098, + "grad_norm": 7.01109504699707, + "learning_rate": 1.8454188075739925e-05, + "loss": 1.9153, + "step": 37000 + }, + { + "epoch": 0.2326164240960069, + "grad_norm": 6.2115960121154785, + "learning_rate": 1.845376897479527e-05, + "loss": 1.9913, + "step": 37010 + }, + { + "epoch": 0.23267927641270403, + "grad_norm": 7.3208770751953125, + "learning_rate": 1.8453349873850616e-05, + "loss": 1.9758, + "step": 37020 + }, + { + "epoch": 0.23274212872940114, + "grad_norm": 6.857991695404053, + "learning_rate": 1.8452930772905963e-05, + "loss": 
1.8901, + "step": 37030 + }, + { + "epoch": 0.23280498104609826, + "grad_norm": 6.762681007385254, + "learning_rate": 1.845251167196131e-05, + "loss": 1.966, + "step": 37040 + }, + { + "epoch": 0.23286783336279535, + "grad_norm": 7.531749725341797, + "learning_rate": 1.8452092571016657e-05, + "loss": 1.8867, + "step": 37050 + }, + { + "epoch": 0.23293068567949246, + "grad_norm": 7.440679550170898, + "learning_rate": 1.8451673470072e-05, + "loss": 1.8707, + "step": 37060 + }, + { + "epoch": 0.23299353799618958, + "grad_norm": 12.17404842376709, + "learning_rate": 1.8451254369127348e-05, + "loss": 1.8228, + "step": 37070 + }, + { + "epoch": 0.2330563903128867, + "grad_norm": 8.215689659118652, + "learning_rate": 1.8450835268182695e-05, + "loss": 1.9519, + "step": 37080 + }, + { + "epoch": 0.2331192426295838, + "grad_norm": 8.259377479553223, + "learning_rate": 1.8450416167238042e-05, + "loss": 1.8253, + "step": 37090 + }, + { + "epoch": 0.23318209494628092, + "grad_norm": 8.118638038635254, + "learning_rate": 1.844999706629339e-05, + "loss": 1.8352, + "step": 37100 + }, + { + "epoch": 0.233244947262978, + "grad_norm": 7.638366222381592, + "learning_rate": 1.8449577965348736e-05, + "loss": 1.8802, + "step": 37110 + }, + { + "epoch": 0.23330779957967512, + "grad_norm": 7.182828426361084, + "learning_rate": 1.8449158864404083e-05, + "loss": 1.8821, + "step": 37120 + }, + { + "epoch": 0.23337065189637224, + "grad_norm": 7.47348690032959, + "learning_rate": 1.844873976345943e-05, + "loss": 2.0787, + "step": 37130 + }, + { + "epoch": 0.23343350421306935, + "grad_norm": 8.01142692565918, + "learning_rate": 1.8448320662514774e-05, + "loss": 1.7158, + "step": 37140 + }, + { + "epoch": 0.23349635652976647, + "grad_norm": 9.263545036315918, + "learning_rate": 1.844790156157012e-05, + "loss": 2.0585, + "step": 37150 + }, + { + "epoch": 0.23355920884646358, + "grad_norm": 7.731915473937988, + "learning_rate": 1.8447482460625468e-05, + "loss": 1.8395, + "step": 37160 + }, + { + 
"epoch": 0.2336220611631607, + "grad_norm": 7.082794666290283, + "learning_rate": 1.8447063359680815e-05, + "loss": 1.777, + "step": 37170 + }, + { + "epoch": 0.2336849134798578, + "grad_norm": 8.51518726348877, + "learning_rate": 1.8446644258736162e-05, + "loss": 1.9184, + "step": 37180 + }, + { + "epoch": 0.2337477657965549, + "grad_norm": 6.730716228485107, + "learning_rate": 1.8446225157791506e-05, + "loss": 1.8458, + "step": 37190 + }, + { + "epoch": 0.23381061811325202, + "grad_norm": 7.255427837371826, + "learning_rate": 1.8445806056846853e-05, + "loss": 1.9946, + "step": 37200 + }, + { + "epoch": 0.23387347042994913, + "grad_norm": 8.256935119628906, + "learning_rate": 1.84453869559022e-05, + "loss": 1.9618, + "step": 37210 + }, + { + "epoch": 0.23393632274664625, + "grad_norm": 7.718365669250488, + "learning_rate": 1.8444967854957547e-05, + "loss": 1.7952, + "step": 37220 + }, + { + "epoch": 0.23399917506334336, + "grad_norm": 6.30307674407959, + "learning_rate": 1.844454875401289e-05, + "loss": 1.8853, + "step": 37230 + }, + { + "epoch": 0.23406202738004045, + "grad_norm": 6.986265182495117, + "learning_rate": 1.8444129653068238e-05, + "loss": 1.8914, + "step": 37240 + }, + { + "epoch": 0.23412487969673756, + "grad_norm": 7.488687515258789, + "learning_rate": 1.8443710552123585e-05, + "loss": 2.0103, + "step": 37250 + }, + { + "epoch": 0.23418773201343468, + "grad_norm": 7.455891132354736, + "learning_rate": 1.8443291451178932e-05, + "loss": 1.8, + "step": 37260 + }, + { + "epoch": 0.2342505843301318, + "grad_norm": 8.035429954528809, + "learning_rate": 1.844287235023428e-05, + "loss": 1.9958, + "step": 37270 + }, + { + "epoch": 0.2343134366468289, + "grad_norm": 6.650293827056885, + "learning_rate": 1.8442453249289626e-05, + "loss": 1.9657, + "step": 37280 + }, + { + "epoch": 0.23437628896352602, + "grad_norm": 6.432164192199707, + "learning_rate": 1.8442034148344973e-05, + "loss": 1.734, + "step": 37290 + }, + { + "epoch": 0.2344391412802231, + 
"grad_norm": 7.902127742767334, + "learning_rate": 1.844161504740032e-05, + "loss": 2.1061, + "step": 37300 + }, + { + "epoch": 0.23450199359692023, + "grad_norm": 7.714845180511475, + "learning_rate": 1.8441195946455664e-05, + "loss": 1.9538, + "step": 37310 + }, + { + "epoch": 0.23456484591361734, + "grad_norm": 7.619652271270752, + "learning_rate": 1.844077684551101e-05, + "loss": 1.9894, + "step": 37320 + }, + { + "epoch": 0.23462769823031446, + "grad_norm": 7.897617816925049, + "learning_rate": 1.8440357744566358e-05, + "loss": 1.9426, + "step": 37330 + }, + { + "epoch": 0.23469055054701157, + "grad_norm": 7.258893966674805, + "learning_rate": 1.8439938643621705e-05, + "loss": 1.8904, + "step": 37340 + }, + { + "epoch": 0.2347534028637087, + "grad_norm": 7.671611309051514, + "learning_rate": 1.8439519542677052e-05, + "loss": 1.8734, + "step": 37350 + }, + { + "epoch": 0.2348162551804058, + "grad_norm": 6.240477085113525, + "learning_rate": 1.84391004417324e-05, + "loss": 1.7576, + "step": 37360 + }, + { + "epoch": 0.2348791074971029, + "grad_norm": 7.337497234344482, + "learning_rate": 1.8438681340787743e-05, + "loss": 1.7614, + "step": 37370 + }, + { + "epoch": 0.2349419598138, + "grad_norm": 6.677089214324951, + "learning_rate": 1.843826223984309e-05, + "loss": 2.0978, + "step": 37380 + }, + { + "epoch": 0.23500481213049712, + "grad_norm": 7.023135185241699, + "learning_rate": 1.8437843138898437e-05, + "loss": 1.943, + "step": 37390 + }, + { + "epoch": 0.23506766444719424, + "grad_norm": 7.9640374183654785, + "learning_rate": 1.8437424037953784e-05, + "loss": 1.9079, + "step": 37400 + }, + { + "epoch": 0.23513051676389135, + "grad_norm": 7.736348628997803, + "learning_rate": 1.8437004937009128e-05, + "loss": 1.9522, + "step": 37410 + }, + { + "epoch": 0.23519336908058847, + "grad_norm": 7.060858249664307, + "learning_rate": 1.8436585836064475e-05, + "loss": 1.7814, + "step": 37420 + }, + { + "epoch": 0.23525622139728555, + "grad_norm": 7.679143905639648, + 
"learning_rate": 1.8436166735119822e-05, + "loss": 2.0866, + "step": 37430 + }, + { + "epoch": 0.23531907371398267, + "grad_norm": 6.9466352462768555, + "learning_rate": 1.843574763417517e-05, + "loss": 2.0974, + "step": 37440 + }, + { + "epoch": 0.23538192603067978, + "grad_norm": 6.300509929656982, + "learning_rate": 1.8435328533230513e-05, + "loss": 1.7952, + "step": 37450 + }, + { + "epoch": 0.2354447783473769, + "grad_norm": 7.0272135734558105, + "learning_rate": 1.843490943228586e-05, + "loss": 1.6515, + "step": 37460 + }, + { + "epoch": 0.235507630664074, + "grad_norm": 7.288054466247559, + "learning_rate": 1.8434490331341207e-05, + "loss": 1.8176, + "step": 37470 + }, + { + "epoch": 0.23557048298077113, + "grad_norm": 8.304322242736816, + "learning_rate": 1.8434071230396554e-05, + "loss": 2.1369, + "step": 37480 + }, + { + "epoch": 0.23563333529746824, + "grad_norm": 6.926466464996338, + "learning_rate": 1.84336521294519e-05, + "loss": 2.0315, + "step": 37490 + }, + { + "epoch": 0.23569618761416533, + "grad_norm": 7.540550708770752, + "learning_rate": 1.8433233028507248e-05, + "loss": 2.0642, + "step": 37500 + }, + { + "epoch": 0.23575903993086245, + "grad_norm": 7.766870021820068, + "learning_rate": 1.8432813927562595e-05, + "loss": 1.8289, + "step": 37510 + }, + { + "epoch": 0.23582189224755956, + "grad_norm": 6.657734394073486, + "learning_rate": 1.8432394826617942e-05, + "loss": 1.945, + "step": 37520 + }, + { + "epoch": 0.23588474456425668, + "grad_norm": 6.852512836456299, + "learning_rate": 1.843197572567329e-05, + "loss": 1.7377, + "step": 37530 + }, + { + "epoch": 0.2359475968809538, + "grad_norm": 7.0113444328308105, + "learning_rate": 1.8431556624728633e-05, + "loss": 1.9068, + "step": 37540 + }, + { + "epoch": 0.2360104491976509, + "grad_norm": 7.569218635559082, + "learning_rate": 1.843113752378398e-05, + "loss": 1.7622, + "step": 37550 + }, + { + "epoch": 0.236073301514348, + "grad_norm": 7.438565254211426, + "learning_rate": 
1.8430718422839327e-05, + "loss": 1.9368, + "step": 37560 + }, + { + "epoch": 0.2361361538310451, + "grad_norm": 10.159157752990723, + "learning_rate": 1.8430299321894674e-05, + "loss": 1.9322, + "step": 37570 + }, + { + "epoch": 0.23619900614774222, + "grad_norm": 6.393486022949219, + "learning_rate": 1.842988022095002e-05, + "loss": 2.1107, + "step": 37580 + }, + { + "epoch": 0.23626185846443934, + "grad_norm": 7.414916038513184, + "learning_rate": 1.8429461120005365e-05, + "loss": 1.8803, + "step": 37590 + }, + { + "epoch": 0.23632471078113645, + "grad_norm": 7.108283519744873, + "learning_rate": 1.8429042019060712e-05, + "loss": 2.1052, + "step": 37600 + }, + { + "epoch": 0.23638756309783357, + "grad_norm": 7.331905841827393, + "learning_rate": 1.842862291811606e-05, + "loss": 1.7375, + "step": 37610 + }, + { + "epoch": 0.23645041541453066, + "grad_norm": 6.825594425201416, + "learning_rate": 1.8428203817171406e-05, + "loss": 1.8658, + "step": 37620 + }, + { + "epoch": 0.23651326773122777, + "grad_norm": 7.41994047164917, + "learning_rate": 1.842778471622675e-05, + "loss": 1.8993, + "step": 37630 + }, + { + "epoch": 0.2365761200479249, + "grad_norm": 6.07677698135376, + "learning_rate": 1.8427365615282097e-05, + "loss": 1.8716, + "step": 37640 + }, + { + "epoch": 0.236638972364622, + "grad_norm": 6.925858497619629, + "learning_rate": 1.8426946514337444e-05, + "loss": 1.6922, + "step": 37650 + }, + { + "epoch": 0.23670182468131912, + "grad_norm": 6.979998588562012, + "learning_rate": 1.842652741339279e-05, + "loss": 2.3251, + "step": 37660 + }, + { + "epoch": 0.23676467699801623, + "grad_norm": 7.7621541023254395, + "learning_rate": 1.8426108312448138e-05, + "loss": 1.9647, + "step": 37670 + }, + { + "epoch": 0.23682752931471335, + "grad_norm": 6.522325038909912, + "learning_rate": 1.8425689211503482e-05, + "loss": 1.9629, + "step": 37680 + }, + { + "epoch": 0.23689038163141043, + "grad_norm": 6.459441661834717, + "learning_rate": 1.842527011055883e-05, + 
"loss": 1.8711, + "step": 37690 + }, + { + "epoch": 0.23695323394810755, + "grad_norm": 7.515565395355225, + "learning_rate": 1.8424851009614176e-05, + "loss": 2.0095, + "step": 37700 + }, + { + "epoch": 0.23701608626480467, + "grad_norm": 8.603837966918945, + "learning_rate": 1.8424431908669523e-05, + "loss": 2.128, + "step": 37710 + }, + { + "epoch": 0.23707893858150178, + "grad_norm": 9.59928035736084, + "learning_rate": 1.842401280772487e-05, + "loss": 1.8273, + "step": 37720 + }, + { + "epoch": 0.2371417908981989, + "grad_norm": 10.73585033416748, + "learning_rate": 1.8423593706780217e-05, + "loss": 2.014, + "step": 37730 + }, + { + "epoch": 0.237204643214896, + "grad_norm": 6.706937313079834, + "learning_rate": 1.8423174605835564e-05, + "loss": 2.0051, + "step": 37740 + }, + { + "epoch": 0.2372674955315931, + "grad_norm": 5.793028831481934, + "learning_rate": 1.842275550489091e-05, + "loss": 1.7939, + "step": 37750 + }, + { + "epoch": 0.2373303478482902, + "grad_norm": 6.68222713470459, + "learning_rate": 1.842233640394626e-05, + "loss": 1.9552, + "step": 37760 + }, + { + "epoch": 0.23739320016498733, + "grad_norm": 7.45481538772583, + "learning_rate": 1.8421917303001602e-05, + "loss": 1.7819, + "step": 37770 + }, + { + "epoch": 0.23745605248168444, + "grad_norm": 6.734755039215088, + "learning_rate": 1.842149820205695e-05, + "loss": 1.9601, + "step": 37780 + }, + { + "epoch": 0.23751890479838156, + "grad_norm": 7.868802547454834, + "learning_rate": 1.8421079101112296e-05, + "loss": 1.8973, + "step": 37790 + }, + { + "epoch": 0.23758175711507867, + "grad_norm": 7.722999572753906, + "learning_rate": 1.8420660000167643e-05, + "loss": 1.8141, + "step": 37800 + }, + { + "epoch": 0.23764460943177576, + "grad_norm": 6.061532974243164, + "learning_rate": 1.8420240899222987e-05, + "loss": 1.8178, + "step": 37810 + }, + { + "epoch": 0.23770746174847288, + "grad_norm": 8.7529878616333, + "learning_rate": 1.8419821798278334e-05, + "loss": 2.0316, + "step": 37820 + }, + 
{ + "epoch": 0.23777031406517, + "grad_norm": 7.669527053833008, + "learning_rate": 1.841940269733368e-05, + "loss": 1.9007, + "step": 37830 + }, + { + "epoch": 0.2378331663818671, + "grad_norm": 7.42069149017334, + "learning_rate": 1.8418983596389028e-05, + "loss": 1.7614, + "step": 37840 + }, + { + "epoch": 0.23789601869856422, + "grad_norm": 6.95194673538208, + "learning_rate": 1.8418564495444372e-05, + "loss": 1.952, + "step": 37850 + }, + { + "epoch": 0.23795887101526134, + "grad_norm": 11.418732643127441, + "learning_rate": 1.841814539449972e-05, + "loss": 1.9587, + "step": 37860 + }, + { + "epoch": 0.23802172333195845, + "grad_norm": 7.482511043548584, + "learning_rate": 1.8417726293555066e-05, + "loss": 2.0166, + "step": 37870 + }, + { + "epoch": 0.23808457564865554, + "grad_norm": 16.596914291381836, + "learning_rate": 1.8417307192610413e-05, + "loss": 2.0108, + "step": 37880 + }, + { + "epoch": 0.23814742796535265, + "grad_norm": 6.911890506744385, + "learning_rate": 1.841688809166576e-05, + "loss": 1.7536, + "step": 37890 + }, + { + "epoch": 0.23821028028204977, + "grad_norm": 8.240942001342773, + "learning_rate": 1.8416468990721107e-05, + "loss": 2.0914, + "step": 37900 + }, + { + "epoch": 0.23827313259874688, + "grad_norm": 7.102033615112305, + "learning_rate": 1.8416049889776454e-05, + "loss": 1.9289, + "step": 37910 + }, + { + "epoch": 0.238335984915444, + "grad_norm": 7.021769046783447, + "learning_rate": 1.84156307888318e-05, + "loss": 1.9185, + "step": 37920 + }, + { + "epoch": 0.23839883723214111, + "grad_norm": 6.7171735763549805, + "learning_rate": 1.8415211687887145e-05, + "loss": 1.7612, + "step": 37930 + }, + { + "epoch": 0.2384616895488382, + "grad_norm": 8.630457878112793, + "learning_rate": 1.8414792586942492e-05, + "loss": 1.9459, + "step": 37940 + }, + { + "epoch": 0.23852454186553532, + "grad_norm": 7.017162322998047, + "learning_rate": 1.841437348599784e-05, + "loss": 1.9242, + "step": 37950 + }, + { + "epoch": 0.23858739418223243, + 
"grad_norm": 7.716609477996826, + "learning_rate": 1.8413954385053186e-05, + "loss": 1.9872, + "step": 37960 + }, + { + "epoch": 0.23865024649892955, + "grad_norm": 6.626461982727051, + "learning_rate": 1.8413535284108533e-05, + "loss": 1.9801, + "step": 37970 + }, + { + "epoch": 0.23871309881562666, + "grad_norm": 7.237145900726318, + "learning_rate": 1.841311618316388e-05, + "loss": 2.0975, + "step": 37980 + }, + { + "epoch": 0.23877595113232378, + "grad_norm": 7.1675705909729, + "learning_rate": 1.8412697082219224e-05, + "loss": 1.8958, + "step": 37990 + }, + { + "epoch": 0.2388388034490209, + "grad_norm": 7.57642936706543, + "learning_rate": 1.841227798127457e-05, + "loss": 1.6876, + "step": 38000 + }, + { + "epoch": 0.23890165576571798, + "grad_norm": 8.831753730773926, + "learning_rate": 1.8411858880329918e-05, + "loss": 2.1432, + "step": 38010 + }, + { + "epoch": 0.2389645080824151, + "grad_norm": 6.9499711990356445, + "learning_rate": 1.8411439779385265e-05, + "loss": 1.9944, + "step": 38020 + }, + { + "epoch": 0.2390273603991122, + "grad_norm": 7.618555545806885, + "learning_rate": 1.841102067844061e-05, + "loss": 2.0899, + "step": 38030 + }, + { + "epoch": 0.23909021271580932, + "grad_norm": 7.863029956817627, + "learning_rate": 1.8410601577495956e-05, + "loss": 2.0186, + "step": 38040 + }, + { + "epoch": 0.23915306503250644, + "grad_norm": 7.103366374969482, + "learning_rate": 1.8410182476551303e-05, + "loss": 1.8927, + "step": 38050 + }, + { + "epoch": 0.23921591734920356, + "grad_norm": 7.5608015060424805, + "learning_rate": 1.840976337560665e-05, + "loss": 1.9394, + "step": 38060 + }, + { + "epoch": 0.23927876966590064, + "grad_norm": 7.124314308166504, + "learning_rate": 1.8409344274661997e-05, + "loss": 2.0939, + "step": 38070 + }, + { + "epoch": 0.23934162198259776, + "grad_norm": 7.2767720222473145, + "learning_rate": 1.840892517371734e-05, + "loss": 1.84, + "step": 38080 + }, + { + "epoch": 0.23940447429929487, + "grad_norm": 7.567097187042236, + 
"learning_rate": 1.8408506072772688e-05, + "loss": 1.8521, + "step": 38090 + }, + { + "epoch": 0.239467326615992, + "grad_norm": 8.322702407836914, + "learning_rate": 1.8408086971828035e-05, + "loss": 1.9675, + "step": 38100 + }, + { + "epoch": 0.2395301789326891, + "grad_norm": 8.67414665222168, + "learning_rate": 1.8407667870883382e-05, + "loss": 1.946, + "step": 38110 + }, + { + "epoch": 0.23959303124938622, + "grad_norm": 6.918972015380859, + "learning_rate": 1.840724876993873e-05, + "loss": 2.0705, + "step": 38120 + }, + { + "epoch": 0.2396558835660833, + "grad_norm": 6.610874652862549, + "learning_rate": 1.8406829668994076e-05, + "loss": 1.9874, + "step": 38130 + }, + { + "epoch": 0.23971873588278042, + "grad_norm": 7.756814002990723, + "learning_rate": 1.8406410568049423e-05, + "loss": 1.9056, + "step": 38140 + }, + { + "epoch": 0.23978158819947754, + "grad_norm": 8.447721481323242, + "learning_rate": 1.840599146710477e-05, + "loss": 1.6769, + "step": 38150 + }, + { + "epoch": 0.23984444051617465, + "grad_norm": 6.847433567047119, + "learning_rate": 1.8405572366160114e-05, + "loss": 1.9969, + "step": 38160 + }, + { + "epoch": 0.23990729283287177, + "grad_norm": 6.875399112701416, + "learning_rate": 1.840515326521546e-05, + "loss": 1.82, + "step": 38170 + }, + { + "epoch": 0.23997014514956888, + "grad_norm": 7.013482093811035, + "learning_rate": 1.8404734164270808e-05, + "loss": 1.8327, + "step": 38180 + }, + { + "epoch": 0.240032997466266, + "grad_norm": 7.169978618621826, + "learning_rate": 1.8404315063326155e-05, + "loss": 2.073, + "step": 38190 + }, + { + "epoch": 0.24009584978296308, + "grad_norm": 8.269976615905762, + "learning_rate": 1.8403895962381502e-05, + "loss": 1.9683, + "step": 38200 + }, + { + "epoch": 0.2401587020996602, + "grad_norm": 7.200091361999512, + "learning_rate": 1.8403476861436846e-05, + "loss": 1.9096, + "step": 38210 + }, + { + "epoch": 0.2402215544163573, + "grad_norm": 7.73632287979126, + "learning_rate": 1.8403057760492193e-05, 
+ "loss": 2.2556, + "step": 38220 + }, + { + "epoch": 0.24028440673305443, + "grad_norm": 6.895431041717529, + "learning_rate": 1.840263865954754e-05, + "loss": 2.0842, + "step": 38230 + }, + { + "epoch": 0.24034725904975154, + "grad_norm": 7.235763072967529, + "learning_rate": 1.8402219558602887e-05, + "loss": 1.9744, + "step": 38240 + }, + { + "epoch": 0.24041011136644866, + "grad_norm": 5.841657638549805, + "learning_rate": 1.840180045765823e-05, + "loss": 1.9609, + "step": 38250 + }, + { + "epoch": 0.24047296368314575, + "grad_norm": 6.5870490074157715, + "learning_rate": 1.8401381356713578e-05, + "loss": 1.973, + "step": 38260 + }, + { + "epoch": 0.24053581599984286, + "grad_norm": 8.151089668273926, + "learning_rate": 1.8400962255768925e-05, + "loss": 1.812, + "step": 38270 + }, + { + "epoch": 0.24059866831653998, + "grad_norm": 5.663036346435547, + "learning_rate": 1.8400543154824272e-05, + "loss": 1.8347, + "step": 38280 + }, + { + "epoch": 0.2406615206332371, + "grad_norm": 7.262301921844482, + "learning_rate": 1.840012405387962e-05, + "loss": 1.792, + "step": 38290 + }, + { + "epoch": 0.2407243729499342, + "grad_norm": 7.873507499694824, + "learning_rate": 1.8399704952934966e-05, + "loss": 2.061, + "step": 38300 + }, + { + "epoch": 0.24078722526663132, + "grad_norm": 6.455713272094727, + "learning_rate": 1.839928585199031e-05, + "loss": 1.8353, + "step": 38310 + }, + { + "epoch": 0.24085007758332844, + "grad_norm": 7.31044864654541, + "learning_rate": 1.8398866751045657e-05, + "loss": 1.7257, + "step": 38320 + }, + { + "epoch": 0.24091292990002552, + "grad_norm": 7.6389265060424805, + "learning_rate": 1.8398447650101004e-05, + "loss": 1.9959, + "step": 38330 + }, + { + "epoch": 0.24097578221672264, + "grad_norm": 7.888726711273193, + "learning_rate": 1.839802854915635e-05, + "loss": 1.8684, + "step": 38340 + }, + { + "epoch": 0.24103863453341975, + "grad_norm": 7.7473673820495605, + "learning_rate": 1.83976094482117e-05, + "loss": 1.9824, + "step": 38350 
+ }, + { + "epoch": 0.24110148685011687, + "grad_norm": 7.208690643310547, + "learning_rate": 1.8397190347267045e-05, + "loss": 2.0381, + "step": 38360 + }, + { + "epoch": 0.24116433916681398, + "grad_norm": 6.633978366851807, + "learning_rate": 1.8396771246322392e-05, + "loss": 1.9413, + "step": 38370 + }, + { + "epoch": 0.2412271914835111, + "grad_norm": 5.579523086547852, + "learning_rate": 1.839635214537774e-05, + "loss": 1.7466, + "step": 38380 + }, + { + "epoch": 0.2412900438002082, + "grad_norm": 6.117504596710205, + "learning_rate": 1.8395933044433083e-05, + "loss": 1.8218, + "step": 38390 + }, + { + "epoch": 0.2413528961169053, + "grad_norm": 7.406382083892822, + "learning_rate": 1.839551394348843e-05, + "loss": 1.9613, + "step": 38400 + }, + { + "epoch": 0.24141574843360242, + "grad_norm": 6.532168865203857, + "learning_rate": 1.8395094842543777e-05, + "loss": 1.7304, + "step": 38410 + }, + { + "epoch": 0.24147860075029953, + "grad_norm": 7.247741222381592, + "learning_rate": 1.8394675741599124e-05, + "loss": 2.0489, + "step": 38420 + }, + { + "epoch": 0.24154145306699665, + "grad_norm": 6.924738883972168, + "learning_rate": 1.8394256640654468e-05, + "loss": 1.8917, + "step": 38430 + }, + { + "epoch": 0.24160430538369376, + "grad_norm": 8.146101951599121, + "learning_rate": 1.8393837539709815e-05, + "loss": 1.9852, + "step": 38440 + }, + { + "epoch": 0.24166715770039085, + "grad_norm": 6.957603931427002, + "learning_rate": 1.8393418438765162e-05, + "loss": 2.0234, + "step": 38450 + }, + { + "epoch": 0.24173001001708797, + "grad_norm": 7.829459190368652, + "learning_rate": 1.839299933782051e-05, + "loss": 2.0043, + "step": 38460 + }, + { + "epoch": 0.24179286233378508, + "grad_norm": 6.800477981567383, + "learning_rate": 1.8392580236875853e-05, + "loss": 2.0715, + "step": 38470 + }, + { + "epoch": 0.2418557146504822, + "grad_norm": 7.800566673278809, + "learning_rate": 1.83921611359312e-05, + "loss": 2.0213, + "step": 38480 + }, + { + "epoch": 
0.2419185669671793, + "grad_norm": 7.374375820159912, + "learning_rate": 1.8391742034986547e-05, + "loss": 1.8844, + "step": 38490 + }, + { + "epoch": 0.24198141928387643, + "grad_norm": 9.049388885498047, + "learning_rate": 1.8391322934041894e-05, + "loss": 2.2025, + "step": 38500 + }, + { + "epoch": 0.24204427160057354, + "grad_norm": 7.550909996032715, + "learning_rate": 1.839090383309724e-05, + "loss": 1.9928, + "step": 38510 + }, + { + "epoch": 0.24210712391727063, + "grad_norm": 6.672713279724121, + "learning_rate": 1.839048473215259e-05, + "loss": 1.7213, + "step": 38520 + }, + { + "epoch": 0.24216997623396774, + "grad_norm": 7.553778648376465, + "learning_rate": 1.8390065631207935e-05, + "loss": 1.8934, + "step": 38530 + }, + { + "epoch": 0.24223282855066486, + "grad_norm": 8.76152229309082, + "learning_rate": 1.8389646530263283e-05, + "loss": 1.9237, + "step": 38540 + }, + { + "epoch": 0.24229568086736197, + "grad_norm": 6.914309978485107, + "learning_rate": 1.838922742931863e-05, + "loss": 1.9673, + "step": 38550 + }, + { + "epoch": 0.2423585331840591, + "grad_norm": 7.236188888549805, + "learning_rate": 1.8388808328373973e-05, + "loss": 1.798, + "step": 38560 + }, + { + "epoch": 0.2424213855007562, + "grad_norm": 7.777528762817383, + "learning_rate": 1.838838922742932e-05, + "loss": 1.8949, + "step": 38570 + }, + { + "epoch": 0.2424842378174533, + "grad_norm": 8.247822761535645, + "learning_rate": 1.8387970126484667e-05, + "loss": 2.0834, + "step": 38580 + }, + { + "epoch": 0.2425470901341504, + "grad_norm": 8.609414100646973, + "learning_rate": 1.8387551025540014e-05, + "loss": 1.8722, + "step": 38590 + }, + { + "epoch": 0.24260994245084752, + "grad_norm": 7.682502746582031, + "learning_rate": 1.838713192459536e-05, + "loss": 1.9986, + "step": 38600 + }, + { + "epoch": 0.24267279476754464, + "grad_norm": 8.470084190368652, + "learning_rate": 1.8386712823650705e-05, + "loss": 2.0193, + "step": 38610 + }, + { + "epoch": 0.24273564708424175, + "grad_norm": 
7.982653617858887, + "learning_rate": 1.8386293722706052e-05, + "loss": 1.9688, + "step": 38620 + }, + { + "epoch": 0.24279849940093887, + "grad_norm": 7.206417560577393, + "learning_rate": 1.83858746217614e-05, + "loss": 1.7714, + "step": 38630 + }, + { + "epoch": 0.24286135171763595, + "grad_norm": 7.726312160491943, + "learning_rate": 1.8385455520816746e-05, + "loss": 1.9045, + "step": 38640 + }, + { + "epoch": 0.24292420403433307, + "grad_norm": 5.919642448425293, + "learning_rate": 1.838503641987209e-05, + "loss": 1.9664, + "step": 38650 + }, + { + "epoch": 0.24298705635103018, + "grad_norm": 7.361550807952881, + "learning_rate": 1.8384617318927437e-05, + "loss": 2.1269, + "step": 38660 + }, + { + "epoch": 0.2430499086677273, + "grad_norm": 7.250823497772217, + "learning_rate": 1.8384198217982784e-05, + "loss": 1.7964, + "step": 38670 + }, + { + "epoch": 0.24311276098442441, + "grad_norm": 7.15161657333374, + "learning_rate": 1.8383821027132596e-05, + "loss": 2.0504, + "step": 38680 + }, + { + "epoch": 0.24317561330112153, + "grad_norm": 7.380502223968506, + "learning_rate": 1.8383401926187943e-05, + "loss": 1.8924, + "step": 38690 + }, + { + "epoch": 0.24323846561781864, + "grad_norm": 8.092615127563477, + "learning_rate": 1.838298282524329e-05, + "loss": 1.9202, + "step": 38700 + }, + { + "epoch": 0.24330131793451573, + "grad_norm": 6.767857074737549, + "learning_rate": 1.8382563724298637e-05, + "loss": 2.0366, + "step": 38710 + }, + { + "epoch": 0.24336417025121285, + "grad_norm": 6.968144416809082, + "learning_rate": 1.8382144623353984e-05, + "loss": 1.8625, + "step": 38720 + }, + { + "epoch": 0.24342702256790996, + "grad_norm": 5.415788650512695, + "learning_rate": 1.8381725522409328e-05, + "loss": 1.6783, + "step": 38730 + }, + { + "epoch": 0.24348987488460708, + "grad_norm": 7.07074499130249, + "learning_rate": 1.8381306421464675e-05, + "loss": 2.0826, + "step": 38740 + }, + { + "epoch": 0.2435527272013042, + "grad_norm": 6.802286624908447, + 
"learning_rate": 1.8380887320520022e-05, + "loss": 1.9283, + "step": 38750 + }, + { + "epoch": 0.2436155795180013, + "grad_norm": 6.098469257354736, + "learning_rate": 1.838046821957537e-05, + "loss": 1.8083, + "step": 38760 + }, + { + "epoch": 0.2436784318346984, + "grad_norm": 6.389626979827881, + "learning_rate": 1.8380049118630712e-05, + "loss": 1.9582, + "step": 38770 + }, + { + "epoch": 0.2437412841513955, + "grad_norm": 5.906976222991943, + "learning_rate": 1.837963001768606e-05, + "loss": 2.0481, + "step": 38780 + }, + { + "epoch": 0.24380413646809262, + "grad_norm": 7.177793025970459, + "learning_rate": 1.8379210916741407e-05, + "loss": 1.9249, + "step": 38790 + }, + { + "epoch": 0.24386698878478974, + "grad_norm": 7.2348198890686035, + "learning_rate": 1.8378791815796754e-05, + "loss": 1.8801, + "step": 38800 + }, + { + "epoch": 0.24392984110148686, + "grad_norm": 6.951122283935547, + "learning_rate": 1.83783727148521e-05, + "loss": 1.9686, + "step": 38810 + }, + { + "epoch": 0.24399269341818397, + "grad_norm": 7.613935470581055, + "learning_rate": 1.8377953613907448e-05, + "loss": 2.0456, + "step": 38820 + }, + { + "epoch": 0.24405554573488109, + "grad_norm": 6.578233242034912, + "learning_rate": 1.8377534512962795e-05, + "loss": 1.9367, + "step": 38830 + }, + { + "epoch": 0.24411839805157817, + "grad_norm": 7.682235240936279, + "learning_rate": 1.8377115412018142e-05, + "loss": 1.6629, + "step": 38840 + }, + { + "epoch": 0.2441812503682753, + "grad_norm": 6.999513149261475, + "learning_rate": 1.837669631107349e-05, + "loss": 2.0293, + "step": 38850 + }, + { + "epoch": 0.2442441026849724, + "grad_norm": 7.855412483215332, + "learning_rate": 1.8376277210128833e-05, + "loss": 1.9867, + "step": 38860 + }, + { + "epoch": 0.24430695500166952, + "grad_norm": 6.694448947906494, + "learning_rate": 1.837585810918418e-05, + "loss": 1.8862, + "step": 38870 + }, + { + "epoch": 0.24436980731836663, + "grad_norm": 6.943238735198975, + "learning_rate": 
1.8375439008239527e-05, + "loss": 1.8387, + "step": 38880 + }, + { + "epoch": 0.24443265963506375, + "grad_norm": 7.439672470092773, + "learning_rate": 1.8375019907294874e-05, + "loss": 1.9138, + "step": 38890 + }, + { + "epoch": 0.24449551195176084, + "grad_norm": 6.86573600769043, + "learning_rate": 1.8374600806350218e-05, + "loss": 1.945, + "step": 38900 + }, + { + "epoch": 0.24455836426845795, + "grad_norm": 6.689842224121094, + "learning_rate": 1.8374181705405565e-05, + "loss": 1.7859, + "step": 38910 + }, + { + "epoch": 0.24462121658515507, + "grad_norm": 6.846593379974365, + "learning_rate": 1.8373762604460912e-05, + "loss": 1.9778, + "step": 38920 + }, + { + "epoch": 0.24468406890185218, + "grad_norm": 7.001317024230957, + "learning_rate": 1.837334350351626e-05, + "loss": 1.909, + "step": 38930 + }, + { + "epoch": 0.2447469212185493, + "grad_norm": 7.235071659088135, + "learning_rate": 1.8372924402571606e-05, + "loss": 1.7714, + "step": 38940 + }, + { + "epoch": 0.2448097735352464, + "grad_norm": 7.201655387878418, + "learning_rate": 1.837250530162695e-05, + "loss": 2.0205, + "step": 38950 + }, + { + "epoch": 0.2448726258519435, + "grad_norm": 5.981378555297852, + "learning_rate": 1.8372086200682297e-05, + "loss": 1.7683, + "step": 38960 + }, + { + "epoch": 0.2449354781686406, + "grad_norm": 6.676976203918457, + "learning_rate": 1.8371667099737644e-05, + "loss": 2.077, + "step": 38970 + }, + { + "epoch": 0.24499833048533773, + "grad_norm": 6.120922565460205, + "learning_rate": 1.837124799879299e-05, + "loss": 1.8403, + "step": 38980 + }, + { + "epoch": 0.24506118280203484, + "grad_norm": 6.761012077331543, + "learning_rate": 1.8370828897848338e-05, + "loss": 1.765, + "step": 38990 + }, + { + "epoch": 0.24512403511873196, + "grad_norm": 7.727017402648926, + "learning_rate": 1.837040979690368e-05, + "loss": 1.817, + "step": 39000 + }, + { + "epoch": 0.24518688743542907, + "grad_norm": 7.75016975402832, + "learning_rate": 1.836999069595903e-05, + "loss": 
1.7469, + "step": 39010 + }, + { + "epoch": 0.2452497397521262, + "grad_norm": 7.089818477630615, + "learning_rate": 1.8369571595014376e-05, + "loss": 1.8925, + "step": 39020 + }, + { + "epoch": 0.24531259206882328, + "grad_norm": 6.474025726318359, + "learning_rate": 1.8369152494069723e-05, + "loss": 1.7677, + "step": 39030 + }, + { + "epoch": 0.2453754443855204, + "grad_norm": 7.904554843902588, + "learning_rate": 1.836873339312507e-05, + "loss": 1.9615, + "step": 39040 + }, + { + "epoch": 0.2454382967022175, + "grad_norm": 6.933401584625244, + "learning_rate": 1.8368314292180417e-05, + "loss": 1.9095, + "step": 39050 + }, + { + "epoch": 0.24550114901891462, + "grad_norm": 6.717565059661865, + "learning_rate": 1.8367895191235764e-05, + "loss": 1.8879, + "step": 39060 + }, + { + "epoch": 0.24556400133561174, + "grad_norm": 7.307586193084717, + "learning_rate": 1.836747609029111e-05, + "loss": 1.8777, + "step": 39070 + }, + { + "epoch": 0.24562685365230885, + "grad_norm": 8.712448120117188, + "learning_rate": 1.8367056989346455e-05, + "loss": 1.832, + "step": 39080 + }, + { + "epoch": 0.24568970596900594, + "grad_norm": 8.435969352722168, + "learning_rate": 1.8366637888401802e-05, + "loss": 1.9329, + "step": 39090 + }, + { + "epoch": 0.24575255828570305, + "grad_norm": 8.680069923400879, + "learning_rate": 1.836621878745715e-05, + "loss": 2.0208, + "step": 39100 + }, + { + "epoch": 0.24581541060240017, + "grad_norm": 8.355234146118164, + "learning_rate": 1.8365799686512496e-05, + "loss": 1.8571, + "step": 39110 + }, + { + "epoch": 0.24587826291909728, + "grad_norm": 8.172979354858398, + "learning_rate": 1.8365380585567843e-05, + "loss": 1.2727, + "step": 39120 + }, + { + "epoch": 0.2459411152357944, + "grad_norm": 7.647946834564209, + "learning_rate": 1.8364961484623187e-05, + "loss": 1.9499, + "step": 39130 + }, + { + "epoch": 0.24600396755249151, + "grad_norm": 6.790382385253906, + "learning_rate": 1.8364542383678534e-05, + "loss": 1.7528, + "step": 39140 + }, + 
{ + "epoch": 0.2460668198691886, + "grad_norm": 6.446195125579834, + "learning_rate": 1.836412328273388e-05, + "loss": 1.7171, + "step": 39150 + }, + { + "epoch": 0.24612967218588572, + "grad_norm": 7.8714776039123535, + "learning_rate": 1.8363704181789228e-05, + "loss": 2.0022, + "step": 39160 + }, + { + "epoch": 0.24619252450258283, + "grad_norm": 7.599400997161865, + "learning_rate": 1.836328508084457e-05, + "loss": 1.9062, + "step": 39170 + }, + { + "epoch": 0.24625537681927995, + "grad_norm": 7.171163082122803, + "learning_rate": 1.836286597989992e-05, + "loss": 1.9292, + "step": 39180 + }, + { + "epoch": 0.24631822913597706, + "grad_norm": 6.78662633895874, + "learning_rate": 1.8362446878955266e-05, + "loss": 2.055, + "step": 39190 + }, + { + "epoch": 0.24638108145267418, + "grad_norm": 6.9192609786987305, + "learning_rate": 1.8362027778010613e-05, + "loss": 1.7286, + "step": 39200 + }, + { + "epoch": 0.2464439337693713, + "grad_norm": 6.885414123535156, + "learning_rate": 1.836160867706596e-05, + "loss": 1.9249, + "step": 39210 + }, + { + "epoch": 0.24650678608606838, + "grad_norm": 7.425045013427734, + "learning_rate": 1.8361189576121307e-05, + "loss": 1.8222, + "step": 39220 + }, + { + "epoch": 0.2465696384027655, + "grad_norm": 7.735393524169922, + "learning_rate": 1.8360770475176654e-05, + "loss": 2.0101, + "step": 39230 + }, + { + "epoch": 0.2466324907194626, + "grad_norm": 7.208608627319336, + "learning_rate": 1.8360351374232e-05, + "loss": 1.8489, + "step": 39240 + }, + { + "epoch": 0.24669534303615973, + "grad_norm": 6.560624122619629, + "learning_rate": 1.8359932273287345e-05, + "loss": 2.0849, + "step": 39250 + }, + { + "epoch": 0.24675819535285684, + "grad_norm": 7.286542892456055, + "learning_rate": 1.8359513172342692e-05, + "loss": 1.9742, + "step": 39260 + }, + { + "epoch": 0.24682104766955396, + "grad_norm": 6.513575553894043, + "learning_rate": 1.835909407139804e-05, + "loss": 1.8736, + "step": 39270 + }, + { + "epoch": 0.24688389998625104, + 
"grad_norm": 7.486867427825928, + "learning_rate": 1.8358674970453386e-05, + "loss": 1.903, + "step": 39280 + }, + { + "epoch": 0.24694675230294816, + "grad_norm": 7.773925304412842, + "learning_rate": 1.8358255869508733e-05, + "loss": 1.7813, + "step": 39290 + }, + { + "epoch": 0.24700960461964527, + "grad_norm": 7.760753154754639, + "learning_rate": 1.8357836768564077e-05, + "loss": 1.7484, + "step": 39300 + }, + { + "epoch": 0.2470724569363424, + "grad_norm": 7.225697040557861, + "learning_rate": 1.8357417667619424e-05, + "loss": 1.8298, + "step": 39310 + }, + { + "epoch": 0.2471353092530395, + "grad_norm": 7.341373443603516, + "learning_rate": 1.835699856667477e-05, + "loss": 1.9556, + "step": 39320 + }, + { + "epoch": 0.24719816156973662, + "grad_norm": 7.047973155975342, + "learning_rate": 1.8356579465730118e-05, + "loss": 1.7168, + "step": 39330 + }, + { + "epoch": 0.24726101388643373, + "grad_norm": 7.430441856384277, + "learning_rate": 1.8356160364785465e-05, + "loss": 1.7909, + "step": 39340 + }, + { + "epoch": 0.24732386620313082, + "grad_norm": 7.2714643478393555, + "learning_rate": 1.835574126384081e-05, + "loss": 2.1455, + "step": 39350 + }, + { + "epoch": 0.24738671851982794, + "grad_norm": 6.922287464141846, + "learning_rate": 1.8355322162896156e-05, + "loss": 1.9391, + "step": 39360 + }, + { + "epoch": 0.24744957083652505, + "grad_norm": 6.799683570861816, + "learning_rate": 1.8354903061951503e-05, + "loss": 1.8829, + "step": 39370 + }, + { + "epoch": 0.24751242315322217, + "grad_norm": 5.420069217681885, + "learning_rate": 1.835448396100685e-05, + "loss": 1.7586, + "step": 39380 + }, + { + "epoch": 0.24757527546991928, + "grad_norm": 7.595252990722656, + "learning_rate": 1.8354064860062194e-05, + "loss": 1.8475, + "step": 39390 + }, + { + "epoch": 0.2476381277866164, + "grad_norm": 7.183959484100342, + "learning_rate": 1.835364575911754e-05, + "loss": 2.0457, + "step": 39400 + }, + { + "epoch": 0.24770098010331348, + "grad_norm": 
7.531284332275391, + "learning_rate": 1.8353226658172888e-05, + "loss": 1.8284, + "step": 39410 + }, + { + "epoch": 0.2477638324200106, + "grad_norm": 7.272885799407959, + "learning_rate": 1.8352807557228235e-05, + "loss": 2.0378, + "step": 39420 + }, + { + "epoch": 0.24782668473670771, + "grad_norm": 6.173305511474609, + "learning_rate": 1.8352388456283582e-05, + "loss": 1.9055, + "step": 39430 + }, + { + "epoch": 0.24788953705340483, + "grad_norm": 7.7378692626953125, + "learning_rate": 1.835196935533893e-05, + "loss": 2.0607, + "step": 39440 + }, + { + "epoch": 0.24795238937010194, + "grad_norm": 7.691354274749756, + "learning_rate": 1.8351550254394276e-05, + "loss": 1.7121, + "step": 39450 + }, + { + "epoch": 0.24801524168679906, + "grad_norm": 6.638065814971924, + "learning_rate": 1.8351131153449623e-05, + "loss": 1.8401, + "step": 39460 + }, + { + "epoch": 0.24807809400349615, + "grad_norm": 7.033689498901367, + "learning_rate": 1.835071205250497e-05, + "loss": 1.9848, + "step": 39470 + }, + { + "epoch": 0.24814094632019326, + "grad_norm": 6.727336406707764, + "learning_rate": 1.8350292951560314e-05, + "loss": 1.8813, + "step": 39480 + }, + { + "epoch": 0.24820379863689038, + "grad_norm": 7.711666584014893, + "learning_rate": 1.834987385061566e-05, + "loss": 1.8474, + "step": 39490 + }, + { + "epoch": 0.2482666509535875, + "grad_norm": 8.10097599029541, + "learning_rate": 1.8349454749671008e-05, + "loss": 1.8589, + "step": 39500 + }, + { + "epoch": 0.2483295032702846, + "grad_norm": 7.868946075439453, + "learning_rate": 1.8349035648726355e-05, + "loss": 1.7973, + "step": 39510 + }, + { + "epoch": 0.24839235558698172, + "grad_norm": 8.510309219360352, + "learning_rate": 1.83486165477817e-05, + "loss": 1.6622, + "step": 39520 + }, + { + "epoch": 0.24845520790367884, + "grad_norm": 5.8302764892578125, + "learning_rate": 1.8348197446837046e-05, + "loss": 1.7117, + "step": 39530 + }, + { + "epoch": 0.24851806022037592, + "grad_norm": 6.924520969390869, + 
"learning_rate": 1.8347778345892393e-05, + "loss": 1.6787, + "step": 39540 + }, + { + "epoch": 0.24858091253707304, + "grad_norm": 7.145519256591797, + "learning_rate": 1.834735924494774e-05, + "loss": 1.8117, + "step": 39550 + }, + { + "epoch": 0.24864376485377016, + "grad_norm": 7.44929313659668, + "learning_rate": 1.8346940144003087e-05, + "loss": 1.9223, + "step": 39560 + }, + { + "epoch": 0.24870661717046727, + "grad_norm": 7.225125789642334, + "learning_rate": 1.834652104305843e-05, + "loss": 1.8719, + "step": 39570 + }, + { + "epoch": 0.24876946948716439, + "grad_norm": 7.701131343841553, + "learning_rate": 1.8346101942113778e-05, + "loss": 2.0049, + "step": 39580 + }, + { + "epoch": 0.2488323218038615, + "grad_norm": 6.39308500289917, + "learning_rate": 1.8345682841169125e-05, + "loss": 1.7611, + "step": 39590 + }, + { + "epoch": 0.2488951741205586, + "grad_norm": 6.422939777374268, + "learning_rate": 1.8345263740224472e-05, + "loss": 1.8618, + "step": 39600 + }, + { + "epoch": 0.2489580264372557, + "grad_norm": 6.512946605682373, + "learning_rate": 1.834484463927982e-05, + "loss": 1.8978, + "step": 39610 + }, + { + "epoch": 0.24902087875395282, + "grad_norm": 7.747503757476807, + "learning_rate": 1.8344425538335166e-05, + "loss": 1.6621, + "step": 39620 + }, + { + "epoch": 0.24908373107064993, + "grad_norm": 7.341767311096191, + "learning_rate": 1.834400643739051e-05, + "loss": 1.8461, + "step": 39630 + }, + { + "epoch": 0.24914658338734705, + "grad_norm": 8.208362579345703, + "learning_rate": 1.8343587336445857e-05, + "loss": 2.0456, + "step": 39640 + }, + { + "epoch": 0.24920943570404416, + "grad_norm": 7.730144023895264, + "learning_rate": 1.8343168235501204e-05, + "loss": 1.9841, + "step": 39650 + }, + { + "epoch": 0.24927228802074128, + "grad_norm": 7.366255760192871, + "learning_rate": 1.834274913455655e-05, + "loss": 1.8643, + "step": 39660 + }, + { + "epoch": 0.24933514033743837, + "grad_norm": 7.58958101272583, + "learning_rate": 
1.8342330033611898e-05, + "loss": 2.0262, + "step": 39670 + }, + { + "epoch": 0.24939799265413548, + "grad_norm": 8.771005630493164, + "learning_rate": 1.8341910932667245e-05, + "loss": 2.0699, + "step": 39680 + }, + { + "epoch": 0.2494608449708326, + "grad_norm": 6.679497718811035, + "learning_rate": 1.8341491831722592e-05, + "loss": 2.0086, + "step": 39690 + }, + { + "epoch": 0.2495236972875297, + "grad_norm": 8.130735397338867, + "learning_rate": 1.8341072730777936e-05, + "loss": 1.9688, + "step": 39700 + }, + { + "epoch": 0.24958654960422683, + "grad_norm": 6.9749603271484375, + "learning_rate": 1.8340653629833283e-05, + "loss": 1.6664, + "step": 39710 + }, + { + "epoch": 0.24964940192092394, + "grad_norm": 7.334023475646973, + "learning_rate": 1.834023452888863e-05, + "loss": 2.0066, + "step": 39720 + }, + { + "epoch": 0.24971225423762103, + "grad_norm": 8.127015113830566, + "learning_rate": 1.8339815427943977e-05, + "loss": 1.9115, + "step": 39730 + }, + { + "epoch": 0.24977510655431814, + "grad_norm": 7.029850006103516, + "learning_rate": 1.8339396326999324e-05, + "loss": 1.8836, + "step": 39740 + }, + { + "epoch": 0.24983795887101526, + "grad_norm": 7.777439594268799, + "learning_rate": 1.8338977226054668e-05, + "loss": 1.9276, + "step": 39750 + }, + { + "epoch": 0.24990081118771237, + "grad_norm": 6.383263111114502, + "learning_rate": 1.8338558125110015e-05, + "loss": 1.8764, + "step": 39760 + }, + { + "epoch": 0.2499636635044095, + "grad_norm": 7.595082759857178, + "learning_rate": 1.8338139024165362e-05, + "loss": 1.8711, + "step": 39770 + }, + { + "epoch": 0.2500265158211066, + "grad_norm": 6.532485008239746, + "learning_rate": 1.833771992322071e-05, + "loss": 1.8727, + "step": 39780 + }, + { + "epoch": 0.2500893681378037, + "grad_norm": 6.619588851928711, + "learning_rate": 1.8337300822276053e-05, + "loss": 1.7837, + "step": 39790 + }, + { + "epoch": 0.2501522204545008, + "grad_norm": 7.122777462005615, + "learning_rate": 1.83368817213314e-05, + 
"loss": 1.9078, + "step": 39800 + }, + { + "epoch": 0.2502150727711979, + "grad_norm": 6.726501941680908, + "learning_rate": 1.8336462620386747e-05, + "loss": 1.9333, + "step": 39810 + }, + { + "epoch": 0.25027792508789504, + "grad_norm": 8.014927864074707, + "learning_rate": 1.8336043519442094e-05, + "loss": 1.8131, + "step": 39820 + }, + { + "epoch": 0.25034077740459215, + "grad_norm": 6.615452766418457, + "learning_rate": 1.833562441849744e-05, + "loss": 1.9475, + "step": 39830 + }, + { + "epoch": 0.25040362972128927, + "grad_norm": 7.125904560089111, + "learning_rate": 1.8335205317552788e-05, + "loss": 1.842, + "step": 39840 + }, + { + "epoch": 0.2504664820379864, + "grad_norm": 7.295060157775879, + "learning_rate": 1.8334786216608135e-05, + "loss": 1.7838, + "step": 39850 + }, + { + "epoch": 0.2505293343546835, + "grad_norm": 7.834619522094727, + "learning_rate": 1.8334367115663482e-05, + "loss": 1.7631, + "step": 39860 + }, + { + "epoch": 0.2505921866713806, + "grad_norm": 6.960846900939941, + "learning_rate": 1.833394801471883e-05, + "loss": 1.9703, + "step": 39870 + }, + { + "epoch": 0.2506550389880777, + "grad_norm": 8.08946418762207, + "learning_rate": 1.8333528913774173e-05, + "loss": 1.9355, + "step": 39880 + }, + { + "epoch": 0.2507178913047748, + "grad_norm": 8.536508560180664, + "learning_rate": 1.833310981282952e-05, + "loss": 2.1335, + "step": 39890 + }, + { + "epoch": 0.2507807436214719, + "grad_norm": 7.6163506507873535, + "learning_rate": 1.8332690711884867e-05, + "loss": 1.9276, + "step": 39900 + }, + { + "epoch": 0.250843595938169, + "grad_norm": 8.61902141571045, + "learning_rate": 1.8332271610940214e-05, + "loss": 1.8752, + "step": 39910 + }, + { + "epoch": 0.25090644825486613, + "grad_norm": 7.387242317199707, + "learning_rate": 1.8331852509995558e-05, + "loss": 2.0766, + "step": 39920 + }, + { + "epoch": 0.25096930057156325, + "grad_norm": 5.936997890472412, + "learning_rate": 1.8331433409050905e-05, + "loss": 1.9449, + "step": 39930 + }, 
+ { + "epoch": 0.25103215288826036, + "grad_norm": 7.217583179473877, + "learning_rate": 1.8331014308106252e-05, + "loss": 1.9657, + "step": 39940 + }, + { + "epoch": 0.2510950052049575, + "grad_norm": 6.556284427642822, + "learning_rate": 1.83305952071616e-05, + "loss": 1.9077, + "step": 39950 + }, + { + "epoch": 0.2511578575216546, + "grad_norm": 7.746654033660889, + "learning_rate": 1.8330176106216946e-05, + "loss": 1.9202, + "step": 39960 + }, + { + "epoch": 0.2512207098383517, + "grad_norm": 7.003026962280273, + "learning_rate": 1.832975700527229e-05, + "loss": 1.9193, + "step": 39970 + }, + { + "epoch": 0.2512835621550488, + "grad_norm": 7.067366600036621, + "learning_rate": 1.8329337904327637e-05, + "loss": 1.9311, + "step": 39980 + }, + { + "epoch": 0.25134641447174594, + "grad_norm": 8.074548721313477, + "learning_rate": 1.8328918803382984e-05, + "loss": 1.8689, + "step": 39990 + }, + { + "epoch": 0.25140926678844305, + "grad_norm": 8.37764835357666, + "learning_rate": 1.832849970243833e-05, + "loss": 1.9689, + "step": 40000 + }, + { + "epoch": 0.25147211910514017, + "grad_norm": 7.969852924346924, + "learning_rate": 1.8328080601493675e-05, + "loss": 2.1185, + "step": 40010 + }, + { + "epoch": 0.25153497142183723, + "grad_norm": 7.347843170166016, + "learning_rate": 1.8327661500549022e-05, + "loss": 1.6484, + "step": 40020 + }, + { + "epoch": 0.25159782373853434, + "grad_norm": 7.6527814865112305, + "learning_rate": 1.832724239960437e-05, + "loss": 1.9171, + "step": 40030 + }, + { + "epoch": 0.25166067605523146, + "grad_norm": 6.635735034942627, + "learning_rate": 1.8326823298659716e-05, + "loss": 1.6042, + "step": 40040 + }, + { + "epoch": 0.2517235283719286, + "grad_norm": 5.760181427001953, + "learning_rate": 1.8326404197715063e-05, + "loss": 2.0133, + "step": 40050 + }, + { + "epoch": 0.2517863806886257, + "grad_norm": 6.241506576538086, + "learning_rate": 1.832598509677041e-05, + "loss": 1.815, + "step": 40060 + }, + { + "epoch": 0.2518492330053228, + 
"grad_norm": 6.897012710571289, + "learning_rate": 1.8325565995825757e-05, + "loss": 1.9044, + "step": 40070 + }, + { + "epoch": 0.2519120853220199, + "grad_norm": 5.902112007141113, + "learning_rate": 1.8325146894881104e-05, + "loss": 1.7641, + "step": 40080 + }, + { + "epoch": 0.25197493763871703, + "grad_norm": 7.367774486541748, + "learning_rate": 1.832472779393645e-05, + "loss": 1.9469, + "step": 40090 + }, + { + "epoch": 0.25203778995541415, + "grad_norm": 6.922102928161621, + "learning_rate": 1.8324308692991795e-05, + "loss": 1.9259, + "step": 40100 + }, + { + "epoch": 0.25210064227211126, + "grad_norm": 7.971136569976807, + "learning_rate": 1.8323889592047142e-05, + "loss": 1.8942, + "step": 40110 + }, + { + "epoch": 0.2521634945888084, + "grad_norm": 6.794959545135498, + "learning_rate": 1.832347049110249e-05, + "loss": 2.0783, + "step": 40120 + }, + { + "epoch": 0.2522263469055055, + "grad_norm": 7.142230033874512, + "learning_rate": 1.8323051390157836e-05, + "loss": 1.9941, + "step": 40130 + }, + { + "epoch": 0.25228919922220255, + "grad_norm": 7.810637950897217, + "learning_rate": 1.8322632289213183e-05, + "loss": 1.9061, + "step": 40140 + }, + { + "epoch": 0.25235205153889967, + "grad_norm": 8.566405296325684, + "learning_rate": 1.8322213188268527e-05, + "loss": 1.9135, + "step": 40150 + }, + { + "epoch": 0.2524149038555968, + "grad_norm": 7.335825443267822, + "learning_rate": 1.8321794087323874e-05, + "loss": 1.9056, + "step": 40160 + }, + { + "epoch": 0.2524777561722939, + "grad_norm": 6.504067420959473, + "learning_rate": 1.832137498637922e-05, + "loss": 1.8662, + "step": 40170 + }, + { + "epoch": 0.252540608488991, + "grad_norm": 6.566962718963623, + "learning_rate": 1.8320955885434568e-05, + "loss": 2.0712, + "step": 40180 + }, + { + "epoch": 0.25260346080568813, + "grad_norm": 7.839852809906006, + "learning_rate": 1.8320536784489912e-05, + "loss": 1.9855, + "step": 40190 + }, + { + "epoch": 0.25266631312238524, + "grad_norm": 8.374872207641602, + 
"learning_rate": 1.832011768354526e-05, + "loss": 1.9443, + "step": 40200 + }, + { + "epoch": 0.25272916543908236, + "grad_norm": 7.07660436630249, + "learning_rate": 1.8319698582600606e-05, + "loss": 1.8826, + "step": 40210 + }, + { + "epoch": 0.2527920177557795, + "grad_norm": 7.191566467285156, + "learning_rate": 1.8319279481655953e-05, + "loss": 1.959, + "step": 40220 + }, + { + "epoch": 0.2528548700724766, + "grad_norm": 7.686609268188477, + "learning_rate": 1.83188603807113e-05, + "loss": 1.9148, + "step": 40230 + }, + { + "epoch": 0.2529177223891737, + "grad_norm": 6.395354747772217, + "learning_rate": 1.8318441279766647e-05, + "loss": 1.9321, + "step": 40240 + }, + { + "epoch": 0.2529805747058708, + "grad_norm": 7.376534461975098, + "learning_rate": 1.8318022178821994e-05, + "loss": 2.1802, + "step": 40250 + }, + { + "epoch": 0.25304342702256793, + "grad_norm": 7.409773826599121, + "learning_rate": 1.8317603077877338e-05, + "loss": 1.8117, + "step": 40260 + }, + { + "epoch": 0.253106279339265, + "grad_norm": 7.840152740478516, + "learning_rate": 1.8317183976932685e-05, + "loss": 1.9052, + "step": 40270 + }, + { + "epoch": 0.2531691316559621, + "grad_norm": 7.386919975280762, + "learning_rate": 1.8316764875988032e-05, + "loss": 1.9497, + "step": 40280 + }, + { + "epoch": 0.2532319839726592, + "grad_norm": 6.20576286315918, + "learning_rate": 1.831634577504338e-05, + "loss": 1.8981, + "step": 40290 + }, + { + "epoch": 0.25329483628935634, + "grad_norm": 7.299727916717529, + "learning_rate": 1.8315926674098726e-05, + "loss": 1.9856, + "step": 40300 + }, + { + "epoch": 0.25335768860605345, + "grad_norm": 7.2291083335876465, + "learning_rate": 1.8315507573154073e-05, + "loss": 1.743, + "step": 40310 + }, + { + "epoch": 0.25342054092275057, + "grad_norm": 8.496075630187988, + "learning_rate": 1.8315088472209417e-05, + "loss": 1.796, + "step": 40320 + }, + { + "epoch": 0.2534833932394477, + "grad_norm": 6.407141208648682, + "learning_rate": 1.8314669371264764e-05, 
+ "loss": 1.9525, + "step": 40330 + }, + { + "epoch": 0.2535462455561448, + "grad_norm": 7.616662979125977, + "learning_rate": 1.831425027032011e-05, + "loss": 1.9649, + "step": 40340 + }, + { + "epoch": 0.2536090978728419, + "grad_norm": 6.378571510314941, + "learning_rate": 1.8313831169375458e-05, + "loss": 1.9117, + "step": 40350 + }, + { + "epoch": 0.25367195018953903, + "grad_norm": 7.712974548339844, + "learning_rate": 1.8313412068430805e-05, + "loss": 1.7425, + "step": 40360 + }, + { + "epoch": 0.25373480250623615, + "grad_norm": 7.338319301605225, + "learning_rate": 1.831299296748615e-05, + "loss": 1.8324, + "step": 40370 + }, + { + "epoch": 0.25379765482293326, + "grad_norm": 6.323298454284668, + "learning_rate": 1.8312573866541496e-05, + "loss": 1.9462, + "step": 40380 + }, + { + "epoch": 0.2538605071396304, + "grad_norm": 7.694051265716553, + "learning_rate": 1.8312154765596843e-05, + "loss": 1.9082, + "step": 40390 + }, + { + "epoch": 0.25392335945632744, + "grad_norm": 8.786967277526855, + "learning_rate": 1.831173566465219e-05, + "loss": 2.0764, + "step": 40400 + }, + { + "epoch": 0.25398621177302455, + "grad_norm": 7.639488220214844, + "learning_rate": 1.8311316563707534e-05, + "loss": 1.962, + "step": 40410 + }, + { + "epoch": 0.25404906408972167, + "grad_norm": 8.818544387817383, + "learning_rate": 1.831089746276288e-05, + "loss": 1.8749, + "step": 40420 + }, + { + "epoch": 0.2541119164064188, + "grad_norm": 7.728259086608887, + "learning_rate": 1.8310478361818228e-05, + "loss": 1.7827, + "step": 40430 + }, + { + "epoch": 0.2541747687231159, + "grad_norm": 6.672224998474121, + "learning_rate": 1.8310059260873575e-05, + "loss": 1.8389, + "step": 40440 + }, + { + "epoch": 0.254237621039813, + "grad_norm": 6.4283342361450195, + "learning_rate": 1.8309640159928922e-05, + "loss": 1.8576, + "step": 40450 + }, + { + "epoch": 0.2543004733565101, + "grad_norm": 7.002668857574463, + "learning_rate": 1.830922105898427e-05, + "loss": 1.9462, + "step": 40460 + 
}, + { + "epoch": 0.25436332567320724, + "grad_norm": 7.611618995666504, + "learning_rate": 1.8308801958039616e-05, + "loss": 2.0004, + "step": 40470 + }, + { + "epoch": 0.25442617798990436, + "grad_norm": 7.9169745445251465, + "learning_rate": 1.8308382857094963e-05, + "loss": 1.8753, + "step": 40480 + }, + { + "epoch": 0.25448903030660147, + "grad_norm": 7.655786037445068, + "learning_rate": 1.830796375615031e-05, + "loss": 1.929, + "step": 40490 + }, + { + "epoch": 0.2545518826232986, + "grad_norm": 7.357483386993408, + "learning_rate": 1.8307544655205654e-05, + "loss": 1.903, + "step": 40500 + }, + { + "epoch": 0.2546147349399957, + "grad_norm": 9.201723098754883, + "learning_rate": 1.8307125554261e-05, + "loss": 1.9587, + "step": 40510 + }, + { + "epoch": 0.2546775872566928, + "grad_norm": 6.650157451629639, + "learning_rate": 1.8306706453316348e-05, + "loss": 1.7194, + "step": 40520 + }, + { + "epoch": 0.2547404395733899, + "grad_norm": 6.331323623657227, + "learning_rate": 1.8306287352371695e-05, + "loss": 2.1537, + "step": 40530 + }, + { + "epoch": 0.254803291890087, + "grad_norm": 7.6211724281311035, + "learning_rate": 1.830586825142704e-05, + "loss": 1.8748, + "step": 40540 + }, + { + "epoch": 0.2548661442067841, + "grad_norm": 7.957117080688477, + "learning_rate": 1.8305449150482386e-05, + "loss": 1.9157, + "step": 40550 + }, + { + "epoch": 0.2549289965234812, + "grad_norm": 6.765776634216309, + "learning_rate": 1.8305030049537733e-05, + "loss": 1.7877, + "step": 40560 + }, + { + "epoch": 0.25499184884017834, + "grad_norm": 7.246833324432373, + "learning_rate": 1.830461094859308e-05, + "loss": 1.7688, + "step": 40570 + }, + { + "epoch": 0.25505470115687545, + "grad_norm": 6.436697959899902, + "learning_rate": 1.8304191847648427e-05, + "loss": 2.1203, + "step": 40580 + }, + { + "epoch": 0.25511755347357257, + "grad_norm": 7.299831867218018, + "learning_rate": 1.830377274670377e-05, + "loss": 1.8017, + "step": 40590 + }, + { + "epoch": 0.2551804057902697, 
+ "grad_norm": 7.9287004470825195, + "learning_rate": 1.8303353645759118e-05, + "loss": 1.8727, + "step": 40600 + }, + { + "epoch": 0.2552432581069668, + "grad_norm": 6.686092853546143, + "learning_rate": 1.8302934544814465e-05, + "loss": 1.8771, + "step": 40610 + }, + { + "epoch": 0.2553061104236639, + "grad_norm": 7.040564060211182, + "learning_rate": 1.8302515443869812e-05, + "loss": 1.9667, + "step": 40620 + }, + { + "epoch": 0.255368962740361, + "grad_norm": 8.112922668457031, + "learning_rate": 1.8302096342925156e-05, + "loss": 1.8328, + "step": 40630 + }, + { + "epoch": 0.25543181505705814, + "grad_norm": 7.869579792022705, + "learning_rate": 1.8301677241980503e-05, + "loss": 1.7613, + "step": 40640 + }, + { + "epoch": 0.2554946673737552, + "grad_norm": 7.157601356506348, + "learning_rate": 1.830125814103585e-05, + "loss": 1.9874, + "step": 40650 + }, + { + "epoch": 0.2555575196904523, + "grad_norm": 6.303585052490234, + "learning_rate": 1.8300839040091197e-05, + "loss": 1.8133, + "step": 40660 + }, + { + "epoch": 0.25562037200714943, + "grad_norm": 6.151522636413574, + "learning_rate": 1.8300419939146544e-05, + "loss": 1.8912, + "step": 40670 + }, + { + "epoch": 0.25568322432384655, + "grad_norm": 7.436097621917725, + "learning_rate": 1.830000083820189e-05, + "loss": 1.8466, + "step": 40680 + }, + { + "epoch": 0.25574607664054366, + "grad_norm": 7.4817681312561035, + "learning_rate": 1.8299581737257238e-05, + "loss": 2.0094, + "step": 40690 + }, + { + "epoch": 0.2558089289572408, + "grad_norm": 7.078312397003174, + "learning_rate": 1.8299162636312585e-05, + "loss": 2.038, + "step": 40700 + }, + { + "epoch": 0.2558717812739379, + "grad_norm": 6.778133869171143, + "learning_rate": 1.8298743535367932e-05, + "loss": 2.0069, + "step": 40710 + }, + { + "epoch": 0.255934633590635, + "grad_norm": 8.176437377929688, + "learning_rate": 1.8298324434423276e-05, + "loss": 1.9749, + "step": 40720 + }, + { + "epoch": 0.2559974859073321, + "grad_norm": 7.017731189727783, + 
"learning_rate": 1.8297905333478623e-05, + "loss": 1.7378, + "step": 40730 + }, + { + "epoch": 0.25606033822402924, + "grad_norm": 6.80908203125, + "learning_rate": 1.829748623253397e-05, + "loss": 1.9403, + "step": 40740 + }, + { + "epoch": 0.25612319054072635, + "grad_norm": 7.145242214202881, + "learning_rate": 1.8297067131589317e-05, + "loss": 1.7958, + "step": 40750 + }, + { + "epoch": 0.25618604285742347, + "grad_norm": 7.3790693283081055, + "learning_rate": 1.8296648030644664e-05, + "loss": 1.87, + "step": 40760 + }, + { + "epoch": 0.2562488951741206, + "grad_norm": 8.010051727294922, + "learning_rate": 1.8296228929700008e-05, + "loss": 1.6271, + "step": 40770 + }, + { + "epoch": 0.25631174749081764, + "grad_norm": 7.379945278167725, + "learning_rate": 1.8295809828755355e-05, + "loss": 1.8067, + "step": 40780 + }, + { + "epoch": 0.25637459980751476, + "grad_norm": 7.175872802734375, + "learning_rate": 1.8295390727810702e-05, + "loss": 1.8745, + "step": 40790 + }, + { + "epoch": 0.2564374521242119, + "grad_norm": 7.595414638519287, + "learning_rate": 1.829497162686605e-05, + "loss": 1.9023, + "step": 40800 + }, + { + "epoch": 0.256500304440909, + "grad_norm": 7.016720294952393, + "learning_rate": 1.8294552525921393e-05, + "loss": 2.0608, + "step": 40810 + }, + { + "epoch": 0.2565631567576061, + "grad_norm": 6.0823588371276855, + "learning_rate": 1.829413342497674e-05, + "loss": 1.9367, + "step": 40820 + }, + { + "epoch": 0.2566260090743032, + "grad_norm": 6.850696563720703, + "learning_rate": 1.8293714324032087e-05, + "loss": 1.9112, + "step": 40830 + }, + { + "epoch": 0.25668886139100033, + "grad_norm": 8.202967643737793, + "learning_rate": 1.8293295223087434e-05, + "loss": 1.6909, + "step": 40840 + }, + { + "epoch": 0.25675171370769745, + "grad_norm": 6.884469509124756, + "learning_rate": 1.829287612214278e-05, + "loss": 1.7071, + "step": 40850 + }, + { + "epoch": 0.25681456602439456, + "grad_norm": 6.499464511871338, + "learning_rate": 
1.8292457021198128e-05, + "loss": 1.6713, + "step": 40860 + }, + { + "epoch": 0.2568774183410917, + "grad_norm": 6.973784446716309, + "learning_rate": 1.8292037920253475e-05, + "loss": 2.105, + "step": 40870 + }, + { + "epoch": 0.2569402706577888, + "grad_norm": 6.484634876251221, + "learning_rate": 1.829161881930882e-05, + "loss": 1.9186, + "step": 40880 + }, + { + "epoch": 0.2570031229744859, + "grad_norm": 7.263411521911621, + "learning_rate": 1.8291199718364166e-05, + "loss": 1.9716, + "step": 40890 + }, + { + "epoch": 0.257065975291183, + "grad_norm": 6.684298515319824, + "learning_rate": 1.8290780617419513e-05, + "loss": 1.7181, + "step": 40900 + }, + { + "epoch": 0.2571288276078801, + "grad_norm": 6.107182502746582, + "learning_rate": 1.829036151647486e-05, + "loss": 1.7592, + "step": 40910 + }, + { + "epoch": 0.2571916799245772, + "grad_norm": 7.427902698516846, + "learning_rate": 1.8289942415530207e-05, + "loss": 1.9022, + "step": 40920 + }, + { + "epoch": 0.2572545322412743, + "grad_norm": 7.072940826416016, + "learning_rate": 1.8289523314585554e-05, + "loss": 1.7885, + "step": 40930 + }, + { + "epoch": 0.25731738455797143, + "grad_norm": 8.010061264038086, + "learning_rate": 1.8289104213640898e-05, + "loss": 1.7651, + "step": 40940 + }, + { + "epoch": 0.25738023687466854, + "grad_norm": 6.3123393058776855, + "learning_rate": 1.8288685112696245e-05, + "loss": 2.0995, + "step": 40950 + }, + { + "epoch": 0.25744308919136566, + "grad_norm": 6.515552520751953, + "learning_rate": 1.8288266011751592e-05, + "loss": 1.7928, + "step": 40960 + }, + { + "epoch": 0.2575059415080628, + "grad_norm": 7.143669128417969, + "learning_rate": 1.828784691080694e-05, + "loss": 1.8767, + "step": 40970 + }, + { + "epoch": 0.2575687938247599, + "grad_norm": 7.594226360321045, + "learning_rate": 1.8287427809862286e-05, + "loss": 1.8088, + "step": 40980 + }, + { + "epoch": 0.257631646141457, + "grad_norm": 8.058490753173828, + "learning_rate": 1.828700870891763e-05, + "loss": 
2.0683, + "step": 40990 + }, + { + "epoch": 0.2576944984581541, + "grad_norm": 9.095369338989258, + "learning_rate": 1.8286589607972977e-05, + "loss": 1.9984, + "step": 41000 + }, + { + "epoch": 0.25775735077485123, + "grad_norm": 7.421639442443848, + "learning_rate": 1.8286170507028324e-05, + "loss": 1.7457, + "step": 41010 + }, + { + "epoch": 0.25782020309154835, + "grad_norm": 6.662604331970215, + "learning_rate": 1.828575140608367e-05, + "loss": 2.006, + "step": 41020 + }, + { + "epoch": 0.25788305540824547, + "grad_norm": 7.050024509429932, + "learning_rate": 1.8285332305139015e-05, + "loss": 1.8124, + "step": 41030 + }, + { + "epoch": 0.2579459077249425, + "grad_norm": 7.553315162658691, + "learning_rate": 1.8284913204194362e-05, + "loss": 1.8247, + "step": 41040 + }, + { + "epoch": 0.25800876004163964, + "grad_norm": 5.7506561279296875, + "learning_rate": 1.828449410324971e-05, + "loss": 1.8771, + "step": 41050 + }, + { + "epoch": 0.25807161235833675, + "grad_norm": 7.674767971038818, + "learning_rate": 1.8284075002305056e-05, + "loss": 1.7781, + "step": 41060 + }, + { + "epoch": 0.25813446467503387, + "grad_norm": 7.037848472595215, + "learning_rate": 1.8283655901360403e-05, + "loss": 1.7003, + "step": 41070 + }, + { + "epoch": 0.258197316991731, + "grad_norm": 7.2399115562438965, + "learning_rate": 1.828323680041575e-05, + "loss": 1.9313, + "step": 41080 + }, + { + "epoch": 0.2582601693084281, + "grad_norm": 7.564504146575928, + "learning_rate": 1.8282817699471097e-05, + "loss": 2.0094, + "step": 41090 + }, + { + "epoch": 0.2583230216251252, + "grad_norm": 6.998504161834717, + "learning_rate": 1.8282398598526444e-05, + "loss": 1.8335, + "step": 41100 + }, + { + "epoch": 0.25838587394182233, + "grad_norm": 7.1183762550354, + "learning_rate": 1.828197949758179e-05, + "loss": 1.7731, + "step": 41110 + }, + { + "epoch": 0.25844872625851945, + "grad_norm": 7.143240451812744, + "learning_rate": 1.8281560396637135e-05, + "loss": 1.8062, + "step": 41120 + }, + { + 
"epoch": 0.25851157857521656, + "grad_norm": 6.633826732635498, + "learning_rate": 1.8281141295692482e-05, + "loss": 1.9169, + "step": 41130 + }, + { + "epoch": 0.2585744308919137, + "grad_norm": 6.880277156829834, + "learning_rate": 1.828072219474783e-05, + "loss": 1.8099, + "step": 41140 + }, + { + "epoch": 0.2586372832086108, + "grad_norm": 5.943376541137695, + "learning_rate": 1.8280303093803176e-05, + "loss": 1.7188, + "step": 41150 + }, + { + "epoch": 0.25870013552530785, + "grad_norm": 7.167764186859131, + "learning_rate": 1.827988399285852e-05, + "loss": 1.9132, + "step": 41160 + }, + { + "epoch": 0.25876298784200497, + "grad_norm": 7.091798305511475, + "learning_rate": 1.8279464891913867e-05, + "loss": 1.7669, + "step": 41170 + }, + { + "epoch": 0.2588258401587021, + "grad_norm": 7.426420211791992, + "learning_rate": 1.8279045790969214e-05, + "loss": 1.7289, + "step": 41180 + }, + { + "epoch": 0.2588886924753992, + "grad_norm": 7.881027698516846, + "learning_rate": 1.827862669002456e-05, + "loss": 2.0903, + "step": 41190 + }, + { + "epoch": 0.2589515447920963, + "grad_norm": 6.953278064727783, + "learning_rate": 1.827820758907991e-05, + "loss": 1.8588, + "step": 41200 + }, + { + "epoch": 0.2590143971087934, + "grad_norm": 7.793636798858643, + "learning_rate": 1.8277788488135252e-05, + "loss": 2.1033, + "step": 41210 + }, + { + "epoch": 0.25907724942549054, + "grad_norm": 8.12525463104248, + "learning_rate": 1.82773693871906e-05, + "loss": 2.0343, + "step": 41220 + }, + { + "epoch": 0.25914010174218766, + "grad_norm": 6.306594371795654, + "learning_rate": 1.8276950286245946e-05, + "loss": 1.775, + "step": 41230 + }, + { + "epoch": 0.25920295405888477, + "grad_norm": 8.239091873168945, + "learning_rate": 1.8276531185301293e-05, + "loss": 1.9468, + "step": 41240 + }, + { + "epoch": 0.2592658063755819, + "grad_norm": 7.029281139373779, + "learning_rate": 1.827611208435664e-05, + "loss": 1.9031, + "step": 41250 + }, + { + "epoch": 0.259328658692279, + 
"grad_norm": 7.506056785583496, + "learning_rate": 1.8275692983411984e-05, + "loss": 1.9074, + "step": 41260 + }, + { + "epoch": 0.2593915110089761, + "grad_norm": 6.82318639755249, + "learning_rate": 1.827527388246733e-05, + "loss": 1.6273, + "step": 41270 + }, + { + "epoch": 0.25945436332567323, + "grad_norm": 6.963465690612793, + "learning_rate": 1.8274854781522678e-05, + "loss": 1.6154, + "step": 41280 + }, + { + "epoch": 0.2595172156423703, + "grad_norm": 7.320285320281982, + "learning_rate": 1.8274435680578025e-05, + "loss": 1.7584, + "step": 41290 + }, + { + "epoch": 0.2595800679590674, + "grad_norm": 6.893568992614746, + "learning_rate": 1.8274016579633372e-05, + "loss": 1.8717, + "step": 41300 + }, + { + "epoch": 0.2596429202757645, + "grad_norm": 7.00272274017334, + "learning_rate": 1.827359747868872e-05, + "loss": 1.7207, + "step": 41310 + }, + { + "epoch": 0.25970577259246164, + "grad_norm": 6.852663993835449, + "learning_rate": 1.8273178377744066e-05, + "loss": 1.7521, + "step": 41320 + }, + { + "epoch": 0.25976862490915875, + "grad_norm": 6.563899040222168, + "learning_rate": 1.8272759276799413e-05, + "loss": 1.7454, + "step": 41330 + }, + { + "epoch": 0.25983147722585587, + "grad_norm": 7.41424036026001, + "learning_rate": 1.8272340175854757e-05, + "loss": 1.7128, + "step": 41340 + }, + { + "epoch": 0.259894329542553, + "grad_norm": 6.487424373626709, + "learning_rate": 1.8271921074910104e-05, + "loss": 2.0147, + "step": 41350 + }, + { + "epoch": 0.2599571818592501, + "grad_norm": 7.088781356811523, + "learning_rate": 1.827150197396545e-05, + "loss": 1.7212, + "step": 41360 + }, + { + "epoch": 0.2600200341759472, + "grad_norm": 7.368875503540039, + "learning_rate": 1.82710828730208e-05, + "loss": 1.9403, + "step": 41370 + }, + { + "epoch": 0.2600828864926443, + "grad_norm": 8.326485633850098, + "learning_rate": 1.8270663772076145e-05, + "loss": 1.9106, + "step": 41380 + }, + { + "epoch": 0.26014573880934144, + "grad_norm": 9.53166389465332, + 
"learning_rate": 1.827024467113149e-05, + "loss": 1.987, + "step": 41390 + }, + { + "epoch": 0.26020859112603856, + "grad_norm": 7.934520244598389, + "learning_rate": 1.8269825570186836e-05, + "loss": 1.9611, + "step": 41400 + }, + { + "epoch": 0.2602714434427357, + "grad_norm": 7.475955009460449, + "learning_rate": 1.8269406469242183e-05, + "loss": 1.8361, + "step": 41410 + }, + { + "epoch": 0.26033429575943273, + "grad_norm": 8.234063148498535, + "learning_rate": 1.826898736829753e-05, + "loss": 1.7954, + "step": 41420 + }, + { + "epoch": 0.26039714807612985, + "grad_norm": 7.798406600952148, + "learning_rate": 1.8268568267352874e-05, + "loss": 1.8615, + "step": 41430 + }, + { + "epoch": 0.26046000039282696, + "grad_norm": 6.932463645935059, + "learning_rate": 1.826814916640822e-05, + "loss": 1.9989, + "step": 41440 + }, + { + "epoch": 0.2605228527095241, + "grad_norm": 8.069757461547852, + "learning_rate": 1.8267730065463568e-05, + "loss": 1.6508, + "step": 41450 + }, + { + "epoch": 0.2605857050262212, + "grad_norm": 7.715116024017334, + "learning_rate": 1.8267310964518915e-05, + "loss": 1.86, + "step": 41460 + }, + { + "epoch": 0.2606485573429183, + "grad_norm": 7.59549617767334, + "learning_rate": 1.8266891863574262e-05, + "loss": 2.0385, + "step": 41470 + }, + { + "epoch": 0.2607114096596154, + "grad_norm": 6.884585380554199, + "learning_rate": 1.826647276262961e-05, + "loss": 1.7335, + "step": 41480 + }, + { + "epoch": 0.26077426197631254, + "grad_norm": 7.218272686004639, + "learning_rate": 1.8266053661684956e-05, + "loss": 1.9569, + "step": 41490 + }, + { + "epoch": 0.26083711429300965, + "grad_norm": 6.667884826660156, + "learning_rate": 1.8265634560740303e-05, + "loss": 1.7593, + "step": 41500 + }, + { + "epoch": 0.26089996660970677, + "grad_norm": 6.3704304695129395, + "learning_rate": 1.8265215459795647e-05, + "loss": 1.8527, + "step": 41510 + }, + { + "epoch": 0.2609628189264039, + "grad_norm": 7.603665828704834, + "learning_rate": 
1.8264796358850994e-05, + "loss": 1.8057, + "step": 41520 + }, + { + "epoch": 0.261025671243101, + "grad_norm": 7.23233699798584, + "learning_rate": 1.826437725790634e-05, + "loss": 1.669, + "step": 41530 + }, + { + "epoch": 0.2610885235597981, + "grad_norm": 7.816350936889648, + "learning_rate": 1.826395815696169e-05, + "loss": 1.8749, + "step": 41540 + }, + { + "epoch": 0.2611513758764952, + "grad_norm": 7.779391288757324, + "learning_rate": 1.8263539056017035e-05, + "loss": 2.0454, + "step": 41550 + }, + { + "epoch": 0.2612142281931923, + "grad_norm": 6.121278762817383, + "learning_rate": 1.826311995507238e-05, + "loss": 2.085, + "step": 41560 + }, + { + "epoch": 0.2612770805098894, + "grad_norm": 7.626347064971924, + "learning_rate": 1.8262700854127726e-05, + "loss": 1.8736, + "step": 41570 + }, + { + "epoch": 0.2613399328265865, + "grad_norm": 6.436768054962158, + "learning_rate": 1.8262281753183073e-05, + "loss": 1.8548, + "step": 41580 + }, + { + "epoch": 0.26140278514328363, + "grad_norm": 7.84519100189209, + "learning_rate": 1.826186265223842e-05, + "loss": 1.9526, + "step": 41590 + }, + { + "epoch": 0.26146563745998075, + "grad_norm": 7.475573539733887, + "learning_rate": 1.8261443551293767e-05, + "loss": 1.9821, + "step": 41600 + }, + { + "epoch": 0.26152848977667786, + "grad_norm": 6.5553178787231445, + "learning_rate": 1.826102445034911e-05, + "loss": 1.8634, + "step": 41610 + }, + { + "epoch": 0.261591342093375, + "grad_norm": 7.0052008628845215, + "learning_rate": 1.8260605349404458e-05, + "loss": 1.7814, + "step": 41620 + }, + { + "epoch": 0.2616541944100721, + "grad_norm": 6.52140998840332, + "learning_rate": 1.8260186248459805e-05, + "loss": 2.06, + "step": 41630 + }, + { + "epoch": 0.2617170467267692, + "grad_norm": 8.644024848937988, + "learning_rate": 1.8259767147515152e-05, + "loss": 1.9339, + "step": 41640 + }, + { + "epoch": 0.2617798990434663, + "grad_norm": 6.678708076477051, + "learning_rate": 1.8259348046570496e-05, + "loss": 2.0563, + 
"step": 41650 + }, + { + "epoch": 0.26184275136016344, + "grad_norm": 7.915999412536621, + "learning_rate": 1.8258928945625843e-05, + "loss": 1.802, + "step": 41660 + }, + { + "epoch": 0.26190560367686055, + "grad_norm": 6.686285018920898, + "learning_rate": 1.825850984468119e-05, + "loss": 1.8831, + "step": 41670 + }, + { + "epoch": 0.2619684559935576, + "grad_norm": 7.126838684082031, + "learning_rate": 1.8258090743736537e-05, + "loss": 1.789, + "step": 41680 + }, + { + "epoch": 0.26203130831025473, + "grad_norm": 7.053625106811523, + "learning_rate": 1.8257671642791884e-05, + "loss": 1.9077, + "step": 41690 + }, + { + "epoch": 0.26209416062695184, + "grad_norm": 7.124743938446045, + "learning_rate": 1.825725254184723e-05, + "loss": 1.7721, + "step": 41700 + }, + { + "epoch": 0.26215701294364896, + "grad_norm": 7.246686935424805, + "learning_rate": 1.825683344090258e-05, + "loss": 1.9152, + "step": 41710 + }, + { + "epoch": 0.2622198652603461, + "grad_norm": 7.376430511474609, + "learning_rate": 1.8256414339957925e-05, + "loss": 1.7907, + "step": 41720 + }, + { + "epoch": 0.2622827175770432, + "grad_norm": 7.931061267852783, + "learning_rate": 1.8255995239013273e-05, + "loss": 1.7704, + "step": 41730 + }, + { + "epoch": 0.2623455698937403, + "grad_norm": 7.240325450897217, + "learning_rate": 1.8255576138068616e-05, + "loss": 1.7538, + "step": 41740 + }, + { + "epoch": 0.2624084222104374, + "grad_norm": 7.850881099700928, + "learning_rate": 1.8255157037123963e-05, + "loss": 1.9528, + "step": 41750 + }, + { + "epoch": 0.26247127452713453, + "grad_norm": 7.4015069007873535, + "learning_rate": 1.825473793617931e-05, + "loss": 1.7776, + "step": 41760 + }, + { + "epoch": 0.26253412684383165, + "grad_norm": 7.406811714172363, + "learning_rate": 1.8254318835234657e-05, + "loss": 1.7798, + "step": 41770 + }, + { + "epoch": 0.26259697916052877, + "grad_norm": 7.2380781173706055, + "learning_rate": 1.825389973429e-05, + "loss": 1.9065, + "step": 41780 + }, + { + "epoch": 
0.2626598314772259, + "grad_norm": 8.308854103088379, + "learning_rate": 1.8253480633345348e-05, + "loss": 1.9674, + "step": 41790 + }, + { + "epoch": 0.26272268379392294, + "grad_norm": 7.051663875579834, + "learning_rate": 1.8253061532400695e-05, + "loss": 1.8546, + "step": 41800 + }, + { + "epoch": 0.26278553611062005, + "grad_norm": 6.652577877044678, + "learning_rate": 1.8252642431456042e-05, + "loss": 1.8808, + "step": 41810 + }, + { + "epoch": 0.26284838842731717, + "grad_norm": 7.104018688201904, + "learning_rate": 1.825222333051139e-05, + "loss": 1.9225, + "step": 41820 + }, + { + "epoch": 0.2629112407440143, + "grad_norm": 7.6953325271606445, + "learning_rate": 1.8251804229566733e-05, + "loss": 1.7381, + "step": 41830 + }, + { + "epoch": 0.2629740930607114, + "grad_norm": 7.816473007202148, + "learning_rate": 1.825138512862208e-05, + "loss": 1.7347, + "step": 41840 + }, + { + "epoch": 0.2630369453774085, + "grad_norm": 7.108369827270508, + "learning_rate": 1.8250966027677427e-05, + "loss": 1.7597, + "step": 41850 + }, + { + "epoch": 0.26309979769410563, + "grad_norm": 6.802537441253662, + "learning_rate": 1.8250546926732774e-05, + "loss": 1.7106, + "step": 41860 + }, + { + "epoch": 0.26316265001080275, + "grad_norm": 7.982716083526611, + "learning_rate": 1.825012782578812e-05, + "loss": 1.7053, + "step": 41870 + }, + { + "epoch": 0.26322550232749986, + "grad_norm": 6.618117809295654, + "learning_rate": 1.824970872484347e-05, + "loss": 1.9238, + "step": 41880 + }, + { + "epoch": 0.263288354644197, + "grad_norm": 7.67053747177124, + "learning_rate": 1.8249289623898812e-05, + "loss": 1.9405, + "step": 41890 + }, + { + "epoch": 0.2633512069608941, + "grad_norm": 8.179542541503906, + "learning_rate": 1.824887052295416e-05, + "loss": 2.0244, + "step": 41900 + }, + { + "epoch": 0.2634140592775912, + "grad_norm": 7.588283538818359, + "learning_rate": 1.8248451422009506e-05, + "loss": 1.7046, + "step": 41910 + }, + { + "epoch": 0.2634769115942883, + "grad_norm": 
6.082485198974609, + "learning_rate": 1.8248032321064853e-05, + "loss": 1.8868, + "step": 41920 + }, + { + "epoch": 0.2635397639109854, + "grad_norm": 9.015851974487305, + "learning_rate": 1.82476132201202e-05, + "loss": 1.8189, + "step": 41930 + }, + { + "epoch": 0.2636026162276825, + "grad_norm": 7.327020168304443, + "learning_rate": 1.8247194119175547e-05, + "loss": 1.826, + "step": 41940 + }, + { + "epoch": 0.2636654685443796, + "grad_norm": 8.161823272705078, + "learning_rate": 1.8246775018230895e-05, + "loss": 1.8783, + "step": 41950 + }, + { + "epoch": 0.2637283208610767, + "grad_norm": 7.522366046905518, + "learning_rate": 1.8246355917286238e-05, + "loss": 1.9316, + "step": 41960 + }, + { + "epoch": 0.26379117317777384, + "grad_norm": 7.7846856117248535, + "learning_rate": 1.8245936816341585e-05, + "loss": 2.0466, + "step": 41970 + }, + { + "epoch": 0.26385402549447096, + "grad_norm": 7.288768768310547, + "learning_rate": 1.8245517715396932e-05, + "loss": 1.5244, + "step": 41980 + }, + { + "epoch": 0.26391687781116807, + "grad_norm": 7.846412181854248, + "learning_rate": 1.824509861445228e-05, + "loss": 1.8989, + "step": 41990 + }, + { + "epoch": 0.2639797301278652, + "grad_norm": 6.715677738189697, + "learning_rate": 1.8244679513507627e-05, + "loss": 1.9037, + "step": 42000 + }, + { + "epoch": 0.2640425824445623, + "grad_norm": 7.158120155334473, + "learning_rate": 1.824426041256297e-05, + "loss": 1.9783, + "step": 42010 + }, + { + "epoch": 0.2641054347612594, + "grad_norm": 6.922282695770264, + "learning_rate": 1.8243841311618317e-05, + "loss": 1.8341, + "step": 42020 + }, + { + "epoch": 0.26416828707795653, + "grad_norm": 7.328244686126709, + "learning_rate": 1.8243422210673664e-05, + "loss": 2.0118, + "step": 42030 + }, + { + "epoch": 0.26423113939465365, + "grad_norm": 7.900145530700684, + "learning_rate": 1.824300310972901e-05, + "loss": 1.8651, + "step": 42040 + }, + { + "epoch": 0.26429399171135076, + "grad_norm": 9.116615295410156, + 
"learning_rate": 1.8242584008784355e-05, + "loss": 2.0223, + "step": 42050 + }, + { + "epoch": 0.2643568440280478, + "grad_norm": 9.100872039794922, + "learning_rate": 1.8242164907839702e-05, + "loss": 2.0511, + "step": 42060 + }, + { + "epoch": 0.26441969634474494, + "grad_norm": 6.846112251281738, + "learning_rate": 1.824174580689505e-05, + "loss": 1.9461, + "step": 42070 + }, + { + "epoch": 0.26448254866144205, + "grad_norm": 7.331666946411133, + "learning_rate": 1.8241326705950396e-05, + "loss": 1.929, + "step": 42080 + }, + { + "epoch": 0.26454540097813917, + "grad_norm": 7.769443988800049, + "learning_rate": 1.8240907605005743e-05, + "loss": 1.9024, + "step": 42090 + }, + { + "epoch": 0.2646082532948363, + "grad_norm": 9.214117050170898, + "learning_rate": 1.824048850406109e-05, + "loss": 1.8484, + "step": 42100 + }, + { + "epoch": 0.2646711056115334, + "grad_norm": 7.874613285064697, + "learning_rate": 1.8240069403116438e-05, + "loss": 2.0011, + "step": 42110 + }, + { + "epoch": 0.2647339579282305, + "grad_norm": 6.289083003997803, + "learning_rate": 1.8239650302171785e-05, + "loss": 1.7715, + "step": 42120 + }, + { + "epoch": 0.2647968102449276, + "grad_norm": 7.777981758117676, + "learning_rate": 1.8239231201227128e-05, + "loss": 2.0084, + "step": 42130 + }, + { + "epoch": 0.26485966256162474, + "grad_norm": 6.87426233291626, + "learning_rate": 1.8238812100282475e-05, + "loss": 1.6479, + "step": 42140 + }, + { + "epoch": 0.26492251487832186, + "grad_norm": 7.014932632446289, + "learning_rate": 1.8238392999337822e-05, + "loss": 1.784, + "step": 42150 + }, + { + "epoch": 0.264985367195019, + "grad_norm": 7.573215484619141, + "learning_rate": 1.823797389839317e-05, + "loss": 1.8301, + "step": 42160 + }, + { + "epoch": 0.2650482195117161, + "grad_norm": 5.516359806060791, + "learning_rate": 1.8237554797448517e-05, + "loss": 1.9987, + "step": 42170 + }, + { + "epoch": 0.2651110718284132, + "grad_norm": 7.882081508636475, + "learning_rate": 
1.823713569650386e-05, + "loss": 1.9692, + "step": 42180 + }, + { + "epoch": 0.26517392414511026, + "grad_norm": 7.476166248321533, + "learning_rate": 1.8236716595559207e-05, + "loss": 1.8581, + "step": 42190 + }, + { + "epoch": 0.2652367764618074, + "grad_norm": 6.487078666687012, + "learning_rate": 1.8236297494614554e-05, + "loss": 1.7073, + "step": 42200 + }, + { + "epoch": 0.2652996287785045, + "grad_norm": 6.1671857833862305, + "learning_rate": 1.82358783936699e-05, + "loss": 1.8987, + "step": 42210 + }, + { + "epoch": 0.2653624810952016, + "grad_norm": 6.757456302642822, + "learning_rate": 1.823545929272525e-05, + "loss": 1.9064, + "step": 42220 + }, + { + "epoch": 0.2654253334118987, + "grad_norm": 6.9185261726379395, + "learning_rate": 1.8235040191780592e-05, + "loss": 1.8034, + "step": 42230 + }, + { + "epoch": 0.26548818572859584, + "grad_norm": 8.581500053405762, + "learning_rate": 1.823462109083594e-05, + "loss": 1.939, + "step": 42240 + }, + { + "epoch": 0.26555103804529295, + "grad_norm": 7.534511566162109, + "learning_rate": 1.8234201989891286e-05, + "loss": 1.9409, + "step": 42250 + }, + { + "epoch": 0.26561389036199007, + "grad_norm": 6.689976215362549, + "learning_rate": 1.8233782888946633e-05, + "loss": 2.0029, + "step": 42260 + }, + { + "epoch": 0.2656767426786872, + "grad_norm": 6.737766265869141, + "learning_rate": 1.8233363788001977e-05, + "loss": 1.9975, + "step": 42270 + }, + { + "epoch": 0.2657395949953843, + "grad_norm": 7.648566722869873, + "learning_rate": 1.8232944687057324e-05, + "loss": 1.777, + "step": 42280 + }, + { + "epoch": 0.2658024473120814, + "grad_norm": 6.44831657409668, + "learning_rate": 1.823252558611267e-05, + "loss": 1.9846, + "step": 42290 + }, + { + "epoch": 0.26586529962877853, + "grad_norm": 6.997977256774902, + "learning_rate": 1.823210648516802e-05, + "loss": 1.8574, + "step": 42300 + }, + { + "epoch": 0.2659281519454756, + "grad_norm": 7.669799327850342, + "learning_rate": 1.8231687384223365e-05, + "loss": 
1.8257, + "step": 42310 + }, + { + "epoch": 0.2659910042621727, + "grad_norm": 7.207052707672119, + "learning_rate": 1.8231268283278712e-05, + "loss": 1.7204, + "step": 42320 + }, + { + "epoch": 0.2660538565788698, + "grad_norm": 7.405806541442871, + "learning_rate": 1.823084918233406e-05, + "loss": 1.9153, + "step": 42330 + }, + { + "epoch": 0.26611670889556693, + "grad_norm": 7.449708938598633, + "learning_rate": 1.8230430081389407e-05, + "loss": 1.838, + "step": 42340 + }, + { + "epoch": 0.26617956121226405, + "grad_norm": 6.299845218658447, + "learning_rate": 1.8230010980444754e-05, + "loss": 1.7433, + "step": 42350 + }, + { + "epoch": 0.26624241352896116, + "grad_norm": 6.5375871658325195, + "learning_rate": 1.8229591879500097e-05, + "loss": 1.8614, + "step": 42360 + }, + { + "epoch": 0.2663052658456583, + "grad_norm": 7.137462139129639, + "learning_rate": 1.8229172778555444e-05, + "loss": 2.0776, + "step": 42370 + }, + { + "epoch": 0.2663681181623554, + "grad_norm": 7.907176971435547, + "learning_rate": 1.822875367761079e-05, + "loss": 1.9978, + "step": 42380 + }, + { + "epoch": 0.2664309704790525, + "grad_norm": 7.982783794403076, + "learning_rate": 1.822833457666614e-05, + "loss": 1.8599, + "step": 42390 + }, + { + "epoch": 0.2664938227957496, + "grad_norm": 7.208168029785156, + "learning_rate": 1.8227915475721482e-05, + "loss": 1.7397, + "step": 42400 + }, + { + "epoch": 0.26655667511244674, + "grad_norm": 6.838236331939697, + "learning_rate": 1.822749637477683e-05, + "loss": 1.9251, + "step": 42410 + }, + { + "epoch": 0.26661952742914385, + "grad_norm": 7.437549591064453, + "learning_rate": 1.8227077273832176e-05, + "loss": 1.7901, + "step": 42420 + }, + { + "epoch": 0.26668237974584097, + "grad_norm": 6.826678276062012, + "learning_rate": 1.8226658172887523e-05, + "loss": 1.7771, + "step": 42430 + }, + { + "epoch": 0.26674523206253803, + "grad_norm": 6.3887152671813965, + "learning_rate": 1.822623907194287e-05, + "loss": 1.8314, + "step": 42440 + }, + { 
+ "epoch": 0.26680808437923514, + "grad_norm": 7.1717071533203125, + "learning_rate": 1.8225819970998214e-05, + "loss": 1.877, + "step": 42450 + }, + { + "epoch": 0.26687093669593226, + "grad_norm": 6.2922844886779785, + "learning_rate": 1.822540087005356e-05, + "loss": 1.7357, + "step": 42460 + }, + { + "epoch": 0.2669337890126294, + "grad_norm": 6.810061931610107, + "learning_rate": 1.822498176910891e-05, + "loss": 1.7068, + "step": 42470 + }, + { + "epoch": 0.2669966413293265, + "grad_norm": 7.887887954711914, + "learning_rate": 1.8224562668164255e-05, + "loss": 1.9572, + "step": 42480 + }, + { + "epoch": 0.2670594936460236, + "grad_norm": 7.99580717086792, + "learning_rate": 1.8224143567219602e-05, + "loss": 1.7972, + "step": 42490 + }, + { + "epoch": 0.2671223459627207, + "grad_norm": 6.92336893081665, + "learning_rate": 1.822372446627495e-05, + "loss": 1.7053, + "step": 42500 + }, + { + "epoch": 0.26718519827941783, + "grad_norm": 7.77141809463501, + "learning_rate": 1.8223305365330293e-05, + "loss": 1.9202, + "step": 42510 + }, + { + "epoch": 0.26724805059611495, + "grad_norm": 6.241732597351074, + "learning_rate": 1.822288626438564e-05, + "loss": 1.9697, + "step": 42520 + }, + { + "epoch": 0.26731090291281207, + "grad_norm": 7.296006679534912, + "learning_rate": 1.8222467163440987e-05, + "loss": 1.8685, + "step": 42530 + }, + { + "epoch": 0.2673737552295092, + "grad_norm": 16.223621368408203, + "learning_rate": 1.8222048062496334e-05, + "loss": 1.9736, + "step": 42540 + }, + { + "epoch": 0.2674366075462063, + "grad_norm": 6.669566631317139, + "learning_rate": 1.822162896155168e-05, + "loss": 1.821, + "step": 42550 + }, + { + "epoch": 0.2674994598629034, + "grad_norm": 7.176951885223389, + "learning_rate": 1.822120986060703e-05, + "loss": 1.5961, + "step": 42560 + }, + { + "epoch": 0.26756231217960047, + "grad_norm": 6.5811920166015625, + "learning_rate": 1.8220790759662376e-05, + "loss": 1.7338, + "step": 42570 + }, + { + "epoch": 0.2676251644962976, + 
"grad_norm": 7.482040882110596, + "learning_rate": 1.822037165871772e-05, + "loss": 1.877, + "step": 42580 + }, + { + "epoch": 0.2676880168129947, + "grad_norm": 7.4308061599731445, + "learning_rate": 1.8219952557773066e-05, + "loss": 2.0624, + "step": 42590 + }, + { + "epoch": 0.2677508691296918, + "grad_norm": 7.0572004318237305, + "learning_rate": 1.8219533456828413e-05, + "loss": 1.7521, + "step": 42600 + }, + { + "epoch": 0.26781372144638893, + "grad_norm": 8.193439483642578, + "learning_rate": 1.821911435588376e-05, + "loss": 1.7971, + "step": 42610 + }, + { + "epoch": 0.26787657376308605, + "grad_norm": 7.08612585067749, + "learning_rate": 1.8218695254939108e-05, + "loss": 1.8994, + "step": 42620 + }, + { + "epoch": 0.26793942607978316, + "grad_norm": 6.6897969245910645, + "learning_rate": 1.821827615399445e-05, + "loss": 1.8881, + "step": 42630 + }, + { + "epoch": 0.2680022783964803, + "grad_norm": 7.877755165100098, + "learning_rate": 1.82178570530498e-05, + "loss": 1.8548, + "step": 42640 + }, + { + "epoch": 0.2680651307131774, + "grad_norm": 7.418746471405029, + "learning_rate": 1.8217437952105145e-05, + "loss": 1.8281, + "step": 42650 + }, + { + "epoch": 0.2681279830298745, + "grad_norm": 7.265000343322754, + "learning_rate": 1.8217018851160493e-05, + "loss": 1.72, + "step": 42660 + }, + { + "epoch": 0.2681908353465716, + "grad_norm": 7.055147171020508, + "learning_rate": 1.8216599750215836e-05, + "loss": 1.7341, + "step": 42670 + }, + { + "epoch": 0.26825368766326874, + "grad_norm": 6.618005275726318, + "learning_rate": 1.8216180649271183e-05, + "loss": 2.2909, + "step": 42680 + }, + { + "epoch": 0.26831653997996585, + "grad_norm": 6.586921691894531, + "learning_rate": 1.821576154832653e-05, + "loss": 2.0484, + "step": 42690 + }, + { + "epoch": 0.2683793922966629, + "grad_norm": 7.860531806945801, + "learning_rate": 1.8215342447381877e-05, + "loss": 1.8451, + "step": 42700 + }, + { + "epoch": 0.26844224461336, + "grad_norm": 8.915355682373047, + 
"learning_rate": 1.8214923346437224e-05, + "loss": 1.9683, + "step": 42710 + }, + { + "epoch": 0.26850509693005714, + "grad_norm": 6.985775470733643, + "learning_rate": 1.821450424549257e-05, + "loss": 2.1368, + "step": 42720 + }, + { + "epoch": 0.26856794924675426, + "grad_norm": 6.577744007110596, + "learning_rate": 1.821408514454792e-05, + "loss": 1.6015, + "step": 42730 + }, + { + "epoch": 0.26863080156345137, + "grad_norm": 6.80482292175293, + "learning_rate": 1.8213666043603266e-05, + "loss": 1.7962, + "step": 42740 + }, + { + "epoch": 0.2686936538801485, + "grad_norm": 6.925163269042969, + "learning_rate": 1.8213246942658613e-05, + "loss": 1.706, + "step": 42750 + }, + { + "epoch": 0.2687565061968456, + "grad_norm": 7.358352184295654, + "learning_rate": 1.8212827841713956e-05, + "loss": 1.8237, + "step": 42760 + }, + { + "epoch": 0.2688193585135427, + "grad_norm": 8.235050201416016, + "learning_rate": 1.8212408740769304e-05, + "loss": 1.6833, + "step": 42770 + }, + { + "epoch": 0.26888221083023983, + "grad_norm": 7.137479305267334, + "learning_rate": 1.821198963982465e-05, + "loss": 1.6839, + "step": 42780 + }, + { + "epoch": 0.26894506314693695, + "grad_norm": 8.005147933959961, + "learning_rate": 1.8211570538879998e-05, + "loss": 1.768, + "step": 42790 + }, + { + "epoch": 0.26900791546363406, + "grad_norm": 7.089692115783691, + "learning_rate": 1.821119334802981e-05, + "loss": 1.9332, + "step": 42800 + }, + { + "epoch": 0.2690707677803312, + "grad_norm": 6.873706817626953, + "learning_rate": 1.8210774247085156e-05, + "loss": 1.8431, + "step": 42810 + }, + { + "epoch": 0.26913362009702824, + "grad_norm": 9.355104446411133, + "learning_rate": 1.8210355146140503e-05, + "loss": 1.8965, + "step": 42820 + }, + { + "epoch": 0.26919647241372535, + "grad_norm": 6.228792667388916, + "learning_rate": 1.8209936045195847e-05, + "loss": 1.8587, + "step": 42830 + }, + { + "epoch": 0.26925932473042247, + "grad_norm": 7.237106800079346, + "learning_rate": 
1.8209516944251194e-05, + "loss": 1.9352, + "step": 42840 + }, + { + "epoch": 0.2693221770471196, + "grad_norm": 6.2146453857421875, + "learning_rate": 1.820909784330654e-05, + "loss": 1.8494, + "step": 42850 + }, + { + "epoch": 0.2693850293638167, + "grad_norm": 7.7184319496154785, + "learning_rate": 1.8208678742361888e-05, + "loss": 1.8093, + "step": 42860 + }, + { + "epoch": 0.2694478816805138, + "grad_norm": 8.168176651000977, + "learning_rate": 1.8208259641417235e-05, + "loss": 1.8825, + "step": 42870 + }, + { + "epoch": 0.2695107339972109, + "grad_norm": 6.3709797859191895, + "learning_rate": 1.820784054047258e-05, + "loss": 1.7919, + "step": 42880 + }, + { + "epoch": 0.26957358631390804, + "grad_norm": 7.18712043762207, + "learning_rate": 1.8207421439527926e-05, + "loss": 1.7581, + "step": 42890 + }, + { + "epoch": 0.26963643863060516, + "grad_norm": 7.938587188720703, + "learning_rate": 1.8207002338583273e-05, + "loss": 1.7975, + "step": 42900 + }, + { + "epoch": 0.2696992909473023, + "grad_norm": 7.001043319702148, + "learning_rate": 1.820658323763862e-05, + "loss": 1.9066, + "step": 42910 + }, + { + "epoch": 0.2697621432639994, + "grad_norm": 7.061951160430908, + "learning_rate": 1.8206164136693964e-05, + "loss": 1.7411, + "step": 42920 + }, + { + "epoch": 0.2698249955806965, + "grad_norm": 7.237574577331543, + "learning_rate": 1.820574503574931e-05, + "loss": 1.9942, + "step": 42930 + }, + { + "epoch": 0.2698878478973936, + "grad_norm": 7.545398712158203, + "learning_rate": 1.8205325934804658e-05, + "loss": 1.9839, + "step": 42940 + }, + { + "epoch": 0.2699507002140907, + "grad_norm": 5.7841668128967285, + "learning_rate": 1.8204906833860005e-05, + "loss": 1.7837, + "step": 42950 + }, + { + "epoch": 0.2700135525307878, + "grad_norm": 7.160608768463135, + "learning_rate": 1.8204487732915352e-05, + "loss": 1.9018, + "step": 42960 + }, + { + "epoch": 0.2700764048474849, + "grad_norm": 6.914604187011719, + "learning_rate": 1.8204068631970696e-05, + "loss": 
1.8397, + "step": 42970 + }, + { + "epoch": 0.270139257164182, + "grad_norm": 7.4479827880859375, + "learning_rate": 1.8203649531026043e-05, + "loss": 1.8815, + "step": 42980 + }, + { + "epoch": 0.27020210948087914, + "grad_norm": 6.855360984802246, + "learning_rate": 1.820323043008139e-05, + "loss": 1.8275, + "step": 42990 + }, + { + "epoch": 0.27026496179757625, + "grad_norm": 7.653349876403809, + "learning_rate": 1.8202811329136737e-05, + "loss": 1.9389, + "step": 43000 + }, + { + "epoch": 0.27032781411427337, + "grad_norm": 7.4689788818359375, + "learning_rate": 1.8202392228192084e-05, + "loss": 1.7573, + "step": 43010 + }, + { + "epoch": 0.2703906664309705, + "grad_norm": 9.028587341308594, + "learning_rate": 1.820197312724743e-05, + "loss": 1.8279, + "step": 43020 + }, + { + "epoch": 0.2704535187476676, + "grad_norm": 6.932178020477295, + "learning_rate": 1.8201554026302778e-05, + "loss": 1.8318, + "step": 43030 + }, + { + "epoch": 0.2705163710643647, + "grad_norm": 6.070540428161621, + "learning_rate": 1.8201134925358125e-05, + "loss": 1.7882, + "step": 43040 + }, + { + "epoch": 0.27057922338106183, + "grad_norm": 7.225831985473633, + "learning_rate": 1.8200715824413472e-05, + "loss": 1.839, + "step": 43050 + }, + { + "epoch": 0.27064207569775894, + "grad_norm": 7.78971004486084, + "learning_rate": 1.8200296723468816e-05, + "loss": 1.7446, + "step": 43060 + }, + { + "epoch": 0.27070492801445606, + "grad_norm": 7.131658554077148, + "learning_rate": 1.8199877622524163e-05, + "loss": 2.0806, + "step": 43070 + }, + { + "epoch": 0.2707677803311531, + "grad_norm": 7.393949508666992, + "learning_rate": 1.819945852157951e-05, + "loss": 1.8194, + "step": 43080 + }, + { + "epoch": 0.27083063264785023, + "grad_norm": 6.4248175621032715, + "learning_rate": 1.8199039420634857e-05, + "loss": 1.7011, + "step": 43090 + }, + { + "epoch": 0.27089348496454735, + "grad_norm": 7.033029079437256, + "learning_rate": 1.81986203196902e-05, + "loss": 1.9835, + "step": 43100 + }, + { 
+ "epoch": 0.27095633728124446, + "grad_norm": 7.071962833404541, + "learning_rate": 1.8198201218745548e-05, + "loss": 1.7778, + "step": 43110 + }, + { + "epoch": 0.2710191895979416, + "grad_norm": 6.914316177368164, + "learning_rate": 1.8197782117800895e-05, + "loss": 1.7871, + "step": 43120 + }, + { + "epoch": 0.2710820419146387, + "grad_norm": 7.62619686126709, + "learning_rate": 1.8197363016856242e-05, + "loss": 1.7542, + "step": 43130 + }, + { + "epoch": 0.2711448942313358, + "grad_norm": 7.535961151123047, + "learning_rate": 1.819694391591159e-05, + "loss": 1.9424, + "step": 43140 + }, + { + "epoch": 0.2712077465480329, + "grad_norm": 7.0686845779418945, + "learning_rate": 1.8196524814966933e-05, + "loss": 2.1509, + "step": 43150 + }, + { + "epoch": 0.27127059886473004, + "grad_norm": 5.649727821350098, + "learning_rate": 1.819610571402228e-05, + "loss": 1.9517, + "step": 43160 + }, + { + "epoch": 0.27133345118142715, + "grad_norm": 7.16280460357666, + "learning_rate": 1.8195686613077627e-05, + "loss": 1.8772, + "step": 43170 + }, + { + "epoch": 0.27139630349812427, + "grad_norm": 7.139915466308594, + "learning_rate": 1.8195267512132974e-05, + "loss": 2.0204, + "step": 43180 + }, + { + "epoch": 0.2714591558148214, + "grad_norm": 6.4540863037109375, + "learning_rate": 1.819484841118832e-05, + "loss": 1.6998, + "step": 43190 + }, + { + "epoch": 0.2715220081315185, + "grad_norm": 7.365581512451172, + "learning_rate": 1.8194429310243665e-05, + "loss": 2.0298, + "step": 43200 + }, + { + "epoch": 0.27158486044821556, + "grad_norm": 7.0859246253967285, + "learning_rate": 1.8194010209299012e-05, + "loss": 1.9111, + "step": 43210 + }, + { + "epoch": 0.2716477127649127, + "grad_norm": 6.624123573303223, + "learning_rate": 1.819359110835436e-05, + "loss": 2.1248, + "step": 43220 + }, + { + "epoch": 0.2717105650816098, + "grad_norm": 8.226758003234863, + "learning_rate": 1.8193172007409706e-05, + "loss": 1.6659, + "step": 43230 + }, + { + "epoch": 0.2717734173983069, + 
"grad_norm": 6.948604106903076, + "learning_rate": 1.8192752906465053e-05, + "loss": 1.9483, + "step": 43240 + }, + { + "epoch": 0.271836269715004, + "grad_norm": 6.023930072784424, + "learning_rate": 1.81923338055204e-05, + "loss": 1.7512, + "step": 43250 + }, + { + "epoch": 0.27189912203170113, + "grad_norm": 7.766272068023682, + "learning_rate": 1.8191914704575747e-05, + "loss": 1.9309, + "step": 43260 + }, + { + "epoch": 0.27196197434839825, + "grad_norm": 6.777820110321045, + "learning_rate": 1.8191495603631094e-05, + "loss": 1.931, + "step": 43270 + }, + { + "epoch": 0.27202482666509537, + "grad_norm": 8.068720817565918, + "learning_rate": 1.8191076502686438e-05, + "loss": 2.0577, + "step": 43280 + }, + { + "epoch": 0.2720876789817925, + "grad_norm": 7.683823108673096, + "learning_rate": 1.8190657401741785e-05, + "loss": 1.9969, + "step": 43290 + }, + { + "epoch": 0.2721505312984896, + "grad_norm": 6.8867363929748535, + "learning_rate": 1.8190238300797132e-05, + "loss": 1.9588, + "step": 43300 + }, + { + "epoch": 0.2722133836151867, + "grad_norm": 7.831424236297607, + "learning_rate": 1.818981919985248e-05, + "loss": 1.7287, + "step": 43310 + }, + { + "epoch": 0.2722762359318838, + "grad_norm": 6.731321334838867, + "learning_rate": 1.8189400098907823e-05, + "loss": 2.1138, + "step": 43320 + }, + { + "epoch": 0.2723390882485809, + "grad_norm": 7.519513130187988, + "learning_rate": 1.818898099796317e-05, + "loss": 2.0652, + "step": 43330 + }, + { + "epoch": 0.272401940565278, + "grad_norm": 7.941908359527588, + "learning_rate": 1.8188561897018517e-05, + "loss": 1.9205, + "step": 43340 + }, + { + "epoch": 0.2724647928819751, + "grad_norm": 7.573431491851807, + "learning_rate": 1.8188142796073864e-05, + "loss": 1.8556, + "step": 43350 + }, + { + "epoch": 0.27252764519867223, + "grad_norm": 8.56758975982666, + "learning_rate": 1.818772369512921e-05, + "loss": 1.9231, + "step": 43360 + }, + { + "epoch": 0.27259049751536935, + "grad_norm": 6.786647796630859, + 
"learning_rate": 1.8187304594184555e-05, + "loss": 1.8466, + "step": 43370 + }, + { + "epoch": 0.27265334983206646, + "grad_norm": 7.454209804534912, + "learning_rate": 1.8186885493239902e-05, + "loss": 1.8693, + "step": 43380 + }, + { + "epoch": 0.2727162021487636, + "grad_norm": 8.307778358459473, + "learning_rate": 1.818646639229525e-05, + "loss": 1.6996, + "step": 43390 + }, + { + "epoch": 0.2727790544654607, + "grad_norm": 7.2898712158203125, + "learning_rate": 1.8186047291350596e-05, + "loss": 1.7893, + "step": 43400 + }, + { + "epoch": 0.2728419067821578, + "grad_norm": 7.7085981369018555, + "learning_rate": 1.8185628190405943e-05, + "loss": 2.0989, + "step": 43410 + }, + { + "epoch": 0.2729047590988549, + "grad_norm": 7.155144691467285, + "learning_rate": 1.818520908946129e-05, + "loss": 1.7218, + "step": 43420 + }, + { + "epoch": 0.27296761141555204, + "grad_norm": 6.132807731628418, + "learning_rate": 1.8184789988516637e-05, + "loss": 1.8792, + "step": 43430 + }, + { + "epoch": 0.27303046373224915, + "grad_norm": 7.1564836502075195, + "learning_rate": 1.8184370887571984e-05, + "loss": 2.0505, + "step": 43440 + }, + { + "epoch": 0.27309331604894627, + "grad_norm": 5.861886978149414, + "learning_rate": 1.8183951786627328e-05, + "loss": 1.8952, + "step": 43450 + }, + { + "epoch": 0.2731561683656433, + "grad_norm": 7.8853278160095215, + "learning_rate": 1.8183532685682675e-05, + "loss": 1.7529, + "step": 43460 + }, + { + "epoch": 0.27321902068234044, + "grad_norm": 7.434305191040039, + "learning_rate": 1.8183113584738022e-05, + "loss": 1.7616, + "step": 43470 + }, + { + "epoch": 0.27328187299903756, + "grad_norm": 7.336535453796387, + "learning_rate": 1.818269448379337e-05, + "loss": 2.0926, + "step": 43480 + }, + { + "epoch": 0.27334472531573467, + "grad_norm": 7.187647342681885, + "learning_rate": 1.8182275382848716e-05, + "loss": 1.8862, + "step": 43490 + }, + { + "epoch": 0.2734075776324318, + "grad_norm": 7.7725934982299805, + "learning_rate": 
1.818185628190406e-05, + "loss": 1.8336, + "step": 43500 + }, + { + "epoch": 0.2734704299491289, + "grad_norm": 7.245877265930176, + "learning_rate": 1.8181437180959407e-05, + "loss": 1.9086, + "step": 43510 + }, + { + "epoch": 0.273533282265826, + "grad_norm": 5.464651107788086, + "learning_rate": 1.8181018080014754e-05, + "loss": 1.8325, + "step": 43520 + }, + { + "epoch": 0.27359613458252313, + "grad_norm": 6.858070373535156, + "learning_rate": 1.81805989790701e-05, + "loss": 1.9098, + "step": 43530 + }, + { + "epoch": 0.27365898689922025, + "grad_norm": 8.30384349822998, + "learning_rate": 1.8180179878125445e-05, + "loss": 1.8923, + "step": 43540 + }, + { + "epoch": 0.27372183921591736, + "grad_norm": 8.637042045593262, + "learning_rate": 1.8179760777180792e-05, + "loss": 1.9143, + "step": 43550 + }, + { + "epoch": 0.2737846915326145, + "grad_norm": 7.063789367675781, + "learning_rate": 1.817934167623614e-05, + "loss": 1.7999, + "step": 43560 + }, + { + "epoch": 0.2738475438493116, + "grad_norm": 6.5637030601501465, + "learning_rate": 1.8178922575291486e-05, + "loss": 1.9424, + "step": 43570 + }, + { + "epoch": 0.2739103961660087, + "grad_norm": 6.852513790130615, + "learning_rate": 1.8178503474346833e-05, + "loss": 1.9062, + "step": 43580 + }, + { + "epoch": 0.27397324848270577, + "grad_norm": 7.877373695373535, + "learning_rate": 1.8178084373402177e-05, + "loss": 1.8612, + "step": 43590 + }, + { + "epoch": 0.2740361007994029, + "grad_norm": 7.566814422607422, + "learning_rate": 1.8177665272457524e-05, + "loss": 1.9674, + "step": 43600 + }, + { + "epoch": 0.2740989531161, + "grad_norm": 6.332150936126709, + "learning_rate": 1.817724617151287e-05, + "loss": 1.6876, + "step": 43610 + }, + { + "epoch": 0.2741618054327971, + "grad_norm": 6.841239929199219, + "learning_rate": 1.8176827070568218e-05, + "loss": 2.0219, + "step": 43620 + }, + { + "epoch": 0.2742246577494942, + "grad_norm": 6.8319292068481445, + "learning_rate": 1.8176407969623565e-05, + "loss": 
1.7973, + "step": 43630 + }, + { + "epoch": 0.27428751006619134, + "grad_norm": 7.159534931182861, + "learning_rate": 1.8175988868678912e-05, + "loss": 1.77, + "step": 43640 + }, + { + "epoch": 0.27435036238288846, + "grad_norm": 6.824354648590088, + "learning_rate": 1.817556976773426e-05, + "loss": 1.7697, + "step": 43650 + }, + { + "epoch": 0.2744132146995856, + "grad_norm": 6.295945167541504, + "learning_rate": 1.8175150666789606e-05, + "loss": 1.8574, + "step": 43660 + }, + { + "epoch": 0.2744760670162827, + "grad_norm": 7.479640483856201, + "learning_rate": 1.8174731565844953e-05, + "loss": 2.059, + "step": 43670 + }, + { + "epoch": 0.2745389193329798, + "grad_norm": 6.682166576385498, + "learning_rate": 1.8174312464900297e-05, + "loss": 1.8981, + "step": 43680 + }, + { + "epoch": 0.2746017716496769, + "grad_norm": 7.518378734588623, + "learning_rate": 1.8173893363955644e-05, + "loss": 1.8331, + "step": 43690 + }, + { + "epoch": 0.27466462396637403, + "grad_norm": 6.496458053588867, + "learning_rate": 1.817347426301099e-05, + "loss": 2.0259, + "step": 43700 + }, + { + "epoch": 0.27472747628307115, + "grad_norm": 7.358792304992676, + "learning_rate": 1.8173055162066338e-05, + "loss": 1.9976, + "step": 43710 + }, + { + "epoch": 0.2747903285997682, + "grad_norm": 6.544776916503906, + "learning_rate": 1.8172636061121682e-05, + "loss": 1.7799, + "step": 43720 + }, + { + "epoch": 0.2748531809164653, + "grad_norm": 7.420206069946289, + "learning_rate": 1.817221696017703e-05, + "loss": 1.9171, + "step": 43730 + }, + { + "epoch": 0.27491603323316244, + "grad_norm": 6.45020866394043, + "learning_rate": 1.8171797859232376e-05, + "loss": 1.9072, + "step": 43740 + }, + { + "epoch": 0.27497888554985955, + "grad_norm": 6.753374099731445, + "learning_rate": 1.8171378758287723e-05, + "loss": 1.9285, + "step": 43750 + }, + { + "epoch": 0.27504173786655667, + "grad_norm": 8.145105361938477, + "learning_rate": 1.817095965734307e-05, + "loss": 1.8416, + "step": 43760 + }, + { + 
"epoch": 0.2751045901832538, + "grad_norm": 5.977471351623535, + "learning_rate": 1.8170540556398414e-05, + "loss": 1.8596, + "step": 43770 + }, + { + "epoch": 0.2751674424999509, + "grad_norm": 7.059874057769775, + "learning_rate": 1.817012145545376e-05, + "loss": 2.0057, + "step": 43780 + }, + { + "epoch": 0.275230294816648, + "grad_norm": 7.4796013832092285, + "learning_rate": 1.8169702354509108e-05, + "loss": 1.8092, + "step": 43790 + }, + { + "epoch": 0.27529314713334513, + "grad_norm": 6.353641033172607, + "learning_rate": 1.8169283253564455e-05, + "loss": 1.8401, + "step": 43800 + }, + { + "epoch": 0.27535599945004224, + "grad_norm": 6.131389617919922, + "learning_rate": 1.8168864152619802e-05, + "loss": 1.8887, + "step": 43810 + }, + { + "epoch": 0.27541885176673936, + "grad_norm": 6.0684943199157715, + "learning_rate": 1.816844505167515e-05, + "loss": 1.8536, + "step": 43820 + }, + { + "epoch": 0.2754817040834365, + "grad_norm": 6.397604465484619, + "learning_rate": 1.8168025950730493e-05, + "loss": 1.8396, + "step": 43830 + }, + { + "epoch": 0.27554455640013353, + "grad_norm": 7.1050262451171875, + "learning_rate": 1.816760684978584e-05, + "loss": 1.7696, + "step": 43840 + }, + { + "epoch": 0.27560740871683065, + "grad_norm": 8.189983367919922, + "learning_rate": 1.8167187748841187e-05, + "loss": 1.7782, + "step": 43850 + }, + { + "epoch": 0.27567026103352776, + "grad_norm": 6.490654468536377, + "learning_rate": 1.8166768647896534e-05, + "loss": 1.8053, + "step": 43860 + }, + { + "epoch": 0.2757331133502249, + "grad_norm": 6.4062113761901855, + "learning_rate": 1.816634954695188e-05, + "loss": 1.5862, + "step": 43870 + }, + { + "epoch": 0.275795965666922, + "grad_norm": 6.748137950897217, + "learning_rate": 1.8165930446007228e-05, + "loss": 2.0288, + "step": 43880 + }, + { + "epoch": 0.2758588179836191, + "grad_norm": 7.861601829528809, + "learning_rate": 1.8165511345062575e-05, + "loss": 1.8009, + "step": 43890 + }, + { + "epoch": 0.2759216703003162, + 
"grad_norm": 7.185576438903809, + "learning_rate": 1.816509224411792e-05, + "loss": 1.8527, + "step": 43900 + }, + { + "epoch": 0.27598452261701334, + "grad_norm": 6.2016754150390625, + "learning_rate": 1.8164673143173266e-05, + "loss": 1.7815, + "step": 43910 + }, + { + "epoch": 0.27604737493371045, + "grad_norm": 7.197715759277344, + "learning_rate": 1.8164254042228613e-05, + "loss": 1.9198, + "step": 43920 + }, + { + "epoch": 0.27611022725040757, + "grad_norm": 6.321927547454834, + "learning_rate": 1.816383494128396e-05, + "loss": 1.6873, + "step": 43930 + }, + { + "epoch": 0.2761730795671047, + "grad_norm": 8.296004295349121, + "learning_rate": 1.8163415840339304e-05, + "loss": 1.8916, + "step": 43940 + }, + { + "epoch": 0.2762359318838018, + "grad_norm": 8.541367530822754, + "learning_rate": 1.816299673939465e-05, + "loss": 2.0147, + "step": 43950 + }, + { + "epoch": 0.2762987842004989, + "grad_norm": 7.3273420333862305, + "learning_rate": 1.8162577638449998e-05, + "loss": 2.008, + "step": 43960 + }, + { + "epoch": 0.276361636517196, + "grad_norm": 7.654440879821777, + "learning_rate": 1.8162158537505345e-05, + "loss": 1.7337, + "step": 43970 + }, + { + "epoch": 0.2764244888338931, + "grad_norm": 7.776981353759766, + "learning_rate": 1.8161739436560692e-05, + "loss": 1.7561, + "step": 43980 + }, + { + "epoch": 0.2764873411505902, + "grad_norm": 7.037495136260986, + "learning_rate": 1.8161320335616036e-05, + "loss": 1.8316, + "step": 43990 + }, + { + "epoch": 0.2765501934672873, + "grad_norm": 7.595053672790527, + "learning_rate": 1.8160901234671383e-05, + "loss": 2.1776, + "step": 44000 + }, + { + "epoch": 0.27661304578398443, + "grad_norm": 6.931766033172607, + "learning_rate": 1.816048213372673e-05, + "loss": 2.0454, + "step": 44010 + }, + { + "epoch": 0.27667589810068155, + "grad_norm": 7.494742393493652, + "learning_rate": 1.8160063032782077e-05, + "loss": 1.9182, + "step": 44020 + }, + { + "epoch": 0.27673875041737866, + "grad_norm": 8.088569641113281, + 
"learning_rate": 1.8159643931837424e-05, + "loss": 1.8282, + "step": 44030 + }, + { + "epoch": 0.2768016027340758, + "grad_norm": 6.901574611663818, + "learning_rate": 1.815922483089277e-05, + "loss": 1.7486, + "step": 44040 + }, + { + "epoch": 0.2768644550507729, + "grad_norm": 6.851449489593506, + "learning_rate": 1.815880572994812e-05, + "loss": 1.8935, + "step": 44050 + }, + { + "epoch": 0.27692730736747, + "grad_norm": 8.495573997497559, + "learning_rate": 1.8158386629003465e-05, + "loss": 1.7771, + "step": 44060 + }, + { + "epoch": 0.2769901596841671, + "grad_norm": 7.091639995574951, + "learning_rate": 1.8157967528058812e-05, + "loss": 1.83, + "step": 44070 + }, + { + "epoch": 0.27705301200086424, + "grad_norm": 6.349162578582764, + "learning_rate": 1.8157548427114156e-05, + "loss": 1.8799, + "step": 44080 + }, + { + "epoch": 0.27711586431756136, + "grad_norm": 7.483797073364258, + "learning_rate": 1.8157129326169503e-05, + "loss": 1.8951, + "step": 44090 + }, + { + "epoch": 0.2771787166342584, + "grad_norm": 6.60048770904541, + "learning_rate": 1.815671022522485e-05, + "loss": 1.6704, + "step": 44100 + }, + { + "epoch": 0.27724156895095553, + "grad_norm": 6.710370063781738, + "learning_rate": 1.8156291124280197e-05, + "loss": 1.8409, + "step": 44110 + }, + { + "epoch": 0.27730442126765265, + "grad_norm": 7.331045150756836, + "learning_rate": 1.815587202333554e-05, + "loss": 1.5637, + "step": 44120 + }, + { + "epoch": 0.27736727358434976, + "grad_norm": 6.952859401702881, + "learning_rate": 1.8155452922390888e-05, + "loss": 1.7967, + "step": 44130 + }, + { + "epoch": 0.2774301259010469, + "grad_norm": 6.21740198135376, + "learning_rate": 1.8155033821446235e-05, + "loss": 1.8562, + "step": 44140 + }, + { + "epoch": 0.277492978217744, + "grad_norm": 6.948338508605957, + "learning_rate": 1.8154614720501582e-05, + "loss": 2.0678, + "step": 44150 + }, + { + "epoch": 0.2775558305344411, + "grad_norm": 7.098877906799316, + "learning_rate": 1.8154195619556926e-05, + 
"loss": 1.8754, + "step": 44160 + }, + { + "epoch": 0.2776186828511382, + "grad_norm": 7.387324333190918, + "learning_rate": 1.8153776518612273e-05, + "loss": 1.9309, + "step": 44170 + }, + { + "epoch": 0.27768153516783534, + "grad_norm": 6.8969407081604, + "learning_rate": 1.815335741766762e-05, + "loss": 2.0453, + "step": 44180 + }, + { + "epoch": 0.27774438748453245, + "grad_norm": 7.333196640014648, + "learning_rate": 1.8152938316722967e-05, + "loss": 2.0103, + "step": 44190 + }, + { + "epoch": 0.27780723980122957, + "grad_norm": 7.968249797821045, + "learning_rate": 1.8152519215778314e-05, + "loss": 1.9121, + "step": 44200 + }, + { + "epoch": 0.2778700921179267, + "grad_norm": 6.779103755950928, + "learning_rate": 1.8152100114833658e-05, + "loss": 1.8259, + "step": 44210 + }, + { + "epoch": 0.2779329444346238, + "grad_norm": 6.785097599029541, + "learning_rate": 1.8151681013889005e-05, + "loss": 1.7175, + "step": 44220 + }, + { + "epoch": 0.27799579675132086, + "grad_norm": 6.160770416259766, + "learning_rate": 1.8151261912944352e-05, + "loss": 2.0265, + "step": 44230 + }, + { + "epoch": 0.27805864906801797, + "grad_norm": 7.9139204025268555, + "learning_rate": 1.81508428119997e-05, + "loss": 1.747, + "step": 44240 + }, + { + "epoch": 0.2781215013847151, + "grad_norm": 6.520354270935059, + "learning_rate": 1.8150423711055046e-05, + "loss": 1.7669, + "step": 44250 + }, + { + "epoch": 0.2781843537014122, + "grad_norm": 6.231197357177734, + "learning_rate": 1.8150004610110393e-05, + "loss": 1.8292, + "step": 44260 + }, + { + "epoch": 0.2782472060181093, + "grad_norm": 7.701934337615967, + "learning_rate": 1.814958550916574e-05, + "loss": 1.8001, + "step": 44270 + }, + { + "epoch": 0.27831005833480643, + "grad_norm": 6.897225379943848, + "learning_rate": 1.8149166408221087e-05, + "loss": 1.5311, + "step": 44280 + }, + { + "epoch": 0.27837291065150355, + "grad_norm": 7.523116111755371, + "learning_rate": 1.8148747307276434e-05, + "loss": 1.895, + "step": 44290 + }, 
+ { + "epoch": 0.27843576296820066, + "grad_norm": 7.386162757873535, + "learning_rate": 1.8148328206331778e-05, + "loss": 1.9553, + "step": 44300 + }, + { + "epoch": 0.2784986152848978, + "grad_norm": 8.141165733337402, + "learning_rate": 1.8147909105387125e-05, + "loss": 1.6888, + "step": 44310 + }, + { + "epoch": 0.2785614676015949, + "grad_norm": 7.3685455322265625, + "learning_rate": 1.8147490004442472e-05, + "loss": 1.7884, + "step": 44320 + }, + { + "epoch": 0.278624319918292, + "grad_norm": 7.618593692779541, + "learning_rate": 1.814707090349782e-05, + "loss": 1.7598, + "step": 44330 + }, + { + "epoch": 0.2786871722349891, + "grad_norm": 7.6132330894470215, + "learning_rate": 1.8146651802553163e-05, + "loss": 1.5731, + "step": 44340 + }, + { + "epoch": 0.27875002455168624, + "grad_norm": 6.275602340698242, + "learning_rate": 1.814623270160851e-05, + "loss": 1.9637, + "step": 44350 + }, + { + "epoch": 0.2788128768683833, + "grad_norm": 6.493110656738281, + "learning_rate": 1.8145813600663857e-05, + "loss": 2.0089, + "step": 44360 + }, + { + "epoch": 0.2788757291850804, + "grad_norm": 7.190394401550293, + "learning_rate": 1.8145394499719204e-05, + "loss": 1.7452, + "step": 44370 + }, + { + "epoch": 0.2789385815017775, + "grad_norm": 8.51392650604248, + "learning_rate": 1.814497539877455e-05, + "loss": 1.906, + "step": 44380 + }, + { + "epoch": 0.27900143381847464, + "grad_norm": 7.147787570953369, + "learning_rate": 1.8144556297829895e-05, + "loss": 1.9689, + "step": 44390 + }, + { + "epoch": 0.27906428613517176, + "grad_norm": 7.204176425933838, + "learning_rate": 1.8144137196885242e-05, + "loss": 1.9244, + "step": 44400 + }, + { + "epoch": 0.2791271384518689, + "grad_norm": 7.5683674812316895, + "learning_rate": 1.814371809594059e-05, + "loss": 1.8164, + "step": 44410 + }, + { + "epoch": 0.279189990768566, + "grad_norm": 7.33584451675415, + "learning_rate": 1.8143298994995936e-05, + "loss": 1.6963, + "step": 44420 + }, + { + "epoch": 0.2792528430852631, + 
"grad_norm": 7.349540710449219, + "learning_rate": 1.8142879894051283e-05, + "loss": 1.7436, + "step": 44430 + }, + { + "epoch": 0.2793156954019602, + "grad_norm": 9.453506469726562, + "learning_rate": 1.814246079310663e-05, + "loss": 1.8077, + "step": 44440 + }, + { + "epoch": 0.27937854771865733, + "grad_norm": 9.688944816589355, + "learning_rate": 1.8142041692161977e-05, + "loss": 1.7876, + "step": 44450 + }, + { + "epoch": 0.27944140003535445, + "grad_norm": 6.370456218719482, + "learning_rate": 1.814162259121732e-05, + "loss": 1.8971, + "step": 44460 + }, + { + "epoch": 0.27950425235205156, + "grad_norm": 6.250444412231445, + "learning_rate": 1.8141203490272668e-05, + "loss": 1.6054, + "step": 44470 + }, + { + "epoch": 0.2795671046687486, + "grad_norm": 7.204066753387451, + "learning_rate": 1.8140784389328015e-05, + "loss": 1.797, + "step": 44480 + }, + { + "epoch": 0.27962995698544574, + "grad_norm": 7.6014485359191895, + "learning_rate": 1.8140365288383362e-05, + "loss": 1.8645, + "step": 44490 + }, + { + "epoch": 0.27969280930214285, + "grad_norm": 6.935083389282227, + "learning_rate": 1.813994618743871e-05, + "loss": 1.821, + "step": 44500 + }, + { + "epoch": 0.27975566161883997, + "grad_norm": 7.669597625732422, + "learning_rate": 1.8139527086494056e-05, + "loss": 2.1411, + "step": 44510 + }, + { + "epoch": 0.2798185139355371, + "grad_norm": 7.0571441650390625, + "learning_rate": 1.81391079855494e-05, + "loss": 1.8528, + "step": 44520 + }, + { + "epoch": 0.2798813662522342, + "grad_norm": 7.463881969451904, + "learning_rate": 1.8138688884604747e-05, + "loss": 1.9098, + "step": 44530 + }, + { + "epoch": 0.2799442185689313, + "grad_norm": 6.437138557434082, + "learning_rate": 1.8138269783660094e-05, + "loss": 1.8339, + "step": 44540 + }, + { + "epoch": 0.28000707088562843, + "grad_norm": 8.589752197265625, + "learning_rate": 1.813785068271544e-05, + "loss": 1.7393, + "step": 44550 + }, + { + "epoch": 0.28006992320232554, + "grad_norm": 7.563274383544922, + 
"learning_rate": 1.8137431581770785e-05, + "loss": 1.9598, + "step": 44560 + }, + { + "epoch": 0.28013277551902266, + "grad_norm": 7.278768062591553, + "learning_rate": 1.8137012480826132e-05, + "loss": 1.9527, + "step": 44570 + }, + { + "epoch": 0.2801956278357198, + "grad_norm": 7.549731254577637, + "learning_rate": 1.813659337988148e-05, + "loss": 1.8646, + "step": 44580 + }, + { + "epoch": 0.2802584801524169, + "grad_norm": 7.279454231262207, + "learning_rate": 1.8136174278936826e-05, + "loss": 1.7997, + "step": 44590 + }, + { + "epoch": 0.280321332469114, + "grad_norm": 6.979907035827637, + "learning_rate": 1.8135755177992173e-05, + "loss": 1.8811, + "step": 44600 + }, + { + "epoch": 0.28038418478581106, + "grad_norm": 6.6481852531433105, + "learning_rate": 1.8135336077047517e-05, + "loss": 1.9376, + "step": 44610 + }, + { + "epoch": 0.2804470371025082, + "grad_norm": 7.196473598480225, + "learning_rate": 1.8134916976102864e-05, + "loss": 1.7155, + "step": 44620 + }, + { + "epoch": 0.2805098894192053, + "grad_norm": 6.652624607086182, + "learning_rate": 1.813449787515821e-05, + "loss": 1.8019, + "step": 44630 + }, + { + "epoch": 0.2805727417359024, + "grad_norm": 7.184447765350342, + "learning_rate": 1.8134078774213558e-05, + "loss": 1.9028, + "step": 44640 + }, + { + "epoch": 0.2806355940525995, + "grad_norm": 8.438286781311035, + "learning_rate": 1.8133659673268905e-05, + "loss": 1.9653, + "step": 44650 + }, + { + "epoch": 0.28069844636929664, + "grad_norm": 6.87973690032959, + "learning_rate": 1.8133240572324252e-05, + "loss": 1.7853, + "step": 44660 + }, + { + "epoch": 0.28076129868599375, + "grad_norm": 7.515631675720215, + "learning_rate": 1.81328214713796e-05, + "loss": 1.5751, + "step": 44670 + }, + { + "epoch": 0.28082415100269087, + "grad_norm": 6.332245349884033, + "learning_rate": 1.8132402370434946e-05, + "loss": 1.7445, + "step": 44680 + }, + { + "epoch": 0.280887003319388, + "grad_norm": 6.123690128326416, + "learning_rate": 
1.8131983269490294e-05, + "loss": 1.8947, + "step": 44690 + }, + { + "epoch": 0.2809498556360851, + "grad_norm": 7.331873893737793, + "learning_rate": 1.8131564168545637e-05, + "loss": 1.7275, + "step": 44700 + }, + { + "epoch": 0.2810127079527822, + "grad_norm": 6.67120885848999, + "learning_rate": 1.8131145067600984e-05, + "loss": 1.8632, + "step": 44710 + }, + { + "epoch": 0.28107556026947933, + "grad_norm": 8.140172004699707, + "learning_rate": 1.813072596665633e-05, + "loss": 2.0517, + "step": 44720 + }, + { + "epoch": 0.28113841258617644, + "grad_norm": 7.026405334472656, + "learning_rate": 1.813030686571168e-05, + "loss": 1.7541, + "step": 44730 + }, + { + "epoch": 0.2812012649028735, + "grad_norm": 9.236125946044922, + "learning_rate": 1.8129887764767022e-05, + "loss": 1.8394, + "step": 44740 + }, + { + "epoch": 0.2812641172195706, + "grad_norm": 9.79562759399414, + "learning_rate": 1.812946866382237e-05, + "loss": 1.9962, + "step": 44750 + }, + { + "epoch": 0.28132696953626773, + "grad_norm": 7.946891784667969, + "learning_rate": 1.8129049562877716e-05, + "loss": 1.9605, + "step": 44760 + }, + { + "epoch": 0.28138982185296485, + "grad_norm": 7.475961208343506, + "learning_rate": 1.8128630461933063e-05, + "loss": 1.6945, + "step": 44770 + }, + { + "epoch": 0.28145267416966196, + "grad_norm": 7.607438087463379, + "learning_rate": 1.8128211360988407e-05, + "loss": 1.8782, + "step": 44780 + }, + { + "epoch": 0.2815155264863591, + "grad_norm": 7.399308681488037, + "learning_rate": 1.8127792260043754e-05, + "loss": 1.8027, + "step": 44790 + }, + { + "epoch": 0.2815783788030562, + "grad_norm": 6.98337984085083, + "learning_rate": 1.81273731590991e-05, + "loss": 2.0415, + "step": 44800 + }, + { + "epoch": 0.2816412311197533, + "grad_norm": 7.4911017417907715, + "learning_rate": 1.8126954058154448e-05, + "loss": 1.8791, + "step": 44810 + }, + { + "epoch": 0.2817040834364504, + "grad_norm": 6.311008930206299, + "learning_rate": 1.8126534957209795e-05, + "loss": 
1.7386, + "step": 44820 + }, + { + "epoch": 0.28176693575314754, + "grad_norm": 6.257545471191406, + "learning_rate": 1.8126115856265142e-05, + "loss": 1.7411, + "step": 44830 + }, + { + "epoch": 0.28182978806984466, + "grad_norm": 6.519497394561768, + "learning_rate": 1.8125696755320486e-05, + "loss": 1.5972, + "step": 44840 + }, + { + "epoch": 0.28189264038654177, + "grad_norm": 6.8323540687561035, + "learning_rate": 1.8125277654375833e-05, + "loss": 1.7022, + "step": 44850 + }, + { + "epoch": 0.2819554927032389, + "grad_norm": 7.942287445068359, + "learning_rate": 1.812485855343118e-05, + "loss": 2.1369, + "step": 44860 + }, + { + "epoch": 0.28201834501993595, + "grad_norm": 7.868318557739258, + "learning_rate": 1.8124439452486527e-05, + "loss": 1.9454, + "step": 44870 + }, + { + "epoch": 0.28208119733663306, + "grad_norm": 6.588150501251221, + "learning_rate": 1.8124020351541874e-05, + "loss": 1.7428, + "step": 44880 + }, + { + "epoch": 0.2821440496533302, + "grad_norm": 7.526489734649658, + "learning_rate": 1.812360125059722e-05, + "loss": 1.8493, + "step": 44890 + }, + { + "epoch": 0.2822069019700273, + "grad_norm": 7.636748790740967, + "learning_rate": 1.812318214965257e-05, + "loss": 1.7206, + "step": 44900 + }, + { + "epoch": 0.2822697542867244, + "grad_norm": 6.8038177490234375, + "learning_rate": 1.8122763048707916e-05, + "loss": 1.5991, + "step": 44910 + }, + { + "epoch": 0.2823326066034215, + "grad_norm": 6.8554863929748535, + "learning_rate": 1.812234394776326e-05, + "loss": 1.6602, + "step": 44920 + }, + { + "epoch": 0.28239545892011864, + "grad_norm": 8.983702659606934, + "learning_rate": 1.8121924846818606e-05, + "loss": 1.8623, + "step": 44930 + }, + { + "epoch": 0.28245831123681575, + "grad_norm": 6.636299133300781, + "learning_rate": 1.8121505745873953e-05, + "loss": 2.0028, + "step": 44940 + }, + { + "epoch": 0.28252116355351287, + "grad_norm": 7.728862762451172, + "learning_rate": 1.81210866449293e-05, + "loss": 2.0382, + "step": 44950 + }, + 
{ + "epoch": 0.28258401587021, + "grad_norm": 7.6725311279296875, + "learning_rate": 1.8120667543984644e-05, + "loss": 2.1286, + "step": 44960 + }, + { + "epoch": 0.2826468681869071, + "grad_norm": 7.758844375610352, + "learning_rate": 1.812024844303999e-05, + "loss": 1.8814, + "step": 44970 + }, + { + "epoch": 0.2827097205036042, + "grad_norm": 6.9506988525390625, + "learning_rate": 1.8119829342095338e-05, + "loss": 1.7229, + "step": 44980 + }, + { + "epoch": 0.28277257282030127, + "grad_norm": 6.1850385665893555, + "learning_rate": 1.8119410241150685e-05, + "loss": 1.8845, + "step": 44990 + }, + { + "epoch": 0.2828354251369984, + "grad_norm": 6.863743305206299, + "learning_rate": 1.8118991140206032e-05, + "loss": 1.8941, + "step": 45000 + }, + { + "epoch": 0.2828982774536955, + "grad_norm": 5.781214714050293, + "learning_rate": 1.8118572039261376e-05, + "loss": 2.0665, + "step": 45010 + }, + { + "epoch": 0.2829611297703926, + "grad_norm": 8.170014381408691, + "learning_rate": 1.8118152938316723e-05, + "loss": 1.8578, + "step": 45020 + }, + { + "epoch": 0.28302398208708973, + "grad_norm": 7.467271327972412, + "learning_rate": 1.811773383737207e-05, + "loss": 1.9366, + "step": 45030 + }, + { + "epoch": 0.28308683440378685, + "grad_norm": 7.218717098236084, + "learning_rate": 1.8117314736427417e-05, + "loss": 1.8072, + "step": 45040 + }, + { + "epoch": 0.28314968672048396, + "grad_norm": 6.2370147705078125, + "learning_rate": 1.8116895635482764e-05, + "loss": 1.9271, + "step": 45050 + }, + { + "epoch": 0.2832125390371811, + "grad_norm": 7.850654125213623, + "learning_rate": 1.811647653453811e-05, + "loss": 1.8077, + "step": 45060 + }, + { + "epoch": 0.2832753913538782, + "grad_norm": 6.582813739776611, + "learning_rate": 1.811605743359346e-05, + "loss": 1.766, + "step": 45070 + }, + { + "epoch": 0.2833382436705753, + "grad_norm": 6.800104141235352, + "learning_rate": 1.8115638332648802e-05, + "loss": 1.9248, + "step": 45080 + }, + { + "epoch": 0.2834010959872724, + 
"grad_norm": 8.2304105758667, + "learning_rate": 1.811521923170415e-05, + "loss": 1.7349, + "step": 45090 + }, + { + "epoch": 0.28346394830396954, + "grad_norm": 6.5748138427734375, + "learning_rate": 1.8114800130759496e-05, + "loss": 2.0388, + "step": 45100 + }, + { + "epoch": 0.28352680062066665, + "grad_norm": 7.620097637176514, + "learning_rate": 1.8114381029814843e-05, + "loss": 2.0377, + "step": 45110 + }, + { + "epoch": 0.2835896529373637, + "grad_norm": 8.494855880737305, + "learning_rate": 1.811396192887019e-05, + "loss": 2.0819, + "step": 45120 + }, + { + "epoch": 0.2836525052540608, + "grad_norm": 8.075994491577148, + "learning_rate": 1.8113542827925538e-05, + "loss": 1.9846, + "step": 45130 + }, + { + "epoch": 0.28371535757075794, + "grad_norm": 7.5319695472717285, + "learning_rate": 1.811312372698088e-05, + "loss": 1.82, + "step": 45140 + }, + { + "epoch": 0.28377820988745506, + "grad_norm": 6.592759132385254, + "learning_rate": 1.811270462603623e-05, + "loss": 2.0664, + "step": 45150 + }, + { + "epoch": 0.2838410622041522, + "grad_norm": 6.713395118713379, + "learning_rate": 1.8112285525091575e-05, + "loss": 1.8459, + "step": 45160 + }, + { + "epoch": 0.2839039145208493, + "grad_norm": 8.242391586303711, + "learning_rate": 1.8111866424146922e-05, + "loss": 1.7817, + "step": 45170 + }, + { + "epoch": 0.2839667668375464, + "grad_norm": 7.351894855499268, + "learning_rate": 1.8111447323202266e-05, + "loss": 1.8684, + "step": 45180 + }, + { + "epoch": 0.2840296191542435, + "grad_norm": 6.898585319519043, + "learning_rate": 1.8111028222257613e-05, + "loss": 1.8582, + "step": 45190 + }, + { + "epoch": 0.28409247147094063, + "grad_norm": 7.325027942657471, + "learning_rate": 1.811060912131296e-05, + "loss": 1.8005, + "step": 45200 + }, + { + "epoch": 0.28415532378763775, + "grad_norm": 7.066969871520996, + "learning_rate": 1.8110190020368307e-05, + "loss": 1.9176, + "step": 45210 + }, + { + "epoch": 0.28421817610433486, + "grad_norm": 7.08568811416626, + 
"learning_rate": 1.8109770919423654e-05, + "loss": 1.847, + "step": 45220 + }, + { + "epoch": 0.284281028421032, + "grad_norm": 7.803054332733154, + "learning_rate": 1.8109351818478998e-05, + "loss": 1.7753, + "step": 45230 + }, + { + "epoch": 0.2843438807377291, + "grad_norm": 7.064817428588867, + "learning_rate": 1.8108932717534345e-05, + "loss": 1.9478, + "step": 45240 + }, + { + "epoch": 0.28440673305442615, + "grad_norm": 8.686222076416016, + "learning_rate": 1.8108513616589692e-05, + "loss": 1.641, + "step": 45250 + }, + { + "epoch": 0.28446958537112327, + "grad_norm": 5.989514350891113, + "learning_rate": 1.810809451564504e-05, + "loss": 1.8908, + "step": 45260 + }, + { + "epoch": 0.2845324376878204, + "grad_norm": 7.37302303314209, + "learning_rate": 1.8107675414700386e-05, + "loss": 1.8321, + "step": 45270 + }, + { + "epoch": 0.2845952900045175, + "grad_norm": 6.899227142333984, + "learning_rate": 1.8107256313755733e-05, + "loss": 1.9037, + "step": 45280 + }, + { + "epoch": 0.2846581423212146, + "grad_norm": 12.614656448364258, + "learning_rate": 1.810683721281108e-05, + "loss": 1.8213, + "step": 45290 + }, + { + "epoch": 0.28472099463791173, + "grad_norm": 7.848989486694336, + "learning_rate": 1.8106418111866428e-05, + "loss": 1.9224, + "step": 45300 + }, + { + "epoch": 0.28478384695460884, + "grad_norm": 8.137557029724121, + "learning_rate": 1.8105999010921775e-05, + "loss": 2.0452, + "step": 45310 + }, + { + "epoch": 0.28484669927130596, + "grad_norm": 6.750499725341797, + "learning_rate": 1.810557990997712e-05, + "loss": 2.0184, + "step": 45320 + }, + { + "epoch": 0.2849095515880031, + "grad_norm": 7.370104789733887, + "learning_rate": 1.8105160809032465e-05, + "loss": 1.9131, + "step": 45330 + }, + { + "epoch": 0.2849724039047002, + "grad_norm": 6.264123916625977, + "learning_rate": 1.8104741708087812e-05, + "loss": 1.7559, + "step": 45340 + }, + { + "epoch": 0.2850352562213973, + "grad_norm": 7.859541893005371, + "learning_rate": 
1.810432260714316e-05, + "loss": 1.8596, + "step": 45350 + }, + { + "epoch": 0.2850981085380944, + "grad_norm": 6.466876983642578, + "learning_rate": 1.8103903506198503e-05, + "loss": 1.8305, + "step": 45360 + }, + { + "epoch": 0.28516096085479153, + "grad_norm": 6.512380599975586, + "learning_rate": 1.810348440525385e-05, + "loss": 1.7773, + "step": 45370 + }, + { + "epoch": 0.2852238131714886, + "grad_norm": 6.475058078765869, + "learning_rate": 1.8103065304309197e-05, + "loss": 1.8184, + "step": 45380 + }, + { + "epoch": 0.2852866654881857, + "grad_norm": 6.929133892059326, + "learning_rate": 1.8102646203364544e-05, + "loss": 1.8313, + "step": 45390 + }, + { + "epoch": 0.2853495178048828, + "grad_norm": 7.595178127288818, + "learning_rate": 1.8102227102419888e-05, + "loss": 2.0823, + "step": 45400 + }, + { + "epoch": 0.28541237012157994, + "grad_norm": 6.889100551605225, + "learning_rate": 1.8101808001475235e-05, + "loss": 1.8938, + "step": 45410 + }, + { + "epoch": 0.28547522243827705, + "grad_norm": 7.095263957977295, + "learning_rate": 1.8101388900530582e-05, + "loss": 1.829, + "step": 45420 + }, + { + "epoch": 0.28553807475497417, + "grad_norm": 8.837443351745605, + "learning_rate": 1.810096979958593e-05, + "loss": 1.853, + "step": 45430 + }, + { + "epoch": 0.2856009270716713, + "grad_norm": 6.931927680969238, + "learning_rate": 1.8100550698641276e-05, + "loss": 1.7004, + "step": 45440 + }, + { + "epoch": 0.2856637793883684, + "grad_norm": 7.829457759857178, + "learning_rate": 1.8100131597696623e-05, + "loss": 1.8764, + "step": 45450 + }, + { + "epoch": 0.2857266317050655, + "grad_norm": 8.360833168029785, + "learning_rate": 1.8099712496751967e-05, + "loss": 1.702, + "step": 45460 + }, + { + "epoch": 0.28578948402176263, + "grad_norm": 7.660463809967041, + "learning_rate": 1.8099293395807314e-05, + "loss": 1.6702, + "step": 45470 + }, + { + "epoch": 0.28585233633845974, + "grad_norm": 6.292911052703857, + "learning_rate": 1.809887429486266e-05, + "loss": 
1.8652, + "step": 45480 + }, + { + "epoch": 0.28591518865515686, + "grad_norm": 6.531392574310303, + "learning_rate": 1.809845519391801e-05, + "loss": 1.7984, + "step": 45490 + }, + { + "epoch": 0.2859780409718539, + "grad_norm": 7.413627624511719, + "learning_rate": 1.8098036092973355e-05, + "loss": 1.882, + "step": 45500 + }, + { + "epoch": 0.28604089328855103, + "grad_norm": 6.160747528076172, + "learning_rate": 1.8097616992028703e-05, + "loss": 1.7704, + "step": 45510 + }, + { + "epoch": 0.28610374560524815, + "grad_norm": 7.964906215667725, + "learning_rate": 1.809719789108405e-05, + "loss": 1.8103, + "step": 45520 + }, + { + "epoch": 0.28616659792194526, + "grad_norm": 6.141238212585449, + "learning_rate": 1.8096778790139397e-05, + "loss": 1.959, + "step": 45530 + }, + { + "epoch": 0.2862294502386424, + "grad_norm": 6.768047332763672, + "learning_rate": 1.809635968919474e-05, + "loss": 2.0092, + "step": 45540 + }, + { + "epoch": 0.2862923025553395, + "grad_norm": 7.242840766906738, + "learning_rate": 1.8095940588250087e-05, + "loss": 1.7468, + "step": 45550 + }, + { + "epoch": 0.2863551548720366, + "grad_norm": 7.306134223937988, + "learning_rate": 1.8095521487305434e-05, + "loss": 1.8085, + "step": 45560 + }, + { + "epoch": 0.2864180071887337, + "grad_norm": 7.484009265899658, + "learning_rate": 1.809510238636078e-05, + "loss": 1.896, + "step": 45570 + }, + { + "epoch": 0.28648085950543084, + "grad_norm": 7.079681873321533, + "learning_rate": 1.8094683285416125e-05, + "loss": 1.8512, + "step": 45580 + }, + { + "epoch": 0.28654371182212796, + "grad_norm": 7.087714672088623, + "learning_rate": 1.8094264184471472e-05, + "loss": 1.7813, + "step": 45590 + }, + { + "epoch": 0.28660656413882507, + "grad_norm": 9.201361656188965, + "learning_rate": 1.809384508352682e-05, + "loss": 1.6572, + "step": 45600 + }, + { + "epoch": 0.2866694164555222, + "grad_norm": 6.778953552246094, + "learning_rate": 1.8093425982582166e-05, + "loss": 1.8233, + "step": 45610 + }, + { + 
"epoch": 0.2867322687722193, + "grad_norm": 8.023008346557617, + "learning_rate": 1.8093006881637514e-05, + "loss": 1.8874, + "step": 45620 + }, + { + "epoch": 0.28679512108891636, + "grad_norm": 7.189453601837158, + "learning_rate": 1.8092587780692857e-05, + "loss": 1.7675, + "step": 45630 + }, + { + "epoch": 0.2868579734056135, + "grad_norm": 7.227446556091309, + "learning_rate": 1.8092168679748204e-05, + "loss": 1.9177, + "step": 45640 + }, + { + "epoch": 0.2869208257223106, + "grad_norm": 6.096076488494873, + "learning_rate": 1.809174957880355e-05, + "loss": 2.1245, + "step": 45650 + }, + { + "epoch": 0.2869836780390077, + "grad_norm": 7.4338202476501465, + "learning_rate": 1.80913304778589e-05, + "loss": 1.6487, + "step": 45660 + }, + { + "epoch": 0.2870465303557048, + "grad_norm": 7.482662677764893, + "learning_rate": 1.8090911376914245e-05, + "loss": 1.8797, + "step": 45670 + }, + { + "epoch": 0.28710938267240194, + "grad_norm": 7.430908203125, + "learning_rate": 1.8090492275969593e-05, + "loss": 1.7771, + "step": 45680 + }, + { + "epoch": 0.28717223498909905, + "grad_norm": 7.199875354766846, + "learning_rate": 1.809007317502494e-05, + "loss": 1.8885, + "step": 45690 + }, + { + "epoch": 0.28723508730579617, + "grad_norm": 7.804337501525879, + "learning_rate": 1.8089654074080287e-05, + "loss": 1.6821, + "step": 45700 + }, + { + "epoch": 0.2872979396224933, + "grad_norm": 6.368311405181885, + "learning_rate": 1.808923497313563e-05, + "loss": 1.6673, + "step": 45710 + }, + { + "epoch": 0.2873607919391904, + "grad_norm": 5.977096080780029, + "learning_rate": 1.8088815872190977e-05, + "loss": 1.6639, + "step": 45720 + }, + { + "epoch": 0.2874236442558875, + "grad_norm": 5.912306308746338, + "learning_rate": 1.8088396771246325e-05, + "loss": 1.8296, + "step": 45730 + }, + { + "epoch": 0.2874864965725846, + "grad_norm": 6.9958577156066895, + "learning_rate": 1.808797767030167e-05, + "loss": 1.6494, + "step": 45740 + }, + { + "epoch": 0.28754934888928174, + 
"grad_norm": 8.182312965393066, + "learning_rate": 1.808755856935702e-05, + "loss": 1.9476, + "step": 45750 + }, + { + "epoch": 0.2876122012059788, + "grad_norm": 8.067004203796387, + "learning_rate": 1.8087139468412362e-05, + "loss": 1.6992, + "step": 45760 + }, + { + "epoch": 0.2876750535226759, + "grad_norm": 7.400192737579346, + "learning_rate": 1.808672036746771e-05, + "loss": 1.7374, + "step": 45770 + }, + { + "epoch": 0.28773790583937303, + "grad_norm": 7.941074848175049, + "learning_rate": 1.8086301266523056e-05, + "loss": 1.9707, + "step": 45780 + }, + { + "epoch": 0.28780075815607015, + "grad_norm": 7.446524143218994, + "learning_rate": 1.8085882165578404e-05, + "loss": 1.749, + "step": 45790 + }, + { + "epoch": 0.28786361047276726, + "grad_norm": 5.95335054397583, + "learning_rate": 1.8085463064633747e-05, + "loss": 1.6531, + "step": 45800 + }, + { + "epoch": 0.2879264627894644, + "grad_norm": 7.67755126953125, + "learning_rate": 1.8085043963689094e-05, + "loss": 1.8494, + "step": 45810 + }, + { + "epoch": 0.2879893151061615, + "grad_norm": 6.806369304656982, + "learning_rate": 1.808462486274444e-05, + "loss": 1.8981, + "step": 45820 + }, + { + "epoch": 0.2880521674228586, + "grad_norm": 7.677136421203613, + "learning_rate": 1.808420576179979e-05, + "loss": 1.8251, + "step": 45830 + }, + { + "epoch": 0.2881150197395557, + "grad_norm": 7.445577621459961, + "learning_rate": 1.8083786660855136e-05, + "loss": 1.9718, + "step": 45840 + }, + { + "epoch": 0.28817787205625284, + "grad_norm": 7.094901084899902, + "learning_rate": 1.808336755991048e-05, + "loss": 1.713, + "step": 45850 + }, + { + "epoch": 0.28824072437294995, + "grad_norm": 6.669374942779541, + "learning_rate": 1.8082948458965826e-05, + "loss": 1.8079, + "step": 45860 + }, + { + "epoch": 0.28830357668964707, + "grad_norm": 8.922381401062012, + "learning_rate": 1.8082529358021173e-05, + "loss": 1.8787, + "step": 45870 + }, + { + "epoch": 0.2883664290063442, + "grad_norm": 7.130077838897705, + 
"learning_rate": 1.808211025707652e-05, + "loss": 1.8555, + "step": 45880 + }, + { + "epoch": 0.28842928132304124, + "grad_norm": 6.75282096862793, + "learning_rate": 1.8081691156131867e-05, + "loss": 1.9025, + "step": 45890 + }, + { + "epoch": 0.28849213363973836, + "grad_norm": 7.469769477844238, + "learning_rate": 1.8081272055187215e-05, + "loss": 1.8409, + "step": 45900 + }, + { + "epoch": 0.2885549859564355, + "grad_norm": 6.7698750495910645, + "learning_rate": 1.808085295424256e-05, + "loss": 2.028, + "step": 45910 + }, + { + "epoch": 0.2886178382731326, + "grad_norm": 8.43509292602539, + "learning_rate": 1.808043385329791e-05, + "loss": 2.0557, + "step": 45920 + }, + { + "epoch": 0.2886806905898297, + "grad_norm": 7.978787899017334, + "learning_rate": 1.8080014752353256e-05, + "loss": 1.8012, + "step": 45930 + }, + { + "epoch": 0.2887435429065268, + "grad_norm": 7.802064418792725, + "learning_rate": 1.80795956514086e-05, + "loss": 1.6254, + "step": 45940 + }, + { + "epoch": 0.28880639522322393, + "grad_norm": 8.451842308044434, + "learning_rate": 1.8079176550463947e-05, + "loss": 1.665, + "step": 45950 + }, + { + "epoch": 0.28886924753992105, + "grad_norm": 7.110132694244385, + "learning_rate": 1.8078757449519294e-05, + "loss": 1.9293, + "step": 45960 + }, + { + "epoch": 0.28893209985661816, + "grad_norm": 6.904272556304932, + "learning_rate": 1.807833834857464e-05, + "loss": 1.6345, + "step": 45970 + }, + { + "epoch": 0.2889949521733153, + "grad_norm": 7.582462787628174, + "learning_rate": 1.8077919247629984e-05, + "loss": 1.8533, + "step": 45980 + }, + { + "epoch": 0.2890578044900124, + "grad_norm": 7.024543285369873, + "learning_rate": 1.807750014668533e-05, + "loss": 1.6023, + "step": 45990 + }, + { + "epoch": 0.2891206568067095, + "grad_norm": 6.810546398162842, + "learning_rate": 1.807708104574068e-05, + "loss": 1.9153, + "step": 46000 + }, + { + "epoch": 0.28918350912340657, + "grad_norm": 6.868898868560791, + "learning_rate": 1.8076661944796026e-05, 
+ "loss": 1.9369, + "step": 46010 + }, + { + "epoch": 0.2892463614401037, + "grad_norm": 6.2243971824646, + "learning_rate": 1.8076242843851373e-05, + "loss": 1.8618, + "step": 46020 + }, + { + "epoch": 0.2893092137568008, + "grad_norm": 7.958976745605469, + "learning_rate": 1.8075823742906716e-05, + "loss": 1.9746, + "step": 46030 + }, + { + "epoch": 0.2893720660734979, + "grad_norm": 7.3935770988464355, + "learning_rate": 1.8075404641962063e-05, + "loss": 1.8323, + "step": 46040 + }, + { + "epoch": 0.28943491839019503, + "grad_norm": 6.162091255187988, + "learning_rate": 1.807498554101741e-05, + "loss": 1.6404, + "step": 46050 + }, + { + "epoch": 0.28949777070689214, + "grad_norm": 6.663823127746582, + "learning_rate": 1.8074566440072758e-05, + "loss": 1.7306, + "step": 46060 + }, + { + "epoch": 0.28956062302358926, + "grad_norm": 7.021383285522461, + "learning_rate": 1.8074147339128105e-05, + "loss": 1.7698, + "step": 46070 + }, + { + "epoch": 0.2896234753402864, + "grad_norm": 7.511137962341309, + "learning_rate": 1.807372823818345e-05, + "loss": 1.8128, + "step": 46080 + }, + { + "epoch": 0.2896863276569835, + "grad_norm": 6.127781867980957, + "learning_rate": 1.8073309137238795e-05, + "loss": 1.7309, + "step": 46090 + }, + { + "epoch": 0.2897491799736806, + "grad_norm": 7.155242443084717, + "learning_rate": 1.8072890036294142e-05, + "loss": 1.8202, + "step": 46100 + }, + { + "epoch": 0.2898120322903777, + "grad_norm": 7.369287014007568, + "learning_rate": 1.807247093534949e-05, + "loss": 1.84, + "step": 46110 + }, + { + "epoch": 0.28987488460707483, + "grad_norm": 6.920827865600586, + "learning_rate": 1.8072051834404837e-05, + "loss": 1.6678, + "step": 46120 + }, + { + "epoch": 0.28993773692377195, + "grad_norm": 7.506908416748047, + "learning_rate": 1.8071632733460184e-05, + "loss": 1.7669, + "step": 46130 + }, + { + "epoch": 0.290000589240469, + "grad_norm": 6.9853901863098145, + "learning_rate": 1.807121363251553e-05, + "loss": 1.5735, + "step": 46140 + }, 
+ { + "epoch": 0.2900634415571661, + "grad_norm": 7.828129291534424, + "learning_rate": 1.8070794531570878e-05, + "loss": 1.7866, + "step": 46150 + }, + { + "epoch": 0.29012629387386324, + "grad_norm": 6.989505767822266, + "learning_rate": 1.807037543062622e-05, + "loss": 1.7681, + "step": 46160 + }, + { + "epoch": 0.29018914619056035, + "grad_norm": 6.1492414474487305, + "learning_rate": 1.806995632968157e-05, + "loss": 1.8462, + "step": 46170 + }, + { + "epoch": 0.29025199850725747, + "grad_norm": 7.770036220550537, + "learning_rate": 1.8069537228736916e-05, + "loss": 1.6597, + "step": 46180 + }, + { + "epoch": 0.2903148508239546, + "grad_norm": 6.923418045043945, + "learning_rate": 1.8069118127792263e-05, + "loss": 1.8354, + "step": 46190 + }, + { + "epoch": 0.2903777031406517, + "grad_norm": 6.774586200714111, + "learning_rate": 1.8068699026847606e-05, + "loss": 1.8253, + "step": 46200 + }, + { + "epoch": 0.2904405554573488, + "grad_norm": 7.21401309967041, + "learning_rate": 1.8068279925902953e-05, + "loss": 1.799, + "step": 46210 + }, + { + "epoch": 0.29050340777404593, + "grad_norm": 7.436776638031006, + "learning_rate": 1.80678608249583e-05, + "loss": 1.6846, + "step": 46220 + }, + { + "epoch": 0.29056626009074304, + "grad_norm": 6.8276214599609375, + "learning_rate": 1.8067441724013648e-05, + "loss": 1.8868, + "step": 46230 + }, + { + "epoch": 0.29062911240744016, + "grad_norm": 7.0181193351745605, + "learning_rate": 1.8067022623068995e-05, + "loss": 1.564, + "step": 46240 + }, + { + "epoch": 0.2906919647241373, + "grad_norm": 6.839227676391602, + "learning_rate": 1.806660352212434e-05, + "loss": 1.8778, + "step": 46250 + }, + { + "epoch": 0.2907548170408344, + "grad_norm": 8.030241012573242, + "learning_rate": 1.8066184421179685e-05, + "loss": 2.032, + "step": 46260 + }, + { + "epoch": 0.29081766935753145, + "grad_norm": 8.75223159790039, + "learning_rate": 1.8065765320235032e-05, + "loss": 1.8368, + "step": 46270 + }, + { + "epoch": 0.29088052167422856, 
+ "grad_norm": 7.519930839538574, + "learning_rate": 1.806534621929038e-05, + "loss": 1.8853, + "step": 46280 + }, + { + "epoch": 0.2909433739909257, + "grad_norm": 6.830507755279541, + "learning_rate": 1.8064927118345727e-05, + "loss": 1.7432, + "step": 46290 + }, + { + "epoch": 0.2910062263076228, + "grad_norm": 7.217705726623535, + "learning_rate": 1.8064508017401074e-05, + "loss": 1.8971, + "step": 46300 + }, + { + "epoch": 0.2910690786243199, + "grad_norm": 5.745712757110596, + "learning_rate": 1.806408891645642e-05, + "loss": 1.7438, + "step": 46310 + }, + { + "epoch": 0.291131930941017, + "grad_norm": 7.343744277954102, + "learning_rate": 1.8063669815511768e-05, + "loss": 1.751, + "step": 46320 + }, + { + "epoch": 0.29119478325771414, + "grad_norm": 6.936794757843018, + "learning_rate": 1.8063250714567115e-05, + "loss": 1.74, + "step": 46330 + }, + { + "epoch": 0.29125763557441126, + "grad_norm": 6.6871657371521, + "learning_rate": 1.806283161362246e-05, + "loss": 1.8208, + "step": 46340 + }, + { + "epoch": 0.29132048789110837, + "grad_norm": 7.089998245239258, + "learning_rate": 1.8062412512677806e-05, + "loss": 1.9357, + "step": 46350 + }, + { + "epoch": 0.2913833402078055, + "grad_norm": 7.669445037841797, + "learning_rate": 1.8061993411733153e-05, + "loss": 1.9424, + "step": 46360 + }, + { + "epoch": 0.2914461925245026, + "grad_norm": 7.0265631675720215, + "learning_rate": 1.80615743107885e-05, + "loss": 1.6816, + "step": 46370 + }, + { + "epoch": 0.2915090448411997, + "grad_norm": 7.4309587478637695, + "learning_rate": 1.8061155209843843e-05, + "loss": 1.8171, + "step": 46380 + }, + { + "epoch": 0.29157189715789683, + "grad_norm": 7.687270641326904, + "learning_rate": 1.806073610889919e-05, + "loss": 1.868, + "step": 46390 + }, + { + "epoch": 0.2916347494745939, + "grad_norm": 7.764848709106445, + "learning_rate": 1.8060317007954538e-05, + "loss": 2.1017, + "step": 46400 + }, + { + "epoch": 0.291697601791291, + "grad_norm": 6.4793500900268555, + 
"learning_rate": 1.8059897907009885e-05, + "loss": 1.8929, + "step": 46410 + }, + { + "epoch": 0.2917604541079881, + "grad_norm": 6.396585941314697, + "learning_rate": 1.805947880606523e-05, + "loss": 1.841, + "step": 46420 + }, + { + "epoch": 0.29182330642468524, + "grad_norm": 6.193444728851318, + "learning_rate": 1.8059059705120575e-05, + "loss": 2.0445, + "step": 46430 + }, + { + "epoch": 0.29188615874138235, + "grad_norm": 6.106712341308594, + "learning_rate": 1.8058640604175922e-05, + "loss": 1.872, + "step": 46440 + }, + { + "epoch": 0.29194901105807947, + "grad_norm": 8.70683765411377, + "learning_rate": 1.805822150323127e-05, + "loss": 1.6543, + "step": 46450 + }, + { + "epoch": 0.2920118633747766, + "grad_norm": 6.549591064453125, + "learning_rate": 1.8057802402286617e-05, + "loss": 1.893, + "step": 46460 + }, + { + "epoch": 0.2920747156914737, + "grad_norm": 6.526978492736816, + "learning_rate": 1.805738330134196e-05, + "loss": 1.9956, + "step": 46470 + }, + { + "epoch": 0.2921375680081708, + "grad_norm": 6.979129791259766, + "learning_rate": 1.8056964200397307e-05, + "loss": 1.6427, + "step": 46480 + }, + { + "epoch": 0.2922004203248679, + "grad_norm": 7.817368984222412, + "learning_rate": 1.8056545099452654e-05, + "loss": 1.7581, + "step": 46490 + }, + { + "epoch": 0.29226327264156504, + "grad_norm": 7.753480911254883, + "learning_rate": 1.8056125998508e-05, + "loss": 2.1084, + "step": 46500 + }, + { + "epoch": 0.29232612495826216, + "grad_norm": 6.885097980499268, + "learning_rate": 1.805570689756335e-05, + "loss": 2.0702, + "step": 46510 + }, + { + "epoch": 0.29238897727495927, + "grad_norm": 6.970781326293945, + "learning_rate": 1.8055287796618696e-05, + "loss": 1.8628, + "step": 46520 + }, + { + "epoch": 0.29245182959165633, + "grad_norm": 6.915135860443115, + "learning_rate": 1.8054868695674043e-05, + "loss": 1.698, + "step": 46530 + }, + { + "epoch": 0.29251468190835345, + "grad_norm": 6.631259918212891, + "learning_rate": 1.805444959472939e-05, 
+ "loss": 1.7241, + "step": 46540 + }, + { + "epoch": 0.29257753422505056, + "grad_norm": 7.211366653442383, + "learning_rate": 1.8054030493784737e-05, + "loss": 1.7791, + "step": 46550 + }, + { + "epoch": 0.2926403865417477, + "grad_norm": 6.618610858917236, + "learning_rate": 1.805361139284008e-05, + "loss": 1.8971, + "step": 46560 + }, + { + "epoch": 0.2927032388584448, + "grad_norm": 7.058317184448242, + "learning_rate": 1.8053192291895428e-05, + "loss": 1.9995, + "step": 46570 + }, + { + "epoch": 0.2927660911751419, + "grad_norm": 6.45781946182251, + "learning_rate": 1.8052773190950775e-05, + "loss": 1.7701, + "step": 46580 + }, + { + "epoch": 0.292828943491839, + "grad_norm": 6.641473293304443, + "learning_rate": 1.8052354090006122e-05, + "loss": 1.5614, + "step": 46590 + }, + { + "epoch": 0.29289179580853614, + "grad_norm": 7.312847137451172, + "learning_rate": 1.8051934989061465e-05, + "loss": 1.8116, + "step": 46600 + }, + { + "epoch": 0.29295464812523325, + "grad_norm": 6.886174201965332, + "learning_rate": 1.8051515888116813e-05, + "loss": 1.7406, + "step": 46610 + }, + { + "epoch": 0.29301750044193037, + "grad_norm": 6.168983459472656, + "learning_rate": 1.805109678717216e-05, + "loss": 1.694, + "step": 46620 + }, + { + "epoch": 0.2930803527586275, + "grad_norm": 7.689616680145264, + "learning_rate": 1.8050677686227507e-05, + "loss": 1.813, + "step": 46630 + }, + { + "epoch": 0.2931432050753246, + "grad_norm": 5.971601486206055, + "learning_rate": 1.8050258585282854e-05, + "loss": 1.8417, + "step": 46640 + }, + { + "epoch": 0.29320605739202166, + "grad_norm": 7.689766883850098, + "learning_rate": 1.8049839484338197e-05, + "loss": 2.1305, + "step": 46650 + }, + { + "epoch": 0.2932689097087188, + "grad_norm": 7.959043025970459, + "learning_rate": 1.8049420383393544e-05, + "loss": 1.8383, + "step": 46660 + }, + { + "epoch": 0.2933317620254159, + "grad_norm": 6.360825061798096, + "learning_rate": 1.804900128244889e-05, + "loss": 1.6932, + "step": 46670 + }, 
+ { + "epoch": 0.293394614342113, + "grad_norm": 6.770597457885742, + "learning_rate": 1.804858218150424e-05, + "loss": 1.7634, + "step": 46680 + }, + { + "epoch": 0.2934574666588101, + "grad_norm": 6.878189563751221, + "learning_rate": 1.8048163080559586e-05, + "loss": 1.6384, + "step": 46690 + }, + { + "epoch": 0.29352031897550723, + "grad_norm": 7.4565510749816895, + "learning_rate": 1.8047743979614933e-05, + "loss": 1.9091, + "step": 46700 + }, + { + "epoch": 0.29358317129220435, + "grad_norm": 7.244266986846924, + "learning_rate": 1.804732487867028e-05, + "loss": 1.8722, + "step": 46710 + }, + { + "epoch": 0.29364602360890146, + "grad_norm": 6.477084159851074, + "learning_rate": 1.8046905777725624e-05, + "loss": 1.6486, + "step": 46720 + }, + { + "epoch": 0.2937088759255986, + "grad_norm": 6.180047512054443, + "learning_rate": 1.804648667678097e-05, + "loss": 1.9805, + "step": 46730 + }, + { + "epoch": 0.2937717282422957, + "grad_norm": 6.933464050292969, + "learning_rate": 1.8046067575836318e-05, + "loss": 1.8354, + "step": 46740 + }, + { + "epoch": 0.2938345805589928, + "grad_norm": 7.443251132965088, + "learning_rate": 1.8045648474891665e-05, + "loss": 1.7307, + "step": 46750 + }, + { + "epoch": 0.2938974328756899, + "grad_norm": 6.6274003982543945, + "learning_rate": 1.8045229373947012e-05, + "loss": 1.9998, + "step": 46760 + }, + { + "epoch": 0.29396028519238704, + "grad_norm": 6.9516472816467285, + "learning_rate": 1.804481027300236e-05, + "loss": 1.6248, + "step": 46770 + }, + { + "epoch": 0.2940231375090841, + "grad_norm": 6.207671165466309, + "learning_rate": 1.8044391172057703e-05, + "loss": 1.7965, + "step": 46780 + }, + { + "epoch": 0.2940859898257812, + "grad_norm": 7.07285737991333, + "learning_rate": 1.804397207111305e-05, + "loss": 1.8847, + "step": 46790 + }, + { + "epoch": 0.29414884214247833, + "grad_norm": 7.29025936126709, + "learning_rate": 1.8043552970168397e-05, + "loss": 1.9441, + "step": 46800 + }, + { + "epoch": 0.29421169445917544, 
+ "grad_norm": 6.398433685302734, + "learning_rate": 1.8043133869223744e-05, + "loss": 1.9725, + "step": 46810 + }, + { + "epoch": 0.29427454677587256, + "grad_norm": 7.145827293395996, + "learning_rate": 1.8042714768279087e-05, + "loss": 1.7165, + "step": 46820 + }, + { + "epoch": 0.2943373990925697, + "grad_norm": 7.262965202331543, + "learning_rate": 1.8042295667334435e-05, + "loss": 2.0032, + "step": 46830 + }, + { + "epoch": 0.2944002514092668, + "grad_norm": 7.3345866203308105, + "learning_rate": 1.804187656638978e-05, + "loss": 1.9671, + "step": 46840 + }, + { + "epoch": 0.2944631037259639, + "grad_norm": 6.392716884613037, + "learning_rate": 1.804145746544513e-05, + "loss": 1.9767, + "step": 46850 + }, + { + "epoch": 0.294525956042661, + "grad_norm": 7.186389446258545, + "learning_rate": 1.8041038364500476e-05, + "loss": 2.0334, + "step": 46860 + }, + { + "epoch": 0.29458880835935813, + "grad_norm": 6.9807257652282715, + "learning_rate": 1.804061926355582e-05, + "loss": 1.7917, + "step": 46870 + }, + { + "epoch": 0.29465166067605525, + "grad_norm": 6.366339206695557, + "learning_rate": 1.8040200162611166e-05, + "loss": 1.6576, + "step": 46880 + }, + { + "epoch": 0.29471451299275236, + "grad_norm": 6.354554176330566, + "learning_rate": 1.8039781061666514e-05, + "loss": 1.682, + "step": 46890 + }, + { + "epoch": 0.2947773653094495, + "grad_norm": 7.513469219207764, + "learning_rate": 1.803936196072186e-05, + "loss": 1.7218, + "step": 46900 + }, + { + "epoch": 0.29484021762614654, + "grad_norm": 10.029930114746094, + "learning_rate": 1.8038942859777208e-05, + "loss": 2.184, + "step": 46910 + }, + { + "epoch": 0.29490306994284365, + "grad_norm": 6.429253101348877, + "learning_rate": 1.8038523758832555e-05, + "loss": 1.8852, + "step": 46920 + }, + { + "epoch": 0.29496592225954077, + "grad_norm": 6.832610607147217, + "learning_rate": 1.8038104657887902e-05, + "loss": 1.685, + "step": 46930 + }, + { + "epoch": 0.2950287745762379, + "grad_norm": 6.212717056274414, 
+ "learning_rate": 1.803768555694325e-05, + "loss": 1.9077, + "step": 46940 + }, + { + "epoch": 0.295091626892935, + "grad_norm": 7.340845584869385, + "learning_rate": 1.8037266455998596e-05, + "loss": 1.8025, + "step": 46950 + }, + { + "epoch": 0.2951544792096321, + "grad_norm": 7.158466339111328, + "learning_rate": 1.803684735505394e-05, + "loss": 1.5885, + "step": 46960 + }, + { + "epoch": 0.29521733152632923, + "grad_norm": 6.452349662780762, + "learning_rate": 1.8036428254109287e-05, + "loss": 1.6982, + "step": 46970 + }, + { + "epoch": 0.29528018384302634, + "grad_norm": 7.0004448890686035, + "learning_rate": 1.8036009153164634e-05, + "loss": 1.8416, + "step": 46980 + }, + { + "epoch": 0.29534303615972346, + "grad_norm": 7.211448669433594, + "learning_rate": 1.803559005221998e-05, + "loss": 1.7706, + "step": 46990 + }, + { + "epoch": 0.2954058884764206, + "grad_norm": 6.377902507781982, + "learning_rate": 1.8035170951275325e-05, + "loss": 1.8846, + "step": 47000 + }, + { + "epoch": 0.2954687407931177, + "grad_norm": 10.337285995483398, + "learning_rate": 1.803475185033067e-05, + "loss": 1.8624, + "step": 47010 + }, + { + "epoch": 0.2955315931098148, + "grad_norm": 7.350049018859863, + "learning_rate": 1.803433274938602e-05, + "loss": 1.8305, + "step": 47020 + }, + { + "epoch": 0.2955944454265119, + "grad_norm": 6.6064372062683105, + "learning_rate": 1.8033913648441366e-05, + "loss": 1.8282, + "step": 47030 + }, + { + "epoch": 0.295657297743209, + "grad_norm": 8.158204078674316, + "learning_rate": 1.803349454749671e-05, + "loss": 1.9169, + "step": 47040 + }, + { + "epoch": 0.2957201500599061, + "grad_norm": 7.254039287567139, + "learning_rate": 1.8033075446552057e-05, + "loss": 1.8077, + "step": 47050 + }, + { + "epoch": 0.2957830023766032, + "grad_norm": 7.76493501663208, + "learning_rate": 1.8032656345607404e-05, + "loss": 1.986, + "step": 47060 + }, + { + "epoch": 0.2958458546933003, + "grad_norm": 6.843966007232666, + "learning_rate": 
1.803223724466275e-05, + "loss": 1.984, + "step": 47070 + }, + { + "epoch": 0.29590870700999744, + "grad_norm": 5.823937892913818, + "learning_rate": 1.8031818143718098e-05, + "loss": 1.9044, + "step": 47080 + }, + { + "epoch": 0.29597155932669456, + "grad_norm": 6.896873950958252, + "learning_rate": 1.803139904277344e-05, + "loss": 1.845, + "step": 47090 + }, + { + "epoch": 0.29603441164339167, + "grad_norm": 7.583927631378174, + "learning_rate": 1.803097994182879e-05, + "loss": 1.8615, + "step": 47100 + }, + { + "epoch": 0.2960972639600888, + "grad_norm": 6.692607402801514, + "learning_rate": 1.8030560840884136e-05, + "loss": 1.9815, + "step": 47110 + }, + { + "epoch": 0.2961601162767859, + "grad_norm": 7.335759162902832, + "learning_rate": 1.8030141739939483e-05, + "loss": 1.8262, + "step": 47120 + }, + { + "epoch": 0.296222968593483, + "grad_norm": 5.8943986892700195, + "learning_rate": 1.802972263899483e-05, + "loss": 1.9539, + "step": 47130 + }, + { + "epoch": 0.29628582091018013, + "grad_norm": 7.7029595375061035, + "learning_rate": 1.8029303538050177e-05, + "loss": 1.8097, + "step": 47140 + }, + { + "epoch": 0.29634867322687725, + "grad_norm": 7.473947525024414, + "learning_rate": 1.8028884437105524e-05, + "loss": 1.8542, + "step": 47150 + }, + { + "epoch": 0.2964115255435743, + "grad_norm": 6.227741241455078, + "learning_rate": 1.802846533616087e-05, + "loss": 1.6428, + "step": 47160 + }, + { + "epoch": 0.2964743778602714, + "grad_norm": 7.363375186920166, + "learning_rate": 1.8028046235216218e-05, + "loss": 1.8623, + "step": 47170 + }, + { + "epoch": 0.29653723017696854, + "grad_norm": 5.548816680908203, + "learning_rate": 1.802762713427156e-05, + "loss": 1.833, + "step": 47180 + }, + { + "epoch": 0.29660008249366565, + "grad_norm": 6.321163654327393, + "learning_rate": 1.802720803332691e-05, + "loss": 1.8314, + "step": 47190 + }, + { + "epoch": 0.29666293481036277, + "grad_norm": 5.64842414855957, + "learning_rate": 1.8026788932382256e-05, + "loss": 
1.8445, + "step": 47200 + }, + { + "epoch": 0.2967257871270599, + "grad_norm": 8.194724082946777, + "learning_rate": 1.8026369831437603e-05, + "loss": 1.9798, + "step": 47210 + }, + { + "epoch": 0.296788639443757, + "grad_norm": 7.17202615737915, + "learning_rate": 1.8025950730492947e-05, + "loss": 1.965, + "step": 47220 + }, + { + "epoch": 0.2968514917604541, + "grad_norm": 6.299026966094971, + "learning_rate": 1.8025531629548294e-05, + "loss": 1.6138, + "step": 47230 + }, + { + "epoch": 0.2969143440771512, + "grad_norm": 7.963582515716553, + "learning_rate": 1.802511252860364e-05, + "loss": 1.9727, + "step": 47240 + }, + { + "epoch": 0.29697719639384834, + "grad_norm": 7.803959369659424, + "learning_rate": 1.8024693427658988e-05, + "loss": 1.8211, + "step": 47250 + }, + { + "epoch": 0.29704004871054546, + "grad_norm": 7.53137731552124, + "learning_rate": 1.8024274326714335e-05, + "loss": 1.6584, + "step": 47260 + }, + { + "epoch": 0.29710290102724257, + "grad_norm": 7.338706970214844, + "learning_rate": 1.802385522576968e-05, + "loss": 1.8276, + "step": 47270 + }, + { + "epoch": 0.2971657533439397, + "grad_norm": 7.736049175262451, + "learning_rate": 1.8023436124825026e-05, + "loss": 1.7143, + "step": 47280 + }, + { + "epoch": 0.29722860566063675, + "grad_norm": 7.593161106109619, + "learning_rate": 1.8023017023880373e-05, + "loss": 2.0029, + "step": 47290 + }, + { + "epoch": 0.29729145797733386, + "grad_norm": 7.548622131347656, + "learning_rate": 1.802259792293572e-05, + "loss": 1.8466, + "step": 47300 + }, + { + "epoch": 0.297354310294031, + "grad_norm": 7.36475944519043, + "learning_rate": 1.8022178821991067e-05, + "loss": 1.7804, + "step": 47310 + }, + { + "epoch": 0.2974171626107281, + "grad_norm": 7.36644172668457, + "learning_rate": 1.8021759721046414e-05, + "loss": 1.9277, + "step": 47320 + }, + { + "epoch": 0.2974800149274252, + "grad_norm": 6.477627754211426, + "learning_rate": 1.802134062010176e-05, + "loss": 1.822, + "step": 47330 + }, + { + "epoch": 
0.2975428672441223, + "grad_norm": 7.44856595993042, + "learning_rate": 1.8020921519157105e-05, + "loss": 1.8445, + "step": 47340 + }, + { + "epoch": 0.29760571956081944, + "grad_norm": 7.568237781524658, + "learning_rate": 1.8020502418212452e-05, + "loss": 1.9055, + "step": 47350 + }, + { + "epoch": 0.29766857187751655, + "grad_norm": 7.216434001922607, + "learning_rate": 1.80200833172678e-05, + "loss": 1.905, + "step": 47360 + }, + { + "epoch": 0.29773142419421367, + "grad_norm": 7.183870315551758, + "learning_rate": 1.8019664216323146e-05, + "loss": 1.6605, + "step": 47370 + }, + { + "epoch": 0.2977942765109108, + "grad_norm": 6.750730514526367, + "learning_rate": 1.8019245115378493e-05, + "loss": 1.8036, + "step": 47380 + }, + { + "epoch": 0.2978571288276079, + "grad_norm": 6.851963996887207, + "learning_rate": 1.801882601443384e-05, + "loss": 1.8385, + "step": 47390 + }, + { + "epoch": 0.297919981144305, + "grad_norm": 6.923430442810059, + "learning_rate": 1.8018406913489184e-05, + "loss": 1.7631, + "step": 47400 + }, + { + "epoch": 0.29798283346100213, + "grad_norm": 6.543471813201904, + "learning_rate": 1.801798781254453e-05, + "loss": 1.7407, + "step": 47410 + }, + { + "epoch": 0.2980456857776992, + "grad_norm": 6.620553970336914, + "learning_rate": 1.8017568711599878e-05, + "loss": 1.8049, + "step": 47420 + }, + { + "epoch": 0.2981085380943963, + "grad_norm": 6.741138935089111, + "learning_rate": 1.8017149610655225e-05, + "loss": 1.8804, + "step": 47430 + }, + { + "epoch": 0.2981713904110934, + "grad_norm": 7.707770824432373, + "learning_rate": 1.801673050971057e-05, + "loss": 1.7485, + "step": 47440 + }, + { + "epoch": 0.29823424272779053, + "grad_norm": 7.0002312660217285, + "learning_rate": 1.8016311408765916e-05, + "loss": 1.7442, + "step": 47450 + }, + { + "epoch": 0.29829709504448765, + "grad_norm": 6.893242835998535, + "learning_rate": 1.8015892307821263e-05, + "loss": 1.824, + "step": 47460 + }, + { + "epoch": 0.29835994736118476, + "grad_norm": 
6.108610153198242, + "learning_rate": 1.801547320687661e-05, + "loss": 1.7694, + "step": 47470 + }, + { + "epoch": 0.2984227996778819, + "grad_norm": 7.697864532470703, + "learning_rate": 1.8015054105931957e-05, + "loss": 1.7751, + "step": 47480 + }, + { + "epoch": 0.298485651994579, + "grad_norm": 6.688353061676025, + "learning_rate": 1.80146350049873e-05, + "loss": 2.0156, + "step": 47490 + }, + { + "epoch": 0.2985485043112761, + "grad_norm": 7.048642158508301, + "learning_rate": 1.8014215904042648e-05, + "loss": 1.7247, + "step": 47500 + }, + { + "epoch": 0.2986113566279732, + "grad_norm": 6.9464850425720215, + "learning_rate": 1.8013796803097995e-05, + "loss": 1.9576, + "step": 47510 + }, + { + "epoch": 0.29867420894467034, + "grad_norm": 6.998134136199951, + "learning_rate": 1.8013377702153342e-05, + "loss": 1.9937, + "step": 47520 + }, + { + "epoch": 0.29873706126136745, + "grad_norm": 7.306562423706055, + "learning_rate": 1.801295860120869e-05, + "loss": 1.9435, + "step": 47530 + }, + { + "epoch": 0.29879991357806457, + "grad_norm": 7.738890647888184, + "learning_rate": 1.8012539500264036e-05, + "loss": 1.837, + "step": 47540 + }, + { + "epoch": 0.29886276589476163, + "grad_norm": 8.114222526550293, + "learning_rate": 1.8012120399319383e-05, + "loss": 2.0996, + "step": 47550 + }, + { + "epoch": 0.29892561821145874, + "grad_norm": 7.040402412414551, + "learning_rate": 1.801170129837473e-05, + "loss": 1.725, + "step": 47560 + }, + { + "epoch": 0.29898847052815586, + "grad_norm": 6.799810409545898, + "learning_rate": 1.8011282197430077e-05, + "loss": 1.8267, + "step": 47570 + }, + { + "epoch": 0.299051322844853, + "grad_norm": 6.3567585945129395, + "learning_rate": 1.801086309648542e-05, + "loss": 1.7863, + "step": 47580 + }, + { + "epoch": 0.2991141751615501, + "grad_norm": 7.715065002441406, + "learning_rate": 1.8010443995540768e-05, + "loss": 1.6239, + "step": 47590 + }, + { + "epoch": 0.2991770274782472, + "grad_norm": 7.012716770172119, + "learning_rate": 
1.8010024894596115e-05, + "loss": 1.7821, + "step": 47600 + }, + { + "epoch": 0.2992398797949443, + "grad_norm": 6.990746021270752, + "learning_rate": 1.8009605793651462e-05, + "loss": 1.6305, + "step": 47610 + }, + { + "epoch": 0.29930273211164143, + "grad_norm": 7.111852645874023, + "learning_rate": 1.8009186692706806e-05, + "loss": 2.0926, + "step": 47620 + }, + { + "epoch": 0.29936558442833855, + "grad_norm": 8.18101692199707, + "learning_rate": 1.8008767591762153e-05, + "loss": 1.8121, + "step": 47630 + }, + { + "epoch": 0.29942843674503566, + "grad_norm": 7.045371055603027, + "learning_rate": 1.80083484908175e-05, + "loss": 1.8821, + "step": 47640 + }, + { + "epoch": 0.2994912890617328, + "grad_norm": 7.540232181549072, + "learning_rate": 1.8007929389872847e-05, + "loss": 1.9423, + "step": 47650 + }, + { + "epoch": 0.2995541413784299, + "grad_norm": 7.191141605377197, + "learning_rate": 1.800751028892819e-05, + "loss": 1.8398, + "step": 47660 + }, + { + "epoch": 0.29961699369512695, + "grad_norm": 7.146394729614258, + "learning_rate": 1.8007091187983538e-05, + "loss": 1.9698, + "step": 47670 + }, + { + "epoch": 0.29967984601182407, + "grad_norm": 7.083776473999023, + "learning_rate": 1.8006672087038885e-05, + "loss": 1.7142, + "step": 47680 + }, + { + "epoch": 0.2997426983285212, + "grad_norm": 7.160184860229492, + "learning_rate": 1.8006252986094232e-05, + "loss": 1.9591, + "step": 47690 + }, + { + "epoch": 0.2998055506452183, + "grad_norm": 7.3451313972473145, + "learning_rate": 1.800583388514958e-05, + "loss": 1.9017, + "step": 47700 + }, + { + "epoch": 0.2998684029619154, + "grad_norm": 6.632269382476807, + "learning_rate": 1.8005414784204926e-05, + "loss": 1.7079, + "step": 47710 + }, + { + "epoch": 0.29993125527861253, + "grad_norm": 6.296167850494385, + "learning_rate": 1.800499568326027e-05, + "loss": 1.9561, + "step": 47720 + }, + { + "epoch": 0.29999410759530964, + "grad_norm": 8.638937950134277, + "learning_rate": 1.8004576582315617e-05, + "loss": 
1.7177, + "step": 47730 + }, + { + "epoch": 0.30005695991200676, + "grad_norm": 6.743673324584961, + "learning_rate": 1.8004157481370964e-05, + "loss": 1.9555, + "step": 47740 + }, + { + "epoch": 0.3001198122287039, + "grad_norm": 6.653906345367432, + "learning_rate": 1.800373838042631e-05, + "loss": 1.7253, + "step": 47750 + }, + { + "epoch": 0.300182664545401, + "grad_norm": 8.051403999328613, + "learning_rate": 1.8003319279481658e-05, + "loss": 1.9951, + "step": 47760 + }, + { + "epoch": 0.3002455168620981, + "grad_norm": 7.247729778289795, + "learning_rate": 1.8002900178537005e-05, + "loss": 1.8491, + "step": 47770 + }, + { + "epoch": 0.3003083691787952, + "grad_norm": 8.287534713745117, + "learning_rate": 1.8002481077592352e-05, + "loss": 1.7213, + "step": 47780 + }, + { + "epoch": 0.30037122149549234, + "grad_norm": 7.613092422485352, + "learning_rate": 1.80020619766477e-05, + "loss": 1.8323, + "step": 47790 + }, + { + "epoch": 0.3004340738121894, + "grad_norm": 7.131540775299072, + "learning_rate": 1.8001642875703043e-05, + "loss": 1.8019, + "step": 47800 + }, + { + "epoch": 0.3004969261288865, + "grad_norm": 7.072052001953125, + "learning_rate": 1.800122377475839e-05, + "loss": 2.1704, + "step": 47810 + }, + { + "epoch": 0.3005597784455836, + "grad_norm": 8.470544815063477, + "learning_rate": 1.8000804673813737e-05, + "loss": 1.9052, + "step": 47820 + }, + { + "epoch": 0.30062263076228074, + "grad_norm": 7.4394850730896, + "learning_rate": 1.8000385572869084e-05, + "loss": 1.9449, + "step": 47830 + }, + { + "epoch": 0.30068548307897786, + "grad_norm": 7.438833236694336, + "learning_rate": 1.7999966471924428e-05, + "loss": 1.9655, + "step": 47840 + }, + { + "epoch": 0.30074833539567497, + "grad_norm": 6.787623405456543, + "learning_rate": 1.7999547370979775e-05, + "loss": 1.8052, + "step": 47850 + }, + { + "epoch": 0.3008111877123721, + "grad_norm": 6.316556930541992, + "learning_rate": 1.7999128270035122e-05, + "loss": 1.7274, + "step": 47860 + }, + { + 
"epoch": 0.3008740400290692, + "grad_norm": 7.249434947967529, + "learning_rate": 1.799870916909047e-05, + "loss": 1.6915, + "step": 47870 + }, + { + "epoch": 0.3009368923457663, + "grad_norm": 7.901944637298584, + "learning_rate": 1.7998290068145816e-05, + "loss": 1.7021, + "step": 47880 + }, + { + "epoch": 0.30099974466246343, + "grad_norm": 6.7186760902404785, + "learning_rate": 1.799787096720116e-05, + "loss": 1.7394, + "step": 47890 + }, + { + "epoch": 0.30106259697916055, + "grad_norm": 8.327082633972168, + "learning_rate": 1.7997451866256507e-05, + "loss": 1.8368, + "step": 47900 + }, + { + "epoch": 0.30112544929585766, + "grad_norm": 6.639060020446777, + "learning_rate": 1.7997032765311854e-05, + "loss": 1.7183, + "step": 47910 + }, + { + "epoch": 0.3011883016125548, + "grad_norm": 6.945801734924316, + "learning_rate": 1.79966136643672e-05, + "loss": 1.919, + "step": 47920 + }, + { + "epoch": 0.30125115392925184, + "grad_norm": 7.766027927398682, + "learning_rate": 1.7996194563422548e-05, + "loss": 1.846, + "step": 47930 + }, + { + "epoch": 0.30131400624594895, + "grad_norm": 6.516760349273682, + "learning_rate": 1.7995775462477895e-05, + "loss": 1.864, + "step": 47940 + }, + { + "epoch": 0.30137685856264607, + "grad_norm": 5.535265922546387, + "learning_rate": 1.7995356361533242e-05, + "loss": 1.6883, + "step": 47950 + }, + { + "epoch": 0.3014397108793432, + "grad_norm": 7.9575090408325195, + "learning_rate": 1.799493726058859e-05, + "loss": 1.8749, + "step": 47960 + }, + { + "epoch": 0.3015025631960403, + "grad_norm": 7.324628829956055, + "learning_rate": 1.7994518159643933e-05, + "loss": 1.547, + "step": 47970 + }, + { + "epoch": 0.3015654155127374, + "grad_norm": 7.135219573974609, + "learning_rate": 1.799409905869928e-05, + "loss": 1.9229, + "step": 47980 + }, + { + "epoch": 0.3016282678294345, + "grad_norm": 7.590478897094727, + "learning_rate": 1.7993679957754627e-05, + "loss": 2.0679, + "step": 47990 + }, + { + "epoch": 0.30169112014613164, + 
"grad_norm": 7.243574142456055, + "learning_rate": 1.7993260856809974e-05, + "loss": 2.0274, + "step": 48000 + }, + { + "epoch": 0.30175397246282876, + "grad_norm": 7.255794048309326, + "learning_rate": 1.799284175586532e-05, + "loss": 1.6656, + "step": 48010 + }, + { + "epoch": 0.30181682477952587, + "grad_norm": 6.9301934242248535, + "learning_rate": 1.7992422654920665e-05, + "loss": 1.9641, + "step": 48020 + }, + { + "epoch": 0.301879677096223, + "grad_norm": 7.461561679840088, + "learning_rate": 1.7992003553976012e-05, + "loss": 1.9576, + "step": 48030 + }, + { + "epoch": 0.3019425294129201, + "grad_norm": 7.48775053024292, + "learning_rate": 1.799158445303136e-05, + "loss": 2.0371, + "step": 48040 + }, + { + "epoch": 0.3020053817296172, + "grad_norm": 7.324866771697998, + "learning_rate": 1.7991165352086706e-05, + "loss": 1.9388, + "step": 48050 + }, + { + "epoch": 0.3020682340463143, + "grad_norm": 6.965330123901367, + "learning_rate": 1.799074625114205e-05, + "loss": 2.1266, + "step": 48060 + }, + { + "epoch": 0.3021310863630114, + "grad_norm": 7.181867599487305, + "learning_rate": 1.7990327150197397e-05, + "loss": 1.8953, + "step": 48070 + }, + { + "epoch": 0.3021939386797085, + "grad_norm": 8.220254898071289, + "learning_rate": 1.7989908049252744e-05, + "loss": 1.864, + "step": 48080 + }, + { + "epoch": 0.3022567909964056, + "grad_norm": 7.029447555541992, + "learning_rate": 1.798948894830809e-05, + "loss": 1.5146, + "step": 48090 + }, + { + "epoch": 0.30231964331310274, + "grad_norm": 6.5770792961120605, + "learning_rate": 1.7989069847363438e-05, + "loss": 1.921, + "step": 48100 + }, + { + "epoch": 0.30238249562979985, + "grad_norm": 6.378840923309326, + "learning_rate": 1.798865074641878e-05, + "loss": 2.0148, + "step": 48110 + }, + { + "epoch": 0.30244534794649697, + "grad_norm": 7.669800758361816, + "learning_rate": 1.798823164547413e-05, + "loss": 1.9965, + "step": 48120 + }, + { + "epoch": 0.3025082002631941, + "grad_norm": 7.05275297164917, + 
"learning_rate": 1.7987812544529476e-05, + "loss": 1.642, + "step": 48130 + }, + { + "epoch": 0.3025710525798912, + "grad_norm": 8.19581413269043, + "learning_rate": 1.7987393443584823e-05, + "loss": 1.7397, + "step": 48140 + }, + { + "epoch": 0.3026339048965883, + "grad_norm": 6.76801872253418, + "learning_rate": 1.798697434264017e-05, + "loss": 2.24, + "step": 48150 + }, + { + "epoch": 0.30269675721328543, + "grad_norm": 6.347983360290527, + "learning_rate": 1.7986555241695517e-05, + "loss": 1.9528, + "step": 48160 + }, + { + "epoch": 0.30275960952998254, + "grad_norm": 7.854902744293213, + "learning_rate": 1.7986136140750864e-05, + "loss": 1.6761, + "step": 48170 + }, + { + "epoch": 0.3028224618466796, + "grad_norm": 7.092661380767822, + "learning_rate": 1.798571703980621e-05, + "loss": 2.0425, + "step": 48180 + }, + { + "epoch": 0.3028853141633767, + "grad_norm": 6.634575843811035, + "learning_rate": 1.7985297938861558e-05, + "loss": 1.9488, + "step": 48190 + }, + { + "epoch": 0.30294816648007383, + "grad_norm": 7.243200302124023, + "learning_rate": 1.7984878837916902e-05, + "loss": 1.8827, + "step": 48200 + }, + { + "epoch": 0.30301101879677095, + "grad_norm": 6.852927207946777, + "learning_rate": 1.798445973697225e-05, + "loss": 1.7027, + "step": 48210 + }, + { + "epoch": 0.30307387111346806, + "grad_norm": 7.248819351196289, + "learning_rate": 1.7984040636027596e-05, + "loss": 1.9543, + "step": 48220 + }, + { + "epoch": 0.3031367234301652, + "grad_norm": 7.36978006362915, + "learning_rate": 1.7983621535082943e-05, + "loss": 1.9045, + "step": 48230 + }, + { + "epoch": 0.3031995757468623, + "grad_norm": 6.938103675842285, + "learning_rate": 1.7983202434138287e-05, + "loss": 1.7748, + "step": 48240 + }, + { + "epoch": 0.3032624280635594, + "grad_norm": 6.852223873138428, + "learning_rate": 1.7982783333193634e-05, + "loss": 1.7587, + "step": 48250 + }, + { + "epoch": 0.3033252803802565, + "grad_norm": 6.4312334060668945, + "learning_rate": 1.798236423224898e-05, 
+ "loss": 1.6611, + "step": 48260 + }, + { + "epoch": 0.30338813269695364, + "grad_norm": 6.497596263885498, + "learning_rate": 1.7981945131304328e-05, + "loss": 1.732, + "step": 48270 + }, + { + "epoch": 0.30345098501365075, + "grad_norm": 5.9871602058410645, + "learning_rate": 1.798152603035967e-05, + "loss": 1.7099, + "step": 48280 + }, + { + "epoch": 0.30351383733034787, + "grad_norm": 6.564183235168457, + "learning_rate": 1.798110692941502e-05, + "loss": 1.7541, + "step": 48290 + }, + { + "epoch": 0.303576689647045, + "grad_norm": 6.92208194732666, + "learning_rate": 1.7980687828470366e-05, + "loss": 1.887, + "step": 48300 + }, + { + "epoch": 0.30363954196374204, + "grad_norm": 6.861618995666504, + "learning_rate": 1.7980268727525713e-05, + "loss": 1.6396, + "step": 48310 + }, + { + "epoch": 0.30370239428043916, + "grad_norm": 8.422454833984375, + "learning_rate": 1.797984962658106e-05, + "loss": 1.9241, + "step": 48320 + }, + { + "epoch": 0.3037652465971363, + "grad_norm": 8.636983871459961, + "learning_rate": 1.7979430525636407e-05, + "loss": 1.772, + "step": 48330 + }, + { + "epoch": 0.3038280989138334, + "grad_norm": 5.715958595275879, + "learning_rate": 1.7979011424691754e-05, + "loss": 1.8422, + "step": 48340 + }, + { + "epoch": 0.3038909512305305, + "grad_norm": 7.868222713470459, + "learning_rate": 1.7978592323747098e-05, + "loss": 1.9577, + "step": 48350 + }, + { + "epoch": 0.3039538035472276, + "grad_norm": 8.06572437286377, + "learning_rate": 1.7978173222802445e-05, + "loss": 1.9822, + "step": 48360 + }, + { + "epoch": 0.30401665586392473, + "grad_norm": 7.032797336578369, + "learning_rate": 1.7977754121857792e-05, + "loss": 2.0774, + "step": 48370 + }, + { + "epoch": 0.30407950818062185, + "grad_norm": 6.042160987854004, + "learning_rate": 1.797733502091314e-05, + "loss": 1.7674, + "step": 48380 + }, + { + "epoch": 0.30414236049731896, + "grad_norm": 7.035040855407715, + "learning_rate": 1.7976915919968486e-05, + "loss": 1.9051, + "step": 48390 + 
}, + { + "epoch": 0.3042052128140161, + "grad_norm": 7.070200443267822, + "learning_rate": 1.7976496819023833e-05, + "loss": 1.7928, + "step": 48400 + }, + { + "epoch": 0.3042680651307132, + "grad_norm": 8.123950958251953, + "learning_rate": 1.797607771807918e-05, + "loss": 1.9142, + "step": 48410 + }, + { + "epoch": 0.3043309174474103, + "grad_norm": 7.851001262664795, + "learning_rate": 1.7975658617134524e-05, + "loss": 1.7116, + "step": 48420 + }, + { + "epoch": 0.3043937697641074, + "grad_norm": 7.491074085235596, + "learning_rate": 1.797523951618987e-05, + "loss": 1.7995, + "step": 48430 + }, + { + "epoch": 0.3044566220808045, + "grad_norm": 7.63198184967041, + "learning_rate": 1.7974820415245218e-05, + "loss": 1.8103, + "step": 48440 + }, + { + "epoch": 0.3045194743975016, + "grad_norm": 6.391507148742676, + "learning_rate": 1.7974401314300565e-05, + "loss": 1.7648, + "step": 48450 + }, + { + "epoch": 0.3045823267141987, + "grad_norm": 7.051711082458496, + "learning_rate": 1.797398221335591e-05, + "loss": 1.8767, + "step": 48460 + }, + { + "epoch": 0.30464517903089583, + "grad_norm": 6.65802526473999, + "learning_rate": 1.7973563112411256e-05, + "loss": 1.7294, + "step": 48470 + }, + { + "epoch": 0.30470803134759294, + "grad_norm": 6.325926780700684, + "learning_rate": 1.7973144011466603e-05, + "loss": 1.8462, + "step": 48480 + }, + { + "epoch": 0.30477088366429006, + "grad_norm": 6.50266170501709, + "learning_rate": 1.797272491052195e-05, + "loss": 1.6773, + "step": 48490 + }, + { + "epoch": 0.3048337359809872, + "grad_norm": 7.261912822723389, + "learning_rate": 1.7972305809577297e-05, + "loss": 1.8061, + "step": 48500 + }, + { + "epoch": 0.3048965882976843, + "grad_norm": 6.066399574279785, + "learning_rate": 1.797188670863264e-05, + "loss": 1.793, + "step": 48510 + }, + { + "epoch": 0.3049594406143814, + "grad_norm": 7.92596960067749, + "learning_rate": 1.7971467607687988e-05, + "loss": 1.9376, + "step": 48520 + }, + { + "epoch": 0.3050222929310785, + 
"grad_norm": 7.034647464752197, + "learning_rate": 1.7971048506743335e-05, + "loss": 1.9117, + "step": 48530 + }, + { + "epoch": 0.30508514524777564, + "grad_norm": 6.429213523864746, + "learning_rate": 1.7970629405798682e-05, + "loss": 2.0952, + "step": 48540 + }, + { + "epoch": 0.30514799756447275, + "grad_norm": 7.143803119659424, + "learning_rate": 1.797021030485403e-05, + "loss": 1.8642, + "step": 48550 + }, + { + "epoch": 0.30521084988116987, + "grad_norm": Infinity, + "learning_rate": 1.7969791203909376e-05, + "loss": 1.8707, + "step": 48560 + }, + { + "epoch": 0.3052737021978669, + "grad_norm": 7.328855514526367, + "learning_rate": 1.7969414013059187e-05, + "loss": 1.7873, + "step": 48570 + }, + { + "epoch": 0.30533655451456404, + "grad_norm": 7.020454406738281, + "learning_rate": 1.796899491211453e-05, + "loss": 1.8623, + "step": 48580 + }, + { + "epoch": 0.30539940683126116, + "grad_norm": 7.066745758056641, + "learning_rate": 1.7968575811169878e-05, + "loss": 1.6394, + "step": 48590 + }, + { + "epoch": 0.30546225914795827, + "grad_norm": 6.923233985900879, + "learning_rate": 1.7968156710225225e-05, + "loss": 1.6722, + "step": 48600 + }, + { + "epoch": 0.3055251114646554, + "grad_norm": 6.17056131362915, + "learning_rate": 1.7967737609280572e-05, + "loss": 1.7665, + "step": 48610 + }, + { + "epoch": 0.3055879637813525, + "grad_norm": 7.043300151824951, + "learning_rate": 1.796731850833592e-05, + "loss": 1.6778, + "step": 48620 + }, + { + "epoch": 0.3056508160980496, + "grad_norm": 7.796075344085693, + "learning_rate": 1.7966899407391266e-05, + "loss": 2.0575, + "step": 48630 + }, + { + "epoch": 0.30571366841474673, + "grad_norm": 6.610686302185059, + "learning_rate": 1.7966480306446614e-05, + "loss": 1.8745, + "step": 48640 + }, + { + "epoch": 0.30577652073144385, + "grad_norm": 6.844950199127197, + "learning_rate": 1.796606120550196e-05, + "loss": 2.0337, + "step": 48650 + }, + { + "epoch": 0.30583937304814096, + "grad_norm": 7.483672618865967, + 
"learning_rate": 1.7965642104557304e-05, + "loss": 1.7752, + "step": 48660 + }, + { + "epoch": 0.3059022253648381, + "grad_norm": 7.143709659576416, + "learning_rate": 1.796522300361265e-05, + "loss": 1.9864, + "step": 48670 + }, + { + "epoch": 0.3059650776815352, + "grad_norm": 7.549054145812988, + "learning_rate": 1.7964803902668e-05, + "loss": 1.7346, + "step": 48680 + }, + { + "epoch": 0.30602792999823225, + "grad_norm": 7.867265701293945, + "learning_rate": 1.7964384801723346e-05, + "loss": 1.771, + "step": 48690 + }, + { + "epoch": 0.30609078231492937, + "grad_norm": 5.640731334686279, + "learning_rate": 1.7963965700778693e-05, + "loss": 1.7527, + "step": 48700 + }, + { + "epoch": 0.3061536346316265, + "grad_norm": 6.865826606750488, + "learning_rate": 1.796354659983404e-05, + "loss": 1.7794, + "step": 48710 + }, + { + "epoch": 0.3062164869483236, + "grad_norm": 8.240419387817383, + "learning_rate": 1.7963127498889383e-05, + "loss": 1.707, + "step": 48720 + }, + { + "epoch": 0.3062793392650207, + "grad_norm": 7.254922866821289, + "learning_rate": 1.796270839794473e-05, + "loss": 1.8926, + "step": 48730 + }, + { + "epoch": 0.3063421915817178, + "grad_norm": 8.631532669067383, + "learning_rate": 1.7962289297000077e-05, + "loss": 1.784, + "step": 48740 + }, + { + "epoch": 0.30640504389841494, + "grad_norm": 7.44426965713501, + "learning_rate": 1.7961870196055425e-05, + "loss": 1.8762, + "step": 48750 + }, + { + "epoch": 0.30646789621511206, + "grad_norm": 7.574648857116699, + "learning_rate": 1.7961451095110768e-05, + "loss": 1.6253, + "step": 48760 + }, + { + "epoch": 0.30653074853180917, + "grad_norm": 6.930304050445557, + "learning_rate": 1.7961031994166115e-05, + "loss": 2.0645, + "step": 48770 + }, + { + "epoch": 0.3065936008485063, + "grad_norm": 6.743585586547852, + "learning_rate": 1.7960612893221462e-05, + "loss": 1.9567, + "step": 48780 + }, + { + "epoch": 0.3066564531652034, + "grad_norm": 8.479488372802734, + "learning_rate": 1.796019379227681e-05, + 
"loss": 1.9408, + "step": 48790 + }, + { + "epoch": 0.3067193054819005, + "grad_norm": 6.826617240905762, + "learning_rate": 1.7959774691332153e-05, + "loss": 1.8654, + "step": 48800 + }, + { + "epoch": 0.30678215779859763, + "grad_norm": 7.865091800689697, + "learning_rate": 1.79593555903875e-05, + "loss": 2.0285, + "step": 48810 + }, + { + "epoch": 0.3068450101152947, + "grad_norm": 7.427491188049316, + "learning_rate": 1.7958936489442847e-05, + "loss": 1.9897, + "step": 48820 + }, + { + "epoch": 0.3069078624319918, + "grad_norm": 8.017901420593262, + "learning_rate": 1.7958517388498194e-05, + "loss": 1.9974, + "step": 48830 + }, + { + "epoch": 0.3069707147486889, + "grad_norm": 7.371147155761719, + "learning_rate": 1.795809828755354e-05, + "loss": 1.6837, + "step": 48840 + }, + { + "epoch": 0.30703356706538604, + "grad_norm": 8.3818998336792, + "learning_rate": 1.795767918660889e-05, + "loss": 1.7212, + "step": 48850 + }, + { + "epoch": 0.30709641938208315, + "grad_norm": 7.13516092300415, + "learning_rate": 1.7957260085664236e-05, + "loss": 1.6428, + "step": 48860 + }, + { + "epoch": 0.30715927169878027, + "grad_norm": 7.678220748901367, + "learning_rate": 1.7956840984719583e-05, + "loss": 1.9003, + "step": 48870 + }, + { + "epoch": 0.3072221240154774, + "grad_norm": 7.682860851287842, + "learning_rate": 1.795642188377493e-05, + "loss": 1.849, + "step": 48880 + }, + { + "epoch": 0.3072849763321745, + "grad_norm": 6.572717666625977, + "learning_rate": 1.7956002782830273e-05, + "loss": 1.86, + "step": 48890 + }, + { + "epoch": 0.3073478286488716, + "grad_norm": 7.628412246704102, + "learning_rate": 1.795558368188562e-05, + "loss": 1.8827, + "step": 48900 + }, + { + "epoch": 0.30741068096556873, + "grad_norm": 7.541478633880615, + "learning_rate": 1.7955164580940968e-05, + "loss": 1.9272, + "step": 48910 + }, + { + "epoch": 0.30747353328226584, + "grad_norm": 7.739235877990723, + "learning_rate": 1.7954745479996315e-05, + "loss": 1.8035, + "step": 48920 + }, + { + 
"epoch": 0.30753638559896296, + "grad_norm": 5.950796127319336, + "learning_rate": 1.795432637905166e-05, + "loss": 1.9516, + "step": 48930 + }, + { + "epoch": 0.3075992379156601, + "grad_norm": 6.327701568603516, + "learning_rate": 1.7953907278107005e-05, + "loss": 1.8042, + "step": 48940 + }, + { + "epoch": 0.30766209023235713, + "grad_norm": 8.778162956237793, + "learning_rate": 1.7953488177162352e-05, + "loss": 1.7809, + "step": 48950 + }, + { + "epoch": 0.30772494254905425, + "grad_norm": 7.731008052825928, + "learning_rate": 1.79530690762177e-05, + "loss": 1.7748, + "step": 48960 + }, + { + "epoch": 0.30778779486575136, + "grad_norm": 7.498165607452393, + "learning_rate": 1.7952649975273047e-05, + "loss": 1.9784, + "step": 48970 + }, + { + "epoch": 0.3078506471824485, + "grad_norm": 6.195873737335205, + "learning_rate": 1.795223087432839e-05, + "loss": 1.8752, + "step": 48980 + }, + { + "epoch": 0.3079134994991456, + "grad_norm": 6.936201572418213, + "learning_rate": 1.7951811773383737e-05, + "loss": 2.047, + "step": 48990 + }, + { + "epoch": 0.3079763518158427, + "grad_norm": 6.399014472961426, + "learning_rate": 1.7951392672439084e-05, + "loss": 1.963, + "step": 49000 + }, + { + "epoch": 0.3080392041325398, + "grad_norm": 6.7194504737854, + "learning_rate": 1.795097357149443e-05, + "loss": 2.106, + "step": 49010 + }, + { + "epoch": 0.30810205644923694, + "grad_norm": 5.826361656188965, + "learning_rate": 1.795055447054978e-05, + "loss": 1.5659, + "step": 49020 + }, + { + "epoch": 0.30816490876593405, + "grad_norm": 7.676239490509033, + "learning_rate": 1.7950135369605126e-05, + "loss": 1.5651, + "step": 49030 + }, + { + "epoch": 0.30822776108263117, + "grad_norm": 6.70309591293335, + "learning_rate": 1.794971626866047e-05, + "loss": 2.0287, + "step": 49040 + }, + { + "epoch": 0.3082906133993283, + "grad_norm": 7.022078514099121, + "learning_rate": 1.7949297167715816e-05, + "loss": 1.8823, + "step": 49050 + }, + { + "epoch": 0.3083534657160254, + 
"grad_norm": 7.787981033325195, + "learning_rate": 1.7948878066771163e-05, + "loss": 1.7635, + "step": 49060 + }, + { + "epoch": 0.3084163180327225, + "grad_norm": 6.165613651275635, + "learning_rate": 1.794845896582651e-05, + "loss": 1.7764, + "step": 49070 + }, + { + "epoch": 0.3084791703494196, + "grad_norm": 7.295814037322998, + "learning_rate": 1.7948039864881858e-05, + "loss": 1.9999, + "step": 49080 + }, + { + "epoch": 0.3085420226661167, + "grad_norm": 6.8880486488342285, + "learning_rate": 1.7947620763937205e-05, + "loss": 1.7568, + "step": 49090 + }, + { + "epoch": 0.3086048749828138, + "grad_norm": 7.967652320861816, + "learning_rate": 1.7947201662992552e-05, + "loss": 1.9172, + "step": 49100 + }, + { + "epoch": 0.3086677272995109, + "grad_norm": 5.634479522705078, + "learning_rate": 1.7946782562047895e-05, + "loss": 1.4705, + "step": 49110 + }, + { + "epoch": 0.30873057961620803, + "grad_norm": 26.34225845336914, + "learning_rate": 1.7946363461103242e-05, + "loss": 2.1408, + "step": 49120 + }, + { + "epoch": 0.30879343193290515, + "grad_norm": 7.0778632164001465, + "learning_rate": 1.794594436015859e-05, + "loss": 1.5962, + "step": 49130 + }, + { + "epoch": 0.30885628424960226, + "grad_norm": 7.073582172393799, + "learning_rate": 1.7945525259213937e-05, + "loss": 1.7534, + "step": 49140 + }, + { + "epoch": 0.3089191365662994, + "grad_norm": 7.2108049392700195, + "learning_rate": 1.7945106158269284e-05, + "loss": 1.7949, + "step": 49150 + }, + { + "epoch": 0.3089819888829965, + "grad_norm": 7.800317287445068, + "learning_rate": 1.7944687057324627e-05, + "loss": 2.0386, + "step": 49160 + }, + { + "epoch": 0.3090448411996936, + "grad_norm": 7.265311241149902, + "learning_rate": 1.7944267956379974e-05, + "loss": 1.7653, + "step": 49170 + }, + { + "epoch": 0.3091076935163907, + "grad_norm": 7.880399227142334, + "learning_rate": 1.794384885543532e-05, + "loss": 1.9162, + "step": 49180 + }, + { + "epoch": 0.30917054583308784, + "grad_norm": 6.755533695220947, 
+ "learning_rate": 1.794342975449067e-05, + "loss": 1.9217, + "step": 49190 + }, + { + "epoch": 0.30923339814978495, + "grad_norm": 7.04044771194458, + "learning_rate": 1.7943010653546012e-05, + "loss": 1.8448, + "step": 49200 + }, + { + "epoch": 0.309296250466482, + "grad_norm": 6.240016460418701, + "learning_rate": 1.794259155260136e-05, + "loss": 1.7467, + "step": 49210 + }, + { + "epoch": 0.30935910278317913, + "grad_norm": 7.164734363555908, + "learning_rate": 1.7942172451656706e-05, + "loss": 2.0081, + "step": 49220 + }, + { + "epoch": 0.30942195509987624, + "grad_norm": 7.578107833862305, + "learning_rate": 1.7941753350712053e-05, + "loss": 1.7392, + "step": 49230 + }, + { + "epoch": 0.30948480741657336, + "grad_norm": 7.135012149810791, + "learning_rate": 1.79413342497674e-05, + "loss": 1.7525, + "step": 49240 + }, + { + "epoch": 0.3095476597332705, + "grad_norm": 7.098381042480469, + "learning_rate": 1.7940915148822748e-05, + "loss": 1.9605, + "step": 49250 + }, + { + "epoch": 0.3096105120499676, + "grad_norm": 8.046966552734375, + "learning_rate": 1.7940496047878095e-05, + "loss": 1.706, + "step": 49260 + }, + { + "epoch": 0.3096733643666647, + "grad_norm": 7.1453094482421875, + "learning_rate": 1.7940076946933442e-05, + "loss": 2.1353, + "step": 49270 + }, + { + "epoch": 0.3097362166833618, + "grad_norm": 7.904295444488525, + "learning_rate": 1.793965784598879e-05, + "loss": 2.0071, + "step": 49280 + }, + { + "epoch": 0.30979906900005894, + "grad_norm": 8.429654121398926, + "learning_rate": 1.7939238745044132e-05, + "loss": 1.8943, + "step": 49290 + }, + { + "epoch": 0.30986192131675605, + "grad_norm": 7.261959552764893, + "learning_rate": 1.793881964409948e-05, + "loss": 1.754, + "step": 49300 + }, + { + "epoch": 0.30992477363345317, + "grad_norm": 6.630239009857178, + "learning_rate": 1.7938400543154827e-05, + "loss": 2.0018, + "step": 49310 + }, + { + "epoch": 0.3099876259501503, + "grad_norm": 7.209163188934326, + "learning_rate": 
1.7937981442210174e-05, + "loss": 1.8888, + "step": 49320 + }, + { + "epoch": 0.31005047826684734, + "grad_norm": 7.261234283447266, + "learning_rate": 1.793756234126552e-05, + "loss": 1.8513, + "step": 49330 + }, + { + "epoch": 0.31011333058354446, + "grad_norm": 6.507667541503906, + "learning_rate": 1.7937143240320864e-05, + "loss": 1.7614, + "step": 49340 + }, + { + "epoch": 0.31017618290024157, + "grad_norm": 6.24437141418457, + "learning_rate": 1.793672413937621e-05, + "loss": 2.1852, + "step": 49350 + }, + { + "epoch": 0.3102390352169387, + "grad_norm": 5.852570056915283, + "learning_rate": 1.793630503843156e-05, + "loss": 1.7248, + "step": 49360 + }, + { + "epoch": 0.3103018875336358, + "grad_norm": 6.463139533996582, + "learning_rate": 1.7935885937486906e-05, + "loss": 1.7355, + "step": 49370 + }, + { + "epoch": 0.3103647398503329, + "grad_norm": 6.722446918487549, + "learning_rate": 1.793546683654225e-05, + "loss": 1.7005, + "step": 49380 + }, + { + "epoch": 0.31042759216703003, + "grad_norm": 6.7306718826293945, + "learning_rate": 1.7935047735597596e-05, + "loss": 1.9504, + "step": 49390 + }, + { + "epoch": 0.31049044448372715, + "grad_norm": 6.818288803100586, + "learning_rate": 1.7934628634652943e-05, + "loss": 1.8979, + "step": 49400 + }, + { + "epoch": 0.31055329680042426, + "grad_norm": 6.045130252838135, + "learning_rate": 1.793420953370829e-05, + "loss": 1.7419, + "step": 49410 + }, + { + "epoch": 0.3106161491171214, + "grad_norm": 5.838948726654053, + "learning_rate": 1.7933790432763634e-05, + "loss": 1.6398, + "step": 49420 + }, + { + "epoch": 0.3106790014338185, + "grad_norm": 7.738033294677734, + "learning_rate": 1.793337133181898e-05, + "loss": 1.6713, + "step": 49430 + }, + { + "epoch": 0.3107418537505156, + "grad_norm": 6.2949604988098145, + "learning_rate": 1.793295223087433e-05, + "loss": 2.0401, + "step": 49440 + }, + { + "epoch": 0.3108047060672127, + "grad_norm": 6.236536502838135, + "learning_rate": 1.7932533129929675e-05, + "loss": 
1.8538, + "step": 49450 + }, + { + "epoch": 0.3108675583839098, + "grad_norm": 6.050281524658203, + "learning_rate": 1.7932114028985023e-05, + "loss": 1.7109, + "step": 49460 + }, + { + "epoch": 0.3109304107006069, + "grad_norm": 7.410684108734131, + "learning_rate": 1.793169492804037e-05, + "loss": 2.0338, + "step": 49470 + }, + { + "epoch": 0.310993263017304, + "grad_norm": 6.922196388244629, + "learning_rate": 1.7931275827095717e-05, + "loss": 1.8601, + "step": 49480 + }, + { + "epoch": 0.3110561153340011, + "grad_norm": 7.497621059417725, + "learning_rate": 1.7930856726151064e-05, + "loss": 1.8235, + "step": 49490 + }, + { + "epoch": 0.31111896765069824, + "grad_norm": 7.287035942077637, + "learning_rate": 1.793043762520641e-05, + "loss": 1.8273, + "step": 49500 + }, + { + "epoch": 0.31118181996739536, + "grad_norm": 7.15346622467041, + "learning_rate": 1.7930018524261754e-05, + "loss": 1.6921, + "step": 49510 + }, + { + "epoch": 0.31124467228409247, + "grad_norm": 6.8751444816589355, + "learning_rate": 1.79295994233171e-05, + "loss": 1.6717, + "step": 49520 + }, + { + "epoch": 0.3113075246007896, + "grad_norm": 7.623301982879639, + "learning_rate": 1.792918032237245e-05, + "loss": 2.1327, + "step": 49530 + }, + { + "epoch": 0.3113703769174867, + "grad_norm": 6.589799880981445, + "learning_rate": 1.7928761221427796e-05, + "loss": 1.7977, + "step": 49540 + }, + { + "epoch": 0.3114332292341838, + "grad_norm": 7.977503776550293, + "learning_rate": 1.7928342120483143e-05, + "loss": 1.5672, + "step": 49550 + }, + { + "epoch": 0.31149608155088093, + "grad_norm": 6.824627876281738, + "learning_rate": 1.7927923019538486e-05, + "loss": 1.7908, + "step": 49560 + }, + { + "epoch": 0.31155893386757805, + "grad_norm": 5.8089070320129395, + "learning_rate": 1.7927503918593834e-05, + "loss": 1.8488, + "step": 49570 + }, + { + "epoch": 0.31162178618427516, + "grad_norm": 8.658555030822754, + "learning_rate": 1.792708481764918e-05, + "loss": 1.5759, + "step": 49580 + }, + { + 
"epoch": 0.3116846385009722, + "grad_norm": 7.306936740875244, + "learning_rate": 1.7926665716704528e-05, + "loss": 1.817, + "step": 49590 + }, + { + "epoch": 0.31174749081766934, + "grad_norm": 8.089118003845215, + "learning_rate": 1.792624661575987e-05, + "loss": 2.0118, + "step": 49600 + }, + { + "epoch": 0.31181034313436645, + "grad_norm": 6.11497688293457, + "learning_rate": 1.792582751481522e-05, + "loss": 1.8159, + "step": 49610 + }, + { + "epoch": 0.31187319545106357, + "grad_norm": 7.032067775726318, + "learning_rate": 1.7925408413870565e-05, + "loss": 1.9235, + "step": 49620 + }, + { + "epoch": 0.3119360477677607, + "grad_norm": 6.375333786010742, + "learning_rate": 1.7924989312925913e-05, + "loss": 1.9218, + "step": 49630 + }, + { + "epoch": 0.3119989000844578, + "grad_norm": 6.956419467926025, + "learning_rate": 1.792457021198126e-05, + "loss": 2.0338, + "step": 49640 + }, + { + "epoch": 0.3120617524011549, + "grad_norm": 9.121163368225098, + "learning_rate": 1.7924151111036607e-05, + "loss": 1.6713, + "step": 49650 + }, + { + "epoch": 0.31212460471785203, + "grad_norm": 8.117715835571289, + "learning_rate": 1.792373201009195e-05, + "loss": 1.8347, + "step": 49660 + }, + { + "epoch": 0.31218745703454914, + "grad_norm": 7.388977527618408, + "learning_rate": 1.7923312909147297e-05, + "loss": 1.8706, + "step": 49670 + }, + { + "epoch": 0.31225030935124626, + "grad_norm": 8.16901683807373, + "learning_rate": 1.7922893808202645e-05, + "loss": 1.9452, + "step": 49680 + }, + { + "epoch": 0.3123131616679434, + "grad_norm": 7.1762213706970215, + "learning_rate": 1.792247470725799e-05, + "loss": 1.8667, + "step": 49690 + }, + { + "epoch": 0.3123760139846405, + "grad_norm": 6.232288837432861, + "learning_rate": 1.792205560631334e-05, + "loss": 1.9377, + "step": 49700 + }, + { + "epoch": 0.3124388663013376, + "grad_norm": 6.773346900939941, + "learning_rate": 1.7921636505368686e-05, + "loss": 1.8434, + "step": 49710 + }, + { + "epoch": 0.31250171861803466, + 
"grad_norm": 7.3948869705200195, + "learning_rate": 1.7921217404424033e-05, + "loss": 2.0011, + "step": 49720 + }, + { + "epoch": 0.3125645709347318, + "grad_norm": 6.538943767547607, + "learning_rate": 1.7920798303479376e-05, + "loss": 1.8106, + "step": 49730 + }, + { + "epoch": 0.3126274232514289, + "grad_norm": 7.071255683898926, + "learning_rate": 1.7920379202534724e-05, + "loss": 2.0634, + "step": 49740 + }, + { + "epoch": 0.312690275568126, + "grad_norm": 6.963105201721191, + "learning_rate": 1.791996010159007e-05, + "loss": 1.7158, + "step": 49750 + }, + { + "epoch": 0.3127531278848231, + "grad_norm": 6.8943586349487305, + "learning_rate": 1.7919541000645418e-05, + "loss": 1.6281, + "step": 49760 + }, + { + "epoch": 0.31281598020152024, + "grad_norm": 6.684320449829102, + "learning_rate": 1.7919121899700765e-05, + "loss": 1.5789, + "step": 49770 + }, + { + "epoch": 0.31287883251821735, + "grad_norm": 7.835089206695557, + "learning_rate": 1.791870279875611e-05, + "loss": 1.9414, + "step": 49780 + }, + { + "epoch": 0.31294168483491447, + "grad_norm": 8.6885347366333, + "learning_rate": 1.7918283697811456e-05, + "loss": 1.7312, + "step": 49790 + }, + { + "epoch": 0.3130045371516116, + "grad_norm": 7.475964069366455, + "learning_rate": 1.7917864596866803e-05, + "loss": 1.8954, + "step": 49800 + }, + { + "epoch": 0.3130673894683087, + "grad_norm": 6.240327835083008, + "learning_rate": 1.791744549592215e-05, + "loss": 1.8695, + "step": 49810 + }, + { + "epoch": 0.3131302417850058, + "grad_norm": 8.597575187683105, + "learning_rate": 1.7917026394977493e-05, + "loss": 1.8715, + "step": 49820 + }, + { + "epoch": 0.31319309410170293, + "grad_norm": 7.659453868865967, + "learning_rate": 1.791660729403284e-05, + "loss": 1.9177, + "step": 49830 + }, + { + "epoch": 0.3132559464184, + "grad_norm": 7.241260528564453, + "learning_rate": 1.7916188193088187e-05, + "loss": 1.8623, + "step": 49840 + }, + { + "epoch": 0.3133187987350971, + "grad_norm": 6.449772834777832, + 
"learning_rate": 1.7915769092143535e-05, + "loss": 2.0723, + "step": 49850 + }, + { + "epoch": 0.3133816510517942, + "grad_norm": 6.881225109100342, + "learning_rate": 1.791534999119888e-05, + "loss": 1.8445, + "step": 49860 + }, + { + "epoch": 0.31344450336849133, + "grad_norm": 6.207837104797363, + "learning_rate": 1.791493089025423e-05, + "loss": 2.0165, + "step": 49870 + }, + { + "epoch": 0.31350735568518845, + "grad_norm": 7.346057415008545, + "learning_rate": 1.7914511789309576e-05, + "loss": 1.7559, + "step": 49880 + }, + { + "epoch": 0.31357020800188556, + "grad_norm": 6.961569309234619, + "learning_rate": 1.7914092688364923e-05, + "loss": 1.8131, + "step": 49890 + }, + { + "epoch": 0.3136330603185827, + "grad_norm": 7.810278415679932, + "learning_rate": 1.791367358742027e-05, + "loss": 1.8027, + "step": 49900 + }, + { + "epoch": 0.3136959126352798, + "grad_norm": 6.59714412689209, + "learning_rate": 1.7913254486475614e-05, + "loss": 1.7132, + "step": 49910 + }, + { + "epoch": 0.3137587649519769, + "grad_norm": 6.5777153968811035, + "learning_rate": 1.791283538553096e-05, + "loss": 1.7878, + "step": 49920 + }, + { + "epoch": 0.313821617268674, + "grad_norm": 7.4422736167907715, + "learning_rate": 1.7912416284586308e-05, + "loss": 1.9738, + "step": 49930 + }, + { + "epoch": 0.31388446958537114, + "grad_norm": 7.056987285614014, + "learning_rate": 1.7911997183641655e-05, + "loss": 2.0433, + "step": 49940 + }, + { + "epoch": 0.31394732190206825, + "grad_norm": 6.372257232666016, + "learning_rate": 1.7911578082697002e-05, + "loss": 1.593, + "step": 49950 + }, + { + "epoch": 0.31401017421876537, + "grad_norm": 7.499957084655762, + "learning_rate": 1.7911158981752346e-05, + "loss": 1.9129, + "step": 49960 + }, + { + "epoch": 0.31407302653546243, + "grad_norm": 7.2810444831848145, + "learning_rate": 1.7910739880807693e-05, + "loss": 1.9261, + "step": 49970 + }, + { + "epoch": 0.31413587885215954, + "grad_norm": 6.837697982788086, + "learning_rate": 
1.791032077986304e-05, + "loss": 1.8936, + "step": 49980 + }, + { + "epoch": 0.31419873116885666, + "grad_norm": 7.222091197967529, + "learning_rate": 1.7909901678918387e-05, + "loss": 1.9858, + "step": 49990 + }, + { + "epoch": 0.3142615834855538, + "grad_norm": 7.050332069396973, + "learning_rate": 1.790948257797373e-05, + "loss": 1.9582, + "step": 50000 + }, + { + "epoch": 0.3143244358022509, + "grad_norm": 7.38700532913208, + "learning_rate": 1.7909063477029078e-05, + "loss": 1.7374, + "step": 50010 + }, + { + "epoch": 0.314387288118948, + "grad_norm": 7.6068501472473145, + "learning_rate": 1.7908644376084425e-05, + "loss": 1.9596, + "step": 50020 + }, + { + "epoch": 0.3144501404356451, + "grad_norm": 6.117990493774414, + "learning_rate": 1.790822527513977e-05, + "loss": 1.8358, + "step": 50030 + }, + { + "epoch": 0.31451299275234224, + "grad_norm": 7.923231601715088, + "learning_rate": 1.7907806174195115e-05, + "loss": 1.8882, + "step": 50040 + }, + { + "epoch": 0.31457584506903935, + "grad_norm": 6.715137004852295, + "learning_rate": 1.7907387073250462e-05, + "loss": 1.7807, + "step": 50050 + }, + { + "epoch": 0.31463869738573647, + "grad_norm": 6.373053550720215, + "learning_rate": 1.790696797230581e-05, + "loss": 1.9853, + "step": 50060 + }, + { + "epoch": 0.3147015497024336, + "grad_norm": 6.411943435668945, + "learning_rate": 1.7906548871361157e-05, + "loss": 1.8439, + "step": 50070 + }, + { + "epoch": 0.3147644020191307, + "grad_norm": 7.262099266052246, + "learning_rate": 1.7906129770416504e-05, + "loss": 1.8817, + "step": 50080 + }, + { + "epoch": 0.3148272543358278, + "grad_norm": 6.456963062286377, + "learning_rate": 1.790571066947185e-05, + "loss": 1.767, + "step": 50090 + }, + { + "epoch": 0.31489010665252487, + "grad_norm": 7.869819164276123, + "learning_rate": 1.7905291568527198e-05, + "loss": 2.1192, + "step": 50100 + }, + { + "epoch": 0.314952958969222, + "grad_norm": 6.73624849319458, + "learning_rate": 1.7904872467582545e-05, + "loss": 
1.6259, + "step": 50110 + }, + { + "epoch": 0.3150158112859191, + "grad_norm": 6.531667232513428, + "learning_rate": 1.7904453366637892e-05, + "loss": 1.8657, + "step": 50120 + }, + { + "epoch": 0.3150786636026162, + "grad_norm": 6.859806537628174, + "learning_rate": 1.7904034265693236e-05, + "loss": 1.9632, + "step": 50130 + }, + { + "epoch": 0.31514151591931333, + "grad_norm": 8.881882667541504, + "learning_rate": 1.7903615164748583e-05, + "loss": 2.0181, + "step": 50140 + }, + { + "epoch": 0.31520436823601045, + "grad_norm": 7.2561235427856445, + "learning_rate": 1.790319606380393e-05, + "loss": 1.9784, + "step": 50150 + }, + { + "epoch": 0.31526722055270756, + "grad_norm": 6.737002372741699, + "learning_rate": 1.7902776962859277e-05, + "loss": 1.7579, + "step": 50160 + }, + { + "epoch": 0.3153300728694047, + "grad_norm": 6.801862716674805, + "learning_rate": 1.7902357861914624e-05, + "loss": 1.812, + "step": 50170 + }, + { + "epoch": 0.3153929251861018, + "grad_norm": 6.360828876495361, + "learning_rate": 1.7901938760969968e-05, + "loss": 1.9614, + "step": 50180 + }, + { + "epoch": 0.3154557775027989, + "grad_norm": 6.732676029205322, + "learning_rate": 1.7901519660025315e-05, + "loss": 1.6523, + "step": 50190 + }, + { + "epoch": 0.315518629819496, + "grad_norm": 7.454711437225342, + "learning_rate": 1.7901100559080662e-05, + "loss": 1.83, + "step": 50200 + }, + { + "epoch": 0.31558148213619314, + "grad_norm": 6.141359806060791, + "learning_rate": 1.790068145813601e-05, + "loss": 1.6708, + "step": 50210 + }, + { + "epoch": 0.31564433445289025, + "grad_norm": 9.075355529785156, + "learning_rate": 1.7900262357191352e-05, + "loss": 1.8219, + "step": 50220 + }, + { + "epoch": 0.3157071867695873, + "grad_norm": 6.9223246574401855, + "learning_rate": 1.78998432562467e-05, + "loss": 1.8902, + "step": 50230 + }, + { + "epoch": 0.3157700390862844, + "grad_norm": 7.6331562995910645, + "learning_rate": 1.7899424155302047e-05, + "loss": 1.9828, + "step": 50240 + }, + { + 
"epoch": 0.31583289140298154, + "grad_norm": 8.22348403930664, + "learning_rate": 1.7899005054357394e-05, + "loss": 1.7224, + "step": 50250 + }, + { + "epoch": 0.31589574371967866, + "grad_norm": 7.708650588989258, + "learning_rate": 1.789858595341274e-05, + "loss": 1.7092, + "step": 50260 + }, + { + "epoch": 0.31595859603637577, + "grad_norm": 7.123536586761475, + "learning_rate": 1.7898166852468088e-05, + "loss": 1.736, + "step": 50270 + }, + { + "epoch": 0.3160214483530729, + "grad_norm": 6.277158260345459, + "learning_rate": 1.7897747751523435e-05, + "loss": 1.794, + "step": 50280 + }, + { + "epoch": 0.31608430066977, + "grad_norm": 7.302791118621826, + "learning_rate": 1.789732865057878e-05, + "loss": 1.8666, + "step": 50290 + }, + { + "epoch": 0.3161471529864671, + "grad_norm": 6.053914546966553, + "learning_rate": 1.7896909549634126e-05, + "loss": 1.7688, + "step": 50300 + }, + { + "epoch": 0.31621000530316423, + "grad_norm": 7.230604648590088, + "learning_rate": 1.7896490448689473e-05, + "loss": 1.78, + "step": 50310 + }, + { + "epoch": 0.31627285761986135, + "grad_norm": 6.054196834564209, + "learning_rate": 1.789607134774482e-05, + "loss": 2.0002, + "step": 50320 + }, + { + "epoch": 0.31633570993655846, + "grad_norm": 7.253074645996094, + "learning_rate": 1.7895652246800167e-05, + "loss": 1.9383, + "step": 50330 + }, + { + "epoch": 0.3163985622532556, + "grad_norm": 6.574306964874268, + "learning_rate": 1.7895233145855514e-05, + "loss": 1.7574, + "step": 50340 + }, + { + "epoch": 0.31646141456995264, + "grad_norm": 6.2237420082092285, + "learning_rate": 1.7894814044910858e-05, + "loss": 1.8644, + "step": 50350 + }, + { + "epoch": 0.31652426688664975, + "grad_norm": 6.561565399169922, + "learning_rate": 1.7894394943966205e-05, + "loss": 2.0126, + "step": 50360 + }, + { + "epoch": 0.31658711920334687, + "grad_norm": 6.336477756500244, + "learning_rate": 1.7893975843021552e-05, + "loss": 1.9635, + "step": 50370 + }, + { + "epoch": 0.316649971520044, + 
"grad_norm": 8.280914306640625, + "learning_rate": 1.78935567420769e-05, + "loss": 1.8634, + "step": 50380 + }, + { + "epoch": 0.3167128238367411, + "grad_norm": 7.454405784606934, + "learning_rate": 1.7893137641132246e-05, + "loss": 1.9546, + "step": 50390 + }, + { + "epoch": 0.3167756761534382, + "grad_norm": 8.053701400756836, + "learning_rate": 1.789271854018759e-05, + "loss": 1.846, + "step": 50400 + }, + { + "epoch": 0.3168385284701353, + "grad_norm": 7.830573081970215, + "learning_rate": 1.7892299439242937e-05, + "loss": 1.9054, + "step": 50410 + }, + { + "epoch": 0.31690138078683244, + "grad_norm": 6.7762908935546875, + "learning_rate": 1.7891880338298284e-05, + "loss": 1.749, + "step": 50420 + }, + { + "epoch": 0.31696423310352956, + "grad_norm": 7.049383163452148, + "learning_rate": 1.789146123735363e-05, + "loss": 1.7843, + "step": 50430 + }, + { + "epoch": 0.3170270854202267, + "grad_norm": 6.608985424041748, + "learning_rate": 1.7891042136408974e-05, + "loss": 1.8268, + "step": 50440 + }, + { + "epoch": 0.3170899377369238, + "grad_norm": 6.4440388679504395, + "learning_rate": 1.789062303546432e-05, + "loss": 1.9668, + "step": 50450 + }, + { + "epoch": 0.3171527900536209, + "grad_norm": 6.358787536621094, + "learning_rate": 1.789020393451967e-05, + "loss": 1.7871, + "step": 50460 + }, + { + "epoch": 0.317215642370318, + "grad_norm": 6.855319976806641, + "learning_rate": 1.7889784833575016e-05, + "loss": 1.9543, + "step": 50470 + }, + { + "epoch": 0.3172784946870151, + "grad_norm": 6.997275352478027, + "learning_rate": 1.7889365732630363e-05, + "loss": 1.7011, + "step": 50480 + }, + { + "epoch": 0.3173413470037122, + "grad_norm": 7.220674991607666, + "learning_rate": 1.788894663168571e-05, + "loss": 1.6187, + "step": 50490 + }, + { + "epoch": 0.3174041993204093, + "grad_norm": 7.348036766052246, + "learning_rate": 1.7888527530741057e-05, + "loss": 1.9247, + "step": 50500 + }, + { + "epoch": 0.3174670516371064, + "grad_norm": 7.006748199462891, + 
"learning_rate": 1.7888108429796404e-05, + "loss": 1.8376, + "step": 50510 + }, + { + "epoch": 0.31752990395380354, + "grad_norm": 8.228358268737793, + "learning_rate": 1.788768932885175e-05, + "loss": 1.8103, + "step": 50520 + }, + { + "epoch": 0.31759275627050065, + "grad_norm": 7.424829006195068, + "learning_rate": 1.7887270227907095e-05, + "loss": 1.7843, + "step": 50530 + }, + { + "epoch": 0.31765560858719777, + "grad_norm": 6.651792526245117, + "learning_rate": 1.7886851126962442e-05, + "loss": 1.7476, + "step": 50540 + }, + { + "epoch": 0.3177184609038949, + "grad_norm": 6.952655792236328, + "learning_rate": 1.788643202601779e-05, + "loss": 1.8087, + "step": 50550 + }, + { + "epoch": 0.317781313220592, + "grad_norm": 7.491165637969971, + "learning_rate": 1.7886012925073136e-05, + "loss": 1.8656, + "step": 50560 + }, + { + "epoch": 0.3178441655372891, + "grad_norm": 6.939685344696045, + "learning_rate": 1.7885593824128483e-05, + "loss": 1.8724, + "step": 50570 + }, + { + "epoch": 0.31790701785398623, + "grad_norm": 6.545870780944824, + "learning_rate": 1.7885174723183827e-05, + "loss": 1.8012, + "step": 50580 + }, + { + "epoch": 0.31796987017068334, + "grad_norm": 5.817804336547852, + "learning_rate": 1.7884755622239174e-05, + "loss": 1.7552, + "step": 50590 + }, + { + "epoch": 0.31803272248738046, + "grad_norm": 6.606317043304443, + "learning_rate": 1.788433652129452e-05, + "loss": 1.7003, + "step": 50600 + }, + { + "epoch": 0.3180955748040775, + "grad_norm": 7.408612251281738, + "learning_rate": 1.7883917420349868e-05, + "loss": 1.842, + "step": 50610 + }, + { + "epoch": 0.31815842712077463, + "grad_norm": 7.27371883392334, + "learning_rate": 1.788349831940521e-05, + "loss": 1.8896, + "step": 50620 + }, + { + "epoch": 0.31822127943747175, + "grad_norm": 7.040416717529297, + "learning_rate": 1.788307921846056e-05, + "loss": 1.9259, + "step": 50630 + }, + { + "epoch": 0.31828413175416886, + "grad_norm": 7.271168231964111, + "learning_rate": 
1.7882660117515906e-05, + "loss": 1.7213, + "step": 50640 + }, + { + "epoch": 0.318346984070866, + "grad_norm": 6.543405532836914, + "learning_rate": 1.7882241016571253e-05, + "loss": 2.0821, + "step": 50650 + }, + { + "epoch": 0.3184098363875631, + "grad_norm": 7.510614395141602, + "learning_rate": 1.78818219156266e-05, + "loss": 1.8321, + "step": 50660 + }, + { + "epoch": 0.3184726887042602, + "grad_norm": 5.736481189727783, + "learning_rate": 1.7881402814681944e-05, + "loss": 1.779, + "step": 50670 + }, + { + "epoch": 0.3185355410209573, + "grad_norm": 7.5980048179626465, + "learning_rate": 1.788098371373729e-05, + "loss": 1.7945, + "step": 50680 + }, + { + "epoch": 0.31859839333765444, + "grad_norm": 6.607179164886475, + "learning_rate": 1.7880606522887105e-05, + "loss": 1.7979, + "step": 50690 + }, + { + "epoch": 0.31866124565435155, + "grad_norm": 8.14891242980957, + "learning_rate": 1.788018742194245e-05, + "loss": 1.833, + "step": 50700 + }, + { + "epoch": 0.31872409797104867, + "grad_norm": 7.936985015869141, + "learning_rate": 1.7879768320997796e-05, + "loss": 1.5912, + "step": 50710 + }, + { + "epoch": 0.3187869502877458, + "grad_norm": 7.009968280792236, + "learning_rate": 1.7879349220053143e-05, + "loss": 1.8891, + "step": 50720 + }, + { + "epoch": 0.3188498026044429, + "grad_norm": 5.5439863204956055, + "learning_rate": 1.787893011910849e-05, + "loss": 1.9336, + "step": 50730 + }, + { + "epoch": 0.31891265492113996, + "grad_norm": 5.964497089385986, + "learning_rate": 1.7878511018163834e-05, + "loss": 1.9025, + "step": 50740 + }, + { + "epoch": 0.3189755072378371, + "grad_norm": 7.623325824737549, + "learning_rate": 1.787809191721918e-05, + "loss": 1.8252, + "step": 50750 + }, + { + "epoch": 0.3190383595545342, + "grad_norm": 7.806591987609863, + "learning_rate": 1.7877672816274528e-05, + "loss": 1.9262, + "step": 50760 + }, + { + "epoch": 0.3191012118712313, + "grad_norm": 7.820315361022949, + "learning_rate": 1.7877253715329875e-05, + "loss": 
1.9376, + "step": 50770 + }, + { + "epoch": 0.3191640641879284, + "grad_norm": 7.506632328033447, + "learning_rate": 1.7876834614385222e-05, + "loss": 1.8455, + "step": 50780 + }, + { + "epoch": 0.31922691650462554, + "grad_norm": 7.440194129943848, + "learning_rate": 1.787641551344057e-05, + "loss": 2.0055, + "step": 50790 + }, + { + "epoch": 0.31928976882132265, + "grad_norm": 7.476811408996582, + "learning_rate": 1.7875996412495916e-05, + "loss": 1.8239, + "step": 50800 + }, + { + "epoch": 0.31935262113801977, + "grad_norm": 6.638411998748779, + "learning_rate": 1.7875577311551263e-05, + "loss": 1.6158, + "step": 50810 + }, + { + "epoch": 0.3194154734547169, + "grad_norm": 7.170370578765869, + "learning_rate": 1.787515821060661e-05, + "loss": 1.7892, + "step": 50820 + }, + { + "epoch": 0.319478325771414, + "grad_norm": 6.0496931076049805, + "learning_rate": 1.7874739109661954e-05, + "loss": 1.8532, + "step": 50830 + }, + { + "epoch": 0.3195411780881111, + "grad_norm": 7.209271430969238, + "learning_rate": 1.78743200087173e-05, + "loss": 2.1903, + "step": 50840 + }, + { + "epoch": 0.3196040304048082, + "grad_norm": 6.790227890014648, + "learning_rate": 1.7873900907772648e-05, + "loss": 1.9094, + "step": 50850 + }, + { + "epoch": 0.3196668827215053, + "grad_norm": 6.356524467468262, + "learning_rate": 1.7873481806827995e-05, + "loss": 1.8637, + "step": 50860 + }, + { + "epoch": 0.3197297350382024, + "grad_norm": 6.975244045257568, + "learning_rate": 1.787306270588334e-05, + "loss": 1.8636, + "step": 50870 + }, + { + "epoch": 0.3197925873548995, + "grad_norm": 6.814340114593506, + "learning_rate": 1.7872643604938686e-05, + "loss": 1.7427, + "step": 50880 + }, + { + "epoch": 0.31985543967159663, + "grad_norm": 7.969485759735107, + "learning_rate": 1.7872224503994033e-05, + "loss": 1.7993, + "step": 50890 + }, + { + "epoch": 0.31991829198829375, + "grad_norm": 7.632097244262695, + "learning_rate": 1.787180540304938e-05, + "loss": 1.7034, + "step": 50900 + }, + { + 
"epoch": 0.31998114430499086, + "grad_norm": 7.777673721313477, + "learning_rate": 1.7871386302104727e-05, + "loss": 1.7781, + "step": 50910 + }, + { + "epoch": 0.320043996621688, + "grad_norm": 6.5799384117126465, + "learning_rate": 1.787096720116007e-05, + "loss": 1.8563, + "step": 50920 + }, + { + "epoch": 0.3201068489383851, + "grad_norm": 7.1976447105407715, + "learning_rate": 1.7870548100215418e-05, + "loss": 1.943, + "step": 50930 + }, + { + "epoch": 0.3201697012550822, + "grad_norm": 6.877608299255371, + "learning_rate": 1.7870128999270765e-05, + "loss": 1.9797, + "step": 50940 + }, + { + "epoch": 0.3202325535717793, + "grad_norm": 6.036958694458008, + "learning_rate": 1.7869709898326112e-05, + "loss": 1.6093, + "step": 50950 + }, + { + "epoch": 0.32029540588847644, + "grad_norm": 5.940581321716309, + "learning_rate": 1.786929079738146e-05, + "loss": 1.6609, + "step": 50960 + }, + { + "epoch": 0.32035825820517355, + "grad_norm": 7.371578216552734, + "learning_rate": 1.7868871696436806e-05, + "loss": 2.0982, + "step": 50970 + }, + { + "epoch": 0.32042111052187067, + "grad_norm": 6.936854362487793, + "learning_rate": 1.786845259549215e-05, + "loss": 1.7427, + "step": 50980 + }, + { + "epoch": 0.3204839628385677, + "grad_norm": 5.890512943267822, + "learning_rate": 1.7868033494547497e-05, + "loss": 1.8179, + "step": 50990 + }, + { + "epoch": 0.32054681515526484, + "grad_norm": 7.359219074249268, + "learning_rate": 1.7867614393602844e-05, + "loss": 1.9621, + "step": 51000 + }, + { + "epoch": 0.32060966747196196, + "grad_norm": 8.057191848754883, + "learning_rate": 1.786719529265819e-05, + "loss": 1.8713, + "step": 51010 + }, + { + "epoch": 0.32067251978865907, + "grad_norm": 6.030718803405762, + "learning_rate": 1.786677619171354e-05, + "loss": 1.8131, + "step": 51020 + }, + { + "epoch": 0.3207353721053562, + "grad_norm": 5.996182441711426, + "learning_rate": 1.7866357090768885e-05, + "loss": 1.8038, + "step": 51030 + }, + { + "epoch": 0.3207982244220533, + 
"grad_norm": 6.6590046882629395, + "learning_rate": 1.7865937989824232e-05, + "loss": 1.6819, + "step": 51040 + }, + { + "epoch": 0.3208610767387504, + "grad_norm": 6.439087867736816, + "learning_rate": 1.7865518888879576e-05, + "loss": 1.5105, + "step": 51050 + }, + { + "epoch": 0.32092392905544753, + "grad_norm": 7.181485652923584, + "learning_rate": 1.7865099787934923e-05, + "loss": 1.6931, + "step": 51060 + }, + { + "epoch": 0.32098678137214465, + "grad_norm": 6.688985347747803, + "learning_rate": 1.786468068699027e-05, + "loss": 1.7462, + "step": 51070 + }, + { + "epoch": 0.32104963368884176, + "grad_norm": 7.273684501647949, + "learning_rate": 1.7864261586045617e-05, + "loss": 1.6931, + "step": 51080 + }, + { + "epoch": 0.3211124860055389, + "grad_norm": 6.832235813140869, + "learning_rate": 1.7863842485100964e-05, + "loss": 1.8136, + "step": 51090 + }, + { + "epoch": 0.321175338322236, + "grad_norm": 7.846092224121094, + "learning_rate": 1.7863423384156308e-05, + "loss": 2.0732, + "step": 51100 + }, + { + "epoch": 0.3212381906389331, + "grad_norm": 8.31198787689209, + "learning_rate": 1.7863004283211655e-05, + "loss": 1.8665, + "step": 51110 + }, + { + "epoch": 0.32130104295563017, + "grad_norm": 6.93269157409668, + "learning_rate": 1.7862585182267002e-05, + "loss": 1.6924, + "step": 51120 + }, + { + "epoch": 0.3213638952723273, + "grad_norm": 5.975748538970947, + "learning_rate": 1.786216608132235e-05, + "loss": 1.871, + "step": 51130 + }, + { + "epoch": 0.3214267475890244, + "grad_norm": 7.039586067199707, + "learning_rate": 1.7861746980377693e-05, + "loss": 1.6259, + "step": 51140 + }, + { + "epoch": 0.3214895999057215, + "grad_norm": 7.075222969055176, + "learning_rate": 1.786132787943304e-05, + "loss": 1.7098, + "step": 51150 + }, + { + "epoch": 0.3215524522224186, + "grad_norm": 7.806368350982666, + "learning_rate": 1.7860908778488387e-05, + "loss": 1.7068, + "step": 51160 + }, + { + "epoch": 0.32161530453911574, + "grad_norm": 7.327364921569824, + 
"learning_rate": 1.7860489677543734e-05, + "loss": 1.6505, + "step": 51170 + }, + { + "epoch": 0.32167815685581286, + "grad_norm": 7.286188125610352, + "learning_rate": 1.786007057659908e-05, + "loss": 1.9491, + "step": 51180 + }, + { + "epoch": 0.32174100917251, + "grad_norm": 6.699967384338379, + "learning_rate": 1.785965147565443e-05, + "loss": 1.7551, + "step": 51190 + }, + { + "epoch": 0.3218038614892071, + "grad_norm": 7.496636867523193, + "learning_rate": 1.7859232374709775e-05, + "loss": 1.6995, + "step": 51200 + }, + { + "epoch": 0.3218667138059042, + "grad_norm": 6.468504428863525, + "learning_rate": 1.7858813273765123e-05, + "loss": 1.9599, + "step": 51210 + }, + { + "epoch": 0.3219295661226013, + "grad_norm": 6.733185768127441, + "learning_rate": 1.785839417282047e-05, + "loss": 1.6771, + "step": 51220 + }, + { + "epoch": 0.32199241843929843, + "grad_norm": 7.011502265930176, + "learning_rate": 1.7857975071875813e-05, + "loss": 1.6548, + "step": 51230 + }, + { + "epoch": 0.32205527075599555, + "grad_norm": 8.619169235229492, + "learning_rate": 1.785755597093116e-05, + "loss": 2.0093, + "step": 51240 + }, + { + "epoch": 0.3221181230726926, + "grad_norm": 7.195321083068848, + "learning_rate": 1.7857136869986507e-05, + "loss": 1.6438, + "step": 51250 + }, + { + "epoch": 0.3221809753893897, + "grad_norm": 6.684089660644531, + "learning_rate": 1.7856717769041854e-05, + "loss": 1.756, + "step": 51260 + }, + { + "epoch": 0.32224382770608684, + "grad_norm": 6.9751434326171875, + "learning_rate": 1.7856298668097198e-05, + "loss": 2.0399, + "step": 51270 + }, + { + "epoch": 0.32230668002278395, + "grad_norm": 6.216689109802246, + "learning_rate": 1.7855879567152545e-05, + "loss": 1.8779, + "step": 51280 + }, + { + "epoch": 0.32236953233948107, + "grad_norm": 7.417315483093262, + "learning_rate": 1.7855460466207892e-05, + "loss": 1.8112, + "step": 51290 + }, + { + "epoch": 0.3224323846561782, + "grad_norm": 7.832452774047852, + "learning_rate": 
1.785504136526324e-05, + "loss": 2.0227, + "step": 51300 + }, + { + "epoch": 0.3224952369728753, + "grad_norm": 7.068687438964844, + "learning_rate": 1.7854622264318586e-05, + "loss": 1.7686, + "step": 51310 + }, + { + "epoch": 0.3225580892895724, + "grad_norm": 7.699952602386475, + "learning_rate": 1.785420316337393e-05, + "loss": 1.7498, + "step": 51320 + }, + { + "epoch": 0.32262094160626953, + "grad_norm": 6.384725570678711, + "learning_rate": 1.7853784062429277e-05, + "loss": 1.6738, + "step": 51330 + }, + { + "epoch": 0.32268379392296664, + "grad_norm": 6.444098949432373, + "learning_rate": 1.7853364961484624e-05, + "loss": 1.8666, + "step": 51340 + }, + { + "epoch": 0.32274664623966376, + "grad_norm": 6.82034969329834, + "learning_rate": 1.785294586053997e-05, + "loss": 1.6276, + "step": 51350 + }, + { + "epoch": 0.3228094985563609, + "grad_norm": 6.927557945251465, + "learning_rate": 1.7852526759595315e-05, + "loss": 1.8343, + "step": 51360 + }, + { + "epoch": 0.32287235087305793, + "grad_norm": 6.251853942871094, + "learning_rate": 1.7852107658650662e-05, + "loss": 1.819, + "step": 51370 + }, + { + "epoch": 0.32293520318975505, + "grad_norm": 5.410449028015137, + "learning_rate": 1.785168855770601e-05, + "loss": 1.6984, + "step": 51380 + }, + { + "epoch": 0.32299805550645216, + "grad_norm": 6.403854846954346, + "learning_rate": 1.7851269456761356e-05, + "loss": 2.1146, + "step": 51390 + }, + { + "epoch": 0.3230609078231493, + "grad_norm": 6.982732772827148, + "learning_rate": 1.7850850355816703e-05, + "loss": 1.8227, + "step": 51400 + }, + { + "epoch": 0.3231237601398464, + "grad_norm": 7.60330057144165, + "learning_rate": 1.785043125487205e-05, + "loss": 1.5598, + "step": 51410 + }, + { + "epoch": 0.3231866124565435, + "grad_norm": 6.407942771911621, + "learning_rate": 1.7850012153927397e-05, + "loss": 1.741, + "step": 51420 + }, + { + "epoch": 0.3232494647732406, + "grad_norm": 6.775578498840332, + "learning_rate": 1.7849593052982745e-05, + "loss": 
1.7673, + "step": 51430 + }, + { + "epoch": 0.32331231708993774, + "grad_norm": 6.695420265197754, + "learning_rate": 1.784917395203809e-05, + "loss": 1.9047, + "step": 51440 + }, + { + "epoch": 0.32337516940663485, + "grad_norm": 6.92299222946167, + "learning_rate": 1.7848754851093435e-05, + "loss": 1.7528, + "step": 51450 + }, + { + "epoch": 0.32343802172333197, + "grad_norm": 6.831806182861328, + "learning_rate": 1.7848335750148782e-05, + "loss": 1.835, + "step": 51460 + }, + { + "epoch": 0.3235008740400291, + "grad_norm": 7.060969829559326, + "learning_rate": 1.784791664920413e-05, + "loss": 1.7881, + "step": 51470 + }, + { + "epoch": 0.3235637263567262, + "grad_norm": 6.520586013793945, + "learning_rate": 1.7847497548259476e-05, + "loss": 1.8229, + "step": 51480 + }, + { + "epoch": 0.3236265786734233, + "grad_norm": 7.508065700531006, + "learning_rate": 1.784707844731482e-05, + "loss": 1.8881, + "step": 51490 + }, + { + "epoch": 0.3236894309901204, + "grad_norm": 7.780220031738281, + "learning_rate": 1.7846659346370167e-05, + "loss": 1.9137, + "step": 51500 + }, + { + "epoch": 0.3237522833068175, + "grad_norm": 6.283895015716553, + "learning_rate": 1.7846240245425514e-05, + "loss": 1.8163, + "step": 51510 + }, + { + "epoch": 0.3238151356235146, + "grad_norm": 7.579999923706055, + "learning_rate": 1.784582114448086e-05, + "loss": 1.7848, + "step": 51520 + }, + { + "epoch": 0.3238779879402117, + "grad_norm": 6.87309455871582, + "learning_rate": 1.784540204353621e-05, + "loss": 1.7596, + "step": 51530 + }, + { + "epoch": 0.32394084025690884, + "grad_norm": 8.016010284423828, + "learning_rate": 1.7844982942591552e-05, + "loss": 1.8643, + "step": 51540 + }, + { + "epoch": 0.32400369257360595, + "grad_norm": 6.44862699508667, + "learning_rate": 1.78445638416469e-05, + "loss": 1.7422, + "step": 51550 + }, + { + "epoch": 0.32406654489030307, + "grad_norm": 7.54352331161499, + "learning_rate": 1.7844144740702246e-05, + "loss": 1.8573, + "step": 51560 + }, + { + 
"epoch": 0.3241293972070002, + "grad_norm": 7.547618389129639, + "learning_rate": 1.7843725639757593e-05, + "loss": 1.6722, + "step": 51570 + }, + { + "epoch": 0.3241922495236973, + "grad_norm": 5.8921613693237305, + "learning_rate": 1.784330653881294e-05, + "loss": 1.9092, + "step": 51580 + }, + { + "epoch": 0.3242551018403944, + "grad_norm": 6.372741222381592, + "learning_rate": 1.7842887437868287e-05, + "loss": 1.8572, + "step": 51590 + }, + { + "epoch": 0.3243179541570915, + "grad_norm": 7.012420654296875, + "learning_rate": 1.7842468336923635e-05, + "loss": 1.8991, + "step": 51600 + }, + { + "epoch": 0.32438080647378864, + "grad_norm": 7.101135730743408, + "learning_rate": 1.7842049235978978e-05, + "loss": 1.8706, + "step": 51610 + }, + { + "epoch": 0.32444365879048576, + "grad_norm": 6.085858345031738, + "learning_rate": 1.7841630135034325e-05, + "loss": 1.577, + "step": 51620 + }, + { + "epoch": 0.3245065111071828, + "grad_norm": 6.90290641784668, + "learning_rate": 1.7841211034089672e-05, + "loss": 1.915, + "step": 51630 + }, + { + "epoch": 0.32456936342387993, + "grad_norm": 6.020164966583252, + "learning_rate": 1.784079193314502e-05, + "loss": 1.887, + "step": 51640 + }, + { + "epoch": 0.32463221574057705, + "grad_norm": 7.010438442230225, + "learning_rate": 1.7840372832200367e-05, + "loss": 1.9698, + "step": 51650 + }, + { + "epoch": 0.32469506805727416, + "grad_norm": 7.063260555267334, + "learning_rate": 1.7839953731255714e-05, + "loss": 1.5653, + "step": 51660 + }, + { + "epoch": 0.3247579203739713, + "grad_norm": 7.6758952140808105, + "learning_rate": 1.7839534630311057e-05, + "loss": 1.7361, + "step": 51670 + }, + { + "epoch": 0.3248207726906684, + "grad_norm": 6.925780773162842, + "learning_rate": 1.7839115529366404e-05, + "loss": 1.911, + "step": 51680 + }, + { + "epoch": 0.3248836250073655, + "grad_norm": 7.4563751220703125, + "learning_rate": 1.783869642842175e-05, + "loss": 1.8287, + "step": 51690 + }, + { + "epoch": 0.3249464773240626, + 
"grad_norm": 7.764344692230225, + "learning_rate": 1.78382773274771e-05, + "loss": 1.9369, + "step": 51700 + }, + { + "epoch": 0.32500932964075974, + "grad_norm": 7.151037216186523, + "learning_rate": 1.7837858226532446e-05, + "loss": 1.8742, + "step": 51710 + }, + { + "epoch": 0.32507218195745685, + "grad_norm": 6.498341083526611, + "learning_rate": 1.783743912558779e-05, + "loss": 1.9373, + "step": 51720 + }, + { + "epoch": 0.32513503427415397, + "grad_norm": 6.145724296569824, + "learning_rate": 1.7837020024643136e-05, + "loss": 1.7477, + "step": 51730 + }, + { + "epoch": 0.3251978865908511, + "grad_norm": 9.115324020385742, + "learning_rate": 1.7836600923698483e-05, + "loss": 1.7087, + "step": 51740 + }, + { + "epoch": 0.3252607389075482, + "grad_norm": 5.0653486251831055, + "learning_rate": 1.783618182275383e-05, + "loss": 1.7502, + "step": 51750 + }, + { + "epoch": 0.32532359122424526, + "grad_norm": 6.756585597991943, + "learning_rate": 1.7835762721809174e-05, + "loss": 1.7235, + "step": 51760 + }, + { + "epoch": 0.32538644354094237, + "grad_norm": 7.557977199554443, + "learning_rate": 1.783534362086452e-05, + "loss": 1.8098, + "step": 51770 + }, + { + "epoch": 0.3254492958576395, + "grad_norm": 6.938634872436523, + "learning_rate": 1.7834924519919868e-05, + "loss": 2.329, + "step": 51780 + }, + { + "epoch": 0.3255121481743366, + "grad_norm": 7.438292026519775, + "learning_rate": 1.7834505418975215e-05, + "loss": 1.9686, + "step": 51790 + }, + { + "epoch": 0.3255750004910337, + "grad_norm": 6.877195358276367, + "learning_rate": 1.7834086318030562e-05, + "loss": 1.7302, + "step": 51800 + }, + { + "epoch": 0.32563785280773083, + "grad_norm": 7.628830909729004, + "learning_rate": 1.783366721708591e-05, + "loss": 1.9505, + "step": 51810 + }, + { + "epoch": 0.32570070512442795, + "grad_norm": 7.419105529785156, + "learning_rate": 1.7833248116141257e-05, + "loss": 1.9519, + "step": 51820 + }, + { + "epoch": 0.32576355744112506, + "grad_norm": 7.3178019523620605, + 
"learning_rate": 1.7832829015196604e-05, + "loss": 1.7879, + "step": 51830 + }, + { + "epoch": 0.3258264097578222, + "grad_norm": 6.704282760620117, + "learning_rate": 1.783240991425195e-05, + "loss": 1.9448, + "step": 51840 + }, + { + "epoch": 0.3258892620745193, + "grad_norm": 7.331361770629883, + "learning_rate": 1.7831990813307294e-05, + "loss": 1.6215, + "step": 51850 + }, + { + "epoch": 0.3259521143912164, + "grad_norm": 7.451310157775879, + "learning_rate": 1.783157171236264e-05, + "loss": 1.7762, + "step": 51860 + }, + { + "epoch": 0.3260149667079135, + "grad_norm": 8.309782981872559, + "learning_rate": 1.783115261141799e-05, + "loss": 1.9013, + "step": 51870 + }, + { + "epoch": 0.32607781902461064, + "grad_norm": 7.002206325531006, + "learning_rate": 1.7830733510473336e-05, + "loss": 1.8646, + "step": 51880 + }, + { + "epoch": 0.3261406713413077, + "grad_norm": 8.223302841186523, + "learning_rate": 1.783031440952868e-05, + "loss": 1.7973, + "step": 51890 + }, + { + "epoch": 0.3262035236580048, + "grad_norm": 8.51438045501709, + "learning_rate": 1.7829895308584026e-05, + "loss": 1.9426, + "step": 51900 + }, + { + "epoch": 0.3262663759747019, + "grad_norm": 7.0320563316345215, + "learning_rate": 1.7829476207639373e-05, + "loss": 1.8232, + "step": 51910 + }, + { + "epoch": 0.32632922829139904, + "grad_norm": 8.625299453735352, + "learning_rate": 1.782905710669472e-05, + "loss": 1.8611, + "step": 51920 + }, + { + "epoch": 0.32639208060809616, + "grad_norm": 6.61185359954834, + "learning_rate": 1.7828638005750068e-05, + "loss": 1.6157, + "step": 51930 + }, + { + "epoch": 0.3264549329247933, + "grad_norm": 6.66688871383667, + "learning_rate": 1.782821890480541e-05, + "loss": 1.7748, + "step": 51940 + }, + { + "epoch": 0.3265177852414904, + "grad_norm": 7.54803991317749, + "learning_rate": 1.782779980386076e-05, + "loss": 1.8101, + "step": 51950 + }, + { + "epoch": 0.3265806375581875, + "grad_norm": 7.581246852874756, + "learning_rate": 1.7827380702916105e-05, + 
"loss": 1.7941, + "step": 51960 + }, + { + "epoch": 0.3266434898748846, + "grad_norm": 7.599559783935547, + "learning_rate": 1.7826961601971452e-05, + "loss": 1.763, + "step": 51970 + }, + { + "epoch": 0.32670634219158173, + "grad_norm": 6.512584686279297, + "learning_rate": 1.78265425010268e-05, + "loss": 1.7339, + "step": 51980 + }, + { + "epoch": 0.32676919450827885, + "grad_norm": 6.6277055740356445, + "learning_rate": 1.7826123400082143e-05, + "loss": 1.7485, + "step": 51990 + }, + { + "epoch": 0.32683204682497596, + "grad_norm": 8.265957832336426, + "learning_rate": 1.782570429913749e-05, + "loss": 1.5965, + "step": 52000 + }, + { + "epoch": 0.326894899141673, + "grad_norm": 8.15899658203125, + "learning_rate": 1.7825285198192837e-05, + "loss": 1.8245, + "step": 52010 + }, + { + "epoch": 0.32695775145837014, + "grad_norm": 6.8389081954956055, + "learning_rate": 1.7824866097248184e-05, + "loss": 1.6119, + "step": 52020 + }, + { + "epoch": 0.32702060377506725, + "grad_norm": 7.083780765533447, + "learning_rate": 1.782444699630353e-05, + "loss": 1.774, + "step": 52030 + }, + { + "epoch": 0.32708345609176437, + "grad_norm": 7.419913291931152, + "learning_rate": 1.782402789535888e-05, + "loss": 1.6966, + "step": 52040 + }, + { + "epoch": 0.3271463084084615, + "grad_norm": 5.5379462242126465, + "learning_rate": 1.7823608794414226e-05, + "loss": 2.0746, + "step": 52050 + }, + { + "epoch": 0.3272091607251586, + "grad_norm": 7.2828192710876465, + "learning_rate": 1.7823189693469573e-05, + "loss": 1.8912, + "step": 52060 + }, + { + "epoch": 0.3272720130418557, + "grad_norm": 8.095076560974121, + "learning_rate": 1.7822770592524916e-05, + "loss": 2.0495, + "step": 52070 + }, + { + "epoch": 0.32733486535855283, + "grad_norm": 7.0146484375, + "learning_rate": 1.7822351491580263e-05, + "loss": 1.7892, + "step": 52080 + }, + { + "epoch": 0.32739771767524994, + "grad_norm": 7.499518871307373, + "learning_rate": 1.782193239063561e-05, + "loss": 1.6955, + "step": 52090 + }, + 
{ + "epoch": 0.32746056999194706, + "grad_norm": 6.1241774559021, + "learning_rate": 1.7821513289690958e-05, + "loss": 1.8733, + "step": 52100 + }, + { + "epoch": 0.3275234223086442, + "grad_norm": 6.723125457763672, + "learning_rate": 1.78210941887463e-05, + "loss": 1.6152, + "step": 52110 + }, + { + "epoch": 0.3275862746253413, + "grad_norm": 6.300307273864746, + "learning_rate": 1.782067508780165e-05, + "loss": 1.7901, + "step": 52120 + }, + { + "epoch": 0.3276491269420384, + "grad_norm": 6.7384748458862305, + "learning_rate": 1.7820255986856995e-05, + "loss": 1.8631, + "step": 52130 + }, + { + "epoch": 0.32771197925873546, + "grad_norm": 7.527587890625, + "learning_rate": 1.7819836885912342e-05, + "loss": 1.7774, + "step": 52140 + }, + { + "epoch": 0.3277748315754326, + "grad_norm": 7.368362903594971, + "learning_rate": 1.781941778496769e-05, + "loss": 2.0007, + "step": 52150 + }, + { + "epoch": 0.3278376838921297, + "grad_norm": 8.34139347076416, + "learning_rate": 1.7818998684023033e-05, + "loss": 1.7857, + "step": 52160 + }, + { + "epoch": 0.3279005362088268, + "grad_norm": 7.319367408752441, + "learning_rate": 1.781857958307838e-05, + "loss": 1.8501, + "step": 52170 + }, + { + "epoch": 0.3279633885255239, + "grad_norm": 7.252373218536377, + "learning_rate": 1.7818160482133727e-05, + "loss": 1.8423, + "step": 52180 + }, + { + "epoch": 0.32802624084222104, + "grad_norm": 7.950265884399414, + "learning_rate": 1.7817741381189074e-05, + "loss": 1.7884, + "step": 52190 + }, + { + "epoch": 0.32808909315891815, + "grad_norm": 6.485204219818115, + "learning_rate": 1.781732228024442e-05, + "loss": 1.7036, + "step": 52200 + }, + { + "epoch": 0.32815194547561527, + "grad_norm": 7.385952949523926, + "learning_rate": 1.781690317929977e-05, + "loss": 1.7588, + "step": 52210 + }, + { + "epoch": 0.3282147977923124, + "grad_norm": 6.963294506072998, + "learning_rate": 1.7816484078355116e-05, + "loss": 1.6952, + "step": 52220 + }, + { + "epoch": 0.3282776501090095, + 
"grad_norm": 8.078579902648926, + "learning_rate": 1.781606497741046e-05, + "loss": 1.8461, + "step": 52230 + }, + { + "epoch": 0.3283405024257066, + "grad_norm": 7.735701084136963, + "learning_rate": 1.7815645876465806e-05, + "loss": 1.9047, + "step": 52240 + }, + { + "epoch": 0.32840335474240373, + "grad_norm": 6.707212924957275, + "learning_rate": 1.7815226775521153e-05, + "loss": 2.0122, + "step": 52250 + }, + { + "epoch": 0.32846620705910085, + "grad_norm": 8.503338813781738, + "learning_rate": 1.78148076745765e-05, + "loss": 1.7775, + "step": 52260 + }, + { + "epoch": 0.3285290593757979, + "grad_norm": 7.937143325805664, + "learning_rate": 1.7814388573631848e-05, + "loss": 1.9494, + "step": 52270 + }, + { + "epoch": 0.328591911692495, + "grad_norm": 8.503870964050293, + "learning_rate": 1.7813969472687195e-05, + "loss": 1.8309, + "step": 52280 + }, + { + "epoch": 0.32865476400919214, + "grad_norm": 6.416479587554932, + "learning_rate": 1.781355037174254e-05, + "loss": 1.6226, + "step": 52290 + }, + { + "epoch": 0.32871761632588925, + "grad_norm": 7.439314365386963, + "learning_rate": 1.7813131270797885e-05, + "loss": 1.6882, + "step": 52300 + }, + { + "epoch": 0.32878046864258637, + "grad_norm": 6.373034477233887, + "learning_rate": 1.7812712169853233e-05, + "loss": 1.7671, + "step": 52310 + }, + { + "epoch": 0.3288433209592835, + "grad_norm": 6.135987281799316, + "learning_rate": 1.781229306890858e-05, + "loss": 2.0315, + "step": 52320 + }, + { + "epoch": 0.3289061732759806, + "grad_norm": 8.025102615356445, + "learning_rate": 1.7811873967963927e-05, + "loss": 1.703, + "step": 52330 + }, + { + "epoch": 0.3289690255926777, + "grad_norm": 8.254257202148438, + "learning_rate": 1.781145486701927e-05, + "loss": 1.8818, + "step": 52340 + }, + { + "epoch": 0.3290318779093748, + "grad_norm": 6.1511688232421875, + "learning_rate": 1.7811035766074617e-05, + "loss": 1.8207, + "step": 52350 + }, + { + "epoch": 0.32909473022607194, + "grad_norm": 6.602275371551514, + 
"learning_rate": 1.7810616665129964e-05, + "loss": 1.8641, + "step": 52360 + }, + { + "epoch": 0.32915758254276906, + "grad_norm": 7.317796230316162, + "learning_rate": 1.781019756418531e-05, + "loss": 1.7598, + "step": 52370 + }, + { + "epoch": 0.32922043485946617, + "grad_norm": 6.873623847961426, + "learning_rate": 1.7809778463240655e-05, + "loss": 1.9122, + "step": 52380 + }, + { + "epoch": 0.3292832871761633, + "grad_norm": 7.018415451049805, + "learning_rate": 1.7809359362296002e-05, + "loss": 1.8689, + "step": 52390 + }, + { + "epoch": 0.32934613949286035, + "grad_norm": 8.09970760345459, + "learning_rate": 1.780894026135135e-05, + "loss": 1.7076, + "step": 52400 + }, + { + "epoch": 0.32940899180955746, + "grad_norm": 7.095788955688477, + "learning_rate": 1.7808521160406696e-05, + "loss": 1.8184, + "step": 52410 + }, + { + "epoch": 0.3294718441262546, + "grad_norm": 6.006564617156982, + "learning_rate": 1.7808102059462044e-05, + "loss": 1.798, + "step": 52420 + }, + { + "epoch": 0.3295346964429517, + "grad_norm": 7.368823528289795, + "learning_rate": 1.780768295851739e-05, + "loss": 2.0833, + "step": 52430 + }, + { + "epoch": 0.3295975487596488, + "grad_norm": 6.977514266967773, + "learning_rate": 1.7807263857572738e-05, + "loss": 1.7669, + "step": 52440 + }, + { + "epoch": 0.3296604010763459, + "grad_norm": 8.111364364624023, + "learning_rate": 1.7806844756628085e-05, + "loss": 1.7443, + "step": 52450 + }, + { + "epoch": 0.32972325339304304, + "grad_norm": 7.839640140533447, + "learning_rate": 1.7806425655683432e-05, + "loss": 1.7325, + "step": 52460 + }, + { + "epoch": 0.32978610570974015, + "grad_norm": 5.942218780517578, + "learning_rate": 1.7806006554738775e-05, + "loss": 1.6901, + "step": 52470 + }, + { + "epoch": 0.32984895802643727, + "grad_norm": 7.037097930908203, + "learning_rate": 1.7805587453794123e-05, + "loss": 2.0469, + "step": 52480 + }, + { + "epoch": 0.3299118103431344, + "grad_norm": 7.099818229675293, + "learning_rate": 
1.780516835284947e-05, + "loss": 1.8316, + "step": 52490 + }, + { + "epoch": 0.3299746626598315, + "grad_norm": 8.093073844909668, + "learning_rate": 1.7804749251904817e-05, + "loss": 1.8714, + "step": 52500 + }, + { + "epoch": 0.3300375149765286, + "grad_norm": 6.274770736694336, + "learning_rate": 1.780433015096016e-05, + "loss": 1.8357, + "step": 52510 + }, + { + "epoch": 0.33010036729322567, + "grad_norm": 5.988602161407471, + "learning_rate": 1.7803911050015507e-05, + "loss": 1.6467, + "step": 52520 + }, + { + "epoch": 0.3301632196099228, + "grad_norm": 6.645780563354492, + "learning_rate": 1.7803491949070855e-05, + "loss": 1.7149, + "step": 52530 + }, + { + "epoch": 0.3302260719266199, + "grad_norm": 6.206699848175049, + "learning_rate": 1.78030728481262e-05, + "loss": 1.7399, + "step": 52540 + }, + { + "epoch": 0.330288924243317, + "grad_norm": 6.97754430770874, + "learning_rate": 1.780265374718155e-05, + "loss": 2.0122, + "step": 52550 + }, + { + "epoch": 0.33035177656001413, + "grad_norm": 7.309817790985107, + "learning_rate": 1.7802234646236892e-05, + "loss": 1.5856, + "step": 52560 + }, + { + "epoch": 0.33041462887671125, + "grad_norm": 6.189268589019775, + "learning_rate": 1.780181554529224e-05, + "loss": 1.9549, + "step": 52570 + }, + { + "epoch": 0.33047748119340836, + "grad_norm": 7.027437210083008, + "learning_rate": 1.7801396444347586e-05, + "loss": 1.5906, + "step": 52580 + }, + { + "epoch": 0.3305403335101055, + "grad_norm": 7.115314960479736, + "learning_rate": 1.7800977343402934e-05, + "loss": 1.7271, + "step": 52590 + }, + { + "epoch": 0.3306031858268026, + "grad_norm": 7.591831684112549, + "learning_rate": 1.780055824245828e-05, + "loss": 2.0359, + "step": 52600 + }, + { + "epoch": 0.3306660381434997, + "grad_norm": 7.378714561462402, + "learning_rate": 1.7800139141513624e-05, + "loss": 1.735, + "step": 52610 + }, + { + "epoch": 0.3307288904601968, + "grad_norm": 6.624408721923828, + "learning_rate": 1.779972004056897e-05, + "loss": 1.8707, + 
"step": 52620 + }, + { + "epoch": 0.33079174277689394, + "grad_norm": 7.652662754058838, + "learning_rate": 1.779930093962432e-05, + "loss": 1.682, + "step": 52630 + }, + { + "epoch": 0.33085459509359105, + "grad_norm": 8.113582611083984, + "learning_rate": 1.7798881838679666e-05, + "loss": 1.8986, + "step": 52640 + }, + { + "epoch": 0.3309174474102881, + "grad_norm": 5.646933555603027, + "learning_rate": 1.7798462737735013e-05, + "loss": 1.6427, + "step": 52650 + }, + { + "epoch": 0.3309802997269852, + "grad_norm": 7.363295555114746, + "learning_rate": 1.779804363679036e-05, + "loss": 1.763, + "step": 52660 + }, + { + "epoch": 0.33104315204368234, + "grad_norm": 6.794223785400391, + "learning_rate": 1.7797624535845707e-05, + "loss": 1.8697, + "step": 52670 + }, + { + "epoch": 0.33110600436037946, + "grad_norm": 7.367708206176758, + "learning_rate": 1.7797205434901054e-05, + "loss": 1.8522, + "step": 52680 + }, + { + "epoch": 0.3311688566770766, + "grad_norm": 7.230896949768066, + "learning_rate": 1.7796786333956397e-05, + "loss": 1.8344, + "step": 52690 + }, + { + "epoch": 0.3312317089937737, + "grad_norm": 6.658416748046875, + "learning_rate": 1.7796367233011745e-05, + "loss": 1.7248, + "step": 52700 + }, + { + "epoch": 0.3312945613104708, + "grad_norm": 6.473068714141846, + "learning_rate": 1.779594813206709e-05, + "loss": 1.8158, + "step": 52710 + }, + { + "epoch": 0.3313574136271679, + "grad_norm": 5.824365615844727, + "learning_rate": 1.779552903112244e-05, + "loss": 1.8317, + "step": 52720 + }, + { + "epoch": 0.33142026594386503, + "grad_norm": 7.017134189605713, + "learning_rate": 1.7795109930177782e-05, + "loss": 1.7481, + "step": 52730 + }, + { + "epoch": 0.33148311826056215, + "grad_norm": 6.68671178817749, + "learning_rate": 1.779469082923313e-05, + "loss": 1.8106, + "step": 52740 + }, + { + "epoch": 0.33154597057725926, + "grad_norm": 5.893295764923096, + "learning_rate": 1.7794271728288477e-05, + "loss": 1.636, + "step": 52750 + }, + { + "epoch": 
0.3316088228939564, + "grad_norm": 7.513364791870117, + "learning_rate": 1.7793852627343824e-05, + "loss": 1.7196, + "step": 52760 + }, + { + "epoch": 0.3316716752106535, + "grad_norm": 7.483827590942383, + "learning_rate": 1.779343352639917e-05, + "loss": 1.8745, + "step": 52770 + }, + { + "epoch": 0.33173452752735055, + "grad_norm": 7.519413948059082, + "learning_rate": 1.7793014425454514e-05, + "loss": 1.8776, + "step": 52780 + }, + { + "epoch": 0.33179737984404767, + "grad_norm": 8.790800094604492, + "learning_rate": 1.779259532450986e-05, + "loss": 1.8679, + "step": 52790 + }, + { + "epoch": 0.3318602321607448, + "grad_norm": 7.194511890411377, + "learning_rate": 1.779217622356521e-05, + "loss": 2.0707, + "step": 52800 + }, + { + "epoch": 0.3319230844774419, + "grad_norm": 6.432770252227783, + "learning_rate": 1.7791757122620556e-05, + "loss": 1.7891, + "step": 52810 + }, + { + "epoch": 0.331985936794139, + "grad_norm": 7.2645111083984375, + "learning_rate": 1.7791338021675903e-05, + "loss": 1.7714, + "step": 52820 + }, + { + "epoch": 0.33204878911083613, + "grad_norm": 8.450857162475586, + "learning_rate": 1.779091892073125e-05, + "loss": 1.7657, + "step": 52830 + }, + { + "epoch": 0.33211164142753324, + "grad_norm": 7.276505947113037, + "learning_rate": 1.7790499819786597e-05, + "loss": 1.8322, + "step": 52840 + }, + { + "epoch": 0.33217449374423036, + "grad_norm": 6.824415683746338, + "learning_rate": 1.7790080718841944e-05, + "loss": 1.7138, + "step": 52850 + }, + { + "epoch": 0.3322373460609275, + "grad_norm": 6.38921594619751, + "learning_rate": 1.7789661617897288e-05, + "loss": 1.7159, + "step": 52860 + }, + { + "epoch": 0.3323001983776246, + "grad_norm": 6.562678813934326, + "learning_rate": 1.7789242516952635e-05, + "loss": 1.9596, + "step": 52870 + }, + { + "epoch": 0.3323630506943217, + "grad_norm": 7.544395446777344, + "learning_rate": 1.778882341600798e-05, + "loss": 1.9372, + "step": 52880 + }, + { + "epoch": 0.3324259030110188, + "grad_norm": 
7.433197975158691, + "learning_rate": 1.778840431506333e-05, + "loss": 1.8617, + "step": 52890 + }, + { + "epoch": 0.33248875532771593, + "grad_norm": 6.466828346252441, + "learning_rate": 1.7787985214118676e-05, + "loss": 1.594, + "step": 52900 + }, + { + "epoch": 0.332551607644413, + "grad_norm": 6.611237049102783, + "learning_rate": 1.778756611317402e-05, + "loss": 1.831, + "step": 52910 + }, + { + "epoch": 0.3326144599611101, + "grad_norm": 7.221104145050049, + "learning_rate": 1.7787147012229367e-05, + "loss": 1.5431, + "step": 52920 + }, + { + "epoch": 0.3326773122778072, + "grad_norm": 6.376213073730469, + "learning_rate": 1.7786727911284714e-05, + "loss": 1.7078, + "step": 52930 + }, + { + "epoch": 0.33274016459450434, + "grad_norm": 5.897716045379639, + "learning_rate": 1.778630881034006e-05, + "loss": 1.8387, + "step": 52940 + }, + { + "epoch": 0.33280301691120145, + "grad_norm": 6.521352767944336, + "learning_rate": 1.7785889709395408e-05, + "loss": 1.7553, + "step": 52950 + }, + { + "epoch": 0.33286586922789857, + "grad_norm": 5.935555458068848, + "learning_rate": 1.778547060845075e-05, + "loss": 1.8631, + "step": 52960 + }, + { + "epoch": 0.3329287215445957, + "grad_norm": 8.263205528259277, + "learning_rate": 1.77850515075061e-05, + "loss": 1.8783, + "step": 52970 + }, + { + "epoch": 0.3329915738612928, + "grad_norm": 6.078148365020752, + "learning_rate": 1.7784632406561446e-05, + "loss": 1.8977, + "step": 52980 + }, + { + "epoch": 0.3330544261779899, + "grad_norm": 6.746679782867432, + "learning_rate": 1.7784213305616793e-05, + "loss": 1.6983, + "step": 52990 + }, + { + "epoch": 0.33311727849468703, + "grad_norm": 7.494751930236816, + "learning_rate": 1.7783794204672136e-05, + "loss": 1.9726, + "step": 53000 + }, + { + "epoch": 0.33318013081138415, + "grad_norm": 6.933189868927002, + "learning_rate": 1.7783375103727483e-05, + "loss": 1.7022, + "step": 53010 + }, + { + "epoch": 0.33324298312808126, + "grad_norm": 7.498746871948242, + "learning_rate": 
1.778295600278283e-05, + "loss": 1.8275, + "step": 53020 + }, + { + "epoch": 0.3333058354447783, + "grad_norm": 6.554727077484131, + "learning_rate": 1.7782536901838178e-05, + "loss": 1.8502, + "step": 53030 + }, + { + "epoch": 0.33336868776147544, + "grad_norm": 6.357724189758301, + "learning_rate": 1.7782117800893525e-05, + "loss": 1.7458, + "step": 53040 + }, + { + "epoch": 0.33343154007817255, + "grad_norm": 6.919586658477783, + "learning_rate": 1.7781698699948872e-05, + "loss": 1.6825, + "step": 53050 + }, + { + "epoch": 0.33349439239486967, + "grad_norm": 7.433612823486328, + "learning_rate": 1.778127959900422e-05, + "loss": 1.7833, + "step": 53060 + }, + { + "epoch": 0.3335572447115668, + "grad_norm": 5.948964595794678, + "learning_rate": 1.7780860498059566e-05, + "loss": 1.6469, + "step": 53070 + }, + { + "epoch": 0.3336200970282639, + "grad_norm": 8.402131080627441, + "learning_rate": 1.7780441397114913e-05, + "loss": 1.8364, + "step": 53080 + }, + { + "epoch": 0.333682949344961, + "grad_norm": 7.735292911529541, + "learning_rate": 1.7780022296170257e-05, + "loss": 1.9143, + "step": 53090 + }, + { + "epoch": 0.3337458016616581, + "grad_norm": 6.684525012969971, + "learning_rate": 1.7779603195225604e-05, + "loss": 1.7593, + "step": 53100 + }, + { + "epoch": 0.33380865397835524, + "grad_norm": 6.776787757873535, + "learning_rate": 1.777918409428095e-05, + "loss": 1.9056, + "step": 53110 + }, + { + "epoch": 0.33387150629505236, + "grad_norm": 6.470606803894043, + "learning_rate": 1.7778764993336298e-05, + "loss": 1.4876, + "step": 53120 + }, + { + "epoch": 0.33393435861174947, + "grad_norm": 6.753844261169434, + "learning_rate": 1.777834589239164e-05, + "loss": 1.7287, + "step": 53130 + }, + { + "epoch": 0.3339972109284466, + "grad_norm": 6.296436786651611, + "learning_rate": 1.777792679144699e-05, + "loss": 1.8241, + "step": 53140 + }, + { + "epoch": 0.3340600632451437, + "grad_norm": 7.144308567047119, + "learning_rate": 1.7777507690502336e-05, + "loss": 
1.7879, + "step": 53150 + }, + { + "epoch": 0.33412291556184076, + "grad_norm": 6.159611701965332, + "learning_rate": 1.7777088589557683e-05, + "loss": 1.7772, + "step": 53160 + }, + { + "epoch": 0.3341857678785379, + "grad_norm": 7.0396528244018555, + "learning_rate": 1.777666948861303e-05, + "loss": 1.7225, + "step": 53170 + }, + { + "epoch": 0.334248620195235, + "grad_norm": 7.040628433227539, + "learning_rate": 1.7776250387668373e-05, + "loss": 1.626, + "step": 53180 + }, + { + "epoch": 0.3343114725119321, + "grad_norm": 6.439341068267822, + "learning_rate": 1.777583128672372e-05, + "loss": 1.8533, + "step": 53190 + }, + { + "epoch": 0.3343743248286292, + "grad_norm": 8.195597648620605, + "learning_rate": 1.7775412185779068e-05, + "loss": 1.7672, + "step": 53200 + }, + { + "epoch": 0.33443717714532634, + "grad_norm": 8.984902381896973, + "learning_rate": 1.7774993084834415e-05, + "loss": 1.8945, + "step": 53210 + }, + { + "epoch": 0.33450002946202345, + "grad_norm": 7.737391948699951, + "learning_rate": 1.7774573983889762e-05, + "loss": 1.8272, + "step": 53220 + }, + { + "epoch": 0.33456288177872057, + "grad_norm": 7.130795955657959, + "learning_rate": 1.777415488294511e-05, + "loss": 1.9769, + "step": 53230 + }, + { + "epoch": 0.3346257340954177, + "grad_norm": 6.71053409576416, + "learning_rate": 1.7773735782000452e-05, + "loss": 1.6326, + "step": 53240 + }, + { + "epoch": 0.3346885864121148, + "grad_norm": 7.596933364868164, + "learning_rate": 1.77733166810558e-05, + "loss": 1.754, + "step": 53250 + }, + { + "epoch": 0.3347514387288119, + "grad_norm": 6.899796009063721, + "learning_rate": 1.7772897580111147e-05, + "loss": 1.9545, + "step": 53260 + }, + { + "epoch": 0.334814291045509, + "grad_norm": 7.802043914794922, + "learning_rate": 1.7772478479166494e-05, + "loss": 1.8785, + "step": 53270 + }, + { + "epoch": 0.33487714336220614, + "grad_norm": 6.9452056884765625, + "learning_rate": 1.777205937822184e-05, + "loss": 1.8725, + "step": 53280 + }, + { + 
"epoch": 0.3349399956789032, + "grad_norm": 6.828030109405518, + "learning_rate": 1.7771640277277188e-05, + "loss": 1.8363, + "step": 53290 + }, + { + "epoch": 0.3350028479956003, + "grad_norm": 6.437834739685059, + "learning_rate": 1.7771221176332535e-05, + "loss": 1.5645, + "step": 53300 + }, + { + "epoch": 0.33506570031229743, + "grad_norm": 6.539505958557129, + "learning_rate": 1.777080207538788e-05, + "loss": 1.8865, + "step": 53310 + }, + { + "epoch": 0.33512855262899455, + "grad_norm": 6.247645854949951, + "learning_rate": 1.7770382974443226e-05, + "loss": 1.8822, + "step": 53320 + }, + { + "epoch": 0.33519140494569166, + "grad_norm": 7.20156192779541, + "learning_rate": 1.7769963873498573e-05, + "loss": 1.8122, + "step": 53330 + }, + { + "epoch": 0.3352542572623888, + "grad_norm": 7.3843207359313965, + "learning_rate": 1.776954477255392e-05, + "loss": 1.7941, + "step": 53340 + }, + { + "epoch": 0.3353171095790859, + "grad_norm": 6.303149700164795, + "learning_rate": 1.7769125671609263e-05, + "loss": 1.5762, + "step": 53350 + }, + { + "epoch": 0.335379961895783, + "grad_norm": 6.769227981567383, + "learning_rate": 1.776870657066461e-05, + "loss": 1.7282, + "step": 53360 + }, + { + "epoch": 0.3354428142124801, + "grad_norm": 5.572915077209473, + "learning_rate": 1.7768287469719958e-05, + "loss": 1.6846, + "step": 53370 + }, + { + "epoch": 0.33550566652917724, + "grad_norm": 8.161036491394043, + "learning_rate": 1.7767868368775305e-05, + "loss": 1.7322, + "step": 53380 + }, + { + "epoch": 0.33556851884587435, + "grad_norm": 7.607860565185547, + "learning_rate": 1.7767449267830652e-05, + "loss": 1.756, + "step": 53390 + }, + { + "epoch": 0.33563137116257147, + "grad_norm": 7.829543113708496, + "learning_rate": 1.7767030166885995e-05, + "loss": 1.6983, + "step": 53400 + }, + { + "epoch": 0.3356942234792686, + "grad_norm": 7.995700359344482, + "learning_rate": 1.7766611065941343e-05, + "loss": 1.817, + "step": 53410 + }, + { + "epoch": 0.33575707579596564, + 
"grad_norm": 6.766124725341797, + "learning_rate": 1.776619196499669e-05, + "loss": 1.7901, + "step": 53420 + }, + { + "epoch": 0.33581992811266276, + "grad_norm": 7.617290019989014, + "learning_rate": 1.7765772864052037e-05, + "loss": 2.0277, + "step": 53430 + }, + { + "epoch": 0.3358827804293599, + "grad_norm": Infinity, + "learning_rate": 1.7765353763107384e-05, + "loss": 1.8824, + "step": 53440 + }, + { + "epoch": 0.335945632746057, + "grad_norm": 9.397178649902344, + "learning_rate": 1.7764976572257195e-05, + "loss": 2.0714, + "step": 53450 + }, + { + "epoch": 0.3360084850627541, + "grad_norm": 7.2699360847473145, + "learning_rate": 1.7764557471312542e-05, + "loss": 1.7717, + "step": 53460 + }, + { + "epoch": 0.3360713373794512, + "grad_norm": 5.961643218994141, + "learning_rate": 1.776413837036789e-05, + "loss": 1.6335, + "step": 53470 + }, + { + "epoch": 0.33613418969614833, + "grad_norm": 7.276872634887695, + "learning_rate": 1.7763719269423233e-05, + "loss": 1.661, + "step": 53480 + }, + { + "epoch": 0.33619704201284545, + "grad_norm": 6.956408500671387, + "learning_rate": 1.776330016847858e-05, + "loss": 1.7756, + "step": 53490 + }, + { + "epoch": 0.33625989432954256, + "grad_norm": 8.05853271484375, + "learning_rate": 1.7762881067533927e-05, + "loss": 2.0086, + "step": 53500 + }, + { + "epoch": 0.3363227466462397, + "grad_norm": 7.309393405914307, + "learning_rate": 1.7762461966589274e-05, + "loss": 1.965, + "step": 53510 + }, + { + "epoch": 0.3363855989629368, + "grad_norm": 7.199671268463135, + "learning_rate": 1.776204286564462e-05, + "loss": 1.915, + "step": 53520 + }, + { + "epoch": 0.3364484512796339, + "grad_norm": 7.527480602264404, + "learning_rate": 1.7761623764699968e-05, + "loss": 1.8693, + "step": 53530 + }, + { + "epoch": 0.33651130359633097, + "grad_norm": 7.157961845397949, + "learning_rate": 1.7761204663755315e-05, + "loss": 1.8545, + "step": 53540 + }, + { + "epoch": 0.3365741559130281, + "grad_norm": 7.603402614593506, + 
"learning_rate": 1.776078556281066e-05, + "loss": 1.8986, + "step": 53550 + }, + { + "epoch": 0.3366370082297252, + "grad_norm": 6.873359680175781, + "learning_rate": 1.7760366461866006e-05, + "loss": 1.648, + "step": 53560 + }, + { + "epoch": 0.3366998605464223, + "grad_norm": 7.6509809494018555, + "learning_rate": 1.7759947360921353e-05, + "loss": 1.869, + "step": 53570 + }, + { + "epoch": 0.33676271286311943, + "grad_norm": 7.747193336486816, + "learning_rate": 1.77595282599767e-05, + "loss": 1.7266, + "step": 53580 + }, + { + "epoch": 0.33682556517981654, + "grad_norm": 7.056433200836182, + "learning_rate": 1.7759109159032047e-05, + "loss": 1.8732, + "step": 53590 + }, + { + "epoch": 0.33688841749651366, + "grad_norm": 8.493775367736816, + "learning_rate": 1.7758690058087394e-05, + "loss": 1.8027, + "step": 53600 + }, + { + "epoch": 0.3369512698132108, + "grad_norm": 7.364907741546631, + "learning_rate": 1.7758270957142738e-05, + "loss": 1.8418, + "step": 53610 + }, + { + "epoch": 0.3370141221299079, + "grad_norm": 7.566737651824951, + "learning_rate": 1.7757851856198085e-05, + "loss": 1.8186, + "step": 53620 + }, + { + "epoch": 0.337076974446605, + "grad_norm": 5.394718170166016, + "learning_rate": 1.7757432755253432e-05, + "loss": 1.6161, + "step": 53630 + }, + { + "epoch": 0.3371398267633021, + "grad_norm": 7.8384904861450195, + "learning_rate": 1.775701365430878e-05, + "loss": 1.8057, + "step": 53640 + }, + { + "epoch": 0.33720267907999923, + "grad_norm": 7.048176288604736, + "learning_rate": 1.7756594553364123e-05, + "loss": 1.7877, + "step": 53650 + }, + { + "epoch": 0.33726553139669635, + "grad_norm": 7.966719150543213, + "learning_rate": 1.775617545241947e-05, + "loss": 1.9503, + "step": 53660 + }, + { + "epoch": 0.3373283837133934, + "grad_norm": 6.892568588256836, + "learning_rate": 1.7755756351474817e-05, + "loss": 1.8589, + "step": 53670 + }, + { + "epoch": 0.3373912360300905, + "grad_norm": 5.600237846374512, + "learning_rate": 
1.7755337250530164e-05, + "loss": 1.9034, + "step": 53680 + }, + { + "epoch": 0.33745408834678764, + "grad_norm": 6.192532539367676, + "learning_rate": 1.775491814958551e-05, + "loss": 1.7382, + "step": 53690 + }, + { + "epoch": 0.33751694066348475, + "grad_norm": 6.748945713043213, + "learning_rate": 1.7754499048640855e-05, + "loss": 2.1606, + "step": 53700 + }, + { + "epoch": 0.33757979298018187, + "grad_norm": 5.532161712646484, + "learning_rate": 1.7754079947696202e-05, + "loss": 1.7523, + "step": 53710 + }, + { + "epoch": 0.337642645296879, + "grad_norm": 7.209076881408691, + "learning_rate": 1.775366084675155e-05, + "loss": 1.8265, + "step": 53720 + }, + { + "epoch": 0.3377054976135761, + "grad_norm": 7.280900955200195, + "learning_rate": 1.7753241745806896e-05, + "loss": 1.6002, + "step": 53730 + }, + { + "epoch": 0.3377683499302732, + "grad_norm": 5.666275978088379, + "learning_rate": 1.7752822644862243e-05, + "loss": 1.6816, + "step": 53740 + }, + { + "epoch": 0.33783120224697033, + "grad_norm": 6.765921592712402, + "learning_rate": 1.775240354391759e-05, + "loss": 1.9081, + "step": 53750 + }, + { + "epoch": 0.33789405456366745, + "grad_norm": 7.1952290534973145, + "learning_rate": 1.7751984442972937e-05, + "loss": 1.6662, + "step": 53760 + }, + { + "epoch": 0.33795690688036456, + "grad_norm": 7.036599159240723, + "learning_rate": 1.7751565342028284e-05, + "loss": 1.7319, + "step": 53770 + }, + { + "epoch": 0.3380197591970617, + "grad_norm": 7.425091743469238, + "learning_rate": 1.775114624108363e-05, + "loss": 1.9976, + "step": 53780 + }, + { + "epoch": 0.3380826115137588, + "grad_norm": 6.500268459320068, + "learning_rate": 1.7750727140138975e-05, + "loss": 1.6409, + "step": 53790 + }, + { + "epoch": 0.33814546383045585, + "grad_norm": 7.112756729125977, + "learning_rate": 1.7750308039194322e-05, + "loss": 1.8804, + "step": 53800 + }, + { + "epoch": 0.33820831614715297, + "grad_norm": 6.661694049835205, + "learning_rate": 1.774988893824967e-05, + "loss": 
1.9926, + "step": 53810 + }, + { + "epoch": 0.3382711684638501, + "grad_norm": 7.794707775115967, + "learning_rate": 1.7749469837305016e-05, + "loss": 1.6295, + "step": 53820 + }, + { + "epoch": 0.3383340207805472, + "grad_norm": 6.211827754974365, + "learning_rate": 1.774905073636036e-05, + "loss": 2.0199, + "step": 53830 + }, + { + "epoch": 0.3383968730972443, + "grad_norm": 7.719405174255371, + "learning_rate": 1.7748631635415707e-05, + "loss": 1.8618, + "step": 53840 + }, + { + "epoch": 0.3384597254139414, + "grad_norm": 6.117846965789795, + "learning_rate": 1.7748212534471054e-05, + "loss": 1.78, + "step": 53850 + }, + { + "epoch": 0.33852257773063854, + "grad_norm": 9.416608810424805, + "learning_rate": 1.77477934335264e-05, + "loss": 1.7694, + "step": 53860 + }, + { + "epoch": 0.33858543004733566, + "grad_norm": 7.474032878875732, + "learning_rate": 1.7747374332581745e-05, + "loss": 1.9287, + "step": 53870 + }, + { + "epoch": 0.33864828236403277, + "grad_norm": 6.153302192687988, + "learning_rate": 1.7746955231637092e-05, + "loss": 1.8605, + "step": 53880 + }, + { + "epoch": 0.3387111346807299, + "grad_norm": 5.692074298858643, + "learning_rate": 1.774653613069244e-05, + "loss": 1.853, + "step": 53890 + }, + { + "epoch": 0.338773986997427, + "grad_norm": 6.859311103820801, + "learning_rate": 1.7746117029747786e-05, + "loss": 1.6117, + "step": 53900 + }, + { + "epoch": 0.3388368393141241, + "grad_norm": 6.528646469116211, + "learning_rate": 1.7745697928803133e-05, + "loss": 1.9483, + "step": 53910 + }, + { + "epoch": 0.33889969163082123, + "grad_norm": 7.2122979164123535, + "learning_rate": 1.774527882785848e-05, + "loss": 1.8461, + "step": 53920 + }, + { + "epoch": 0.3389625439475183, + "grad_norm": 7.391670227050781, + "learning_rate": 1.7744859726913824e-05, + "loss": 1.7604, + "step": 53930 + }, + { + "epoch": 0.3390253962642154, + "grad_norm": 7.458083629608154, + "learning_rate": 1.774444062596917e-05, + "loss": 2.1509, + "step": 53940 + }, + { + 
"epoch": 0.3390882485809125, + "grad_norm": 8.052411079406738, + "learning_rate": 1.7744021525024518e-05, + "loss": 1.7441, + "step": 53950 + }, + { + "epoch": 0.33915110089760964, + "grad_norm": 7.353691101074219, + "learning_rate": 1.7743602424079865e-05, + "loss": 2.1435, + "step": 53960 + }, + { + "epoch": 0.33921395321430675, + "grad_norm": 7.005837440490723, + "learning_rate": 1.7743183323135212e-05, + "loss": 1.5142, + "step": 53970 + }, + { + "epoch": 0.33927680553100387, + "grad_norm": 7.651731967926025, + "learning_rate": 1.774276422219056e-05, + "loss": 1.8029, + "step": 53980 + }, + { + "epoch": 0.339339657847701, + "grad_norm": 6.751585483551025, + "learning_rate": 1.7742345121245906e-05, + "loss": 1.7118, + "step": 53990 + }, + { + "epoch": 0.3394025101643981, + "grad_norm": 7.8414177894592285, + "learning_rate": 1.7741926020301253e-05, + "loss": 1.6463, + "step": 54000 + }, + { + "epoch": 0.3394653624810952, + "grad_norm": 6.46080207824707, + "learning_rate": 1.7741506919356597e-05, + "loss": 1.7061, + "step": 54010 + }, + { + "epoch": 0.3395282147977923, + "grad_norm": 6.9414777755737305, + "learning_rate": 1.7741087818411944e-05, + "loss": 1.7382, + "step": 54020 + }, + { + "epoch": 0.33959106711448944, + "grad_norm": 7.154740333557129, + "learning_rate": 1.774066871746729e-05, + "loss": 1.9247, + "step": 54030 + }, + { + "epoch": 0.33965391943118656, + "grad_norm": 6.026335716247559, + "learning_rate": 1.774024961652264e-05, + "loss": 1.7836, + "step": 54040 + }, + { + "epoch": 0.3397167717478837, + "grad_norm": 7.3941497802734375, + "learning_rate": 1.7739830515577982e-05, + "loss": 1.8794, + "step": 54050 + }, + { + "epoch": 0.33977962406458073, + "grad_norm": 6.438807010650635, + "learning_rate": 1.773941141463333e-05, + "loss": 1.5947, + "step": 54060 + }, + { + "epoch": 0.33984247638127785, + "grad_norm": 6.488143444061279, + "learning_rate": 1.7738992313688676e-05, + "loss": 1.8451, + "step": 54070 + }, + { + "epoch": 0.33990532869797496, + 
"grad_norm": 6.967753887176514, + "learning_rate": 1.7738573212744023e-05, + "loss": 1.7057, + "step": 54080 + }, + { + "epoch": 0.3399681810146721, + "grad_norm": 6.895341396331787, + "learning_rate": 1.773815411179937e-05, + "loss": 1.7863, + "step": 54090 + }, + { + "epoch": 0.3400310333313692, + "grad_norm": 7.644385814666748, + "learning_rate": 1.7737735010854714e-05, + "loss": 1.7148, + "step": 54100 + }, + { + "epoch": 0.3400938856480663, + "grad_norm": 6.567039489746094, + "learning_rate": 1.773731590991006e-05, + "loss": 1.5497, + "step": 54110 + }, + { + "epoch": 0.3401567379647634, + "grad_norm": 6.824104309082031, + "learning_rate": 1.7736896808965408e-05, + "loss": 1.5358, + "step": 54120 + }, + { + "epoch": 0.34021959028146054, + "grad_norm": 6.814744472503662, + "learning_rate": 1.7736477708020755e-05, + "loss": 1.787, + "step": 54130 + }, + { + "epoch": 0.34028244259815765, + "grad_norm": 7.5279741287231445, + "learning_rate": 1.7736058607076102e-05, + "loss": 1.7153, + "step": 54140 + }, + { + "epoch": 0.34034529491485477, + "grad_norm": 6.229637622833252, + "learning_rate": 1.773563950613145e-05, + "loss": 1.7908, + "step": 54150 + }, + { + "epoch": 0.3404081472315519, + "grad_norm": 7.762417316436768, + "learning_rate": 1.7735220405186796e-05, + "loss": 1.9416, + "step": 54160 + }, + { + "epoch": 0.340470999548249, + "grad_norm": 7.071630954742432, + "learning_rate": 1.7734801304242144e-05, + "loss": 1.9099, + "step": 54170 + }, + { + "epoch": 0.34053385186494606, + "grad_norm": 7.530044078826904, + "learning_rate": 1.7734382203297487e-05, + "loss": 1.6938, + "step": 54180 + }, + { + "epoch": 0.3405967041816432, + "grad_norm": 7.272950172424316, + "learning_rate": 1.7733963102352834e-05, + "loss": 1.8578, + "step": 54190 + }, + { + "epoch": 0.3406595564983403, + "grad_norm": 7.612779140472412, + "learning_rate": 1.773354400140818e-05, + "loss": 1.7059, + "step": 54200 + }, + { + "epoch": 0.3407224088150374, + "grad_norm": 6.924613952636719, + 
"learning_rate": 1.773312490046353e-05, + "loss": 1.768, + "step": 54210 + }, + { + "epoch": 0.3407852611317345, + "grad_norm": 6.755637168884277, + "learning_rate": 1.7732705799518875e-05, + "loss": 1.9023, + "step": 54220 + }, + { + "epoch": 0.34084811344843163, + "grad_norm": 6.792628288269043, + "learning_rate": 1.773228669857422e-05, + "loss": 1.7773, + "step": 54230 + }, + { + "epoch": 0.34091096576512875, + "grad_norm": 6.276157379150391, + "learning_rate": 1.7731867597629566e-05, + "loss": 1.779, + "step": 54240 + }, + { + "epoch": 0.34097381808182586, + "grad_norm": 6.791790962219238, + "learning_rate": 1.7731448496684913e-05, + "loss": 1.8692, + "step": 54250 + }, + { + "epoch": 0.341036670398523, + "grad_norm": 7.605082988739014, + "learning_rate": 1.773102939574026e-05, + "loss": 1.5803, + "step": 54260 + }, + { + "epoch": 0.3410995227152201, + "grad_norm": 6.673621654510498, + "learning_rate": 1.7730610294795604e-05, + "loss": 1.7076, + "step": 54270 + }, + { + "epoch": 0.3411623750319172, + "grad_norm": 8.742812156677246, + "learning_rate": 1.773019119385095e-05, + "loss": 1.6247, + "step": 54280 + }, + { + "epoch": 0.3412252273486143, + "grad_norm": 7.297802448272705, + "learning_rate": 1.7729772092906298e-05, + "loss": 1.5804, + "step": 54290 + }, + { + "epoch": 0.34128807966531144, + "grad_norm": 7.189021587371826, + "learning_rate": 1.7729352991961645e-05, + "loss": 1.7823, + "step": 54300 + }, + { + "epoch": 0.3413509319820085, + "grad_norm": 6.340190410614014, + "learning_rate": 1.7728933891016992e-05, + "loss": 1.5445, + "step": 54310 + }, + { + "epoch": 0.3414137842987056, + "grad_norm": 7.6919989585876465, + "learning_rate": 1.7728514790072336e-05, + "loss": 1.7746, + "step": 54320 + }, + { + "epoch": 0.34147663661540273, + "grad_norm": 7.8139753341674805, + "learning_rate": 1.7728095689127683e-05, + "loss": 1.797, + "step": 54330 + }, + { + "epoch": 0.34153948893209984, + "grad_norm": 7.525743007659912, + "learning_rate": 
1.772767658818303e-05, + "loss": 1.9476, + "step": 54340 + }, + { + "epoch": 0.34160234124879696, + "grad_norm": 6.680002689361572, + "learning_rate": 1.7727257487238377e-05, + "loss": 1.8654, + "step": 54350 + }, + { + "epoch": 0.3416651935654941, + "grad_norm": 7.364837169647217, + "learning_rate": 1.7726838386293724e-05, + "loss": 1.8191, + "step": 54360 + }, + { + "epoch": 0.3417280458821912, + "grad_norm": 6.190256118774414, + "learning_rate": 1.772641928534907e-05, + "loss": 1.7854, + "step": 54370 + }, + { + "epoch": 0.3417908981988883, + "grad_norm": 5.911518096923828, + "learning_rate": 1.772600018440442e-05, + "loss": 1.6279, + "step": 54380 + }, + { + "epoch": 0.3418537505155854, + "grad_norm": 6.656449794769287, + "learning_rate": 1.7725581083459766e-05, + "loss": 1.7996, + "step": 54390 + }, + { + "epoch": 0.34191660283228253, + "grad_norm": 8.080692291259766, + "learning_rate": 1.7725161982515113e-05, + "loss": 1.8572, + "step": 54400 + }, + { + "epoch": 0.34197945514897965, + "grad_norm": 6.838883876800537, + "learning_rate": 1.7724742881570456e-05, + "loss": 1.6238, + "step": 54410 + }, + { + "epoch": 0.34204230746567676, + "grad_norm": 9.394396781921387, + "learning_rate": 1.7724323780625803e-05, + "loss": 1.7309, + "step": 54420 + }, + { + "epoch": 0.3421051597823739, + "grad_norm": 9.8051118850708, + "learning_rate": 1.772390467968115e-05, + "loss": 1.9172, + "step": 54430 + }, + { + "epoch": 0.34216801209907094, + "grad_norm": 6.973281383514404, + "learning_rate": 1.7723485578736497e-05, + "loss": 1.7838, + "step": 54440 + }, + { + "epoch": 0.34223086441576805, + "grad_norm": 6.628882884979248, + "learning_rate": 1.772306647779184e-05, + "loss": 1.7447, + "step": 54450 + }, + { + "epoch": 0.34229371673246517, + "grad_norm": 7.0264363288879395, + "learning_rate": 1.7722647376847188e-05, + "loss": 1.9837, + "step": 54460 + }, + { + "epoch": 0.3423565690491623, + "grad_norm": 7.414371013641357, + "learning_rate": 1.7722228275902535e-05, + "loss": 
1.8179, + "step": 54470 + }, + { + "epoch": 0.3424194213658594, + "grad_norm": 7.856083393096924, + "learning_rate": 1.7721809174957882e-05, + "loss": 1.8555, + "step": 54480 + }, + { + "epoch": 0.3424822736825565, + "grad_norm": 6.976076602935791, + "learning_rate": 1.7721390074013226e-05, + "loss": 1.8994, + "step": 54490 + }, + { + "epoch": 0.34254512599925363, + "grad_norm": 7.5447516441345215, + "learning_rate": 1.7720970973068573e-05, + "loss": 1.7808, + "step": 54500 + }, + { + "epoch": 0.34260797831595075, + "grad_norm": 7.636044502258301, + "learning_rate": 1.772055187212392e-05, + "loss": 2.0319, + "step": 54510 + }, + { + "epoch": 0.34267083063264786, + "grad_norm": 8.525298118591309, + "learning_rate": 1.7720132771179267e-05, + "loss": 1.8216, + "step": 54520 + }, + { + "epoch": 0.342733682949345, + "grad_norm": 7.616324424743652, + "learning_rate": 1.7719713670234614e-05, + "loss": 1.9823, + "step": 54530 + }, + { + "epoch": 0.3427965352660421, + "grad_norm": 7.189038276672363, + "learning_rate": 1.771929456928996e-05, + "loss": 1.8789, + "step": 54540 + }, + { + "epoch": 0.3428593875827392, + "grad_norm": 7.485154628753662, + "learning_rate": 1.771887546834531e-05, + "loss": 1.7238, + "step": 54550 + }, + { + "epoch": 0.3429222398994363, + "grad_norm": 7.713175296783447, + "learning_rate": 1.7718456367400652e-05, + "loss": 1.78, + "step": 54560 + }, + { + "epoch": 0.3429850922161334, + "grad_norm": 6.5029296875, + "learning_rate": 1.7718037266456e-05, + "loss": 1.8195, + "step": 54570 + }, + { + "epoch": 0.3430479445328305, + "grad_norm": 6.359293460845947, + "learning_rate": 1.7717618165511346e-05, + "loss": 1.7736, + "step": 54580 + }, + { + "epoch": 0.3431107968495276, + "grad_norm": 7.335869789123535, + "learning_rate": 1.7717199064566693e-05, + "loss": 1.629, + "step": 54590 + }, + { + "epoch": 0.3431736491662247, + "grad_norm": 6.607364654541016, + "learning_rate": 1.771677996362204e-05, + "loss": 1.7003, + "step": 54600 + }, + { + "epoch": 
0.34323650148292184, + "grad_norm": 7.606721878051758, + "learning_rate": 1.7716360862677388e-05, + "loss": 2.1147, + "step": 54610 + }, + { + "epoch": 0.34329935379961896, + "grad_norm": 7.278506755828857, + "learning_rate": 1.7715941761732735e-05, + "loss": 1.7123, + "step": 54620 + }, + { + "epoch": 0.34336220611631607, + "grad_norm": 6.562756538391113, + "learning_rate": 1.7715522660788078e-05, + "loss": 1.8377, + "step": 54630 + }, + { + "epoch": 0.3434250584330132, + "grad_norm": 7.337499141693115, + "learning_rate": 1.7715103559843425e-05, + "loss": 1.7935, + "step": 54640 + }, + { + "epoch": 0.3434879107497103, + "grad_norm": 5.972623825073242, + "learning_rate": 1.7714684458898772e-05, + "loss": 1.8112, + "step": 54650 + }, + { + "epoch": 0.3435507630664074, + "grad_norm": 5.967720985412598, + "learning_rate": 1.771426535795412e-05, + "loss": 1.8698, + "step": 54660 + }, + { + "epoch": 0.34361361538310453, + "grad_norm": 7.01886510848999, + "learning_rate": 1.7713846257009463e-05, + "loss": 1.8793, + "step": 54670 + }, + { + "epoch": 0.34367646769980165, + "grad_norm": 7.248289585113525, + "learning_rate": 1.771342715606481e-05, + "loss": 1.6498, + "step": 54680 + }, + { + "epoch": 0.3437393200164987, + "grad_norm": 6.365307331085205, + "learning_rate": 1.7713008055120157e-05, + "loss": 1.8024, + "step": 54690 + }, + { + "epoch": 0.3438021723331958, + "grad_norm": 9.144411087036133, + "learning_rate": 1.7712588954175504e-05, + "loss": 1.7123, + "step": 54700 + }, + { + "epoch": 0.34386502464989294, + "grad_norm": 5.928186893463135, + "learning_rate": 1.771216985323085e-05, + "loss": 1.5943, + "step": 54710 + }, + { + "epoch": 0.34392787696659005, + "grad_norm": 6.9440436363220215, + "learning_rate": 1.7711750752286195e-05, + "loss": 1.9885, + "step": 54720 + }, + { + "epoch": 0.34399072928328717, + "grad_norm": 6.48063850402832, + "learning_rate": 1.7711331651341542e-05, + "loss": 1.5523, + "step": 54730 + }, + { + "epoch": 0.3440535815999843, + 
"grad_norm": 7.519895553588867, + "learning_rate": 1.771091255039689e-05, + "loss": 2.0059, + "step": 54740 + }, + { + "epoch": 0.3441164339166814, + "grad_norm": 7.490455627441406, + "learning_rate": 1.7710493449452236e-05, + "loss": 1.5682, + "step": 54750 + }, + { + "epoch": 0.3441792862333785, + "grad_norm": 8.436594009399414, + "learning_rate": 1.7710074348507583e-05, + "loss": 1.7293, + "step": 54760 + }, + { + "epoch": 0.3442421385500756, + "grad_norm": 7.138073444366455, + "learning_rate": 1.770965524756293e-05, + "loss": 2.0913, + "step": 54770 + }, + { + "epoch": 0.34430499086677274, + "grad_norm": 7.458728790283203, + "learning_rate": 1.7709236146618278e-05, + "loss": 1.8845, + "step": 54780 + }, + { + "epoch": 0.34436784318346986, + "grad_norm": 7.629421710968018, + "learning_rate": 1.7708817045673625e-05, + "loss": 1.93, + "step": 54790 + }, + { + "epoch": 0.344430695500167, + "grad_norm": 6.514352798461914, + "learning_rate": 1.7708397944728968e-05, + "loss": 1.6549, + "step": 54800 + }, + { + "epoch": 0.3444935478168641, + "grad_norm": 7.668230056762695, + "learning_rate": 1.7707978843784315e-05, + "loss": 1.8372, + "step": 54810 + }, + { + "epoch": 0.34455640013356115, + "grad_norm": 6.99606466293335, + "learning_rate": 1.7707559742839662e-05, + "loss": 1.7217, + "step": 54820 + }, + { + "epoch": 0.34461925245025826, + "grad_norm": 6.23781156539917, + "learning_rate": 1.770714064189501e-05, + "loss": 1.6478, + "step": 54830 + }, + { + "epoch": 0.3446821047669554, + "grad_norm": 7.972962856292725, + "learning_rate": 1.7706721540950357e-05, + "loss": 1.8881, + "step": 54840 + }, + { + "epoch": 0.3447449570836525, + "grad_norm": 6.710640907287598, + "learning_rate": 1.77063024400057e-05, + "loss": 1.7905, + "step": 54850 + }, + { + "epoch": 0.3448078094003496, + "grad_norm": 7.677894592285156, + "learning_rate": 1.7705883339061047e-05, + "loss": 1.8085, + "step": 54860 + }, + { + "epoch": 0.3448706617170467, + "grad_norm": 7.830748081207275, + 
"learning_rate": 1.7705464238116394e-05, + "loss": 1.9429, + "step": 54870 + }, + { + "epoch": 0.34493351403374384, + "grad_norm": 6.730197429656982, + "learning_rate": 1.770504513717174e-05, + "loss": 1.6322, + "step": 54880 + }, + { + "epoch": 0.34499636635044095, + "grad_norm": 7.165801048278809, + "learning_rate": 1.7704626036227085e-05, + "loss": 1.9474, + "step": 54890 + }, + { + "epoch": 0.34505921866713807, + "grad_norm": 6.8697710037231445, + "learning_rate": 1.7704206935282432e-05, + "loss": 1.7506, + "step": 54900 + }, + { + "epoch": 0.3451220709838352, + "grad_norm": 6.679593086242676, + "learning_rate": 1.770378783433778e-05, + "loss": 1.8068, + "step": 54910 + }, + { + "epoch": 0.3451849233005323, + "grad_norm": 7.5652241706848145, + "learning_rate": 1.7703368733393126e-05, + "loss": 1.8456, + "step": 54920 + }, + { + "epoch": 0.3452477756172294, + "grad_norm": 6.1844353675842285, + "learning_rate": 1.7702949632448473e-05, + "loss": 1.7487, + "step": 54930 + }, + { + "epoch": 0.34531062793392653, + "grad_norm": 6.531343936920166, + "learning_rate": 1.7702530531503817e-05, + "loss": 1.8684, + "step": 54940 + }, + { + "epoch": 0.3453734802506236, + "grad_norm": 6.96888542175293, + "learning_rate": 1.7702111430559164e-05, + "loss": 1.8261, + "step": 54950 + }, + { + "epoch": 0.3454363325673207, + "grad_norm": 6.074754238128662, + "learning_rate": 1.770169232961451e-05, + "loss": 1.7163, + "step": 54960 + }, + { + "epoch": 0.3454991848840178, + "grad_norm": 7.811606407165527, + "learning_rate": 1.770127322866986e-05, + "loss": 1.9222, + "step": 54970 + }, + { + "epoch": 0.34556203720071493, + "grad_norm": 6.472822666168213, + "learning_rate": 1.7700854127725205e-05, + "loss": 1.6429, + "step": 54980 + }, + { + "epoch": 0.34562488951741205, + "grad_norm": 6.850888252258301, + "learning_rate": 1.7700435026780552e-05, + "loss": 1.7288, + "step": 54990 + }, + { + "epoch": 0.34568774183410916, + "grad_norm": 7.054748058319092, + "learning_rate": 
1.77000159258359e-05, + "loss": 1.972, + "step": 55000 + }, + { + "epoch": 0.3457505941508063, + "grad_norm": 6.785097599029541, + "learning_rate": 1.7699596824891247e-05, + "loss": 1.8971, + "step": 55010 + }, + { + "epoch": 0.3458134464675034, + "grad_norm": 7.310171604156494, + "learning_rate": 1.7699177723946594e-05, + "loss": 1.6131, + "step": 55020 + }, + { + "epoch": 0.3458762987842005, + "grad_norm": 6.57820987701416, + "learning_rate": 1.7698758623001937e-05, + "loss": 1.8606, + "step": 55030 + }, + { + "epoch": 0.3459391511008976, + "grad_norm": 7.374070167541504, + "learning_rate": 1.7698339522057284e-05, + "loss": 1.9107, + "step": 55040 + }, + { + "epoch": 0.34600200341759474, + "grad_norm": 7.071714401245117, + "learning_rate": 1.769792042111263e-05, + "loss": 1.6747, + "step": 55050 + }, + { + "epoch": 0.34606485573429185, + "grad_norm": 8.598275184631348, + "learning_rate": 1.769750132016798e-05, + "loss": 1.8978, + "step": 55060 + }, + { + "epoch": 0.34612770805098897, + "grad_norm": 7.332151889801025, + "learning_rate": 1.7697082219223322e-05, + "loss": 1.9403, + "step": 55070 + }, + { + "epoch": 0.34619056036768603, + "grad_norm": 6.51117467880249, + "learning_rate": 1.769666311827867e-05, + "loss": 1.7394, + "step": 55080 + }, + { + "epoch": 0.34625341268438314, + "grad_norm": 6.8738603591918945, + "learning_rate": 1.7696244017334016e-05, + "loss": 1.6063, + "step": 55090 + }, + { + "epoch": 0.34631626500108026, + "grad_norm": 7.8134307861328125, + "learning_rate": 1.7695824916389363e-05, + "loss": 1.7705, + "step": 55100 + }, + { + "epoch": 0.3463791173177774, + "grad_norm": 6.688620090484619, + "learning_rate": 1.7695405815444707e-05, + "loss": 1.7648, + "step": 55110 + }, + { + "epoch": 0.3464419696344745, + "grad_norm": 6.747702121734619, + "learning_rate": 1.7694986714500054e-05, + "loss": 2.0141, + "step": 55120 + }, + { + "epoch": 0.3465048219511716, + "grad_norm": 7.8479461669921875, + "learning_rate": 1.76945676135554e-05, + "loss": 
1.7764, + "step": 55130 + }, + { + "epoch": 0.3465676742678687, + "grad_norm": 6.962363243103027, + "learning_rate": 1.769414851261075e-05, + "loss": 1.7716, + "step": 55140 + }, + { + "epoch": 0.34663052658456583, + "grad_norm": 8.543752670288086, + "learning_rate": 1.7693729411666095e-05, + "loss": 1.8749, + "step": 55150 + }, + { + "epoch": 0.34669337890126295, + "grad_norm": 6.781923770904541, + "learning_rate": 1.7693310310721443e-05, + "loss": 1.6538, + "step": 55160 + }, + { + "epoch": 0.34675623121796006, + "grad_norm": 6.297466278076172, + "learning_rate": 1.769289120977679e-05, + "loss": 1.9625, + "step": 55170 + }, + { + "epoch": 0.3468190835346572, + "grad_norm": 6.437690258026123, + "learning_rate": 1.7692472108832133e-05, + "loss": 1.6855, + "step": 55180 + }, + { + "epoch": 0.3468819358513543, + "grad_norm": 7.9445648193359375, + "learning_rate": 1.769205300788748e-05, + "loss": 1.8701, + "step": 55190 + }, + { + "epoch": 0.34694478816805135, + "grad_norm": 7.036828994750977, + "learning_rate": 1.7691633906942827e-05, + "loss": 1.7085, + "step": 55200 + }, + { + "epoch": 0.34700764048474847, + "grad_norm": 5.771168231964111, + "learning_rate": 1.7691214805998174e-05, + "loss": 1.7395, + "step": 55210 + }, + { + "epoch": 0.3470704928014456, + "grad_norm": 7.6461181640625, + "learning_rate": 1.769079570505352e-05, + "loss": 1.7988, + "step": 55220 + }, + { + "epoch": 0.3471333451181427, + "grad_norm": 7.132679462432861, + "learning_rate": 1.769037660410887e-05, + "loss": 1.6534, + "step": 55230 + }, + { + "epoch": 0.3471961974348398, + "grad_norm": 7.007255554199219, + "learning_rate": 1.7689957503164216e-05, + "loss": 2.0081, + "step": 55240 + }, + { + "epoch": 0.34725904975153693, + "grad_norm": 7.715878009796143, + "learning_rate": 1.768953840221956e-05, + "loss": 1.8827, + "step": 55250 + }, + { + "epoch": 0.34732190206823405, + "grad_norm": 7.1222453117370605, + "learning_rate": 1.7689119301274906e-05, + "loss": 1.7782, + "step": 55260 + }, + { + 
"epoch": 0.34738475438493116, + "grad_norm": 8.725252151489258, + "learning_rate": 1.7688700200330254e-05, + "loss": 1.8346, + "step": 55270 + }, + { + "epoch": 0.3474476067016283, + "grad_norm": 7.030858039855957, + "learning_rate": 1.76882810993856e-05, + "loss": 1.8693, + "step": 55280 + }, + { + "epoch": 0.3475104590183254, + "grad_norm": 6.237267017364502, + "learning_rate": 1.7687861998440944e-05, + "loss": 1.7428, + "step": 55290 + }, + { + "epoch": 0.3475733113350225, + "grad_norm": 6.757734775543213, + "learning_rate": 1.768744289749629e-05, + "loss": 1.6203, + "step": 55300 + }, + { + "epoch": 0.3476361636517196, + "grad_norm": 7.839578628540039, + "learning_rate": 1.768702379655164e-05, + "loss": 1.6332, + "step": 55310 + }, + { + "epoch": 0.34769901596841674, + "grad_norm": 7.068745136260986, + "learning_rate": 1.7686604695606985e-05, + "loss": 2.0161, + "step": 55320 + }, + { + "epoch": 0.3477618682851138, + "grad_norm": 7.032870769500732, + "learning_rate": 1.7686185594662333e-05, + "loss": 1.8001, + "step": 55330 + }, + { + "epoch": 0.3478247206018109, + "grad_norm": 6.8951568603515625, + "learning_rate": 1.7685766493717676e-05, + "loss": 1.946, + "step": 55340 + }, + { + "epoch": 0.347887572918508, + "grad_norm": 6.67368745803833, + "learning_rate": 1.7685347392773023e-05, + "loss": 1.6941, + "step": 55350 + }, + { + "epoch": 0.34795042523520514, + "grad_norm": 7.398200988769531, + "learning_rate": 1.768492829182837e-05, + "loss": 1.623, + "step": 55360 + }, + { + "epoch": 0.34801327755190226, + "grad_norm": 7.012044429779053, + "learning_rate": 1.7684509190883717e-05, + "loss": 1.8401, + "step": 55370 + }, + { + "epoch": 0.34807612986859937, + "grad_norm": 7.086763858795166, + "learning_rate": 1.7684090089939065e-05, + "loss": 1.6442, + "step": 55380 + }, + { + "epoch": 0.3481389821852965, + "grad_norm": 6.619652271270752, + "learning_rate": 1.768367098899441e-05, + "loss": 1.6453, + "step": 55390 + }, + { + "epoch": 0.3482018345019936, + 
"grad_norm": 8.069409370422363, + "learning_rate": 1.768325188804976e-05, + "loss": 1.8919, + "step": 55400 + }, + { + "epoch": 0.3482646868186907, + "grad_norm": 6.691745758056641, + "learning_rate": 1.7682832787105106e-05, + "loss": 1.7724, + "step": 55410 + }, + { + "epoch": 0.34832753913538783, + "grad_norm": 6.368143558502197, + "learning_rate": 1.768241368616045e-05, + "loss": 1.6134, + "step": 55420 + }, + { + "epoch": 0.34839039145208495, + "grad_norm": 7.056820869445801, + "learning_rate": 1.7681994585215796e-05, + "loss": 2.0447, + "step": 55430 + }, + { + "epoch": 0.34845324376878206, + "grad_norm": 7.876461982727051, + "learning_rate": 1.7681575484271144e-05, + "loss": 1.6026, + "step": 55440 + }, + { + "epoch": 0.3485160960854792, + "grad_norm": 6.074370384216309, + "learning_rate": 1.768115638332649e-05, + "loss": 1.8327, + "step": 55450 + }, + { + "epoch": 0.34857894840217624, + "grad_norm": 6.838947772979736, + "learning_rate": 1.7680737282381838e-05, + "loss": 1.8354, + "step": 55460 + }, + { + "epoch": 0.34864180071887335, + "grad_norm": 6.722227096557617, + "learning_rate": 1.768031818143718e-05, + "loss": 1.7603, + "step": 55470 + }, + { + "epoch": 0.34870465303557047, + "grad_norm": 7.02903413772583, + "learning_rate": 1.767989908049253e-05, + "loss": 1.6104, + "step": 55480 + }, + { + "epoch": 0.3487675053522676, + "grad_norm": 7.181138515472412, + "learning_rate": 1.7679479979547876e-05, + "loss": 1.8003, + "step": 55490 + }, + { + "epoch": 0.3488303576689647, + "grad_norm": 6.824518203735352, + "learning_rate": 1.7679060878603223e-05, + "loss": 1.761, + "step": 55500 + }, + { + "epoch": 0.3488932099856618, + "grad_norm": 6.2776312828063965, + "learning_rate": 1.7678641777658566e-05, + "loss": 1.7548, + "step": 55510 + }, + { + "epoch": 0.3489560623023589, + "grad_norm": 6.086516857147217, + "learning_rate": 1.7678222676713913e-05, + "loss": 1.8004, + "step": 55520 + }, + { + "epoch": 0.34901891461905604, + "grad_norm": 5.898885250091553, + 
"learning_rate": 1.767780357576926e-05, + "loss": 1.9629, + "step": 55530 + }, + { + "epoch": 0.34908176693575316, + "grad_norm": 7.097609519958496, + "learning_rate": 1.7677384474824607e-05, + "loss": 1.5366, + "step": 55540 + }, + { + "epoch": 0.3491446192524503, + "grad_norm": 8.600543975830078, + "learning_rate": 1.7676965373879955e-05, + "loss": 1.6635, + "step": 55550 + }, + { + "epoch": 0.3492074715691474, + "grad_norm": 7.00174617767334, + "learning_rate": 1.7676546272935298e-05, + "loss": 1.7336, + "step": 55560 + }, + { + "epoch": 0.3492703238858445, + "grad_norm": 7.8161139488220215, + "learning_rate": 1.7676127171990645e-05, + "loss": 1.7531, + "step": 55570 + }, + { + "epoch": 0.3493331762025416, + "grad_norm": 8.320804595947266, + "learning_rate": 1.7675708071045992e-05, + "loss": 1.6346, + "step": 55580 + }, + { + "epoch": 0.3493960285192387, + "grad_norm": 6.894820213317871, + "learning_rate": 1.767528897010134e-05, + "loss": 1.6617, + "step": 55590 + }, + { + "epoch": 0.3494588808359358, + "grad_norm": 7.591100692749023, + "learning_rate": 1.7674869869156687e-05, + "loss": 1.9333, + "step": 55600 + }, + { + "epoch": 0.3495217331526329, + "grad_norm": 7.248533248901367, + "learning_rate": 1.7674450768212034e-05, + "loss": 1.8233, + "step": 55610 + }, + { + "epoch": 0.34958458546933, + "grad_norm": 7.202202320098877, + "learning_rate": 1.767403166726738e-05, + "loss": 1.79, + "step": 55620 + }, + { + "epoch": 0.34964743778602714, + "grad_norm": 7.5717620849609375, + "learning_rate": 1.7673612566322728e-05, + "loss": 1.808, + "step": 55630 + }, + { + "epoch": 0.34971029010272425, + "grad_norm": 7.001557350158691, + "learning_rate": 1.7673193465378075e-05, + "loss": 1.7344, + "step": 55640 + }, + { + "epoch": 0.34977314241942137, + "grad_norm": 6.482027530670166, + "learning_rate": 1.767277436443342e-05, + "loss": 1.7985, + "step": 55650 + }, + { + "epoch": 0.3498359947361185, + "grad_norm": 7.108136177062988, + "learning_rate": 1.7672355263488766e-05, 
+ "loss": 1.7908, + "step": 55660 + }, + { + "epoch": 0.3498988470528156, + "grad_norm": 7.42733097076416, + "learning_rate": 1.7671936162544113e-05, + "loss": 1.8822, + "step": 55670 + }, + { + "epoch": 0.3499616993695127, + "grad_norm": 6.271646976470947, + "learning_rate": 1.767151706159946e-05, + "loss": 1.685, + "step": 55680 + }, + { + "epoch": 0.35002455168620983, + "grad_norm": 6.688443183898926, + "learning_rate": 1.7671097960654803e-05, + "loss": 1.7514, + "step": 55690 + }, + { + "epoch": 0.35008740400290694, + "grad_norm": 6.73694372177124, + "learning_rate": 1.767067885971015e-05, + "loss": 1.658, + "step": 55700 + }, + { + "epoch": 0.350150256319604, + "grad_norm": 6.9528350830078125, + "learning_rate": 1.7670259758765498e-05, + "loss": 1.7752, + "step": 55710 + }, + { + "epoch": 0.3502131086363011, + "grad_norm": 9.307398796081543, + "learning_rate": 1.7669840657820845e-05, + "loss": 1.7258, + "step": 55720 + }, + { + "epoch": 0.35027596095299823, + "grad_norm": 7.5922088623046875, + "learning_rate": 1.7669421556876188e-05, + "loss": 1.6945, + "step": 55730 + }, + { + "epoch": 0.35033881326969535, + "grad_norm": 7.935495376586914, + "learning_rate": 1.7669002455931535e-05, + "loss": 1.9312, + "step": 55740 + }, + { + "epoch": 0.35040166558639246, + "grad_norm": 7.695913791656494, + "learning_rate": 1.7668583354986882e-05, + "loss": 1.7118, + "step": 55750 + }, + { + "epoch": 0.3504645179030896, + "grad_norm": 6.249854564666748, + "learning_rate": 1.766816425404223e-05, + "loss": 1.7209, + "step": 55760 + }, + { + "epoch": 0.3505273702197867, + "grad_norm": 7.787451267242432, + "learning_rate": 1.7667745153097577e-05, + "loss": 1.7256, + "step": 55770 + }, + { + "epoch": 0.3505902225364838, + "grad_norm": 6.477205753326416, + "learning_rate": 1.7667326052152924e-05, + "loss": 1.7085, + "step": 55780 + }, + { + "epoch": 0.3506530748531809, + "grad_norm": 6.334481716156006, + "learning_rate": 1.766690695120827e-05, + "loss": 1.6984, + "step": 55790 + }, 
+ { + "epoch": 0.35071592716987804, + "grad_norm": 7.644689083099365, + "learning_rate": 1.7666487850263618e-05, + "loss": 1.8495, + "step": 55800 + }, + { + "epoch": 0.35077877948657515, + "grad_norm": 6.420305252075195, + "learning_rate": 1.766606874931896e-05, + "loss": 1.6571, + "step": 55810 + }, + { + "epoch": 0.35084163180327227, + "grad_norm": 6.823786735534668, + "learning_rate": 1.766564964837431e-05, + "loss": 1.942, + "step": 55820 + }, + { + "epoch": 0.3509044841199694, + "grad_norm": 7.600020408630371, + "learning_rate": 1.7665230547429656e-05, + "loss": 1.7846, + "step": 55830 + }, + { + "epoch": 0.35096733643666644, + "grad_norm": 7.309628486633301, + "learning_rate": 1.7664811446485003e-05, + "loss": 1.81, + "step": 55840 + }, + { + "epoch": 0.35103018875336356, + "grad_norm": 8.144548416137695, + "learning_rate": 1.766439234554035e-05, + "loss": 2.0249, + "step": 55850 + }, + { + "epoch": 0.3510930410700607, + "grad_norm": 7.690821170806885, + "learning_rate": 1.7663973244595697e-05, + "loss": 1.7633, + "step": 55860 + }, + { + "epoch": 0.3511558933867578, + "grad_norm": 7.913681983947754, + "learning_rate": 1.766355414365104e-05, + "loss": 1.8007, + "step": 55870 + }, + { + "epoch": 0.3512187457034549, + "grad_norm": 6.931325435638428, + "learning_rate": 1.7663135042706388e-05, + "loss": 1.9441, + "step": 55880 + }, + { + "epoch": 0.351281598020152, + "grad_norm": 6.242430686950684, + "learning_rate": 1.7662715941761735e-05, + "loss": 1.5776, + "step": 55890 + }, + { + "epoch": 0.35134445033684913, + "grad_norm": 6.786398410797119, + "learning_rate": 1.7662296840817082e-05, + "loss": 1.7039, + "step": 55900 + }, + { + "epoch": 0.35140730265354625, + "grad_norm": 5.989193916320801, + "learning_rate": 1.7661877739872425e-05, + "loss": 1.869, + "step": 55910 + }, + { + "epoch": 0.35147015497024336, + "grad_norm": 5.95859956741333, + "learning_rate": 1.7661458638927772e-05, + "loss": 1.704, + "step": 55920 + }, + { + "epoch": 0.3515330072869405, + 
"grad_norm": 6.023372173309326, + "learning_rate": 1.766103953798312e-05, + "loss": 1.8161, + "step": 55930 + }, + { + "epoch": 0.3515958596036376, + "grad_norm": 7.1503005027771, + "learning_rate": 1.7660620437038467e-05, + "loss": 1.911, + "step": 55940 + }, + { + "epoch": 0.3516587119203347, + "grad_norm": 7.601272106170654, + "learning_rate": 1.7660201336093814e-05, + "loss": 1.8136, + "step": 55950 + }, + { + "epoch": 0.3517215642370318, + "grad_norm": 5.760213375091553, + "learning_rate": 1.7659782235149157e-05, + "loss": 1.8077, + "step": 55960 + }, + { + "epoch": 0.3517844165537289, + "grad_norm": 7.783840179443359, + "learning_rate": 1.7659363134204504e-05, + "loss": 1.9415, + "step": 55970 + }, + { + "epoch": 0.351847268870426, + "grad_norm": 6.4394073486328125, + "learning_rate": 1.765894403325985e-05, + "loss": 1.7765, + "step": 55980 + }, + { + "epoch": 0.3519101211871231, + "grad_norm": 8.876471519470215, + "learning_rate": 1.76585249323152e-05, + "loss": 2.0747, + "step": 55990 + }, + { + "epoch": 0.35197297350382023, + "grad_norm": 6.266054630279541, + "learning_rate": 1.7658105831370546e-05, + "loss": 1.7959, + "step": 56000 + }, + { + "epoch": 0.35203582582051735, + "grad_norm": 7.980495452880859, + "learning_rate": 1.7657686730425893e-05, + "loss": 1.6238, + "step": 56010 + }, + { + "epoch": 0.35209867813721446, + "grad_norm": 6.909236907958984, + "learning_rate": 1.765726762948124e-05, + "loss": 1.7801, + "step": 56020 + }, + { + "epoch": 0.3521615304539116, + "grad_norm": 7.640350341796875, + "learning_rate": 1.7656848528536587e-05, + "loss": 1.6987, + "step": 56030 + }, + { + "epoch": 0.3522243827706087, + "grad_norm": 6.777658462524414, + "learning_rate": 1.765642942759193e-05, + "loss": 1.8309, + "step": 56040 + }, + { + "epoch": 0.3522872350873058, + "grad_norm": 7.5713725090026855, + "learning_rate": 1.7656010326647278e-05, + "loss": 1.7728, + "step": 56050 + }, + { + "epoch": 0.3523500874040029, + "grad_norm": 7.952373027801514, + 
"learning_rate": 1.7655591225702625e-05, + "loss": 1.9563, + "step": 56060 + }, + { + "epoch": 0.35241293972070004, + "grad_norm": 7.443223476409912, + "learning_rate": 1.7655172124757972e-05, + "loss": 1.8321, + "step": 56070 + }, + { + "epoch": 0.35247579203739715, + "grad_norm": 6.388519287109375, + "learning_rate": 1.765475302381332e-05, + "loss": 1.593, + "step": 56080 + }, + { + "epoch": 0.35253864435409427, + "grad_norm": 6.434205055236816, + "learning_rate": 1.7654333922868662e-05, + "loss": 1.991, + "step": 56090 + }, + { + "epoch": 0.3526014966707913, + "grad_norm": 6.544973850250244, + "learning_rate": 1.765391482192401e-05, + "loss": 1.7038, + "step": 56100 + }, + { + "epoch": 0.35266434898748844, + "grad_norm": 6.311568260192871, + "learning_rate": 1.7653495720979357e-05, + "loss": 1.8253, + "step": 56110 + }, + { + "epoch": 0.35272720130418556, + "grad_norm": 7.9773359298706055, + "learning_rate": 1.7653076620034704e-05, + "loss": 1.7429, + "step": 56120 + }, + { + "epoch": 0.35279005362088267, + "grad_norm": 7.168190002441406, + "learning_rate": 1.7652657519090047e-05, + "loss": 1.7981, + "step": 56130 + }, + { + "epoch": 0.3528529059375798, + "grad_norm": 7.579220294952393, + "learning_rate": 1.7652238418145394e-05, + "loss": 1.6961, + "step": 56140 + }, + { + "epoch": 0.3529157582542769, + "grad_norm": 8.63365364074707, + "learning_rate": 1.765181931720074e-05, + "loss": 1.8155, + "step": 56150 + }, + { + "epoch": 0.352978610570974, + "grad_norm": 6.581787109375, + "learning_rate": 1.765140021625609e-05, + "loss": 1.8945, + "step": 56160 + }, + { + "epoch": 0.35304146288767113, + "grad_norm": 6.249863147735596, + "learning_rate": 1.7650981115311436e-05, + "loss": 1.8585, + "step": 56170 + }, + { + "epoch": 0.35310431520436825, + "grad_norm": 7.682397365570068, + "learning_rate": 1.7650562014366783e-05, + "loss": 1.5529, + "step": 56180 + }, + { + "epoch": 0.35316716752106536, + "grad_norm": 6.750224590301514, + "learning_rate": 
1.7650142913422126e-05, + "loss": 1.7804, + "step": 56190 + }, + { + "epoch": 0.3532300198377625, + "grad_norm": 7.524214267730713, + "learning_rate": 1.7649723812477473e-05, + "loss": 1.5981, + "step": 56200 + }, + { + "epoch": 0.3532928721544596, + "grad_norm": 6.7702317237854, + "learning_rate": 1.764930471153282e-05, + "loss": 1.9306, + "step": 56210 + }, + { + "epoch": 0.35335572447115665, + "grad_norm": 7.400282859802246, + "learning_rate": 1.7648885610588168e-05, + "loss": 1.7075, + "step": 56220 + }, + { + "epoch": 0.35341857678785377, + "grad_norm": 7.539808750152588, + "learning_rate": 1.7648466509643515e-05, + "loss": 1.8542, + "step": 56230 + }, + { + "epoch": 0.3534814291045509, + "grad_norm": 6.924534320831299, + "learning_rate": 1.7648047408698862e-05, + "loss": 1.8836, + "step": 56240 + }, + { + "epoch": 0.353544281421248, + "grad_norm": 7.1117844581604, + "learning_rate": 1.764762830775421e-05, + "loss": 1.63, + "step": 56250 + }, + { + "epoch": 0.3536071337379451, + "grad_norm": 7.669754981994629, + "learning_rate": 1.7647209206809556e-05, + "loss": 1.6712, + "step": 56260 + }, + { + "epoch": 0.3536699860546422, + "grad_norm": 6.416409969329834, + "learning_rate": 1.76467901058649e-05, + "loss": 1.9183, + "step": 56270 + }, + { + "epoch": 0.35373283837133934, + "grad_norm": 6.7374587059021, + "learning_rate": 1.7646371004920247e-05, + "loss": 1.8945, + "step": 56280 + }, + { + "epoch": 0.35379569068803646, + "grad_norm": 5.99789571762085, + "learning_rate": 1.7645951903975594e-05, + "loss": 1.6532, + "step": 56290 + }, + { + "epoch": 0.3538585430047336, + "grad_norm": 6.0619354248046875, + "learning_rate": 1.764553280303094e-05, + "loss": 1.7796, + "step": 56300 + }, + { + "epoch": 0.3539213953214307, + "grad_norm": 6.364901542663574, + "learning_rate": 1.7645113702086284e-05, + "loss": 1.744, + "step": 56310 + }, + { + "epoch": 0.3539842476381278, + "grad_norm": 6.619995594024658, + "learning_rate": 1.764469460114163e-05, + "loss": 1.7518, + 
"step": 56320 + }, + { + "epoch": 0.3540470999548249, + "grad_norm": 9.150291442871094, + "learning_rate": 1.764427550019698e-05, + "loss": 1.6545, + "step": 56330 + }, + { + "epoch": 0.35410995227152203, + "grad_norm": 7.180041790008545, + "learning_rate": 1.7643856399252326e-05, + "loss": 1.6696, + "step": 56340 + }, + { + "epoch": 0.3541728045882191, + "grad_norm": 6.223598480224609, + "learning_rate": 1.7643437298307673e-05, + "loss": 1.9895, + "step": 56350 + }, + { + "epoch": 0.3542356569049162, + "grad_norm": 6.764214038848877, + "learning_rate": 1.7643018197363016e-05, + "loss": 1.7926, + "step": 56360 + }, + { + "epoch": 0.3542985092216133, + "grad_norm": 7.384403705596924, + "learning_rate": 1.7642599096418364e-05, + "loss": 1.7538, + "step": 56370 + }, + { + "epoch": 0.35436136153831044, + "grad_norm": 6.367946624755859, + "learning_rate": 1.764217999547371e-05, + "loss": 1.9666, + "step": 56380 + }, + { + "epoch": 0.35442421385500755, + "grad_norm": 6.674903869628906, + "learning_rate": 1.7641760894529058e-05, + "loss": 1.7066, + "step": 56390 + }, + { + "epoch": 0.35448706617170467, + "grad_norm": 7.0921549797058105, + "learning_rate": 1.7641341793584405e-05, + "loss": 1.5394, + "step": 56400 + }, + { + "epoch": 0.3545499184884018, + "grad_norm": 7.411281585693359, + "learning_rate": 1.7640922692639752e-05, + "loss": 1.6813, + "step": 56410 + }, + { + "epoch": 0.3546127708050989, + "grad_norm": 6.548740386962891, + "learning_rate": 1.76405035916951e-05, + "loss": 1.7838, + "step": 56420 + }, + { + "epoch": 0.354675623121796, + "grad_norm": 7.3828840255737305, + "learning_rate": 1.7640084490750446e-05, + "loss": 1.7112, + "step": 56430 + }, + { + "epoch": 0.35473847543849313, + "grad_norm": 6.164304733276367, + "learning_rate": 1.763966538980579e-05, + "loss": 1.7097, + "step": 56440 + }, + { + "epoch": 0.35480132775519024, + "grad_norm": 7.027005672454834, + "learning_rate": 1.7639246288861137e-05, + "loss": 2.0333, + "step": 56450 + }, + { + "epoch": 
0.35486418007188736, + "grad_norm": 7.092922210693359, + "learning_rate": 1.7638827187916484e-05, + "loss": 1.9929, + "step": 56460 + }, + { + "epoch": 0.3549270323885845, + "grad_norm": 6.158232688903809, + "learning_rate": 1.763840808697183e-05, + "loss": 1.7665, + "step": 56470 + }, + { + "epoch": 0.35498988470528153, + "grad_norm": 7.426692485809326, + "learning_rate": 1.7637988986027178e-05, + "loss": 1.9048, + "step": 56480 + }, + { + "epoch": 0.35505273702197865, + "grad_norm": 7.2738871574401855, + "learning_rate": 1.763756988508252e-05, + "loss": 1.6266, + "step": 56490 + }, + { + "epoch": 0.35511558933867576, + "grad_norm": 6.55369234085083, + "learning_rate": 1.763715078413787e-05, + "loss": 1.6879, + "step": 56500 + }, + { + "epoch": 0.3551784416553729, + "grad_norm": 6.6980767250061035, + "learning_rate": 1.7636731683193216e-05, + "loss": 1.7281, + "step": 56510 + }, + { + "epoch": 0.35524129397207, + "grad_norm": 6.520328044891357, + "learning_rate": 1.7636312582248563e-05, + "loss": 2.037, + "step": 56520 + }, + { + "epoch": 0.3553041462887671, + "grad_norm": 5.384095191955566, + "learning_rate": 1.7635893481303906e-05, + "loss": 1.7778, + "step": 56530 + }, + { + "epoch": 0.3553669986054642, + "grad_norm": 6.548918724060059, + "learning_rate": 1.7635474380359254e-05, + "loss": 1.9156, + "step": 56540 + }, + { + "epoch": 0.35542985092216134, + "grad_norm": 7.573610782623291, + "learning_rate": 1.76350552794146e-05, + "loss": 1.8323, + "step": 56550 + }, + { + "epoch": 0.35549270323885845, + "grad_norm": 8.479838371276855, + "learning_rate": 1.7634636178469948e-05, + "loss": 1.8084, + "step": 56560 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 6.977817058563232, + "learning_rate": 1.7634217077525295e-05, + "loss": 1.6215, + "step": 56570 + }, + { + "epoch": 0.3556184078722527, + "grad_norm": 6.460672855377197, + "learning_rate": 1.763379797658064e-05, + "loss": 1.9539, + "step": 56580 + }, + { + "epoch": 0.3556812601889498, + "grad_norm": 
6.926926612854004, + "learning_rate": 1.7633378875635986e-05, + "loss": 1.7129, + "step": 56590 + }, + { + "epoch": 0.3557441125056469, + "grad_norm": 5.917463779449463, + "learning_rate": 1.7632959774691333e-05, + "loss": 1.8907, + "step": 56600 + }, + { + "epoch": 0.355806964822344, + "grad_norm": 8.021463394165039, + "learning_rate": 1.763254067374668e-05, + "loss": 1.7799, + "step": 56610 + }, + { + "epoch": 0.3558698171390411, + "grad_norm": 7.956747531890869, + "learning_rate": 1.7632121572802027e-05, + "loss": 1.598, + "step": 56620 + }, + { + "epoch": 0.3559326694557382, + "grad_norm": 7.105515003204346, + "learning_rate": 1.7631702471857374e-05, + "loss": 1.8763, + "step": 56630 + }, + { + "epoch": 0.3559955217724353, + "grad_norm": 6.7249274253845215, + "learning_rate": 1.763128337091272e-05, + "loss": 1.6253, + "step": 56640 + }, + { + "epoch": 0.35605837408913243, + "grad_norm": 6.784914970397949, + "learning_rate": 1.7630864269968068e-05, + "loss": 1.8498, + "step": 56650 + }, + { + "epoch": 0.35612122640582955, + "grad_norm": 7.381931304931641, + "learning_rate": 1.7630445169023415e-05, + "loss": 1.9151, + "step": 56660 + }, + { + "epoch": 0.35618407872252666, + "grad_norm": 7.608487606048584, + "learning_rate": 1.763002606807876e-05, + "loss": 1.7975, + "step": 56670 + }, + { + "epoch": 0.3562469310392238, + "grad_norm": 7.6619486808776855, + "learning_rate": 1.7629606967134106e-05, + "loss": 1.614, + "step": 56680 + }, + { + "epoch": 0.3563097833559209, + "grad_norm": 5.907870292663574, + "learning_rate": 1.7629187866189453e-05, + "loss": 2.0249, + "step": 56690 + }, + { + "epoch": 0.356372635672618, + "grad_norm": 6.495619773864746, + "learning_rate": 1.76287687652448e-05, + "loss": 2.0282, + "step": 56700 + }, + { + "epoch": 0.3564354879893151, + "grad_norm": 7.167433738708496, + "learning_rate": 1.7628349664300144e-05, + "loss": 1.6641, + "step": 56710 + }, + { + "epoch": 0.35649834030601224, + "grad_norm": 7.4635796546936035, + "learning_rate": 
1.762793056335549e-05, + "loss": 2.0146, + "step": 56720 + }, + { + "epoch": 0.35656119262270936, + "grad_norm": 7.188560962677002, + "learning_rate": 1.7627511462410838e-05, + "loss": 1.5855, + "step": 56730 + }, + { + "epoch": 0.3566240449394064, + "grad_norm": 8.38038444519043, + "learning_rate": 1.7627092361466185e-05, + "loss": 1.8899, + "step": 56740 + }, + { + "epoch": 0.35668689725610353, + "grad_norm": 8.065149307250977, + "learning_rate": 1.762667326052153e-05, + "loss": 1.8397, + "step": 56750 + }, + { + "epoch": 0.35674974957280065, + "grad_norm": 6.2089314460754395, + "learning_rate": 1.7626254159576876e-05, + "loss": 1.5817, + "step": 56760 + }, + { + "epoch": 0.35681260188949776, + "grad_norm": 7.829500675201416, + "learning_rate": 1.7625835058632223e-05, + "loss": 1.7704, + "step": 56770 + }, + { + "epoch": 0.3568754542061949, + "grad_norm": 6.593550205230713, + "learning_rate": 1.762541595768757e-05, + "loss": 1.6037, + "step": 56780 + }, + { + "epoch": 0.356938306522892, + "grad_norm": 5.2003045082092285, + "learning_rate": 1.7624996856742917e-05, + "loss": 1.5487, + "step": 56790 + }, + { + "epoch": 0.3570011588395891, + "grad_norm": 7.296988487243652, + "learning_rate": 1.7624577755798264e-05, + "loss": 1.999, + "step": 56800 + }, + { + "epoch": 0.3570640111562862, + "grad_norm": 6.731233596801758, + "learning_rate": 1.762415865485361e-05, + "loss": 1.68, + "step": 56810 + }, + { + "epoch": 0.35712686347298334, + "grad_norm": 8.03806209564209, + "learning_rate": 1.7623739553908955e-05, + "loss": 1.9335, + "step": 56820 + }, + { + "epoch": 0.35718971578968045, + "grad_norm": 7.351186275482178, + "learning_rate": 1.76233204529643e-05, + "loss": 1.7425, + "step": 56830 + }, + { + "epoch": 0.35725256810637757, + "grad_norm": 7.279660701751709, + "learning_rate": 1.762290135201965e-05, + "loss": 1.8906, + "step": 56840 + }, + { + "epoch": 0.3573154204230747, + "grad_norm": 7.247702598571777, + "learning_rate": 1.7622482251074996e-05, + "loss": 
1.7041, + "step": 56850 + }, + { + "epoch": 0.35737827273977174, + "grad_norm": 7.7054290771484375, + "learning_rate": 1.7622063150130343e-05, + "loss": 1.7601, + "step": 56860 + }, + { + "epoch": 0.35744112505646886, + "grad_norm": 7.007997989654541, + "learning_rate": 1.762164404918569e-05, + "loss": 2.0024, + "step": 56870 + }, + { + "epoch": 0.35750397737316597, + "grad_norm": 5.976104736328125, + "learning_rate": 1.7621224948241037e-05, + "loss": 2.0155, + "step": 56880 + }, + { + "epoch": 0.3575668296898631, + "grad_norm": 6.137852668762207, + "learning_rate": 1.762080584729638e-05, + "loss": 1.6285, + "step": 56890 + }, + { + "epoch": 0.3576296820065602, + "grad_norm": 7.945735454559326, + "learning_rate": 1.7620386746351728e-05, + "loss": 1.8247, + "step": 56900 + }, + { + "epoch": 0.3576925343232573, + "grad_norm": 6.811553955078125, + "learning_rate": 1.7619967645407075e-05, + "loss": 1.7663, + "step": 56910 + }, + { + "epoch": 0.35775538663995443, + "grad_norm": 7.087430000305176, + "learning_rate": 1.7619548544462422e-05, + "loss": 1.6412, + "step": 56920 + }, + { + "epoch": 0.35781823895665155, + "grad_norm": 8.247753143310547, + "learning_rate": 1.7619129443517766e-05, + "loss": 1.8546, + "step": 56930 + }, + { + "epoch": 0.35788109127334866, + "grad_norm": 7.284507751464844, + "learning_rate": 1.7618710342573113e-05, + "loss": 1.7229, + "step": 56940 + }, + { + "epoch": 0.3579439435900458, + "grad_norm": 6.562889575958252, + "learning_rate": 1.761829124162846e-05, + "loss": 1.823, + "step": 56950 + }, + { + "epoch": 0.3580067959067429, + "grad_norm": 7.026453971862793, + "learning_rate": 1.7617872140683807e-05, + "loss": 1.8297, + "step": 56960 + }, + { + "epoch": 0.35806964822344, + "grad_norm": 6.744218349456787, + "learning_rate": 1.7617453039739154e-05, + "loss": 1.7583, + "step": 56970 + }, + { + "epoch": 0.3581325005401371, + "grad_norm": 7.250420093536377, + "learning_rate": 1.7617033938794498e-05, + "loss": 1.8777, + "step": 56980 + }, + { + 
"epoch": 0.3581953528568342, + "grad_norm": 7.0358123779296875, + "learning_rate": 1.7616614837849845e-05, + "loss": 1.6609, + "step": 56990 + }, + { + "epoch": 0.3582582051735313, + "grad_norm": 6.225969314575195, + "learning_rate": 1.7616195736905192e-05, + "loss": 1.5569, + "step": 57000 + }, + { + "epoch": 0.3583210574902284, + "grad_norm": 5.838583469390869, + "learning_rate": 1.761577663596054e-05, + "loss": 1.4379, + "step": 57010 + }, + { + "epoch": 0.3583839098069255, + "grad_norm": 6.432792663574219, + "learning_rate": 1.7615357535015886e-05, + "loss": 2.0285, + "step": 57020 + }, + { + "epoch": 0.35844676212362264, + "grad_norm": 9.680623054504395, + "learning_rate": 1.7614938434071233e-05, + "loss": 1.7972, + "step": 57030 + }, + { + "epoch": 0.35850961444031976, + "grad_norm": 7.464869499206543, + "learning_rate": 1.761451933312658e-05, + "loss": 1.6548, + "step": 57040 + }, + { + "epoch": 0.35857246675701687, + "grad_norm": 7.695644855499268, + "learning_rate": 1.7614100232181927e-05, + "loss": 2.1453, + "step": 57050 + }, + { + "epoch": 0.358635319073714, + "grad_norm": 7.432145595550537, + "learning_rate": 1.761368113123727e-05, + "loss": 1.8533, + "step": 57060 + }, + { + "epoch": 0.3586981713904111, + "grad_norm": 8.367108345031738, + "learning_rate": 1.7613262030292618e-05, + "loss": 2.0194, + "step": 57070 + }, + { + "epoch": 0.3587610237071082, + "grad_norm": 8.933586120605469, + "learning_rate": 1.7612842929347965e-05, + "loss": 1.9469, + "step": 57080 + }, + { + "epoch": 0.35882387602380533, + "grad_norm": 7.142405986785889, + "learning_rate": 1.7612423828403312e-05, + "loss": 1.8927, + "step": 57090 + }, + { + "epoch": 0.35888672834050245, + "grad_norm": 6.7373738288879395, + "learning_rate": 1.761200472745866e-05, + "loss": 1.6524, + "step": 57100 + }, + { + "epoch": 0.35894958065719956, + "grad_norm": 6.742250442504883, + "learning_rate": 1.7611585626514003e-05, + "loss": 1.7861, + "step": 57110 + }, + { + "epoch": 0.3590124329738966, + 
"grad_norm": 6.655519962310791, + "learning_rate": 1.761116652556935e-05, + "loss": 1.8957, + "step": 57120 + }, + { + "epoch": 0.35907528529059374, + "grad_norm": 7.259047508239746, + "learning_rate": 1.7610747424624697e-05, + "loss": 1.8484, + "step": 57130 + }, + { + "epoch": 0.35913813760729085, + "grad_norm": 6.946603298187256, + "learning_rate": 1.7610328323680044e-05, + "loss": 1.8535, + "step": 57140 + }, + { + "epoch": 0.35920098992398797, + "grad_norm": 7.809139728546143, + "learning_rate": 1.7609909222735388e-05, + "loss": 1.7191, + "step": 57150 + }, + { + "epoch": 0.3592638422406851, + "grad_norm": 6.612559795379639, + "learning_rate": 1.7609490121790735e-05, + "loss": 1.9152, + "step": 57160 + }, + { + "epoch": 0.3593266945573822, + "grad_norm": 6.120494365692139, + "learning_rate": 1.7609071020846082e-05, + "loss": 1.8226, + "step": 57170 + }, + { + "epoch": 0.3593895468740793, + "grad_norm": 6.220257759094238, + "learning_rate": 1.760865191990143e-05, + "loss": 1.8188, + "step": 57180 + }, + { + "epoch": 0.35945239919077643, + "grad_norm": 7.373902320861816, + "learning_rate": 1.7608232818956776e-05, + "loss": 1.8435, + "step": 57190 + }, + { + "epoch": 0.35951525150747354, + "grad_norm": 6.463894367218018, + "learning_rate": 1.760781371801212e-05, + "loss": 1.695, + "step": 57200 + }, + { + "epoch": 0.35957810382417066, + "grad_norm": 7.294466495513916, + "learning_rate": 1.7607394617067467e-05, + "loss": 1.7689, + "step": 57210 + }, + { + "epoch": 0.3596409561408678, + "grad_norm": 7.174213409423828, + "learning_rate": 1.7606975516122814e-05, + "loss": 1.9515, + "step": 57220 + }, + { + "epoch": 0.3597038084575649, + "grad_norm": 8.09652042388916, + "learning_rate": 1.760655641517816e-05, + "loss": 1.732, + "step": 57230 + }, + { + "epoch": 0.359766660774262, + "grad_norm": 6.779608726501465, + "learning_rate": 1.7606137314233508e-05, + "loss": 1.8146, + "step": 57240 + }, + { + "epoch": 0.35982951309095906, + "grad_norm": 6.073554039001465, + 
"learning_rate": 1.7605718213288855e-05, + "loss": 2.0628, + "step": 57250 + }, + { + "epoch": 0.3598923654076562, + "grad_norm": 7.001215934753418, + "learning_rate": 1.7605299112344202e-05, + "loss": 1.7843, + "step": 57260 + }, + { + "epoch": 0.3599552177243533, + "grad_norm": 6.757166862487793, + "learning_rate": 1.760488001139955e-05, + "loss": 1.6847, + "step": 57270 + }, + { + "epoch": 0.3600180700410504, + "grad_norm": 7.732872009277344, + "learning_rate": 1.7604460910454896e-05, + "loss": 1.6415, + "step": 57280 + }, + { + "epoch": 0.3600809223577475, + "grad_norm": 6.817984104156494, + "learning_rate": 1.760404180951024e-05, + "loss": 1.68, + "step": 57290 + }, + { + "epoch": 0.36014377467444464, + "grad_norm": 7.102740287780762, + "learning_rate": 1.7603622708565587e-05, + "loss": 1.7997, + "step": 57300 + }, + { + "epoch": 0.36020662699114175, + "grad_norm": 6.813784122467041, + "learning_rate": 1.7603203607620934e-05, + "loss": 1.8612, + "step": 57310 + }, + { + "epoch": 0.36026947930783887, + "grad_norm": 6.347372055053711, + "learning_rate": 1.760278450667628e-05, + "loss": 1.6315, + "step": 57320 + }, + { + "epoch": 0.360332331624536, + "grad_norm": 8.70351505279541, + "learning_rate": 1.7602365405731625e-05, + "loss": 1.7351, + "step": 57330 + }, + { + "epoch": 0.3603951839412331, + "grad_norm": 7.5761823654174805, + "learning_rate": 1.7601946304786972e-05, + "loss": 1.7419, + "step": 57340 + }, + { + "epoch": 0.3604580362579302, + "grad_norm": 6.638217449188232, + "learning_rate": 1.760152720384232e-05, + "loss": 1.8665, + "step": 57350 + }, + { + "epoch": 0.36052088857462733, + "grad_norm": 7.505904197692871, + "learning_rate": 1.7601108102897666e-05, + "loss": 1.715, + "step": 57360 + }, + { + "epoch": 0.3605837408913244, + "grad_norm": 7.13544225692749, + "learning_rate": 1.760068900195301e-05, + "loss": 1.9328, + "step": 57370 + }, + { + "epoch": 0.3606465932080215, + "grad_norm": 7.589418411254883, + "learning_rate": 1.7600269901008357e-05, + 
"loss": 1.6518, + "step": 57380 + }, + { + "epoch": 0.3607094455247186, + "grad_norm": 7.7186713218688965, + "learning_rate": 1.7599850800063704e-05, + "loss": 1.7416, + "step": 57390 + }, + { + "epoch": 0.36077229784141573, + "grad_norm": 7.319962978363037, + "learning_rate": 1.759943169911905e-05, + "loss": 1.829, + "step": 57400 + }, + { + "epoch": 0.36083515015811285, + "grad_norm": 7.251899242401123, + "learning_rate": 1.7599012598174398e-05, + "loss": 2.0038, + "step": 57410 + }, + { + "epoch": 0.36089800247480996, + "grad_norm": 6.347203731536865, + "learning_rate": 1.7598593497229745e-05, + "loss": 1.791, + "step": 57420 + }, + { + "epoch": 0.3609608547915071, + "grad_norm": 6.426996231079102, + "learning_rate": 1.7598174396285092e-05, + "loss": 1.917, + "step": 57430 + }, + { + "epoch": 0.3610237071082042, + "grad_norm": 6.93100643157959, + "learning_rate": 1.7597755295340436e-05, + "loss": 1.7918, + "step": 57440 + }, + { + "epoch": 0.3610865594249013, + "grad_norm": 6.777101516723633, + "learning_rate": 1.7597336194395783e-05, + "loss": 1.8492, + "step": 57450 + }, + { + "epoch": 0.3611494117415984, + "grad_norm": 6.8046698570251465, + "learning_rate": 1.759691709345113e-05, + "loss": 1.5858, + "step": 57460 + }, + { + "epoch": 0.36121226405829554, + "grad_norm": 6.279371738433838, + "learning_rate": 1.7596497992506477e-05, + "loss": 1.7344, + "step": 57470 + }, + { + "epoch": 0.36127511637499266, + "grad_norm": 7.617173671722412, + "learning_rate": 1.7596078891561824e-05, + "loss": 1.8127, + "step": 57480 + }, + { + "epoch": 0.36133796869168977, + "grad_norm": 6.920362949371338, + "learning_rate": 1.7595701700711635e-05, + "loss": 1.749, + "step": 57490 + }, + { + "epoch": 0.36140082100838683, + "grad_norm": 6.679447174072266, + "learning_rate": 1.7595282599766982e-05, + "loss": 1.7885, + "step": 57500 + }, + { + "epoch": 0.36146367332508395, + "grad_norm": 6.789583206176758, + "learning_rate": 1.7594863498822326e-05, + "loss": 1.9391, + "step": 57510 + 
}, + { + "epoch": 0.36152652564178106, + "grad_norm": 7.509593486785889, + "learning_rate": 1.7594444397877673e-05, + "loss": 1.9867, + "step": 57520 + }, + { + "epoch": 0.3615893779584782, + "grad_norm": 7.413074970245361, + "learning_rate": 1.759402529693302e-05, + "loss": 1.8085, + "step": 57530 + }, + { + "epoch": 0.3616522302751753, + "grad_norm": 6.680906772613525, + "learning_rate": 1.7593606195988367e-05, + "loss": 1.8518, + "step": 57540 + }, + { + "epoch": 0.3617150825918724, + "grad_norm": 6.255010604858398, + "learning_rate": 1.7593187095043714e-05, + "loss": 1.8553, + "step": 57550 + }, + { + "epoch": 0.3617779349085695, + "grad_norm": 7.531172275543213, + "learning_rate": 1.759276799409906e-05, + "loss": 1.9392, + "step": 57560 + }, + { + "epoch": 0.36184078722526664, + "grad_norm": 7.901610374450684, + "learning_rate": 1.759234889315441e-05, + "loss": 1.7256, + "step": 57570 + }, + { + "epoch": 0.36190363954196375, + "grad_norm": 7.6437458992004395, + "learning_rate": 1.7591929792209752e-05, + "loss": 1.6673, + "step": 57580 + }, + { + "epoch": 0.36196649185866087, + "grad_norm": 6.726452827453613, + "learning_rate": 1.75915106912651e-05, + "loss": 1.8192, + "step": 57590 + }, + { + "epoch": 0.362029344175358, + "grad_norm": 7.075668811798096, + "learning_rate": 1.7591091590320446e-05, + "loss": 1.5295, + "step": 57600 + }, + { + "epoch": 0.3620921964920551, + "grad_norm": 6.800719738006592, + "learning_rate": 1.7590672489375793e-05, + "loss": 1.5967, + "step": 57610 + }, + { + "epoch": 0.3621550488087522, + "grad_norm": 6.986769676208496, + "learning_rate": 1.759025338843114e-05, + "loss": 1.7338, + "step": 57620 + }, + { + "epoch": 0.36221790112544927, + "grad_norm": 6.586205005645752, + "learning_rate": 1.7589834287486484e-05, + "loss": 1.7687, + "step": 57630 + }, + { + "epoch": 0.3622807534421464, + "grad_norm": 7.2887372970581055, + "learning_rate": 1.758941518654183e-05, + "loss": 1.7958, + "step": 57640 + }, + { + "epoch": 0.3623436057588435, 
+ "grad_norm": 7.047494888305664, + "learning_rate": 1.7588996085597178e-05, + "loss": 1.8834, + "step": 57650 + }, + { + "epoch": 0.3624064580755406, + "grad_norm": 7.804978370666504, + "learning_rate": 1.7588576984652525e-05, + "loss": 1.6487, + "step": 57660 + }, + { + "epoch": 0.36246931039223773, + "grad_norm": 6.975365161895752, + "learning_rate": 1.758815788370787e-05, + "loss": 1.7598, + "step": 57670 + }, + { + "epoch": 0.36253216270893485, + "grad_norm": 7.373959541320801, + "learning_rate": 1.7587738782763216e-05, + "loss": 1.7554, + "step": 57680 + }, + { + "epoch": 0.36259501502563196, + "grad_norm": 7.800610542297363, + "learning_rate": 1.7587319681818563e-05, + "loss": 1.7406, + "step": 57690 + }, + { + "epoch": 0.3626578673423291, + "grad_norm": 7.360167980194092, + "learning_rate": 1.758690058087391e-05, + "loss": 1.6835, + "step": 57700 + }, + { + "epoch": 0.3627207196590262, + "grad_norm": 6.3451247215271, + "learning_rate": 1.7586481479929257e-05, + "loss": 1.9422, + "step": 57710 + }, + { + "epoch": 0.3627835719757233, + "grad_norm": 8.34115219116211, + "learning_rate": 1.7586062378984604e-05, + "loss": 1.9304, + "step": 57720 + }, + { + "epoch": 0.3628464242924204, + "grad_norm": 6.679686546325684, + "learning_rate": 1.758564327803995e-05, + "loss": 1.6348, + "step": 57730 + }, + { + "epoch": 0.36290927660911754, + "grad_norm": 6.9500017166137695, + "learning_rate": 1.75852241770953e-05, + "loss": 1.8888, + "step": 57740 + }, + { + "epoch": 0.36297212892581465, + "grad_norm": 6.302811622619629, + "learning_rate": 1.7584805076150642e-05, + "loss": 1.7369, + "step": 57750 + }, + { + "epoch": 0.3630349812425117, + "grad_norm": 7.762177467346191, + "learning_rate": 1.758438597520599e-05, + "loss": 1.7919, + "step": 57760 + }, + { + "epoch": 0.3630978335592088, + "grad_norm": 6.639379024505615, + "learning_rate": 1.7583966874261336e-05, + "loss": 1.8843, + "step": 57770 + }, + { + "epoch": 0.36316068587590594, + "grad_norm": 6.072000026702881, + 
"learning_rate": 1.7583547773316683e-05, + "loss": 1.5028, + "step": 57780 + }, + { + "epoch": 0.36322353819260306, + "grad_norm": 7.485529899597168, + "learning_rate": 1.758312867237203e-05, + "loss": 1.9513, + "step": 57790 + }, + { + "epoch": 0.36328639050930017, + "grad_norm": 7.676440238952637, + "learning_rate": 1.7582709571427374e-05, + "loss": 1.569, + "step": 57800 + }, + { + "epoch": 0.3633492428259973, + "grad_norm": 7.731540679931641, + "learning_rate": 1.758229047048272e-05, + "loss": 2.0069, + "step": 57810 + }, + { + "epoch": 0.3634120951426944, + "grad_norm": 7.498253345489502, + "learning_rate": 1.758187136953807e-05, + "loss": 1.9307, + "step": 57820 + }, + { + "epoch": 0.3634749474593915, + "grad_norm": 5.711714267730713, + "learning_rate": 1.7581452268593415e-05, + "loss": 1.6662, + "step": 57830 + }, + { + "epoch": 0.36353779977608863, + "grad_norm": 7.644040584564209, + "learning_rate": 1.7581033167648762e-05, + "loss": 1.7376, + "step": 57840 + }, + { + "epoch": 0.36360065209278575, + "grad_norm": 6.760619163513184, + "learning_rate": 1.7580614066704106e-05, + "loss": 1.8354, + "step": 57850 + }, + { + "epoch": 0.36366350440948286, + "grad_norm": 6.621782302856445, + "learning_rate": 1.7580194965759453e-05, + "loss": 1.7233, + "step": 57860 + }, + { + "epoch": 0.36372635672618, + "grad_norm": 7.562590599060059, + "learning_rate": 1.75797758648148e-05, + "loss": 1.636, + "step": 57870 + }, + { + "epoch": 0.36378920904287704, + "grad_norm": 6.979267120361328, + "learning_rate": 1.7579356763870147e-05, + "loss": 1.8222, + "step": 57880 + }, + { + "epoch": 0.36385206135957415, + "grad_norm": 6.307051181793213, + "learning_rate": 1.757893766292549e-05, + "loss": 1.5847, + "step": 57890 + }, + { + "epoch": 0.36391491367627127, + "grad_norm": 6.724384307861328, + "learning_rate": 1.7578518561980838e-05, + "loss": 1.8899, + "step": 57900 + }, + { + "epoch": 0.3639777659929684, + "grad_norm": 6.279197692871094, + "learning_rate": 
1.7578099461036185e-05, + "loss": 1.8336, + "step": 57910 + }, + { + "epoch": 0.3640406183096655, + "grad_norm": 6.882331371307373, + "learning_rate": 1.7577680360091532e-05, + "loss": 1.665, + "step": 57920 + }, + { + "epoch": 0.3641034706263626, + "grad_norm": 5.85809850692749, + "learning_rate": 1.757726125914688e-05, + "loss": 1.8763, + "step": 57930 + }, + { + "epoch": 0.36416632294305973, + "grad_norm": 6.824677467346191, + "learning_rate": 1.7576842158202226e-05, + "loss": 1.7015, + "step": 57940 + }, + { + "epoch": 0.36422917525975684, + "grad_norm": 5.771215915679932, + "learning_rate": 1.7576423057257573e-05, + "loss": 1.759, + "step": 57950 + }, + { + "epoch": 0.36429202757645396, + "grad_norm": 6.597382545471191, + "learning_rate": 1.757600395631292e-05, + "loss": 1.7348, + "step": 57960 + }, + { + "epoch": 0.3643548798931511, + "grad_norm": 7.467145919799805, + "learning_rate": 1.7575584855368268e-05, + "loss": 2.0836, + "step": 57970 + }, + { + "epoch": 0.3644177322098482, + "grad_norm": 7.873134613037109, + "learning_rate": 1.757516575442361e-05, + "loss": 1.8236, + "step": 57980 + }, + { + "epoch": 0.3644805845265453, + "grad_norm": 7.0622239112854, + "learning_rate": 1.757474665347896e-05, + "loss": 1.843, + "step": 57990 + }, + { + "epoch": 0.3645434368432424, + "grad_norm": 7.284359931945801, + "learning_rate": 1.7574327552534305e-05, + "loss": 1.6614, + "step": 58000 + }, + { + "epoch": 0.3646062891599395, + "grad_norm": 6.610737323760986, + "learning_rate": 1.7573908451589653e-05, + "loss": 1.767, + "step": 58010 + }, + { + "epoch": 0.3646691414766366, + "grad_norm": 6.371236324310303, + "learning_rate": 1.7573489350645e-05, + "loss": 1.8746, + "step": 58020 + }, + { + "epoch": 0.3647319937933337, + "grad_norm": 6.598259449005127, + "learning_rate": 1.7573070249700343e-05, + "loss": 1.9581, + "step": 58030 + }, + { + "epoch": 0.3647948461100308, + "grad_norm": 6.762608528137207, + "learning_rate": 1.757265114875569e-05, + "loss": 1.6776, + 
"step": 58040 + }, + { + "epoch": 0.36485769842672794, + "grad_norm": 6.016773700714111, + "learning_rate": 1.7572232047811037e-05, + "loss": 1.6855, + "step": 58050 + }, + { + "epoch": 0.36492055074342505, + "grad_norm": 5.729305744171143, + "learning_rate": 1.7571812946866384e-05, + "loss": 1.7281, + "step": 58060 + }, + { + "epoch": 0.36498340306012217, + "grad_norm": 6.6363348960876465, + "learning_rate": 1.7571393845921728e-05, + "loss": 1.8752, + "step": 58070 + }, + { + "epoch": 0.3650462553768193, + "grad_norm": 6.771478176116943, + "learning_rate": 1.7570974744977075e-05, + "loss": 1.7828, + "step": 58080 + }, + { + "epoch": 0.3651091076935164, + "grad_norm": 6.573596954345703, + "learning_rate": 1.7570555644032422e-05, + "loss": 1.7419, + "step": 58090 + }, + { + "epoch": 0.3651719600102135, + "grad_norm": 6.653927803039551, + "learning_rate": 1.757013654308777e-05, + "loss": 1.8487, + "step": 58100 + }, + { + "epoch": 0.36523481232691063, + "grad_norm": 8.33236026763916, + "learning_rate": 1.7569717442143116e-05, + "loss": 1.7902, + "step": 58110 + }, + { + "epoch": 0.36529766464360774, + "grad_norm": 6.885913372039795, + "learning_rate": 1.7569298341198464e-05, + "loss": 1.8252, + "step": 58120 + }, + { + "epoch": 0.36536051696030486, + "grad_norm": 6.6269121170043945, + "learning_rate": 1.7568879240253807e-05, + "loss": 1.6639, + "step": 58130 + }, + { + "epoch": 0.3654233692770019, + "grad_norm": 7.685762882232666, + "learning_rate": 1.7568460139309154e-05, + "loss": 2.0038, + "step": 58140 + }, + { + "epoch": 0.36548622159369903, + "grad_norm": 7.660758018493652, + "learning_rate": 1.75680410383645e-05, + "loss": 1.8155, + "step": 58150 + }, + { + "epoch": 0.36554907391039615, + "grad_norm": 6.429567337036133, + "learning_rate": 1.756762193741985e-05, + "loss": 1.78, + "step": 58160 + }, + { + "epoch": 0.36561192622709326, + "grad_norm": 8.127532005310059, + "learning_rate": 1.7567202836475195e-05, + "loss": 1.7227, + "step": 58170 + }, + { + 
"epoch": 0.3656747785437904, + "grad_norm": 7.09838342666626, + "learning_rate": 1.7566783735530543e-05, + "loss": 1.9313, + "step": 58180 + }, + { + "epoch": 0.3657376308604875, + "grad_norm": 8.586379051208496, + "learning_rate": 1.756636463458589e-05, + "loss": 1.8507, + "step": 58190 + }, + { + "epoch": 0.3658004831771846, + "grad_norm": 7.072535514831543, + "learning_rate": 1.7565945533641233e-05, + "loss": 1.7274, + "step": 58200 + }, + { + "epoch": 0.3658633354938817, + "grad_norm": 7.332555770874023, + "learning_rate": 1.756552643269658e-05, + "loss": 1.5983, + "step": 58210 + }, + { + "epoch": 0.36592618781057884, + "grad_norm": 7.714618682861328, + "learning_rate": 1.7565107331751927e-05, + "loss": 1.7813, + "step": 58220 + }, + { + "epoch": 0.36598904012727596, + "grad_norm": 7.031050682067871, + "learning_rate": 1.7564688230807275e-05, + "loss": 2.0464, + "step": 58230 + }, + { + "epoch": 0.36605189244397307, + "grad_norm": 6.090056419372559, + "learning_rate": 1.756426912986262e-05, + "loss": 1.6593, + "step": 58240 + }, + { + "epoch": 0.3661147447606702, + "grad_norm": 7.858686447143555, + "learning_rate": 1.7563850028917965e-05, + "loss": 1.8271, + "step": 58250 + }, + { + "epoch": 0.3661775970773673, + "grad_norm": 6.74784517288208, + "learning_rate": 1.7563430927973312e-05, + "loss": 1.6977, + "step": 58260 + }, + { + "epoch": 0.36624044939406436, + "grad_norm": 6.780416965484619, + "learning_rate": 1.756301182702866e-05, + "loss": 1.7107, + "step": 58270 + }, + { + "epoch": 0.3663033017107615, + "grad_norm": 7.492496490478516, + "learning_rate": 1.756263463617847e-05, + "loss": 1.5698, + "step": 58280 + }, + { + "epoch": 0.3663661540274586, + "grad_norm": 6.76776647567749, + "learning_rate": 1.7562215535233818e-05, + "loss": 1.8447, + "step": 58290 + }, + { + "epoch": 0.3664290063441557, + "grad_norm": 6.72606086730957, + "learning_rate": 1.7561796434289165e-05, + "loss": 1.6991, + "step": 58300 + }, + { + "epoch": 0.3664918586608528, + 
"grad_norm": 7.176609516143799, + "learning_rate": 1.7561377333344512e-05, + "loss": 1.7002, + "step": 58310 + }, + { + "epoch": 0.36655471097754994, + "grad_norm": 7.325291633605957, + "learning_rate": 1.7560958232399856e-05, + "loss": 1.663, + "step": 58320 + }, + { + "epoch": 0.36661756329424705, + "grad_norm": 7.918809413909912, + "learning_rate": 1.7560539131455203e-05, + "loss": 1.854, + "step": 58330 + }, + { + "epoch": 0.36668041561094417, + "grad_norm": 7.725828170776367, + "learning_rate": 1.756012003051055e-05, + "loss": 1.7275, + "step": 58340 + }, + { + "epoch": 0.3667432679276413, + "grad_norm": 6.682360649108887, + "learning_rate": 1.7559700929565897e-05, + "loss": 2.0832, + "step": 58350 + }, + { + "epoch": 0.3668061202443384, + "grad_norm": 8.492762565612793, + "learning_rate": 1.7559281828621244e-05, + "loss": 1.5955, + "step": 58360 + }, + { + "epoch": 0.3668689725610355, + "grad_norm": 6.3802385330200195, + "learning_rate": 1.7558862727676588e-05, + "loss": 1.8547, + "step": 58370 + }, + { + "epoch": 0.3669318248777326, + "grad_norm": 6.269882678985596, + "learning_rate": 1.7558443626731935e-05, + "loss": 1.7349, + "step": 58380 + }, + { + "epoch": 0.3669946771944297, + "grad_norm": 7.0758562088012695, + "learning_rate": 1.7558024525787282e-05, + "loss": 1.7896, + "step": 58390 + }, + { + "epoch": 0.3670575295111268, + "grad_norm": 8.282215118408203, + "learning_rate": 1.755760542484263e-05, + "loss": 1.8961, + "step": 58400 + }, + { + "epoch": 0.3671203818278239, + "grad_norm": 6.388606071472168, + "learning_rate": 1.7557186323897976e-05, + "loss": 1.7292, + "step": 58410 + }, + { + "epoch": 0.36718323414452103, + "grad_norm": 6.925660133361816, + "learning_rate": 1.7556767222953323e-05, + "loss": 1.8551, + "step": 58420 + }, + { + "epoch": 0.36724608646121815, + "grad_norm": 6.5192036628723145, + "learning_rate": 1.755634812200867e-05, + "loss": 1.8022, + "step": 58430 + }, + { + "epoch": 0.36730893877791526, + "grad_norm": 7.54512357711792, + 
"learning_rate": 1.7555929021064017e-05, + "loss": 1.6527, + "step": 58440 + }, + { + "epoch": 0.3673717910946124, + "grad_norm": 7.943971633911133, + "learning_rate": 1.755550992011936e-05, + "loss": 1.9448, + "step": 58450 + }, + { + "epoch": 0.3674346434113095, + "grad_norm": 6.8775811195373535, + "learning_rate": 1.7555090819174708e-05, + "loss": 1.7978, + "step": 58460 + }, + { + "epoch": 0.3674974957280066, + "grad_norm": 7.68377161026001, + "learning_rate": 1.7554671718230055e-05, + "loss": 2.0381, + "step": 58470 + }, + { + "epoch": 0.3675603480447037, + "grad_norm": 6.402366638183594, + "learning_rate": 1.7554252617285402e-05, + "loss": 1.6738, + "step": 58480 + }, + { + "epoch": 0.36762320036140084, + "grad_norm": 7.0784502029418945, + "learning_rate": 1.755383351634075e-05, + "loss": 1.6803, + "step": 58490 + }, + { + "epoch": 0.36768605267809795, + "grad_norm": 7.339481830596924, + "learning_rate": 1.7553414415396093e-05, + "loss": 1.6937, + "step": 58500 + }, + { + "epoch": 0.36774890499479507, + "grad_norm": 6.1548991203308105, + "learning_rate": 1.755299531445144e-05, + "loss": 1.9838, + "step": 58510 + }, + { + "epoch": 0.3678117573114921, + "grad_norm": 6.306966781616211, + "learning_rate": 1.7552576213506787e-05, + "loss": 1.5747, + "step": 58520 + }, + { + "epoch": 0.36787460962818924, + "grad_norm": 6.573767185211182, + "learning_rate": 1.7552157112562134e-05, + "loss": 1.8879, + "step": 58530 + }, + { + "epoch": 0.36793746194488636, + "grad_norm": 6.0497260093688965, + "learning_rate": 1.755173801161748e-05, + "loss": 1.6662, + "step": 58540 + }, + { + "epoch": 0.36800031426158347, + "grad_norm": 7.675485134124756, + "learning_rate": 1.7551318910672825e-05, + "loss": 1.7222, + "step": 58550 + }, + { + "epoch": 0.3680631665782806, + "grad_norm": 7.188555717468262, + "learning_rate": 1.7550899809728172e-05, + "loss": 1.7689, + "step": 58560 + }, + { + "epoch": 0.3681260188949777, + "grad_norm": 7.378721714019775, + "learning_rate": 
1.755048070878352e-05, + "loss": 1.8371, + "step": 58570 + }, + { + "epoch": 0.3681888712116748, + "grad_norm": 6.536661148071289, + "learning_rate": 1.7550061607838866e-05, + "loss": 1.8305, + "step": 58580 + }, + { + "epoch": 0.36825172352837193, + "grad_norm": 6.832995891571045, + "learning_rate": 1.754964250689421e-05, + "loss": 1.9062, + "step": 58590 + }, + { + "epoch": 0.36831457584506905, + "grad_norm": 9.356914520263672, + "learning_rate": 1.7549223405949557e-05, + "loss": 1.9665, + "step": 58600 + }, + { + "epoch": 0.36837742816176616, + "grad_norm": 7.143482685089111, + "learning_rate": 1.7548804305004904e-05, + "loss": 1.7404, + "step": 58610 + }, + { + "epoch": 0.3684402804784633, + "grad_norm": 6.5126214027404785, + "learning_rate": 1.754838520406025e-05, + "loss": 1.7681, + "step": 58620 + }, + { + "epoch": 0.3685031327951604, + "grad_norm": 6.593245506286621, + "learning_rate": 1.7547966103115598e-05, + "loss": 1.7833, + "step": 58630 + }, + { + "epoch": 0.3685659851118575, + "grad_norm": 7.299543380737305, + "learning_rate": 1.7547547002170945e-05, + "loss": 1.7029, + "step": 58640 + }, + { + "epoch": 0.36862883742855457, + "grad_norm": 6.611456871032715, + "learning_rate": 1.7547127901226292e-05, + "loss": 1.8048, + "step": 58650 + }, + { + "epoch": 0.3686916897452517, + "grad_norm": 6.490697383880615, + "learning_rate": 1.754670880028164e-05, + "loss": 1.8531, + "step": 58660 + }, + { + "epoch": 0.3687545420619488, + "grad_norm": 6.8458943367004395, + "learning_rate": 1.7546289699336986e-05, + "loss": 1.6215, + "step": 58670 + }, + { + "epoch": 0.3688173943786459, + "grad_norm": 7.012301445007324, + "learning_rate": 1.754587059839233e-05, + "loss": 1.8162, + "step": 58680 + }, + { + "epoch": 0.36888024669534303, + "grad_norm": 6.759365558624268, + "learning_rate": 1.7545451497447677e-05, + "loss": 1.6949, + "step": 58690 + }, + { + "epoch": 0.36894309901204014, + "grad_norm": 6.9743571281433105, + "learning_rate": 1.7545032396503024e-05, + 
"loss": 1.8654, + "step": 58700 + }, + { + "epoch": 0.36900595132873726, + "grad_norm": 6.6136016845703125, + "learning_rate": 1.754461329555837e-05, + "loss": 1.7852, + "step": 58710 + }, + { + "epoch": 0.3690688036454344, + "grad_norm": 7.7909016609191895, + "learning_rate": 1.7544194194613715e-05, + "loss": 1.8846, + "step": 58720 + }, + { + "epoch": 0.3691316559621315, + "grad_norm": 6.4810404777526855, + "learning_rate": 1.7543775093669062e-05, + "loss": 1.843, + "step": 58730 + }, + { + "epoch": 0.3691945082788286, + "grad_norm": 7.58692741394043, + "learning_rate": 1.754335599272441e-05, + "loss": 1.6527, + "step": 58740 + }, + { + "epoch": 0.3692573605955257, + "grad_norm": 6.1824951171875, + "learning_rate": 1.7542936891779756e-05, + "loss": 1.6651, + "step": 58750 + }, + { + "epoch": 0.36932021291222283, + "grad_norm": 7.710651874542236, + "learning_rate": 1.7542517790835103e-05, + "loss": 1.9004, + "step": 58760 + }, + { + "epoch": 0.36938306522891995, + "grad_norm": 7.556338787078857, + "learning_rate": 1.7542098689890447e-05, + "loss": 1.6783, + "step": 58770 + }, + { + "epoch": 0.369445917545617, + "grad_norm": 7.3827223777771, + "learning_rate": 1.7541679588945794e-05, + "loss": 1.9175, + "step": 58780 + }, + { + "epoch": 0.3695087698623141, + "grad_norm": 5.807162761688232, + "learning_rate": 1.754126048800114e-05, + "loss": 1.8387, + "step": 58790 + }, + { + "epoch": 0.36957162217901124, + "grad_norm": 6.519895553588867, + "learning_rate": 1.7540841387056488e-05, + "loss": 1.9513, + "step": 58800 + }, + { + "epoch": 0.36963447449570835, + "grad_norm": 7.20941686630249, + "learning_rate": 1.7540422286111835e-05, + "loss": 1.6034, + "step": 58810 + }, + { + "epoch": 0.36969732681240547, + "grad_norm": 7.563182353973389, + "learning_rate": 1.754000318516718e-05, + "loss": 1.5939, + "step": 58820 + }, + { + "epoch": 0.3697601791291026, + "grad_norm": 7.159434795379639, + "learning_rate": 1.7539584084222526e-05, + "loss": 1.7039, + "step": 58830 + }, + 
{ + "epoch": 0.3698230314457997, + "grad_norm": 6.115586280822754, + "learning_rate": 1.7539164983277873e-05, + "loss": 1.7867, + "step": 58840 + }, + { + "epoch": 0.3698858837624968, + "grad_norm": 5.896388053894043, + "learning_rate": 1.753874588233322e-05, + "loss": 1.7796, + "step": 58850 + }, + { + "epoch": 0.36994873607919393, + "grad_norm": 6.0734100341796875, + "learning_rate": 1.7538326781388567e-05, + "loss": 1.5258, + "step": 58860 + }, + { + "epoch": 0.37001158839589104, + "grad_norm": 6.663506031036377, + "learning_rate": 1.7537907680443914e-05, + "loss": 1.7604, + "step": 58870 + }, + { + "epoch": 0.37007444071258816, + "grad_norm": 7.216512203216553, + "learning_rate": 1.753748857949926e-05, + "loss": 2.0099, + "step": 58880 + }, + { + "epoch": 0.3701372930292853, + "grad_norm": 7.385026454925537, + "learning_rate": 1.7537069478554608e-05, + "loss": 1.7933, + "step": 58890 + }, + { + "epoch": 0.37020014534598233, + "grad_norm": 7.46403694152832, + "learning_rate": 1.7536650377609952e-05, + "loss": 1.8139, + "step": 58900 + }, + { + "epoch": 0.37026299766267945, + "grad_norm": 8.221874237060547, + "learning_rate": 1.75362312766653e-05, + "loss": 2.0104, + "step": 58910 + }, + { + "epoch": 0.37032584997937656, + "grad_norm": 5.9333109855651855, + "learning_rate": 1.7535812175720646e-05, + "loss": 1.8058, + "step": 58920 + }, + { + "epoch": 0.3703887022960737, + "grad_norm": 5.677400588989258, + "learning_rate": 1.7535393074775993e-05, + "loss": 1.5655, + "step": 58930 + }, + { + "epoch": 0.3704515546127708, + "grad_norm": 6.823220252990723, + "learning_rate": 1.7534973973831337e-05, + "loss": 1.8187, + "step": 58940 + }, + { + "epoch": 0.3705144069294679, + "grad_norm": 6.220883846282959, + "learning_rate": 1.7534554872886684e-05, + "loss": 1.6937, + "step": 58950 + }, + { + "epoch": 0.370577259246165, + "grad_norm": 6.2850189208984375, + "learning_rate": 1.753413577194203e-05, + "loss": 1.6505, + "step": 58960 + }, + { + "epoch": 0.37064011156286214, 
+ "grad_norm": 6.755478382110596, + "learning_rate": 1.7533716670997378e-05, + "loss": 1.6412, + "step": 58970 + }, + { + "epoch": 0.37070296387955926, + "grad_norm": 7.051673412322998, + "learning_rate": 1.7533297570052725e-05, + "loss": 1.9897, + "step": 58980 + }, + { + "epoch": 0.37076581619625637, + "grad_norm": 6.870248317718506, + "learning_rate": 1.753287846910807e-05, + "loss": 1.7811, + "step": 58990 + }, + { + "epoch": 0.3708286685129535, + "grad_norm": 7.873626232147217, + "learning_rate": 1.7532459368163416e-05, + "loss": 2.0148, + "step": 59000 + }, + { + "epoch": 0.3708915208296506, + "grad_norm": 6.326781749725342, + "learning_rate": 1.7532040267218763e-05, + "loss": 1.6766, + "step": 59010 + }, + { + "epoch": 0.3709543731463477, + "grad_norm": 7.179531097412109, + "learning_rate": 1.753162116627411e-05, + "loss": 1.8273, + "step": 59020 + }, + { + "epoch": 0.3710172254630448, + "grad_norm": 6.448795318603516, + "learning_rate": 1.7531202065329457e-05, + "loss": 1.7084, + "step": 59030 + }, + { + "epoch": 0.3710800777797419, + "grad_norm": 7.391201496124268, + "learning_rate": 1.7530782964384804e-05, + "loss": 1.7986, + "step": 59040 + }, + { + "epoch": 0.371142930096439, + "grad_norm": 7.2317070960998535, + "learning_rate": 1.753036386344015e-05, + "loss": 1.6227, + "step": 59050 + }, + { + "epoch": 0.3712057824131361, + "grad_norm": 6.9276227951049805, + "learning_rate": 1.7529944762495498e-05, + "loss": 1.9586, + "step": 59060 + }, + { + "epoch": 0.37126863472983324, + "grad_norm": 6.809334754943848, + "learning_rate": 1.7529525661550842e-05, + "loss": 1.7877, + "step": 59070 + }, + { + "epoch": 0.37133148704653035, + "grad_norm": 6.7816901206970215, + "learning_rate": 1.752910656060619e-05, + "loss": 1.7312, + "step": 59080 + }, + { + "epoch": 0.37139433936322747, + "grad_norm": 6.264307498931885, + "learning_rate": 1.7528687459661536e-05, + "loss": 1.6797, + "step": 59090 + }, + { + "epoch": 0.3714571916799246, + "grad_norm": 
6.3187689781188965, + "learning_rate": 1.7528268358716883e-05, + "loss": 1.7063, + "step": 59100 + }, + { + "epoch": 0.3715200439966217, + "grad_norm": 8.220011711120605, + "learning_rate": 1.752784925777223e-05, + "loss": 1.7625, + "step": 59110 + }, + { + "epoch": 0.3715828963133188, + "grad_norm": 6.573159694671631, + "learning_rate": 1.7527430156827574e-05, + "loss": 1.9129, + "step": 59120 + }, + { + "epoch": 0.3716457486300159, + "grad_norm": 6.078289031982422, + "learning_rate": 1.752701105588292e-05, + "loss": 1.9416, + "step": 59130 + }, + { + "epoch": 0.37170860094671304, + "grad_norm": 6.652973175048828, + "learning_rate": 1.7526591954938268e-05, + "loss": 1.8134, + "step": 59140 + }, + { + "epoch": 0.37177145326341016, + "grad_norm": 6.304858684539795, + "learning_rate": 1.7526172853993615e-05, + "loss": 1.7168, + "step": 59150 + }, + { + "epoch": 0.3718343055801072, + "grad_norm": 8.342355728149414, + "learning_rate": 1.7525753753048962e-05, + "loss": 1.9052, + "step": 59160 + }, + { + "epoch": 0.37189715789680433, + "grad_norm": 8.306211471557617, + "learning_rate": 1.7525334652104306e-05, + "loss": 1.7468, + "step": 59170 + }, + { + "epoch": 0.37196001021350145, + "grad_norm": 7.454753398895264, + "learning_rate": 1.7524915551159653e-05, + "loss": 1.7553, + "step": 59180 + }, + { + "epoch": 0.37202286253019856, + "grad_norm": 7.92028284072876, + "learning_rate": 1.7524496450215e-05, + "loss": 1.842, + "step": 59190 + }, + { + "epoch": 0.3720857148468957, + "grad_norm": 5.945263385772705, + "learning_rate": 1.7524077349270347e-05, + "loss": 1.8005, + "step": 59200 + }, + { + "epoch": 0.3721485671635928, + "grad_norm": 7.089756965637207, + "learning_rate": 1.752365824832569e-05, + "loss": 1.8219, + "step": 59210 + }, + { + "epoch": 0.3722114194802899, + "grad_norm": 7.197872638702393, + "learning_rate": 1.7523239147381038e-05, + "loss": 1.9255, + "step": 59220 + }, + { + "epoch": 0.372274271796987, + "grad_norm": 7.652667045593262, + "learning_rate": 
1.7522820046436385e-05, + "loss": 1.8811, + "step": 59230 + }, + { + "epoch": 0.37233712411368414, + "grad_norm": 7.3847808837890625, + "learning_rate": 1.7522400945491732e-05, + "loss": 1.7606, + "step": 59240 + }, + { + "epoch": 0.37239997643038125, + "grad_norm": 6.941809177398682, + "learning_rate": 1.752198184454708e-05, + "loss": 1.8537, + "step": 59250 + }, + { + "epoch": 0.37246282874707837, + "grad_norm": 7.309818267822266, + "learning_rate": 1.7521562743602426e-05, + "loss": 1.7466, + "step": 59260 + }, + { + "epoch": 0.3725256810637755, + "grad_norm": 7.6321845054626465, + "learning_rate": 1.7521143642657773e-05, + "loss": 1.9646, + "step": 59270 + }, + { + "epoch": 0.3725885333804726, + "grad_norm": 7.124085426330566, + "learning_rate": 1.752072454171312e-05, + "loss": 1.8152, + "step": 59280 + }, + { + "epoch": 0.37265138569716966, + "grad_norm": 6.121925354003906, + "learning_rate": 1.7520305440768467e-05, + "loss": 1.7495, + "step": 59290 + }, + { + "epoch": 0.37271423801386677, + "grad_norm": 6.808883190155029, + "learning_rate": 1.751988633982381e-05, + "loss": 1.7018, + "step": 59300 + }, + { + "epoch": 0.3727770903305639, + "grad_norm": 6.999438762664795, + "learning_rate": 1.7519467238879158e-05, + "loss": 1.6712, + "step": 59310 + }, + { + "epoch": 0.372839942647261, + "grad_norm": 6.6644086837768555, + "learning_rate": 1.7519048137934505e-05, + "loss": 1.7708, + "step": 59320 + }, + { + "epoch": 0.3729027949639581, + "grad_norm": 7.2013840675354, + "learning_rate": 1.7518629036989852e-05, + "loss": 1.8504, + "step": 59330 + }, + { + "epoch": 0.37296564728065523, + "grad_norm": 7.2592082023620605, + "learning_rate": 1.7518209936045196e-05, + "loss": 1.796, + "step": 59340 + }, + { + "epoch": 0.37302849959735235, + "grad_norm": 6.626367568969727, + "learning_rate": 1.7517790835100543e-05, + "loss": 1.8003, + "step": 59350 + }, + { + "epoch": 0.37309135191404946, + "grad_norm": 6.0213398933410645, + "learning_rate": 1.751737173415589e-05, + 
"loss": 1.5787, + "step": 59360 + }, + { + "epoch": 0.3731542042307466, + "grad_norm": 7.501685619354248, + "learning_rate": 1.7516952633211237e-05, + "loss": 1.9043, + "step": 59370 + }, + { + "epoch": 0.3732170565474437, + "grad_norm": 5.493393898010254, + "learning_rate": 1.7516533532266584e-05, + "loss": 1.6693, + "step": 59380 + }, + { + "epoch": 0.3732799088641408, + "grad_norm": 5.734950542449951, + "learning_rate": 1.7516114431321928e-05, + "loss": 1.8853, + "step": 59390 + }, + { + "epoch": 0.3733427611808379, + "grad_norm": 6.7122273445129395, + "learning_rate": 1.7515695330377275e-05, + "loss": 1.8712, + "step": 59400 + }, + { + "epoch": 0.37340561349753504, + "grad_norm": 7.50879430770874, + "learning_rate": 1.7515276229432622e-05, + "loss": 1.9913, + "step": 59410 + }, + { + "epoch": 0.3734684658142321, + "grad_norm": 8.851447105407715, + "learning_rate": 1.751485712848797e-05, + "loss": 1.763, + "step": 59420 + }, + { + "epoch": 0.3735313181309292, + "grad_norm": 8.338323593139648, + "learning_rate": 1.7514438027543316e-05, + "loss": 1.708, + "step": 59430 + }, + { + "epoch": 0.37359417044762633, + "grad_norm": 6.182701110839844, + "learning_rate": 1.7514018926598663e-05, + "loss": 1.6702, + "step": 59440 + }, + { + "epoch": 0.37365702276432344, + "grad_norm": 7.472273826599121, + "learning_rate": 1.7513599825654007e-05, + "loss": 1.9166, + "step": 59450 + }, + { + "epoch": 0.37371987508102056, + "grad_norm": 7.576421737670898, + "learning_rate": 1.7513180724709354e-05, + "loss": 1.6216, + "step": 59460 + }, + { + "epoch": 0.3737827273977177, + "grad_norm": 7.299145698547363, + "learning_rate": 1.75127616237647e-05, + "loss": 1.7993, + "step": 59470 + }, + { + "epoch": 0.3738455797144148, + "grad_norm": 5.993757247924805, + "learning_rate": 1.7512342522820048e-05, + "loss": 1.8102, + "step": 59480 + }, + { + "epoch": 0.3739084320311119, + "grad_norm": 6.252512454986572, + "learning_rate": 1.7511923421875395e-05, + "loss": 1.6485, + "step": 59490 + }, 
+ { + "epoch": 0.373971284347809, + "grad_norm": 7.279608249664307, + "learning_rate": 1.7511504320930742e-05, + "loss": 1.7594, + "step": 59500 + }, + { + "epoch": 0.37403413666450613, + "grad_norm": 7.6866655349731445, + "learning_rate": 1.751108521998609e-05, + "loss": 1.6542, + "step": 59510 + }, + { + "epoch": 0.37409698898120325, + "grad_norm": 6.671911239624023, + "learning_rate": 1.7510666119041433e-05, + "loss": 1.6437, + "step": 59520 + }, + { + "epoch": 0.37415984129790036, + "grad_norm": 6.562828063964844, + "learning_rate": 1.751024701809678e-05, + "loss": 1.5359, + "step": 59530 + }, + { + "epoch": 0.3742226936145974, + "grad_norm": 6.7453413009643555, + "learning_rate": 1.7509827917152127e-05, + "loss": 1.8324, + "step": 59540 + }, + { + "epoch": 0.37428554593129454, + "grad_norm": 8.931385040283203, + "learning_rate": 1.7509408816207474e-05, + "loss": 2.0589, + "step": 59550 + }, + { + "epoch": 0.37434839824799165, + "grad_norm": 7.867937088012695, + "learning_rate": 1.7508989715262818e-05, + "loss": 1.8412, + "step": 59560 + }, + { + "epoch": 0.37441125056468877, + "grad_norm": 6.942878246307373, + "learning_rate": 1.7508570614318165e-05, + "loss": 1.8466, + "step": 59570 + }, + { + "epoch": 0.3744741028813859, + "grad_norm": 6.9903178215026855, + "learning_rate": 1.7508151513373512e-05, + "loss": 1.6826, + "step": 59580 + }, + { + "epoch": 0.374536955198083, + "grad_norm": 7.423864841461182, + "learning_rate": 1.750773241242886e-05, + "loss": 1.6371, + "step": 59590 + }, + { + "epoch": 0.3745998075147801, + "grad_norm": 7.129739761352539, + "learning_rate": 1.7507313311484206e-05, + "loss": 1.8208, + "step": 59600 + }, + { + "epoch": 0.37466265983147723, + "grad_norm": 8.435739517211914, + "learning_rate": 1.750689421053955e-05, + "loss": 1.7287, + "step": 59610 + }, + { + "epoch": 0.37472551214817434, + "grad_norm": 6.890371799468994, + "learning_rate": 1.7506475109594897e-05, + "loss": 1.7205, + "step": 59620 + }, + { + "epoch": 
0.37478836446487146, + "grad_norm": 7.0437726974487305, + "learning_rate": 1.7506056008650244e-05, + "loss": 1.9148, + "step": 59630 + }, + { + "epoch": 0.3748512167815686, + "grad_norm": 6.208669662475586, + "learning_rate": 1.750563690770559e-05, + "loss": 1.7117, + "step": 59640 + }, + { + "epoch": 0.3749140690982657, + "grad_norm": 6.926197052001953, + "learning_rate": 1.7505217806760938e-05, + "loss": 1.6599, + "step": 59650 + }, + { + "epoch": 0.3749769214149628, + "grad_norm": 6.988066673278809, + "learning_rate": 1.7504798705816285e-05, + "loss": 1.6229, + "step": 59660 + }, + { + "epoch": 0.37503977373165986, + "grad_norm": 8.055120468139648, + "learning_rate": 1.7504379604871632e-05, + "loss": 1.8998, + "step": 59670 + }, + { + "epoch": 0.375102626048357, + "grad_norm": 7.21201753616333, + "learning_rate": 1.750396050392698e-05, + "loss": 1.8485, + "step": 59680 + }, + { + "epoch": 0.3751654783650541, + "grad_norm": 5.845964431762695, + "learning_rate": 1.7503541402982326e-05, + "loss": 1.8329, + "step": 59690 + }, + { + "epoch": 0.3752283306817512, + "grad_norm": 7.649561405181885, + "learning_rate": 1.750312230203767e-05, + "loss": 1.6308, + "step": 59700 + }, + { + "epoch": 0.3752911829984483, + "grad_norm": 7.195222854614258, + "learning_rate": 1.7502703201093017e-05, + "loss": 1.8236, + "step": 59710 + }, + { + "epoch": 0.37535403531514544, + "grad_norm": 7.8561248779296875, + "learning_rate": 1.7502284100148364e-05, + "loss": 1.8016, + "step": 59720 + }, + { + "epoch": 0.37541688763184256, + "grad_norm": 6.7510600090026855, + "learning_rate": 1.750186499920371e-05, + "loss": 1.747, + "step": 59730 + }, + { + "epoch": 0.37547973994853967, + "grad_norm": 7.114448547363281, + "learning_rate": 1.7501445898259055e-05, + "loss": 1.7153, + "step": 59740 + }, + { + "epoch": 0.3755425922652368, + "grad_norm": 6.795496940612793, + "learning_rate": 1.7501026797314402e-05, + "loss": 1.7033, + "step": 59750 + }, + { + "epoch": 0.3756054445819339, + "grad_norm": 
9.095891952514648, + "learning_rate": 1.750060769636975e-05, + "loss": 1.6289, + "step": 59760 + }, + { + "epoch": 0.375668296898631, + "grad_norm": 6.375208377838135, + "learning_rate": 1.7500188595425096e-05, + "loss": 1.7403, + "step": 59770 + }, + { + "epoch": 0.37573114921532813, + "grad_norm": 7.159587383270264, + "learning_rate": 1.7499769494480443e-05, + "loss": 1.8371, + "step": 59780 + }, + { + "epoch": 0.37579400153202525, + "grad_norm": 7.09494686126709, + "learning_rate": 1.7499350393535787e-05, + "loss": 1.8446, + "step": 59790 + }, + { + "epoch": 0.3758568538487223, + "grad_norm": 6.484232425689697, + "learning_rate": 1.7498931292591134e-05, + "loss": 1.8591, + "step": 59800 + }, + { + "epoch": 0.3759197061654194, + "grad_norm": 7.286050796508789, + "learning_rate": 1.749851219164648e-05, + "loss": 1.7384, + "step": 59810 + }, + { + "epoch": 0.37598255848211654, + "grad_norm": 7.048933506011963, + "learning_rate": 1.7498093090701828e-05, + "loss": 1.5255, + "step": 59820 + }, + { + "epoch": 0.37604541079881365, + "grad_norm": 6.8245768547058105, + "learning_rate": 1.7497673989757172e-05, + "loss": 1.7599, + "step": 59830 + }, + { + "epoch": 0.37610826311551077, + "grad_norm": 8.256148338317871, + "learning_rate": 1.749725488881252e-05, + "loss": 1.7403, + "step": 59840 + }, + { + "epoch": 0.3761711154322079, + "grad_norm": 7.225186347961426, + "learning_rate": 1.7496835787867866e-05, + "loss": 1.6914, + "step": 59850 + }, + { + "epoch": 0.376233967748905, + "grad_norm": 6.334831237792969, + "learning_rate": 1.7496416686923213e-05, + "loss": 1.7208, + "step": 59860 + }, + { + "epoch": 0.3762968200656021, + "grad_norm": 6.329988479614258, + "learning_rate": 1.749599758597856e-05, + "loss": 1.8027, + "step": 59870 + }, + { + "epoch": 0.3763596723822992, + "grad_norm": 6.6955342292785645, + "learning_rate": 1.7495578485033907e-05, + "loss": 1.7901, + "step": 59880 + }, + { + "epoch": 0.37642252469899634, + "grad_norm": 5.487803936004639, + 
"learning_rate": 1.7495159384089254e-05, + "loss": 1.6651, + "step": 59890 + }, + { + "epoch": 0.37648537701569346, + "grad_norm": 6.468836784362793, + "learning_rate": 1.74947402831446e-05, + "loss": 1.683, + "step": 59900 + }, + { + "epoch": 0.37654822933239057, + "grad_norm": 7.434545040130615, + "learning_rate": 1.749432118219995e-05, + "loss": 1.9459, + "step": 59910 + }, + { + "epoch": 0.3766110816490877, + "grad_norm": 6.61981201171875, + "learning_rate": 1.7493902081255292e-05, + "loss": 1.8659, + "step": 59920 + }, + { + "epoch": 0.37667393396578475, + "grad_norm": 7.554393768310547, + "learning_rate": 1.749348298031064e-05, + "loss": 1.8676, + "step": 59930 + }, + { + "epoch": 0.37673678628248186, + "grad_norm": 7.570027828216553, + "learning_rate": 1.7493063879365986e-05, + "loss": 1.5633, + "step": 59940 + }, + { + "epoch": 0.376799638599179, + "grad_norm": 7.682457447052002, + "learning_rate": 1.7492644778421333e-05, + "loss": 1.8556, + "step": 59950 + }, + { + "epoch": 0.3768624909158761, + "grad_norm": 6.537892818450928, + "learning_rate": 1.7492225677476677e-05, + "loss": 1.8287, + "step": 59960 + }, + { + "epoch": 0.3769253432325732, + "grad_norm": 7.347871780395508, + "learning_rate": 1.7491806576532024e-05, + "loss": 1.9065, + "step": 59970 + }, + { + "epoch": 0.3769881955492703, + "grad_norm": 5.919852256774902, + "learning_rate": 1.749138747558737e-05, + "loss": 1.725, + "step": 59980 + }, + { + "epoch": 0.37705104786596744, + "grad_norm": 7.147500514984131, + "learning_rate": 1.7490968374642718e-05, + "loss": 2.0688, + "step": 59990 + }, + { + "epoch": 0.37711390018266455, + "grad_norm": 6.638935565948486, + "learning_rate": 1.7490549273698065e-05, + "loss": 1.752, + "step": 60000 + }, + { + "epoch": 0.37717675249936167, + "grad_norm": 7.027769088745117, + "learning_rate": 1.749013017275341e-05, + "loss": 1.9077, + "step": 60010 + }, + { + "epoch": 0.3772396048160588, + "grad_norm": 7.6059250831604, + "learning_rate": 1.7489711071808756e-05, + 
"loss": 1.8625, + "step": 60020 + }, + { + "epoch": 0.3773024571327559, + "grad_norm": 6.746246814727783, + "learning_rate": 1.7489291970864103e-05, + "loss": 1.5552, + "step": 60030 + }, + { + "epoch": 0.377365309449453, + "grad_norm": 6.389588832855225, + "learning_rate": 1.748887286991945e-05, + "loss": 1.7082, + "step": 60040 + }, + { + "epoch": 0.37742816176615007, + "grad_norm": 6.3765363693237305, + "learning_rate": 1.7488453768974797e-05, + "loss": 1.888, + "step": 60050 + }, + { + "epoch": 0.3774910140828472, + "grad_norm": 8.022350311279297, + "learning_rate": 1.7488034668030144e-05, + "loss": 1.7982, + "step": 60060 + }, + { + "epoch": 0.3775538663995443, + "grad_norm": 6.376769542694092, + "learning_rate": 1.748761556708549e-05, + "loss": 1.5281, + "step": 60070 + }, + { + "epoch": 0.3776167187162414, + "grad_norm": 6.17994499206543, + "learning_rate": 1.7487196466140835e-05, + "loss": 1.7159, + "step": 60080 + }, + { + "epoch": 0.37767957103293853, + "grad_norm": 6.3481364250183105, + "learning_rate": 1.7486777365196182e-05, + "loss": 1.8968, + "step": 60090 + }, + { + "epoch": 0.37774242334963565, + "grad_norm": 6.469996452331543, + "learning_rate": 1.748635826425153e-05, + "loss": 1.6666, + "step": 60100 + }, + { + "epoch": 0.37780527566633276, + "grad_norm": 6.1791253089904785, + "learning_rate": 1.7485939163306876e-05, + "loss": 1.7415, + "step": 60110 + }, + { + "epoch": 0.3778681279830299, + "grad_norm": 7.888062477111816, + "learning_rate": 1.7485520062362223e-05, + "loss": 1.7466, + "step": 60120 + }, + { + "epoch": 0.377930980299727, + "grad_norm": 5.9787821769714355, + "learning_rate": 1.748510096141757e-05, + "loss": 1.6393, + "step": 60130 + }, + { + "epoch": 0.3779938326164241, + "grad_norm": 7.3440752029418945, + "learning_rate": 1.7484681860472914e-05, + "loss": 1.783, + "step": 60140 + }, + { + "epoch": 0.3780566849331212, + "grad_norm": 7.354781150817871, + "learning_rate": 1.748426275952826e-05, + "loss": 1.8741, + "step": 60150 + }, 
+ { + "epoch": 0.37811953724981834, + "grad_norm": 6.851288795471191, + "learning_rate": 1.7483843658583608e-05, + "loss": 2.0126, + "step": 60160 + }, + { + "epoch": 0.37818238956651545, + "grad_norm": 6.357627868652344, + "learning_rate": 1.7483424557638955e-05, + "loss": 1.6932, + "step": 60170 + }, + { + "epoch": 0.3782452418832125, + "grad_norm": 7.157871246337891, + "learning_rate": 1.74830054566943e-05, + "loss": 1.9015, + "step": 60180 + }, + { + "epoch": 0.37830809419990963, + "grad_norm": 6.958586692810059, + "learning_rate": 1.7482586355749646e-05, + "loss": 1.7936, + "step": 60190 + }, + { + "epoch": 0.37837094651660674, + "grad_norm": 7.132724285125732, + "learning_rate": 1.7482167254804993e-05, + "loss": 1.6724, + "step": 60200 + }, + { + "epoch": 0.37843379883330386, + "grad_norm": 6.834670066833496, + "learning_rate": 1.748174815386034e-05, + "loss": 1.6689, + "step": 60210 + }, + { + "epoch": 0.378496651150001, + "grad_norm": 6.891313552856445, + "learning_rate": 1.7481329052915687e-05, + "loss": 1.5845, + "step": 60220 + }, + { + "epoch": 0.3785595034666981, + "grad_norm": 7.128862380981445, + "learning_rate": 1.748090995197103e-05, + "loss": 1.5651, + "step": 60230 + }, + { + "epoch": 0.3786223557833952, + "grad_norm": 7.177995681762695, + "learning_rate": 1.7480490851026378e-05, + "loss": 1.806, + "step": 60240 + }, + { + "epoch": 0.3786852081000923, + "grad_norm": 7.514861106872559, + "learning_rate": 1.7480071750081725e-05, + "loss": 1.7179, + "step": 60250 + }, + { + "epoch": 0.37874806041678943, + "grad_norm": 7.260415077209473, + "learning_rate": 1.7479652649137072e-05, + "loss": 1.6272, + "step": 60260 + }, + { + "epoch": 0.37881091273348655, + "grad_norm": 8.503983497619629, + "learning_rate": 1.747923354819242e-05, + "loss": 1.7617, + "step": 60270 + }, + { + "epoch": 0.37887376505018366, + "grad_norm": 7.926694869995117, + "learning_rate": 1.7478814447247766e-05, + "loss": 1.7532, + "step": 60280 + }, + { + "epoch": 0.3789366173668808, 
+ "grad_norm": 7.101754665374756, + "learning_rate": 1.7478395346303113e-05, + "loss": 1.8402, + "step": 60290 + }, + { + "epoch": 0.3789994696835779, + "grad_norm": 7.921205520629883, + "learning_rate": 1.747797624535846e-05, + "loss": 1.7564, + "step": 60300 + }, + { + "epoch": 0.37906232200027495, + "grad_norm": 6.938370704650879, + "learning_rate": 1.7477557144413808e-05, + "loss": 1.7927, + "step": 60310 + }, + { + "epoch": 0.37912517431697207, + "grad_norm": 7.330945014953613, + "learning_rate": 1.747713804346915e-05, + "loss": 1.9731, + "step": 60320 + }, + { + "epoch": 0.3791880266336692, + "grad_norm": 6.738017559051514, + "learning_rate": 1.7476718942524498e-05, + "loss": 2.0006, + "step": 60330 + }, + { + "epoch": 0.3792508789503663, + "grad_norm": 6.754965305328369, + "learning_rate": 1.7476299841579845e-05, + "loss": 1.7241, + "step": 60340 + }, + { + "epoch": 0.3793137312670634, + "grad_norm": 6.526510238647461, + "learning_rate": 1.7475880740635192e-05, + "loss": 1.7795, + "step": 60350 + }, + { + "epoch": 0.37937658358376053, + "grad_norm": 6.938403606414795, + "learning_rate": 1.7475461639690536e-05, + "loss": 1.7768, + "step": 60360 + }, + { + "epoch": 0.37943943590045764, + "grad_norm": 7.293728828430176, + "learning_rate": 1.7475042538745883e-05, + "loss": 1.9032, + "step": 60370 + }, + { + "epoch": 0.37950228821715476, + "grad_norm": 8.014728546142578, + "learning_rate": 1.747462343780123e-05, + "loss": 1.7381, + "step": 60380 + }, + { + "epoch": 0.3795651405338519, + "grad_norm": 6.761390209197998, + "learning_rate": 1.7474204336856577e-05, + "loss": 1.497, + "step": 60390 + }, + { + "epoch": 0.379627992850549, + "grad_norm": 5.933043003082275, + "learning_rate": 1.7473785235911924e-05, + "loss": 1.856, + "step": 60400 + }, + { + "epoch": 0.3796908451672461, + "grad_norm": 6.399038314819336, + "learning_rate": 1.7473408045061736e-05, + "loss": 1.7518, + "step": 60410 + }, + { + "epoch": 0.3797536974839432, + "grad_norm": 5.682687282562256, + 
"learning_rate": 1.7472988944117083e-05, + "loss": 1.6777, + "step": 60420 + }, + { + "epoch": 0.37981654980064034, + "grad_norm": 7.3017497062683105, + "learning_rate": 1.747256984317243e-05, + "loss": 1.853, + "step": 60430 + }, + { + "epoch": 0.3798794021173374, + "grad_norm": 7.732819557189941, + "learning_rate": 1.7472150742227774e-05, + "loss": 1.812, + "step": 60440 + }, + { + "epoch": 0.3799422544340345, + "grad_norm": 8.50419807434082, + "learning_rate": 1.747173164128312e-05, + "loss": 1.9775, + "step": 60450 + }, + { + "epoch": 0.3800051067507316, + "grad_norm": 6.747570514678955, + "learning_rate": 1.7471312540338468e-05, + "loss": 1.7447, + "step": 60460 + }, + { + "epoch": 0.38006795906742874, + "grad_norm": 7.3443121910095215, + "learning_rate": 1.7470893439393815e-05, + "loss": 1.5567, + "step": 60470 + }, + { + "epoch": 0.38013081138412586, + "grad_norm": 6.715612411499023, + "learning_rate": 1.747047433844916e-05, + "loss": 1.6435, + "step": 60480 + }, + { + "epoch": 0.38019366370082297, + "grad_norm": 7.518825531005859, + "learning_rate": 1.7470055237504505e-05, + "loss": 1.7859, + "step": 60490 + }, + { + "epoch": 0.3802565160175201, + "grad_norm": 6.818528652191162, + "learning_rate": 1.7469636136559853e-05, + "loss": 1.867, + "step": 60500 + }, + { + "epoch": 0.3803193683342172, + "grad_norm": 6.7483296394348145, + "learning_rate": 1.74692170356152e-05, + "loss": 1.695, + "step": 60510 + }, + { + "epoch": 0.3803822206509143, + "grad_norm": 8.435516357421875, + "learning_rate": 1.7468797934670547e-05, + "loss": 1.7875, + "step": 60520 + }, + { + "epoch": 0.38044507296761143, + "grad_norm": 6.491191387176514, + "learning_rate": 1.746837883372589e-05, + "loss": 1.8847, + "step": 60530 + }, + { + "epoch": 0.38050792528430855, + "grad_norm": 7.434518814086914, + "learning_rate": 1.7467959732781237e-05, + "loss": 1.5926, + "step": 60540 + }, + { + "epoch": 0.38057077760100566, + "grad_norm": 7.427109718322754, + "learning_rate": 
1.7467540631836585e-05, + "loss": 1.7158, + "step": 60550 + }, + { + "epoch": 0.3806336299177027, + "grad_norm": 5.753595352172852, + "learning_rate": 1.746712153089193e-05, + "loss": 1.7352, + "step": 60560 + }, + { + "epoch": 0.38069648223439984, + "grad_norm": 6.416884422302246, + "learning_rate": 1.746670242994728e-05, + "loss": 1.9335, + "step": 60570 + }, + { + "epoch": 0.38075933455109695, + "grad_norm": 6.566325664520264, + "learning_rate": 1.7466283329002626e-05, + "loss": 1.7366, + "step": 60580 + }, + { + "epoch": 0.38082218686779407, + "grad_norm": 7.040223121643066, + "learning_rate": 1.7465864228057973e-05, + "loss": 1.7657, + "step": 60590 + }, + { + "epoch": 0.3808850391844912, + "grad_norm": 6.551633834838867, + "learning_rate": 1.746544512711332e-05, + "loss": 1.8301, + "step": 60600 + }, + { + "epoch": 0.3809478915011883, + "grad_norm": 6.2591142654418945, + "learning_rate": 1.7465026026168667e-05, + "loss": 1.905, + "step": 60610 + }, + { + "epoch": 0.3810107438178854, + "grad_norm": 7.5698561668396, + "learning_rate": 1.746460692522401e-05, + "loss": 1.6883, + "step": 60620 + }, + { + "epoch": 0.3810735961345825, + "grad_norm": 6.377468585968018, + "learning_rate": 1.7464187824279358e-05, + "loss": 1.8105, + "step": 60630 + }, + { + "epoch": 0.38113644845127964, + "grad_norm": 6.358702659606934, + "learning_rate": 1.7463768723334705e-05, + "loss": 1.6391, + "step": 60640 + }, + { + "epoch": 0.38119930076797676, + "grad_norm": 7.569583892822266, + "learning_rate": 1.7463349622390052e-05, + "loss": 1.5885, + "step": 60650 + }, + { + "epoch": 0.38126215308467387, + "grad_norm": 6.374160289764404, + "learning_rate": 1.7462930521445396e-05, + "loss": 1.6662, + "step": 60660 + }, + { + "epoch": 0.381325005401371, + "grad_norm": 8.042657852172852, + "learning_rate": 1.7462511420500743e-05, + "loss": 1.6635, + "step": 60670 + }, + { + "epoch": 0.3813878577180681, + "grad_norm": 6.4280781745910645, + "learning_rate": 1.746209231955609e-05, + "loss": 
1.8211, + "step": 60680 + }, + { + "epoch": 0.38145071003476516, + "grad_norm": 7.481120586395264, + "learning_rate": 1.7461673218611437e-05, + "loss": 1.9808, + "step": 60690 + }, + { + "epoch": 0.3815135623514623, + "grad_norm": 10.263879776000977, + "learning_rate": 1.746125411766678e-05, + "loss": 1.8184, + "step": 60700 + }, + { + "epoch": 0.3815764146681594, + "grad_norm": 6.977829456329346, + "learning_rate": 1.7460835016722127e-05, + "loss": 1.8257, + "step": 60710 + }, + { + "epoch": 0.3816392669848565, + "grad_norm": 6.320673942565918, + "learning_rate": 1.7460415915777475e-05, + "loss": 1.71, + "step": 60720 + }, + { + "epoch": 0.3817021193015536, + "grad_norm": 7.3813886642456055, + "learning_rate": 1.745999681483282e-05, + "loss": 1.7503, + "step": 60730 + }, + { + "epoch": 0.38176497161825074, + "grad_norm": 6.991902828216553, + "learning_rate": 1.745957771388817e-05, + "loss": 1.9265, + "step": 60740 + }, + { + "epoch": 0.38182782393494785, + "grad_norm": 7.4620256423950195, + "learning_rate": 1.7459158612943516e-05, + "loss": 1.5813, + "step": 60750 + }, + { + "epoch": 0.38189067625164497, + "grad_norm": 7.323355197906494, + "learning_rate": 1.7458739511998863e-05, + "loss": 1.665, + "step": 60760 + }, + { + "epoch": 0.3819535285683421, + "grad_norm": 6.123446941375732, + "learning_rate": 1.7458320411054207e-05, + "loss": 1.5442, + "step": 60770 + }, + { + "epoch": 0.3820163808850392, + "grad_norm": 7.154541492462158, + "learning_rate": 1.7457901310109554e-05, + "loss": 1.6706, + "step": 60780 + }, + { + "epoch": 0.3820792332017363, + "grad_norm": 7.16757869720459, + "learning_rate": 1.74574822091649e-05, + "loss": 1.778, + "step": 60790 + }, + { + "epoch": 0.3821420855184334, + "grad_norm": 7.341074466705322, + "learning_rate": 1.7457063108220248e-05, + "loss": 1.7444, + "step": 60800 + }, + { + "epoch": 0.38220493783513054, + "grad_norm": 7.525569915771484, + "learning_rate": 1.7456644007275595e-05, + "loss": 1.7723, + "step": 60810 + }, + { + 
"epoch": 0.3822677901518276, + "grad_norm": 5.7784423828125, + "learning_rate": 1.7456224906330942e-05, + "loss": 1.8289, + "step": 60820 + }, + { + "epoch": 0.3823306424685247, + "grad_norm": 6.461688995361328, + "learning_rate": 1.745580580538629e-05, + "loss": 1.7749, + "step": 60830 + }, + { + "epoch": 0.38239349478522183, + "grad_norm": 7.5797529220581055, + "learning_rate": 1.7455386704441633e-05, + "loss": 1.7896, + "step": 60840 + }, + { + "epoch": 0.38245634710191895, + "grad_norm": 6.6320624351501465, + "learning_rate": 1.745496760349698e-05, + "loss": 1.7124, + "step": 60850 + }, + { + "epoch": 0.38251919941861606, + "grad_norm": 6.536380767822266, + "learning_rate": 1.7454548502552327e-05, + "loss": 1.7042, + "step": 60860 + }, + { + "epoch": 0.3825820517353132, + "grad_norm": 6.43516731262207, + "learning_rate": 1.7454129401607674e-05, + "loss": 1.613, + "step": 60870 + }, + { + "epoch": 0.3826449040520103, + "grad_norm": 6.561049461364746, + "learning_rate": 1.7453710300663018e-05, + "loss": 1.6386, + "step": 60880 + }, + { + "epoch": 0.3827077563687074, + "grad_norm": 6.353400230407715, + "learning_rate": 1.7453291199718365e-05, + "loss": 1.6232, + "step": 60890 + }, + { + "epoch": 0.3827706086854045, + "grad_norm": 8.064393043518066, + "learning_rate": 1.745287209877371e-05, + "loss": 1.7805, + "step": 60900 + }, + { + "epoch": 0.38283346100210164, + "grad_norm": 8.582117080688477, + "learning_rate": 1.745245299782906e-05, + "loss": 1.9209, + "step": 60910 + }, + { + "epoch": 0.38289631331879875, + "grad_norm": 6.980828762054443, + "learning_rate": 1.7452033896884406e-05, + "loss": 1.6657, + "step": 60920 + }, + { + "epoch": 0.38295916563549587, + "grad_norm": 5.844123363494873, + "learning_rate": 1.745161479593975e-05, + "loss": 1.7771, + "step": 60930 + }, + { + "epoch": 0.383022017952193, + "grad_norm": 7.514682292938232, + "learning_rate": 1.7451195694995097e-05, + "loss": 1.8881, + "step": 60940 + }, + { + "epoch": 0.38308487026889004, + 
"grad_norm": 7.010300159454346, + "learning_rate": 1.7450776594050444e-05, + "loss": 1.6587, + "step": 60950 + }, + { + "epoch": 0.38314772258558716, + "grad_norm": 8.163060188293457, + "learning_rate": 1.745035749310579e-05, + "loss": 1.8552, + "step": 60960 + }, + { + "epoch": 0.3832105749022843, + "grad_norm": 5.847114562988281, + "learning_rate": 1.7449938392161138e-05, + "loss": 1.7179, + "step": 60970 + }, + { + "epoch": 0.3832734272189814, + "grad_norm": 6.140045166015625, + "learning_rate": 1.7449519291216485e-05, + "loss": 1.9446, + "step": 60980 + }, + { + "epoch": 0.3833362795356785, + "grad_norm": 6.357498645782471, + "learning_rate": 1.7449100190271832e-05, + "loss": 1.8126, + "step": 60990 + }, + { + "epoch": 0.3833991318523756, + "grad_norm": 7.207118988037109, + "learning_rate": 1.744868108932718e-05, + "loss": 1.9658, + "step": 61000 + }, + { + "epoch": 0.38346198416907273, + "grad_norm": 6.827266693115234, + "learning_rate": 1.7448261988382523e-05, + "loss": 2.061, + "step": 61010 + }, + { + "epoch": 0.38352483648576985, + "grad_norm": 7.136649131774902, + "learning_rate": 1.744784288743787e-05, + "loss": 1.5873, + "step": 61020 + }, + { + "epoch": 0.38358768880246696, + "grad_norm": 7.03965950012207, + "learning_rate": 1.744746569658768e-05, + "loss": 1.7496, + "step": 61030 + }, + { + "epoch": 0.3836505411191641, + "grad_norm": 6.262448310852051, + "learning_rate": 1.7447046595643028e-05, + "loss": 1.8426, + "step": 61040 + }, + { + "epoch": 0.3837133934358612, + "grad_norm": 7.471109390258789, + "learning_rate": 1.7446627494698375e-05, + "loss": 1.7518, + "step": 61050 + }, + { + "epoch": 0.3837762457525583, + "grad_norm": 6.2768940925598145, + "learning_rate": 1.7446208393753722e-05, + "loss": 1.6824, + "step": 61060 + }, + { + "epoch": 0.38383909806925537, + "grad_norm": 7.512392520904541, + "learning_rate": 1.744578929280907e-05, + "loss": 1.7053, + "step": 61070 + }, + { + "epoch": 0.3839019503859525, + "grad_norm": 6.868130207061768, + 
"learning_rate": 1.7445370191864413e-05, + "loss": 1.7538, + "step": 61080 + }, + { + "epoch": 0.3839648027026496, + "grad_norm": 6.069267749786377, + "learning_rate": 1.744495109091976e-05, + "loss": 1.7242, + "step": 61090 + }, + { + "epoch": 0.3840276550193467, + "grad_norm": 6.6341376304626465, + "learning_rate": 1.7444531989975107e-05, + "loss": 1.8129, + "step": 61100 + }, + { + "epoch": 0.38409050733604383, + "grad_norm": 7.144861221313477, + "learning_rate": 1.7444112889030454e-05, + "loss": 1.5183, + "step": 61110 + }, + { + "epoch": 0.38415335965274094, + "grad_norm": 6.238708972930908, + "learning_rate": 1.74436937880858e-05, + "loss": 2.1609, + "step": 61120 + }, + { + "epoch": 0.38421621196943806, + "grad_norm": 7.683382034301758, + "learning_rate": 1.7443274687141145e-05, + "loss": 1.8292, + "step": 61130 + }, + { + "epoch": 0.3842790642861352, + "grad_norm": 6.920597553253174, + "learning_rate": 1.7442855586196492e-05, + "loss": 1.4433, + "step": 61140 + }, + { + "epoch": 0.3843419166028323, + "grad_norm": 6.853091239929199, + "learning_rate": 1.744243648525184e-05, + "loss": 1.727, + "step": 61150 + }, + { + "epoch": 0.3844047689195294, + "grad_norm": 6.160371780395508, + "learning_rate": 1.7442017384307186e-05, + "loss": 1.6501, + "step": 61160 + }, + { + "epoch": 0.3844676212362265, + "grad_norm": 8.203758239746094, + "learning_rate": 1.7441598283362533e-05, + "loss": 1.7284, + "step": 61170 + }, + { + "epoch": 0.38453047355292364, + "grad_norm": 8.511927604675293, + "learning_rate": 1.7441179182417877e-05, + "loss": 1.6751, + "step": 61180 + }, + { + "epoch": 0.38459332586962075, + "grad_norm": 8.361741065979004, + "learning_rate": 1.7440760081473224e-05, + "loss": 1.8765, + "step": 61190 + }, + { + "epoch": 0.3846561781863178, + "grad_norm": 6.204860210418701, + "learning_rate": 1.744034098052857e-05, + "loss": 1.8143, + "step": 61200 + }, + { + "epoch": 0.3847190305030149, + "grad_norm": 6.404733657836914, + "learning_rate": 
1.7439921879583918e-05, + "loss": 1.8478, + "step": 61210 + }, + { + "epoch": 0.38478188281971204, + "grad_norm": 6.132648944854736, + "learning_rate": 1.7439502778639262e-05, + "loss": 1.6354, + "step": 61220 + }, + { + "epoch": 0.38484473513640916, + "grad_norm": 7.414051532745361, + "learning_rate": 1.743908367769461e-05, + "loss": 1.6365, + "step": 61230 + }, + { + "epoch": 0.38490758745310627, + "grad_norm": 6.387333393096924, + "learning_rate": 1.7438664576749956e-05, + "loss": 1.4594, + "step": 61240 + }, + { + "epoch": 0.3849704397698034, + "grad_norm": 6.648262977600098, + "learning_rate": 1.7438245475805303e-05, + "loss": 1.914, + "step": 61250 + }, + { + "epoch": 0.3850332920865005, + "grad_norm": 7.251654624938965, + "learning_rate": 1.743782637486065e-05, + "loss": 1.6843, + "step": 61260 + }, + { + "epoch": 0.3850961444031976, + "grad_norm": 7.042481422424316, + "learning_rate": 1.7437407273915997e-05, + "loss": 1.7401, + "step": 61270 + }, + { + "epoch": 0.38515899671989473, + "grad_norm": 6.673430919647217, + "learning_rate": 1.7436988172971344e-05, + "loss": 1.4713, + "step": 61280 + }, + { + "epoch": 0.38522184903659185, + "grad_norm": 6.783602237701416, + "learning_rate": 1.743656907202669e-05, + "loss": 1.7352, + "step": 61290 + }, + { + "epoch": 0.38528470135328896, + "grad_norm": 6.437345027923584, + "learning_rate": 1.743614997108204e-05, + "loss": 1.7249, + "step": 61300 + }, + { + "epoch": 0.3853475536699861, + "grad_norm": 8.222586631774902, + "learning_rate": 1.7435730870137382e-05, + "loss": 1.7668, + "step": 61310 + }, + { + "epoch": 0.3854104059866832, + "grad_norm": 7.5672383308410645, + "learning_rate": 1.743531176919273e-05, + "loss": 1.9099, + "step": 61320 + }, + { + "epoch": 0.38547325830338025, + "grad_norm": 6.734484672546387, + "learning_rate": 1.7434892668248076e-05, + "loss": 1.7436, + "step": 61330 + }, + { + "epoch": 0.38553611062007737, + "grad_norm": 7.042230129241943, + "learning_rate": 1.7434473567303423e-05, + "loss": 
1.6302, + "step": 61340 + }, + { + "epoch": 0.3855989629367745, + "grad_norm": 6.5508551597595215, + "learning_rate": 1.743405446635877e-05, + "loss": 1.6532, + "step": 61350 + }, + { + "epoch": 0.3856618152534716, + "grad_norm": 5.1469502449035645, + "learning_rate": 1.7433635365414114e-05, + "loss": 1.4144, + "step": 61360 + }, + { + "epoch": 0.3857246675701687, + "grad_norm": 6.1664652824401855, + "learning_rate": 1.743321626446946e-05, + "loss": 1.5423, + "step": 61370 + }, + { + "epoch": 0.3857875198868658, + "grad_norm": 7.240113735198975, + "learning_rate": 1.7432797163524808e-05, + "loss": 1.6542, + "step": 61380 + }, + { + "epoch": 0.38585037220356294, + "grad_norm": 7.81046199798584, + "learning_rate": 1.7432378062580155e-05, + "loss": 1.9639, + "step": 61390 + }, + { + "epoch": 0.38591322452026006, + "grad_norm": 7.0568037033081055, + "learning_rate": 1.74319589616355e-05, + "loss": 1.9366, + "step": 61400 + }, + { + "epoch": 0.38597607683695717, + "grad_norm": 6.542177677154541, + "learning_rate": 1.7431539860690846e-05, + "loss": 1.7901, + "step": 61410 + }, + { + "epoch": 0.3860389291536543, + "grad_norm": 7.521069049835205, + "learning_rate": 1.7431120759746193e-05, + "loss": 1.753, + "step": 61420 + }, + { + "epoch": 0.3861017814703514, + "grad_norm": 6.601457118988037, + "learning_rate": 1.743070165880154e-05, + "loss": 1.7678, + "step": 61430 + }, + { + "epoch": 0.3861646337870485, + "grad_norm": 7.136099338531494, + "learning_rate": 1.7430282557856887e-05, + "loss": 1.5849, + "step": 61440 + }, + { + "epoch": 0.38622748610374563, + "grad_norm": 7.009994983673096, + "learning_rate": 1.7429863456912234e-05, + "loss": 1.493, + "step": 61450 + }, + { + "epoch": 0.3862903384204427, + "grad_norm": 6.2310686111450195, + "learning_rate": 1.7429444355967578e-05, + "loss": 1.7945, + "step": 61460 + }, + { + "epoch": 0.3863531907371398, + "grad_norm": 5.899852275848389, + "learning_rate": 1.7429025255022925e-05, + "loss": 1.7684, + "step": 61470 + }, + { + 
"epoch": 0.3864160430538369, + "grad_norm": 6.108722686767578, + "learning_rate": 1.7428606154078272e-05, + "loss": 1.7854, + "step": 61480 + }, + { + "epoch": 0.38647889537053404, + "grad_norm": 6.275115489959717, + "learning_rate": 1.742818705313362e-05, + "loss": 1.6947, + "step": 61490 + }, + { + "epoch": 0.38654174768723115, + "grad_norm": 6.80659294128418, + "learning_rate": 1.7427767952188966e-05, + "loss": 1.7221, + "step": 61500 + }, + { + "epoch": 0.38660460000392827, + "grad_norm": 6.598296642303467, + "learning_rate": 1.7427348851244313e-05, + "loss": 1.7728, + "step": 61510 + }, + { + "epoch": 0.3866674523206254, + "grad_norm": 6.046689510345459, + "learning_rate": 1.742692975029966e-05, + "loss": 1.8357, + "step": 61520 + }, + { + "epoch": 0.3867303046373225, + "grad_norm": 6.522720813751221, + "learning_rate": 1.7426510649355004e-05, + "loss": 1.8675, + "step": 61530 + }, + { + "epoch": 0.3867931569540196, + "grad_norm": 8.329684257507324, + "learning_rate": 1.742609154841035e-05, + "loss": 1.8171, + "step": 61540 + }, + { + "epoch": 0.3868560092707167, + "grad_norm": 5.505519390106201, + "learning_rate": 1.7425672447465698e-05, + "loss": 1.5185, + "step": 61550 + }, + { + "epoch": 0.38691886158741384, + "grad_norm": 7.60694694519043, + "learning_rate": 1.7425253346521045e-05, + "loss": 1.8365, + "step": 61560 + }, + { + "epoch": 0.38698171390411096, + "grad_norm": 5.743784427642822, + "learning_rate": 1.7424834245576392e-05, + "loss": 1.7219, + "step": 61570 + }, + { + "epoch": 0.3870445662208081, + "grad_norm": 6.197338104248047, + "learning_rate": 1.7424415144631736e-05, + "loss": 1.6295, + "step": 61580 + }, + { + "epoch": 0.38710741853750513, + "grad_norm": 7.487074851989746, + "learning_rate": 1.7423996043687083e-05, + "loss": 1.9203, + "step": 61590 + }, + { + "epoch": 0.38717027085420225, + "grad_norm": 6.8445515632629395, + "learning_rate": 1.742357694274243e-05, + "loss": 1.6947, + "step": 61600 + }, + { + "epoch": 0.38723312317089936, + 
"grad_norm": 7.205432891845703, + "learning_rate": 1.7423157841797777e-05, + "loss": 1.6026, + "step": 61610 + }, + { + "epoch": 0.3872959754875965, + "grad_norm": 6.194027423858643, + "learning_rate": 1.742273874085312e-05, + "loss": 1.7346, + "step": 61620 + }, + { + "epoch": 0.3873588278042936, + "grad_norm": 6.567793369293213, + "learning_rate": 1.7422319639908468e-05, + "loss": 1.7449, + "step": 61630 + }, + { + "epoch": 0.3874216801209907, + "grad_norm": 5.166746616363525, + "learning_rate": 1.7421900538963815e-05, + "loss": 1.7528, + "step": 61640 + }, + { + "epoch": 0.3874845324376878, + "grad_norm": 6.169304847717285, + "learning_rate": 1.7421481438019162e-05, + "loss": 1.4929, + "step": 61650 + }, + { + "epoch": 0.38754738475438494, + "grad_norm": 7.2083587646484375, + "learning_rate": 1.742106233707451e-05, + "loss": 1.7279, + "step": 61660 + }, + { + "epoch": 0.38761023707108205, + "grad_norm": 7.021596908569336, + "learning_rate": 1.7420643236129856e-05, + "loss": 1.8054, + "step": 61670 + }, + { + "epoch": 0.38767308938777917, + "grad_norm": 6.886139392852783, + "learning_rate": 1.7420224135185203e-05, + "loss": 1.7988, + "step": 61680 + }, + { + "epoch": 0.3877359417044763, + "grad_norm": 6.56005859375, + "learning_rate": 1.741980503424055e-05, + "loss": 1.4582, + "step": 61690 + }, + { + "epoch": 0.3877987940211734, + "grad_norm": 7.746946334838867, + "learning_rate": 1.7419385933295898e-05, + "loss": 1.7964, + "step": 61700 + }, + { + "epoch": 0.38786164633787046, + "grad_norm": 7.342988967895508, + "learning_rate": 1.741896683235124e-05, + "loss": 1.9857, + "step": 61710 + }, + { + "epoch": 0.3879244986545676, + "grad_norm": 7.007579326629639, + "learning_rate": 1.741854773140659e-05, + "loss": 1.6829, + "step": 61720 + }, + { + "epoch": 0.3879873509712647, + "grad_norm": 7.58625602722168, + "learning_rate": 1.7418128630461935e-05, + "loss": 1.779, + "step": 61730 + }, + { + "epoch": 0.3880502032879618, + "grad_norm": 6.65321683883667, + 
"learning_rate": 1.7417709529517282e-05, + "loss": 1.8255, + "step": 61740 + }, + { + "epoch": 0.3881130556046589, + "grad_norm": 5.722879886627197, + "learning_rate": 1.741729042857263e-05, + "loss": 1.7794, + "step": 61750 + }, + { + "epoch": 0.38817590792135603, + "grad_norm": 5.910167217254639, + "learning_rate": 1.7416871327627973e-05, + "loss": 1.7098, + "step": 61760 + }, + { + "epoch": 0.38823876023805315, + "grad_norm": 7.410051345825195, + "learning_rate": 1.741645222668332e-05, + "loss": 1.9137, + "step": 61770 + }, + { + "epoch": 0.38830161255475026, + "grad_norm": 11.012842178344727, + "learning_rate": 1.7416033125738667e-05, + "loss": 1.9382, + "step": 61780 + }, + { + "epoch": 0.3883644648714474, + "grad_norm": 5.913793563842773, + "learning_rate": 1.7415614024794014e-05, + "loss": 1.7182, + "step": 61790 + }, + { + "epoch": 0.3884273171881445, + "grad_norm": 6.666558742523193, + "learning_rate": 1.7415194923849358e-05, + "loss": 1.6937, + "step": 61800 + }, + { + "epoch": 0.3884901695048416, + "grad_norm": 6.517996311187744, + "learning_rate": 1.7414775822904705e-05, + "loss": 1.3727, + "step": 61810 + }, + { + "epoch": 0.3885530218215387, + "grad_norm": 5.659692764282227, + "learning_rate": 1.7414356721960052e-05, + "loss": 1.7622, + "step": 61820 + }, + { + "epoch": 0.38861587413823584, + "grad_norm": 6.496676445007324, + "learning_rate": 1.74139376210154e-05, + "loss": 1.7122, + "step": 61830 + }, + { + "epoch": 0.3886787264549329, + "grad_norm": 6.02579927444458, + "learning_rate": 1.7413518520070743e-05, + "loss": 1.5066, + "step": 61840 + }, + { + "epoch": 0.38874157877163, + "grad_norm": 6.5224785804748535, + "learning_rate": 1.741309941912609e-05, + "loss": 1.6482, + "step": 61850 + }, + { + "epoch": 0.38880443108832713, + "grad_norm": 5.896265983581543, + "learning_rate": 1.7412680318181437e-05, + "loss": 1.7601, + "step": 61860 + }, + { + "epoch": 0.38886728340502424, + "grad_norm": 7.0013275146484375, + "learning_rate": 
1.7412261217236784e-05, + "loss": 1.8265, + "step": 61870 + }, + { + "epoch": 0.38893013572172136, + "grad_norm": 7.07061243057251, + "learning_rate": 1.741184211629213e-05, + "loss": 1.85, + "step": 61880 + }, + { + "epoch": 0.3889929880384185, + "grad_norm": 7.437373161315918, + "learning_rate": 1.741142301534748e-05, + "loss": 1.8319, + "step": 61890 + }, + { + "epoch": 0.3890558403551156, + "grad_norm": 6.504358291625977, + "learning_rate": 1.7411003914402825e-05, + "loss": 1.7933, + "step": 61900 + }, + { + "epoch": 0.3891186926718127, + "grad_norm": 9.265369415283203, + "learning_rate": 1.7410584813458172e-05, + "loss": 1.9063, + "step": 61910 + }, + { + "epoch": 0.3891815449885098, + "grad_norm": 7.179880142211914, + "learning_rate": 1.741016571251352e-05, + "loss": 1.8708, + "step": 61920 + }, + { + "epoch": 0.38924439730520694, + "grad_norm": 7.085618019104004, + "learning_rate": 1.7409746611568863e-05, + "loss": 1.6742, + "step": 61930 + }, + { + "epoch": 0.38930724962190405, + "grad_norm": 6.503653049468994, + "learning_rate": 1.740932751062421e-05, + "loss": 1.7477, + "step": 61940 + }, + { + "epoch": 0.38937010193860117, + "grad_norm": 7.99690580368042, + "learning_rate": 1.7408908409679557e-05, + "loss": 1.8255, + "step": 61950 + }, + { + "epoch": 0.3894329542552983, + "grad_norm": 6.84827995300293, + "learning_rate": 1.7408489308734904e-05, + "loss": 1.7582, + "step": 61960 + }, + { + "epoch": 0.38949580657199534, + "grad_norm": 6.803924083709717, + "learning_rate": 1.740807020779025e-05, + "loss": 1.8283, + "step": 61970 + }, + { + "epoch": 0.38955865888869246, + "grad_norm": 6.952895164489746, + "learning_rate": 1.7407651106845595e-05, + "loss": 1.8876, + "step": 61980 + }, + { + "epoch": 0.38962151120538957, + "grad_norm": 6.024576187133789, + "learning_rate": 1.7407232005900942e-05, + "loss": 1.7315, + "step": 61990 + }, + { + "epoch": 0.3896843635220867, + "grad_norm": 6.177873611450195, + "learning_rate": 1.740681290495629e-05, + "loss": 
1.7841, + "step": 62000 + }, + { + "epoch": 0.3897472158387838, + "grad_norm": 7.057847499847412, + "learning_rate": 1.7406393804011636e-05, + "loss": 2.1219, + "step": 62010 + }, + { + "epoch": 0.3898100681554809, + "grad_norm": 6.767395496368408, + "learning_rate": 1.740597470306698e-05, + "loss": 1.749, + "step": 62020 + }, + { + "epoch": 0.38987292047217803, + "grad_norm": 7.810880661010742, + "learning_rate": 1.7405555602122327e-05, + "loss": 1.7328, + "step": 62030 + }, + { + "epoch": 0.38993577278887515, + "grad_norm": 7.274215221405029, + "learning_rate": 1.7405136501177674e-05, + "loss": 1.7232, + "step": 62040 + }, + { + "epoch": 0.38999862510557226, + "grad_norm": 6.604606628417969, + "learning_rate": 1.740471740023302e-05, + "loss": 1.5776, + "step": 62050 + }, + { + "epoch": 0.3900614774222694, + "grad_norm": 7.570003986358643, + "learning_rate": 1.740429829928837e-05, + "loss": 1.6579, + "step": 62060 + }, + { + "epoch": 0.3901243297389665, + "grad_norm": 7.769916534423828, + "learning_rate": 1.7403879198343715e-05, + "loss": 2.3428, + "step": 62070 + }, + { + "epoch": 0.3901871820556636, + "grad_norm": 8.162944793701172, + "learning_rate": 1.7403460097399063e-05, + "loss": 1.8261, + "step": 62080 + }, + { + "epoch": 0.3902500343723607, + "grad_norm": 7.546502113342285, + "learning_rate": 1.7403040996454406e-05, + "loss": 1.7824, + "step": 62090 + }, + { + "epoch": 0.3903128866890578, + "grad_norm": 6.176581382751465, + "learning_rate": 1.7402621895509753e-05, + "loss": 1.9418, + "step": 62100 + }, + { + "epoch": 0.3903757390057549, + "grad_norm": 7.330261707305908, + "learning_rate": 1.74022027945651e-05, + "loss": 1.6838, + "step": 62110 + }, + { + "epoch": 0.390438591322452, + "grad_norm": 7.880338668823242, + "learning_rate": 1.7401783693620447e-05, + "loss": 1.9394, + "step": 62120 + }, + { + "epoch": 0.3905014436391491, + "grad_norm": 6.187344551086426, + "learning_rate": 1.7401364592675794e-05, + "loss": 1.6459, + "step": 62130 + }, + { + 
"epoch": 0.39056429595584624, + "grad_norm": 7.3748297691345215, + "learning_rate": 1.740094549173114e-05, + "loss": 1.8857, + "step": 62140 + }, + { + "epoch": 0.39062714827254336, + "grad_norm": 7.511769771575928, + "learning_rate": 1.7400526390786485e-05, + "loss": 1.7479, + "step": 62150 + }, + { + "epoch": 0.39069000058924047, + "grad_norm": 6.027830123901367, + "learning_rate": 1.7400107289841832e-05, + "loss": 1.6398, + "step": 62160 + }, + { + "epoch": 0.3907528529059376, + "grad_norm": 6.44254732131958, + "learning_rate": 1.739968818889718e-05, + "loss": 1.885, + "step": 62170 + }, + { + "epoch": 0.3908157052226347, + "grad_norm": 7.95977783203125, + "learning_rate": 1.7399269087952526e-05, + "loss": 1.8606, + "step": 62180 + }, + { + "epoch": 0.3908785575393318, + "grad_norm": 6.054286479949951, + "learning_rate": 1.7398849987007874e-05, + "loss": 1.6732, + "step": 62190 + }, + { + "epoch": 0.39094140985602893, + "grad_norm": 7.351673603057861, + "learning_rate": 1.7398430886063217e-05, + "loss": 1.7577, + "step": 62200 + }, + { + "epoch": 0.39100426217272605, + "grad_norm": 7.569562911987305, + "learning_rate": 1.7398011785118564e-05, + "loss": 1.8935, + "step": 62210 + }, + { + "epoch": 0.3910671144894231, + "grad_norm": 6.332158088684082, + "learning_rate": 1.739759268417391e-05, + "loss": 1.5981, + "step": 62220 + }, + { + "epoch": 0.3911299668061202, + "grad_norm": 6.264937877655029, + "learning_rate": 1.739717358322926e-05, + "loss": 1.8077, + "step": 62230 + }, + { + "epoch": 0.39119281912281734, + "grad_norm": 7.290078163146973, + "learning_rate": 1.7396754482284602e-05, + "loss": 1.7831, + "step": 62240 + }, + { + "epoch": 0.39125567143951445, + "grad_norm": 7.162696361541748, + "learning_rate": 1.739633538133995e-05, + "loss": 1.9486, + "step": 62250 + }, + { + "epoch": 0.39131852375621157, + "grad_norm": 8.368673324584961, + "learning_rate": 1.7395916280395296e-05, + "loss": 1.8482, + "step": 62260 + }, + { + "epoch": 0.3913813760729087, + 
"grad_norm": 8.074396133422852, + "learning_rate": 1.7395497179450643e-05, + "loss": 2.0184, + "step": 62270 + }, + { + "epoch": 0.3914442283896058, + "grad_norm": 6.705972194671631, + "learning_rate": 1.739507807850599e-05, + "loss": 1.7899, + "step": 62280 + }, + { + "epoch": 0.3915070807063029, + "grad_norm": 7.192666053771973, + "learning_rate": 1.7394658977561337e-05, + "loss": 1.81, + "step": 62290 + }, + { + "epoch": 0.391569933023, + "grad_norm": 7.759625434875488, + "learning_rate": 1.7394239876616685e-05, + "loss": 1.4557, + "step": 62300 + }, + { + "epoch": 0.39163278533969714, + "grad_norm": 7.5127949714660645, + "learning_rate": 1.739382077567203e-05, + "loss": 1.6934, + "step": 62310 + }, + { + "epoch": 0.39169563765639426, + "grad_norm": 7.240886688232422, + "learning_rate": 1.739340167472738e-05, + "loss": 1.7088, + "step": 62320 + }, + { + "epoch": 0.3917584899730914, + "grad_norm": 7.17133092880249, + "learning_rate": 1.7392982573782722e-05, + "loss": 2.0221, + "step": 62330 + }, + { + "epoch": 0.3918213422897885, + "grad_norm": 6.620835304260254, + "learning_rate": 1.739256347283807e-05, + "loss": 1.5721, + "step": 62340 + }, + { + "epoch": 0.39188419460648555, + "grad_norm": 16.08406639099121, + "learning_rate": 1.7392144371893416e-05, + "loss": 1.9188, + "step": 62350 + }, + { + "epoch": 0.39194704692318266, + "grad_norm": 7.027464389801025, + "learning_rate": 1.7391725270948764e-05, + "loss": 1.799, + "step": 62360 + }, + { + "epoch": 0.3920098992398798, + "grad_norm": 7.47585391998291, + "learning_rate": 1.739130617000411e-05, + "loss": 1.8251, + "step": 62370 + }, + { + "epoch": 0.3920727515565769, + "grad_norm": 6.758760929107666, + "learning_rate": 1.7390887069059454e-05, + "loss": 1.6702, + "step": 62380 + }, + { + "epoch": 0.392135603873274, + "grad_norm": 8.870641708374023, + "learning_rate": 1.73904679681148e-05, + "loss": 1.7451, + "step": 62390 + }, + { + "epoch": 0.3921984561899711, + "grad_norm": 7.466734409332275, + 
"learning_rate": 1.739004886717015e-05, + "loss": 1.8321, + "step": 62400 + }, + { + "epoch": 0.39226130850666824, + "grad_norm": 7.565104961395264, + "learning_rate": 1.7389629766225496e-05, + "loss": 1.5627, + "step": 62410 + }, + { + "epoch": 0.39232416082336535, + "grad_norm": 6.403449535369873, + "learning_rate": 1.738921066528084e-05, + "loss": 1.7451, + "step": 62420 + }, + { + "epoch": 0.39238701314006247, + "grad_norm": 9.319280624389648, + "learning_rate": 1.7388791564336186e-05, + "loss": 1.9225, + "step": 62430 + }, + { + "epoch": 0.3924498654567596, + "grad_norm": 6.362649917602539, + "learning_rate": 1.7388372463391533e-05, + "loss": 1.766, + "step": 62440 + }, + { + "epoch": 0.3925127177734567, + "grad_norm": 5.756836414337158, + "learning_rate": 1.738795336244688e-05, + "loss": 1.6647, + "step": 62450 + }, + { + "epoch": 0.3925755700901538, + "grad_norm": 6.929123401641846, + "learning_rate": 1.7387534261502224e-05, + "loss": 1.5016, + "step": 62460 + }, + { + "epoch": 0.39263842240685093, + "grad_norm": 6.545899391174316, + "learning_rate": 1.738711516055757e-05, + "loss": 1.8822, + "step": 62470 + }, + { + "epoch": 0.392701274723548, + "grad_norm": 5.8037109375, + "learning_rate": 1.7386696059612918e-05, + "loss": 1.99, + "step": 62480 + }, + { + "epoch": 0.3927641270402451, + "grad_norm": 6.601983547210693, + "learning_rate": 1.7386276958668265e-05, + "loss": 1.8231, + "step": 62490 + }, + { + "epoch": 0.3928269793569422, + "grad_norm": 7.560104846954346, + "learning_rate": 1.7385857857723612e-05, + "loss": 1.8229, + "step": 62500 + }, + { + "epoch": 0.39288983167363933, + "grad_norm": 6.890317916870117, + "learning_rate": 1.738543875677896e-05, + "loss": 1.778, + "step": 62510 + }, + { + "epoch": 0.39295268399033645, + "grad_norm": 7.7869672775268555, + "learning_rate": 1.7385019655834307e-05, + "loss": 1.9573, + "step": 62520 + }, + { + "epoch": 0.39301553630703356, + "grad_norm": 7.312685489654541, + "learning_rate": 1.7384600554889654e-05, + 
"loss": 1.902, + "step": 62530 + }, + { + "epoch": 0.3930783886237307, + "grad_norm": 7.124302387237549, + "learning_rate": 1.7384181453945e-05, + "loss": 1.6969, + "step": 62540 + }, + { + "epoch": 0.3931412409404278, + "grad_norm": 6.825855731964111, + "learning_rate": 1.7383762353000344e-05, + "loss": 1.8034, + "step": 62550 + }, + { + "epoch": 0.3932040932571249, + "grad_norm": 5.885743618011475, + "learning_rate": 1.738334325205569e-05, + "loss": 1.6697, + "step": 62560 + }, + { + "epoch": 0.393266945573822, + "grad_norm": 7.019294738769531, + "learning_rate": 1.738292415111104e-05, + "loss": 1.736, + "step": 62570 + }, + { + "epoch": 0.39332979789051914, + "grad_norm": 7.226141929626465, + "learning_rate": 1.7382505050166386e-05, + "loss": 1.8095, + "step": 62580 + }, + { + "epoch": 0.39339265020721625, + "grad_norm": 6.208901405334473, + "learning_rate": 1.7382085949221733e-05, + "loss": 1.5413, + "step": 62590 + }, + { + "epoch": 0.39345550252391337, + "grad_norm": 8.056857109069824, + "learning_rate": 1.7381666848277076e-05, + "loss": 1.7156, + "step": 62600 + }, + { + "epoch": 0.39351835484061043, + "grad_norm": 7.027343273162842, + "learning_rate": 1.7381247747332423e-05, + "loss": 1.707, + "step": 62610 + }, + { + "epoch": 0.39358120715730754, + "grad_norm": 6.758009910583496, + "learning_rate": 1.738082864638777e-05, + "loss": 1.6405, + "step": 62620 + }, + { + "epoch": 0.39364405947400466, + "grad_norm": 7.0669331550598145, + "learning_rate": 1.7380409545443118e-05, + "loss": 1.6977, + "step": 62630 + }, + { + "epoch": 0.3937069117907018, + "grad_norm": 8.230191230773926, + "learning_rate": 1.737999044449846e-05, + "loss": 2.0547, + "step": 62640 + }, + { + "epoch": 0.3937697641073989, + "grad_norm": 6.7205681800842285, + "learning_rate": 1.7379571343553808e-05, + "loss": 1.7277, + "step": 62650 + }, + { + "epoch": 0.393832616424096, + "grad_norm": 6.831026077270508, + "learning_rate": 1.7379152242609155e-05, + "loss": 1.7897, + "step": 62660 + }, + { 
+ "epoch": 0.3938954687407931, + "grad_norm": 7.239831924438477, + "learning_rate": 1.7378733141664502e-05, + "loss": 1.6177, + "step": 62670 + }, + { + "epoch": 0.39395832105749023, + "grad_norm": 6.872457504272461, + "learning_rate": 1.737831404071985e-05, + "loss": 1.9241, + "step": 62680 + }, + { + "epoch": 0.39402117337418735, + "grad_norm": 8.175776481628418, + "learning_rate": 1.7377894939775197e-05, + "loss": 1.7966, + "step": 62690 + }, + { + "epoch": 0.39408402569088447, + "grad_norm": 7.143253803253174, + "learning_rate": 1.7377475838830544e-05, + "loss": 1.8268, + "step": 62700 + }, + { + "epoch": 0.3941468780075816, + "grad_norm": 5.8643269538879395, + "learning_rate": 1.7377056737885887e-05, + "loss": 1.7752, + "step": 62710 + }, + { + "epoch": 0.3942097303242787, + "grad_norm": 8.041696548461914, + "learning_rate": 1.7376637636941234e-05, + "loss": 1.8251, + "step": 62720 + }, + { + "epoch": 0.39427258264097576, + "grad_norm": 7.070854187011719, + "learning_rate": 1.737621853599658e-05, + "loss": 1.6292, + "step": 62730 + }, + { + "epoch": 0.39433543495767287, + "grad_norm": 8.591217041015625, + "learning_rate": 1.737579943505193e-05, + "loss": 1.7556, + "step": 62740 + }, + { + "epoch": 0.39439828727437, + "grad_norm": 7.1850996017456055, + "learning_rate": 1.7375380334107276e-05, + "loss": 1.8506, + "step": 62750 + }, + { + "epoch": 0.3944611395910671, + "grad_norm": 6.187245845794678, + "learning_rate": 1.7374961233162623e-05, + "loss": 1.7453, + "step": 62760 + }, + { + "epoch": 0.3945239919077642, + "grad_norm": 8.487533569335938, + "learning_rate": 1.7374542132217966e-05, + "loss": 1.9688, + "step": 62770 + }, + { + "epoch": 0.39458684422446133, + "grad_norm": 7.419347763061523, + "learning_rate": 1.7374123031273313e-05, + "loss": 1.8618, + "step": 62780 + }, + { + "epoch": 0.39464969654115845, + "grad_norm": 7.015792369842529, + "learning_rate": 1.737370393032866e-05, + "loss": 1.6115, + "step": 62790 + }, + { + "epoch": 0.39471254885785556, + 
"grad_norm": 6.974392890930176, + "learning_rate": 1.7373284829384008e-05, + "loss": 1.9494, + "step": 62800 + }, + { + "epoch": 0.3947754011745527, + "grad_norm": 6.271472930908203, + "learning_rate": 1.7372865728439355e-05, + "loss": 1.5843, + "step": 62810 + }, + { + "epoch": 0.3948382534912498, + "grad_norm": 6.81941032409668, + "learning_rate": 1.73724466274947e-05, + "loss": 1.6105, + "step": 62820 + }, + { + "epoch": 0.3949011058079469, + "grad_norm": 5.670494556427002, + "learning_rate": 1.7372027526550045e-05, + "loss": 1.6365, + "step": 62830 + }, + { + "epoch": 0.394963958124644, + "grad_norm": 7.861020565032959, + "learning_rate": 1.7371608425605392e-05, + "loss": 1.8997, + "step": 62840 + }, + { + "epoch": 0.39502681044134114, + "grad_norm": 6.499483585357666, + "learning_rate": 1.737118932466074e-05, + "loss": 1.7477, + "step": 62850 + }, + { + "epoch": 0.3950896627580382, + "grad_norm": 6.675130367279053, + "learning_rate": 1.7370770223716083e-05, + "loss": 1.7941, + "step": 62860 + }, + { + "epoch": 0.3951525150747353, + "grad_norm": 7.489518165588379, + "learning_rate": 1.737035112277143e-05, + "loss": 1.754, + "step": 62870 + }, + { + "epoch": 0.3952153673914324, + "grad_norm": 9.1337251663208, + "learning_rate": 1.7369932021826777e-05, + "loss": 1.7115, + "step": 62880 + }, + { + "epoch": 0.39527821970812954, + "grad_norm": 6.008447647094727, + "learning_rate": 1.7369512920882124e-05, + "loss": 1.8006, + "step": 62890 + }, + { + "epoch": 0.39534107202482666, + "grad_norm": 6.136394500732422, + "learning_rate": 1.736909381993747e-05, + "loss": 1.8918, + "step": 62900 + }, + { + "epoch": 0.39540392434152377, + "grad_norm": 7.775247097015381, + "learning_rate": 1.736867471899282e-05, + "loss": 1.8088, + "step": 62910 + }, + { + "epoch": 0.3954667766582209, + "grad_norm": 6.778098106384277, + "learning_rate": 1.7368255618048166e-05, + "loss": 1.7606, + "step": 62920 + }, + { + "epoch": 0.395529628974918, + "grad_norm": 6.544760704040527, + 
"learning_rate": 1.7367836517103513e-05, + "loss": 1.7363, + "step": 62930 + }, + { + "epoch": 0.3955924812916151, + "grad_norm": 6.746118068695068, + "learning_rate": 1.736741741615886e-05, + "loss": 1.8645, + "step": 62940 + }, + { + "epoch": 0.39565533360831223, + "grad_norm": 6.127651214599609, + "learning_rate": 1.7366998315214203e-05, + "loss": 1.5936, + "step": 62950 + }, + { + "epoch": 0.39571818592500935, + "grad_norm": 6.827913761138916, + "learning_rate": 1.736657921426955e-05, + "loss": 1.9603, + "step": 62960 + }, + { + "epoch": 0.39578103824170646, + "grad_norm": 7.778383731842041, + "learning_rate": 1.7366160113324898e-05, + "loss": 1.7447, + "step": 62970 + }, + { + "epoch": 0.3958438905584036, + "grad_norm": 7.143712997436523, + "learning_rate": 1.7365741012380245e-05, + "loss": 1.8716, + "step": 62980 + }, + { + "epoch": 0.39590674287510064, + "grad_norm": 5.906332969665527, + "learning_rate": 1.7365321911435592e-05, + "loss": 1.5589, + "step": 62990 + }, + { + "epoch": 0.39596959519179775, + "grad_norm": 7.487616539001465, + "learning_rate": 1.7364902810490935e-05, + "loss": 1.8074, + "step": 63000 + }, + { + "epoch": 0.39603244750849487, + "grad_norm": 6.404102325439453, + "learning_rate": 1.7364483709546282e-05, + "loss": 1.5414, + "step": 63010 + }, + { + "epoch": 0.396095299825192, + "grad_norm": 6.926724910736084, + "learning_rate": 1.736406460860163e-05, + "loss": 1.9859, + "step": 63020 + }, + { + "epoch": 0.3961581521418891, + "grad_norm": 6.548278331756592, + "learning_rate": 1.7363645507656977e-05, + "loss": 1.6036, + "step": 63030 + }, + { + "epoch": 0.3962210044585862, + "grad_norm": 7.005793571472168, + "learning_rate": 1.736322640671232e-05, + "loss": 1.9394, + "step": 63040 + }, + { + "epoch": 0.3962838567752833, + "grad_norm": 6.672649383544922, + "learning_rate": 1.7362807305767667e-05, + "loss": 1.9095, + "step": 63050 + }, + { + "epoch": 0.39634670909198044, + "grad_norm": 6.91698694229126, + "learning_rate": 
1.7362388204823014e-05, + "loss": 1.5982, + "step": 63060 + }, + { + "epoch": 0.39640956140867756, + "grad_norm": 6.953579425811768, + "learning_rate": 1.736196910387836e-05, + "loss": 1.5735, + "step": 63070 + }, + { + "epoch": 0.3964724137253747, + "grad_norm": 6.486627101898193, + "learning_rate": 1.736155000293371e-05, + "loss": 1.5392, + "step": 63080 + }, + { + "epoch": 0.3965352660420718, + "grad_norm": 7.088765621185303, + "learning_rate": 1.7361130901989052e-05, + "loss": 1.7184, + "step": 63090 + }, + { + "epoch": 0.3965981183587689, + "grad_norm": 7.858634948730469, + "learning_rate": 1.73607118010444e-05, + "loss": 1.587, + "step": 63100 + }, + { + "epoch": 0.396660970675466, + "grad_norm": 6.86468505859375, + "learning_rate": 1.7360292700099746e-05, + "loss": 1.6296, + "step": 63110 + }, + { + "epoch": 0.3967238229921631, + "grad_norm": 7.362751007080078, + "learning_rate": 1.7359873599155093e-05, + "loss": 1.8125, + "step": 63120 + }, + { + "epoch": 0.3967866753088602, + "grad_norm": 6.04849910736084, + "learning_rate": 1.735945449821044e-05, + "loss": 1.9038, + "step": 63130 + }, + { + "epoch": 0.3968495276255573, + "grad_norm": 7.203058242797852, + "learning_rate": 1.7359035397265788e-05, + "loss": 1.7335, + "step": 63140 + }, + { + "epoch": 0.3969123799422544, + "grad_norm": 7.279017448425293, + "learning_rate": 1.7358616296321135e-05, + "loss": 1.9118, + "step": 63150 + }, + { + "epoch": 0.39697523225895154, + "grad_norm": 7.012674808502197, + "learning_rate": 1.7358197195376482e-05, + "loss": 1.7824, + "step": 63160 + }, + { + "epoch": 0.39703808457564865, + "grad_norm": 7.520296573638916, + "learning_rate": 1.7357778094431825e-05, + "loss": 1.686, + "step": 63170 + }, + { + "epoch": 0.39710093689234577, + "grad_norm": 7.244654178619385, + "learning_rate": 1.7357358993487173e-05, + "loss": 1.749, + "step": 63180 + }, + { + "epoch": 0.3971637892090429, + "grad_norm": 7.64955472946167, + "learning_rate": 1.735693989254252e-05, + "loss": 1.5669, + 
"step": 63190 + }, + { + "epoch": 0.39722664152574, + "grad_norm": 8.233468055725098, + "learning_rate": 1.7356520791597867e-05, + "loss": 1.4642, + "step": 63200 + }, + { + "epoch": 0.3972894938424371, + "grad_norm": 6.241605758666992, + "learning_rate": 1.7356101690653214e-05, + "loss": 1.813, + "step": 63210 + }, + { + "epoch": 0.39735234615913423, + "grad_norm": 7.647136688232422, + "learning_rate": 1.7355682589708557e-05, + "loss": 1.7688, + "step": 63220 + }, + { + "epoch": 0.39741519847583134, + "grad_norm": 6.980048179626465, + "learning_rate": 1.7355263488763904e-05, + "loss": 1.7126, + "step": 63230 + }, + { + "epoch": 0.3974780507925284, + "grad_norm": 9.072611808776855, + "learning_rate": 1.735484438781925e-05, + "loss": 1.6422, + "step": 63240 + }, + { + "epoch": 0.3975409031092255, + "grad_norm": 7.8335041999816895, + "learning_rate": 1.73544252868746e-05, + "loss": 1.997, + "step": 63250 + }, + { + "epoch": 0.39760375542592263, + "grad_norm": 7.354805946350098, + "learning_rate": 1.7354006185929942e-05, + "loss": 1.7942, + "step": 63260 + }, + { + "epoch": 0.39766660774261975, + "grad_norm": 7.97997522354126, + "learning_rate": 1.735358708498529e-05, + "loss": 1.856, + "step": 63270 + }, + { + "epoch": 0.39772946005931686, + "grad_norm": 8.068892478942871, + "learning_rate": 1.7353167984040636e-05, + "loss": 1.7931, + "step": 63280 + }, + { + "epoch": 0.397792312376014, + "grad_norm": 6.895265102386475, + "learning_rate": 1.7352748883095984e-05, + "loss": 1.9954, + "step": 63290 + }, + { + "epoch": 0.3978551646927111, + "grad_norm": 7.230385780334473, + "learning_rate": 1.735232978215133e-05, + "loss": 1.7045, + "step": 63300 + }, + { + "epoch": 0.3979180170094082, + "grad_norm": 8.533286094665527, + "learning_rate": 1.7351910681206678e-05, + "loss": 1.5355, + "step": 63310 + }, + { + "epoch": 0.3979808693261053, + "grad_norm": 7.085193634033203, + "learning_rate": 1.7351491580262025e-05, + "loss": 1.5851, + "step": 63320 + }, + { + "epoch": 
0.39804372164280244, + "grad_norm": 6.402557849884033, + "learning_rate": 1.7351072479317372e-05, + "loss": 1.7385, + "step": 63330 + }, + { + "epoch": 0.39810657395949955, + "grad_norm": 7.216492652893066, + "learning_rate": 1.7350653378372715e-05, + "loss": 1.8626, + "step": 63340 + }, + { + "epoch": 0.39816942627619667, + "grad_norm": 7.015349388122559, + "learning_rate": 1.7350234277428063e-05, + "loss": 1.9229, + "step": 63350 + }, + { + "epoch": 0.3982322785928938, + "grad_norm": 6.161546230316162, + "learning_rate": 1.734981517648341e-05, + "loss": 1.9836, + "step": 63360 + }, + { + "epoch": 0.39829513090959084, + "grad_norm": 7.59517240524292, + "learning_rate": 1.7349396075538757e-05, + "loss": 1.6441, + "step": 63370 + }, + { + "epoch": 0.39835798322628796, + "grad_norm": 6.2155561447143555, + "learning_rate": 1.7348976974594104e-05, + "loss": 1.767, + "step": 63380 + }, + { + "epoch": 0.3984208355429851, + "grad_norm": 6.6172990798950195, + "learning_rate": 1.7348557873649447e-05, + "loss": 1.5974, + "step": 63390 + }, + { + "epoch": 0.3984836878596822, + "grad_norm": 6.872488021850586, + "learning_rate": 1.7348138772704795e-05, + "loss": 1.7944, + "step": 63400 + }, + { + "epoch": 0.3985465401763793, + "grad_norm": 6.841820240020752, + "learning_rate": 1.734771967176014e-05, + "loss": 1.7968, + "step": 63410 + }, + { + "epoch": 0.3986093924930764, + "grad_norm": 7.346033573150635, + "learning_rate": 1.734730057081549e-05, + "loss": 1.8267, + "step": 63420 + }, + { + "epoch": 0.39867224480977353, + "grad_norm": 6.4456305503845215, + "learning_rate": 1.7346881469870836e-05, + "loss": 1.8374, + "step": 63430 + }, + { + "epoch": 0.39873509712647065, + "grad_norm": 5.8930463790893555, + "learning_rate": 1.734646236892618e-05, + "loss": 1.442, + "step": 63440 + }, + { + "epoch": 0.39879794944316777, + "grad_norm": 7.168188095092773, + "learning_rate": 1.7346043267981526e-05, + "loss": 1.9853, + "step": 63450 + }, + { + "epoch": 0.3988608017598649, + 
"grad_norm": 5.9383745193481445, + "learning_rate": 1.7345624167036874e-05, + "loss": 2.0134, + "step": 63460 + }, + { + "epoch": 0.398923654076562, + "grad_norm": 6.50671911239624, + "learning_rate": 1.734520506609222e-05, + "loss": 1.6749, + "step": 63470 + }, + { + "epoch": 0.3989865063932591, + "grad_norm": 6.94907808303833, + "learning_rate": 1.7344785965147564e-05, + "loss": 1.8436, + "step": 63480 + }, + { + "epoch": 0.3990493587099562, + "grad_norm": 7.192526340484619, + "learning_rate": 1.734436686420291e-05, + "loss": 1.8829, + "step": 63490 + }, + { + "epoch": 0.3991122110266533, + "grad_norm": 6.359023571014404, + "learning_rate": 1.734394776325826e-05, + "loss": 1.8296, + "step": 63500 + }, + { + "epoch": 0.3991750633433504, + "grad_norm": 6.977709770202637, + "learning_rate": 1.7343528662313606e-05, + "loss": 1.658, + "step": 63510 + }, + { + "epoch": 0.3992379156600475, + "grad_norm": 8.42409896850586, + "learning_rate": 1.7343109561368953e-05, + "loss": 1.8929, + "step": 63520 + }, + { + "epoch": 0.39930076797674463, + "grad_norm": 6.620844841003418, + "learning_rate": 1.73426904604243e-05, + "loss": 1.7655, + "step": 63530 + }, + { + "epoch": 0.39936362029344175, + "grad_norm": 7.603841304779053, + "learning_rate": 1.7342271359479647e-05, + "loss": 1.9693, + "step": 63540 + }, + { + "epoch": 0.39942647261013886, + "grad_norm": 6.572299480438232, + "learning_rate": 1.7341852258534994e-05, + "loss": 1.5739, + "step": 63550 + }, + { + "epoch": 0.399489324926836, + "grad_norm": 6.238868713378906, + "learning_rate": 1.734143315759034e-05, + "loss": 1.9381, + "step": 63560 + }, + { + "epoch": 0.3995521772435331, + "grad_norm": 7.944374084472656, + "learning_rate": 1.7341014056645685e-05, + "loss": 1.9841, + "step": 63570 + }, + { + "epoch": 0.3996150295602302, + "grad_norm": 8.332205772399902, + "learning_rate": 1.734059495570103e-05, + "loss": 1.8276, + "step": 63580 + }, + { + "epoch": 0.3996778818769273, + "grad_norm": 7.850004196166992, + 
"learning_rate": 1.734017585475638e-05, + "loss": 1.6116, + "step": 63590 + }, + { + "epoch": 0.39974073419362444, + "grad_norm": 5.909306049346924, + "learning_rate": 1.7339756753811726e-05, + "loss": 1.7175, + "step": 63600 + }, + { + "epoch": 0.39980358651032155, + "grad_norm": 6.362872123718262, + "learning_rate": 1.7339337652867073e-05, + "loss": 1.6664, + "step": 63610 + }, + { + "epoch": 0.39986643882701867, + "grad_norm": 6.35066032409668, + "learning_rate": 1.7338918551922417e-05, + "loss": 1.5737, + "step": 63620 + }, + { + "epoch": 0.3999292911437157, + "grad_norm": 6.947273254394531, + "learning_rate": 1.7338499450977764e-05, + "loss": 1.8287, + "step": 63630 + }, + { + "epoch": 0.39999214346041284, + "grad_norm": 6.292657375335693, + "learning_rate": 1.733808035003311e-05, + "loss": 1.8216, + "step": 63640 + }, + { + "epoch": 0.40005499577710996, + "grad_norm": 6.504972457885742, + "learning_rate": 1.7337661249088458e-05, + "loss": 1.8389, + "step": 63650 + }, + { + "epoch": 0.40011784809380707, + "grad_norm": 6.173849582672119, + "learning_rate": 1.73372421481438e-05, + "loss": 1.6541, + "step": 63660 + }, + { + "epoch": 0.4001807004105042, + "grad_norm": 7.108821868896484, + "learning_rate": 1.733682304719915e-05, + "loss": 1.5928, + "step": 63670 + }, + { + "epoch": 0.4002435527272013, + "grad_norm": 5.635346412658691, + "learning_rate": 1.7336403946254496e-05, + "loss": 1.7374, + "step": 63680 + }, + { + "epoch": 0.4003064050438984, + "grad_norm": 6.614495754241943, + "learning_rate": 1.7335984845309843e-05, + "loss": 1.6753, + "step": 63690 + }, + { + "epoch": 0.40036925736059553, + "grad_norm": 6.775355815887451, + "learning_rate": 1.733556574436519e-05, + "loss": 1.7663, + "step": 63700 + }, + { + "epoch": 0.40043210967729265, + "grad_norm": 6.525432109832764, + "learning_rate": 1.7335146643420537e-05, + "loss": 1.7315, + "step": 63710 + }, + { + "epoch": 0.40049496199398976, + "grad_norm": 7.270900726318359, + "learning_rate": 
1.733472754247588e-05, + "loss": 1.9045, + "step": 63720 + }, + { + "epoch": 0.4005578143106869, + "grad_norm": 7.312255859375, + "learning_rate": 1.7334308441531228e-05, + "loss": 1.7577, + "step": 63730 + }, + { + "epoch": 0.400620666627384, + "grad_norm": 6.933480262756348, + "learning_rate": 1.7333889340586575e-05, + "loss": 1.6655, + "step": 63740 + }, + { + "epoch": 0.40068351894408105, + "grad_norm": 7.075981616973877, + "learning_rate": 1.733347023964192e-05, + "loss": 1.7332, + "step": 63750 + }, + { + "epoch": 0.40074637126077817, + "grad_norm": 7.954800128936768, + "learning_rate": 1.733305113869727e-05, + "loss": 1.8044, + "step": 63760 + }, + { + "epoch": 0.4008092235774753, + "grad_norm": 6.6571455001831055, + "learning_rate": 1.7332632037752616e-05, + "loss": 1.6542, + "step": 63770 + }, + { + "epoch": 0.4008720758941724, + "grad_norm": 6.37064266204834, + "learning_rate": 1.7332212936807963e-05, + "loss": 1.8351, + "step": 63780 + }, + { + "epoch": 0.4009349282108695, + "grad_norm": 6.50276517868042, + "learning_rate": 1.7331793835863307e-05, + "loss": 1.5103, + "step": 63790 + }, + { + "epoch": 0.4009977805275666, + "grad_norm": 5.96057653427124, + "learning_rate": 1.7331374734918654e-05, + "loss": 1.5626, + "step": 63800 + }, + { + "epoch": 0.40106063284426374, + "grad_norm": 6.829263687133789, + "learning_rate": 1.7330955633974e-05, + "loss": 1.4947, + "step": 63810 + }, + { + "epoch": 0.40112348516096086, + "grad_norm": 7.6001715660095215, + "learning_rate": 1.7330536533029348e-05, + "loss": 1.8695, + "step": 63820 + }, + { + "epoch": 0.401186337477658, + "grad_norm": 5.44834566116333, + "learning_rate": 1.7330117432084695e-05, + "loss": 1.6288, + "step": 63830 + }, + { + "epoch": 0.4012491897943551, + "grad_norm": 7.005091190338135, + "learning_rate": 1.732969833114004e-05, + "loss": 1.5295, + "step": 63840 + }, + { + "epoch": 0.4013120421110522, + "grad_norm": 6.27988338470459, + "learning_rate": 1.7329279230195386e-05, + "loss": 1.6237, + 
"step": 63850 + }, + { + "epoch": 0.4013748944277493, + "grad_norm": 8.009944915771484, + "learning_rate": 1.7328860129250733e-05, + "loss": 1.64, + "step": 63860 + }, + { + "epoch": 0.40143774674444643, + "grad_norm": 6.172440528869629, + "learning_rate": 1.732844102830608e-05, + "loss": 1.6258, + "step": 63870 + }, + { + "epoch": 0.4015005990611435, + "grad_norm": 6.858620643615723, + "learning_rate": 1.7328021927361423e-05, + "loss": 1.777, + "step": 63880 + }, + { + "epoch": 0.4015634513778406, + "grad_norm": 7.138575553894043, + "learning_rate": 1.732760282641677e-05, + "loss": 1.6494, + "step": 63890 + }, + { + "epoch": 0.4016263036945377, + "grad_norm": 7.080873012542725, + "learning_rate": 1.7327183725472118e-05, + "loss": 1.9679, + "step": 63900 + }, + { + "epoch": 0.40168915601123484, + "grad_norm": 6.977520942687988, + "learning_rate": 1.7326764624527465e-05, + "loss": 1.8074, + "step": 63910 + }, + { + "epoch": 0.40175200832793195, + "grad_norm": 6.923065662384033, + "learning_rate": 1.7326345523582812e-05, + "loss": 1.7449, + "step": 63920 + }, + { + "epoch": 0.40181486064462907, + "grad_norm": 7.034780979156494, + "learning_rate": 1.732592642263816e-05, + "loss": 1.7452, + "step": 63930 + }, + { + "epoch": 0.4018777129613262, + "grad_norm": 6.530808448791504, + "learning_rate": 1.7325507321693506e-05, + "loss": 1.4537, + "step": 63940 + }, + { + "epoch": 0.4019405652780233, + "grad_norm": 7.575244426727295, + "learning_rate": 1.7325088220748853e-05, + "loss": 1.7735, + "step": 63950 + }, + { + "epoch": 0.4020034175947204, + "grad_norm": 7.449587345123291, + "learning_rate": 1.73246691198042e-05, + "loss": 1.6958, + "step": 63960 + }, + { + "epoch": 0.40206626991141753, + "grad_norm": 6.258814811706543, + "learning_rate": 1.7324250018859544e-05, + "loss": 1.8769, + "step": 63970 + }, + { + "epoch": 0.40212912222811464, + "grad_norm": 6.345718860626221, + "learning_rate": 1.732383091791489e-05, + "loss": 1.7294, + "step": 63980 + }, + { + "epoch": 
0.40219197454481176, + "grad_norm": 7.540582656860352, + "learning_rate": 1.7323411816970238e-05, + "loss": 1.7429, + "step": 63990 + }, + { + "epoch": 0.4022548268615089, + "grad_norm": 7.103134632110596, + "learning_rate": 1.7322992716025585e-05, + "loss": 1.6767, + "step": 64000 + }, + { + "epoch": 0.40231767917820593, + "grad_norm": 5.747287750244141, + "learning_rate": 1.732257361508093e-05, + "loss": 1.5247, + "step": 64010 + }, + { + "epoch": 0.40238053149490305, + "grad_norm": 8.805617332458496, + "learning_rate": 1.7322154514136276e-05, + "loss": 1.6903, + "step": 64020 + }, + { + "epoch": 0.40244338381160016, + "grad_norm": 7.009683132171631, + "learning_rate": 1.7321735413191623e-05, + "loss": 1.7406, + "step": 64030 + }, + { + "epoch": 0.4025062361282973, + "grad_norm": 6.678035736083984, + "learning_rate": 1.732131631224697e-05, + "loss": 1.7652, + "step": 64040 + }, + { + "epoch": 0.4025690884449944, + "grad_norm": 6.537189960479736, + "learning_rate": 1.7320897211302317e-05, + "loss": 1.7145, + "step": 64050 + }, + { + "epoch": 0.4026319407616915, + "grad_norm": 6.691839218139648, + "learning_rate": 1.732047811035766e-05, + "loss": 1.9445, + "step": 64060 + }, + { + "epoch": 0.4026947930783886, + "grad_norm": 7.19918155670166, + "learning_rate": 1.7320059009413008e-05, + "loss": 1.8886, + "step": 64070 + }, + { + "epoch": 0.40275764539508574, + "grad_norm": 6.629178047180176, + "learning_rate": 1.7319639908468355e-05, + "loss": 1.7704, + "step": 64080 + }, + { + "epoch": 0.40282049771178285, + "grad_norm": 6.735805034637451, + "learning_rate": 1.7319220807523702e-05, + "loss": 1.6564, + "step": 64090 + }, + { + "epoch": 0.40288335002847997, + "grad_norm": 7.237814426422119, + "learning_rate": 1.7318801706579045e-05, + "loss": 1.6536, + "step": 64100 + }, + { + "epoch": 0.4029462023451771, + "grad_norm": 6.82144832611084, + "learning_rate": 1.7318382605634392e-05, + "loss": 1.6636, + "step": 64110 + }, + { + "epoch": 0.4030090546618742, + "grad_norm": 
6.825602054595947, + "learning_rate": 1.731796350468974e-05, + "loss": 1.8003, + "step": 64120 + }, + { + "epoch": 0.4030719069785713, + "grad_norm": 6.9392619132995605, + "learning_rate": 1.7317544403745087e-05, + "loss": 1.6909, + "step": 64130 + }, + { + "epoch": 0.4031347592952684, + "grad_norm": 7.321532249450684, + "learning_rate": 1.7317125302800434e-05, + "loss": 1.8516, + "step": 64140 + }, + { + "epoch": 0.4031976116119655, + "grad_norm": 7.79758882522583, + "learning_rate": 1.731670620185578e-05, + "loss": 1.7862, + "step": 64150 + }, + { + "epoch": 0.4032604639286626, + "grad_norm": 6.494131565093994, + "learning_rate": 1.7316287100911128e-05, + "loss": 1.7009, + "step": 64160 + }, + { + "epoch": 0.4033233162453597, + "grad_norm": 6.491816997528076, + "learning_rate": 1.7315867999966475e-05, + "loss": 1.7408, + "step": 64170 + }, + { + "epoch": 0.40338616856205683, + "grad_norm": 6.526629447937012, + "learning_rate": 1.7315448899021822e-05, + "loss": 1.7, + "step": 64180 + }, + { + "epoch": 0.40344902087875395, + "grad_norm": 8.186490058898926, + "learning_rate": 1.7315029798077166e-05, + "loss": 1.7309, + "step": 64190 + }, + { + "epoch": 0.40351187319545107, + "grad_norm": 7.1548638343811035, + "learning_rate": 1.7314610697132513e-05, + "loss": 1.827, + "step": 64200 + }, + { + "epoch": 0.4035747255121482, + "grad_norm": 7.3159918785095215, + "learning_rate": 1.731419159618786e-05, + "loss": 1.644, + "step": 64210 + }, + { + "epoch": 0.4036375778288453, + "grad_norm": 7.803012847900391, + "learning_rate": 1.7313772495243207e-05, + "loss": 1.7641, + "step": 64220 + }, + { + "epoch": 0.4037004301455424, + "grad_norm": 7.93739652633667, + "learning_rate": 1.7313353394298554e-05, + "loss": 1.6864, + "step": 64230 + }, + { + "epoch": 0.4037632824622395, + "grad_norm": 7.197110652923584, + "learning_rate": 1.7312934293353898e-05, + "loss": 1.8326, + "step": 64240 + }, + { + "epoch": 0.40382613477893664, + "grad_norm": 6.009327411651611, + "learning_rate": 
1.7312515192409245e-05, + "loss": 1.6372, + "step": 64250 + }, + { + "epoch": 0.40388898709563376, + "grad_norm": 6.794830799102783, + "learning_rate": 1.7312096091464592e-05, + "loss": 1.9629, + "step": 64260 + }, + { + "epoch": 0.4039518394123308, + "grad_norm": 6.994704723358154, + "learning_rate": 1.731167699051994e-05, + "loss": 1.9373, + "step": 64270 + }, + { + "epoch": 0.40401469172902793, + "grad_norm": 7.244943141937256, + "learning_rate": 1.7311257889575283e-05, + "loss": 1.616, + "step": 64280 + }, + { + "epoch": 0.40407754404572505, + "grad_norm": 5.833102703094482, + "learning_rate": 1.731083878863063e-05, + "loss": 1.801, + "step": 64290 + }, + { + "epoch": 0.40414039636242216, + "grad_norm": 7.377758502960205, + "learning_rate": 1.7310419687685977e-05, + "loss": 1.7893, + "step": 64300 + }, + { + "epoch": 0.4042032486791193, + "grad_norm": 6.23237419128418, + "learning_rate": 1.7310000586741324e-05, + "loss": 1.7011, + "step": 64310 + }, + { + "epoch": 0.4042661009958164, + "grad_norm": 7.188849449157715, + "learning_rate": 1.730958148579667e-05, + "loss": 1.6293, + "step": 64320 + }, + { + "epoch": 0.4043289533125135, + "grad_norm": 7.2781243324279785, + "learning_rate": 1.7309162384852018e-05, + "loss": 1.7722, + "step": 64330 + }, + { + "epoch": 0.4043918056292106, + "grad_norm": 6.869524955749512, + "learning_rate": 1.730874328390736e-05, + "loss": 1.8963, + "step": 64340 + }, + { + "epoch": 0.40445465794590774, + "grad_norm": 8.475284576416016, + "learning_rate": 1.730832418296271e-05, + "loss": 1.7004, + "step": 64350 + }, + { + "epoch": 0.40451751026260485, + "grad_norm": 5.951559543609619, + "learning_rate": 1.7307905082018056e-05, + "loss": 1.4649, + "step": 64360 + }, + { + "epoch": 0.40458036257930197, + "grad_norm": 8.366894721984863, + "learning_rate": 1.7307485981073403e-05, + "loss": 1.7709, + "step": 64370 + }, + { + "epoch": 0.4046432148959991, + "grad_norm": 6.629880428314209, + "learning_rate": 1.730706688012875e-05, + "loss": 
1.7315, + "step": 64380 + }, + { + "epoch": 0.40470606721269614, + "grad_norm": 6.540175914764404, + "learning_rate": 1.7306647779184097e-05, + "loss": 1.691, + "step": 64390 + }, + { + "epoch": 0.40476891952939326, + "grad_norm": 7.060180187225342, + "learning_rate": 1.7306228678239444e-05, + "loss": 1.7188, + "step": 64400 + }, + { + "epoch": 0.40483177184609037, + "grad_norm": 8.70560073852539, + "learning_rate": 1.7305809577294788e-05, + "loss": 1.8337, + "step": 64410 + }, + { + "epoch": 0.4048946241627875, + "grad_norm": 15.648215293884277, + "learning_rate": 1.7305390476350135e-05, + "loss": 1.5735, + "step": 64420 + }, + { + "epoch": 0.4049574764794846, + "grad_norm": 6.112917900085449, + "learning_rate": 1.7304971375405482e-05, + "loss": 1.5983, + "step": 64430 + }, + { + "epoch": 0.4050203287961817, + "grad_norm": 6.319818019866943, + "learning_rate": 1.730455227446083e-05, + "loss": 1.8616, + "step": 64440 + }, + { + "epoch": 0.40508318111287883, + "grad_norm": 6.186047554016113, + "learning_rate": 1.7304133173516176e-05, + "loss": 1.515, + "step": 64450 + }, + { + "epoch": 0.40514603342957595, + "grad_norm": 7.4625139236450195, + "learning_rate": 1.730371407257152e-05, + "loss": 1.6605, + "step": 64460 + }, + { + "epoch": 0.40520888574627306, + "grad_norm": 5.847615718841553, + "learning_rate": 1.7303294971626867e-05, + "loss": 1.6329, + "step": 64470 + }, + { + "epoch": 0.4052717380629702, + "grad_norm": 7.5985894203186035, + "learning_rate": 1.7302875870682214e-05, + "loss": 1.7322, + "step": 64480 + }, + { + "epoch": 0.4053345903796673, + "grad_norm": 6.878131866455078, + "learning_rate": 1.730245676973756e-05, + "loss": 1.8163, + "step": 64490 + }, + { + "epoch": 0.4053974426963644, + "grad_norm": 6.180505275726318, + "learning_rate": 1.7302037668792905e-05, + "loss": 1.566, + "step": 64500 + }, + { + "epoch": 0.4054602950130615, + "grad_norm": 7.333800315856934, + "learning_rate": 1.730161856784825e-05, + "loss": 1.8852, + "step": 64510 + }, + { + 
"epoch": 0.4055231473297586, + "grad_norm": 6.708546161651611, + "learning_rate": 1.73011994669036e-05, + "loss": 1.7546, + "step": 64520 + }, + { + "epoch": 0.4055859996464557, + "grad_norm": 6.404630661010742, + "learning_rate": 1.7300780365958946e-05, + "loss": 1.9791, + "step": 64530 + }, + { + "epoch": 0.4056488519631528, + "grad_norm": 6.5063557624816895, + "learning_rate": 1.7300361265014293e-05, + "loss": 1.8629, + "step": 64540 + }, + { + "epoch": 0.4057117042798499, + "grad_norm": 6.890121936798096, + "learning_rate": 1.729994216406964e-05, + "loss": 1.8914, + "step": 64550 + }, + { + "epoch": 0.40577455659654704, + "grad_norm": 6.5508012771606445, + "learning_rate": 1.7299523063124987e-05, + "loss": 1.8905, + "step": 64560 + }, + { + "epoch": 0.40583740891324416, + "grad_norm": 6.7240447998046875, + "learning_rate": 1.7299103962180334e-05, + "loss": 1.6553, + "step": 64570 + }, + { + "epoch": 0.4059002612299413, + "grad_norm": 7.4773077964782715, + "learning_rate": 1.729868486123568e-05, + "loss": 1.7865, + "step": 64580 + }, + { + "epoch": 0.4059631135466384, + "grad_norm": 6.3153533935546875, + "learning_rate": 1.7298265760291025e-05, + "loss": 1.7086, + "step": 64590 + }, + { + "epoch": 0.4060259658633355, + "grad_norm": 7.008026123046875, + "learning_rate": 1.7297846659346372e-05, + "loss": 1.5691, + "step": 64600 + }, + { + "epoch": 0.4060888181800326, + "grad_norm": 7.000103950500488, + "learning_rate": 1.729742755840172e-05, + "loss": 2.2333, + "step": 64610 + }, + { + "epoch": 0.40615167049672973, + "grad_norm": 6.496999263763428, + "learning_rate": 1.7297008457457066e-05, + "loss": 1.6231, + "step": 64620 + }, + { + "epoch": 0.40621452281342685, + "grad_norm": 6.620175361633301, + "learning_rate": 1.729658935651241e-05, + "loss": 1.753, + "step": 64630 + }, + { + "epoch": 0.40627737513012396, + "grad_norm": 6.827455043792725, + "learning_rate": 1.7296170255567757e-05, + "loss": 1.7794, + "step": 64640 + }, + { + "epoch": 0.406340227446821, + 
"grad_norm": 8.18285846710205, + "learning_rate": 1.7295751154623104e-05, + "loss": 1.7071, + "step": 64650 + }, + { + "epoch": 0.40640307976351814, + "grad_norm": 6.997250556945801, + "learning_rate": 1.729533205367845e-05, + "loss": 1.8244, + "step": 64660 + }, + { + "epoch": 0.40646593208021525, + "grad_norm": 7.275681972503662, + "learning_rate": 1.7294912952733798e-05, + "loss": 1.8053, + "step": 64670 + }, + { + "epoch": 0.40652878439691237, + "grad_norm": 6.914024353027344, + "learning_rate": 1.729449385178914e-05, + "loss": 1.6802, + "step": 64680 + }, + { + "epoch": 0.4065916367136095, + "grad_norm": 7.151669979095459, + "learning_rate": 1.729407475084449e-05, + "loss": 1.5671, + "step": 64690 + }, + { + "epoch": 0.4066544890303066, + "grad_norm": 7.415504455566406, + "learning_rate": 1.7293655649899836e-05, + "loss": 1.667, + "step": 64700 + }, + { + "epoch": 0.4067173413470037, + "grad_norm": 7.724193572998047, + "learning_rate": 1.7293236548955183e-05, + "loss": 1.7058, + "step": 64710 + }, + { + "epoch": 0.40678019366370083, + "grad_norm": 6.968172550201416, + "learning_rate": 1.7292817448010527e-05, + "loss": 1.8305, + "step": 64720 + }, + { + "epoch": 0.40684304598039794, + "grad_norm": 6.436429023742676, + "learning_rate": 1.7292398347065874e-05, + "loss": 1.8455, + "step": 64730 + }, + { + "epoch": 0.40690589829709506, + "grad_norm": 8.1800537109375, + "learning_rate": 1.729197924612122e-05, + "loss": 1.7654, + "step": 64740 + }, + { + "epoch": 0.4069687506137922, + "grad_norm": 6.217105865478516, + "learning_rate": 1.7291560145176568e-05, + "loss": 1.6422, + "step": 64750 + }, + { + "epoch": 0.4070316029304893, + "grad_norm": 6.0421624183654785, + "learning_rate": 1.7291141044231915e-05, + "loss": 1.781, + "step": 64760 + }, + { + "epoch": 0.4070944552471864, + "grad_norm": 8.104823112487793, + "learning_rate": 1.7290721943287262e-05, + "loss": 1.9295, + "step": 64770 + }, + { + "epoch": 0.40715730756388346, + "grad_norm": 8.161705017089844, + 
"learning_rate": 1.729030284234261e-05, + "loss": 1.6782, + "step": 64780 + }, + { + "epoch": 0.4072201598805806, + "grad_norm": 7.609684467315674, + "learning_rate": 1.7289883741397956e-05, + "loss": 1.6898, + "step": 64790 + }, + { + "epoch": 0.4072830121972777, + "grad_norm": 6.365793228149414, + "learning_rate": 1.7289464640453303e-05, + "loss": 1.7787, + "step": 64800 + }, + { + "epoch": 0.4073458645139748, + "grad_norm": 6.17117166519165, + "learning_rate": 1.7289045539508647e-05, + "loss": 1.699, + "step": 64810 + }, + { + "epoch": 0.4074087168306719, + "grad_norm": 6.393515110015869, + "learning_rate": 1.7288626438563994e-05, + "loss": 1.6626, + "step": 64820 + }, + { + "epoch": 0.40747156914736904, + "grad_norm": 7.535359859466553, + "learning_rate": 1.728820733761934e-05, + "loss": 1.9596, + "step": 64830 + }, + { + "epoch": 0.40753442146406615, + "grad_norm": 5.87127685546875, + "learning_rate": 1.7287788236674688e-05, + "loss": 1.9254, + "step": 64840 + }, + { + "epoch": 0.40759727378076327, + "grad_norm": 7.311943531036377, + "learning_rate": 1.7287369135730035e-05, + "loss": 1.6165, + "step": 64850 + }, + { + "epoch": 0.4076601260974604, + "grad_norm": 8.07356071472168, + "learning_rate": 1.728695003478538e-05, + "loss": 2.111, + "step": 64860 + }, + { + "epoch": 0.4077229784141575, + "grad_norm": 6.676053047180176, + "learning_rate": 1.7286530933840726e-05, + "loss": 1.8156, + "step": 64870 + }, + { + "epoch": 0.4077858307308546, + "grad_norm": 6.35545539855957, + "learning_rate": 1.7286111832896073e-05, + "loss": 1.6365, + "step": 64880 + }, + { + "epoch": 0.40784868304755173, + "grad_norm": 7.015282154083252, + "learning_rate": 1.728569273195142e-05, + "loss": 1.6851, + "step": 64890 + }, + { + "epoch": 0.4079115353642488, + "grad_norm": 6.763131618499756, + "learning_rate": 1.7285273631006764e-05, + "loss": 1.6802, + "step": 64900 + }, + { + "epoch": 0.4079743876809459, + "grad_norm": 7.126100063323975, + "learning_rate": 1.728485453006211e-05, + 
"loss": 1.7282, + "step": 64910 + }, + { + "epoch": 0.408037239997643, + "grad_norm": 7.30333948135376, + "learning_rate": 1.7284435429117458e-05, + "loss": 1.7624, + "step": 64920 + }, + { + "epoch": 0.40810009231434013, + "grad_norm": 6.8902058601379395, + "learning_rate": 1.7284016328172805e-05, + "loss": 1.8086, + "step": 64930 + }, + { + "epoch": 0.40816294463103725, + "grad_norm": 6.535923957824707, + "learning_rate": 1.7283597227228152e-05, + "loss": 1.5891, + "step": 64940 + }, + { + "epoch": 0.40822579694773437, + "grad_norm": 6.411938190460205, + "learning_rate": 1.72831781262835e-05, + "loss": 1.6533, + "step": 64950 + }, + { + "epoch": 0.4082886492644315, + "grad_norm": 6.7717742919921875, + "learning_rate": 1.7282759025338846e-05, + "loss": 1.9807, + "step": 64960 + }, + { + "epoch": 0.4083515015811286, + "grad_norm": 7.145508289337158, + "learning_rate": 1.728233992439419e-05, + "loss": 1.7605, + "step": 64970 + }, + { + "epoch": 0.4084143538978257, + "grad_norm": 7.144276142120361, + "learning_rate": 1.7281920823449537e-05, + "loss": 1.8039, + "step": 64980 + }, + { + "epoch": 0.4084772062145228, + "grad_norm": 5.917950630187988, + "learning_rate": 1.7281501722504884e-05, + "loss": 1.6906, + "step": 64990 + }, + { + "epoch": 0.40854005853121994, + "grad_norm": 6.059743881225586, + "learning_rate": 1.728108262156023e-05, + "loss": 1.6918, + "step": 65000 + }, + { + "epoch": 0.40860291084791706, + "grad_norm": 7.979092597961426, + "learning_rate": 1.7280663520615578e-05, + "loss": 1.7555, + "step": 65010 + }, + { + "epoch": 0.40866576316461417, + "grad_norm": 6.71980619430542, + "learning_rate": 1.7280244419670925e-05, + "loss": 1.6185, + "step": 65020 + }, + { + "epoch": 0.40872861548131123, + "grad_norm": 7.242041110992432, + "learning_rate": 1.727982531872627e-05, + "loss": 1.7957, + "step": 65030 + }, + { + "epoch": 0.40879146779800835, + "grad_norm": 6.984872817993164, + "learning_rate": 1.7279406217781616e-05, + "loss": 1.7645, + "step": 65040 + 
}, + { + "epoch": 0.40885432011470546, + "grad_norm": 6.94974946975708, + "learning_rate": 1.7278987116836963e-05, + "loss": 1.9315, + "step": 65050 + }, + { + "epoch": 0.4089171724314026, + "grad_norm": 7.313066482543945, + "learning_rate": 1.727856801589231e-05, + "loss": 1.8741, + "step": 65060 + }, + { + "epoch": 0.4089800247480997, + "grad_norm": 6.230828762054443, + "learning_rate": 1.7278148914947657e-05, + "loss": 1.8841, + "step": 65070 + }, + { + "epoch": 0.4090428770647968, + "grad_norm": 6.700216770172119, + "learning_rate": 1.727777172409747e-05, + "loss": 1.7687, + "step": 65080 + }, + { + "epoch": 0.4091057293814939, + "grad_norm": 7.0344977378845215, + "learning_rate": 1.7277352623152815e-05, + "loss": 1.7114, + "step": 65090 + }, + { + "epoch": 0.40916858169819104, + "grad_norm": 7.262002468109131, + "learning_rate": 1.7276933522208163e-05, + "loss": 1.9332, + "step": 65100 + }, + { + "epoch": 0.40923143401488815, + "grad_norm": 6.58958625793457, + "learning_rate": 1.7276514421263506e-05, + "loss": 1.7787, + "step": 65110 + }, + { + "epoch": 0.40929428633158527, + "grad_norm": 6.534708499908447, + "learning_rate": 1.7276095320318853e-05, + "loss": 1.6371, + "step": 65120 + }, + { + "epoch": 0.4093571386482824, + "grad_norm": 5.883193016052246, + "learning_rate": 1.72756762193742e-05, + "loss": 1.513, + "step": 65130 + }, + { + "epoch": 0.4094199909649795, + "grad_norm": 7.480133056640625, + "learning_rate": 1.7275257118429547e-05, + "loss": 1.6982, + "step": 65140 + }, + { + "epoch": 0.4094828432816766, + "grad_norm": 6.219967842102051, + "learning_rate": 1.727483801748489e-05, + "loss": 1.5562, + "step": 65150 + }, + { + "epoch": 0.40954569559837367, + "grad_norm": 7.834683895111084, + "learning_rate": 1.7274418916540238e-05, + "loss": 1.7408, + "step": 65160 + }, + { + "epoch": 0.4096085479150708, + "grad_norm": 6.424805641174316, + "learning_rate": 1.7273999815595585e-05, + "loss": 1.8484, + "step": 65170 + }, + { + "epoch": 0.4096714002317679, 
+ "grad_norm": 7.738645553588867, + "learning_rate": 1.7273580714650932e-05, + "loss": 1.7559, + "step": 65180 + }, + { + "epoch": 0.409734252548465, + "grad_norm": 6.217718124389648, + "learning_rate": 1.727316161370628e-05, + "loss": 1.7107, + "step": 65190 + }, + { + "epoch": 0.40979710486516213, + "grad_norm": 7.466714859008789, + "learning_rate": 1.7272742512761623e-05, + "loss": 1.8146, + "step": 65200 + }, + { + "epoch": 0.40985995718185925, + "grad_norm": 5.817049026489258, + "learning_rate": 1.7272365321911438e-05, + "loss": 1.6548, + "step": 65210 + }, + { + "epoch": 0.40992280949855636, + "grad_norm": 6.468529224395752, + "learning_rate": 1.7271946220966785e-05, + "loss": 1.8484, + "step": 65220 + }, + { + "epoch": 0.4099856618152535, + "grad_norm": 6.295289039611816, + "learning_rate": 1.727152712002213e-05, + "loss": 1.7962, + "step": 65230 + }, + { + "epoch": 0.4100485141319506, + "grad_norm": 7.373312950134277, + "learning_rate": 1.7271108019077476e-05, + "loss": 1.6707, + "step": 65240 + }, + { + "epoch": 0.4101113664486477, + "grad_norm": 6.659286022186279, + "learning_rate": 1.7270688918132823e-05, + "loss": 1.9766, + "step": 65250 + }, + { + "epoch": 0.4101742187653448, + "grad_norm": 6.687599182128906, + "learning_rate": 1.727026981718817e-05, + "loss": 1.9147, + "step": 65260 + }, + { + "epoch": 0.41023707108204194, + "grad_norm": 6.625199317932129, + "learning_rate": 1.7269850716243513e-05, + "loss": 1.6965, + "step": 65270 + }, + { + "epoch": 0.41029992339873905, + "grad_norm": 8.33799934387207, + "learning_rate": 1.726943161529886e-05, + "loss": 1.9639, + "step": 65280 + }, + { + "epoch": 0.4103627757154361, + "grad_norm": 8.036014556884766, + "learning_rate": 1.7269012514354208e-05, + "loss": 1.7142, + "step": 65290 + }, + { + "epoch": 0.4104256280321332, + "grad_norm": 7.296436786651611, + "learning_rate": 1.7268593413409555e-05, + "loss": 1.7163, + "step": 65300 + }, + { + "epoch": 0.41048848034883034, + "grad_norm": 8.26138973236084, + 
"learning_rate": 1.7268174312464902e-05, + "loss": 1.6409, + "step": 65310 + }, + { + "epoch": 0.41055133266552746, + "grad_norm": 7.372732639312744, + "learning_rate": 1.726775521152025e-05, + "loss": 1.6704, + "step": 65320 + }, + { + "epoch": 0.4106141849822246, + "grad_norm": 6.2835516929626465, + "learning_rate": 1.7267336110575596e-05, + "loss": 1.7482, + "step": 65330 + }, + { + "epoch": 0.4106770372989217, + "grad_norm": 5.813044548034668, + "learning_rate": 1.7266917009630943e-05, + "loss": 1.7742, + "step": 65340 + }, + { + "epoch": 0.4107398896156188, + "grad_norm": 7.238397598266602, + "learning_rate": 1.7266497908686287e-05, + "loss": 1.5224, + "step": 65350 + }, + { + "epoch": 0.4108027419323159, + "grad_norm": 7.718009948730469, + "learning_rate": 1.7266078807741634e-05, + "loss": 2.0273, + "step": 65360 + }, + { + "epoch": 0.41086559424901303, + "grad_norm": 6.700657367706299, + "learning_rate": 1.726565970679698e-05, + "loss": 1.8385, + "step": 65370 + }, + { + "epoch": 0.41092844656571015, + "grad_norm": 7.539520263671875, + "learning_rate": 1.7265240605852328e-05, + "loss": 1.7639, + "step": 65380 + }, + { + "epoch": 0.41099129888240726, + "grad_norm": 6.936416149139404, + "learning_rate": 1.7264821504907675e-05, + "loss": 1.8539, + "step": 65390 + }, + { + "epoch": 0.4110541511991044, + "grad_norm": 7.549854755401611, + "learning_rate": 1.7264402403963022e-05, + "loss": 1.9081, + "step": 65400 + }, + { + "epoch": 0.41111700351580144, + "grad_norm": 7.006616115570068, + "learning_rate": 1.7263983303018366e-05, + "loss": 1.6595, + "step": 65410 + }, + { + "epoch": 0.41117985583249855, + "grad_norm": 5.82361364364624, + "learning_rate": 1.7263564202073713e-05, + "loss": 1.5125, + "step": 65420 + }, + { + "epoch": 0.41124270814919567, + "grad_norm": 6.20650577545166, + "learning_rate": 1.726314510112906e-05, + "loss": 2.0813, + "step": 65430 + }, + { + "epoch": 0.4113055604658928, + "grad_norm": 8.615455627441406, + "learning_rate": 
1.7262726000184407e-05, + "loss": 1.7214, + "step": 65440 + }, + { + "epoch": 0.4113684127825899, + "grad_norm": 5.808997631072998, + "learning_rate": 1.726230689923975e-05, + "loss": 1.6858, + "step": 65450 + }, + { + "epoch": 0.411431265099287, + "grad_norm": 7.5976762771606445, + "learning_rate": 1.7261887798295098e-05, + "loss": 1.6086, + "step": 65460 + }, + { + "epoch": 0.41149411741598413, + "grad_norm": 6.59607458114624, + "learning_rate": 1.7261468697350445e-05, + "loss": 1.7889, + "step": 65470 + }, + { + "epoch": 0.41155696973268124, + "grad_norm": 6.698685646057129, + "learning_rate": 1.7261049596405792e-05, + "loss": 1.997, + "step": 65480 + }, + { + "epoch": 0.41161982204937836, + "grad_norm": 6.6149749755859375, + "learning_rate": 1.726063049546114e-05, + "loss": 1.8442, + "step": 65490 + }, + { + "epoch": 0.4116826743660755, + "grad_norm": 9.340747833251953, + "learning_rate": 1.7260211394516483e-05, + "loss": 1.7613, + "step": 65500 + }, + { + "epoch": 0.4117455266827726, + "grad_norm": 7.007955074310303, + "learning_rate": 1.725979229357183e-05, + "loss": 1.6861, + "step": 65510 + }, + { + "epoch": 0.4118083789994697, + "grad_norm": 7.721170425415039, + "learning_rate": 1.7259373192627177e-05, + "loss": 1.9393, + "step": 65520 + }, + { + "epoch": 0.4118712313161668, + "grad_norm": 7.229162693023682, + "learning_rate": 1.7258954091682524e-05, + "loss": 1.9263, + "step": 65530 + }, + { + "epoch": 0.4119340836328639, + "grad_norm": 6.689350128173828, + "learning_rate": 1.725853499073787e-05, + "loss": 1.6923, + "step": 65540 + }, + { + "epoch": 0.411996935949561, + "grad_norm": 6.703178882598877, + "learning_rate": 1.7258115889793218e-05, + "loss": 1.7659, + "step": 65550 + }, + { + "epoch": 0.4120597882662581, + "grad_norm": 6.903580665588379, + "learning_rate": 1.7257696788848565e-05, + "loss": 1.7734, + "step": 65560 + }, + { + "epoch": 0.4121226405829552, + "grad_norm": 7.508144378662109, + "learning_rate": 1.7257277687903912e-05, + "loss": 
1.8409, + "step": 65570 + }, + { + "epoch": 0.41218549289965234, + "grad_norm": 7.502862453460693, + "learning_rate": 1.7256858586959256e-05, + "loss": 1.7433, + "step": 65580 + }, + { + "epoch": 0.41224834521634945, + "grad_norm": 8.012452125549316, + "learning_rate": 1.7256439486014603e-05, + "loss": 1.7268, + "step": 65590 + }, + { + "epoch": 0.41231119753304657, + "grad_norm": 7.023150444030762, + "learning_rate": 1.725602038506995e-05, + "loss": 1.7775, + "step": 65600 + }, + { + "epoch": 0.4123740498497437, + "grad_norm": 7.646528720855713, + "learning_rate": 1.7255601284125297e-05, + "loss": 1.7078, + "step": 65610 + }, + { + "epoch": 0.4124369021664408, + "grad_norm": 7.526142120361328, + "learning_rate": 1.7255182183180644e-05, + "loss": 1.8062, + "step": 65620 + }, + { + "epoch": 0.4124997544831379, + "grad_norm": 7.300693511962891, + "learning_rate": 1.7254763082235988e-05, + "loss": 1.6165, + "step": 65630 + }, + { + "epoch": 0.41256260679983503, + "grad_norm": 7.158872127532959, + "learning_rate": 1.7254343981291335e-05, + "loss": 1.741, + "step": 65640 + }, + { + "epoch": 0.41262545911653215, + "grad_norm": 7.359210014343262, + "learning_rate": 1.7253924880346682e-05, + "loss": 1.6974, + "step": 65650 + }, + { + "epoch": 0.41268831143322926, + "grad_norm": 7.219731330871582, + "learning_rate": 1.725350577940203e-05, + "loss": 1.879, + "step": 65660 + }, + { + "epoch": 0.4127511637499263, + "grad_norm": 6.3508405685424805, + "learning_rate": 1.7253086678457373e-05, + "loss": 1.6002, + "step": 65670 + }, + { + "epoch": 0.41281401606662343, + "grad_norm": 6.639791011810303, + "learning_rate": 1.725266757751272e-05, + "loss": 1.4928, + "step": 65680 + }, + { + "epoch": 0.41287686838332055, + "grad_norm": 6.588821887969971, + "learning_rate": 1.7252248476568067e-05, + "loss": 1.8052, + "step": 65690 + }, + { + "epoch": 0.41293972070001767, + "grad_norm": 7.214931964874268, + "learning_rate": 1.7251829375623414e-05, + "loss": 1.686, + "step": 65700 + }, + { 
+ "epoch": 0.4130025730167148, + "grad_norm": 6.759382724761963, + "learning_rate": 1.725141027467876e-05, + "loss": 1.7647, + "step": 65710 + }, + { + "epoch": 0.4130654253334119, + "grad_norm": 6.056048393249512, + "learning_rate": 1.7250991173734108e-05, + "loss": 1.6579, + "step": 65720 + }, + { + "epoch": 0.413128277650109, + "grad_norm": 6.751076698303223, + "learning_rate": 1.725057207278945e-05, + "loss": 1.8221, + "step": 65730 + }, + { + "epoch": 0.4131911299668061, + "grad_norm": 7.223300457000732, + "learning_rate": 1.72501529718448e-05, + "loss": 1.7375, + "step": 65740 + }, + { + "epoch": 0.41325398228350324, + "grad_norm": 7.070038318634033, + "learning_rate": 1.7249733870900146e-05, + "loss": 1.8717, + "step": 65750 + }, + { + "epoch": 0.41331683460020036, + "grad_norm": 6.8550262451171875, + "learning_rate": 1.7249314769955493e-05, + "loss": 1.9206, + "step": 65760 + }, + { + "epoch": 0.41337968691689747, + "grad_norm": 6.547621726989746, + "learning_rate": 1.724889566901084e-05, + "loss": 1.529, + "step": 65770 + }, + { + "epoch": 0.4134425392335946, + "grad_norm": 6.848862171173096, + "learning_rate": 1.7248476568066187e-05, + "loss": 1.7646, + "step": 65780 + }, + { + "epoch": 0.4135053915502917, + "grad_norm": 6.237651824951172, + "learning_rate": 1.7248057467121534e-05, + "loss": 1.7265, + "step": 65790 + }, + { + "epoch": 0.41356824386698876, + "grad_norm": 7.226813793182373, + "learning_rate": 1.724763836617688e-05, + "loss": 1.6863, + "step": 65800 + }, + { + "epoch": 0.4136310961836859, + "grad_norm": 6.415926456451416, + "learning_rate": 1.7247219265232225e-05, + "loss": 1.5202, + "step": 65810 + }, + { + "epoch": 0.413693948500383, + "grad_norm": 7.4868245124816895, + "learning_rate": 1.7246800164287572e-05, + "loss": 1.7867, + "step": 65820 + }, + { + "epoch": 0.4137568008170801, + "grad_norm": 5.861806392669678, + "learning_rate": 1.724638106334292e-05, + "loss": 1.6878, + "step": 65830 + }, + { + "epoch": 0.4138196531337772, + 
"grad_norm": 8.50329303741455, + "learning_rate": 1.7245961962398266e-05, + "loss": 1.8611, + "step": 65840 + }, + { + "epoch": 0.41388250545047434, + "grad_norm": 7.272984504699707, + "learning_rate": 1.724554286145361e-05, + "loss": 1.8541, + "step": 65850 + }, + { + "epoch": 0.41394535776717145, + "grad_norm": 6.840214729309082, + "learning_rate": 1.7245123760508957e-05, + "loss": 1.8274, + "step": 65860 + }, + { + "epoch": 0.41400821008386857, + "grad_norm": 6.844021797180176, + "learning_rate": 1.7244704659564304e-05, + "loss": 1.5502, + "step": 65870 + }, + { + "epoch": 0.4140710624005657, + "grad_norm": 6.481375217437744, + "learning_rate": 1.724428555861965e-05, + "loss": 1.6152, + "step": 65880 + }, + { + "epoch": 0.4141339147172628, + "grad_norm": 6.612030029296875, + "learning_rate": 1.7243866457674998e-05, + "loss": 1.7488, + "step": 65890 + }, + { + "epoch": 0.4141967670339599, + "grad_norm": 6.541953086853027, + "learning_rate": 1.724344735673034e-05, + "loss": 1.7981, + "step": 65900 + }, + { + "epoch": 0.414259619350657, + "grad_norm": 6.864079475402832, + "learning_rate": 1.724302825578569e-05, + "loss": 1.744, + "step": 65910 + }, + { + "epoch": 0.4143224716673541, + "grad_norm": 6.154926300048828, + "learning_rate": 1.7242609154841036e-05, + "loss": 2.0588, + "step": 65920 + }, + { + "epoch": 0.4143853239840512, + "grad_norm": 5.997568130493164, + "learning_rate": 1.7242190053896383e-05, + "loss": 1.7858, + "step": 65930 + }, + { + "epoch": 0.4144481763007483, + "grad_norm": 7.9769463539123535, + "learning_rate": 1.724177095295173e-05, + "loss": 1.6325, + "step": 65940 + }, + { + "epoch": 0.41451102861744543, + "grad_norm": 6.107345104217529, + "learning_rate": 1.7241351852007077e-05, + "loss": 1.9093, + "step": 65950 + }, + { + "epoch": 0.41457388093414255, + "grad_norm": 6.253636837005615, + "learning_rate": 1.7240932751062424e-05, + "loss": 1.834, + "step": 65960 + }, + { + "epoch": 0.41463673325083966, + "grad_norm": 6.682056427001953, + 
"learning_rate": 1.7240513650117768e-05, + "loss": 1.7277, + "step": 65970 + }, + { + "epoch": 0.4146995855675368, + "grad_norm": 5.611754417419434, + "learning_rate": 1.7240094549173115e-05, + "loss": 1.8684, + "step": 65980 + }, + { + "epoch": 0.4147624378842339, + "grad_norm": 7.929423809051514, + "learning_rate": 1.7239675448228462e-05, + "loss": 1.8078, + "step": 65990 + }, + { + "epoch": 0.414825290200931, + "grad_norm": 7.0556960105896, + "learning_rate": 1.723925634728381e-05, + "loss": 1.6299, + "step": 66000 + }, + { + "epoch": 0.4148881425176281, + "grad_norm": 5.946358680725098, + "learning_rate": 1.7238837246339156e-05, + "loss": 1.829, + "step": 66010 + }, + { + "epoch": 0.41495099483432524, + "grad_norm": 6.1297688484191895, + "learning_rate": 1.7238418145394503e-05, + "loss": 1.7613, + "step": 66020 + }, + { + "epoch": 0.41501384715102235, + "grad_norm": 6.7420477867126465, + "learning_rate": 1.7237999044449847e-05, + "loss": 1.7146, + "step": 66030 + }, + { + "epoch": 0.41507669946771947, + "grad_norm": 7.604292869567871, + "learning_rate": 1.7237579943505194e-05, + "loss": 1.9379, + "step": 66040 + }, + { + "epoch": 0.4151395517844165, + "grad_norm": 6.331554889678955, + "learning_rate": 1.723716084256054e-05, + "loss": 1.7841, + "step": 66050 + }, + { + "epoch": 0.41520240410111364, + "grad_norm": 6.867188930511475, + "learning_rate": 1.7236741741615888e-05, + "loss": 1.6822, + "step": 66060 + }, + { + "epoch": 0.41526525641781076, + "grad_norm": 7.866869926452637, + "learning_rate": 1.723632264067123e-05, + "loss": 1.5881, + "step": 66070 + }, + { + "epoch": 0.4153281087345079, + "grad_norm": 7.904226303100586, + "learning_rate": 1.723590353972658e-05, + "loss": 1.7964, + "step": 66080 + }, + { + "epoch": 0.415390961051205, + "grad_norm": 6.318147659301758, + "learning_rate": 1.7235484438781926e-05, + "loss": 1.7034, + "step": 66090 + }, + { + "epoch": 0.4154538133679021, + "grad_norm": 7.195476055145264, + "learning_rate": 
1.7235065337837273e-05, + "loss": 1.787, + "step": 66100 + }, + { + "epoch": 0.4155166656845992, + "grad_norm": 8.014933586120605, + "learning_rate": 1.723464623689262e-05, + "loss": 1.6416, + "step": 66110 + }, + { + "epoch": 0.41557951800129633, + "grad_norm": 6.8371100425720215, + "learning_rate": 1.7234227135947964e-05, + "loss": 1.8213, + "step": 66120 + }, + { + "epoch": 0.41564237031799345, + "grad_norm": 6.7408976554870605, + "learning_rate": 1.723380803500331e-05, + "loss": 2.039, + "step": 66130 + }, + { + "epoch": 0.41570522263469056, + "grad_norm": 7.483088493347168, + "learning_rate": 1.7233388934058658e-05, + "loss": 1.9312, + "step": 66140 + }, + { + "epoch": 0.4157680749513877, + "grad_norm": 6.651445388793945, + "learning_rate": 1.7232969833114005e-05, + "loss": 1.7359, + "step": 66150 + }, + { + "epoch": 0.4158309272680848, + "grad_norm": 7.179247856140137, + "learning_rate": 1.7232550732169352e-05, + "loss": 1.7614, + "step": 66160 + }, + { + "epoch": 0.4158937795847819, + "grad_norm": 7.6218132972717285, + "learning_rate": 1.72321316312247e-05, + "loss": 1.7491, + "step": 66170 + }, + { + "epoch": 0.41595663190147897, + "grad_norm": 6.225204944610596, + "learning_rate": 1.7231712530280046e-05, + "loss": 1.7596, + "step": 66180 + }, + { + "epoch": 0.4160194842181761, + "grad_norm": 8.149336814880371, + "learning_rate": 1.7231293429335393e-05, + "loss": 2.0318, + "step": 66190 + }, + { + "epoch": 0.4160823365348732, + "grad_norm": 6.81826639175415, + "learning_rate": 1.723087432839074e-05, + "loss": 1.9221, + "step": 66200 + }, + { + "epoch": 0.4161451888515703, + "grad_norm": 7.789238929748535, + "learning_rate": 1.7230455227446084e-05, + "loss": 1.8142, + "step": 66210 + }, + { + "epoch": 0.41620804116826743, + "grad_norm": 7.309952735900879, + "learning_rate": 1.723003612650143e-05, + "loss": 1.8951, + "step": 66220 + }, + { + "epoch": 0.41627089348496454, + "grad_norm": 6.235332012176514, + "learning_rate": 1.7229617025556778e-05, + "loss": 
1.8876, + "step": 66230 + }, + { + "epoch": 0.41633374580166166, + "grad_norm": 6.2471160888671875, + "learning_rate": 1.7229197924612125e-05, + "loss": 1.6217, + "step": 66240 + }, + { + "epoch": 0.4163965981183588, + "grad_norm": 9.248604774475098, + "learning_rate": 1.722877882366747e-05, + "loss": 2.0903, + "step": 66250 + }, + { + "epoch": 0.4164594504350559, + "grad_norm": 6.332358360290527, + "learning_rate": 1.7228359722722816e-05, + "loss": 1.6627, + "step": 66260 + }, + { + "epoch": 0.416522302751753, + "grad_norm": 6.084262847900391, + "learning_rate": 1.7227940621778163e-05, + "loss": 1.7674, + "step": 66270 + }, + { + "epoch": 0.4165851550684501, + "grad_norm": 6.957821846008301, + "learning_rate": 1.722752152083351e-05, + "loss": 1.5903, + "step": 66280 + }, + { + "epoch": 0.41664800738514723, + "grad_norm": 7.637058258056641, + "learning_rate": 1.7227102419888854e-05, + "loss": 1.7404, + "step": 66290 + }, + { + "epoch": 0.41671085970184435, + "grad_norm": 6.842432975769043, + "learning_rate": 1.72266833189442e-05, + "loss": 1.8943, + "step": 66300 + }, + { + "epoch": 0.4167737120185414, + "grad_norm": 7.005809783935547, + "learning_rate": 1.7226264217999548e-05, + "loss": 1.7298, + "step": 66310 + }, + { + "epoch": 0.4168365643352385, + "grad_norm": 6.967994689941406, + "learning_rate": 1.7225845117054895e-05, + "loss": 1.8196, + "step": 66320 + }, + { + "epoch": 0.41689941665193564, + "grad_norm": 7.2940287590026855, + "learning_rate": 1.7225426016110242e-05, + "loss": 1.5849, + "step": 66330 + }, + { + "epoch": 0.41696226896863275, + "grad_norm": 7.64329719543457, + "learning_rate": 1.722500691516559e-05, + "loss": 1.8588, + "step": 66340 + }, + { + "epoch": 0.41702512128532987, + "grad_norm": 7.028941631317139, + "learning_rate": 1.7224587814220933e-05, + "loss": 1.6441, + "step": 66350 + }, + { + "epoch": 0.417087973602027, + "grad_norm": 6.863855838775635, + "learning_rate": 1.722416871327628e-05, + "loss": 1.8029, + "step": 66360 + }, + { + 
"epoch": 0.4171508259187241, + "grad_norm": 5.859476089477539, + "learning_rate": 1.7223749612331627e-05, + "loss": 1.487, + "step": 66370 + }, + { + "epoch": 0.4172136782354212, + "grad_norm": 7.921928882598877, + "learning_rate": 1.7223330511386974e-05, + "loss": 1.6424, + "step": 66380 + }, + { + "epoch": 0.41727653055211833, + "grad_norm": 6.965579986572266, + "learning_rate": 1.722291141044232e-05, + "loss": 1.788, + "step": 66390 + }, + { + "epoch": 0.41733938286881544, + "grad_norm": 7.220767974853516, + "learning_rate": 1.7222492309497668e-05, + "loss": 1.6314, + "step": 66400 + }, + { + "epoch": 0.41740223518551256, + "grad_norm": 7.693778991699219, + "learning_rate": 1.7222073208553015e-05, + "loss": 1.5888, + "step": 66410 + }, + { + "epoch": 0.4174650875022097, + "grad_norm": 6.9228410720825195, + "learning_rate": 1.7221654107608362e-05, + "loss": 1.7609, + "step": 66420 + }, + { + "epoch": 0.41752793981890673, + "grad_norm": 6.110254287719727, + "learning_rate": 1.7221235006663706e-05, + "loss": 1.7262, + "step": 66430 + }, + { + "epoch": 0.41759079213560385, + "grad_norm": 5.9352240562438965, + "learning_rate": 1.7220815905719053e-05, + "loss": 1.878, + "step": 66440 + }, + { + "epoch": 0.41765364445230097, + "grad_norm": 7.226113796234131, + "learning_rate": 1.72203968047744e-05, + "loss": 1.6799, + "step": 66450 + }, + { + "epoch": 0.4177164967689981, + "grad_norm": 7.249061584472656, + "learning_rate": 1.7219977703829747e-05, + "loss": 1.7067, + "step": 66460 + }, + { + "epoch": 0.4177793490856952, + "grad_norm": 6.2751336097717285, + "learning_rate": 1.721955860288509e-05, + "loss": 1.7584, + "step": 66470 + }, + { + "epoch": 0.4178422014023923, + "grad_norm": 6.313525199890137, + "learning_rate": 1.7219139501940438e-05, + "loss": 1.7579, + "step": 66480 + }, + { + "epoch": 0.4179050537190894, + "grad_norm": 7.393110752105713, + "learning_rate": 1.7218720400995785e-05, + "loss": 1.6926, + "step": 66490 + }, + { + "epoch": 0.41796790603578654, + 
"grad_norm": 7.071122646331787, + "learning_rate": 1.7218301300051132e-05, + "loss": 1.5097, + "step": 66500 + }, + { + "epoch": 0.41803075835248366, + "grad_norm": 6.8215813636779785, + "learning_rate": 1.721788219910648e-05, + "loss": 2.0171, + "step": 66510 + }, + { + "epoch": 0.41809361066918077, + "grad_norm": 6.949654579162598, + "learning_rate": 1.7217463098161823e-05, + "loss": 1.5347, + "step": 66520 + }, + { + "epoch": 0.4181564629858779, + "grad_norm": 7.977286338806152, + "learning_rate": 1.721704399721717e-05, + "loss": 1.9713, + "step": 66530 + }, + { + "epoch": 0.418219315302575, + "grad_norm": 6.974081516265869, + "learning_rate": 1.7216624896272517e-05, + "loss": 1.8234, + "step": 66540 + }, + { + "epoch": 0.4182821676192721, + "grad_norm": 7.043870449066162, + "learning_rate": 1.7216205795327864e-05, + "loss": 1.7966, + "step": 66550 + }, + { + "epoch": 0.4183450199359692, + "grad_norm": 6.155953884124756, + "learning_rate": 1.721578669438321e-05, + "loss": 1.5279, + "step": 66560 + }, + { + "epoch": 0.4184078722526663, + "grad_norm": 7.128800392150879, + "learning_rate": 1.7215367593438558e-05, + "loss": 1.7763, + "step": 66570 + }, + { + "epoch": 0.4184707245693634, + "grad_norm": 7.194254398345947, + "learning_rate": 1.7214948492493905e-05, + "loss": 1.7651, + "step": 66580 + }, + { + "epoch": 0.4185335768860605, + "grad_norm": 6.361778736114502, + "learning_rate": 1.7214529391549252e-05, + "loss": 1.638, + "step": 66590 + }, + { + "epoch": 0.41859642920275764, + "grad_norm": 6.098928928375244, + "learning_rate": 1.7214110290604596e-05, + "loss": 1.7984, + "step": 66600 + }, + { + "epoch": 0.41865928151945475, + "grad_norm": 6.760110378265381, + "learning_rate": 1.7213691189659943e-05, + "loss": 1.7262, + "step": 66610 + }, + { + "epoch": 0.41872213383615187, + "grad_norm": 7.264941215515137, + "learning_rate": 1.721327208871529e-05, + "loss": 1.6386, + "step": 66620 + }, + { + "epoch": 0.418784986152849, + "grad_norm": 7.49856424331665, + 
"learning_rate": 1.7212852987770637e-05, + "loss": 1.7972, + "step": 66630 + }, + { + "epoch": 0.4188478384695461, + "grad_norm": 6.58585786819458, + "learning_rate": 1.7212433886825984e-05, + "loss": 1.7253, + "step": 66640 + }, + { + "epoch": 0.4189106907862432, + "grad_norm": 6.299633026123047, + "learning_rate": 1.7212014785881328e-05, + "loss": 1.7737, + "step": 66650 + }, + { + "epoch": 0.4189735431029403, + "grad_norm": 4.980933666229248, + "learning_rate": 1.7211595684936675e-05, + "loss": 1.5795, + "step": 66660 + }, + { + "epoch": 0.41903639541963744, + "grad_norm": 6.099027633666992, + "learning_rate": 1.7211176583992022e-05, + "loss": 1.9556, + "step": 66670 + }, + { + "epoch": 0.41909924773633456, + "grad_norm": 7.245108127593994, + "learning_rate": 1.721075748304737e-05, + "loss": 1.59, + "step": 66680 + }, + { + "epoch": 0.4191621000530316, + "grad_norm": 6.28397798538208, + "learning_rate": 1.7210338382102713e-05, + "loss": 1.795, + "step": 66690 + }, + { + "epoch": 0.41922495236972873, + "grad_norm": 6.623855113983154, + "learning_rate": 1.720991928115806e-05, + "loss": 1.6809, + "step": 66700 + }, + { + "epoch": 0.41928780468642585, + "grad_norm": 6.899521827697754, + "learning_rate": 1.7209500180213407e-05, + "loss": 1.6451, + "step": 66710 + }, + { + "epoch": 0.41935065700312296, + "grad_norm": 6.536463737487793, + "learning_rate": 1.7209081079268754e-05, + "loss": 1.7583, + "step": 66720 + }, + { + "epoch": 0.4194135093198201, + "grad_norm": 6.291255474090576, + "learning_rate": 1.72086619783241e-05, + "loss": 1.9444, + "step": 66730 + }, + { + "epoch": 0.4194763616365172, + "grad_norm": 6.284396171569824, + "learning_rate": 1.7208242877379445e-05, + "loss": 1.5567, + "step": 66740 + }, + { + "epoch": 0.4195392139532143, + "grad_norm": 6.022761821746826, + "learning_rate": 1.7207823776434792e-05, + "loss": 1.8496, + "step": 66750 + }, + { + "epoch": 0.4196020662699114, + "grad_norm": 7.062186241149902, + "learning_rate": 1.720740467549014e-05, 
+ "loss": 1.7722, + "step": 66760 + }, + { + "epoch": 0.41966491858660854, + "grad_norm": 6.43408727645874, + "learning_rate": 1.7206985574545486e-05, + "loss": 1.8628, + "step": 66770 + }, + { + "epoch": 0.41972777090330565, + "grad_norm": 7.805745601654053, + "learning_rate": 1.7206566473600833e-05, + "loss": 1.7588, + "step": 66780 + }, + { + "epoch": 0.41979062322000277, + "grad_norm": 8.450122833251953, + "learning_rate": 1.720614737265618e-05, + "loss": 1.7457, + "step": 66790 + }, + { + "epoch": 0.4198534755366999, + "grad_norm": 7.328549385070801, + "learning_rate": 1.7205728271711527e-05, + "loss": 1.7419, + "step": 66800 + }, + { + "epoch": 0.419916327853397, + "grad_norm": 7.124634742736816, + "learning_rate": 1.7205309170766874e-05, + "loss": 1.5527, + "step": 66810 + }, + { + "epoch": 0.41997918017009406, + "grad_norm": 6.787344932556152, + "learning_rate": 1.720489006982222e-05, + "loss": 1.9586, + "step": 66820 + }, + { + "epoch": 0.4200420324867912, + "grad_norm": 6.533176898956299, + "learning_rate": 1.7204470968877565e-05, + "loss": 1.611, + "step": 66830 + }, + { + "epoch": 0.4201048848034883, + "grad_norm": 6.889014720916748, + "learning_rate": 1.7204051867932912e-05, + "loss": 1.8108, + "step": 66840 + }, + { + "epoch": 0.4201677371201854, + "grad_norm": 6.502508640289307, + "learning_rate": 1.720363276698826e-05, + "loss": 2.0798, + "step": 66850 + }, + { + "epoch": 0.4202305894368825, + "grad_norm": 7.50160026550293, + "learning_rate": 1.7203213666043606e-05, + "loss": 1.8618, + "step": 66860 + }, + { + "epoch": 0.42029344175357963, + "grad_norm": 6.552640914916992, + "learning_rate": 1.720279456509895e-05, + "loss": 1.7817, + "step": 66870 + }, + { + "epoch": 0.42035629407027675, + "grad_norm": 6.772343635559082, + "learning_rate": 1.7202375464154297e-05, + "loss": 1.7162, + "step": 66880 + }, + { + "epoch": 0.42041914638697386, + "grad_norm": 6.845734596252441, + "learning_rate": 1.7201956363209644e-05, + "loss": 1.7223, + "step": 66890 + 
}, + { + "epoch": 0.420481998703671, + "grad_norm": 6.288269519805908, + "learning_rate": 1.720153726226499e-05, + "loss": 1.6642, + "step": 66900 + }, + { + "epoch": 0.4205448510203681, + "grad_norm": 6.5025458335876465, + "learning_rate": 1.7201118161320335e-05, + "loss": 1.688, + "step": 66910 + }, + { + "epoch": 0.4206077033370652, + "grad_norm": 6.833642959594727, + "learning_rate": 1.7200699060375682e-05, + "loss": 1.6041, + "step": 66920 + }, + { + "epoch": 0.4206705556537623, + "grad_norm": 6.569134712219238, + "learning_rate": 1.720027995943103e-05, + "loss": 1.7687, + "step": 66930 + }, + { + "epoch": 0.42073340797045944, + "grad_norm": 6.5684051513671875, + "learning_rate": 1.7199860858486376e-05, + "loss": 1.7422, + "step": 66940 + }, + { + "epoch": 0.4207962602871565, + "grad_norm": 6.582808017730713, + "learning_rate": 1.7199441757541723e-05, + "loss": 1.5826, + "step": 66950 + }, + { + "epoch": 0.4208591126038536, + "grad_norm": 6.532323837280273, + "learning_rate": 1.719902265659707e-05, + "loss": 1.549, + "step": 66960 + }, + { + "epoch": 0.42092196492055073, + "grad_norm": 6.004312038421631, + "learning_rate": 1.7198603555652417e-05, + "loss": 1.7504, + "step": 66970 + }, + { + "epoch": 0.42098481723724784, + "grad_norm": 6.040466785430908, + "learning_rate": 1.719818445470776e-05, + "loss": 1.686, + "step": 66980 + }, + { + "epoch": 0.42104766955394496, + "grad_norm": 5.869112968444824, + "learning_rate": 1.7197765353763108e-05, + "loss": 1.4853, + "step": 66990 + }, + { + "epoch": 0.4211105218706421, + "grad_norm": 5.894684791564941, + "learning_rate": 1.7197346252818455e-05, + "loss": 1.5959, + "step": 67000 + }, + { + "epoch": 0.4211733741873392, + "grad_norm": 7.29726505279541, + "learning_rate": 1.7196927151873802e-05, + "loss": 1.5494, + "step": 67010 + }, + { + "epoch": 0.4212362265040363, + "grad_norm": 6.248456954956055, + "learning_rate": 1.719650805092915e-05, + "loss": 1.6962, + "step": 67020 + }, + { + "epoch": 0.4212990788207334, + 
"grad_norm": 6.856325149536133, + "learning_rate": 1.7196088949984496e-05, + "loss": 1.9297, + "step": 67030 + }, + { + "epoch": 0.42136193113743053, + "grad_norm": 7.646697998046875, + "learning_rate": 1.7195669849039843e-05, + "loss": 1.9425, + "step": 67040 + }, + { + "epoch": 0.42142478345412765, + "grad_norm": 7.795719623565674, + "learning_rate": 1.7195250748095187e-05, + "loss": 1.6795, + "step": 67050 + }, + { + "epoch": 0.42148763577082476, + "grad_norm": 6.1254143714904785, + "learning_rate": 1.7194831647150534e-05, + "loss": 1.6857, + "step": 67060 + }, + { + "epoch": 0.4215504880875218, + "grad_norm": 7.006941318511963, + "learning_rate": 1.719441254620588e-05, + "loss": 1.9168, + "step": 67070 + }, + { + "epoch": 0.42161334040421894, + "grad_norm": 7.390153408050537, + "learning_rate": 1.7193993445261228e-05, + "loss": 1.5826, + "step": 67080 + }, + { + "epoch": 0.42167619272091605, + "grad_norm": 6.724299907684326, + "learning_rate": 1.7193574344316572e-05, + "loss": 1.6652, + "step": 67090 + }, + { + "epoch": 0.42173904503761317, + "grad_norm": 7.16749382019043, + "learning_rate": 1.719315524337192e-05, + "loss": 1.8747, + "step": 67100 + }, + { + "epoch": 0.4218018973543103, + "grad_norm": 5.848663806915283, + "learning_rate": 1.7192736142427266e-05, + "loss": 1.5958, + "step": 67110 + }, + { + "epoch": 0.4218647496710074, + "grad_norm": 7.2142815589904785, + "learning_rate": 1.7192317041482613e-05, + "loss": 1.7606, + "step": 67120 + }, + { + "epoch": 0.4219276019877045, + "grad_norm": 6.202139854431152, + "learning_rate": 1.719189794053796e-05, + "loss": 1.6537, + "step": 67130 + }, + { + "epoch": 0.42199045430440163, + "grad_norm": 7.26724910736084, + "learning_rate": 1.7191478839593304e-05, + "loss": 1.7531, + "step": 67140 + }, + { + "epoch": 0.42205330662109874, + "grad_norm": 6.575324535369873, + "learning_rate": 1.719105973864865e-05, + "loss": 1.699, + "step": 67150 + }, + { + "epoch": 0.42211615893779586, + "grad_norm": 7.1311564445495605, 
+ "learning_rate": 1.7190640637703998e-05, + "loss": 1.7081, + "step": 67160 + }, + { + "epoch": 0.422179011254493, + "grad_norm": 7.583156585693359, + "learning_rate": 1.7190221536759345e-05, + "loss": 1.6017, + "step": 67170 + }, + { + "epoch": 0.4222418635711901, + "grad_norm": 4.99326229095459, + "learning_rate": 1.7189802435814692e-05, + "loss": 1.6263, + "step": 67180 + }, + { + "epoch": 0.4223047158878872, + "grad_norm": 7.643944263458252, + "learning_rate": 1.718938333487004e-05, + "loss": 1.6513, + "step": 67190 + }, + { + "epoch": 0.42236756820458426, + "grad_norm": 7.959160804748535, + "learning_rate": 1.7188964233925386e-05, + "loss": 1.6504, + "step": 67200 + }, + { + "epoch": 0.4224304205212814, + "grad_norm": 8.044618606567383, + "learning_rate": 1.7188545132980733e-05, + "loss": 1.7734, + "step": 67210 + }, + { + "epoch": 0.4224932728379785, + "grad_norm": 5.893154144287109, + "learning_rate": 1.7188126032036077e-05, + "loss": 1.8729, + "step": 67220 + }, + { + "epoch": 0.4225561251546756, + "grad_norm": 8.05775260925293, + "learning_rate": 1.7187706931091424e-05, + "loss": 1.7189, + "step": 67230 + }, + { + "epoch": 0.4226189774713727, + "grad_norm": 7.354771137237549, + "learning_rate": 1.718728783014677e-05, + "loss": 1.7098, + "step": 67240 + }, + { + "epoch": 0.42268182978806984, + "grad_norm": 6.8519287109375, + "learning_rate": 1.7186868729202118e-05, + "loss": 1.843, + "step": 67250 + }, + { + "epoch": 0.42274468210476696, + "grad_norm": 6.724393844604492, + "learning_rate": 1.7186449628257465e-05, + "loss": 1.9114, + "step": 67260 + }, + { + "epoch": 0.42280753442146407, + "grad_norm": 6.693548679351807, + "learning_rate": 1.718603052731281e-05, + "loss": 1.6675, + "step": 67270 + }, + { + "epoch": 0.4228703867381612, + "grad_norm": 7.3344621658325195, + "learning_rate": 1.7185611426368156e-05, + "loss": 1.8287, + "step": 67280 + }, + { + "epoch": 0.4229332390548583, + "grad_norm": 7.433871746063232, + "learning_rate": 
1.7185192325423503e-05, + "loss": 1.9375, + "step": 67290 + }, + { + "epoch": 0.4229960913715554, + "grad_norm": 6.614596366882324, + "learning_rate": 1.718477322447885e-05, + "loss": 1.692, + "step": 67300 + }, + { + "epoch": 0.42305894368825253, + "grad_norm": 7.543428421020508, + "learning_rate": 1.7184354123534194e-05, + "loss": 1.7797, + "step": 67310 + }, + { + "epoch": 0.42312179600494965, + "grad_norm": 6.51392936706543, + "learning_rate": 1.718393502258954e-05, + "loss": 1.4786, + "step": 67320 + }, + { + "epoch": 0.4231846483216467, + "grad_norm": 6.313328266143799, + "learning_rate": 1.7183515921644888e-05, + "loss": 1.748, + "step": 67330 + }, + { + "epoch": 0.4232475006383438, + "grad_norm": 7.1494269371032715, + "learning_rate": 1.7183096820700235e-05, + "loss": 1.9026, + "step": 67340 + }, + { + "epoch": 0.42331035295504094, + "grad_norm": 7.357445240020752, + "learning_rate": 1.7182677719755582e-05, + "loss": 1.7604, + "step": 67350 + }, + { + "epoch": 0.42337320527173805, + "grad_norm": 7.715597629547119, + "learning_rate": 1.7182258618810926e-05, + "loss": 1.7441, + "step": 67360 + }, + { + "epoch": 0.42343605758843517, + "grad_norm": 7.762784957885742, + "learning_rate": 1.7181839517866273e-05, + "loss": 1.93, + "step": 67370 + }, + { + "epoch": 0.4234989099051323, + "grad_norm": 6.988029956817627, + "learning_rate": 1.718142041692162e-05, + "loss": 1.7947, + "step": 67380 + }, + { + "epoch": 0.4235617622218294, + "grad_norm": 7.190766334533691, + "learning_rate": 1.7181001315976967e-05, + "loss": 1.7791, + "step": 67390 + }, + { + "epoch": 0.4236246145385265, + "grad_norm": 7.197295188903809, + "learning_rate": 1.7180582215032314e-05, + "loss": 1.7481, + "step": 67400 + }, + { + "epoch": 0.4236874668552236, + "grad_norm": 6.324467182159424, + "learning_rate": 1.718016311408766e-05, + "loss": 1.6548, + "step": 67410 + }, + { + "epoch": 0.42375031917192074, + "grad_norm": 6.4799485206604, + "learning_rate": 1.7179744013143008e-05, + "loss": 1.794, 
+ "step": 67420 + }, + { + "epoch": 0.42381317148861786, + "grad_norm": 7.101329326629639, + "learning_rate": 1.7179324912198355e-05, + "loss": 1.6493, + "step": 67430 + }, + { + "epoch": 0.42387602380531497, + "grad_norm": 7.107308387756348, + "learning_rate": 1.7178905811253702e-05, + "loss": 1.652, + "step": 67440 + }, + { + "epoch": 0.4239388761220121, + "grad_norm": 6.932032585144043, + "learning_rate": 1.7178486710309046e-05, + "loss": 1.7858, + "step": 67450 + }, + { + "epoch": 0.42400172843870915, + "grad_norm": 8.148683547973633, + "learning_rate": 1.7178067609364393e-05, + "loss": 1.6824, + "step": 67460 + }, + { + "epoch": 0.42406458075540626, + "grad_norm": 6.516316890716553, + "learning_rate": 1.717764850841974e-05, + "loss": 1.5451, + "step": 67470 + }, + { + "epoch": 0.4241274330721034, + "grad_norm": 5.768672943115234, + "learning_rate": 1.7177229407475087e-05, + "loss": 1.761, + "step": 67480 + }, + { + "epoch": 0.4241902853888005, + "grad_norm": 6.416257381439209, + "learning_rate": 1.717681030653043e-05, + "loss": 1.6693, + "step": 67490 + }, + { + "epoch": 0.4242531377054976, + "grad_norm": 7.094727516174316, + "learning_rate": 1.7176391205585778e-05, + "loss": 1.7253, + "step": 67500 + }, + { + "epoch": 0.4243159900221947, + "grad_norm": 6.927624702453613, + "learning_rate": 1.7175972104641125e-05, + "loss": 1.942, + "step": 67510 + }, + { + "epoch": 0.42437884233889184, + "grad_norm": 5.8472514152526855, + "learning_rate": 1.7175553003696472e-05, + "loss": 1.6038, + "step": 67520 + }, + { + "epoch": 0.42444169465558895, + "grad_norm": 7.299180030822754, + "learning_rate": 1.7175133902751816e-05, + "loss": 1.8532, + "step": 67530 + }, + { + "epoch": 0.42450454697228607, + "grad_norm": 7.734817981719971, + "learning_rate": 1.7174714801807163e-05, + "loss": 1.621, + "step": 67540 + }, + { + "epoch": 0.4245673992889832, + "grad_norm": 6.606222629547119, + "learning_rate": 1.717429570086251e-05, + "loss": 1.6545, + "step": 67550 + }, + { + "epoch": 
0.4246302516056803, + "grad_norm": 6.60004186630249, + "learning_rate": 1.7173876599917857e-05, + "loss": 1.8597, + "step": 67560 + }, + { + "epoch": 0.4246931039223774, + "grad_norm": 6.90226411819458, + "learning_rate": 1.7173457498973204e-05, + "loss": 2.0013, + "step": 67570 + }, + { + "epoch": 0.4247559562390745, + "grad_norm": 8.462608337402344, + "learning_rate": 1.717303839802855e-05, + "loss": 1.8148, + "step": 67580 + }, + { + "epoch": 0.4248188085557716, + "grad_norm": 6.221778869628906, + "learning_rate": 1.71726192970839e-05, + "loss": 1.6299, + "step": 67590 + }, + { + "epoch": 0.4248816608724687, + "grad_norm": 6.240743160247803, + "learning_rate": 1.7172200196139245e-05, + "loss": 1.9142, + "step": 67600 + }, + { + "epoch": 0.4249445131891658, + "grad_norm": 20.15860939025879, + "learning_rate": 1.717178109519459e-05, + "loss": 1.8206, + "step": 67610 + }, + { + "epoch": 0.42500736550586293, + "grad_norm": 5.853695392608643, + "learning_rate": 1.7171361994249936e-05, + "loss": 1.4446, + "step": 67620 + }, + { + "epoch": 0.42507021782256005, + "grad_norm": 8.076128005981445, + "learning_rate": 1.7170942893305283e-05, + "loss": 1.7001, + "step": 67630 + }, + { + "epoch": 0.42513307013925716, + "grad_norm": 7.701231479644775, + "learning_rate": 1.717052379236063e-05, + "loss": 1.833, + "step": 67640 + }, + { + "epoch": 0.4251959224559543, + "grad_norm": 7.213332653045654, + "learning_rate": 1.7170104691415977e-05, + "loss": 1.6603, + "step": 67650 + }, + { + "epoch": 0.4252587747726514, + "grad_norm": 6.962845325469971, + "learning_rate": 1.7169685590471324e-05, + "loss": 1.7795, + "step": 67660 + }, + { + "epoch": 0.4253216270893485, + "grad_norm": 7.002661228179932, + "learning_rate": 1.7169266489526668e-05, + "loss": 1.7748, + "step": 67670 + }, + { + "epoch": 0.4253844794060456, + "grad_norm": 6.931869983673096, + "learning_rate": 1.7168847388582015e-05, + "loss": 1.7315, + "step": 67680 + }, + { + "epoch": 0.42544733172274274, + "grad_norm": 
6.835071563720703, + "learning_rate": 1.7168428287637362e-05, + "loss": 1.8342, + "step": 67690 + }, + { + "epoch": 0.42551018403943985, + "grad_norm": 5.688884735107422, + "learning_rate": 1.716800918669271e-05, + "loss": 1.7369, + "step": 67700 + }, + { + "epoch": 0.4255730363561369, + "grad_norm": 5.7779645919799805, + "learning_rate": 1.7167590085748053e-05, + "loss": 1.7024, + "step": 67710 + }, + { + "epoch": 0.42563588867283403, + "grad_norm": 7.721713066101074, + "learning_rate": 1.71671709848034e-05, + "loss": 1.7237, + "step": 67720 + }, + { + "epoch": 0.42569874098953114, + "grad_norm": 6.258790969848633, + "learning_rate": 1.7166751883858747e-05, + "loss": 1.5013, + "step": 67730 + }, + { + "epoch": 0.42576159330622826, + "grad_norm": 6.449336051940918, + "learning_rate": 1.7166332782914094e-05, + "loss": 1.6477, + "step": 67740 + }, + { + "epoch": 0.4258244456229254, + "grad_norm": 7.289432048797607, + "learning_rate": 1.716591368196944e-05, + "loss": 1.6998, + "step": 67750 + }, + { + "epoch": 0.4258872979396225, + "grad_norm": 7.474597930908203, + "learning_rate": 1.7165494581024785e-05, + "loss": 1.545, + "step": 67760 + }, + { + "epoch": 0.4259501502563196, + "grad_norm": 6.538856506347656, + "learning_rate": 1.7165075480080132e-05, + "loss": 1.9273, + "step": 67770 + }, + { + "epoch": 0.4260130025730167, + "grad_norm": 6.761141777038574, + "learning_rate": 1.716465637913548e-05, + "loss": 1.7875, + "step": 67780 + }, + { + "epoch": 0.42607585488971383, + "grad_norm": 6.8022637367248535, + "learning_rate": 1.7164237278190826e-05, + "loss": 1.6618, + "step": 67790 + }, + { + "epoch": 0.42613870720641095, + "grad_norm": 7.009593486785889, + "learning_rate": 1.7163818177246173e-05, + "loss": 1.7295, + "step": 67800 + }, + { + "epoch": 0.42620155952310806, + "grad_norm": 7.822202682495117, + "learning_rate": 1.716339907630152e-05, + "loss": 1.7239, + "step": 67810 + }, + { + "epoch": 0.4262644118398052, + "grad_norm": 7.272392272949219, + 
"learning_rate": 1.7162979975356867e-05, + "loss": 1.8299, + "step": 67820 + }, + { + "epoch": 0.4263272641565023, + "grad_norm": 7.358494758605957, + "learning_rate": 1.7162560874412214e-05, + "loss": 1.7296, + "step": 67830 + }, + { + "epoch": 0.42639011647319935, + "grad_norm": 6.851416110992432, + "learning_rate": 1.7162141773467558e-05, + "loss": 1.5437, + "step": 67840 + }, + { + "epoch": 0.42645296878989647, + "grad_norm": 6.438014507293701, + "learning_rate": 1.7161722672522905e-05, + "loss": 1.6468, + "step": 67850 + }, + { + "epoch": 0.4265158211065936, + "grad_norm": 4.515395641326904, + "learning_rate": 1.7161303571578252e-05, + "loss": 1.5819, + "step": 67860 + }, + { + "epoch": 0.4265786734232907, + "grad_norm": 6.547482013702393, + "learning_rate": 1.71608844706336e-05, + "loss": 1.6972, + "step": 67870 + }, + { + "epoch": 0.4266415257399878, + "grad_norm": 8.230692863464355, + "learning_rate": 1.7160465369688946e-05, + "loss": 1.6475, + "step": 67880 + }, + { + "epoch": 0.42670437805668493, + "grad_norm": 6.764297008514404, + "learning_rate": 1.716004626874429e-05, + "loss": 1.4645, + "step": 67890 + }, + { + "epoch": 0.42676723037338204, + "grad_norm": 6.4913740158081055, + "learning_rate": 1.7159627167799637e-05, + "loss": 1.9947, + "step": 67900 + }, + { + "epoch": 0.42683008269007916, + "grad_norm": 8.162996292114258, + "learning_rate": 1.7159208066854984e-05, + "loss": 1.8477, + "step": 67910 + }, + { + "epoch": 0.4268929350067763, + "grad_norm": 5.915576934814453, + "learning_rate": 1.715878896591033e-05, + "loss": 1.6375, + "step": 67920 + }, + { + "epoch": 0.4269557873234734, + "grad_norm": 7.290924549102783, + "learning_rate": 1.7158369864965675e-05, + "loss": 1.6407, + "step": 67930 + }, + { + "epoch": 0.4270186396401705, + "grad_norm": 6.409750938415527, + "learning_rate": 1.7157950764021022e-05, + "loss": 1.6545, + "step": 67940 + }, + { + "epoch": 0.4270814919568676, + "grad_norm": 6.6010823249816895, + "learning_rate": 
1.715753166307637e-05, + "loss": 1.7267, + "step": 67950 + }, + { + "epoch": 0.42714434427356474, + "grad_norm": 8.809877395629883, + "learning_rate": 1.7157112562131716e-05, + "loss": 1.9648, + "step": 67960 + }, + { + "epoch": 0.4272071965902618, + "grad_norm": 7.118371486663818, + "learning_rate": 1.7156693461187063e-05, + "loss": 1.977, + "step": 67970 + }, + { + "epoch": 0.4272700489069589, + "grad_norm": 6.144242286682129, + "learning_rate": 1.7156274360242407e-05, + "loss": 1.7578, + "step": 67980 + }, + { + "epoch": 0.427332901223656, + "grad_norm": 7.426531791687012, + "learning_rate": 1.7155855259297754e-05, + "loss": 1.7804, + "step": 67990 + }, + { + "epoch": 0.42739575354035314, + "grad_norm": 6.703150272369385, + "learning_rate": 1.71554361583531e-05, + "loss": 1.8699, + "step": 68000 + }, + { + "epoch": 0.42745860585705026, + "grad_norm": 7.641908168792725, + "learning_rate": 1.7155017057408448e-05, + "loss": 1.8321, + "step": 68010 + }, + { + "epoch": 0.42752145817374737, + "grad_norm": 7.908365726470947, + "learning_rate": 1.7154597956463795e-05, + "loss": 1.6424, + "step": 68020 + }, + { + "epoch": 0.4275843104904445, + "grad_norm": 6.221214771270752, + "learning_rate": 1.7154178855519142e-05, + "loss": 1.5296, + "step": 68030 + }, + { + "epoch": 0.4276471628071416, + "grad_norm": 6.144346237182617, + "learning_rate": 1.715375975457449e-05, + "loss": 1.7549, + "step": 68040 + }, + { + "epoch": 0.4277100151238387, + "grad_norm": 6.489060401916504, + "learning_rate": 1.7153340653629836e-05, + "loss": 1.5271, + "step": 68050 + }, + { + "epoch": 0.42777286744053583, + "grad_norm": 7.255828857421875, + "learning_rate": 1.7152921552685184e-05, + "loss": 1.8189, + "step": 68060 + }, + { + "epoch": 0.42783571975723295, + "grad_norm": 6.923028469085693, + "learning_rate": 1.7152502451740527e-05, + "loss": 1.7012, + "step": 68070 + }, + { + "epoch": 0.42789857207393006, + "grad_norm": 7.817262172698975, + "learning_rate": 1.7152083350795874e-05, + "loss": 
1.9792, + "step": 68080 + }, + { + "epoch": 0.4279614243906271, + "grad_norm": 7.205074310302734, + "learning_rate": 1.715166424985122e-05, + "loss": 1.7221, + "step": 68090 + }, + { + "epoch": 0.42802427670732424, + "grad_norm": 6.439834117889404, + "learning_rate": 1.715124514890657e-05, + "loss": 1.7847, + "step": 68100 + }, + { + "epoch": 0.42808712902402135, + "grad_norm": 8.112502098083496, + "learning_rate": 1.7150826047961912e-05, + "loss": 1.7084, + "step": 68110 + }, + { + "epoch": 0.42814998134071847, + "grad_norm": 7.40590763092041, + "learning_rate": 1.715040694701726e-05, + "loss": 1.8056, + "step": 68120 + }, + { + "epoch": 0.4282128336574156, + "grad_norm": 6.197878837585449, + "learning_rate": 1.7149987846072606e-05, + "loss": 1.7148, + "step": 68130 + }, + { + "epoch": 0.4282756859741127, + "grad_norm": 7.215755939483643, + "learning_rate": 1.7149568745127953e-05, + "loss": 1.93, + "step": 68140 + }, + { + "epoch": 0.4283385382908098, + "grad_norm": 7.1226806640625, + "learning_rate": 1.7149149644183297e-05, + "loss": 1.7791, + "step": 68150 + }, + { + "epoch": 0.4284013906075069, + "grad_norm": 8.49282455444336, + "learning_rate": 1.7148730543238644e-05, + "loss": 1.8896, + "step": 68160 + }, + { + "epoch": 0.42846424292420404, + "grad_norm": 7.0488057136535645, + "learning_rate": 1.714831144229399e-05, + "loss": 1.7185, + "step": 68170 + }, + { + "epoch": 0.42852709524090116, + "grad_norm": 7.961507797241211, + "learning_rate": 1.7147892341349338e-05, + "loss": 1.6971, + "step": 68180 + }, + { + "epoch": 0.42858994755759827, + "grad_norm": 7.726258754730225, + "learning_rate": 1.7147473240404685e-05, + "loss": 1.606, + "step": 68190 + }, + { + "epoch": 0.4286527998742954, + "grad_norm": 6.355605125427246, + "learning_rate": 1.7147054139460032e-05, + "loss": 1.4763, + "step": 68200 + }, + { + "epoch": 0.4287156521909925, + "grad_norm": 5.938056945800781, + "learning_rate": 1.714663503851538e-05, + "loss": 1.7344, + "step": 68210 + }, + { + 
"epoch": 0.42877850450768956, + "grad_norm": 7.089502811431885, + "learning_rate": 1.7146215937570727e-05, + "loss": 1.5126, + "step": 68220 + }, + { + "epoch": 0.4288413568243867, + "grad_norm": 5.78463077545166, + "learning_rate": 1.714579683662607e-05, + "loss": 1.7468, + "step": 68230 + }, + { + "epoch": 0.4289042091410838, + "grad_norm": 6.663919448852539, + "learning_rate": 1.7145377735681417e-05, + "loss": 1.8403, + "step": 68240 + }, + { + "epoch": 0.4289670614577809, + "grad_norm": 9.549799919128418, + "learning_rate": 1.7144958634736764e-05, + "loss": 2.0516, + "step": 68250 + }, + { + "epoch": 0.429029913774478, + "grad_norm": 7.30894136428833, + "learning_rate": 1.714453953379211e-05, + "loss": 1.7817, + "step": 68260 + }, + { + "epoch": 0.42909276609117514, + "grad_norm": 7.875905513763428, + "learning_rate": 1.714412043284746e-05, + "loss": 2.006, + "step": 68270 + }, + { + "epoch": 0.42915561840787225, + "grad_norm": 7.074160099029541, + "learning_rate": 1.7143701331902806e-05, + "loss": 1.782, + "step": 68280 + }, + { + "epoch": 0.42921847072456937, + "grad_norm": 6.277737617492676, + "learning_rate": 1.714328223095815e-05, + "loss": 1.6764, + "step": 68290 + }, + { + "epoch": 0.4292813230412665, + "grad_norm": 6.88519811630249, + "learning_rate": 1.714290504010796e-05, + "loss": 1.9277, + "step": 68300 + }, + { + "epoch": 0.4293441753579636, + "grad_norm": 7.190648555755615, + "learning_rate": 1.7142485939163308e-05, + "loss": 1.8115, + "step": 68310 + }, + { + "epoch": 0.4294070276746607, + "grad_norm": 6.650430679321289, + "learning_rate": 1.7142066838218655e-05, + "loss": 1.9107, + "step": 68320 + }, + { + "epoch": 0.42946987999135783, + "grad_norm": 7.447507381439209, + "learning_rate": 1.7141647737274002e-05, + "loss": 1.707, + "step": 68330 + }, + { + "epoch": 0.42953273230805494, + "grad_norm": 6.656262397766113, + "learning_rate": 1.714122863632935e-05, + "loss": 1.6456, + "step": 68340 + }, + { + "epoch": 0.429595584624752, + "grad_norm": 
6.548450469970703, + "learning_rate": 1.7140809535384696e-05, + "loss": 1.9959, + "step": 68350 + }, + { + "epoch": 0.4296584369414491, + "grad_norm": 7.175817012786865, + "learning_rate": 1.714039043444004e-05, + "loss": 1.7946, + "step": 68360 + }, + { + "epoch": 0.42972128925814623, + "grad_norm": 6.8483710289001465, + "learning_rate": 1.7139971333495387e-05, + "loss": 1.8434, + "step": 68370 + }, + { + "epoch": 0.42978414157484335, + "grad_norm": 6.899672031402588, + "learning_rate": 1.7139552232550734e-05, + "loss": 1.5863, + "step": 68380 + }, + { + "epoch": 0.42984699389154046, + "grad_norm": 6.126796722412109, + "learning_rate": 1.713913313160608e-05, + "loss": 1.5894, + "step": 68390 + }, + { + "epoch": 0.4299098462082376, + "grad_norm": 7.532485485076904, + "learning_rate": 1.7138714030661428e-05, + "loss": 1.8445, + "step": 68400 + }, + { + "epoch": 0.4299726985249347, + "grad_norm": 8.191269874572754, + "learning_rate": 1.713829492971677e-05, + "loss": 1.6811, + "step": 68410 + }, + { + "epoch": 0.4300355508416318, + "grad_norm": 8.057154655456543, + "learning_rate": 1.713787582877212e-05, + "loss": 1.8348, + "step": 68420 + }, + { + "epoch": 0.4300984031583289, + "grad_norm": 7.179137229919434, + "learning_rate": 1.7137456727827466e-05, + "loss": 1.8079, + "step": 68430 + }, + { + "epoch": 0.43016125547502604, + "grad_norm": 7.086436748504639, + "learning_rate": 1.7137037626882813e-05, + "loss": 1.8095, + "step": 68440 + }, + { + "epoch": 0.43022410779172315, + "grad_norm": 7.228525161743164, + "learning_rate": 1.7136618525938156e-05, + "loss": 1.7663, + "step": 68450 + }, + { + "epoch": 0.43028696010842027, + "grad_norm": 7.063848495483398, + "learning_rate": 1.7136199424993504e-05, + "loss": 1.706, + "step": 68460 + }, + { + "epoch": 0.4303498124251174, + "grad_norm": 7.752283573150635, + "learning_rate": 1.713578032404885e-05, + "loss": 1.8401, + "step": 68470 + }, + { + "epoch": 0.43041266474181444, + "grad_norm": 7.277685642242432, + 
"learning_rate": 1.7135361223104198e-05, + "loss": 1.65, + "step": 68480 + }, + { + "epoch": 0.43047551705851156, + "grad_norm": 6.834372520446777, + "learning_rate": 1.7134942122159545e-05, + "loss": 1.7414, + "step": 68490 + }, + { + "epoch": 0.4305383693752087, + "grad_norm": 7.359886169433594, + "learning_rate": 1.7134523021214892e-05, + "loss": 1.5748, + "step": 68500 + }, + { + "epoch": 0.4306012216919058, + "grad_norm": 7.043386459350586, + "learning_rate": 1.713410392027024e-05, + "loss": 1.9207, + "step": 68510 + }, + { + "epoch": 0.4306640740086029, + "grad_norm": 7.472099304199219, + "learning_rate": 1.7133684819325586e-05, + "loss": 1.9185, + "step": 68520 + }, + { + "epoch": 0.4307269263253, + "grad_norm": 7.232861518859863, + "learning_rate": 1.7133265718380933e-05, + "loss": 1.8134, + "step": 68530 + }, + { + "epoch": 0.43078977864199713, + "grad_norm": 7.0091423988342285, + "learning_rate": 1.7132846617436277e-05, + "loss": 1.7788, + "step": 68540 + }, + { + "epoch": 0.43085263095869425, + "grad_norm": 7.006239414215088, + "learning_rate": 1.7132427516491624e-05, + "loss": 1.7941, + "step": 68550 + }, + { + "epoch": 0.43091548327539136, + "grad_norm": 8.227099418640137, + "learning_rate": 1.713200841554697e-05, + "loss": 1.7631, + "step": 68560 + }, + { + "epoch": 0.4309783355920885, + "grad_norm": 6.526317596435547, + "learning_rate": 1.7131589314602318e-05, + "loss": 1.5846, + "step": 68570 + }, + { + "epoch": 0.4310411879087856, + "grad_norm": 7.143447399139404, + "learning_rate": 1.7131170213657665e-05, + "loss": 1.886, + "step": 68580 + }, + { + "epoch": 0.4311040402254827, + "grad_norm": 6.684122085571289, + "learning_rate": 1.713075111271301e-05, + "loss": 1.9493, + "step": 68590 + }, + { + "epoch": 0.43116689254217977, + "grad_norm": 7.29036808013916, + "learning_rate": 1.7130332011768356e-05, + "loss": 1.6145, + "step": 68600 + }, + { + "epoch": 0.4312297448588769, + "grad_norm": 7.188809394836426, + "learning_rate": 1.7129912910823703e-05, 
+ "loss": 1.6851, + "step": 68610 + }, + { + "epoch": 0.431292597175574, + "grad_norm": 7.1672163009643555, + "learning_rate": 1.712949380987905e-05, + "loss": 1.6537, + "step": 68620 + }, + { + "epoch": 0.4313554494922711, + "grad_norm": 6.353119850158691, + "learning_rate": 1.7129074708934394e-05, + "loss": 1.5965, + "step": 68630 + }, + { + "epoch": 0.43141830180896823, + "grad_norm": 8.291101455688477, + "learning_rate": 1.712865560798974e-05, + "loss": 1.7326, + "step": 68640 + }, + { + "epoch": 0.43148115412566534, + "grad_norm": 7.423107147216797, + "learning_rate": 1.7128236507045088e-05, + "loss": 1.7529, + "step": 68650 + }, + { + "epoch": 0.43154400644236246, + "grad_norm": 7.277958393096924, + "learning_rate": 1.7127817406100435e-05, + "loss": 1.6147, + "step": 68660 + }, + { + "epoch": 0.4316068587590596, + "grad_norm": 6.638142108917236, + "learning_rate": 1.7127398305155782e-05, + "loss": 1.7505, + "step": 68670 + }, + { + "epoch": 0.4316697110757567, + "grad_norm": 7.172138690948486, + "learning_rate": 1.7126979204211126e-05, + "loss": 1.6679, + "step": 68680 + }, + { + "epoch": 0.4317325633924538, + "grad_norm": 8.023433685302734, + "learning_rate": 1.7126560103266473e-05, + "loss": 1.7418, + "step": 68690 + }, + { + "epoch": 0.4317954157091509, + "grad_norm": 7.74282693862915, + "learning_rate": 1.712614100232182e-05, + "loss": 1.6706, + "step": 68700 + }, + { + "epoch": 0.43185826802584804, + "grad_norm": 7.428801536560059, + "learning_rate": 1.7125721901377167e-05, + "loss": 1.9181, + "step": 68710 + }, + { + "epoch": 0.43192112034254515, + "grad_norm": 6.023731231689453, + "learning_rate": 1.7125302800432514e-05, + "loss": 1.7713, + "step": 68720 + }, + { + "epoch": 0.4319839726592422, + "grad_norm": 4.9429731369018555, + "learning_rate": 1.712488369948786e-05, + "loss": 1.6503, + "step": 68730 + }, + { + "epoch": 0.4320468249759393, + "grad_norm": 6.637831211090088, + "learning_rate": 1.7124464598543208e-05, + "loss": 1.6801, + "step": 68740 + 
}, + { + "epoch": 0.43210967729263644, + "grad_norm": 6.555577754974365, + "learning_rate": 1.7124045497598555e-05, + "loss": 1.6762, + "step": 68750 + }, + { + "epoch": 0.43217252960933356, + "grad_norm": 7.642809867858887, + "learning_rate": 1.71236263966539e-05, + "loss": 1.8938, + "step": 68760 + }, + { + "epoch": 0.43223538192603067, + "grad_norm": 7.905426979064941, + "learning_rate": 1.7123207295709246e-05, + "loss": 1.7699, + "step": 68770 + }, + { + "epoch": 0.4322982342427278, + "grad_norm": 6.7034759521484375, + "learning_rate": 1.7122788194764593e-05, + "loss": 1.6905, + "step": 68780 + }, + { + "epoch": 0.4323610865594249, + "grad_norm": 6.467471122741699, + "learning_rate": 1.712236909381994e-05, + "loss": 1.7919, + "step": 68790 + }, + { + "epoch": 0.432423938876122, + "grad_norm": 6.592196464538574, + "learning_rate": 1.7121949992875287e-05, + "loss": 2.0191, + "step": 68800 + }, + { + "epoch": 0.43248679119281913, + "grad_norm": 6.909712791442871, + "learning_rate": 1.712153089193063e-05, + "loss": 1.6579, + "step": 68810 + }, + { + "epoch": 0.43254964350951625, + "grad_norm": 7.228925704956055, + "learning_rate": 1.7121111790985978e-05, + "loss": 1.8762, + "step": 68820 + }, + { + "epoch": 0.43261249582621336, + "grad_norm": 7.228713035583496, + "learning_rate": 1.7120692690041325e-05, + "loss": 1.8007, + "step": 68830 + }, + { + "epoch": 0.4326753481429105, + "grad_norm": 5.614768981933594, + "learning_rate": 1.7120273589096672e-05, + "loss": 1.9519, + "step": 68840 + }, + { + "epoch": 0.4327382004596076, + "grad_norm": 6.956311225891113, + "learning_rate": 1.7119854488152016e-05, + "loss": 1.7204, + "step": 68850 + }, + { + "epoch": 0.43280105277630465, + "grad_norm": 6.350851535797119, + "learning_rate": 1.7119435387207363e-05, + "loss": 1.6029, + "step": 68860 + }, + { + "epoch": 0.43286390509300177, + "grad_norm": 7.310161113739014, + "learning_rate": 1.711901628626271e-05, + "loss": 1.7023, + "step": 68870 + }, + { + "epoch": 
0.4329267574096989, + "grad_norm": 7.155307292938232, + "learning_rate": 1.7118597185318057e-05, + "loss": 1.7279, + "step": 68880 + }, + { + "epoch": 0.432989609726396, + "grad_norm": 7.455556869506836, + "learning_rate": 1.7118178084373404e-05, + "loss": 1.7048, + "step": 68890 + }, + { + "epoch": 0.4330524620430931, + "grad_norm": 7.175830364227295, + "learning_rate": 1.711775898342875e-05, + "loss": 1.7005, + "step": 68900 + }, + { + "epoch": 0.4331153143597902, + "grad_norm": 6.088528633117676, + "learning_rate": 1.7117339882484098e-05, + "loss": 1.6173, + "step": 68910 + }, + { + "epoch": 0.43317816667648734, + "grad_norm": 6.129864692687988, + "learning_rate": 1.711692078153944e-05, + "loss": 1.7082, + "step": 68920 + }, + { + "epoch": 0.43324101899318446, + "grad_norm": 7.055078983306885, + "learning_rate": 1.711650168059479e-05, + "loss": 1.8112, + "step": 68930 + }, + { + "epoch": 0.43330387130988157, + "grad_norm": 6.802575588226318, + "learning_rate": 1.7116082579650136e-05, + "loss": 1.9003, + "step": 68940 + }, + { + "epoch": 0.4333667236265787, + "grad_norm": 6.301969051361084, + "learning_rate": 1.7115663478705483e-05, + "loss": 1.7973, + "step": 68950 + }, + { + "epoch": 0.4334295759432758, + "grad_norm": 8.61343765258789, + "learning_rate": 1.711524437776083e-05, + "loss": 1.7553, + "step": 68960 + }, + { + "epoch": 0.4334924282599729, + "grad_norm": 6.700798511505127, + "learning_rate": 1.7114825276816177e-05, + "loss": 1.6193, + "step": 68970 + }, + { + "epoch": 0.43355528057667003, + "grad_norm": 6.971966743469238, + "learning_rate": 1.711440617587152e-05, + "loss": 1.6059, + "step": 68980 + }, + { + "epoch": 0.4336181328933671, + "grad_norm": 7.719508171081543, + "learning_rate": 1.7113987074926868e-05, + "loss": 1.9536, + "step": 68990 + }, + { + "epoch": 0.4336809852100642, + "grad_norm": 6.376918792724609, + "learning_rate": 1.7113567973982215e-05, + "loss": 1.7461, + "step": 69000 + }, + { + "epoch": 0.4337438375267613, + "grad_norm": 
5.054595947265625, + "learning_rate": 1.7113148873037562e-05, + "loss": 1.5952, + "step": 69010 + }, + { + "epoch": 0.43380668984345844, + "grad_norm": 7.998342037200928, + "learning_rate": 1.711272977209291e-05, + "loss": 1.6446, + "step": 69020 + }, + { + "epoch": 0.43386954216015555, + "grad_norm": 6.992016315460205, + "learning_rate": 1.7112310671148253e-05, + "loss": 1.982, + "step": 69030 + }, + { + "epoch": 0.43393239447685267, + "grad_norm": 6.751170635223389, + "learning_rate": 1.71118915702036e-05, + "loss": 1.7092, + "step": 69040 + }, + { + "epoch": 0.4339952467935498, + "grad_norm": 7.379878997802734, + "learning_rate": 1.7111472469258947e-05, + "loss": 1.8219, + "step": 69050 + }, + { + "epoch": 0.4340580991102469, + "grad_norm": 7.311999797821045, + "learning_rate": 1.7111053368314294e-05, + "loss": 1.82, + "step": 69060 + }, + { + "epoch": 0.434120951426944, + "grad_norm": 6.659695148468018, + "learning_rate": 1.7110634267369638e-05, + "loss": 1.8394, + "step": 69070 + }, + { + "epoch": 0.43418380374364113, + "grad_norm": 7.174983978271484, + "learning_rate": 1.7110215166424985e-05, + "loss": 1.5287, + "step": 69080 + }, + { + "epoch": 0.43424665606033824, + "grad_norm": 7.208707809448242, + "learning_rate": 1.7109796065480332e-05, + "loss": 2.0224, + "step": 69090 + }, + { + "epoch": 0.43430950837703536, + "grad_norm": 7.024357318878174, + "learning_rate": 1.710937696453568e-05, + "loss": 1.9896, + "step": 69100 + }, + { + "epoch": 0.4343723606937325, + "grad_norm": 7.056058883666992, + "learning_rate": 1.7108957863591026e-05, + "loss": 1.7408, + "step": 69110 + }, + { + "epoch": 0.43443521301042953, + "grad_norm": 6.121402740478516, + "learning_rate": 1.7108538762646373e-05, + "loss": 1.6714, + "step": 69120 + }, + { + "epoch": 0.43449806532712665, + "grad_norm": 5.951923370361328, + "learning_rate": 1.710811966170172e-05, + "loss": 1.7484, + "step": 69130 + }, + { + "epoch": 0.43456091764382376, + "grad_norm": 6.897318363189697, + 
"learning_rate": 1.7107700560757067e-05, + "loss": 1.6989, + "step": 69140 + }, + { + "epoch": 0.4346237699605209, + "grad_norm": 6.115614414215088, + "learning_rate": 1.7107281459812414e-05, + "loss": 1.9817, + "step": 69150 + }, + { + "epoch": 0.434686622277218, + "grad_norm": 7.430760383605957, + "learning_rate": 1.7106862358867758e-05, + "loss": 1.8993, + "step": 69160 + }, + { + "epoch": 0.4347494745939151, + "grad_norm": 7.004673480987549, + "learning_rate": 1.7106443257923105e-05, + "loss": 1.5811, + "step": 69170 + }, + { + "epoch": 0.4348123269106122, + "grad_norm": 7.1759819984436035, + "learning_rate": 1.7106024156978452e-05, + "loss": 1.7459, + "step": 69180 + }, + { + "epoch": 0.43487517922730934, + "grad_norm": 7.4145097732543945, + "learning_rate": 1.71056050560338e-05, + "loss": 1.7356, + "step": 69190 + }, + { + "epoch": 0.43493803154400645, + "grad_norm": 7.906942367553711, + "learning_rate": 1.7105185955089146e-05, + "loss": 1.8496, + "step": 69200 + }, + { + "epoch": 0.43500088386070357, + "grad_norm": 5.566155910491943, + "learning_rate": 1.710476685414449e-05, + "loss": 1.6328, + "step": 69210 + }, + { + "epoch": 0.4350637361774007, + "grad_norm": 7.766795635223389, + "learning_rate": 1.7104347753199837e-05, + "loss": 1.777, + "step": 69220 + }, + { + "epoch": 0.4351265884940978, + "grad_norm": 7.152865409851074, + "learning_rate": 1.7103928652255184e-05, + "loss": 1.6119, + "step": 69230 + }, + { + "epoch": 0.43518944081079486, + "grad_norm": 6.858283519744873, + "learning_rate": 1.710350955131053e-05, + "loss": 1.8616, + "step": 69240 + }, + { + "epoch": 0.435252293127492, + "grad_norm": 6.3837103843688965, + "learning_rate": 1.7103090450365875e-05, + "loss": 1.8093, + "step": 69250 + }, + { + "epoch": 0.4353151454441891, + "grad_norm": 6.967526435852051, + "learning_rate": 1.7102671349421222e-05, + "loss": 1.8023, + "step": 69260 + }, + { + "epoch": 0.4353779977608862, + "grad_norm": 6.755220890045166, + "learning_rate": 
1.710225224847657e-05, + "loss": 1.7628, + "step": 69270 + }, + { + "epoch": 0.4354408500775833, + "grad_norm": 6.2029337882995605, + "learning_rate": 1.7101833147531916e-05, + "loss": 1.6785, + "step": 69280 + }, + { + "epoch": 0.43550370239428043, + "grad_norm": 7.696803569793701, + "learning_rate": 1.7101414046587263e-05, + "loss": 1.7776, + "step": 69290 + }, + { + "epoch": 0.43556655471097755, + "grad_norm": 6.981212139129639, + "learning_rate": 1.7100994945642607e-05, + "loss": 1.6779, + "step": 69300 + }, + { + "epoch": 0.43562940702767466, + "grad_norm": 6.717888832092285, + "learning_rate": 1.7100575844697954e-05, + "loss": 1.6449, + "step": 69310 + }, + { + "epoch": 0.4356922593443718, + "grad_norm": 7.516409873962402, + "learning_rate": 1.71001567437533e-05, + "loss": 1.714, + "step": 69320 + }, + { + "epoch": 0.4357551116610689, + "grad_norm": 6.508599281311035, + "learning_rate": 1.7099737642808648e-05, + "loss": 1.8393, + "step": 69330 + }, + { + "epoch": 0.435817963977766, + "grad_norm": 7.194772243499756, + "learning_rate": 1.7099318541863995e-05, + "loss": 1.6077, + "step": 69340 + }, + { + "epoch": 0.4358808162944631, + "grad_norm": 6.115171432495117, + "learning_rate": 1.7098899440919342e-05, + "loss": 1.8525, + "step": 69350 + }, + { + "epoch": 0.43594366861116024, + "grad_norm": 6.4120378494262695, + "learning_rate": 1.709848033997469e-05, + "loss": 1.7427, + "step": 69360 + }, + { + "epoch": 0.4360065209278573, + "grad_norm": 6.608808994293213, + "learning_rate": 1.7098061239030036e-05, + "loss": 1.746, + "step": 69370 + }, + { + "epoch": 0.4360693732445544, + "grad_norm": 6.515633583068848, + "learning_rate": 1.709764213808538e-05, + "loss": 1.8448, + "step": 69380 + }, + { + "epoch": 0.43613222556125153, + "grad_norm": 6.324481964111328, + "learning_rate": 1.7097223037140727e-05, + "loss": 1.7625, + "step": 69390 + }, + { + "epoch": 0.43619507787794864, + "grad_norm": 5.291093349456787, + "learning_rate": 1.7096803936196074e-05, + "loss": 
1.6256, + "step": 69400 + }, + { + "epoch": 0.43625793019464576, + "grad_norm": 8.002639770507812, + "learning_rate": 1.709638483525142e-05, + "loss": 2.08, + "step": 69410 + }, + { + "epoch": 0.4363207825113429, + "grad_norm": 6.381542205810547, + "learning_rate": 1.7095965734306768e-05, + "loss": 1.8249, + "step": 69420 + }, + { + "epoch": 0.43638363482804, + "grad_norm": 6.60446310043335, + "learning_rate": 1.7095546633362112e-05, + "loss": 1.6198, + "step": 69430 + }, + { + "epoch": 0.4364464871447371, + "grad_norm": 6.089707374572754, + "learning_rate": 1.709512753241746e-05, + "loss": 1.5444, + "step": 69440 + }, + { + "epoch": 0.4365093394614342, + "grad_norm": 7.365067481994629, + "learning_rate": 1.7094708431472806e-05, + "loss": 1.6366, + "step": 69450 + }, + { + "epoch": 0.43657219177813134, + "grad_norm": 6.4489593505859375, + "learning_rate": 1.7094289330528153e-05, + "loss": 1.7174, + "step": 69460 + }, + { + "epoch": 0.43663504409482845, + "grad_norm": 6.833393573760986, + "learning_rate": 1.7093870229583497e-05, + "loss": 2.04, + "step": 69470 + }, + { + "epoch": 0.43669789641152557, + "grad_norm": 8.178645133972168, + "learning_rate": 1.7093451128638844e-05, + "loss": 2.1001, + "step": 69480 + }, + { + "epoch": 0.4367607487282227, + "grad_norm": 6.296994686126709, + "learning_rate": 1.709303202769419e-05, + "loss": 1.497, + "step": 69490 + }, + { + "epoch": 0.43682360104491974, + "grad_norm": 6.104201793670654, + "learning_rate": 1.7092612926749538e-05, + "loss": 1.8025, + "step": 69500 + }, + { + "epoch": 0.43688645336161686, + "grad_norm": 8.042522430419922, + "learning_rate": 1.7092193825804885e-05, + "loss": 1.8581, + "step": 69510 + }, + { + "epoch": 0.43694930567831397, + "grad_norm": 7.047634124755859, + "learning_rate": 1.7091774724860232e-05, + "loss": 1.7825, + "step": 69520 + }, + { + "epoch": 0.4370121579950111, + "grad_norm": 5.55924654006958, + "learning_rate": 1.709135562391558e-05, + "loss": 1.6944, + "step": 69530 + }, + { + 
"epoch": 0.4370750103117082, + "grad_norm": 7.6144938468933105, + "learning_rate": 1.7090936522970926e-05, + "loss": 1.6686, + "step": 69540 + }, + { + "epoch": 0.4371378626284053, + "grad_norm": 8.387451171875, + "learning_rate": 1.709051742202627e-05, + "loss": 1.7809, + "step": 69550 + }, + { + "epoch": 0.43720071494510243, + "grad_norm": 5.343925952911377, + "learning_rate": 1.7090098321081617e-05, + "loss": 1.7923, + "step": 69560 + }, + { + "epoch": 0.43726356726179955, + "grad_norm": 8.009215354919434, + "learning_rate": 1.7089679220136964e-05, + "loss": 1.6454, + "step": 69570 + }, + { + "epoch": 0.43732641957849666, + "grad_norm": 6.277082920074463, + "learning_rate": 1.708926011919231e-05, + "loss": 1.6737, + "step": 69580 + }, + { + "epoch": 0.4373892718951938, + "grad_norm": 8.452055931091309, + "learning_rate": 1.7088841018247658e-05, + "loss": 1.7448, + "step": 69590 + }, + { + "epoch": 0.4374521242118909, + "grad_norm": 6.058176040649414, + "learning_rate": 1.7088421917303002e-05, + "loss": 1.7696, + "step": 69600 + }, + { + "epoch": 0.437514976528588, + "grad_norm": 7.19420051574707, + "learning_rate": 1.708800281635835e-05, + "loss": 1.8384, + "step": 69610 + }, + { + "epoch": 0.4375778288452851, + "grad_norm": 6.660956382751465, + "learning_rate": 1.7087583715413696e-05, + "loss": 1.5671, + "step": 69620 + }, + { + "epoch": 0.4376406811619822, + "grad_norm": 6.775627136230469, + "learning_rate": 1.7087164614469043e-05, + "loss": 1.8158, + "step": 69630 + }, + { + "epoch": 0.4377035334786793, + "grad_norm": 7.618368625640869, + "learning_rate": 1.708674551352439e-05, + "loss": 1.8476, + "step": 69640 + }, + { + "epoch": 0.4377663857953764, + "grad_norm": 6.1381964683532715, + "learning_rate": 1.7086326412579734e-05, + "loss": 1.8181, + "step": 69650 + }, + { + "epoch": 0.4378292381120735, + "grad_norm": 6.716647624969482, + "learning_rate": 1.708590731163508e-05, + "loss": 1.5799, + "step": 69660 + }, + { + "epoch": 0.43789209042877064, + 
"grad_norm": 6.992799758911133, + "learning_rate": 1.7085488210690428e-05, + "loss": 1.9658, + "step": 69670 + }, + { + "epoch": 0.43795494274546776, + "grad_norm": 6.472326755523682, + "learning_rate": 1.7085069109745775e-05, + "loss": 1.7605, + "step": 69680 + }, + { + "epoch": 0.43801779506216487, + "grad_norm": 6.143565654754639, + "learning_rate": 1.708465000880112e-05, + "loss": 1.5635, + "step": 69690 + }, + { + "epoch": 0.438080647378862, + "grad_norm": 6.250063896179199, + "learning_rate": 1.7084230907856466e-05, + "loss": 1.5388, + "step": 69700 + }, + { + "epoch": 0.4381434996955591, + "grad_norm": 7.396320819854736, + "learning_rate": 1.7083811806911813e-05, + "loss": 1.6943, + "step": 69710 + }, + { + "epoch": 0.4382063520122562, + "grad_norm": 6.950412273406982, + "learning_rate": 1.708339270596716e-05, + "loss": 1.6593, + "step": 69720 + }, + { + "epoch": 0.43826920432895333, + "grad_norm": 7.082437038421631, + "learning_rate": 1.7082973605022507e-05, + "loss": 1.997, + "step": 69730 + }, + { + "epoch": 0.43833205664565045, + "grad_norm": 7.463531970977783, + "learning_rate": 1.7082554504077854e-05, + "loss": 1.683, + "step": 69740 + }, + { + "epoch": 0.4383949089623475, + "grad_norm": 6.712361812591553, + "learning_rate": 1.70821354031332e-05, + "loss": 2.094, + "step": 69750 + }, + { + "epoch": 0.4384577612790446, + "grad_norm": 6.187252998352051, + "learning_rate": 1.7081716302188548e-05, + "loss": 1.6655, + "step": 69760 + }, + { + "epoch": 0.43852061359574174, + "grad_norm": 5.67270040512085, + "learning_rate": 1.7081297201243895e-05, + "loss": 1.5296, + "step": 69770 + }, + { + "epoch": 0.43858346591243885, + "grad_norm": 5.254736423492432, + "learning_rate": 1.708087810029924e-05, + "loss": 1.8248, + "step": 69780 + }, + { + "epoch": 0.43864631822913597, + "grad_norm": 7.134040832519531, + "learning_rate": 1.7080458999354586e-05, + "loss": 1.8283, + "step": 69790 + }, + { + "epoch": 0.4387091705458331, + "grad_norm": 7.79978609085083, + 
"learning_rate": 1.7080039898409933e-05, + "loss": 1.7114, + "step": 69800 + }, + { + "epoch": 0.4387720228625302, + "grad_norm": 6.676272869110107, + "learning_rate": 1.707962079746528e-05, + "loss": 1.6352, + "step": 69810 + }, + { + "epoch": 0.4388348751792273, + "grad_norm": 6.6913299560546875, + "learning_rate": 1.7079201696520627e-05, + "loss": 1.6483, + "step": 69820 + }, + { + "epoch": 0.43889772749592443, + "grad_norm": 6.569377422332764, + "learning_rate": 1.707878259557597e-05, + "loss": 1.6161, + "step": 69830 + }, + { + "epoch": 0.43896057981262154, + "grad_norm": 6.990094184875488, + "learning_rate": 1.7078363494631318e-05, + "loss": 1.7794, + "step": 69840 + }, + { + "epoch": 0.43902343212931866, + "grad_norm": 8.219917297363281, + "learning_rate": 1.7077944393686665e-05, + "loss": 1.8896, + "step": 69850 + }, + { + "epoch": 0.4390862844460158, + "grad_norm": 7.872071266174316, + "learning_rate": 1.7077525292742012e-05, + "loss": 1.7814, + "step": 69860 + }, + { + "epoch": 0.4391491367627129, + "grad_norm": 6.119419574737549, + "learning_rate": 1.7077106191797356e-05, + "loss": 1.578, + "step": 69870 + }, + { + "epoch": 0.43921198907940995, + "grad_norm": 6.6119608879089355, + "learning_rate": 1.7076687090852703e-05, + "loss": 1.9156, + "step": 69880 + }, + { + "epoch": 0.43927484139610706, + "grad_norm": 7.422851085662842, + "learning_rate": 1.707626798990805e-05, + "loss": 1.5678, + "step": 69890 + }, + { + "epoch": 0.4393376937128042, + "grad_norm": 8.162243843078613, + "learning_rate": 1.7075848888963397e-05, + "loss": 1.7778, + "step": 69900 + }, + { + "epoch": 0.4394005460295013, + "grad_norm": 10.364338874816895, + "learning_rate": 1.7075429788018744e-05, + "loss": 1.9557, + "step": 69910 + }, + { + "epoch": 0.4394633983461984, + "grad_norm": 7.00361442565918, + "learning_rate": 1.707501068707409e-05, + "loss": 1.6572, + "step": 69920 + }, + { + "epoch": 0.4395262506628955, + "grad_norm": 6.680110454559326, + "learning_rate": 
1.7074591586129435e-05, + "loss": 1.6079, + "step": 69930 + }, + { + "epoch": 0.43958910297959264, + "grad_norm": 6.4366984367370605, + "learning_rate": 1.7074172485184782e-05, + "loss": 1.7348, + "step": 69940 + }, + { + "epoch": 0.43965195529628975, + "grad_norm": 5.561446189880371, + "learning_rate": 1.707375338424013e-05, + "loss": 1.5654, + "step": 69950 + }, + { + "epoch": 0.43971480761298687, + "grad_norm": 5.447123050689697, + "learning_rate": 1.7073334283295476e-05, + "loss": 1.6663, + "step": 69960 + }, + { + "epoch": 0.439777659929684, + "grad_norm": 7.836920738220215, + "learning_rate": 1.7072915182350823e-05, + "loss": 1.7213, + "step": 69970 + }, + { + "epoch": 0.4398405122463811, + "grad_norm": 6.675960540771484, + "learning_rate": 1.707249608140617e-05, + "loss": 1.5798, + "step": 69980 + }, + { + "epoch": 0.4399033645630782, + "grad_norm": 6.375387668609619, + "learning_rate": 1.7072076980461517e-05, + "loss": 1.5938, + "step": 69990 + }, + { + "epoch": 0.43996621687977533, + "grad_norm": 6.819417476654053, + "learning_rate": 1.707165787951686e-05, + "loss": 1.6739, + "step": 70000 + }, + { + "epoch": 0.4400290691964724, + "grad_norm": 7.389351844787598, + "learning_rate": 1.7071238778572208e-05, + "loss": 1.882, + "step": 70010 + }, + { + "epoch": 0.4400919215131695, + "grad_norm": 7.2243499755859375, + "learning_rate": 1.7070819677627555e-05, + "loss": 1.7576, + "step": 70020 + }, + { + "epoch": 0.4401547738298666, + "grad_norm": 6.548774719238281, + "learning_rate": 1.7070400576682902e-05, + "loss": 1.5623, + "step": 70030 + }, + { + "epoch": 0.44021762614656373, + "grad_norm": 9.590810775756836, + "learning_rate": 1.706998147573825e-05, + "loss": 1.6819, + "step": 70040 + }, + { + "epoch": 0.44028047846326085, + "grad_norm": 7.229656219482422, + "learning_rate": 1.7069562374793593e-05, + "loss": 1.8326, + "step": 70050 + }, + { + "epoch": 0.44034333077995796, + "grad_norm": 6.59816312789917, + "learning_rate": 1.706914327384894e-05, + "loss": 
1.7817, + "step": 70060 + }, + { + "epoch": 0.4404061830966551, + "grad_norm": 6.553726673126221, + "learning_rate": 1.7068724172904287e-05, + "loss": 1.923, + "step": 70070 + }, + { + "epoch": 0.4404690354133522, + "grad_norm": 7.365018367767334, + "learning_rate": 1.7068305071959634e-05, + "loss": 1.8541, + "step": 70080 + }, + { + "epoch": 0.4405318877300493, + "grad_norm": 6.91716194152832, + "learning_rate": 1.7067885971014978e-05, + "loss": 1.8022, + "step": 70090 + }, + { + "epoch": 0.4405947400467464, + "grad_norm": 7.806150436401367, + "learning_rate": 1.7067466870070325e-05, + "loss": 1.695, + "step": 70100 + }, + { + "epoch": 0.44065759236344354, + "grad_norm": 8.75064754486084, + "learning_rate": 1.7067047769125672e-05, + "loss": 2.0616, + "step": 70110 + }, + { + "epoch": 0.44072044468014065, + "grad_norm": 7.268632411956787, + "learning_rate": 1.706662866818102e-05, + "loss": 1.4863, + "step": 70120 + }, + { + "epoch": 0.44078329699683777, + "grad_norm": 7.369198799133301, + "learning_rate": 1.7066209567236366e-05, + "loss": 1.5836, + "step": 70130 + }, + { + "epoch": 0.44084614931353483, + "grad_norm": 7.380405426025391, + "learning_rate": 1.7065790466291713e-05, + "loss": 1.8607, + "step": 70140 + }, + { + "epoch": 0.44090900163023194, + "grad_norm": 7.630134582519531, + "learning_rate": 1.706537136534706e-05, + "loss": 1.9416, + "step": 70150 + }, + { + "epoch": 0.44097185394692906, + "grad_norm": 7.153118133544922, + "learning_rate": 1.7064952264402407e-05, + "loss": 1.823, + "step": 70160 + }, + { + "epoch": 0.4410347062636262, + "grad_norm": 6.811437606811523, + "learning_rate": 1.7064533163457754e-05, + "loss": 1.6643, + "step": 70170 + }, + { + "epoch": 0.4410975585803233, + "grad_norm": 6.356449604034424, + "learning_rate": 1.7064114062513098e-05, + "loss": 1.6124, + "step": 70180 + }, + { + "epoch": 0.4411604108970204, + "grad_norm": 5.7468671798706055, + "learning_rate": 1.7063694961568445e-05, + "loss": 1.8172, + "step": 70190 + }, + { + 
"epoch": 0.4412232632137175, + "grad_norm": 7.016430377960205, + "learning_rate": 1.7063275860623792e-05, + "loss": 1.9987, + "step": 70200 + }, + { + "epoch": 0.44128611553041464, + "grad_norm": 6.297787189483643, + "learning_rate": 1.706285675967914e-05, + "loss": 1.714, + "step": 70210 + }, + { + "epoch": 0.44134896784711175, + "grad_norm": 6.5773186683654785, + "learning_rate": 1.7062437658734483e-05, + "loss": 1.7148, + "step": 70220 + }, + { + "epoch": 0.44141182016380887, + "grad_norm": 6.49793815612793, + "learning_rate": 1.706201855778983e-05, + "loss": 1.8228, + "step": 70230 + }, + { + "epoch": 0.441474672480506, + "grad_norm": 6.518189430236816, + "learning_rate": 1.7061599456845177e-05, + "loss": 1.6114, + "step": 70240 + }, + { + "epoch": 0.4415375247972031, + "grad_norm": 8.319125175476074, + "learning_rate": 1.7061180355900524e-05, + "loss": 1.631, + "step": 70250 + }, + { + "epoch": 0.44160037711390016, + "grad_norm": 5.953863143920898, + "learning_rate": 1.706076125495587e-05, + "loss": 1.8944, + "step": 70260 + }, + { + "epoch": 0.44166322943059727, + "grad_norm": 5.739317893981934, + "learning_rate": 1.7060342154011215e-05, + "loss": 1.566, + "step": 70270 + }, + { + "epoch": 0.4417260817472944, + "grad_norm": 6.951610088348389, + "learning_rate": 1.7059923053066562e-05, + "loss": 1.8221, + "step": 70280 + }, + { + "epoch": 0.4417889340639915, + "grad_norm": 6.231076240539551, + "learning_rate": 1.705950395212191e-05, + "loss": 1.5316, + "step": 70290 + }, + { + "epoch": 0.4418517863806886, + "grad_norm": 7.177138805389404, + "learning_rate": 1.7059084851177256e-05, + "loss": 1.8151, + "step": 70300 + }, + { + "epoch": 0.44191463869738573, + "grad_norm": 5.447716236114502, + "learning_rate": 1.70586657502326e-05, + "loss": 1.5773, + "step": 70310 + }, + { + "epoch": 0.44197749101408285, + "grad_norm": 7.0117621421813965, + "learning_rate": 1.7058246649287947e-05, + "loss": 1.6816, + "step": 70320 + }, + { + "epoch": 0.44204034333077996, + 
"grad_norm": 6.275534152984619, + "learning_rate": 1.7057827548343294e-05, + "loss": 1.7795, + "step": 70330 + }, + { + "epoch": 0.4421031956474771, + "grad_norm": 7.140209197998047, + "learning_rate": 1.705740844739864e-05, + "loss": 1.6769, + "step": 70340 + }, + { + "epoch": 0.4421660479641742, + "grad_norm": 7.414947032928467, + "learning_rate": 1.7056989346453988e-05, + "loss": 1.8123, + "step": 70350 + }, + { + "epoch": 0.4422289002808713, + "grad_norm": 7.385918140411377, + "learning_rate": 1.7056570245509335e-05, + "loss": 1.6472, + "step": 70360 + }, + { + "epoch": 0.4422917525975684, + "grad_norm": 5.923056125640869, + "learning_rate": 1.7056151144564682e-05, + "loss": 1.6644, + "step": 70370 + }, + { + "epoch": 0.44235460491426554, + "grad_norm": 6.5220627784729, + "learning_rate": 1.705573204362003e-05, + "loss": 1.8159, + "step": 70380 + }, + { + "epoch": 0.4424174572309626, + "grad_norm": 7.185425758361816, + "learning_rate": 1.7055312942675376e-05, + "loss": 1.6309, + "step": 70390 + }, + { + "epoch": 0.4424803095476597, + "grad_norm": 9.08899974822998, + "learning_rate": 1.705489384173072e-05, + "loss": 1.7753, + "step": 70400 + }, + { + "epoch": 0.4425431618643568, + "grad_norm": 7.046308994293213, + "learning_rate": 1.7054474740786067e-05, + "loss": 1.6317, + "step": 70410 + }, + { + "epoch": 0.44260601418105394, + "grad_norm": 6.433578014373779, + "learning_rate": 1.7054055639841414e-05, + "loss": 1.6979, + "step": 70420 + }, + { + "epoch": 0.44266886649775106, + "grad_norm": 8.256340980529785, + "learning_rate": 1.705363653889676e-05, + "loss": 1.6845, + "step": 70430 + }, + { + "epoch": 0.44273171881444817, + "grad_norm": 6.462786674499512, + "learning_rate": 1.705321743795211e-05, + "loss": 1.801, + "step": 70440 + }, + { + "epoch": 0.4427945711311453, + "grad_norm": 6.751453399658203, + "learning_rate": 1.7052798337007452e-05, + "loss": 1.7099, + "step": 70450 + }, + { + "epoch": 0.4428574234478424, + "grad_norm": 7.0939483642578125, + 
"learning_rate": 1.70523792360628e-05, + "loss": 1.6693, + "step": 70460 + }, + { + "epoch": 0.4429202757645395, + "grad_norm": 5.824983596801758, + "learning_rate": 1.7051960135118146e-05, + "loss": 1.6418, + "step": 70470 + }, + { + "epoch": 0.44298312808123663, + "grad_norm": 7.333545684814453, + "learning_rate": 1.7051541034173493e-05, + "loss": 1.8104, + "step": 70480 + }, + { + "epoch": 0.44304598039793375, + "grad_norm": 7.342357158660889, + "learning_rate": 1.7051121933228837e-05, + "loss": 1.7328, + "step": 70490 + }, + { + "epoch": 0.44310883271463086, + "grad_norm": 6.030719757080078, + "learning_rate": 1.7050702832284184e-05, + "loss": 1.7738, + "step": 70500 + }, + { + "epoch": 0.443171685031328, + "grad_norm": 6.481091022491455, + "learning_rate": 1.705028373133953e-05, + "loss": 1.7931, + "step": 70510 + }, + { + "epoch": 0.44323453734802504, + "grad_norm": 6.386503219604492, + "learning_rate": 1.7049864630394878e-05, + "loss": 1.8141, + "step": 70520 + }, + { + "epoch": 0.44329738966472215, + "grad_norm": 6.556507587432861, + "learning_rate": 1.7049445529450225e-05, + "loss": 1.9126, + "step": 70530 + }, + { + "epoch": 0.44336024198141927, + "grad_norm": 7.444990634918213, + "learning_rate": 1.7049026428505572e-05, + "loss": 1.9579, + "step": 70540 + }, + { + "epoch": 0.4434230942981164, + "grad_norm": 6.353214740753174, + "learning_rate": 1.7048607327560916e-05, + "loss": 1.6042, + "step": 70550 + }, + { + "epoch": 0.4434859466148135, + "grad_norm": 6.988030433654785, + "learning_rate": 1.7048188226616263e-05, + "loss": 1.8534, + "step": 70560 + }, + { + "epoch": 0.4435487989315106, + "grad_norm": 6.261846542358398, + "learning_rate": 1.704776912567161e-05, + "loss": 1.8178, + "step": 70570 + }, + { + "epoch": 0.44361165124820773, + "grad_norm": 7.959524631500244, + "learning_rate": 1.7047350024726957e-05, + "loss": 1.8082, + "step": 70580 + }, + { + "epoch": 0.44367450356490484, + "grad_norm": 7.8182806968688965, + "learning_rate": 
1.7046930923782304e-05, + "loss": 1.6912, + "step": 70590 + }, + { + "epoch": 0.44373735588160196, + "grad_norm": 6.153756141662598, + "learning_rate": 1.704651182283765e-05, + "loss": 1.7776, + "step": 70600 + }, + { + "epoch": 0.4438002081982991, + "grad_norm": 6.9526753425598145, + "learning_rate": 1.7046092721893e-05, + "loss": 1.736, + "step": 70610 + }, + { + "epoch": 0.4438630605149962, + "grad_norm": 8.29808521270752, + "learning_rate": 1.7045673620948342e-05, + "loss": 1.6857, + "step": 70620 + }, + { + "epoch": 0.4439259128316933, + "grad_norm": 7.707038402557373, + "learning_rate": 1.704525452000369e-05, + "loss": 1.8685, + "step": 70630 + }, + { + "epoch": 0.4439887651483904, + "grad_norm": 7.182012557983398, + "learning_rate": 1.7044835419059036e-05, + "loss": 1.7884, + "step": 70640 + }, + { + "epoch": 0.4440516174650875, + "grad_norm": 6.495720386505127, + "learning_rate": 1.7044416318114383e-05, + "loss": 1.6684, + "step": 70650 + }, + { + "epoch": 0.4441144697817846, + "grad_norm": 7.441178321838379, + "learning_rate": 1.704399721716973e-05, + "loss": 1.5172, + "step": 70660 + }, + { + "epoch": 0.4441773220984817, + "grad_norm": 6.320635795593262, + "learning_rate": 1.7043578116225074e-05, + "loss": 1.519, + "step": 70670 + }, + { + "epoch": 0.4442401744151788, + "grad_norm": 6.959654808044434, + "learning_rate": 1.704315901528042e-05, + "loss": 1.7838, + "step": 70680 + }, + { + "epoch": 0.44430302673187594, + "grad_norm": 7.093775749206543, + "learning_rate": 1.7042739914335768e-05, + "loss": 1.6259, + "step": 70690 + }, + { + "epoch": 0.44436587904857305, + "grad_norm": 8.001116752624512, + "learning_rate": 1.7042320813391115e-05, + "loss": 1.9448, + "step": 70700 + }, + { + "epoch": 0.44442873136527017, + "grad_norm": 7.232832431793213, + "learning_rate": 1.704190171244646e-05, + "loss": 1.7938, + "step": 70710 + }, + { + "epoch": 0.4444915836819673, + "grad_norm": 7.079518795013428, + "learning_rate": 1.7041482611501806e-05, + "loss": 1.5559, 
+ "step": 70720 + }, + { + "epoch": 0.4445544359986644, + "grad_norm": 8.122894287109375, + "learning_rate": 1.7041063510557153e-05, + "loss": 1.6795, + "step": 70730 + }, + { + "epoch": 0.4446172883153615, + "grad_norm": 6.774159908294678, + "learning_rate": 1.70406444096125e-05, + "loss": 1.6674, + "step": 70740 + }, + { + "epoch": 0.44468014063205863, + "grad_norm": 5.806540012359619, + "learning_rate": 1.7040225308667847e-05, + "loss": 1.5207, + "step": 70750 + }, + { + "epoch": 0.44474299294875574, + "grad_norm": 7.335987091064453, + "learning_rate": 1.7039806207723194e-05, + "loss": 1.7307, + "step": 70760 + }, + { + "epoch": 0.4448058452654528, + "grad_norm": 6.601565837860107, + "learning_rate": 1.703938710677854e-05, + "loss": 1.9337, + "step": 70770 + }, + { + "epoch": 0.4448686975821499, + "grad_norm": 7.305120944976807, + "learning_rate": 1.703896800583389e-05, + "loss": 2.0206, + "step": 70780 + }, + { + "epoch": 0.44493154989884703, + "grad_norm": 5.766336917877197, + "learning_rate": 1.7038548904889235e-05, + "loss": 1.6814, + "step": 70790 + }, + { + "epoch": 0.44499440221554415, + "grad_norm": 7.237926483154297, + "learning_rate": 1.703812980394458e-05, + "loss": 1.6617, + "step": 70800 + }, + { + "epoch": 0.44505725453224126, + "grad_norm": 8.00814151763916, + "learning_rate": 1.7037710702999926e-05, + "loss": 1.7771, + "step": 70810 + }, + { + "epoch": 0.4451201068489384, + "grad_norm": 7.366685390472412, + "learning_rate": 1.7037291602055273e-05, + "loss": 1.5731, + "step": 70820 + }, + { + "epoch": 0.4451829591656355, + "grad_norm": 6.8207855224609375, + "learning_rate": 1.703687250111062e-05, + "loss": 1.8505, + "step": 70830 + }, + { + "epoch": 0.4452458114823326, + "grad_norm": 6.556506633758545, + "learning_rate": 1.7036453400165964e-05, + "loss": 1.8803, + "step": 70840 + }, + { + "epoch": 0.4453086637990297, + "grad_norm": 6.4768524169921875, + "learning_rate": 1.703603429922131e-05, + "loss": 1.9091, + "step": 70850 + }, + { + "epoch": 
0.44537151611572684, + "grad_norm": 7.1526970863342285, + "learning_rate": 1.7035615198276658e-05, + "loss": 1.7211, + "step": 70860 + }, + { + "epoch": 0.44543436843242395, + "grad_norm": 7.244419574737549, + "learning_rate": 1.7035196097332005e-05, + "loss": 1.7652, + "step": 70870 + }, + { + "epoch": 0.44549722074912107, + "grad_norm": 6.412376880645752, + "learning_rate": 1.7034776996387352e-05, + "loss": 1.5863, + "step": 70880 + }, + { + "epoch": 0.4455600730658182, + "grad_norm": 5.949934482574463, + "learning_rate": 1.7034357895442696e-05, + "loss": 1.8857, + "step": 70890 + }, + { + "epoch": 0.44562292538251524, + "grad_norm": 7.144877910614014, + "learning_rate": 1.7033938794498043e-05, + "loss": 1.7353, + "step": 70900 + }, + { + "epoch": 0.44568577769921236, + "grad_norm": 6.605494499206543, + "learning_rate": 1.703351969355339e-05, + "loss": 1.5508, + "step": 70910 + }, + { + "epoch": 0.4457486300159095, + "grad_norm": 5.3954033851623535, + "learning_rate": 1.7033100592608737e-05, + "loss": 1.7277, + "step": 70920 + }, + { + "epoch": 0.4458114823326066, + "grad_norm": 6.628122806549072, + "learning_rate": 1.703268149166408e-05, + "loss": 1.8054, + "step": 70930 + }, + { + "epoch": 0.4458743346493037, + "grad_norm": 7.13183069229126, + "learning_rate": 1.7032262390719428e-05, + "loss": 1.7426, + "step": 70940 + }, + { + "epoch": 0.4459371869660008, + "grad_norm": 7.770249366760254, + "learning_rate": 1.7031843289774775e-05, + "loss": 1.682, + "step": 70950 + }, + { + "epoch": 0.44600003928269794, + "grad_norm": 7.339083194732666, + "learning_rate": 1.7031424188830122e-05, + "loss": 1.8275, + "step": 70960 + }, + { + "epoch": 0.44606289159939505, + "grad_norm": 6.73902702331543, + "learning_rate": 1.703100508788547e-05, + "loss": 1.7288, + "step": 70970 + }, + { + "epoch": 0.44612574391609217, + "grad_norm": 8.257696151733398, + "learning_rate": 1.7030585986940816e-05, + "loss": 1.6587, + "step": 70980 + }, + { + "epoch": 0.4461885962327893, + 
"grad_norm": 5.991543292999268, + "learning_rate": 1.7030166885996163e-05, + "loss": 1.5641, + "step": 70990 + }, + { + "epoch": 0.4462514485494864, + "grad_norm": 6.239089488983154, + "learning_rate": 1.702974778505151e-05, + "loss": 1.7716, + "step": 71000 + }, + { + "epoch": 0.4463143008661835, + "grad_norm": 7.279812812805176, + "learning_rate": 1.7029328684106857e-05, + "loss": 1.8226, + "step": 71010 + }, + { + "epoch": 0.4463771531828806, + "grad_norm": 8.477140426635742, + "learning_rate": 1.70289095831622e-05, + "loss": 1.7446, + "step": 71020 + }, + { + "epoch": 0.4464400054995777, + "grad_norm": 6.918874263763428, + "learning_rate": 1.7028490482217548e-05, + "loss": 1.5897, + "step": 71030 + }, + { + "epoch": 0.4465028578162748, + "grad_norm": 6.781617164611816, + "learning_rate": 1.7028071381272895e-05, + "loss": 1.5441, + "step": 71040 + }, + { + "epoch": 0.4465657101329719, + "grad_norm": 6.946547031402588, + "learning_rate": 1.7027652280328242e-05, + "loss": 1.5858, + "step": 71050 + }, + { + "epoch": 0.44662856244966903, + "grad_norm": 6.65471887588501, + "learning_rate": 1.702723317938359e-05, + "loss": 1.7303, + "step": 71060 + }, + { + "epoch": 0.44669141476636615, + "grad_norm": 5.427356243133545, + "learning_rate": 1.7026814078438933e-05, + "loss": 1.6519, + "step": 71070 + }, + { + "epoch": 0.44675426708306326, + "grad_norm": 7.257509708404541, + "learning_rate": 1.702639497749428e-05, + "loss": 1.9381, + "step": 71080 + }, + { + "epoch": 0.4468171193997604, + "grad_norm": 6.8030781745910645, + "learning_rate": 1.7025975876549627e-05, + "loss": 1.8593, + "step": 71090 + }, + { + "epoch": 0.4468799717164575, + "grad_norm": 6.951048374176025, + "learning_rate": 1.7025556775604974e-05, + "loss": 1.6926, + "step": 71100 + }, + { + "epoch": 0.4469428240331546, + "grad_norm": 6.181455612182617, + "learning_rate": 1.7025137674660318e-05, + "loss": 1.4505, + "step": 71110 + }, + { + "epoch": 0.4470056763498517, + "grad_norm": 8.337657928466797, + 
"learning_rate": 1.7024718573715665e-05, + "loss": 1.8906, + "step": 71120 + }, + { + "epoch": 0.44706852866654884, + "grad_norm": 6.0072550773620605, + "learning_rate": 1.7024299472771012e-05, + "loss": 1.6255, + "step": 71130 + }, + { + "epoch": 0.44713138098324595, + "grad_norm": 6.303506851196289, + "learning_rate": 1.702388037182636e-05, + "loss": 1.6077, + "step": 71140 + }, + { + "epoch": 0.44719423329994307, + "grad_norm": 6.815176486968994, + "learning_rate": 1.7023461270881706e-05, + "loss": 1.6572, + "step": 71150 + }, + { + "epoch": 0.4472570856166401, + "grad_norm": 7.286723613739014, + "learning_rate": 1.7023042169937053e-05, + "loss": 1.6682, + "step": 71160 + }, + { + "epoch": 0.44731993793333724, + "grad_norm": 7.008586883544922, + "learning_rate": 1.70226230689924e-05, + "loss": 1.6249, + "step": 71170 + }, + { + "epoch": 0.44738279025003436, + "grad_norm": 7.969468593597412, + "learning_rate": 1.7022203968047744e-05, + "loss": 1.47, + "step": 71180 + }, + { + "epoch": 0.44744564256673147, + "grad_norm": 8.08353042602539, + "learning_rate": 1.702178486710309e-05, + "loss": 1.6048, + "step": 71190 + }, + { + "epoch": 0.4475084948834286, + "grad_norm": 7.194873332977295, + "learning_rate": 1.7021365766158438e-05, + "loss": 1.7399, + "step": 71200 + }, + { + "epoch": 0.4475713472001257, + "grad_norm": 7.963724136352539, + "learning_rate": 1.7020946665213785e-05, + "loss": 1.6106, + "step": 71210 + }, + { + "epoch": 0.4476341995168228, + "grad_norm": 6.300773620605469, + "learning_rate": 1.7020527564269132e-05, + "loss": 1.7566, + "step": 71220 + }, + { + "epoch": 0.44769705183351993, + "grad_norm": 5.7120842933654785, + "learning_rate": 1.702010846332448e-05, + "loss": 1.9862, + "step": 71230 + }, + { + "epoch": 0.44775990415021705, + "grad_norm": 6.56306791305542, + "learning_rate": 1.7019689362379823e-05, + "loss": 1.7944, + "step": 71240 + }, + { + "epoch": 0.44782275646691416, + "grad_norm": 6.820886135101318, + "learning_rate": 
1.701927026143517e-05, + "loss": 1.7516, + "step": 71250 + }, + { + "epoch": 0.4478856087836113, + "grad_norm": 7.675322532653809, + "learning_rate": 1.7018851160490517e-05, + "loss": 1.5747, + "step": 71260 + }, + { + "epoch": 0.4479484611003084, + "grad_norm": 6.196045398712158, + "learning_rate": 1.7018432059545864e-05, + "loss": 1.6493, + "step": 71270 + }, + { + "epoch": 0.44801131341700545, + "grad_norm": 6.478281497955322, + "learning_rate": 1.701801295860121e-05, + "loss": 1.5458, + "step": 71280 + }, + { + "epoch": 0.44807416573370257, + "grad_norm": 6.8187575340271, + "learning_rate": 1.7017593857656555e-05, + "loss": 1.887, + "step": 71290 + }, + { + "epoch": 0.4481370180503997, + "grad_norm": 6.972844123840332, + "learning_rate": 1.7017174756711902e-05, + "loss": 1.7298, + "step": 71300 + }, + { + "epoch": 0.4481998703670968, + "grad_norm": 6.094895362854004, + "learning_rate": 1.701675565576725e-05, + "loss": 1.9406, + "step": 71310 + }, + { + "epoch": 0.4482627226837939, + "grad_norm": 6.521101951599121, + "learning_rate": 1.7016336554822596e-05, + "loss": 1.5613, + "step": 71320 + }, + { + "epoch": 0.44832557500049103, + "grad_norm": 7.059791564941406, + "learning_rate": 1.701591745387794e-05, + "loss": 1.7796, + "step": 71330 + }, + { + "epoch": 0.44838842731718814, + "grad_norm": 6.8557047843933105, + "learning_rate": 1.7015498352933287e-05, + "loss": 1.6299, + "step": 71340 + }, + { + "epoch": 0.44845127963388526, + "grad_norm": 6.9734296798706055, + "learning_rate": 1.7015079251988634e-05, + "loss": 1.4914, + "step": 71350 + }, + { + "epoch": 0.4485141319505824, + "grad_norm": 7.521538257598877, + "learning_rate": 1.701466015104398e-05, + "loss": 1.7855, + "step": 71360 + }, + { + "epoch": 0.4485769842672795, + "grad_norm": 6.527483940124512, + "learning_rate": 1.7014241050099328e-05, + "loss": 1.8622, + "step": 71370 + }, + { + "epoch": 0.4486398365839766, + "grad_norm": 6.91995096206665, + "learning_rate": 1.7013821949154675e-05, + "loss": 
1.8563, + "step": 71380 + }, + { + "epoch": 0.4487026889006737, + "grad_norm": 6.213915824890137, + "learning_rate": 1.7013402848210022e-05, + "loss": 1.6592, + "step": 71390 + }, + { + "epoch": 0.44876554121737083, + "grad_norm": 6.30928897857666, + "learning_rate": 1.701298374726537e-05, + "loss": 1.6303, + "step": 71400 + }, + { + "epoch": 0.4488283935340679, + "grad_norm": 6.645300388336182, + "learning_rate": 1.7012564646320717e-05, + "loss": 1.6825, + "step": 71410 + }, + { + "epoch": 0.448891245850765, + "grad_norm": 6.3673248291015625, + "learning_rate": 1.701214554537606e-05, + "loss": 1.6719, + "step": 71420 + }, + { + "epoch": 0.4489540981674621, + "grad_norm": 6.976233005523682, + "learning_rate": 1.7011726444431407e-05, + "loss": 1.8079, + "step": 71430 + }, + { + "epoch": 0.44901695048415924, + "grad_norm": 6.400688171386719, + "learning_rate": 1.7011307343486754e-05, + "loss": 1.7852, + "step": 71440 + }, + { + "epoch": 0.44907980280085635, + "grad_norm": 7.425689697265625, + "learning_rate": 1.70108882425421e-05, + "loss": 1.847, + "step": 71450 + }, + { + "epoch": 0.44914265511755347, + "grad_norm": 6.973445892333984, + "learning_rate": 1.7010469141597445e-05, + "loss": 1.9481, + "step": 71460 + }, + { + "epoch": 0.4492055074342506, + "grad_norm": 6.803900718688965, + "learning_rate": 1.7010050040652792e-05, + "loss": 1.5558, + "step": 71470 + }, + { + "epoch": 0.4492683597509477, + "grad_norm": 7.7245988845825195, + "learning_rate": 1.700963093970814e-05, + "loss": 1.7535, + "step": 71480 + }, + { + "epoch": 0.4493312120676448, + "grad_norm": 7.229686737060547, + "learning_rate": 1.7009211838763486e-05, + "loss": 1.7796, + "step": 71490 + }, + { + "epoch": 0.44939406438434193, + "grad_norm": 6.046517848968506, + "learning_rate": 1.7008792737818833e-05, + "loss": 1.6553, + "step": 71500 + }, + { + "epoch": 0.44945691670103904, + "grad_norm": 6.89406681060791, + "learning_rate": 1.7008373636874177e-05, + "loss": 1.5955, + "step": 71510 + }, + { + 
"epoch": 0.44951976901773616, + "grad_norm": 6.609005451202393, + "learning_rate": 1.7007954535929524e-05, + "loss": 1.9899, + "step": 71520 + }, + { + "epoch": 0.4495826213344333, + "grad_norm": 5.675281524658203, + "learning_rate": 1.700753543498487e-05, + "loss": 1.8324, + "step": 71530 + }, + { + "epoch": 0.44964547365113033, + "grad_norm": 7.0303826332092285, + "learning_rate": 1.700711633404022e-05, + "loss": 1.5863, + "step": 71540 + }, + { + "epoch": 0.44970832596782745, + "grad_norm": 7.449792385101318, + "learning_rate": 1.7006697233095565e-05, + "loss": 1.9384, + "step": 71550 + }, + { + "epoch": 0.44977117828452456, + "grad_norm": 7.167794227600098, + "learning_rate": 1.700627813215091e-05, + "loss": 1.5772, + "step": 71560 + }, + { + "epoch": 0.4498340306012217, + "grad_norm": 7.127641677856445, + "learning_rate": 1.7005859031206256e-05, + "loss": 1.9993, + "step": 71570 + }, + { + "epoch": 0.4498968829179188, + "grad_norm": 5.376589775085449, + "learning_rate": 1.7005439930261603e-05, + "loss": 1.8241, + "step": 71580 + }, + { + "epoch": 0.4499597352346159, + "grad_norm": 6.894751071929932, + "learning_rate": 1.700502082931695e-05, + "loss": 1.7184, + "step": 71590 + }, + { + "epoch": 0.450022587551313, + "grad_norm": 6.802367687225342, + "learning_rate": 1.7004601728372297e-05, + "loss": 1.7378, + "step": 71600 + }, + { + "epoch": 0.45008543986801014, + "grad_norm": 6.673643589019775, + "learning_rate": 1.7004182627427644e-05, + "loss": 1.5584, + "step": 71610 + }, + { + "epoch": 0.45014829218470725, + "grad_norm": 8.011720657348633, + "learning_rate": 1.700376352648299e-05, + "loss": 1.6937, + "step": 71620 + }, + { + "epoch": 0.45021114450140437, + "grad_norm": 8.510849952697754, + "learning_rate": 1.700334442553834e-05, + "loss": 1.6512, + "step": 71630 + }, + { + "epoch": 0.4502739968181015, + "grad_norm": 6.473636627197266, + "learning_rate": 1.7002925324593682e-05, + "loss": 1.7975, + "step": 71640 + }, + { + "epoch": 0.4503368491347986, + 
"grad_norm": 11.131653785705566, + "learning_rate": 1.700250622364903e-05, + "loss": 1.6276, + "step": 71650 + }, + { + "epoch": 0.4503997014514957, + "grad_norm": 7.675611972808838, + "learning_rate": 1.7002087122704376e-05, + "loss": 1.5814, + "step": 71660 + }, + { + "epoch": 0.4504625537681928, + "grad_norm": 5.326087951660156, + "learning_rate": 1.7001668021759723e-05, + "loss": 1.5803, + "step": 71670 + }, + { + "epoch": 0.4505254060848899, + "grad_norm": 8.270243644714355, + "learning_rate": 1.700124892081507e-05, + "loss": 1.8582, + "step": 71680 + }, + { + "epoch": 0.450588258401587, + "grad_norm": 6.738409042358398, + "learning_rate": 1.7000829819870414e-05, + "loss": 1.7065, + "step": 71690 + }, + { + "epoch": 0.4506511107182841, + "grad_norm": 8.476706504821777, + "learning_rate": 1.700041071892576e-05, + "loss": 1.873, + "step": 71700 + }, + { + "epoch": 0.45071396303498124, + "grad_norm": 7.574090480804443, + "learning_rate": 1.699999161798111e-05, + "loss": 1.8404, + "step": 71710 + }, + { + "epoch": 0.45077681535167835, + "grad_norm": 6.264162063598633, + "learning_rate": 1.6999572517036455e-05, + "loss": 1.6393, + "step": 71720 + }, + { + "epoch": 0.45083966766837547, + "grad_norm": 7.23504638671875, + "learning_rate": 1.69991534160918e-05, + "loss": 1.8151, + "step": 71730 + }, + { + "epoch": 0.4509025199850726, + "grad_norm": 6.774835586547852, + "learning_rate": 1.6998734315147146e-05, + "loss": 1.7495, + "step": 71740 + }, + { + "epoch": 0.4509653723017697, + "grad_norm": 7.305593490600586, + "learning_rate": 1.6998315214202493e-05, + "loss": 1.603, + "step": 71750 + }, + { + "epoch": 0.4510282246184668, + "grad_norm": 6.9782328605651855, + "learning_rate": 1.699789611325784e-05, + "loss": 1.8465, + "step": 71760 + }, + { + "epoch": 0.4510910769351639, + "grad_norm": 7.389726161956787, + "learning_rate": 1.6997477012313187e-05, + "loss": 1.531, + "step": 71770 + }, + { + "epoch": 0.45115392925186104, + "grad_norm": 7.15920877456665, + 
"learning_rate": 1.6997057911368534e-05, + "loss": 1.6857, + "step": 71780 + }, + { + "epoch": 0.45121678156855816, + "grad_norm": 7.240107536315918, + "learning_rate": 1.699663881042388e-05, + "loss": 1.7708, + "step": 71790 + }, + { + "epoch": 0.4512796338852552, + "grad_norm": 6.3036980628967285, + "learning_rate": 1.699621970947923e-05, + "loss": 1.624, + "step": 71800 + }, + { + "epoch": 0.45134248620195233, + "grad_norm": 7.612619876861572, + "learning_rate": 1.6995800608534572e-05, + "loss": 1.5405, + "step": 71810 + }, + { + "epoch": 0.45140533851864945, + "grad_norm": 6.979082107543945, + "learning_rate": 1.699538150758992e-05, + "loss": 1.6972, + "step": 71820 + }, + { + "epoch": 0.45146819083534656, + "grad_norm": 6.545334339141846, + "learning_rate": 1.6994962406645266e-05, + "loss": 1.5835, + "step": 71830 + }, + { + "epoch": 0.4515310431520437, + "grad_norm": 6.032336711883545, + "learning_rate": 1.6994543305700614e-05, + "loss": 1.7, + "step": 71840 + }, + { + "epoch": 0.4515938954687408, + "grad_norm": 7.708831310272217, + "learning_rate": 1.699412420475596e-05, + "loss": 1.6892, + "step": 71850 + }, + { + "epoch": 0.4516567477854379, + "grad_norm": 6.805393695831299, + "learning_rate": 1.6993705103811304e-05, + "loss": 1.8059, + "step": 71860 + }, + { + "epoch": 0.451719600102135, + "grad_norm": 7.413393974304199, + "learning_rate": 1.699328600286665e-05, + "loss": 1.8585, + "step": 71870 + }, + { + "epoch": 0.45178245241883214, + "grad_norm": 5.643686294555664, + "learning_rate": 1.6992866901922e-05, + "loss": 1.5059, + "step": 71880 + }, + { + "epoch": 0.45184530473552925, + "grad_norm": 5.965382099151611, + "learning_rate": 1.6992447800977345e-05, + "loss": 1.4114, + "step": 71890 + }, + { + "epoch": 0.45190815705222637, + "grad_norm": 6.276783466339111, + "learning_rate": 1.6992028700032693e-05, + "loss": 1.6649, + "step": 71900 + }, + { + "epoch": 0.4519710093689235, + "grad_norm": 8.128314971923828, + "learning_rate": 1.6991609599088036e-05, 
+ "loss": 1.5287, + "step": 71910 + }, + { + "epoch": 0.45203386168562054, + "grad_norm": 7.705959320068359, + "learning_rate": 1.6991190498143383e-05, + "loss": 1.7249, + "step": 71920 + }, + { + "epoch": 0.45209671400231766, + "grad_norm": 5.786853313446045, + "learning_rate": 1.699077139719873e-05, + "loss": 1.6895, + "step": 71930 + }, + { + "epoch": 0.45215956631901477, + "grad_norm": 6.402442455291748, + "learning_rate": 1.6990352296254077e-05, + "loss": 1.772, + "step": 71940 + }, + { + "epoch": 0.4522224186357119, + "grad_norm": 5.8434367179870605, + "learning_rate": 1.698993319530942e-05, + "loss": 1.5583, + "step": 71950 + }, + { + "epoch": 0.452285270952409, + "grad_norm": 6.627134799957275, + "learning_rate": 1.6989514094364768e-05, + "loss": 1.6571, + "step": 71960 + }, + { + "epoch": 0.4523481232691061, + "grad_norm": 6.048925399780273, + "learning_rate": 1.6989094993420115e-05, + "loss": 1.8085, + "step": 71970 + }, + { + "epoch": 0.45241097558580323, + "grad_norm": 6.704756736755371, + "learning_rate": 1.6988675892475462e-05, + "loss": 1.6309, + "step": 71980 + }, + { + "epoch": 0.45247382790250035, + "grad_norm": 6.748854160308838, + "learning_rate": 1.698825679153081e-05, + "loss": 1.95, + "step": 71990 + }, + { + "epoch": 0.45253668021919746, + "grad_norm": 5.85832405090332, + "learning_rate": 1.6987837690586156e-05, + "loss": 1.8155, + "step": 72000 + }, + { + "epoch": 0.4525995325358946, + "grad_norm": 6.134837627410889, + "learning_rate": 1.6987418589641504e-05, + "loss": 1.7403, + "step": 72010 + }, + { + "epoch": 0.4526623848525917, + "grad_norm": 8.114599227905273, + "learning_rate": 1.698699948869685e-05, + "loss": 1.7724, + "step": 72020 + }, + { + "epoch": 0.4527252371692888, + "grad_norm": 6.4747700691223145, + "learning_rate": 1.6986580387752198e-05, + "loss": 1.9581, + "step": 72030 + }, + { + "epoch": 0.4527880894859859, + "grad_norm": 6.428544521331787, + "learning_rate": 1.698616128680754e-05, + "loss": 1.6771, + "step": 72040 + }, 
+ { + "epoch": 0.452850941802683, + "grad_norm": 7.327491760253906, + "learning_rate": 1.698574218586289e-05, + "loss": 1.6727, + "step": 72050 + }, + { + "epoch": 0.4529137941193801, + "grad_norm": 7.482061386108398, + "learning_rate": 1.6985323084918236e-05, + "loss": 1.6585, + "step": 72060 + }, + { + "epoch": 0.4529766464360772, + "grad_norm": 6.531540870666504, + "learning_rate": 1.6984903983973583e-05, + "loss": 1.6971, + "step": 72070 + }, + { + "epoch": 0.45303949875277433, + "grad_norm": 7.134012699127197, + "learning_rate": 1.698448488302893e-05, + "loss": 1.8148, + "step": 72080 + }, + { + "epoch": 0.45310235106947144, + "grad_norm": 5.541568756103516, + "learning_rate": 1.6984065782084273e-05, + "loss": 1.7753, + "step": 72090 + }, + { + "epoch": 0.45316520338616856, + "grad_norm": 6.51377010345459, + "learning_rate": 1.698364668113962e-05, + "loss": 2.0393, + "step": 72100 + }, + { + "epoch": 0.4532280557028657, + "grad_norm": 6.593539237976074, + "learning_rate": 1.6983227580194967e-05, + "loss": 1.6683, + "step": 72110 + }, + { + "epoch": 0.4532909080195628, + "grad_norm": 6.5326972007751465, + "learning_rate": 1.6982808479250315e-05, + "loss": 1.8713, + "step": 72120 + }, + { + "epoch": 0.4533537603362599, + "grad_norm": 6.634287357330322, + "learning_rate": 1.6982389378305658e-05, + "loss": 1.6738, + "step": 72130 + }, + { + "epoch": 0.453416612652957, + "grad_norm": 8.520031929016113, + "learning_rate": 1.6981970277361005e-05, + "loss": 1.938, + "step": 72140 + }, + { + "epoch": 0.45347946496965413, + "grad_norm": 6.543417453765869, + "learning_rate": 1.6981551176416352e-05, + "loss": 1.7549, + "step": 72150 + }, + { + "epoch": 0.45354231728635125, + "grad_norm": 6.342586040496826, + "learning_rate": 1.69811320754717e-05, + "loss": 1.6541, + "step": 72160 + }, + { + "epoch": 0.45360516960304836, + "grad_norm": 6.786216735839844, + "learning_rate": 1.6980712974527047e-05, + "loss": 1.8846, + "step": 72170 + }, + { + "epoch": 0.4536680219197454, + 
"grad_norm": 8.198033332824707, + "learning_rate": 1.6980293873582394e-05, + "loss": 1.6326, + "step": 72180 + }, + { + "epoch": 0.45373087423644254, + "grad_norm": 7.122391223907471, + "learning_rate": 1.6979874772637737e-05, + "loss": 1.6513, + "step": 72190 + }, + { + "epoch": 0.45379372655313965, + "grad_norm": 5.494532585144043, + "learning_rate": 1.6979455671693084e-05, + "loss": 1.6417, + "step": 72200 + }, + { + "epoch": 0.45385657886983677, + "grad_norm": 7.167390823364258, + "learning_rate": 1.697903657074843e-05, + "loss": 1.7278, + "step": 72210 + }, + { + "epoch": 0.4539194311865339, + "grad_norm": 7.346843719482422, + "learning_rate": 1.697861746980378e-05, + "loss": 1.7696, + "step": 72220 + }, + { + "epoch": 0.453982283503231, + "grad_norm": 7.8218865394592285, + "learning_rate": 1.6978198368859126e-05, + "loss": 1.6169, + "step": 72230 + }, + { + "epoch": 0.4540451358199281, + "grad_norm": 7.494811534881592, + "learning_rate": 1.6977779267914473e-05, + "loss": 1.7006, + "step": 72240 + }, + { + "epoch": 0.45410798813662523, + "grad_norm": 6.5424017906188965, + "learning_rate": 1.697736016696982e-05, + "loss": 1.6564, + "step": 72250 + }, + { + "epoch": 0.45417084045332234, + "grad_norm": 6.879222393035889, + "learning_rate": 1.6976941066025163e-05, + "loss": 1.8634, + "step": 72260 + }, + { + "epoch": 0.45423369277001946, + "grad_norm": 7.229594707489014, + "learning_rate": 1.697652196508051e-05, + "loss": 1.6717, + "step": 72270 + }, + { + "epoch": 0.4542965450867166, + "grad_norm": 7.287384510040283, + "learning_rate": 1.6976102864135858e-05, + "loss": 1.4973, + "step": 72280 + }, + { + "epoch": 0.4543593974034137, + "grad_norm": 6.4875922203063965, + "learning_rate": 1.6975683763191205e-05, + "loss": 1.7654, + "step": 72290 + }, + { + "epoch": 0.4544222497201108, + "grad_norm": 7.1652092933654785, + "learning_rate": 1.697526466224655e-05, + "loss": 1.4093, + "step": 72300 + }, + { + "epoch": 0.45448510203680786, + "grad_norm": 7.26404333114624, 
+ "learning_rate": 1.6974845561301895e-05, + "loss": 1.4883, + "step": 72310 + }, + { + "epoch": 0.454547954353505, + "grad_norm": 6.425233364105225, + "learning_rate": 1.6974426460357242e-05, + "loss": 1.6787, + "step": 72320 + }, + { + "epoch": 0.4546108066702021, + "grad_norm": 8.020889282226562, + "learning_rate": 1.697400735941259e-05, + "loss": 1.6346, + "step": 72330 + }, + { + "epoch": 0.4546736589868992, + "grad_norm": 6.626036167144775, + "learning_rate": 1.6973588258467937e-05, + "loss": 1.838, + "step": 72340 + }, + { + "epoch": 0.4547365113035963, + "grad_norm": 7.176641464233398, + "learning_rate": 1.697316915752328e-05, + "loss": 1.7114, + "step": 72350 + }, + { + "epoch": 0.45479936362029344, + "grad_norm": 6.502776145935059, + "learning_rate": 1.6972750056578627e-05, + "loss": 1.6543, + "step": 72360 + }, + { + "epoch": 0.45486221593699055, + "grad_norm": 5.991310119628906, + "learning_rate": 1.6972330955633974e-05, + "loss": 1.7302, + "step": 72370 + }, + { + "epoch": 0.45492506825368767, + "grad_norm": 6.242975234985352, + "learning_rate": 1.697191185468932e-05, + "loss": 1.682, + "step": 72380 + }, + { + "epoch": 0.4549879205703848, + "grad_norm": 7.033971786499023, + "learning_rate": 1.697149275374467e-05, + "loss": 1.5691, + "step": 72390 + }, + { + "epoch": 0.4550507728870819, + "grad_norm": 6.049368858337402, + "learning_rate": 1.6971073652800016e-05, + "loss": 1.7311, + "step": 72400 + }, + { + "epoch": 0.455113625203779, + "grad_norm": 8.386807441711426, + "learning_rate": 1.6970654551855363e-05, + "loss": 1.6526, + "step": 72410 + }, + { + "epoch": 0.45517647752047613, + "grad_norm": 6.7603840827941895, + "learning_rate": 1.697023545091071e-05, + "loss": 1.7794, + "step": 72420 + }, + { + "epoch": 0.4552393298371732, + "grad_norm": 7.872735500335693, + "learning_rate": 1.6969816349966053e-05, + "loss": 1.7921, + "step": 72430 + }, + { + "epoch": 0.4553021821538703, + "grad_norm": 6.434782981872559, + "learning_rate": 1.69693972490214e-05, 
+ "loss": 1.7876, + "step": 72440 + }, + { + "epoch": 0.4553650344705674, + "grad_norm": 7.069159507751465, + "learning_rate": 1.6968978148076748e-05, + "loss": 1.6929, + "step": 72450 + }, + { + "epoch": 0.45542788678726454, + "grad_norm": 5.2417497634887695, + "learning_rate": 1.6968559047132095e-05, + "loss": 1.6391, + "step": 72460 + }, + { + "epoch": 0.45549073910396165, + "grad_norm": 6.261465549468994, + "learning_rate": 1.696813994618744e-05, + "loss": 1.766, + "step": 72470 + }, + { + "epoch": 0.45555359142065877, + "grad_norm": 7.114291191101074, + "learning_rate": 1.6967720845242785e-05, + "loss": 1.8308, + "step": 72480 + }, + { + "epoch": 0.4556164437373559, + "grad_norm": 5.9655585289001465, + "learning_rate": 1.6967301744298132e-05, + "loss": 1.7141, + "step": 72490 + }, + { + "epoch": 0.455679296054053, + "grad_norm": 6.264899253845215, + "learning_rate": 1.696688264335348e-05, + "loss": 1.744, + "step": 72500 + }, + { + "epoch": 0.4557421483707501, + "grad_norm": 7.141603469848633, + "learning_rate": 1.6966463542408827e-05, + "loss": 1.7852, + "step": 72510 + }, + { + "epoch": 0.4558050006874472, + "grad_norm": 6.6123857498168945, + "learning_rate": 1.6966044441464174e-05, + "loss": 1.7439, + "step": 72520 + }, + { + "epoch": 0.45586785300414434, + "grad_norm": 6.842229843139648, + "learning_rate": 1.6965625340519517e-05, + "loss": 1.6891, + "step": 72530 + }, + { + "epoch": 0.45593070532084146, + "grad_norm": 6.660259246826172, + "learning_rate": 1.6965206239574864e-05, + "loss": 1.6473, + "step": 72540 + }, + { + "epoch": 0.45599355763753857, + "grad_norm": 6.573643684387207, + "learning_rate": 1.696478713863021e-05, + "loss": 1.6939, + "step": 72550 + }, + { + "epoch": 0.45605640995423563, + "grad_norm": 7.359166145324707, + "learning_rate": 1.696436803768556e-05, + "loss": 1.7793, + "step": 72560 + }, + { + "epoch": 0.45611926227093275, + "grad_norm": 7.229355812072754, + "learning_rate": 1.6963948936740902e-05, + "loss": 1.7815, + "step": 
72570 + }, + { + "epoch": 0.45618211458762986, + "grad_norm": 7.012868404388428, + "learning_rate": 1.696352983579625e-05, + "loss": 1.6657, + "step": 72580 + }, + { + "epoch": 0.456244966904327, + "grad_norm": 6.879143238067627, + "learning_rate": 1.6963110734851596e-05, + "loss": 1.6057, + "step": 72590 + }, + { + "epoch": 0.4563078192210241, + "grad_norm": 7.133265018463135, + "learning_rate": 1.6962691633906943e-05, + "loss": 1.7353, + "step": 72600 + }, + { + "epoch": 0.4563706715377212, + "grad_norm": 6.666014194488525, + "learning_rate": 1.696227253296229e-05, + "loss": 1.744, + "step": 72610 + }, + { + "epoch": 0.4564335238544183, + "grad_norm": 7.25177001953125, + "learning_rate": 1.6961853432017638e-05, + "loss": 1.5536, + "step": 72620 + }, + { + "epoch": 0.45649637617111544, + "grad_norm": 6.873335361480713, + "learning_rate": 1.6961434331072985e-05, + "loss": 1.8091, + "step": 72630 + }, + { + "epoch": 0.45655922848781255, + "grad_norm": 5.979950904846191, + "learning_rate": 1.6961015230128332e-05, + "loss": 1.8286, + "step": 72640 + }, + { + "epoch": 0.45662208080450967, + "grad_norm": 7.808071613311768, + "learning_rate": 1.696059612918368e-05, + "loss": 1.8732, + "step": 72650 + }, + { + "epoch": 0.4566849331212068, + "grad_norm": 8.281644821166992, + "learning_rate": 1.6960177028239022e-05, + "loss": 1.8113, + "step": 72660 + }, + { + "epoch": 0.4567477854379039, + "grad_norm": 7.271381378173828, + "learning_rate": 1.695975792729437e-05, + "loss": 1.9359, + "step": 72670 + }, + { + "epoch": 0.456810637754601, + "grad_norm": 6.758279323577881, + "learning_rate": 1.6959338826349717e-05, + "loss": 1.5058, + "step": 72680 + }, + { + "epoch": 0.45687349007129807, + "grad_norm": 7.0523834228515625, + "learning_rate": 1.6958919725405064e-05, + "loss": 1.8413, + "step": 72690 + }, + { + "epoch": 0.4569363423879952, + "grad_norm": 6.535349369049072, + "learning_rate": 1.695850062446041e-05, + "loss": 1.7019, + "step": 72700 + }, + { + "epoch": 
0.4569991947046923, + "grad_norm": 6.913873195648193, + "learning_rate": 1.6958081523515754e-05, + "loss": 1.7541, + "step": 72710 + }, + { + "epoch": 0.4570620470213894, + "grad_norm": 6.172971725463867, + "learning_rate": 1.69576624225711e-05, + "loss": 1.8061, + "step": 72720 + }, + { + "epoch": 0.45712489933808653, + "grad_norm": 6.995718955993652, + "learning_rate": 1.695724332162645e-05, + "loss": 1.8055, + "step": 72730 + }, + { + "epoch": 0.45718775165478365, + "grad_norm": 5.739918231964111, + "learning_rate": 1.6956824220681796e-05, + "loss": 1.8473, + "step": 72740 + }, + { + "epoch": 0.45725060397148076, + "grad_norm": 8.108476638793945, + "learning_rate": 1.695640511973714e-05, + "loss": 1.8737, + "step": 72750 + }, + { + "epoch": 0.4573134562881779, + "grad_norm": 6.1044440269470215, + "learning_rate": 1.6955986018792486e-05, + "loss": 1.7364, + "step": 72760 + }, + { + "epoch": 0.457376308604875, + "grad_norm": 7.338126182556152, + "learning_rate": 1.6955566917847833e-05, + "loss": 1.8628, + "step": 72770 + }, + { + "epoch": 0.4574391609215721, + "grad_norm": 5.92439603805542, + "learning_rate": 1.695514781690318e-05, + "loss": 1.6609, + "step": 72780 + }, + { + "epoch": 0.4575020132382692, + "grad_norm": 5.798541069030762, + "learning_rate": 1.6954728715958528e-05, + "loss": 1.7788, + "step": 72790 + }, + { + "epoch": 0.45756486555496634, + "grad_norm": 6.529398441314697, + "learning_rate": 1.6954309615013875e-05, + "loss": 1.7826, + "step": 72800 + }, + { + "epoch": 0.45762771787166345, + "grad_norm": 7.10496187210083, + "learning_rate": 1.695389051406922e-05, + "loss": 1.5668, + "step": 72810 + }, + { + "epoch": 0.4576905701883605, + "grad_norm": 6.67068338394165, + "learning_rate": 1.6953471413124565e-05, + "loss": 1.6258, + "step": 72820 + }, + { + "epoch": 0.45775342250505763, + "grad_norm": 6.297972679138184, + "learning_rate": 1.6953094222274377e-05, + "loss": 1.8902, + "step": 72830 + }, + { + "epoch": 0.45781627482175474, + "grad_norm": 
5.803272247314453, + "learning_rate": 1.6952675121329724e-05, + "loss": 1.9535, + "step": 72840 + }, + { + "epoch": 0.45787912713845186, + "grad_norm": 8.457806587219238, + "learning_rate": 1.695225602038507e-05, + "loss": 1.8868, + "step": 72850 + }, + { + "epoch": 0.457941979455149, + "grad_norm": 6.783581733703613, + "learning_rate": 1.6951836919440418e-05, + "loss": 1.6544, + "step": 72860 + }, + { + "epoch": 0.4580048317718461, + "grad_norm": 6.673780918121338, + "learning_rate": 1.6951417818495765e-05, + "loss": 1.7468, + "step": 72870 + }, + { + "epoch": 0.4580676840885432, + "grad_norm": 7.6437668800354, + "learning_rate": 1.695099871755111e-05, + "loss": 1.8997, + "step": 72880 + }, + { + "epoch": 0.4581305364052403, + "grad_norm": 6.677247524261475, + "learning_rate": 1.6950579616606456e-05, + "loss": 1.9311, + "step": 72890 + }, + { + "epoch": 0.45819338872193743, + "grad_norm": 6.750226020812988, + "learning_rate": 1.6950160515661803e-05, + "loss": 1.7603, + "step": 72900 + }, + { + "epoch": 0.45825624103863455, + "grad_norm": 6.881356716156006, + "learning_rate": 1.694974141471715e-05, + "loss": 1.5952, + "step": 72910 + }, + { + "epoch": 0.45831909335533166, + "grad_norm": 6.134947776794434, + "learning_rate": 1.6949322313772497e-05, + "loss": 1.8821, + "step": 72920 + }, + { + "epoch": 0.4583819456720288, + "grad_norm": 6.7909135818481445, + "learning_rate": 1.6948903212827844e-05, + "loss": 1.7302, + "step": 72930 + }, + { + "epoch": 0.45844479798872584, + "grad_norm": 7.331709384918213, + "learning_rate": 1.694848411188319e-05, + "loss": 1.6622, + "step": 72940 + }, + { + "epoch": 0.45850765030542295, + "grad_norm": 7.265378475189209, + "learning_rate": 1.6948065010938538e-05, + "loss": 1.7583, + "step": 72950 + }, + { + "epoch": 0.45857050262212007, + "grad_norm": 5.8029279708862305, + "learning_rate": 1.6947645909993882e-05, + "loss": 1.6582, + "step": 72960 + }, + { + "epoch": 0.4586333549388172, + "grad_norm": 6.222194194793701, + 
"learning_rate": 1.694722680904923e-05, + "loss": 1.6333, + "step": 72970 + }, + { + "epoch": 0.4586962072555143, + "grad_norm": 7.174890995025635, + "learning_rate": 1.6946807708104576e-05, + "loss": 1.705, + "step": 72980 + }, + { + "epoch": 0.4587590595722114, + "grad_norm": 6.686669826507568, + "learning_rate": 1.6946388607159923e-05, + "loss": 1.7082, + "step": 72990 + }, + { + "epoch": 0.45882191188890853, + "grad_norm": 7.4787750244140625, + "learning_rate": 1.6945969506215267e-05, + "loss": 1.8003, + "step": 73000 + }, + { + "epoch": 0.45888476420560564, + "grad_norm": 6.632627964019775, + "learning_rate": 1.6945550405270614e-05, + "loss": 1.4592, + "step": 73010 + }, + { + "epoch": 0.45894761652230276, + "grad_norm": 7.123968124389648, + "learning_rate": 1.694513130432596e-05, + "loss": 1.9138, + "step": 73020 + }, + { + "epoch": 0.4590104688389999, + "grad_norm": 6.550754070281982, + "learning_rate": 1.6944712203381308e-05, + "loss": 1.5374, + "step": 73030 + }, + { + "epoch": 0.459073321155697, + "grad_norm": 7.170216083526611, + "learning_rate": 1.6944293102436655e-05, + "loss": 1.7027, + "step": 73040 + }, + { + "epoch": 0.4591361734723941, + "grad_norm": 7.3206048011779785, + "learning_rate": 1.6943874001492e-05, + "loss": 1.7429, + "step": 73050 + }, + { + "epoch": 0.4591990257890912, + "grad_norm": 9.7173433303833, + "learning_rate": 1.6943454900547346e-05, + "loss": 2.1381, + "step": 73060 + }, + { + "epoch": 0.4592618781057883, + "grad_norm": 6.854995250701904, + "learning_rate": 1.6943035799602693e-05, + "loss": 1.8794, + "step": 73070 + }, + { + "epoch": 0.4593247304224854, + "grad_norm": 9.510405540466309, + "learning_rate": 1.694261669865804e-05, + "loss": 1.5659, + "step": 73080 + }, + { + "epoch": 0.4593875827391825, + "grad_norm": 6.91724157333374, + "learning_rate": 1.6942197597713387e-05, + "loss": 1.8883, + "step": 73090 + }, + { + "epoch": 0.4594504350558796, + "grad_norm": 7.3841166496276855, + "learning_rate": 1.6941778496768734e-05, 
+ "loss": 1.7679, + "step": 73100 + }, + { + "epoch": 0.45951328737257674, + "grad_norm": 7.427146911621094, + "learning_rate": 1.694135939582408e-05, + "loss": 1.5073, + "step": 73110 + }, + { + "epoch": 0.45957613968927385, + "grad_norm": 6.2699761390686035, + "learning_rate": 1.6940940294879428e-05, + "loss": 1.5289, + "step": 73120 + }, + { + "epoch": 0.45963899200597097, + "grad_norm": 6.542108058929443, + "learning_rate": 1.6940521193934772e-05, + "loss": 1.8077, + "step": 73130 + }, + { + "epoch": 0.4597018443226681, + "grad_norm": 6.790388584136963, + "learning_rate": 1.694010209299012e-05, + "loss": 1.6832, + "step": 73140 + }, + { + "epoch": 0.4597646966393652, + "grad_norm": 7.237472057342529, + "learning_rate": 1.6939682992045466e-05, + "loss": 1.7433, + "step": 73150 + }, + { + "epoch": 0.4598275489560623, + "grad_norm": 7.028692722320557, + "learning_rate": 1.6939263891100813e-05, + "loss": 1.6186, + "step": 73160 + }, + { + "epoch": 0.45989040127275943, + "grad_norm": 7.17855978012085, + "learning_rate": 1.693884479015616e-05, + "loss": 1.8079, + "step": 73170 + }, + { + "epoch": 0.45995325358945655, + "grad_norm": 9.221105575561523, + "learning_rate": 1.6938425689211504e-05, + "loss": 1.598, + "step": 73180 + }, + { + "epoch": 0.46001610590615366, + "grad_norm": 7.384937286376953, + "learning_rate": 1.693800658826685e-05, + "loss": 1.7084, + "step": 73190 + }, + { + "epoch": 0.4600789582228507, + "grad_norm": 6.056758880615234, + "learning_rate": 1.6937587487322198e-05, + "loss": 1.7156, + "step": 73200 + }, + { + "epoch": 0.46014181053954784, + "grad_norm": 6.282230854034424, + "learning_rate": 1.6937168386377545e-05, + "loss": 1.5146, + "step": 73210 + }, + { + "epoch": 0.46020466285624495, + "grad_norm": 7.369024753570557, + "learning_rate": 1.693674928543289e-05, + "loss": 1.795, + "step": 73220 + }, + { + "epoch": 0.46026751517294207, + "grad_norm": 7.202399253845215, + "learning_rate": 1.6936330184488236e-05, + "loss": 1.867, + "step": 73230 + 
}, + { + "epoch": 0.4603303674896392, + "grad_norm": 6.399475574493408, + "learning_rate": 1.6935911083543583e-05, + "loss": 1.6532, + "step": 73240 + }, + { + "epoch": 0.4603932198063363, + "grad_norm": 7.484232425689697, + "learning_rate": 1.693549198259893e-05, + "loss": 1.7963, + "step": 73250 + }, + { + "epoch": 0.4604560721230334, + "grad_norm": 6.41881799697876, + "learning_rate": 1.6935072881654277e-05, + "loss": 1.9135, + "step": 73260 + }, + { + "epoch": 0.4605189244397305, + "grad_norm": 6.067859172821045, + "learning_rate": 1.693465378070962e-05, + "loss": 1.6958, + "step": 73270 + }, + { + "epoch": 0.46058177675642764, + "grad_norm": 7.959996700286865, + "learning_rate": 1.6934234679764968e-05, + "loss": 2.0227, + "step": 73280 + }, + { + "epoch": 0.46064462907312476, + "grad_norm": 5.5783610343933105, + "learning_rate": 1.6933815578820315e-05, + "loss": 1.5307, + "step": 73290 + }, + { + "epoch": 0.46070748138982187, + "grad_norm": 6.5437445640563965, + "learning_rate": 1.6933396477875662e-05, + "loss": 1.5558, + "step": 73300 + }, + { + "epoch": 0.460770333706519, + "grad_norm": 6.402663230895996, + "learning_rate": 1.693297737693101e-05, + "loss": 1.7255, + "step": 73310 + }, + { + "epoch": 0.4608331860232161, + "grad_norm": 6.267522811889648, + "learning_rate": 1.6932558275986356e-05, + "loss": 1.7829, + "step": 73320 + }, + { + "epoch": 0.46089603833991316, + "grad_norm": 6.446778297424316, + "learning_rate": 1.6932139175041703e-05, + "loss": 1.691, + "step": 73330 + }, + { + "epoch": 0.4609588906566103, + "grad_norm": 8.30638599395752, + "learning_rate": 1.693172007409705e-05, + "loss": 1.7292, + "step": 73340 + }, + { + "epoch": 0.4610217429733074, + "grad_norm": 8.48983383178711, + "learning_rate": 1.6931300973152397e-05, + "loss": 1.547, + "step": 73350 + }, + { + "epoch": 0.4610845952900045, + "grad_norm": 7.699466705322266, + "learning_rate": 1.693088187220774e-05, + "loss": 1.7697, + "step": 73360 + }, + { + "epoch": 0.4611474476067016, + 
"grad_norm": 7.0006208419799805, + "learning_rate": 1.6930462771263088e-05, + "loss": 1.5617, + "step": 73370 + }, + { + "epoch": 0.46121029992339874, + "grad_norm": 7.954681873321533, + "learning_rate": 1.6930043670318435e-05, + "loss": 1.8519, + "step": 73380 + }, + { + "epoch": 0.46127315224009585, + "grad_norm": 6.38613748550415, + "learning_rate": 1.6929624569373782e-05, + "loss": 1.6511, + "step": 73390 + }, + { + "epoch": 0.46133600455679297, + "grad_norm": 7.790059566497803, + "learning_rate": 1.6929205468429126e-05, + "loss": 1.7144, + "step": 73400 + }, + { + "epoch": 0.4613988568734901, + "grad_norm": 6.947166919708252, + "learning_rate": 1.6928786367484473e-05, + "loss": 1.8265, + "step": 73410 + }, + { + "epoch": 0.4614617091901872, + "grad_norm": 6.62131929397583, + "learning_rate": 1.692836726653982e-05, + "loss": 1.6418, + "step": 73420 + }, + { + "epoch": 0.4615245615068843, + "grad_norm": 6.732690334320068, + "learning_rate": 1.6927948165595167e-05, + "loss": 1.7439, + "step": 73430 + }, + { + "epoch": 0.4615874138235814, + "grad_norm": 7.452075004577637, + "learning_rate": 1.6927529064650514e-05, + "loss": 1.7972, + "step": 73440 + }, + { + "epoch": 0.4616502661402785, + "grad_norm": 6.767248630523682, + "learning_rate": 1.6927109963705858e-05, + "loss": 1.862, + "step": 73450 + }, + { + "epoch": 0.4617131184569756, + "grad_norm": 6.166742324829102, + "learning_rate": 1.6926690862761205e-05, + "loss": 1.5783, + "step": 73460 + }, + { + "epoch": 0.4617759707736727, + "grad_norm": 6.472208023071289, + "learning_rate": 1.6926271761816552e-05, + "loss": 1.7238, + "step": 73470 + }, + { + "epoch": 0.46183882309036983, + "grad_norm": 9.513326644897461, + "learning_rate": 1.69258526608719e-05, + "loss": 1.7613, + "step": 73480 + }, + { + "epoch": 0.46190167540706695, + "grad_norm": 7.1478657722473145, + "learning_rate": 1.6925433559927246e-05, + "loss": 1.7873, + "step": 73490 + }, + { + "epoch": 0.46196452772376406, + "grad_norm": 7.685591220855713, + 
"learning_rate": 1.692501445898259e-05, + "loss": 1.6279, + "step": 73500 + }, + { + "epoch": 0.4620273800404612, + "grad_norm": 7.648070812225342, + "learning_rate": 1.6924595358037937e-05, + "loss": 1.8225, + "step": 73510 + }, + { + "epoch": 0.4620902323571583, + "grad_norm": 7.248754024505615, + "learning_rate": 1.6924176257093284e-05, + "loss": 1.6426, + "step": 73520 + }, + { + "epoch": 0.4621530846738554, + "grad_norm": 7.591023921966553, + "learning_rate": 1.692375715614863e-05, + "loss": 1.8353, + "step": 73530 + }, + { + "epoch": 0.4622159369905525, + "grad_norm": 6.6562347412109375, + "learning_rate": 1.6923338055203978e-05, + "loss": 1.8618, + "step": 73540 + }, + { + "epoch": 0.46227878930724964, + "grad_norm": 6.6715617179870605, + "learning_rate": 1.6922918954259325e-05, + "loss": 1.8978, + "step": 73550 + }, + { + "epoch": 0.46234164162394675, + "grad_norm": 7.414534091949463, + "learning_rate": 1.6922499853314672e-05, + "loss": 1.6267, + "step": 73560 + }, + { + "epoch": 0.46240449394064387, + "grad_norm": 5.956836700439453, + "learning_rate": 1.692208075237002e-05, + "loss": 1.7615, + "step": 73570 + }, + { + "epoch": 0.4624673462573409, + "grad_norm": 7.848029136657715, + "learning_rate": 1.6921661651425363e-05, + "loss": 1.7014, + "step": 73580 + }, + { + "epoch": 0.46253019857403804, + "grad_norm": 6.871232986450195, + "learning_rate": 1.692124255048071e-05, + "loss": 1.7674, + "step": 73590 + }, + { + "epoch": 0.46259305089073516, + "grad_norm": 7.2657880783081055, + "learning_rate": 1.6920823449536057e-05, + "loss": 1.7561, + "step": 73600 + }, + { + "epoch": 0.4626559032074323, + "grad_norm": 7.009253978729248, + "learning_rate": 1.6920404348591404e-05, + "loss": 1.9371, + "step": 73610 + }, + { + "epoch": 0.4627187555241294, + "grad_norm": 8.488468170166016, + "learning_rate": 1.6919985247646748e-05, + "loss": 1.634, + "step": 73620 + }, + { + "epoch": 0.4627816078408265, + "grad_norm": 8.554583549499512, + "learning_rate": 
1.6919566146702095e-05, + "loss": 1.8174, + "step": 73630 + }, + { + "epoch": 0.4628444601575236, + "grad_norm": 6.670780658721924, + "learning_rate": 1.6919147045757442e-05, + "loss": 1.6491, + "step": 73640 + }, + { + "epoch": 0.46290731247422073, + "grad_norm": 6.9372477531433105, + "learning_rate": 1.691872794481279e-05, + "loss": 1.5245, + "step": 73650 + }, + { + "epoch": 0.46297016479091785, + "grad_norm": 7.845040321350098, + "learning_rate": 1.6918308843868136e-05, + "loss": 1.713, + "step": 73660 + }, + { + "epoch": 0.46303301710761496, + "grad_norm": 8.10694694519043, + "learning_rate": 1.691788974292348e-05, + "loss": 1.5741, + "step": 73670 + }, + { + "epoch": 0.4630958694243121, + "grad_norm": 7.517847537994385, + "learning_rate": 1.6917470641978827e-05, + "loss": 1.7044, + "step": 73680 + }, + { + "epoch": 0.4631587217410092, + "grad_norm": 6.285558700561523, + "learning_rate": 1.6917051541034174e-05, + "loss": 1.9023, + "step": 73690 + }, + { + "epoch": 0.4632215740577063, + "grad_norm": 7.671931743621826, + "learning_rate": 1.691663244008952e-05, + "loss": 1.6278, + "step": 73700 + }, + { + "epoch": 0.46328442637440337, + "grad_norm": 6.878096580505371, + "learning_rate": 1.6916213339144868e-05, + "loss": 1.6788, + "step": 73710 + }, + { + "epoch": 0.4633472786911005, + "grad_norm": 6.303915500640869, + "learning_rate": 1.6915794238200215e-05, + "loss": 1.6047, + "step": 73720 + }, + { + "epoch": 0.4634101310077976, + "grad_norm": 5.825152397155762, + "learning_rate": 1.6915375137255562e-05, + "loss": 1.4785, + "step": 73730 + }, + { + "epoch": 0.4634729833244947, + "grad_norm": 6.75026273727417, + "learning_rate": 1.691495603631091e-05, + "loss": 1.5219, + "step": 73740 + }, + { + "epoch": 0.46353583564119183, + "grad_norm": 6.23679780960083, + "learning_rate": 1.6914536935366253e-05, + "loss": 1.64, + "step": 73750 + }, + { + "epoch": 0.46359868795788894, + "grad_norm": 7.054623603820801, + "learning_rate": 1.69141178344216e-05, + "loss": 1.5494, 
+ "step": 73760 + }, + { + "epoch": 0.46366154027458606, + "grad_norm": 6.4822998046875, + "learning_rate": 1.6913698733476947e-05, + "loss": 1.7413, + "step": 73770 + }, + { + "epoch": 0.4637243925912832, + "grad_norm": 7.870069980621338, + "learning_rate": 1.6913279632532294e-05, + "loss": 1.7077, + "step": 73780 + }, + { + "epoch": 0.4637872449079803, + "grad_norm": 6.3491291999816895, + "learning_rate": 1.691286053158764e-05, + "loss": 1.6945, + "step": 73790 + }, + { + "epoch": 0.4638500972246774, + "grad_norm": 7.511515140533447, + "learning_rate": 1.6912441430642985e-05, + "loss": 1.8034, + "step": 73800 + }, + { + "epoch": 0.4639129495413745, + "grad_norm": 6.393674850463867, + "learning_rate": 1.6912022329698332e-05, + "loss": 1.7938, + "step": 73810 + }, + { + "epoch": 0.46397580185807163, + "grad_norm": 6.74291467666626, + "learning_rate": 1.691160322875368e-05, + "loss": 1.7669, + "step": 73820 + }, + { + "epoch": 0.46403865417476875, + "grad_norm": 6.482278347015381, + "learning_rate": 1.6911184127809026e-05, + "loss": 1.7606, + "step": 73830 + }, + { + "epoch": 0.4641015064914658, + "grad_norm": 6.72927188873291, + "learning_rate": 1.691076502686437e-05, + "loss": 1.4387, + "step": 73840 + }, + { + "epoch": 0.4641643588081629, + "grad_norm": 7.548161506652832, + "learning_rate": 1.6910345925919717e-05, + "loss": 1.5602, + "step": 73850 + }, + { + "epoch": 0.46422721112486004, + "grad_norm": 6.613569736480713, + "learning_rate": 1.6909926824975064e-05, + "loss": 1.7397, + "step": 73860 + }, + { + "epoch": 0.46429006344155715, + "grad_norm": 6.6580586433410645, + "learning_rate": 1.690950772403041e-05, + "loss": 1.7557, + "step": 73870 + }, + { + "epoch": 0.46435291575825427, + "grad_norm": 7.397098064422607, + "learning_rate": 1.6909088623085758e-05, + "loss": 1.6191, + "step": 73880 + }, + { + "epoch": 0.4644157680749514, + "grad_norm": 6.1506829261779785, + "learning_rate": 1.6908669522141102e-05, + "loss": 1.5727, + "step": 73890 + }, + { + "epoch": 
0.4644786203916485, + "grad_norm": 7.352633476257324, + "learning_rate": 1.690825042119645e-05, + "loss": 1.8109, + "step": 73900 + }, + { + "epoch": 0.4645414727083456, + "grad_norm": 6.859440803527832, + "learning_rate": 1.6907831320251796e-05, + "loss": 1.6813, + "step": 73910 + }, + { + "epoch": 0.46460432502504273, + "grad_norm": 7.155480861663818, + "learning_rate": 1.6907412219307143e-05, + "loss": 1.9739, + "step": 73920 + }, + { + "epoch": 0.46466717734173985, + "grad_norm": 7.0241546630859375, + "learning_rate": 1.690699311836249e-05, + "loss": 1.9009, + "step": 73930 + }, + { + "epoch": 0.46473002965843696, + "grad_norm": 6.739370822906494, + "learning_rate": 1.6906574017417837e-05, + "loss": 1.8424, + "step": 73940 + }, + { + "epoch": 0.4647928819751341, + "grad_norm": 6.187098979949951, + "learning_rate": 1.6906154916473184e-05, + "loss": 1.5841, + "step": 73950 + }, + { + "epoch": 0.46485573429183114, + "grad_norm": 6.150927543640137, + "learning_rate": 1.690573581552853e-05, + "loss": 1.619, + "step": 73960 + }, + { + "epoch": 0.46491858660852825, + "grad_norm": 6.579783916473389, + "learning_rate": 1.690531671458388e-05, + "loss": 1.7277, + "step": 73970 + }, + { + "epoch": 0.46498143892522537, + "grad_norm": 6.820524215698242, + "learning_rate": 1.6904897613639222e-05, + "loss": 1.5031, + "step": 73980 + }, + { + "epoch": 0.4650442912419225, + "grad_norm": 6.518802642822266, + "learning_rate": 1.690447851269457e-05, + "loss": 1.6477, + "step": 73990 + }, + { + "epoch": 0.4651071435586196, + "grad_norm": 7.290344715118408, + "learning_rate": 1.6904059411749916e-05, + "loss": 1.7982, + "step": 74000 + }, + { + "epoch": 0.4651699958753167, + "grad_norm": 7.121784210205078, + "learning_rate": 1.6903640310805263e-05, + "loss": 1.7743, + "step": 74010 + }, + { + "epoch": 0.4652328481920138, + "grad_norm": 6.875063896179199, + "learning_rate": 1.6903221209860607e-05, + "loss": 1.6827, + "step": 74020 + }, + { + "epoch": 0.46529570050871094, + "grad_norm": 
8.333395957946777, + "learning_rate": 1.6902802108915954e-05, + "loss": 1.6369, + "step": 74030 + }, + { + "epoch": 0.46535855282540806, + "grad_norm": 6.934069633483887, + "learning_rate": 1.69023830079713e-05, + "loss": 1.6542, + "step": 74040 + }, + { + "epoch": 0.46542140514210517, + "grad_norm": 7.221209526062012, + "learning_rate": 1.6901963907026648e-05, + "loss": 1.8351, + "step": 74050 + }, + { + "epoch": 0.4654842574588023, + "grad_norm": 6.288447380065918, + "learning_rate": 1.6901544806081995e-05, + "loss": 1.4954, + "step": 74060 + }, + { + "epoch": 0.4655471097754994, + "grad_norm": 6.521042823791504, + "learning_rate": 1.690112570513734e-05, + "loss": 1.6492, + "step": 74070 + }, + { + "epoch": 0.4656099620921965, + "grad_norm": 8.660316467285156, + "learning_rate": 1.6900706604192686e-05, + "loss": 1.6829, + "step": 74080 + }, + { + "epoch": 0.4656728144088936, + "grad_norm": 7.834723472595215, + "learning_rate": 1.6900287503248033e-05, + "loss": 1.6765, + "step": 74090 + }, + { + "epoch": 0.4657356667255907, + "grad_norm": 6.806433200836182, + "learning_rate": 1.689986840230338e-05, + "loss": 1.5411, + "step": 74100 + }, + { + "epoch": 0.4657985190422878, + "grad_norm": 7.692269802093506, + "learning_rate": 1.6899449301358727e-05, + "loss": 1.7579, + "step": 74110 + }, + { + "epoch": 0.4658613713589849, + "grad_norm": 8.500774383544922, + "learning_rate": 1.6899030200414074e-05, + "loss": 1.8493, + "step": 74120 + }, + { + "epoch": 0.46592422367568204, + "grad_norm": 8.580452919006348, + "learning_rate": 1.6898611099469418e-05, + "loss": 1.8775, + "step": 74130 + }, + { + "epoch": 0.46598707599237915, + "grad_norm": 6.9834113121032715, + "learning_rate": 1.6898191998524765e-05, + "loss": 1.5928, + "step": 74140 + }, + { + "epoch": 0.46604992830907627, + "grad_norm": 6.779479026794434, + "learning_rate": 1.6897772897580112e-05, + "loss": 1.7107, + "step": 74150 + }, + { + "epoch": 0.4661127806257734, + "grad_norm": 7.981831073760986, + 
"learning_rate": 1.689735379663546e-05, + "loss": 1.7173, + "step": 74160 + }, + { + "epoch": 0.4661756329424705, + "grad_norm": 7.5072245597839355, + "learning_rate": 1.6896934695690806e-05, + "loss": 1.8305, + "step": 74170 + }, + { + "epoch": 0.4662384852591676, + "grad_norm": 7.0078535079956055, + "learning_rate": 1.6896515594746153e-05, + "loss": 1.7212, + "step": 74180 + }, + { + "epoch": 0.4663013375758647, + "grad_norm": 6.888160705566406, + "learning_rate": 1.68960964938015e-05, + "loss": 1.6908, + "step": 74190 + }, + { + "epoch": 0.46636418989256184, + "grad_norm": 6.368884086608887, + "learning_rate": 1.6895677392856844e-05, + "loss": 1.6535, + "step": 74200 + }, + { + "epoch": 0.46642704220925896, + "grad_norm": 6.441951751708984, + "learning_rate": 1.689525829191219e-05, + "loss": 1.8026, + "step": 74210 + }, + { + "epoch": 0.466489894525956, + "grad_norm": 7.454028606414795, + "learning_rate": 1.6894839190967538e-05, + "loss": 1.5824, + "step": 74220 + }, + { + "epoch": 0.46655274684265313, + "grad_norm": 6.690650939941406, + "learning_rate": 1.6894420090022885e-05, + "loss": 1.6732, + "step": 74230 + }, + { + "epoch": 0.46661559915935025, + "grad_norm": 5.540317535400391, + "learning_rate": 1.689400098907823e-05, + "loss": 1.4798, + "step": 74240 + }, + { + "epoch": 0.46667845147604736, + "grad_norm": 7.390773773193359, + "learning_rate": 1.6893581888133576e-05, + "loss": 1.7011, + "step": 74250 + }, + { + "epoch": 0.4667413037927445, + "grad_norm": 5.674918174743652, + "learning_rate": 1.6893162787188923e-05, + "loss": 1.897, + "step": 74260 + }, + { + "epoch": 0.4668041561094416, + "grad_norm": 6.368553161621094, + "learning_rate": 1.689274368624427e-05, + "loss": 1.687, + "step": 74270 + }, + { + "epoch": 0.4668670084261387, + "grad_norm": 6.636124610900879, + "learning_rate": 1.6892324585299617e-05, + "loss": 1.8943, + "step": 74280 + }, + { + "epoch": 0.4669298607428358, + "grad_norm": 6.706605434417725, + "learning_rate": 
1.689190548435496e-05, + "loss": 1.6624, + "step": 74290 + }, + { + "epoch": 0.46699271305953294, + "grad_norm": 7.414637565612793, + "learning_rate": 1.6891486383410308e-05, + "loss": 1.8333, + "step": 74300 + }, + { + "epoch": 0.46705556537623005, + "grad_norm": 5.616257190704346, + "learning_rate": 1.6891067282465655e-05, + "loss": 1.7722, + "step": 74310 + }, + { + "epoch": 0.46711841769292717, + "grad_norm": 6.927336692810059, + "learning_rate": 1.6890648181521002e-05, + "loss": 1.5495, + "step": 74320 + }, + { + "epoch": 0.4671812700096243, + "grad_norm": 6.845500469207764, + "learning_rate": 1.689022908057635e-05, + "loss": 1.6143, + "step": 74330 + }, + { + "epoch": 0.4672441223263214, + "grad_norm": 7.378794193267822, + "learning_rate": 1.6889809979631696e-05, + "loss": 1.6623, + "step": 74340 + }, + { + "epoch": 0.46730697464301846, + "grad_norm": 6.336019515991211, + "learning_rate": 1.6889390878687043e-05, + "loss": 1.7102, + "step": 74350 + }, + { + "epoch": 0.4673698269597156, + "grad_norm": 6.820666313171387, + "learning_rate": 1.688897177774239e-05, + "loss": 1.8539, + "step": 74360 + }, + { + "epoch": 0.4674326792764127, + "grad_norm": 5.6884870529174805, + "learning_rate": 1.6888552676797738e-05, + "loss": 1.6753, + "step": 74370 + }, + { + "epoch": 0.4674955315931098, + "grad_norm": 7.648561477661133, + "learning_rate": 1.688813357585308e-05, + "loss": 1.6738, + "step": 74380 + }, + { + "epoch": 0.4675583839098069, + "grad_norm": 6.930350303649902, + "learning_rate": 1.688771447490843e-05, + "loss": 1.7851, + "step": 74390 + }, + { + "epoch": 0.46762123622650403, + "grad_norm": 7.078684329986572, + "learning_rate": 1.6887295373963775e-05, + "loss": 1.558, + "step": 74400 + }, + { + "epoch": 0.46768408854320115, + "grad_norm": 8.631214141845703, + "learning_rate": 1.6886876273019122e-05, + "loss": 1.8714, + "step": 74410 + }, + { + "epoch": 0.46774694085989826, + "grad_norm": 7.361252307891846, + "learning_rate": 1.6886457172074466e-05, + "loss": 
1.6751, + "step": 74420 + }, + { + "epoch": 0.4678097931765954, + "grad_norm": 5.998310089111328, + "learning_rate": 1.6886038071129813e-05, + "loss": 1.5445, + "step": 74430 + }, + { + "epoch": 0.4678726454932925, + "grad_norm": 6.633802890777588, + "learning_rate": 1.688561897018516e-05, + "loss": 1.5037, + "step": 74440 + }, + { + "epoch": 0.4679354978099896, + "grad_norm": 7.296174049377441, + "learning_rate": 1.6885199869240507e-05, + "loss": 1.7964, + "step": 74450 + }, + { + "epoch": 0.4679983501266867, + "grad_norm": 7.240650653839111, + "learning_rate": 1.6884780768295854e-05, + "loss": 1.8011, + "step": 74460 + }, + { + "epoch": 0.46806120244338384, + "grad_norm": 6.495932102203369, + "learning_rate": 1.6884361667351198e-05, + "loss": 1.7533, + "step": 74470 + }, + { + "epoch": 0.4681240547600809, + "grad_norm": 7.80958366394043, + "learning_rate": 1.6883942566406545e-05, + "loss": 2.043, + "step": 74480 + }, + { + "epoch": 0.468186907076778, + "grad_norm": 6.533959865570068, + "learning_rate": 1.6883523465461892e-05, + "loss": 1.6634, + "step": 74490 + }, + { + "epoch": 0.46824975939347513, + "grad_norm": 6.347193241119385, + "learning_rate": 1.688310436451724e-05, + "loss": 1.4688, + "step": 74500 + }, + { + "epoch": 0.46831261171017224, + "grad_norm": 6.545567512512207, + "learning_rate": 1.6882685263572583e-05, + "loss": 1.7698, + "step": 74510 + }, + { + "epoch": 0.46837546402686936, + "grad_norm": 6.875192165374756, + "learning_rate": 1.688226616262793e-05, + "loss": 1.7476, + "step": 74520 + }, + { + "epoch": 0.4684383163435665, + "grad_norm": 6.761386871337891, + "learning_rate": 1.6881847061683277e-05, + "loss": 1.6175, + "step": 74530 + }, + { + "epoch": 0.4685011686602636, + "grad_norm": 7.0894365310668945, + "learning_rate": 1.6881427960738624e-05, + "loss": 1.7662, + "step": 74540 + }, + { + "epoch": 0.4685640209769607, + "grad_norm": 6.652477264404297, + "learning_rate": 1.688100885979397e-05, + "loss": 1.6835, + "step": 74550 + }, + { + 
"epoch": 0.4686268732936578, + "grad_norm": 6.749616622924805, + "learning_rate": 1.688058975884932e-05, + "loss": 1.7092, + "step": 74560 + }, + { + "epoch": 0.46868972561035493, + "grad_norm": 6.41374397277832, + "learning_rate": 1.6880170657904665e-05, + "loss": 1.7461, + "step": 74570 + }, + { + "epoch": 0.46875257792705205, + "grad_norm": 6.185507774353027, + "learning_rate": 1.6879751556960013e-05, + "loss": 1.5894, + "step": 74580 + }, + { + "epoch": 0.46881543024374916, + "grad_norm": 6.641927242279053, + "learning_rate": 1.687933245601536e-05, + "loss": 1.7548, + "step": 74590 + }, + { + "epoch": 0.4688782825604462, + "grad_norm": 7.611270904541016, + "learning_rate": 1.6878913355070703e-05, + "loss": 2.065, + "step": 74600 + }, + { + "epoch": 0.46894113487714334, + "grad_norm": 6.557180404663086, + "learning_rate": 1.687849425412605e-05, + "loss": 1.568, + "step": 74610 + }, + { + "epoch": 0.46900398719384045, + "grad_norm": 7.316890239715576, + "learning_rate": 1.6878075153181397e-05, + "loss": 1.592, + "step": 74620 + }, + { + "epoch": 0.46906683951053757, + "grad_norm": 7.576214790344238, + "learning_rate": 1.6877656052236744e-05, + "loss": 1.7492, + "step": 74630 + }, + { + "epoch": 0.4691296918272347, + "grad_norm": 6.319741725921631, + "learning_rate": 1.6877236951292088e-05, + "loss": 1.6616, + "step": 74640 + }, + { + "epoch": 0.4691925441439318, + "grad_norm": 7.309403419494629, + "learning_rate": 1.6876817850347435e-05, + "loss": 1.7179, + "step": 74650 + }, + { + "epoch": 0.4692553964606289, + "grad_norm": 8.033734321594238, + "learning_rate": 1.6876398749402782e-05, + "loss": 1.9204, + "step": 74660 + }, + { + "epoch": 0.46931824877732603, + "grad_norm": 6.8274054527282715, + "learning_rate": 1.687597964845813e-05, + "loss": 1.702, + "step": 74670 + }, + { + "epoch": 0.46938110109402315, + "grad_norm": 7.155275821685791, + "learning_rate": 1.6875560547513476e-05, + "loss": 1.7586, + "step": 74680 + }, + { + "epoch": 0.46944395341072026, + 
"grad_norm": 5.7840471267700195, + "learning_rate": 1.687514144656882e-05, + "loss": 1.8591, + "step": 74690 + }, + { + "epoch": 0.4695068057274174, + "grad_norm": 7.553875923156738, + "learning_rate": 1.6874722345624167e-05, + "loss": 1.6966, + "step": 74700 + }, + { + "epoch": 0.4695696580441145, + "grad_norm": 6.658247947692871, + "learning_rate": 1.6874303244679514e-05, + "loss": 1.8337, + "step": 74710 + }, + { + "epoch": 0.4696325103608116, + "grad_norm": 6.50600004196167, + "learning_rate": 1.687388414373486e-05, + "loss": 1.4966, + "step": 74720 + }, + { + "epoch": 0.46969536267750867, + "grad_norm": 6.239086151123047, + "learning_rate": 1.687346504279021e-05, + "loss": 1.6827, + "step": 74730 + }, + { + "epoch": 0.4697582149942058, + "grad_norm": 6.735509395599365, + "learning_rate": 1.6873045941845555e-05, + "loss": 1.6745, + "step": 74740 + }, + { + "epoch": 0.4698210673109029, + "grad_norm": 7.19131326675415, + "learning_rate": 1.6872626840900903e-05, + "loss": 1.7236, + "step": 74750 + }, + { + "epoch": 0.4698839196276, + "grad_norm": 6.133017539978027, + "learning_rate": 1.6872207739956246e-05, + "loss": 1.4435, + "step": 74760 + }, + { + "epoch": 0.4699467719442971, + "grad_norm": 6.491494178771973, + "learning_rate": 1.6871788639011593e-05, + "loss": 1.7601, + "step": 74770 + }, + { + "epoch": 0.47000962426099424, + "grad_norm": 7.579554080963135, + "learning_rate": 1.687136953806694e-05, + "loss": 1.7644, + "step": 74780 + }, + { + "epoch": 0.47007247657769136, + "grad_norm": 7.176877021789551, + "learning_rate": 1.6870950437122287e-05, + "loss": 1.7589, + "step": 74790 + }, + { + "epoch": 0.47013532889438847, + "grad_norm": 7.731554985046387, + "learning_rate": 1.6870531336177635e-05, + "loss": 1.8631, + "step": 74800 + }, + { + "epoch": 0.4701981812110856, + "grad_norm": 6.453170299530029, + "learning_rate": 1.687011223523298e-05, + "loss": 1.8514, + "step": 74810 + }, + { + "epoch": 0.4702610335277827, + "grad_norm": 8.215258598327637, + 
"learning_rate": 1.6869693134288325e-05, + "loss": 1.9341, + "step": 74820 + }, + { + "epoch": 0.4703238858444798, + "grad_norm": 6.7729105949401855, + "learning_rate": 1.6869274033343672e-05, + "loss": 1.6897, + "step": 74830 + }, + { + "epoch": 0.47038673816117693, + "grad_norm": 6.5331621170043945, + "learning_rate": 1.686885493239902e-05, + "loss": 1.6208, + "step": 74840 + }, + { + "epoch": 0.47044959047787405, + "grad_norm": 7.255948543548584, + "learning_rate": 1.6868435831454366e-05, + "loss": 1.7441, + "step": 74850 + }, + { + "epoch": 0.4705124427945711, + "grad_norm": 6.3399553298950195, + "learning_rate": 1.686801673050971e-05, + "loss": 1.786, + "step": 74860 + }, + { + "epoch": 0.4705752951112682, + "grad_norm": 6.474696636199951, + "learning_rate": 1.6867597629565057e-05, + "loss": 1.7946, + "step": 74870 + }, + { + "epoch": 0.47063814742796534, + "grad_norm": 7.178191661834717, + "learning_rate": 1.6867178528620404e-05, + "loss": 1.8623, + "step": 74880 + }, + { + "epoch": 0.47070099974466245, + "grad_norm": 6.79795503616333, + "learning_rate": 1.686675942767575e-05, + "loss": 1.8388, + "step": 74890 + }, + { + "epoch": 0.47076385206135957, + "grad_norm": 6.400870323181152, + "learning_rate": 1.68663403267311e-05, + "loss": 1.5224, + "step": 74900 + }, + { + "epoch": 0.4708267043780567, + "grad_norm": 6.47348165512085, + "learning_rate": 1.6865921225786442e-05, + "loss": 1.5566, + "step": 74910 + }, + { + "epoch": 0.4708895566947538, + "grad_norm": 6.684679985046387, + "learning_rate": 1.686550212484179e-05, + "loss": 1.6205, + "step": 74920 + }, + { + "epoch": 0.4709524090114509, + "grad_norm": 7.984795093536377, + "learning_rate": 1.6865083023897136e-05, + "loss": 1.5545, + "step": 74930 + }, + { + "epoch": 0.471015261328148, + "grad_norm": 7.289670467376709, + "learning_rate": 1.6864663922952483e-05, + "loss": 1.8546, + "step": 74940 + }, + { + "epoch": 0.47107811364484514, + "grad_norm": 6.218448638916016, + "learning_rate": 
1.686424482200783e-05, + "loss": 1.7009, + "step": 74950 + }, + { + "epoch": 0.47114096596154226, + "grad_norm": 6.013876438140869, + "learning_rate": 1.6863825721063177e-05, + "loss": 1.669, + "step": 74960 + }, + { + "epoch": 0.4712038182782394, + "grad_norm": 6.878289699554443, + "learning_rate": 1.6863406620118525e-05, + "loss": 1.7634, + "step": 74970 + }, + { + "epoch": 0.4712666705949365, + "grad_norm": 6.458973407745361, + "learning_rate": 1.686298751917387e-05, + "loss": 1.7187, + "step": 74980 + }, + { + "epoch": 0.47132952291163355, + "grad_norm": 6.173418998718262, + "learning_rate": 1.686256841822922e-05, + "loss": 1.9044, + "step": 74990 + }, + { + "epoch": 0.47139237522833066, + "grad_norm": 6.662987232208252, + "learning_rate": 1.6862149317284562e-05, + "loss": 1.6913, + "step": 75000 + }, + { + "epoch": 0.4714552275450278, + "grad_norm": 7.989409923553467, + "learning_rate": 1.686173021633991e-05, + "loss": 1.7866, + "step": 75010 + }, + { + "epoch": 0.4715180798617249, + "grad_norm": 6.370418548583984, + "learning_rate": 1.6861311115395257e-05, + "loss": 1.6304, + "step": 75020 + }, + { + "epoch": 0.471580932178422, + "grad_norm": 6.886153221130371, + "learning_rate": 1.6860892014450604e-05, + "loss": 1.7567, + "step": 75030 + }, + { + "epoch": 0.4716437844951191, + "grad_norm": 6.359077453613281, + "learning_rate": 1.6860472913505947e-05, + "loss": 1.9381, + "step": 75040 + }, + { + "epoch": 0.47170663681181624, + "grad_norm": 6.715882778167725, + "learning_rate": 1.6860053812561294e-05, + "loss": 1.8876, + "step": 75050 + }, + { + "epoch": 0.47176948912851335, + "grad_norm": 6.847312927246094, + "learning_rate": 1.685963471161664e-05, + "loss": 1.6414, + "step": 75060 + }, + { + "epoch": 0.47183234144521047, + "grad_norm": 5.654614448547363, + "learning_rate": 1.685921561067199e-05, + "loss": 1.9713, + "step": 75070 + }, + { + "epoch": 0.4718951937619076, + "grad_norm": 6.608804702758789, + "learning_rate": 1.6858796509727336e-05, + "loss": 
1.6387, + "step": 75080 + }, + { + "epoch": 0.4719580460786047, + "grad_norm": 7.266916275024414, + "learning_rate": 1.685837740878268e-05, + "loss": 1.8478, + "step": 75090 + }, + { + "epoch": 0.4720208983953018, + "grad_norm": 7.289012908935547, + "learning_rate": 1.6857958307838026e-05, + "loss": 1.8796, + "step": 75100 + }, + { + "epoch": 0.4720837507119989, + "grad_norm": 6.832249641418457, + "learning_rate": 1.6857539206893373e-05, + "loss": 1.7268, + "step": 75110 + }, + { + "epoch": 0.472146603028696, + "grad_norm": 7.332949161529541, + "learning_rate": 1.685712010594872e-05, + "loss": 1.9223, + "step": 75120 + }, + { + "epoch": 0.4722094553453931, + "grad_norm": 6.668570041656494, + "learning_rate": 1.6856701005004068e-05, + "loss": 1.7615, + "step": 75130 + }, + { + "epoch": 0.4722723076620902, + "grad_norm": 6.870800495147705, + "learning_rate": 1.685628190405941e-05, + "loss": 1.5939, + "step": 75140 + }, + { + "epoch": 0.47233515997878733, + "grad_norm": 6.370723247528076, + "learning_rate": 1.6855862803114758e-05, + "loss": 1.7346, + "step": 75150 + }, + { + "epoch": 0.47239801229548445, + "grad_norm": 7.461625576019287, + "learning_rate": 1.6855443702170105e-05, + "loss": 1.638, + "step": 75160 + }, + { + "epoch": 0.47246086461218156, + "grad_norm": 7.121856689453125, + "learning_rate": 1.6855024601225452e-05, + "loss": 1.7512, + "step": 75170 + }, + { + "epoch": 0.4725237169288787, + "grad_norm": 7.248109340667725, + "learning_rate": 1.68546055002808e-05, + "loss": 1.6313, + "step": 75180 + }, + { + "epoch": 0.4725865692455758, + "grad_norm": 6.1002397537231445, + "learning_rate": 1.6854186399336147e-05, + "loss": 1.6249, + "step": 75190 + }, + { + "epoch": 0.4726494215622729, + "grad_norm": 6.94600248336792, + "learning_rate": 1.6853767298391494e-05, + "loss": 1.8185, + "step": 75200 + }, + { + "epoch": 0.47271227387897, + "grad_norm": 6.894454479217529, + "learning_rate": 1.685334819744684e-05, + "loss": 1.899, + "step": 75210 + }, + { + "epoch": 
0.47277512619566714, + "grad_norm": 5.67385196685791, + "learning_rate": 1.6852929096502184e-05, + "loss": 1.6894, + "step": 75220 + }, + { + "epoch": 0.47283797851236425, + "grad_norm": 7.093928337097168, + "learning_rate": 1.685250999555753e-05, + "loss": 1.5867, + "step": 75230 + }, + { + "epoch": 0.4729008308290613, + "grad_norm": 7.607110500335693, + "learning_rate": 1.685209089461288e-05, + "loss": 1.6814, + "step": 75240 + }, + { + "epoch": 0.47296368314575843, + "grad_norm": 7.096514701843262, + "learning_rate": 1.6851671793668226e-05, + "loss": 1.728, + "step": 75250 + }, + { + "epoch": 0.47302653546245554, + "grad_norm": 7.261518955230713, + "learning_rate": 1.685125269272357e-05, + "loss": 1.7906, + "step": 75260 + }, + { + "epoch": 0.47308938777915266, + "grad_norm": 6.688602447509766, + "learning_rate": 1.6850833591778916e-05, + "loss": 1.5978, + "step": 75270 + }, + { + "epoch": 0.4731522400958498, + "grad_norm": 6.605575084686279, + "learning_rate": 1.6850414490834263e-05, + "loss": 1.8219, + "step": 75280 + }, + { + "epoch": 0.4732150924125469, + "grad_norm": 7.412301540374756, + "learning_rate": 1.684999538988961e-05, + "loss": 1.8662, + "step": 75290 + }, + { + "epoch": 0.473277944729244, + "grad_norm": 6.660956859588623, + "learning_rate": 1.6849576288944958e-05, + "loss": 1.6767, + "step": 75300 + }, + { + "epoch": 0.4733407970459411, + "grad_norm": 6.37428092956543, + "learning_rate": 1.68491571880003e-05, + "loss": 1.7547, + "step": 75310 + }, + { + "epoch": 0.47340364936263823, + "grad_norm": 5.873388290405273, + "learning_rate": 1.6848738087055648e-05, + "loss": 1.5674, + "step": 75320 + }, + { + "epoch": 0.47346650167933535, + "grad_norm": 7.413193702697754, + "learning_rate": 1.6848318986110995e-05, + "loss": 1.5481, + "step": 75330 + }, + { + "epoch": 0.47352935399603246, + "grad_norm": 4.985287666320801, + "learning_rate": 1.6847899885166342e-05, + "loss": 1.6463, + "step": 75340 + }, + { + "epoch": 0.4735922063127296, + "grad_norm": 
7.10650110244751, + "learning_rate": 1.684748078422169e-05, + "loss": 2.0231, + "step": 75350 + }, + { + "epoch": 0.4736550586294267, + "grad_norm": 6.295893669128418, + "learning_rate": 1.6847061683277037e-05, + "loss": 1.6833, + "step": 75360 + }, + { + "epoch": 0.47371791094612375, + "grad_norm": 6.590880870819092, + "learning_rate": 1.6846642582332384e-05, + "loss": 1.5646, + "step": 75370 + }, + { + "epoch": 0.47378076326282087, + "grad_norm": 6.749689102172852, + "learning_rate": 1.6846223481387727e-05, + "loss": 1.7946, + "step": 75380 + }, + { + "epoch": 0.473843615579518, + "grad_norm": 5.869183540344238, + "learning_rate": 1.6845804380443074e-05, + "loss": 1.6161, + "step": 75390 + }, + { + "epoch": 0.4739064678962151, + "grad_norm": 6.468136787414551, + "learning_rate": 1.684538527949842e-05, + "loss": 1.6574, + "step": 75400 + }, + { + "epoch": 0.4739693202129122, + "grad_norm": 6.457572937011719, + "learning_rate": 1.684496617855377e-05, + "loss": 1.5288, + "step": 75410 + }, + { + "epoch": 0.47403217252960933, + "grad_norm": 7.331090450286865, + "learning_rate": 1.6844547077609116e-05, + "loss": 1.6546, + "step": 75420 + }, + { + "epoch": 0.47409502484630645, + "grad_norm": 6.027551174163818, + "learning_rate": 1.6844127976664463e-05, + "loss": 1.609, + "step": 75430 + }, + { + "epoch": 0.47415787716300356, + "grad_norm": 6.717035293579102, + "learning_rate": 1.6843708875719806e-05, + "loss": 1.6238, + "step": 75440 + }, + { + "epoch": 0.4742207294797007, + "grad_norm": 7.718981742858887, + "learning_rate": 1.6843289774775153e-05, + "loss": 1.6955, + "step": 75450 + }, + { + "epoch": 0.4742835817963978, + "grad_norm": 5.82558012008667, + "learning_rate": 1.68428706738305e-05, + "loss": 1.5148, + "step": 75460 + }, + { + "epoch": 0.4743464341130949, + "grad_norm": 6.919063091278076, + "learning_rate": 1.6842451572885848e-05, + "loss": 1.6523, + "step": 75470 + }, + { + "epoch": 0.474409286429792, + "grad_norm": 6.56036376953125, + "learning_rate": 
1.684203247194119e-05, + "loss": 1.6036, + "step": 75480 + }, + { + "epoch": 0.47447213874648914, + "grad_norm": 7.16939640045166, + "learning_rate": 1.684161337099654e-05, + "loss": 1.9083, + "step": 75490 + }, + { + "epoch": 0.4745349910631862, + "grad_norm": 6.926846027374268, + "learning_rate": 1.6841194270051885e-05, + "loss": 1.7733, + "step": 75500 + }, + { + "epoch": 0.4745978433798833, + "grad_norm": 6.7428741455078125, + "learning_rate": 1.6840775169107232e-05, + "loss": 1.6344, + "step": 75510 + }, + { + "epoch": 0.4746606956965804, + "grad_norm": 6.791428089141846, + "learning_rate": 1.684035606816258e-05, + "loss": 1.697, + "step": 75520 + }, + { + "epoch": 0.47472354801327754, + "grad_norm": 6.997214317321777, + "learning_rate": 1.6839936967217923e-05, + "loss": 1.8743, + "step": 75530 + }, + { + "epoch": 0.47478640032997466, + "grad_norm": 8.502320289611816, + "learning_rate": 1.683951786627327e-05, + "loss": 1.7792, + "step": 75540 + }, + { + "epoch": 0.47484925264667177, + "grad_norm": 6.98447322845459, + "learning_rate": 1.6839098765328617e-05, + "loss": 1.8488, + "step": 75550 + }, + { + "epoch": 0.4749121049633689, + "grad_norm": 6.466368675231934, + "learning_rate": 1.6838679664383964e-05, + "loss": 1.7191, + "step": 75560 + }, + { + "epoch": 0.474974957280066, + "grad_norm": 6.1501383781433105, + "learning_rate": 1.683826056343931e-05, + "loss": 1.8504, + "step": 75570 + }, + { + "epoch": 0.4750378095967631, + "grad_norm": 8.074056625366211, + "learning_rate": 1.683784146249466e-05, + "loss": 1.7625, + "step": 75580 + }, + { + "epoch": 0.47510066191346023, + "grad_norm": 6.821253776550293, + "learning_rate": 1.683746427164447e-05, + "loss": 1.598, + "step": 75590 + }, + { + "epoch": 0.47516351423015735, + "grad_norm": 6.949539661407471, + "learning_rate": 1.6837045170699814e-05, + "loss": 1.6408, + "step": 75600 + }, + { + "epoch": 0.47522636654685446, + "grad_norm": 6.667018890380859, + "learning_rate": 1.683662606975516e-05, + "loss": 
1.7095, + "step": 75610 + }, + { + "epoch": 0.4752892188635515, + "grad_norm": 6.688776969909668, + "learning_rate": 1.6836206968810508e-05, + "loss": 2.0056, + "step": 75620 + }, + { + "epoch": 0.47535207118024864, + "grad_norm": 7.515380382537842, + "learning_rate": 1.6835787867865855e-05, + "loss": 1.4865, + "step": 75630 + }, + { + "epoch": 0.47541492349694575, + "grad_norm": 6.006285667419434, + "learning_rate": 1.6835368766921202e-05, + "loss": 1.7319, + "step": 75640 + }, + { + "epoch": 0.47547777581364287, + "grad_norm": 8.443488121032715, + "learning_rate": 1.683494966597655e-05, + "loss": 1.6396, + "step": 75650 + }, + { + "epoch": 0.47554062813034, + "grad_norm": 8.113880157470703, + "learning_rate": 1.6834530565031896e-05, + "loss": 1.5713, + "step": 75660 + }, + { + "epoch": 0.4756034804470371, + "grad_norm": 5.771048069000244, + "learning_rate": 1.6834111464087243e-05, + "loss": 1.7873, + "step": 75670 + }, + { + "epoch": 0.4756663327637342, + "grad_norm": 7.612918376922607, + "learning_rate": 1.683369236314259e-05, + "loss": 1.5751, + "step": 75680 + }, + { + "epoch": 0.4757291850804313, + "grad_norm": 7.876788139343262, + "learning_rate": 1.6833273262197934e-05, + "loss": 1.8266, + "step": 75690 + }, + { + "epoch": 0.47579203739712844, + "grad_norm": 6.927786827087402, + "learning_rate": 1.683285416125328e-05, + "loss": 2.0776, + "step": 75700 + }, + { + "epoch": 0.47585488971382556, + "grad_norm": 6.485779285430908, + "learning_rate": 1.6832435060308628e-05, + "loss": 1.7823, + "step": 75710 + }, + { + "epoch": 0.4759177420305227, + "grad_norm": 6.72318696975708, + "learning_rate": 1.6832015959363975e-05, + "loss": 1.9249, + "step": 75720 + }, + { + "epoch": 0.4759805943472198, + "grad_norm": 6.445530414581299, + "learning_rate": 1.6831596858419322e-05, + "loss": 1.7381, + "step": 75730 + }, + { + "epoch": 0.4760434466639169, + "grad_norm": 6.378854274749756, + "learning_rate": 1.6831177757474666e-05, + "loss": 1.7287, + "step": 75740 + }, + { + 
"epoch": 0.47610629898061396, + "grad_norm": 6.627330303192139, + "learning_rate": 1.6830758656530013e-05, + "loss": 1.6475, + "step": 75750 + }, + { + "epoch": 0.4761691512973111, + "grad_norm": 6.085504531860352, + "learning_rate": 1.683033955558536e-05, + "loss": 1.6755, + "step": 75760 + }, + { + "epoch": 0.4762320036140082, + "grad_norm": 6.829914093017578, + "learning_rate": 1.6829920454640707e-05, + "loss": 1.7258, + "step": 75770 + }, + { + "epoch": 0.4762948559307053, + "grad_norm": 6.772364616394043, + "learning_rate": 1.682950135369605e-05, + "loss": 1.504, + "step": 75780 + }, + { + "epoch": 0.4763577082474024, + "grad_norm": 5.8424577713012695, + "learning_rate": 1.6829082252751398e-05, + "loss": 1.4472, + "step": 75790 + }, + { + "epoch": 0.47642056056409954, + "grad_norm": 6.815469264984131, + "learning_rate": 1.6828663151806745e-05, + "loss": 1.6529, + "step": 75800 + }, + { + "epoch": 0.47648341288079665, + "grad_norm": 7.46605110168457, + "learning_rate": 1.6828244050862092e-05, + "loss": 1.8595, + "step": 75810 + }, + { + "epoch": 0.47654626519749377, + "grad_norm": 6.85030460357666, + "learning_rate": 1.682782494991744e-05, + "loss": 1.8455, + "step": 75820 + }, + { + "epoch": 0.4766091175141909, + "grad_norm": 6.598263740539551, + "learning_rate": 1.6827405848972783e-05, + "loss": 1.471, + "step": 75830 + }, + { + "epoch": 0.476671969830888, + "grad_norm": 6.376602649688721, + "learning_rate": 1.682698674802813e-05, + "loss": 1.5548, + "step": 75840 + }, + { + "epoch": 0.4767348221475851, + "grad_norm": 5.831735134124756, + "learning_rate": 1.6826567647083477e-05, + "loss": 1.9294, + "step": 75850 + }, + { + "epoch": 0.47679767446428223, + "grad_norm": 7.579250335693359, + "learning_rate": 1.6826148546138824e-05, + "loss": 1.7143, + "step": 75860 + }, + { + "epoch": 0.47686052678097934, + "grad_norm": 6.94968318939209, + "learning_rate": 1.682572944519417e-05, + "loss": 1.694, + "step": 75870 + }, + { + "epoch": 0.4769233790976764, + 
"grad_norm": 6.045395851135254, + "learning_rate": 1.6825310344249518e-05, + "loss": 1.7431, + "step": 75880 + }, + { + "epoch": 0.4769862314143735, + "grad_norm": 15.732362747192383, + "learning_rate": 1.6824891243304865e-05, + "loss": 1.7184, + "step": 75890 + }, + { + "epoch": 0.47704908373107063, + "grad_norm": 6.817939758300781, + "learning_rate": 1.6824472142360212e-05, + "loss": 1.6804, + "step": 75900 + }, + { + "epoch": 0.47711193604776775, + "grad_norm": 5.561984062194824, + "learning_rate": 1.6824053041415556e-05, + "loss": 1.7968, + "step": 75910 + }, + { + "epoch": 0.47717478836446486, + "grad_norm": 7.767377853393555, + "learning_rate": 1.6823633940470903e-05, + "loss": 2.0454, + "step": 75920 + }, + { + "epoch": 0.477237640681162, + "grad_norm": 6.020562648773193, + "learning_rate": 1.682321483952625e-05, + "loss": 1.7706, + "step": 75930 + }, + { + "epoch": 0.4773004929978591, + "grad_norm": 7.518442153930664, + "learning_rate": 1.6822795738581597e-05, + "loss": 1.729, + "step": 75940 + }, + { + "epoch": 0.4773633453145562, + "grad_norm": 7.066908836364746, + "learning_rate": 1.6822376637636944e-05, + "loss": 1.8053, + "step": 75950 + }, + { + "epoch": 0.4774261976312533, + "grad_norm": 7.802619934082031, + "learning_rate": 1.6821957536692288e-05, + "loss": 1.6715, + "step": 75960 + }, + { + "epoch": 0.47748904994795044, + "grad_norm": 8.189960479736328, + "learning_rate": 1.6821538435747635e-05, + "loss": 1.8093, + "step": 75970 + }, + { + "epoch": 0.47755190226464755, + "grad_norm": 5.803287982940674, + "learning_rate": 1.6821119334802982e-05, + "loss": 1.5923, + "step": 75980 + }, + { + "epoch": 0.47761475458134467, + "grad_norm": 6.04809045791626, + "learning_rate": 1.682070023385833e-05, + "loss": 1.4471, + "step": 75990 + }, + { + "epoch": 0.4776776068980418, + "grad_norm": 7.733794689178467, + "learning_rate": 1.6820281132913673e-05, + "loss": 1.7247, + "step": 76000 + }, + { + "epoch": 0.47774045921473884, + "grad_norm": 6.097110271453857, + 
"learning_rate": 1.681986203196902e-05, + "loss": 1.4478, + "step": 76010 + }, + { + "epoch": 0.47780331153143596, + "grad_norm": 7.205755233764648, + "learning_rate": 1.6819442931024367e-05, + "loss": 1.8043, + "step": 76020 + }, + { + "epoch": 0.4778661638481331, + "grad_norm": 6.37203311920166, + "learning_rate": 1.6819023830079714e-05, + "loss": 1.6534, + "step": 76030 + }, + { + "epoch": 0.4779290161648302, + "grad_norm": 6.525231838226318, + "learning_rate": 1.681860472913506e-05, + "loss": 1.6822, + "step": 76040 + }, + { + "epoch": 0.4779918684815273, + "grad_norm": 6.487207889556885, + "learning_rate": 1.6818185628190408e-05, + "loss": 1.5818, + "step": 76050 + }, + { + "epoch": 0.4780547207982244, + "grad_norm": 6.249277114868164, + "learning_rate": 1.6817766527245755e-05, + "loss": 1.4675, + "step": 76060 + }, + { + "epoch": 0.47811757311492153, + "grad_norm": 7.021834373474121, + "learning_rate": 1.68173474263011e-05, + "loss": 1.866, + "step": 76070 + }, + { + "epoch": 0.47818042543161865, + "grad_norm": 7.881227493286133, + "learning_rate": 1.6816928325356446e-05, + "loss": 1.6331, + "step": 76080 + }, + { + "epoch": 0.47824327774831576, + "grad_norm": 6.758789539337158, + "learning_rate": 1.6816509224411793e-05, + "loss": 1.9898, + "step": 76090 + }, + { + "epoch": 0.4783061300650129, + "grad_norm": 6.642223834991455, + "learning_rate": 1.681609012346714e-05, + "loss": 1.7728, + "step": 76100 + }, + { + "epoch": 0.47836898238171, + "grad_norm": 6.30720329284668, + "learning_rate": 1.6815671022522487e-05, + "loss": 1.4039, + "step": 76110 + }, + { + "epoch": 0.4784318346984071, + "grad_norm": 6.062718391418457, + "learning_rate": 1.6815251921577834e-05, + "loss": 1.5843, + "step": 76120 + }, + { + "epoch": 0.47849468701510417, + "grad_norm": 6.683797359466553, + "learning_rate": 1.681483282063318e-05, + "loss": 1.6862, + "step": 76130 + }, + { + "epoch": 0.4785575393318013, + "grad_norm": 6.161579608917236, + "learning_rate": 1.6814413719688525e-05, + 
"loss": 1.6474, + "step": 76140 + }, + { + "epoch": 0.4786203916484984, + "grad_norm": 7.291128635406494, + "learning_rate": 1.6813994618743872e-05, + "loss": 1.7735, + "step": 76150 + }, + { + "epoch": 0.4786832439651955, + "grad_norm": 5.702531814575195, + "learning_rate": 1.681357551779922e-05, + "loss": 1.7341, + "step": 76160 + }, + { + "epoch": 0.47874609628189263, + "grad_norm": 6.238299369812012, + "learning_rate": 1.6813156416854566e-05, + "loss": 1.6386, + "step": 76170 + }, + { + "epoch": 0.47880894859858975, + "grad_norm": 6.214787483215332, + "learning_rate": 1.681273731590991e-05, + "loss": 1.5607, + "step": 76180 + }, + { + "epoch": 0.47887180091528686, + "grad_norm": 8.346977233886719, + "learning_rate": 1.6812318214965257e-05, + "loss": 1.5856, + "step": 76190 + }, + { + "epoch": 0.478934653231984, + "grad_norm": 7.1682281494140625, + "learning_rate": 1.6811899114020604e-05, + "loss": 1.9296, + "step": 76200 + }, + { + "epoch": 0.4789975055486811, + "grad_norm": 7.234790325164795, + "learning_rate": 1.681148001307595e-05, + "loss": 1.7518, + "step": 76210 + }, + { + "epoch": 0.4790603578653782, + "grad_norm": 6.4557671546936035, + "learning_rate": 1.6811060912131298e-05, + "loss": 1.6877, + "step": 76220 + }, + { + "epoch": 0.4791232101820753, + "grad_norm": 7.287628173828125, + "learning_rate": 1.6810641811186642e-05, + "loss": 1.7578, + "step": 76230 + }, + { + "epoch": 0.47918606249877244, + "grad_norm": 7.659276962280273, + "learning_rate": 1.681022271024199e-05, + "loss": 1.761, + "step": 76240 + }, + { + "epoch": 0.47924891481546955, + "grad_norm": 6.70458984375, + "learning_rate": 1.6809803609297336e-05, + "loss": 1.7808, + "step": 76250 + }, + { + "epoch": 0.4793117671321666, + "grad_norm": 6.866919040679932, + "learning_rate": 1.6809384508352683e-05, + "loss": 1.6334, + "step": 76260 + }, + { + "epoch": 0.4793746194488637, + "grad_norm": 8.058916091918945, + "learning_rate": 1.680896540740803e-05, + "loss": 1.6893, + "step": 76270 + }, + { 
+ "epoch": 0.47943747176556084, + "grad_norm": 7.43437385559082, + "learning_rate": 1.6808546306463377e-05, + "loss": 1.6279, + "step": 76280 + }, + { + "epoch": 0.47950032408225796, + "grad_norm": 6.599917888641357, + "learning_rate": 1.6808127205518724e-05, + "loss": 1.6329, + "step": 76290 + }, + { + "epoch": 0.47956317639895507, + "grad_norm": 6.738684177398682, + "learning_rate": 1.680770810457407e-05, + "loss": 1.5999, + "step": 76300 + }, + { + "epoch": 0.4796260287156522, + "grad_norm": 6.267560005187988, + "learning_rate": 1.6807289003629415e-05, + "loss": 1.6746, + "step": 76310 + }, + { + "epoch": 0.4796888810323493, + "grad_norm": 6.663815975189209, + "learning_rate": 1.6806869902684762e-05, + "loss": 1.5523, + "step": 76320 + }, + { + "epoch": 0.4797517333490464, + "grad_norm": 7.0863356590271, + "learning_rate": 1.680645080174011e-05, + "loss": 1.4604, + "step": 76330 + }, + { + "epoch": 0.47981458566574353, + "grad_norm": 7.865902423858643, + "learning_rate": 1.6806031700795456e-05, + "loss": 1.6081, + "step": 76340 + }, + { + "epoch": 0.47987743798244065, + "grad_norm": 6.287507057189941, + "learning_rate": 1.6805612599850803e-05, + "loss": 1.7478, + "step": 76350 + }, + { + "epoch": 0.47994029029913776, + "grad_norm": 6.601611137390137, + "learning_rate": 1.6805193498906147e-05, + "loss": 1.67, + "step": 76360 + }, + { + "epoch": 0.4800031426158349, + "grad_norm": 7.671497344970703, + "learning_rate": 1.6804774397961494e-05, + "loss": 1.8226, + "step": 76370 + }, + { + "epoch": 0.480065994932532, + "grad_norm": 7.103155612945557, + "learning_rate": 1.680435529701684e-05, + "loss": 1.8356, + "step": 76380 + }, + { + "epoch": 0.48012884724922905, + "grad_norm": 6.71427059173584, + "learning_rate": 1.6803936196072188e-05, + "loss": 1.6828, + "step": 76390 + }, + { + "epoch": 0.48019169956592617, + "grad_norm": 6.308364391326904, + "learning_rate": 1.6803517095127532e-05, + "loss": 1.8505, + "step": 76400 + }, + { + "epoch": 0.4802545518826233, + 
"grad_norm": 6.404458522796631, + "learning_rate": 1.680309799418288e-05, + "loss": 1.7192, + "step": 76410 + }, + { + "epoch": 0.4803174041993204, + "grad_norm": 7.096067428588867, + "learning_rate": 1.6802678893238226e-05, + "loss": 1.6234, + "step": 76420 + }, + { + "epoch": 0.4803802565160175, + "grad_norm": 6.121633529663086, + "learning_rate": 1.6802259792293573e-05, + "loss": 1.5322, + "step": 76430 + }, + { + "epoch": 0.4804431088327146, + "grad_norm": 6.883768081665039, + "learning_rate": 1.680184069134892e-05, + "loss": 1.4982, + "step": 76440 + }, + { + "epoch": 0.48050596114941174, + "grad_norm": 5.9583635330200195, + "learning_rate": 1.6801421590404264e-05, + "loss": 1.6888, + "step": 76450 + }, + { + "epoch": 0.48056881346610886, + "grad_norm": 7.673823356628418, + "learning_rate": 1.680100248945961e-05, + "loss": 1.6281, + "step": 76460 + }, + { + "epoch": 0.480631665782806, + "grad_norm": 7.7770586013793945, + "learning_rate": 1.6800583388514958e-05, + "loss": 1.8198, + "step": 76470 + }, + { + "epoch": 0.4806945180995031, + "grad_norm": 6.3411173820495605, + "learning_rate": 1.6800164287570305e-05, + "loss": 1.6632, + "step": 76480 + }, + { + "epoch": 0.4807573704162002, + "grad_norm": 6.581920623779297, + "learning_rate": 1.6799745186625652e-05, + "loss": 1.9932, + "step": 76490 + }, + { + "epoch": 0.4808202227328973, + "grad_norm": 6.3683085441589355, + "learning_rate": 1.6799326085681e-05, + "loss": 1.671, + "step": 76500 + }, + { + "epoch": 0.48088307504959443, + "grad_norm": 7.540045261383057, + "learning_rate": 1.6798906984736346e-05, + "loss": 1.817, + "step": 76510 + }, + { + "epoch": 0.4809459273662915, + "grad_norm": 6.9328389167785645, + "learning_rate": 1.6798487883791693e-05, + "loss": 1.8116, + "step": 76520 + }, + { + "epoch": 0.4810087796829886, + "grad_norm": 6.642269611358643, + "learning_rate": 1.679806878284704e-05, + "loss": 1.7668, + "step": 76530 + }, + { + "epoch": 0.4810716319996857, + "grad_norm": 5.931235313415527, + 
"learning_rate": 1.6797649681902384e-05, + "loss": 1.5424, + "step": 76540 + }, + { + "epoch": 0.48113448431638284, + "grad_norm": 6.841514587402344, + "learning_rate": 1.679723058095773e-05, + "loss": 1.7151, + "step": 76550 + }, + { + "epoch": 0.48119733663307995, + "grad_norm": 6.333242893218994, + "learning_rate": 1.6796811480013078e-05, + "loss": 1.6082, + "step": 76560 + }, + { + "epoch": 0.48126018894977707, + "grad_norm": 7.489376068115234, + "learning_rate": 1.6796392379068425e-05, + "loss": 1.7848, + "step": 76570 + }, + { + "epoch": 0.4813230412664742, + "grad_norm": 7.716389179229736, + "learning_rate": 1.679597327812377e-05, + "loss": 1.5968, + "step": 76580 + }, + { + "epoch": 0.4813858935831713, + "grad_norm": 6.728993892669678, + "learning_rate": 1.6795554177179116e-05, + "loss": 1.5973, + "step": 76590 + }, + { + "epoch": 0.4814487458998684, + "grad_norm": 7.20167350769043, + "learning_rate": 1.6795135076234463e-05, + "loss": 1.7179, + "step": 76600 + }, + { + "epoch": 0.48151159821656553, + "grad_norm": 7.126608371734619, + "learning_rate": 1.679471597528981e-05, + "loss": 1.9309, + "step": 76610 + }, + { + "epoch": 0.48157445053326264, + "grad_norm": 6.837955951690674, + "learning_rate": 1.6794296874345154e-05, + "loss": 1.8812, + "step": 76620 + }, + { + "epoch": 0.48163730284995976, + "grad_norm": 8.498090744018555, + "learning_rate": 1.67938777734005e-05, + "loss": 1.6451, + "step": 76630 + }, + { + "epoch": 0.4817001551666569, + "grad_norm": 6.694071292877197, + "learning_rate": 1.6793458672455848e-05, + "loss": 1.7285, + "step": 76640 + }, + { + "epoch": 0.48176300748335393, + "grad_norm": 6.439844131469727, + "learning_rate": 1.6793039571511195e-05, + "loss": 1.7931, + "step": 76650 + }, + { + "epoch": 0.48182585980005105, + "grad_norm": 6.896914482116699, + "learning_rate": 1.6792620470566542e-05, + "loss": 1.6776, + "step": 76660 + }, + { + "epoch": 0.48188871211674816, + "grad_norm": 6.71177864074707, + "learning_rate": 
1.679220136962189e-05, + "loss": 1.8234, + "step": 76670 + }, + { + "epoch": 0.4819515644334453, + "grad_norm": 7.299954891204834, + "learning_rate": 1.6791782268677236e-05, + "loss": 1.6831, + "step": 76680 + }, + { + "epoch": 0.4820144167501424, + "grad_norm": 7.252218723297119, + "learning_rate": 1.6791363167732583e-05, + "loss": 1.7688, + "step": 76690 + }, + { + "epoch": 0.4820772690668395, + "grad_norm": 6.525442123413086, + "learning_rate": 1.6790944066787927e-05, + "loss": 1.8474, + "step": 76700 + }, + { + "epoch": 0.4821401213835366, + "grad_norm": 6.625932693481445, + "learning_rate": 1.6790524965843274e-05, + "loss": 1.696, + "step": 76710 + }, + { + "epoch": 0.48220297370023374, + "grad_norm": 7.926068305969238, + "learning_rate": 1.679010586489862e-05, + "loss": 1.569, + "step": 76720 + }, + { + "epoch": 0.48226582601693085, + "grad_norm": 6.686048984527588, + "learning_rate": 1.6789686763953968e-05, + "loss": 1.5974, + "step": 76730 + }, + { + "epoch": 0.48232867833362797, + "grad_norm": 7.3481268882751465, + "learning_rate": 1.6789267663009315e-05, + "loss": 1.6895, + "step": 76740 + }, + { + "epoch": 0.4823915306503251, + "grad_norm": 6.539943695068359, + "learning_rate": 1.6788848562064662e-05, + "loss": 1.6513, + "step": 76750 + }, + { + "epoch": 0.4824543829670222, + "grad_norm": 6.613016605377197, + "learning_rate": 1.6788429461120006e-05, + "loss": 1.8012, + "step": 76760 + }, + { + "epoch": 0.48251723528371926, + "grad_norm": 6.274559020996094, + "learning_rate": 1.6788010360175353e-05, + "loss": 1.525, + "step": 76770 + }, + { + "epoch": 0.4825800876004164, + "grad_norm": 7.586381912231445, + "learning_rate": 1.67875912592307e-05, + "loss": 1.7118, + "step": 76780 + }, + { + "epoch": 0.4826429399171135, + "grad_norm": 6.268805027008057, + "learning_rate": 1.6787172158286047e-05, + "loss": 2.0469, + "step": 76790 + }, + { + "epoch": 0.4827057922338106, + "grad_norm": 6.7833781242370605, + "learning_rate": 1.678675305734139e-05, + "loss": 
1.6206, + "step": 76800 + }, + { + "epoch": 0.4827686445505077, + "grad_norm": 9.56633472442627, + "learning_rate": 1.6786333956396738e-05, + "loss": 1.7133, + "step": 76810 + }, + { + "epoch": 0.48283149686720483, + "grad_norm": 6.577396392822266, + "learning_rate": 1.6785914855452085e-05, + "loss": 1.6786, + "step": 76820 + }, + { + "epoch": 0.48289434918390195, + "grad_norm": 6.394055366516113, + "learning_rate": 1.6785495754507432e-05, + "loss": 1.7993, + "step": 76830 + }, + { + "epoch": 0.48295720150059906, + "grad_norm": 7.305932521820068, + "learning_rate": 1.678507665356278e-05, + "loss": 1.7944, + "step": 76840 + }, + { + "epoch": 0.4830200538172962, + "grad_norm": 7.5725274085998535, + "learning_rate": 1.6784657552618123e-05, + "loss": 1.5526, + "step": 76850 + }, + { + "epoch": 0.4830829061339933, + "grad_norm": 6.820903778076172, + "learning_rate": 1.678423845167347e-05, + "loss": 1.5832, + "step": 76860 + }, + { + "epoch": 0.4831457584506904, + "grad_norm": 7.331640720367432, + "learning_rate": 1.6783819350728817e-05, + "loss": 1.7553, + "step": 76870 + }, + { + "epoch": 0.4832086107673875, + "grad_norm": 5.045251369476318, + "learning_rate": 1.6783400249784164e-05, + "loss": 1.5728, + "step": 76880 + }, + { + "epoch": 0.48327146308408464, + "grad_norm": 7.099085330963135, + "learning_rate": 1.678298114883951e-05, + "loss": 1.5843, + "step": 76890 + }, + { + "epoch": 0.4833343154007817, + "grad_norm": 6.82592248916626, + "learning_rate": 1.6782562047894858e-05, + "loss": 1.7638, + "step": 76900 + }, + { + "epoch": 0.4833971677174788, + "grad_norm": 6.2563910484313965, + "learning_rate": 1.6782142946950205e-05, + "loss": 1.5981, + "step": 76910 + }, + { + "epoch": 0.48346002003417593, + "grad_norm": 6.211933612823486, + "learning_rate": 1.6781723846005552e-05, + "loss": 1.6231, + "step": 76920 + }, + { + "epoch": 0.48352287235087305, + "grad_norm": 8.02187728881836, + "learning_rate": 1.6781304745060896e-05, + "loss": 1.5676, + "step": 76930 + }, + { + 
"epoch": 0.48358572466757016, + "grad_norm": 7.387375831604004, + "learning_rate": 1.6780885644116243e-05, + "loss": 1.5103, + "step": 76940 + }, + { + "epoch": 0.4836485769842673, + "grad_norm": 6.681406021118164, + "learning_rate": 1.678046654317159e-05, + "loss": 1.7857, + "step": 76950 + }, + { + "epoch": 0.4837114293009644, + "grad_norm": 6.307636260986328, + "learning_rate": 1.6780047442226937e-05, + "loss": 1.6065, + "step": 76960 + }, + { + "epoch": 0.4837742816176615, + "grad_norm": 7.979382038116455, + "learning_rate": 1.6779628341282284e-05, + "loss": 1.7488, + "step": 76970 + }, + { + "epoch": 0.4838371339343586, + "grad_norm": 7.439484596252441, + "learning_rate": 1.6779209240337628e-05, + "loss": 1.9083, + "step": 76980 + }, + { + "epoch": 0.48389998625105574, + "grad_norm": 6.592164039611816, + "learning_rate": 1.6778790139392975e-05, + "loss": 1.5036, + "step": 76990 + }, + { + "epoch": 0.48396283856775285, + "grad_norm": 6.88593053817749, + "learning_rate": 1.6778371038448322e-05, + "loss": 1.7836, + "step": 77000 + }, + { + "epoch": 0.48402569088444997, + "grad_norm": 6.5450849533081055, + "learning_rate": 1.677795193750367e-05, + "loss": 1.774, + "step": 77010 + }, + { + "epoch": 0.4840885432011471, + "grad_norm": 7.682107448577881, + "learning_rate": 1.6777532836559013e-05, + "loss": 1.7071, + "step": 77020 + }, + { + "epoch": 0.48415139551784414, + "grad_norm": 6.716879367828369, + "learning_rate": 1.677711373561436e-05, + "loss": 1.4301, + "step": 77030 + }, + { + "epoch": 0.48421424783454126, + "grad_norm": 6.929161071777344, + "learning_rate": 1.6776694634669707e-05, + "loss": 1.552, + "step": 77040 + }, + { + "epoch": 0.48427710015123837, + "grad_norm": 6.5387420654296875, + "learning_rate": 1.6776275533725054e-05, + "loss": 1.7443, + "step": 77050 + }, + { + "epoch": 0.4843399524679355, + "grad_norm": 7.476447105407715, + "learning_rate": 1.67758564327804e-05, + "loss": 1.8439, + "step": 77060 + }, + { + "epoch": 0.4844028047846326, + 
"grad_norm": 5.991411209106445, + "learning_rate": 1.6775437331835748e-05, + "loss": 1.8954, + "step": 77070 + }, + { + "epoch": 0.4844656571013297, + "grad_norm": 6.784745693206787, + "learning_rate": 1.6775018230891092e-05, + "loss": 1.7183, + "step": 77080 + }, + { + "epoch": 0.48452850941802683, + "grad_norm": 7.5521769523620605, + "learning_rate": 1.677459912994644e-05, + "loss": 1.8052, + "step": 77090 + }, + { + "epoch": 0.48459136173472395, + "grad_norm": 6.470150947570801, + "learning_rate": 1.6774180029001786e-05, + "loss": 1.6111, + "step": 77100 + }, + { + "epoch": 0.48465421405142106, + "grad_norm": 6.512275218963623, + "learning_rate": 1.6773760928057133e-05, + "loss": 1.7136, + "step": 77110 + }, + { + "epoch": 0.4847170663681182, + "grad_norm": 6.011322021484375, + "learning_rate": 1.677334182711248e-05, + "loss": 1.7882, + "step": 77120 + }, + { + "epoch": 0.4847799186848153, + "grad_norm": 7.649730205535889, + "learning_rate": 1.6772922726167827e-05, + "loss": 1.654, + "step": 77130 + }, + { + "epoch": 0.4848427710015124, + "grad_norm": 6.561295032501221, + "learning_rate": 1.6772503625223174e-05, + "loss": 1.6828, + "step": 77140 + }, + { + "epoch": 0.4849056233182095, + "grad_norm": 7.136097431182861, + "learning_rate": 1.677208452427852e-05, + "loss": 1.621, + "step": 77150 + }, + { + "epoch": 0.4849684756349066, + "grad_norm": 6.467504501342773, + "learning_rate": 1.6771665423333865e-05, + "loss": 1.799, + "step": 77160 + }, + { + "epoch": 0.4850313279516037, + "grad_norm": 6.1977314949035645, + "learning_rate": 1.6771246322389212e-05, + "loss": 1.8724, + "step": 77170 + }, + { + "epoch": 0.4850941802683008, + "grad_norm": 7.321646690368652, + "learning_rate": 1.677082722144456e-05, + "loss": 1.7538, + "step": 77180 + }, + { + "epoch": 0.4851570325849979, + "grad_norm": 7.129180431365967, + "learning_rate": 1.6770408120499906e-05, + "loss": 1.6605, + "step": 77190 + }, + { + "epoch": 0.48521988490169504, + "grad_norm": 6.395766735076904, + 
"learning_rate": 1.676998901955525e-05, + "loss": 1.4973, + "step": 77200 + }, + { + "epoch": 0.48528273721839216, + "grad_norm": 5.811442852020264, + "learning_rate": 1.6769569918610597e-05, + "loss": 1.7174, + "step": 77210 + }, + { + "epoch": 0.4853455895350893, + "grad_norm": 7.0175018310546875, + "learning_rate": 1.6769150817665944e-05, + "loss": 1.517, + "step": 77220 + }, + { + "epoch": 0.4854084418517864, + "grad_norm": 7.165356636047363, + "learning_rate": 1.676873171672129e-05, + "loss": 1.8301, + "step": 77230 + }, + { + "epoch": 0.4854712941684835, + "grad_norm": 6.414263725280762, + "learning_rate": 1.6768312615776635e-05, + "loss": 1.5513, + "step": 77240 + }, + { + "epoch": 0.4855341464851806, + "grad_norm": 6.830236911773682, + "learning_rate": 1.6767893514831982e-05, + "loss": 1.7215, + "step": 77250 + }, + { + "epoch": 0.48559699880187773, + "grad_norm": 7.03541374206543, + "learning_rate": 1.676747441388733e-05, + "loss": 1.6454, + "step": 77260 + }, + { + "epoch": 0.48565985111857485, + "grad_norm": 6.4914631843566895, + "learning_rate": 1.6767055312942676e-05, + "loss": 1.751, + "step": 77270 + }, + { + "epoch": 0.4857227034352719, + "grad_norm": 6.393687725067139, + "learning_rate": 1.6766636211998023e-05, + "loss": 1.6691, + "step": 77280 + }, + { + "epoch": 0.485785555751969, + "grad_norm": 5.885585784912109, + "learning_rate": 1.676621711105337e-05, + "loss": 1.6191, + "step": 77290 + }, + { + "epoch": 0.48584840806866614, + "grad_norm": 6.8038201332092285, + "learning_rate": 1.6765798010108717e-05, + "loss": 1.5134, + "step": 77300 + }, + { + "epoch": 0.48591126038536325, + "grad_norm": 6.377265453338623, + "learning_rate": 1.6765378909164064e-05, + "loss": 1.6292, + "step": 77310 + }, + { + "epoch": 0.48597411270206037, + "grad_norm": 6.908006191253662, + "learning_rate": 1.676495980821941e-05, + "loss": 1.5764, + "step": 77320 + }, + { + "epoch": 0.4860369650187575, + "grad_norm": 8.175592422485352, + "learning_rate": 
1.6764540707274755e-05, + "loss": 1.993, + "step": 77330 + }, + { + "epoch": 0.4860998173354546, + "grad_norm": 6.585998058319092, + "learning_rate": 1.6764121606330102e-05, + "loss": 1.8986, + "step": 77340 + }, + { + "epoch": 0.4861626696521517, + "grad_norm": 6.87686824798584, + "learning_rate": 1.676370250538545e-05, + "loss": 1.8219, + "step": 77350 + }, + { + "epoch": 0.48622552196884883, + "grad_norm": 6.103376388549805, + "learning_rate": 1.6763283404440796e-05, + "loss": 1.538, + "step": 77360 + }, + { + "epoch": 0.48628837428554594, + "grad_norm": 7.167880535125732, + "learning_rate": 1.6762864303496143e-05, + "loss": 1.7229, + "step": 77370 + }, + { + "epoch": 0.48635122660224306, + "grad_norm": 7.27005672454834, + "learning_rate": 1.6762445202551487e-05, + "loss": 1.7178, + "step": 77380 + }, + { + "epoch": 0.4864140789189402, + "grad_norm": 7.195834636688232, + "learning_rate": 1.6762026101606834e-05, + "loss": 1.601, + "step": 77390 + }, + { + "epoch": 0.4864769312356373, + "grad_norm": 6.97745418548584, + "learning_rate": 1.676160700066218e-05, + "loss": 1.6146, + "step": 77400 + }, + { + "epoch": 0.48653978355233435, + "grad_norm": 6.174566268920898, + "learning_rate": 1.676118789971753e-05, + "loss": 1.5699, + "step": 77410 + }, + { + "epoch": 0.48660263586903146, + "grad_norm": 6.945740699768066, + "learning_rate": 1.6760768798772872e-05, + "loss": 1.8153, + "step": 77420 + }, + { + "epoch": 0.4866654881857286, + "grad_norm": 6.563122749328613, + "learning_rate": 1.676034969782822e-05, + "loss": 1.827, + "step": 77430 + }, + { + "epoch": 0.4867283405024257, + "grad_norm": 7.262126445770264, + "learning_rate": 1.6759930596883566e-05, + "loss": 1.8561, + "step": 77440 + }, + { + "epoch": 0.4867911928191228, + "grad_norm": 6.856802463531494, + "learning_rate": 1.6759511495938913e-05, + "loss": 1.772, + "step": 77450 + }, + { + "epoch": 0.4868540451358199, + "grad_norm": 6.213690757751465, + "learning_rate": 1.675909239499426e-05, + "loss": 1.7232, + 
"step": 77460 + }, + { + "epoch": 0.48691689745251704, + "grad_norm": 7.799118518829346, + "learning_rate": 1.6758673294049604e-05, + "loss": 1.9163, + "step": 77470 + }, + { + "epoch": 0.48697974976921415, + "grad_norm": 6.059864044189453, + "learning_rate": 1.675825419310495e-05, + "loss": 1.714, + "step": 77480 + }, + { + "epoch": 0.48704260208591127, + "grad_norm": 6.988504886627197, + "learning_rate": 1.6757835092160298e-05, + "loss": 1.629, + "step": 77490 + }, + { + "epoch": 0.4871054544026084, + "grad_norm": 6.0013227462768555, + "learning_rate": 1.6757415991215645e-05, + "loss": 1.6388, + "step": 77500 + }, + { + "epoch": 0.4871683067193055, + "grad_norm": 7.691008567810059, + "learning_rate": 1.6756996890270992e-05, + "loss": 1.8351, + "step": 77510 + }, + { + "epoch": 0.4872311590360026, + "grad_norm": 6.733763217926025, + "learning_rate": 1.675657778932634e-05, + "loss": 1.7723, + "step": 77520 + }, + { + "epoch": 0.48729401135269973, + "grad_norm": 5.495975971221924, + "learning_rate": 1.6756158688381686e-05, + "loss": 1.6548, + "step": 77530 + }, + { + "epoch": 0.4873568636693968, + "grad_norm": 6.805240154266357, + "learning_rate": 1.6755739587437033e-05, + "loss": 1.6496, + "step": 77540 + }, + { + "epoch": 0.4874197159860939, + "grad_norm": 6.348954677581787, + "learning_rate": 1.6755320486492377e-05, + "loss": 1.6526, + "step": 77550 + }, + { + "epoch": 0.487482568302791, + "grad_norm": 6.872445106506348, + "learning_rate": 1.6754901385547724e-05, + "loss": 1.5751, + "step": 77560 + }, + { + "epoch": 0.48754542061948813, + "grad_norm": 6.3247246742248535, + "learning_rate": 1.675448228460307e-05, + "loss": 1.898, + "step": 77570 + }, + { + "epoch": 0.48760827293618525, + "grad_norm": 8.797094345092773, + "learning_rate": 1.675406318365842e-05, + "loss": 1.5272, + "step": 77580 + }, + { + "epoch": 0.48767112525288236, + "grad_norm": 7.387877464294434, + "learning_rate": 1.6753644082713765e-05, + "loss": 1.5454, + "step": 77590 + }, + { + "epoch": 
0.4877339775695795, + "grad_norm": 6.240447521209717, + "learning_rate": 1.675322498176911e-05, + "loss": 1.4453, + "step": 77600 + }, + { + "epoch": 0.4877968298862766, + "grad_norm": 6.7662763595581055, + "learning_rate": 1.6752805880824456e-05, + "loss": 1.6475, + "step": 77610 + }, + { + "epoch": 0.4878596822029737, + "grad_norm": 5.877827167510986, + "learning_rate": 1.6752386779879803e-05, + "loss": 1.6666, + "step": 77620 + }, + { + "epoch": 0.4879225345196708, + "grad_norm": 7.133870601654053, + "learning_rate": 1.675196767893515e-05, + "loss": 1.8037, + "step": 77630 + }, + { + "epoch": 0.48798538683636794, + "grad_norm": 7.127045631408691, + "learning_rate": 1.6751548577990494e-05, + "loss": 1.7026, + "step": 77640 + }, + { + "epoch": 0.48804823915306506, + "grad_norm": 6.184054374694824, + "learning_rate": 1.675112947704584e-05, + "loss": 1.7624, + "step": 77650 + }, + { + "epoch": 0.48811109146976217, + "grad_norm": 6.619475841522217, + "learning_rate": 1.6750710376101188e-05, + "loss": 1.6668, + "step": 77660 + }, + { + "epoch": 0.48817394378645923, + "grad_norm": 7.2268290519714355, + "learning_rate": 1.6750291275156535e-05, + "loss": 1.682, + "step": 77670 + }, + { + "epoch": 0.48823679610315635, + "grad_norm": 5.725668907165527, + "learning_rate": 1.6749872174211882e-05, + "loss": 1.6467, + "step": 77680 + }, + { + "epoch": 0.48829964841985346, + "grad_norm": 6.959593296051025, + "learning_rate": 1.674945307326723e-05, + "loss": 1.6727, + "step": 77690 + }, + { + "epoch": 0.4883625007365506, + "grad_norm": 5.738344192504883, + "learning_rate": 1.6749033972322576e-05, + "loss": 1.7752, + "step": 77700 + }, + { + "epoch": 0.4884253530532477, + "grad_norm": 6.951902866363525, + "learning_rate": 1.674861487137792e-05, + "loss": 1.5168, + "step": 77710 + }, + { + "epoch": 0.4884882053699448, + "grad_norm": 6.028759479522705, + "learning_rate": 1.6748195770433267e-05, + "loss": 1.7481, + "step": 77720 + }, + { + "epoch": 0.4885510576866419, + "grad_norm": 
6.964917182922363, + "learning_rate": 1.6747776669488614e-05, + "loss": 1.7408, + "step": 77730 + }, + { + "epoch": 0.48861391000333904, + "grad_norm": 6.195134162902832, + "learning_rate": 1.674735756854396e-05, + "loss": 1.6444, + "step": 77740 + }, + { + "epoch": 0.48867676232003615, + "grad_norm": 7.4523115158081055, + "learning_rate": 1.674693846759931e-05, + "loss": 1.6964, + "step": 77750 + }, + { + "epoch": 0.48873961463673327, + "grad_norm": 6.267302989959717, + "learning_rate": 1.6746519366654655e-05, + "loss": 1.58, + "step": 77760 + }, + { + "epoch": 0.4888024669534304, + "grad_norm": 8.292152404785156, + "learning_rate": 1.6746100265710003e-05, + "loss": 1.698, + "step": 77770 + }, + { + "epoch": 0.4888653192701275, + "grad_norm": 7.232621192932129, + "learning_rate": 1.6745681164765346e-05, + "loss": 1.6959, + "step": 77780 + }, + { + "epoch": 0.48892817158682456, + "grad_norm": 7.289283275604248, + "learning_rate": 1.6745262063820693e-05, + "loss": 1.7407, + "step": 77790 + }, + { + "epoch": 0.48899102390352167, + "grad_norm": 6.819472789764404, + "learning_rate": 1.674484296287604e-05, + "loss": 1.7476, + "step": 77800 + }, + { + "epoch": 0.4890538762202188, + "grad_norm": 6.648898124694824, + "learning_rate": 1.6744423861931387e-05, + "loss": 1.5295, + "step": 77810 + }, + { + "epoch": 0.4891167285369159, + "grad_norm": 7.825111389160156, + "learning_rate": 1.674400476098673e-05, + "loss": 1.7742, + "step": 77820 + }, + { + "epoch": 0.489179580853613, + "grad_norm": 7.094618320465088, + "learning_rate": 1.6743585660042078e-05, + "loss": 1.5701, + "step": 77830 + }, + { + "epoch": 0.48924243317031013, + "grad_norm": 7.05112361907959, + "learning_rate": 1.6743166559097425e-05, + "loss": 1.9165, + "step": 77840 + }, + { + "epoch": 0.48930528548700725, + "grad_norm": 6.470577716827393, + "learning_rate": 1.6742747458152772e-05, + "loss": 1.768, + "step": 77850 + }, + { + "epoch": 0.48936813780370436, + "grad_norm": 6.4770331382751465, + 
"learning_rate": 1.6742328357208116e-05, + "loss": 1.5896, + "step": 77860 + }, + { + "epoch": 0.4894309901204015, + "grad_norm": 6.927404880523682, + "learning_rate": 1.6741909256263463e-05, + "loss": 1.701, + "step": 77870 + }, + { + "epoch": 0.4894938424370986, + "grad_norm": 6.723846912384033, + "learning_rate": 1.674149015531881e-05, + "loss": 1.5947, + "step": 77880 + }, + { + "epoch": 0.4895566947537957, + "grad_norm": 7.334015846252441, + "learning_rate": 1.6741071054374157e-05, + "loss": 1.6423, + "step": 77890 + }, + { + "epoch": 0.4896195470704928, + "grad_norm": 6.482966899871826, + "learning_rate": 1.6740651953429504e-05, + "loss": 1.5698, + "step": 77900 + }, + { + "epoch": 0.48968239938718994, + "grad_norm": 6.900713920593262, + "learning_rate": 1.674023285248485e-05, + "loss": 1.704, + "step": 77910 + }, + { + "epoch": 0.489745251703887, + "grad_norm": 7.279245853424072, + "learning_rate": 1.67398137515402e-05, + "loss": 1.5754, + "step": 77920 + }, + { + "epoch": 0.4898081040205841, + "grad_norm": 7.552469730377197, + "learning_rate": 1.6739394650595546e-05, + "loss": 1.7267, + "step": 77930 + }, + { + "epoch": 0.4898709563372812, + "grad_norm": 5.7016754150390625, + "learning_rate": 1.6738975549650893e-05, + "loss": 1.5803, + "step": 77940 + }, + { + "epoch": 0.48993380865397834, + "grad_norm": 5.878109931945801, + "learning_rate": 1.6738556448706236e-05, + "loss": 1.8408, + "step": 77950 + }, + { + "epoch": 0.48999666097067546, + "grad_norm": 6.1266679763793945, + "learning_rate": 1.6738137347761583e-05, + "loss": 1.7481, + "step": 77960 + }, + { + "epoch": 0.4900595132873726, + "grad_norm": 6.185999393463135, + "learning_rate": 1.673771824681693e-05, + "loss": 1.6293, + "step": 77970 + }, + { + "epoch": 0.4901223656040697, + "grad_norm": 7.004542827606201, + "learning_rate": 1.6737299145872277e-05, + "loss": 1.7205, + "step": 77980 + }, + { + "epoch": 0.4901852179207668, + "grad_norm": 6.641819000244141, + "learning_rate": 
1.6736880044927625e-05, + "loss": 1.796, + "step": 77990 + }, + { + "epoch": 0.4902480702374639, + "grad_norm": 6.25460958480835, + "learning_rate": 1.6736460943982968e-05, + "loss": 1.7323, + "step": 78000 + }, + { + "epoch": 0.49031092255416103, + "grad_norm": 7.75309419631958, + "learning_rate": 1.6736041843038315e-05, + "loss": 1.6054, + "step": 78010 + }, + { + "epoch": 0.49037377487085815, + "grad_norm": 6.942826271057129, + "learning_rate": 1.6735622742093662e-05, + "loss": 1.7125, + "step": 78020 + }, + { + "epoch": 0.49043662718755526, + "grad_norm": 7.00164270401001, + "learning_rate": 1.673520364114901e-05, + "loss": 1.6341, + "step": 78030 + }, + { + "epoch": 0.4904994795042524, + "grad_norm": 8.243274688720703, + "learning_rate": 1.6734784540204353e-05, + "loss": 1.5607, + "step": 78040 + }, + { + "epoch": 0.49056233182094944, + "grad_norm": 7.010618209838867, + "learning_rate": 1.67343654392597e-05, + "loss": 1.6654, + "step": 78050 + }, + { + "epoch": 0.49062518413764655, + "grad_norm": 7.62802791595459, + "learning_rate": 1.6733946338315047e-05, + "loss": 1.8686, + "step": 78060 + }, + { + "epoch": 0.49068803645434367, + "grad_norm": 8.912038803100586, + "learning_rate": 1.6733527237370394e-05, + "loss": 1.5066, + "step": 78070 + }, + { + "epoch": 0.4907508887710408, + "grad_norm": 6.84856653213501, + "learning_rate": 1.673310813642574e-05, + "loss": 1.5826, + "step": 78080 + }, + { + "epoch": 0.4908137410877379, + "grad_norm": 7.078650951385498, + "learning_rate": 1.6732689035481085e-05, + "loss": 1.7848, + "step": 78090 + }, + { + "epoch": 0.490876593404435, + "grad_norm": 6.207369327545166, + "learning_rate": 1.6732269934536432e-05, + "loss": 1.5481, + "step": 78100 + }, + { + "epoch": 0.49093944572113213, + "grad_norm": 5.817479133605957, + "learning_rate": 1.673185083359178e-05, + "loss": 1.8581, + "step": 78110 + }, + { + "epoch": 0.49100229803782924, + "grad_norm": 6.951427936553955, + "learning_rate": 1.6731431732647126e-05, + "loss": 
1.6912, + "step": 78120 + }, + { + "epoch": 0.49106515035452636, + "grad_norm": 7.501084804534912, + "learning_rate": 1.6731012631702473e-05, + "loss": 1.5541, + "step": 78130 + }, + { + "epoch": 0.4911280026712235, + "grad_norm": 7.9845051765441895, + "learning_rate": 1.673059353075782e-05, + "loss": 1.7753, + "step": 78140 + }, + { + "epoch": 0.4911908549879206, + "grad_norm": 6.4379563331604, + "learning_rate": 1.6730174429813168e-05, + "loss": 1.6814, + "step": 78150 + }, + { + "epoch": 0.4912537073046177, + "grad_norm": 6.529316425323486, + "learning_rate": 1.6729755328868515e-05, + "loss": 1.5828, + "step": 78160 + }, + { + "epoch": 0.4913165596213148, + "grad_norm": 7.063477516174316, + "learning_rate": 1.6729336227923858e-05, + "loss": 1.6801, + "step": 78170 + }, + { + "epoch": 0.4913794119380119, + "grad_norm": 6.500266075134277, + "learning_rate": 1.6728917126979205e-05, + "loss": 1.5657, + "step": 78180 + }, + { + "epoch": 0.491442264254709, + "grad_norm": 5.756186008453369, + "learning_rate": 1.6728498026034552e-05, + "loss": 1.8106, + "step": 78190 + }, + { + "epoch": 0.4915051165714061, + "grad_norm": 7.048885345458984, + "learning_rate": 1.67280789250899e-05, + "loss": 1.6772, + "step": 78200 + }, + { + "epoch": 0.4915679688881032, + "grad_norm": 7.077173709869385, + "learning_rate": 1.6727659824145247e-05, + "loss": 1.6553, + "step": 78210 + }, + { + "epoch": 0.49163082120480034, + "grad_norm": 6.204776287078857, + "learning_rate": 1.672724072320059e-05, + "loss": 1.6815, + "step": 78220 + }, + { + "epoch": 0.49169367352149745, + "grad_norm": 6.870537757873535, + "learning_rate": 1.6726821622255937e-05, + "loss": 1.7491, + "step": 78230 + }, + { + "epoch": 0.49175652583819457, + "grad_norm": 6.51535177230835, + "learning_rate": 1.6726402521311284e-05, + "loss": 1.8883, + "step": 78240 + }, + { + "epoch": 0.4918193781548917, + "grad_norm": 6.133185863494873, + "learning_rate": 1.672598342036663e-05, + "loss": 1.6109, + "step": 78250 + }, + { + 
"epoch": 0.4918822304715888, + "grad_norm": 6.261854648590088, + "learning_rate": 1.6725564319421975e-05, + "loss": 2.0821, + "step": 78260 + }, + { + "epoch": 0.4919450827882859, + "grad_norm": 6.175626754760742, + "learning_rate": 1.6725145218477322e-05, + "loss": 1.5385, + "step": 78270 + }, + { + "epoch": 0.49200793510498303, + "grad_norm": 5.714633941650391, + "learning_rate": 1.672472611753267e-05, + "loss": 1.6944, + "step": 78280 + }, + { + "epoch": 0.49207078742168014, + "grad_norm": 6.655765056610107, + "learning_rate": 1.6724307016588016e-05, + "loss": 1.6819, + "step": 78290 + }, + { + "epoch": 0.4921336397383772, + "grad_norm": 7.588636875152588, + "learning_rate": 1.6723887915643363e-05, + "loss": 1.9455, + "step": 78300 + }, + { + "epoch": 0.4921964920550743, + "grad_norm": 7.5459442138671875, + "learning_rate": 1.672346881469871e-05, + "loss": 1.5781, + "step": 78310 + }, + { + "epoch": 0.49225934437177143, + "grad_norm": 7.523612022399902, + "learning_rate": 1.6723049713754058e-05, + "loss": 1.7942, + "step": 78320 + }, + { + "epoch": 0.49232219668846855, + "grad_norm": 6.231264114379883, + "learning_rate": 1.67226306128094e-05, + "loss": 1.5826, + "step": 78330 + }, + { + "epoch": 0.49238504900516566, + "grad_norm": 6.2151641845703125, + "learning_rate": 1.672221151186475e-05, + "loss": 1.6539, + "step": 78340 + }, + { + "epoch": 0.4924479013218628, + "grad_norm": 5.359531879425049, + "learning_rate": 1.6721792410920095e-05, + "loss": 1.5909, + "step": 78350 + }, + { + "epoch": 0.4925107536385599, + "grad_norm": 6.7791290283203125, + "learning_rate": 1.6721373309975442e-05, + "loss": 1.6958, + "step": 78360 + }, + { + "epoch": 0.492573605955257, + "grad_norm": 6.420655250549316, + "learning_rate": 1.672095420903079e-05, + "loss": 1.5403, + "step": 78370 + }, + { + "epoch": 0.4926364582719541, + "grad_norm": 6.764841079711914, + "learning_rate": 1.6720535108086137e-05, + "loss": 1.7971, + "step": 78380 + }, + { + "epoch": 0.49269931058865124, + 
"grad_norm": 8.062089920043945, + "learning_rate": 1.6720116007141484e-05, + "loss": 1.9059, + "step": 78390 + }, + { + "epoch": 0.49276216290534836, + "grad_norm": 7.529369831085205, + "learning_rate": 1.6719696906196827e-05, + "loss": 1.7136, + "step": 78400 + }, + { + "epoch": 0.49282501522204547, + "grad_norm": 7.275436878204346, + "learning_rate": 1.6719277805252174e-05, + "loss": 1.7757, + "step": 78410 + }, + { + "epoch": 0.4928878675387426, + "grad_norm": 6.625938892364502, + "learning_rate": 1.671885870430752e-05, + "loss": 1.7824, + "step": 78420 + }, + { + "epoch": 0.49295071985543965, + "grad_norm": 7.415990829467773, + "learning_rate": 1.671843960336287e-05, + "loss": 1.6414, + "step": 78430 + }, + { + "epoch": 0.49301357217213676, + "grad_norm": 6.522861480712891, + "learning_rate": 1.6718020502418212e-05, + "loss": 1.6451, + "step": 78440 + }, + { + "epoch": 0.4930764244888339, + "grad_norm": 6.326927185058594, + "learning_rate": 1.671760140147356e-05, + "loss": 1.6692, + "step": 78450 + }, + { + "epoch": 0.493139276805531, + "grad_norm": 6.64226770401001, + "learning_rate": 1.6717182300528906e-05, + "loss": 1.817, + "step": 78460 + }, + { + "epoch": 0.4932021291222281, + "grad_norm": 7.481614112854004, + "learning_rate": 1.6716763199584253e-05, + "loss": 1.7001, + "step": 78470 + }, + { + "epoch": 0.4932649814389252, + "grad_norm": 7.955791473388672, + "learning_rate": 1.6716344098639597e-05, + "loss": 1.6302, + "step": 78480 + }, + { + "epoch": 0.49332783375562234, + "grad_norm": 6.260321140289307, + "learning_rate": 1.6715924997694944e-05, + "loss": 1.6797, + "step": 78490 + }, + { + "epoch": 0.49339068607231945, + "grad_norm": 7.346848487854004, + "learning_rate": 1.671550589675029e-05, + "loss": 1.7796, + "step": 78500 + }, + { + "epoch": 0.49345353838901657, + "grad_norm": 7.295152187347412, + "learning_rate": 1.671508679580564e-05, + "loss": 1.7205, + "step": 78510 + }, + { + "epoch": 0.4935163907057137, + "grad_norm": 6.241100788116455, + 
"learning_rate": 1.6714667694860985e-05, + "loss": 1.6379, + "step": 78520 + }, + { + "epoch": 0.4935792430224108, + "grad_norm": 6.412459850311279, + "learning_rate": 1.6714290504010797e-05, + "loss": 1.804, + "step": 78530 + }, + { + "epoch": 0.4936420953391079, + "grad_norm": 5.450584411621094, + "learning_rate": 1.6713871403066144e-05, + "loss": 1.7511, + "step": 78540 + }, + { + "epoch": 0.493704947655805, + "grad_norm": 6.743354797363281, + "learning_rate": 1.671345230212149e-05, + "loss": 1.5647, + "step": 78550 + }, + { + "epoch": 0.4937677999725021, + "grad_norm": 8.319733619689941, + "learning_rate": 1.6713033201176835e-05, + "loss": 1.8374, + "step": 78560 + }, + { + "epoch": 0.4938306522891992, + "grad_norm": 6.66218376159668, + "learning_rate": 1.671261410023218e-05, + "loss": 2.0294, + "step": 78570 + }, + { + "epoch": 0.4938935046058963, + "grad_norm": 6.698323726654053, + "learning_rate": 1.671219499928753e-05, + "loss": 1.7704, + "step": 78580 + }, + { + "epoch": 0.49395635692259343, + "grad_norm": 7.755212783813477, + "learning_rate": 1.6711775898342876e-05, + "loss": 1.6396, + "step": 78590 + }, + { + "epoch": 0.49401920923929055, + "grad_norm": 8.131885528564453, + "learning_rate": 1.6711356797398223e-05, + "loss": 1.66, + "step": 78600 + }, + { + "epoch": 0.49408206155598766, + "grad_norm": 6.574060916900635, + "learning_rate": 1.671093769645357e-05, + "loss": 1.6609, + "step": 78610 + }, + { + "epoch": 0.4941449138726848, + "grad_norm": 5.90941047668457, + "learning_rate": 1.6710518595508917e-05, + "loss": 1.6647, + "step": 78620 + }, + { + "epoch": 0.4942077661893819, + "grad_norm": 7.4213948249816895, + "learning_rate": 1.6710099494564264e-05, + "loss": 1.3468, + "step": 78630 + }, + { + "epoch": 0.494270618506079, + "grad_norm": 5.807704925537109, + "learning_rate": 1.670968039361961e-05, + "loss": 1.6507, + "step": 78640 + }, + { + "epoch": 0.4943334708227761, + "grad_norm": 7.539387226104736, + "learning_rate": 1.6709261292674955e-05, + 
"loss": 1.8559, + "step": 78650 + }, + { + "epoch": 0.49439632313947324, + "grad_norm": 6.764973163604736, + "learning_rate": 1.6708842191730302e-05, + "loss": 1.7355, + "step": 78660 + }, + { + "epoch": 0.49445917545617035, + "grad_norm": 6.689443588256836, + "learning_rate": 1.670842309078565e-05, + "loss": 1.6667, + "step": 78670 + }, + { + "epoch": 0.49452202777286747, + "grad_norm": 7.405571460723877, + "learning_rate": 1.6708003989840996e-05, + "loss": 1.7678, + "step": 78680 + }, + { + "epoch": 0.4945848800895645, + "grad_norm": 6.4708943367004395, + "learning_rate": 1.670758488889634e-05, + "loss": 1.7772, + "step": 78690 + }, + { + "epoch": 0.49464773240626164, + "grad_norm": 6.734555721282959, + "learning_rate": 1.6707165787951687e-05, + "loss": 1.6194, + "step": 78700 + }, + { + "epoch": 0.49471058472295876, + "grad_norm": 7.194657325744629, + "learning_rate": 1.6706746687007034e-05, + "loss": 1.6331, + "step": 78710 + }, + { + "epoch": 0.4947734370396559, + "grad_norm": 6.688826560974121, + "learning_rate": 1.670632758606238e-05, + "loss": 1.768, + "step": 78720 + }, + { + "epoch": 0.494836289356353, + "grad_norm": 8.241920471191406, + "learning_rate": 1.6705908485117728e-05, + "loss": 1.6582, + "step": 78730 + }, + { + "epoch": 0.4948991416730501, + "grad_norm": 6.8648152351379395, + "learning_rate": 1.6705489384173072e-05, + "loss": 1.7388, + "step": 78740 + }, + { + "epoch": 0.4949619939897472, + "grad_norm": 7.46151876449585, + "learning_rate": 1.670507028322842e-05, + "loss": 1.8008, + "step": 78750 + }, + { + "epoch": 0.49502484630644433, + "grad_norm": 5.9637131690979, + "learning_rate": 1.6704651182283766e-05, + "loss": 1.6126, + "step": 78760 + }, + { + "epoch": 0.49508769862314145, + "grad_norm": 6.775414943695068, + "learning_rate": 1.6704232081339113e-05, + "loss": 1.6424, + "step": 78770 + }, + { + "epoch": 0.49515055093983856, + "grad_norm": 5.462296962738037, + "learning_rate": 1.6703812980394457e-05, + "loss": 1.559, + "step": 78780 + }, 
+ { + "epoch": 0.4952134032565357, + "grad_norm": 6.365492343902588, + "learning_rate": 1.6703393879449804e-05, + "loss": 1.6461, + "step": 78790 + }, + { + "epoch": 0.4952762555732328, + "grad_norm": 6.2100510597229, + "learning_rate": 1.670297477850515e-05, + "loss": 1.6577, + "step": 78800 + }, + { + "epoch": 0.49533910788992985, + "grad_norm": 6.8301849365234375, + "learning_rate": 1.6702555677560498e-05, + "loss": 1.7671, + "step": 78810 + }, + { + "epoch": 0.49540196020662697, + "grad_norm": 6.008386611938477, + "learning_rate": 1.6702136576615845e-05, + "loss": 1.7751, + "step": 78820 + }, + { + "epoch": 0.4954648125233241, + "grad_norm": 6.045980930328369, + "learning_rate": 1.6701717475671192e-05, + "loss": 1.5703, + "step": 78830 + }, + { + "epoch": 0.4955276648400212, + "grad_norm": 7.105350971221924, + "learning_rate": 1.670129837472654e-05, + "loss": 1.8338, + "step": 78840 + }, + { + "epoch": 0.4955905171567183, + "grad_norm": 7.170408248901367, + "learning_rate": 1.6700879273781886e-05, + "loss": 1.6635, + "step": 78850 + }, + { + "epoch": 0.49565336947341543, + "grad_norm": 10.234091758728027, + "learning_rate": 1.6700460172837233e-05, + "loss": 2.009, + "step": 78860 + }, + { + "epoch": 0.49571622179011254, + "grad_norm": 7.489864349365234, + "learning_rate": 1.6700041071892577e-05, + "loss": 1.8661, + "step": 78870 + }, + { + "epoch": 0.49577907410680966, + "grad_norm": 6.527287006378174, + "learning_rate": 1.6699621970947924e-05, + "loss": 1.566, + "step": 78880 + }, + { + "epoch": 0.4958419264235068, + "grad_norm": 7.5397443771362305, + "learning_rate": 1.669920287000327e-05, + "loss": 1.6793, + "step": 78890 + }, + { + "epoch": 0.4959047787402039, + "grad_norm": 8.03945541381836, + "learning_rate": 1.6698783769058618e-05, + "loss": 1.8563, + "step": 78900 + }, + { + "epoch": 0.495967631056901, + "grad_norm": 6.880329608917236, + "learning_rate": 1.6698364668113965e-05, + "loss": 1.5937, + "step": 78910 + }, + { + "epoch": 0.4960304833735981, + 
"grad_norm": 6.510516166687012, + "learning_rate": 1.669794556716931e-05, + "loss": 1.6486, + "step": 78920 + }, + { + "epoch": 0.49609333569029523, + "grad_norm": 6.234617710113525, + "learning_rate": 1.6697526466224656e-05, + "loss": 1.9359, + "step": 78930 + }, + { + "epoch": 0.4961561880069923, + "grad_norm": 6.716326713562012, + "learning_rate": 1.6697107365280003e-05, + "loss": 1.9386, + "step": 78940 + }, + { + "epoch": 0.4962190403236894, + "grad_norm": 5.632035255432129, + "learning_rate": 1.669668826433535e-05, + "loss": 1.7872, + "step": 78950 + }, + { + "epoch": 0.4962818926403865, + "grad_norm": 7.106645107269287, + "learning_rate": 1.6696269163390694e-05, + "loss": 1.6277, + "step": 78960 + }, + { + "epoch": 0.49634474495708364, + "grad_norm": 7.37431001663208, + "learning_rate": 1.669585006244604e-05, + "loss": 1.8052, + "step": 78970 + }, + { + "epoch": 0.49640759727378075, + "grad_norm": 7.185262203216553, + "learning_rate": 1.6695430961501388e-05, + "loss": 1.8382, + "step": 78980 + }, + { + "epoch": 0.49647044959047787, + "grad_norm": 7.572688102722168, + "learning_rate": 1.6695011860556735e-05, + "loss": 1.7511, + "step": 78990 + }, + { + "epoch": 0.496533301907175, + "grad_norm": 6.76294469833374, + "learning_rate": 1.6694592759612082e-05, + "loss": 1.8781, + "step": 79000 + }, + { + "epoch": 0.4965961542238721, + "grad_norm": 6.124716758728027, + "learning_rate": 1.669417365866743e-05, + "loss": 1.5367, + "step": 79010 + }, + { + "epoch": 0.4966590065405692, + "grad_norm": 6.655550956726074, + "learning_rate": 1.6693754557722773e-05, + "loss": 1.6984, + "step": 79020 + }, + { + "epoch": 0.49672185885726633, + "grad_norm": 7.736663341522217, + "learning_rate": 1.669333545677812e-05, + "loss": 1.6341, + "step": 79030 + }, + { + "epoch": 0.49678471117396344, + "grad_norm": 6.517747402191162, + "learning_rate": 1.6692916355833467e-05, + "loss": 1.5439, + "step": 79040 + }, + { + "epoch": 0.49684756349066056, + "grad_norm": 7.1836113929748535, + 
"learning_rate": 1.6692497254888814e-05, + "loss": 1.7337, + "step": 79050 + }, + { + "epoch": 0.4969104158073577, + "grad_norm": 6.330719947814941, + "learning_rate": 1.669207815394416e-05, + "loss": 1.6814, + "step": 79060 + }, + { + "epoch": 0.49697326812405473, + "grad_norm": 7.029351234436035, + "learning_rate": 1.6691659052999508e-05, + "loss": 1.8586, + "step": 79070 + }, + { + "epoch": 0.49703612044075185, + "grad_norm": 5.868019104003906, + "learning_rate": 1.6691239952054855e-05, + "loss": 1.7189, + "step": 79080 + }, + { + "epoch": 0.49709897275744896, + "grad_norm": 7.495911598205566, + "learning_rate": 1.66908208511102e-05, + "loss": 1.5853, + "step": 79090 + }, + { + "epoch": 0.4971618250741461, + "grad_norm": 8.74422550201416, + "learning_rate": 1.6690401750165546e-05, + "loss": 1.7277, + "step": 79100 + }, + { + "epoch": 0.4972246773908432, + "grad_norm": 7.246800899505615, + "learning_rate": 1.6689982649220893e-05, + "loss": 1.5798, + "step": 79110 + }, + { + "epoch": 0.4972875297075403, + "grad_norm": 5.6232500076293945, + "learning_rate": 1.668956354827624e-05, + "loss": 1.8372, + "step": 79120 + }, + { + "epoch": 0.4973503820242374, + "grad_norm": 6.304361820220947, + "learning_rate": 1.6689144447331587e-05, + "loss": 1.7998, + "step": 79130 + }, + { + "epoch": 0.49741323434093454, + "grad_norm": 6.0489702224731445, + "learning_rate": 1.668872534638693e-05, + "loss": 1.6389, + "step": 79140 + }, + { + "epoch": 0.49747608665763166, + "grad_norm": 7.240331649780273, + "learning_rate": 1.6688306245442278e-05, + "loss": 1.8321, + "step": 79150 + }, + { + "epoch": 0.49753893897432877, + "grad_norm": 6.915675163269043, + "learning_rate": 1.6687887144497625e-05, + "loss": 1.7416, + "step": 79160 + }, + { + "epoch": 0.4976017912910259, + "grad_norm": 6.851961612701416, + "learning_rate": 1.6687468043552972e-05, + "loss": 1.8687, + "step": 79170 + }, + { + "epoch": 0.497664643607723, + "grad_norm": 6.435169219970703, + "learning_rate": 
1.6687048942608316e-05, + "loss": 1.7454, + "step": 79180 + }, + { + "epoch": 0.4977274959244201, + "grad_norm": 6.554869174957275, + "learning_rate": 1.6686629841663663e-05, + "loss": 1.667, + "step": 79190 + }, + { + "epoch": 0.4977903482411172, + "grad_norm": 5.473569393157959, + "learning_rate": 1.668621074071901e-05, + "loss": 1.6133, + "step": 79200 + }, + { + "epoch": 0.4978532005578143, + "grad_norm": 6.134435653686523, + "learning_rate": 1.6685791639774357e-05, + "loss": 1.4101, + "step": 79210 + }, + { + "epoch": 0.4979160528745114, + "grad_norm": 6.592728614807129, + "learning_rate": 1.6685372538829704e-05, + "loss": 1.398, + "step": 79220 + }, + { + "epoch": 0.4979789051912085, + "grad_norm": 5.825088977813721, + "learning_rate": 1.668495343788505e-05, + "loss": 1.629, + "step": 79230 + }, + { + "epoch": 0.49804175750790564, + "grad_norm": 6.5205769538879395, + "learning_rate": 1.6684534336940398e-05, + "loss": 1.773, + "step": 79240 + }, + { + "epoch": 0.49810460982460275, + "grad_norm": 5.9171833992004395, + "learning_rate": 1.6684115235995745e-05, + "loss": 1.5642, + "step": 79250 + }, + { + "epoch": 0.49816746214129987, + "grad_norm": 6.904202461242676, + "learning_rate": 1.6683696135051092e-05, + "loss": 1.5912, + "step": 79260 + }, + { + "epoch": 0.498230314457997, + "grad_norm": 5.861415863037109, + "learning_rate": 1.6683277034106436e-05, + "loss": 1.6165, + "step": 79270 + }, + { + "epoch": 0.4982931667746941, + "grad_norm": 6.141399383544922, + "learning_rate": 1.6682857933161783e-05, + "loss": 1.7482, + "step": 79280 + }, + { + "epoch": 0.4983560190913912, + "grad_norm": 6.803699016571045, + "learning_rate": 1.668243883221713e-05, + "loss": 1.6359, + "step": 79290 + }, + { + "epoch": 0.4984188714080883, + "grad_norm": 7.952588081359863, + "learning_rate": 1.6682019731272477e-05, + "loss": 1.743, + "step": 79300 + }, + { + "epoch": 0.49848172372478544, + "grad_norm": 6.018862724304199, + "learning_rate": 1.668160063032782e-05, + "loss": 
1.6078, + "step": 79310 + }, + { + "epoch": 0.49854457604148256, + "grad_norm": 6.411401748657227, + "learning_rate": 1.6681181529383168e-05, + "loss": 1.673, + "step": 79320 + }, + { + "epoch": 0.4986074283581796, + "grad_norm": 6.5400872230529785, + "learning_rate": 1.6680762428438515e-05, + "loss": 1.6518, + "step": 79330 + }, + { + "epoch": 0.49867028067487673, + "grad_norm": 7.193464279174805, + "learning_rate": 1.6680343327493862e-05, + "loss": 1.6081, + "step": 79340 + }, + { + "epoch": 0.49873313299157385, + "grad_norm": 7.338587284088135, + "learning_rate": 1.667992422654921e-05, + "loss": 1.7216, + "step": 79350 + }, + { + "epoch": 0.49879598530827096, + "grad_norm": 6.640892028808594, + "learning_rate": 1.6679505125604553e-05, + "loss": 1.7834, + "step": 79360 + }, + { + "epoch": 0.4988588376249681, + "grad_norm": 6.7227349281311035, + "learning_rate": 1.66790860246599e-05, + "loss": 1.7339, + "step": 79370 + }, + { + "epoch": 0.4989216899416652, + "grad_norm": 7.434032917022705, + "learning_rate": 1.6678666923715247e-05, + "loss": 1.6488, + "step": 79380 + }, + { + "epoch": 0.4989845422583623, + "grad_norm": 8.0999174118042, + "learning_rate": 1.6678247822770594e-05, + "loss": 1.7659, + "step": 79390 + }, + { + "epoch": 0.4990473945750594, + "grad_norm": 6.468531608581543, + "learning_rate": 1.6677828721825938e-05, + "loss": 1.7375, + "step": 79400 + }, + { + "epoch": 0.49911024689175654, + "grad_norm": 6.481029987335205, + "learning_rate": 1.6677409620881285e-05, + "loss": 1.8437, + "step": 79410 + }, + { + "epoch": 0.49917309920845365, + "grad_norm": 5.684129238128662, + "learning_rate": 1.6676990519936632e-05, + "loss": 1.7311, + "step": 79420 + }, + { + "epoch": 0.49923595152515077, + "grad_norm": 5.615518569946289, + "learning_rate": 1.667657141899198e-05, + "loss": 1.6639, + "step": 79430 + }, + { + "epoch": 0.4992988038418479, + "grad_norm": 6.510375499725342, + "learning_rate": 1.6676152318047326e-05, + "loss": 1.6085, + "step": 79440 + }, + { + 
"epoch": 0.49936165615854494, + "grad_norm": 6.378992080688477, + "learning_rate": 1.6675733217102673e-05, + "loss": 1.5479, + "step": 79450 + }, + { + "epoch": 0.49942450847524206, + "grad_norm": 5.967948913574219, + "learning_rate": 1.667531411615802e-05, + "loss": 1.5811, + "step": 79460 + }, + { + "epoch": 0.4994873607919392, + "grad_norm": 6.28527307510376, + "learning_rate": 1.6674895015213367e-05, + "loss": 1.732, + "step": 79470 + }, + { + "epoch": 0.4995502131086363, + "grad_norm": 7.023561954498291, + "learning_rate": 1.6674475914268714e-05, + "loss": 1.6311, + "step": 79480 + }, + { + "epoch": 0.4996130654253334, + "grad_norm": 12.292658805847168, + "learning_rate": 1.6674056813324058e-05, + "loss": 1.5554, + "step": 79490 + }, + { + "epoch": 0.4996759177420305, + "grad_norm": 7.064693450927734, + "learning_rate": 1.6673637712379405e-05, + "loss": 1.8701, + "step": 79500 + }, + { + "epoch": 0.49973877005872763, + "grad_norm": 6.772129058837891, + "learning_rate": 1.6673218611434752e-05, + "loss": 1.728, + "step": 79510 + }, + { + "epoch": 0.49980162237542475, + "grad_norm": 6.433325290679932, + "learning_rate": 1.66727995104901e-05, + "loss": 1.6161, + "step": 79520 + }, + { + "epoch": 0.49986447469212186, + "grad_norm": 6.294651985168457, + "learning_rate": 1.6672380409545446e-05, + "loss": 1.8939, + "step": 79530 + }, + { + "epoch": 0.499927327008819, + "grad_norm": 7.601191997528076, + "learning_rate": 1.667196130860079e-05, + "loss": 1.646, + "step": 79540 + }, + { + "epoch": 0.4999901793255161, + "grad_norm": 6.567856311798096, + "learning_rate": 1.6671542207656137e-05, + "loss": 1.6362, + "step": 79550 + }, + { + "epoch": 0.5000530316422132, + "grad_norm": 7.186223983764648, + "learning_rate": 1.6671123106711484e-05, + "loss": 1.7599, + "step": 79560 + }, + { + "epoch": 0.5001158839589103, + "grad_norm": 7.041337490081787, + "learning_rate": 1.667070400576683e-05, + "loss": 1.6379, + "step": 79570 + }, + { + "epoch": 0.5001787362756074, + 
"grad_norm": 6.522541522979736, + "learning_rate": 1.6670284904822175e-05, + "loss": 1.4968, + "step": 79580 + }, + { + "epoch": 0.5002415885923045, + "grad_norm": 7.405561923980713, + "learning_rate": 1.6669865803877522e-05, + "loss": 1.6606, + "step": 79590 + }, + { + "epoch": 0.5003044409090016, + "grad_norm": 7.2410173416137695, + "learning_rate": 1.666944670293287e-05, + "loss": 1.7189, + "step": 79600 + }, + { + "epoch": 0.5003672932256987, + "grad_norm": 6.9519476890563965, + "learning_rate": 1.6669027601988216e-05, + "loss": 1.6469, + "step": 79610 + }, + { + "epoch": 0.5004301455423958, + "grad_norm": 6.269879341125488, + "learning_rate": 1.6668608501043563e-05, + "loss": 1.6043, + "step": 79620 + }, + { + "epoch": 0.500492997859093, + "grad_norm": 5.454013824462891, + "learning_rate": 1.666818940009891e-05, + "loss": 1.703, + "step": 79630 + }, + { + "epoch": 0.5005558501757901, + "grad_norm": 6.524136066436768, + "learning_rate": 1.6667770299154257e-05, + "loss": 1.6494, + "step": 79640 + }, + { + "epoch": 0.5006187024924872, + "grad_norm": 7.585860252380371, + "learning_rate": 1.66673511982096e-05, + "loss": 1.9085, + "step": 79650 + }, + { + "epoch": 0.5006815548091843, + "grad_norm": 6.055868148803711, + "learning_rate": 1.6666932097264948e-05, + "loss": 1.7279, + "step": 79660 + }, + { + "epoch": 0.5007444071258814, + "grad_norm": 6.640485763549805, + "learning_rate": 1.6666512996320295e-05, + "loss": 1.617, + "step": 79670 + }, + { + "epoch": 0.5008072594425785, + "grad_norm": 7.06798791885376, + "learning_rate": 1.6666093895375642e-05, + "loss": 1.834, + "step": 79680 + }, + { + "epoch": 0.5008701117592756, + "grad_norm": 6.391556262969971, + "learning_rate": 1.666567479443099e-05, + "loss": 1.6155, + "step": 79690 + }, + { + "epoch": 0.5009329640759728, + "grad_norm": 6.465502738952637, + "learning_rate": 1.6665255693486336e-05, + "loss": 1.6234, + "step": 79700 + }, + { + "epoch": 0.5009958163926699, + "grad_norm": 6.327841758728027, + 
"learning_rate": 1.6664878502636144e-05, + "loss": 1.5977, + "step": 79710 + }, + { + "epoch": 0.501058668709367, + "grad_norm": 7.11573600769043, + "learning_rate": 1.666445940169149e-05, + "loss": 1.9059, + "step": 79720 + }, + { + "epoch": 0.5011215210260641, + "grad_norm": 7.2466840744018555, + "learning_rate": 1.666404030074684e-05, + "loss": 1.8072, + "step": 79730 + }, + { + "epoch": 0.5011843733427612, + "grad_norm": 7.251349925994873, + "learning_rate": 1.6663621199802185e-05, + "loss": 1.7005, + "step": 79740 + }, + { + "epoch": 0.5012472256594583, + "grad_norm": 6.59246301651001, + "learning_rate": 1.6663202098857533e-05, + "loss": 1.7184, + "step": 79750 + }, + { + "epoch": 0.5013100779761555, + "grad_norm": 6.917023658752441, + "learning_rate": 1.666278299791288e-05, + "loss": 1.7669, + "step": 79760 + }, + { + "epoch": 0.5013729302928525, + "grad_norm": 7.197429656982422, + "learning_rate": 1.6662363896968227e-05, + "loss": 1.9412, + "step": 79770 + }, + { + "epoch": 0.5014357826095496, + "grad_norm": 6.37503719329834, + "learning_rate": 1.6661944796023574e-05, + "loss": 1.8577, + "step": 79780 + }, + { + "epoch": 0.5014986349262467, + "grad_norm": 8.113883972167969, + "learning_rate": 1.6661525695078917e-05, + "loss": 1.7181, + "step": 79790 + }, + { + "epoch": 0.5015614872429438, + "grad_norm": 8.317326545715332, + "learning_rate": 1.6661106594134264e-05, + "loss": 1.6948, + "step": 79800 + }, + { + "epoch": 0.5016243395596409, + "grad_norm": 6.044445037841797, + "learning_rate": 1.666068749318961e-05, + "loss": 1.753, + "step": 79810 + }, + { + "epoch": 0.501687191876338, + "grad_norm": 5.891312599182129, + "learning_rate": 1.666026839224496e-05, + "loss": 1.8129, + "step": 79820 + }, + { + "epoch": 0.5017500441930351, + "grad_norm": 6.871014595031738, + "learning_rate": 1.6659849291300302e-05, + "loss": 1.8918, + "step": 79830 + }, + { + "epoch": 0.5018128965097323, + "grad_norm": 7.177075386047363, + "learning_rate": 1.665943019035565e-05, + 
"loss": 1.8326, + "step": 79840 + }, + { + "epoch": 0.5018757488264294, + "grad_norm": 6.608696460723877, + "learning_rate": 1.6659011089410996e-05, + "loss": 1.6227, + "step": 79850 + }, + { + "epoch": 0.5019386011431265, + "grad_norm": 7.607576370239258, + "learning_rate": 1.6658591988466344e-05, + "loss": 1.766, + "step": 79860 + }, + { + "epoch": 0.5020014534598236, + "grad_norm": 6.3681254386901855, + "learning_rate": 1.665817288752169e-05, + "loss": 1.8021, + "step": 79870 + }, + { + "epoch": 0.5020643057765207, + "grad_norm": 6.925334930419922, + "learning_rate": 1.6657753786577034e-05, + "loss": 1.565, + "step": 79880 + }, + { + "epoch": 0.5021271580932178, + "grad_norm": 5.767662048339844, + "learning_rate": 1.665733468563238e-05, + "loss": 1.5511, + "step": 79890 + }, + { + "epoch": 0.502190010409915, + "grad_norm": 6.4178147315979, + "learning_rate": 1.665691558468773e-05, + "loss": 1.982, + "step": 79900 + }, + { + "epoch": 0.5022528627266121, + "grad_norm": 6.5203142166137695, + "learning_rate": 1.6656496483743075e-05, + "loss": 1.6557, + "step": 79910 + }, + { + "epoch": 0.5023157150433092, + "grad_norm": 6.301761150360107, + "learning_rate": 1.6656077382798423e-05, + "loss": 1.7178, + "step": 79920 + }, + { + "epoch": 0.5023785673600063, + "grad_norm": 7.500464916229248, + "learning_rate": 1.665565828185377e-05, + "loss": 1.5183, + "step": 79930 + }, + { + "epoch": 0.5024414196767034, + "grad_norm": 6.666856288909912, + "learning_rate": 1.6655239180909117e-05, + "loss": 1.7092, + "step": 79940 + }, + { + "epoch": 0.5025042719934005, + "grad_norm": 6.485771656036377, + "learning_rate": 1.6654820079964464e-05, + "loss": 1.6388, + "step": 79950 + }, + { + "epoch": 0.5025671243100976, + "grad_norm": 6.726188659667969, + "learning_rate": 1.6654400979019807e-05, + "loss": 1.7191, + "step": 79960 + }, + { + "epoch": 0.5026299766267948, + "grad_norm": 6.920906066894531, + "learning_rate": 1.6653981878075155e-05, + "loss": 1.7832, + "step": 79970 + }, + { + 
"epoch": 0.5026928289434919, + "grad_norm": 7.067952632904053, + "learning_rate": 1.66535627771305e-05, + "loss": 1.8002, + "step": 79980 + }, + { + "epoch": 0.502755681260189, + "grad_norm": 7.4023518562316895, + "learning_rate": 1.665314367618585e-05, + "loss": 1.5641, + "step": 79990 + }, + { + "epoch": 0.5028185335768861, + "grad_norm": 6.506004333496094, + "learning_rate": 1.6652724575241196e-05, + "loss": 1.5715, + "step": 80000 + }, + { + "epoch": 0.5028813858935832, + "grad_norm": 6.265328884124756, + "learning_rate": 1.665230547429654e-05, + "loss": 1.505, + "step": 80010 + }, + { + "epoch": 0.5029442382102803, + "grad_norm": 6.966944217681885, + "learning_rate": 1.6651886373351886e-05, + "loss": 1.6603, + "step": 80020 + }, + { + "epoch": 0.5030070905269773, + "grad_norm": 6.564925670623779, + "learning_rate": 1.6651467272407234e-05, + "loss": 1.8559, + "step": 80030 + }, + { + "epoch": 0.5030699428436745, + "grad_norm": 6.52651834487915, + "learning_rate": 1.665104817146258e-05, + "loss": 1.8099, + "step": 80040 + }, + { + "epoch": 0.5031327951603716, + "grad_norm": 5.758645057678223, + "learning_rate": 1.6650629070517924e-05, + "loss": 1.7989, + "step": 80050 + }, + { + "epoch": 0.5031956474770687, + "grad_norm": 6.67716646194458, + "learning_rate": 1.665020996957327e-05, + "loss": 1.5506, + "step": 80060 + }, + { + "epoch": 0.5032584997937658, + "grad_norm": 6.314333915710449, + "learning_rate": 1.664979086862862e-05, + "loss": 1.6887, + "step": 80070 + }, + { + "epoch": 0.5033213521104629, + "grad_norm": 6.7269768714904785, + "learning_rate": 1.6649371767683966e-05, + "loss": 1.4443, + "step": 80080 + }, + { + "epoch": 0.50338420442716, + "grad_norm": 6.974546432495117, + "learning_rate": 1.6648952666739313e-05, + "loss": 1.7091, + "step": 80090 + }, + { + "epoch": 0.5034470567438571, + "grad_norm": 7.477077007293701, + "learning_rate": 1.6648533565794656e-05, + "loss": 1.6621, + "step": 80100 + }, + { + "epoch": 0.5035099090605543, + "grad_norm": 
6.204904556274414, + "learning_rate": 1.6648114464850003e-05, + "loss": 1.8354, + "step": 80110 + }, + { + "epoch": 0.5035727613772514, + "grad_norm": 6.625732898712158, + "learning_rate": 1.664769536390535e-05, + "loss": 1.6104, + "step": 80120 + }, + { + "epoch": 0.5036356136939485, + "grad_norm": 6.183162689208984, + "learning_rate": 1.6647276262960697e-05, + "loss": 1.557, + "step": 80130 + }, + { + "epoch": 0.5036984660106456, + "grad_norm": 7.019721031188965, + "learning_rate": 1.6646857162016045e-05, + "loss": 1.589, + "step": 80140 + }, + { + "epoch": 0.5037613183273427, + "grad_norm": 6.763312816619873, + "learning_rate": 1.664643806107139e-05, + "loss": 1.4686, + "step": 80150 + }, + { + "epoch": 0.5038241706440398, + "grad_norm": 6.7790727615356445, + "learning_rate": 1.664601896012674e-05, + "loss": 1.6208, + "step": 80160 + }, + { + "epoch": 0.503887022960737, + "grad_norm": 6.917080402374268, + "learning_rate": 1.6645599859182086e-05, + "loss": 1.8251, + "step": 80170 + }, + { + "epoch": 0.5039498752774341, + "grad_norm": 6.307061672210693, + "learning_rate": 1.6645180758237433e-05, + "loss": 1.9444, + "step": 80180 + }, + { + "epoch": 0.5040127275941312, + "grad_norm": 8.953438758850098, + "learning_rate": 1.6644761657292777e-05, + "loss": 1.6907, + "step": 80190 + }, + { + "epoch": 0.5040755799108283, + "grad_norm": 6.719122886657715, + "learning_rate": 1.6644342556348124e-05, + "loss": 1.6157, + "step": 80200 + }, + { + "epoch": 0.5041384322275254, + "grad_norm": 5.839183330535889, + "learning_rate": 1.664392345540347e-05, + "loss": 1.6481, + "step": 80210 + }, + { + "epoch": 0.5042012845442225, + "grad_norm": 6.459103584289551, + "learning_rate": 1.6643504354458818e-05, + "loss": 1.7897, + "step": 80220 + }, + { + "epoch": 0.5042641368609196, + "grad_norm": 6.475305080413818, + "learning_rate": 1.664308525351416e-05, + "loss": 1.6833, + "step": 80230 + }, + { + "epoch": 0.5043269891776168, + "grad_norm": 7.551462650299072, + "learning_rate": 
1.664266615256951e-05, + "loss": 1.5904, + "step": 80240 + }, + { + "epoch": 0.5043898414943139, + "grad_norm": 7.036133766174316, + "learning_rate": 1.6642247051624856e-05, + "loss": 1.7531, + "step": 80250 + }, + { + "epoch": 0.504452693811011, + "grad_norm": 7.232636451721191, + "learning_rate": 1.6641827950680203e-05, + "loss": 1.7086, + "step": 80260 + }, + { + "epoch": 0.5045155461277081, + "grad_norm": 6.5783915519714355, + "learning_rate": 1.664140884973555e-05, + "loss": 1.8095, + "step": 80270 + }, + { + "epoch": 0.5045783984444051, + "grad_norm": 6.384034156799316, + "learning_rate": 1.6640989748790893e-05, + "loss": 1.7586, + "step": 80280 + }, + { + "epoch": 0.5046412507611022, + "grad_norm": 6.8381805419921875, + "learning_rate": 1.664057064784624e-05, + "loss": 1.6873, + "step": 80290 + }, + { + "epoch": 0.5047041030777993, + "grad_norm": 6.193711280822754, + "learning_rate": 1.6640151546901588e-05, + "loss": 1.6226, + "step": 80300 + }, + { + "epoch": 0.5047669553944965, + "grad_norm": 6.295399188995361, + "learning_rate": 1.6639732445956935e-05, + "loss": 1.9116, + "step": 80310 + }, + { + "epoch": 0.5048298077111936, + "grad_norm": 7.165435791015625, + "learning_rate": 1.663931334501228e-05, + "loss": 1.6044, + "step": 80320 + }, + { + "epoch": 0.5048926600278907, + "grad_norm": 7.73527193069458, + "learning_rate": 1.663889424406763e-05, + "loss": 1.831, + "step": 80330 + }, + { + "epoch": 0.5049555123445878, + "grad_norm": 7.749932765960693, + "learning_rate": 1.6638475143122972e-05, + "loss": 1.6931, + "step": 80340 + }, + { + "epoch": 0.5050183646612849, + "grad_norm": 6.340909481048584, + "learning_rate": 1.663805604217832e-05, + "loss": 1.6823, + "step": 80350 + }, + { + "epoch": 0.505081216977982, + "grad_norm": 6.750431537628174, + "learning_rate": 1.6637636941233667e-05, + "loss": 1.7272, + "step": 80360 + }, + { + "epoch": 0.5051440692946791, + "grad_norm": 6.4755120277404785, + "learning_rate": 1.6637217840289014e-05, + "loss": 1.8015, + 
"step": 80370 + }, + { + "epoch": 0.5052069216113763, + "grad_norm": 6.332870960235596, + "learning_rate": 1.663679873934436e-05, + "loss": 1.7403, + "step": 80380 + }, + { + "epoch": 0.5052697739280734, + "grad_norm": 6.5690412521362305, + "learning_rate": 1.6636379638399708e-05, + "loss": 1.8548, + "step": 80390 + }, + { + "epoch": 0.5053326262447705, + "grad_norm": 7.2789225578308105, + "learning_rate": 1.6635960537455055e-05, + "loss": 1.7637, + "step": 80400 + }, + { + "epoch": 0.5053954785614676, + "grad_norm": 7.356903076171875, + "learning_rate": 1.66355414365104e-05, + "loss": 1.7046, + "step": 80410 + }, + { + "epoch": 0.5054583308781647, + "grad_norm": 6.851481914520264, + "learning_rate": 1.6635122335565746e-05, + "loss": 1.7586, + "step": 80420 + }, + { + "epoch": 0.5055211831948618, + "grad_norm": 6.518064498901367, + "learning_rate": 1.6634703234621093e-05, + "loss": 1.7848, + "step": 80430 + }, + { + "epoch": 0.505584035511559, + "grad_norm": 6.317898273468018, + "learning_rate": 1.663428413367644e-05, + "loss": 1.6302, + "step": 80440 + }, + { + "epoch": 0.5056468878282561, + "grad_norm": 6.76554536819458, + "learning_rate": 1.6633865032731783e-05, + "loss": 1.7885, + "step": 80450 + }, + { + "epoch": 0.5057097401449532, + "grad_norm": 7.445808410644531, + "learning_rate": 1.663344593178713e-05, + "loss": 1.763, + "step": 80460 + }, + { + "epoch": 0.5057725924616503, + "grad_norm": 7.02715539932251, + "learning_rate": 1.6633026830842478e-05, + "loss": 1.8355, + "step": 80470 + }, + { + "epoch": 0.5058354447783474, + "grad_norm": 6.42625093460083, + "learning_rate": 1.6632607729897825e-05, + "loss": 1.5474, + "step": 80480 + }, + { + "epoch": 0.5058982970950445, + "grad_norm": 7.86593770980835, + "learning_rate": 1.6632188628953172e-05, + "loss": 1.7894, + "step": 80490 + }, + { + "epoch": 0.5059611494117416, + "grad_norm": 6.331607341766357, + "learning_rate": 1.6631769528008515e-05, + "loss": 1.7241, + "step": 80500 + }, + { + "epoch": 
0.5060240017284388, + "grad_norm": 7.302580833435059, + "learning_rate": 1.6631350427063862e-05, + "loss": 1.7968, + "step": 80510 + }, + { + "epoch": 0.5060868540451359, + "grad_norm": 5.495927810668945, + "learning_rate": 1.663093132611921e-05, + "loss": 1.6062, + "step": 80520 + }, + { + "epoch": 0.506149706361833, + "grad_norm": 6.512253761291504, + "learning_rate": 1.6630512225174557e-05, + "loss": 1.6953, + "step": 80530 + }, + { + "epoch": 0.50621255867853, + "grad_norm": 8.227531433105469, + "learning_rate": 1.6630093124229904e-05, + "loss": 1.688, + "step": 80540 + }, + { + "epoch": 0.5062754109952271, + "grad_norm": 7.652024745941162, + "learning_rate": 1.662967402328525e-05, + "loss": 1.719, + "step": 80550 + }, + { + "epoch": 0.5063382633119242, + "grad_norm": 6.868397235870361, + "learning_rate": 1.6629254922340598e-05, + "loss": 1.5406, + "step": 80560 + }, + { + "epoch": 0.5064011156286213, + "grad_norm": 5.889806270599365, + "learning_rate": 1.6628835821395945e-05, + "loss": 1.7383, + "step": 80570 + }, + { + "epoch": 0.5064639679453184, + "grad_norm": 5.755928039550781, + "learning_rate": 1.6628416720451292e-05, + "loss": 1.7932, + "step": 80580 + }, + { + "epoch": 0.5065268202620156, + "grad_norm": 7.116064071655273, + "learning_rate": 1.6627997619506636e-05, + "loss": 1.6301, + "step": 80590 + }, + { + "epoch": 0.5065896725787127, + "grad_norm": 6.306654930114746, + "learning_rate": 1.6627578518561983e-05, + "loss": 1.5587, + "step": 80600 + }, + { + "epoch": 0.5066525248954098, + "grad_norm": 6.208273887634277, + "learning_rate": 1.662715941761733e-05, + "loss": 1.6283, + "step": 80610 + }, + { + "epoch": 0.5067153772121069, + "grad_norm": 6.317222595214844, + "learning_rate": 1.6626740316672677e-05, + "loss": 1.6919, + "step": 80620 + }, + { + "epoch": 0.506778229528804, + "grad_norm": 6.808119297027588, + "learning_rate": 1.662632121572802e-05, + "loss": 1.7396, + "step": 80630 + }, + { + "epoch": 0.5068410818455011, + "grad_norm": 
6.985289573669434, + "learning_rate": 1.6625902114783368e-05, + "loss": 1.8456, + "step": 80640 + }, + { + "epoch": 0.5069039341621983, + "grad_norm": 7.419642925262451, + "learning_rate": 1.6625483013838715e-05, + "loss": 1.6966, + "step": 80650 + }, + { + "epoch": 0.5069667864788954, + "grad_norm": 5.950889587402344, + "learning_rate": 1.6625063912894062e-05, + "loss": 1.8473, + "step": 80660 + }, + { + "epoch": 0.5070296387955925, + "grad_norm": 6.5306878089904785, + "learning_rate": 1.662464481194941e-05, + "loss": 1.8264, + "step": 80670 + }, + { + "epoch": 0.5070924911122896, + "grad_norm": 6.157708644866943, + "learning_rate": 1.6624225711004752e-05, + "loss": 1.642, + "step": 80680 + }, + { + "epoch": 0.5071553434289867, + "grad_norm": 6.647695064544678, + "learning_rate": 1.66238066100601e-05, + "loss": 1.7642, + "step": 80690 + }, + { + "epoch": 0.5072181957456838, + "grad_norm": 7.1048359870910645, + "learning_rate": 1.6623387509115447e-05, + "loss": 1.6765, + "step": 80700 + }, + { + "epoch": 0.507281048062381, + "grad_norm": 7.053614139556885, + "learning_rate": 1.6622968408170794e-05, + "loss": 1.5628, + "step": 80710 + }, + { + "epoch": 0.5073439003790781, + "grad_norm": 7.329823970794678, + "learning_rate": 1.6622549307226137e-05, + "loss": 1.7809, + "step": 80720 + }, + { + "epoch": 0.5074067526957752, + "grad_norm": 5.393566608428955, + "learning_rate": 1.6622130206281484e-05, + "loss": 1.5963, + "step": 80730 + }, + { + "epoch": 0.5074696050124723, + "grad_norm": 6.427209854125977, + "learning_rate": 1.662171110533683e-05, + "loss": 1.5768, + "step": 80740 + }, + { + "epoch": 0.5075324573291694, + "grad_norm": 6.104846954345703, + "learning_rate": 1.662129200439218e-05, + "loss": 1.9616, + "step": 80750 + }, + { + "epoch": 0.5075953096458665, + "grad_norm": 6.925151824951172, + "learning_rate": 1.6620872903447526e-05, + "loss": 1.6709, + "step": 80760 + }, + { + "epoch": 0.5076581619625636, + "grad_norm": 8.292290687561035, + "learning_rate": 
1.6620453802502873e-05, + "loss": 1.6986, + "step": 80770 + }, + { + "epoch": 0.5077210142792608, + "grad_norm": 7.496538162231445, + "learning_rate": 1.662003470155822e-05, + "loss": 1.6919, + "step": 80780 + }, + { + "epoch": 0.5077838665959578, + "grad_norm": 5.652065753936768, + "learning_rate": 1.6619615600613567e-05, + "loss": 1.6888, + "step": 80790 + }, + { + "epoch": 0.5078467189126549, + "grad_norm": 6.844841480255127, + "learning_rate": 1.6619196499668914e-05, + "loss": 1.8496, + "step": 80800 + }, + { + "epoch": 0.507909571229352, + "grad_norm": 5.9920806884765625, + "learning_rate": 1.6618777398724258e-05, + "loss": 1.5783, + "step": 80810 + }, + { + "epoch": 0.5079724235460491, + "grad_norm": 7.539910793304443, + "learning_rate": 1.6618358297779605e-05, + "loss": 1.8375, + "step": 80820 + }, + { + "epoch": 0.5080352758627462, + "grad_norm": 7.230218887329102, + "learning_rate": 1.6617939196834952e-05, + "loss": 1.7045, + "step": 80830 + }, + { + "epoch": 0.5080981281794433, + "grad_norm": 6.424299716949463, + "learning_rate": 1.66175200958903e-05, + "loss": 1.5901, + "step": 80840 + }, + { + "epoch": 0.5081609804961404, + "grad_norm": 8.0871000289917, + "learning_rate": 1.6617100994945643e-05, + "loss": 1.7978, + "step": 80850 + }, + { + "epoch": 0.5082238328128376, + "grad_norm": 6.569265365600586, + "learning_rate": 1.661668189400099e-05, + "loss": 1.5899, + "step": 80860 + }, + { + "epoch": 0.5082866851295347, + "grad_norm": 6.608719348907471, + "learning_rate": 1.6616262793056337e-05, + "loss": 1.5594, + "step": 80870 + }, + { + "epoch": 0.5083495374462318, + "grad_norm": 7.097671031951904, + "learning_rate": 1.6615843692111684e-05, + "loss": 1.6149, + "step": 80880 + }, + { + "epoch": 0.5084123897629289, + "grad_norm": 7.51013708114624, + "learning_rate": 1.661542459116703e-05, + "loss": 1.6666, + "step": 80890 + }, + { + "epoch": 0.508475242079626, + "grad_norm": 7.3628058433532715, + "learning_rate": 1.6615005490222374e-05, + "loss": 1.4975, + 
"step": 80900 + }, + { + "epoch": 0.5085380943963231, + "grad_norm": 7.488697528839111, + "learning_rate": 1.661458638927772e-05, + "loss": 1.8612, + "step": 80910 + }, + { + "epoch": 0.5086009467130203, + "grad_norm": 6.688188076019287, + "learning_rate": 1.661416728833307e-05, + "loss": 1.6634, + "step": 80920 + }, + { + "epoch": 0.5086637990297174, + "grad_norm": 5.443098068237305, + "learning_rate": 1.6613748187388416e-05, + "loss": 1.551, + "step": 80930 + }, + { + "epoch": 0.5087266513464145, + "grad_norm": 6.645493030548096, + "learning_rate": 1.6613329086443763e-05, + "loss": 1.6804, + "step": 80940 + }, + { + "epoch": 0.5087895036631116, + "grad_norm": 6.472682476043701, + "learning_rate": 1.661290998549911e-05, + "loss": 1.4649, + "step": 80950 + }, + { + "epoch": 0.5088523559798087, + "grad_norm": 6.152359485626221, + "learning_rate": 1.6612490884554457e-05, + "loss": 1.6103, + "step": 80960 + }, + { + "epoch": 0.5089152082965058, + "grad_norm": 7.1260986328125, + "learning_rate": 1.66120717836098e-05, + "loss": 1.6803, + "step": 80970 + }, + { + "epoch": 0.5089780606132029, + "grad_norm": 7.128537178039551, + "learning_rate": 1.6611652682665148e-05, + "loss": 1.7006, + "step": 80980 + }, + { + "epoch": 0.5090409129299001, + "grad_norm": 6.813705921173096, + "learning_rate": 1.6611233581720495e-05, + "loss": 1.9884, + "step": 80990 + }, + { + "epoch": 0.5091037652465972, + "grad_norm": 6.137548923492432, + "learning_rate": 1.6610814480775842e-05, + "loss": 1.7066, + "step": 81000 + }, + { + "epoch": 0.5091666175632943, + "grad_norm": 7.284581184387207, + "learning_rate": 1.661039537983119e-05, + "loss": 1.6337, + "step": 81010 + }, + { + "epoch": 0.5092294698799914, + "grad_norm": 7.394177436828613, + "learning_rate": 1.6609976278886536e-05, + "loss": 1.6749, + "step": 81020 + }, + { + "epoch": 0.5092923221966885, + "grad_norm": 7.670345783233643, + "learning_rate": 1.660955717794188e-05, + "loss": 1.7117, + "step": 81030 + }, + { + "epoch": 
0.5093551745133856, + "grad_norm": 5.4331817626953125, + "learning_rate": 1.6609138076997227e-05, + "loss": 1.5852, + "step": 81040 + }, + { + "epoch": 0.5094180268300826, + "grad_norm": 7.567220687866211, + "learning_rate": 1.6608718976052574e-05, + "loss": 1.5489, + "step": 81050 + }, + { + "epoch": 0.5094808791467798, + "grad_norm": 6.857569217681885, + "learning_rate": 1.660829987510792e-05, + "loss": 1.9249, + "step": 81060 + }, + { + "epoch": 0.5095437314634769, + "grad_norm": 6.4807562828063965, + "learning_rate": 1.6607880774163265e-05, + "loss": 1.9061, + "step": 81070 + }, + { + "epoch": 0.509606583780174, + "grad_norm": 6.4150567054748535, + "learning_rate": 1.660746167321861e-05, + "loss": 1.6716, + "step": 81080 + }, + { + "epoch": 0.5096694360968711, + "grad_norm": 6.111120223999023, + "learning_rate": 1.660704257227396e-05, + "loss": 1.858, + "step": 81090 + }, + { + "epoch": 0.5097322884135682, + "grad_norm": 6.444390773773193, + "learning_rate": 1.6606623471329306e-05, + "loss": 1.4956, + "step": 81100 + }, + { + "epoch": 0.5097951407302653, + "grad_norm": 7.489157199859619, + "learning_rate": 1.6606204370384653e-05, + "loss": 1.582, + "step": 81110 + }, + { + "epoch": 0.5098579930469624, + "grad_norm": 6.417622089385986, + "learning_rate": 1.6605785269439996e-05, + "loss": 1.6753, + "step": 81120 + }, + { + "epoch": 0.5099208453636596, + "grad_norm": 6.800785064697266, + "learning_rate": 1.6605366168495344e-05, + "loss": 1.6678, + "step": 81130 + }, + { + "epoch": 0.5099836976803567, + "grad_norm": 5.283477783203125, + "learning_rate": 1.660494706755069e-05, + "loss": 1.6663, + "step": 81140 + }, + { + "epoch": 0.5100465499970538, + "grad_norm": 6.489543437957764, + "learning_rate": 1.6604527966606038e-05, + "loss": 1.7745, + "step": 81150 + }, + { + "epoch": 0.5101094023137509, + "grad_norm": 7.533888816833496, + "learning_rate": 1.6604108865661385e-05, + "loss": 1.7328, + "step": 81160 + }, + { + "epoch": 0.510172254630448, + "grad_norm": 
7.438111782073975, + "learning_rate": 1.6603689764716732e-05, + "loss": 1.5361, + "step": 81170 + }, + { + "epoch": 0.5102351069471451, + "grad_norm": 7.251165866851807, + "learning_rate": 1.660327066377208e-05, + "loss": 1.5971, + "step": 81180 + }, + { + "epoch": 0.5102979592638422, + "grad_norm": 7.350485801696777, + "learning_rate": 1.6602851562827426e-05, + "loss": 1.7068, + "step": 81190 + }, + { + "epoch": 0.5103608115805394, + "grad_norm": 7.594714164733887, + "learning_rate": 1.6602432461882773e-05, + "loss": 1.6654, + "step": 81200 + }, + { + "epoch": 0.5104236638972365, + "grad_norm": 9.837715148925781, + "learning_rate": 1.6602013360938117e-05, + "loss": 1.7133, + "step": 81210 + }, + { + "epoch": 0.5104865162139336, + "grad_norm": 6.73118782043457, + "learning_rate": 1.6601594259993464e-05, + "loss": 1.6478, + "step": 81220 + }, + { + "epoch": 0.5105493685306307, + "grad_norm": 7.012758255004883, + "learning_rate": 1.660117515904881e-05, + "loss": 1.5899, + "step": 81230 + }, + { + "epoch": 0.5106122208473278, + "grad_norm": 7.66168737411499, + "learning_rate": 1.6600756058104158e-05, + "loss": 1.4999, + "step": 81240 + }, + { + "epoch": 0.5106750731640249, + "grad_norm": 6.692058563232422, + "learning_rate": 1.66003369571595e-05, + "loss": 1.7145, + "step": 81250 + }, + { + "epoch": 0.510737925480722, + "grad_norm": 7.03905725479126, + "learning_rate": 1.659991785621485e-05, + "loss": 1.6896, + "step": 81260 + }, + { + "epoch": 0.5108007777974192, + "grad_norm": 6.233604431152344, + "learning_rate": 1.6599498755270196e-05, + "loss": 1.63, + "step": 81270 + }, + { + "epoch": 0.5108636301141163, + "grad_norm": 6.349859714508057, + "learning_rate": 1.6599079654325543e-05, + "loss": 1.5801, + "step": 81280 + }, + { + "epoch": 0.5109264824308134, + "grad_norm": 6.295588970184326, + "learning_rate": 1.659866055338089e-05, + "loss": 1.8677, + "step": 81290 + }, + { + "epoch": 0.5109893347475104, + "grad_norm": 7.059310436248779, + "learning_rate": 
1.6598241452436234e-05, + "loss": 1.6375, + "step": 81300 + }, + { + "epoch": 0.5110521870642075, + "grad_norm": 8.250588417053223, + "learning_rate": 1.659782235149158e-05, + "loss": 1.9543, + "step": 81310 + }, + { + "epoch": 0.5111150393809046, + "grad_norm": 5.822747707366943, + "learning_rate": 1.6597403250546928e-05, + "loss": 1.6694, + "step": 81320 + }, + { + "epoch": 0.5111778916976017, + "grad_norm": 6.962681293487549, + "learning_rate": 1.6596984149602275e-05, + "loss": 1.5185, + "step": 81330 + }, + { + "epoch": 0.5112407440142989, + "grad_norm": 7.692488193511963, + "learning_rate": 1.6596565048657622e-05, + "loss": 1.7951, + "step": 81340 + }, + { + "epoch": 0.511303596330996, + "grad_norm": 7.1396284103393555, + "learning_rate": 1.6596145947712966e-05, + "loss": 1.8221, + "step": 81350 + }, + { + "epoch": 0.5113664486476931, + "grad_norm": 5.930846214294434, + "learning_rate": 1.6595726846768313e-05, + "loss": 1.7014, + "step": 81360 + }, + { + "epoch": 0.5114293009643902, + "grad_norm": 7.050151824951172, + "learning_rate": 1.659530774582366e-05, + "loss": 1.6139, + "step": 81370 + }, + { + "epoch": 0.5114921532810873, + "grad_norm": 5.861669540405273, + "learning_rate": 1.6594888644879007e-05, + "loss": 1.6394, + "step": 81380 + }, + { + "epoch": 0.5115550055977844, + "grad_norm": 6.0154266357421875, + "learning_rate": 1.6594469543934354e-05, + "loss": 1.5412, + "step": 81390 + }, + { + "epoch": 0.5116178579144816, + "grad_norm": 6.396653175354004, + "learning_rate": 1.65940504429897e-05, + "loss": 1.6857, + "step": 81400 + }, + { + "epoch": 0.5116807102311787, + "grad_norm": 6.678314685821533, + "learning_rate": 1.6593631342045048e-05, + "loss": 1.5498, + "step": 81410 + }, + { + "epoch": 0.5117435625478758, + "grad_norm": 6.268189907073975, + "learning_rate": 1.6593212241100395e-05, + "loss": 1.5248, + "step": 81420 + }, + { + "epoch": 0.5118064148645729, + "grad_norm": 6.835800647735596, + "learning_rate": 1.659279314015574e-05, + "loss": 
1.5798, + "step": 81430 + }, + { + "epoch": 0.51186926718127, + "grad_norm": 5.7435832023620605, + "learning_rate": 1.6592374039211086e-05, + "loss": 1.6341, + "step": 81440 + }, + { + "epoch": 0.5119321194979671, + "grad_norm": 5.660763263702393, + "learning_rate": 1.6591954938266433e-05, + "loss": 1.4495, + "step": 81450 + }, + { + "epoch": 0.5119949718146642, + "grad_norm": 7.291779041290283, + "learning_rate": 1.659153583732178e-05, + "loss": 1.6463, + "step": 81460 + }, + { + "epoch": 0.5120578241313614, + "grad_norm": 5.4878363609313965, + "learning_rate": 1.6591116736377124e-05, + "loss": 1.6303, + "step": 81470 + }, + { + "epoch": 0.5121206764480585, + "grad_norm": 5.558329105377197, + "learning_rate": 1.659069763543247e-05, + "loss": 1.7346, + "step": 81480 + }, + { + "epoch": 0.5121835287647556, + "grad_norm": 6.605159282684326, + "learning_rate": 1.6590278534487818e-05, + "loss": 1.4828, + "step": 81490 + }, + { + "epoch": 0.5122463810814527, + "grad_norm": 7.170902252197266, + "learning_rate": 1.6589859433543165e-05, + "loss": 1.7989, + "step": 81500 + }, + { + "epoch": 0.5123092333981498, + "grad_norm": 6.16931676864624, + "learning_rate": 1.6589440332598512e-05, + "loss": 1.5974, + "step": 81510 + }, + { + "epoch": 0.5123720857148469, + "grad_norm": 6.78040885925293, + "learning_rate": 1.6589021231653856e-05, + "loss": 1.7621, + "step": 81520 + }, + { + "epoch": 0.512434938031544, + "grad_norm": 6.225697994232178, + "learning_rate": 1.6588602130709203e-05, + "loss": 1.8161, + "step": 81530 + }, + { + "epoch": 0.5124977903482412, + "grad_norm": 6.077652454376221, + "learning_rate": 1.658818302976455e-05, + "loss": 1.5478, + "step": 81540 + }, + { + "epoch": 0.5125606426649383, + "grad_norm": 5.984597682952881, + "learning_rate": 1.6587763928819897e-05, + "loss": 1.3905, + "step": 81550 + }, + { + "epoch": 0.5126234949816353, + "grad_norm": 7.048511028289795, + "learning_rate": 1.6587344827875244e-05, + "loss": 1.4659, + "step": 81560 + }, + { + 
"epoch": 0.5126863472983324, + "grad_norm": 7.914498329162598, + "learning_rate": 1.658692572693059e-05, + "loss": 1.7662, + "step": 81570 + }, + { + "epoch": 0.5127491996150295, + "grad_norm": 7.3455071449279785, + "learning_rate": 1.6586506625985938e-05, + "loss": 1.9286, + "step": 81580 + }, + { + "epoch": 0.5128120519317266, + "grad_norm": 7.239782333374023, + "learning_rate": 1.6586087525041282e-05, + "loss": 1.8077, + "step": 81590 + }, + { + "epoch": 0.5128749042484237, + "grad_norm": 6.27215051651001, + "learning_rate": 1.658566842409663e-05, + "loss": 1.8022, + "step": 81600 + }, + { + "epoch": 0.5129377565651209, + "grad_norm": 4.653055191040039, + "learning_rate": 1.6585249323151976e-05, + "loss": 1.6557, + "step": 81610 + }, + { + "epoch": 0.513000608881818, + "grad_norm": 8.42682933807373, + "learning_rate": 1.6584830222207323e-05, + "loss": 1.8114, + "step": 81620 + }, + { + "epoch": 0.5130634611985151, + "grad_norm": 7.246777057647705, + "learning_rate": 1.658441112126267e-05, + "loss": 1.5247, + "step": 81630 + }, + { + "epoch": 0.5131263135152122, + "grad_norm": 6.788954257965088, + "learning_rate": 1.6583992020318017e-05, + "loss": 1.6471, + "step": 81640 + }, + { + "epoch": 0.5131891658319093, + "grad_norm": 6.673451900482178, + "learning_rate": 1.658357291937336e-05, + "loss": 1.595, + "step": 81650 + }, + { + "epoch": 0.5132520181486064, + "grad_norm": 6.126130104064941, + "learning_rate": 1.6583153818428708e-05, + "loss": 1.6165, + "step": 81660 + }, + { + "epoch": 0.5133148704653036, + "grad_norm": 6.588374137878418, + "learning_rate": 1.6582734717484055e-05, + "loss": 1.742, + "step": 81670 + }, + { + "epoch": 0.5133777227820007, + "grad_norm": 6.954014301300049, + "learning_rate": 1.6582315616539402e-05, + "loss": 1.5431, + "step": 81680 + }, + { + "epoch": 0.5134405750986978, + "grad_norm": 5.762803554534912, + "learning_rate": 1.6581896515594746e-05, + "loss": 1.7215, + "step": 81690 + }, + { + "epoch": 0.5135034274153949, + "grad_norm": 
6.05938720703125, + "learning_rate": 1.6581477414650093e-05, + "loss": 1.7686, + "step": 81700 + }, + { + "epoch": 0.513566279732092, + "grad_norm": 5.955667972564697, + "learning_rate": 1.658105831370544e-05, + "loss": 1.6845, + "step": 81710 + }, + { + "epoch": 0.5136291320487891, + "grad_norm": 6.757447719573975, + "learning_rate": 1.6580639212760787e-05, + "loss": 1.6399, + "step": 81720 + }, + { + "epoch": 0.5136919843654862, + "grad_norm": 5.686888694763184, + "learning_rate": 1.6580220111816134e-05, + "loss": 1.2893, + "step": 81730 + }, + { + "epoch": 0.5137548366821834, + "grad_norm": 6.872438430786133, + "learning_rate": 1.6579801010871478e-05, + "loss": 1.742, + "step": 81740 + }, + { + "epoch": 0.5138176889988805, + "grad_norm": 6.417477607727051, + "learning_rate": 1.6579381909926825e-05, + "loss": 1.432, + "step": 81750 + }, + { + "epoch": 0.5138805413155776, + "grad_norm": 6.8781418800354, + "learning_rate": 1.6578962808982172e-05, + "loss": 1.8819, + "step": 81760 + }, + { + "epoch": 0.5139433936322747, + "grad_norm": 8.25013256072998, + "learning_rate": 1.657854370803752e-05, + "loss": 1.7962, + "step": 81770 + }, + { + "epoch": 0.5140062459489718, + "grad_norm": 7.275095462799072, + "learning_rate": 1.6578124607092866e-05, + "loss": 1.905, + "step": 81780 + }, + { + "epoch": 0.5140690982656689, + "grad_norm": 8.735372543334961, + "learning_rate": 1.6577705506148213e-05, + "loss": 1.9513, + "step": 81790 + }, + { + "epoch": 0.514131950582366, + "grad_norm": 6.6073689460754395, + "learning_rate": 1.657728640520356e-05, + "loss": 1.7017, + "step": 81800 + }, + { + "epoch": 0.514194802899063, + "grad_norm": 6.993113040924072, + "learning_rate": 1.6576867304258907e-05, + "loss": 1.8756, + "step": 81810 + }, + { + "epoch": 0.5142576552157602, + "grad_norm": 8.592490196228027, + "learning_rate": 1.6576448203314254e-05, + "loss": 1.7714, + "step": 81820 + }, + { + "epoch": 0.5143205075324573, + "grad_norm": 7.213768005371094, + "learning_rate": 
1.6576029102369598e-05, + "loss": 1.724, + "step": 81830 + }, + { + "epoch": 0.5143833598491544, + "grad_norm": 6.945169925689697, + "learning_rate": 1.6575610001424945e-05, + "loss": 1.6812, + "step": 81840 + }, + { + "epoch": 0.5144462121658515, + "grad_norm": 7.712201118469238, + "learning_rate": 1.6575190900480292e-05, + "loss": 1.6623, + "step": 81850 + }, + { + "epoch": 0.5145090644825486, + "grad_norm": 6.78476619720459, + "learning_rate": 1.657477179953564e-05, + "loss": 1.5049, + "step": 81860 + }, + { + "epoch": 0.5145719167992457, + "grad_norm": 6.764843463897705, + "learning_rate": 1.6574352698590983e-05, + "loss": 1.9898, + "step": 81870 + }, + { + "epoch": 0.5146347691159429, + "grad_norm": 7.237852573394775, + "learning_rate": 1.657393359764633e-05, + "loss": 1.6505, + "step": 81880 + }, + { + "epoch": 0.51469762143264, + "grad_norm": 6.3692851066589355, + "learning_rate": 1.6573514496701677e-05, + "loss": 1.728, + "step": 81890 + }, + { + "epoch": 0.5147604737493371, + "grad_norm": 7.024084568023682, + "learning_rate": 1.6573095395757024e-05, + "loss": 1.4154, + "step": 81900 + }, + { + "epoch": 0.5148233260660342, + "grad_norm": 6.015324115753174, + "learning_rate": 1.657267629481237e-05, + "loss": 1.6963, + "step": 81910 + }, + { + "epoch": 0.5148861783827313, + "grad_norm": 6.13345193862915, + "learning_rate": 1.6572257193867715e-05, + "loss": 1.5816, + "step": 81920 + }, + { + "epoch": 0.5149490306994284, + "grad_norm": 6.69673490524292, + "learning_rate": 1.6571838092923062e-05, + "loss": 1.7643, + "step": 81930 + }, + { + "epoch": 0.5150118830161255, + "grad_norm": 6.703579902648926, + "learning_rate": 1.657141899197841e-05, + "loss": 1.9539, + "step": 81940 + }, + { + "epoch": 0.5150747353328227, + "grad_norm": 5.560588359832764, + "learning_rate": 1.6570999891033756e-05, + "loss": 1.5905, + "step": 81950 + }, + { + "epoch": 0.5151375876495198, + "grad_norm": 5.567323207855225, + "learning_rate": 1.6570580790089103e-05, + "loss": 1.5554, + 
"step": 81960 + }, + { + "epoch": 0.5152004399662169, + "grad_norm": 8.046873092651367, + "learning_rate": 1.6570161689144447e-05, + "loss": 1.8316, + "step": 81970 + }, + { + "epoch": 0.515263292282914, + "grad_norm": 6.263387203216553, + "learning_rate": 1.6569742588199794e-05, + "loss": 1.6225, + "step": 81980 + }, + { + "epoch": 0.5153261445996111, + "grad_norm": 7.945009708404541, + "learning_rate": 1.656932348725514e-05, + "loss": 1.7196, + "step": 81990 + }, + { + "epoch": 0.5153889969163082, + "grad_norm": 7.492959022521973, + "learning_rate": 1.6568904386310488e-05, + "loss": 1.5255, + "step": 82000 + }, + { + "epoch": 0.5154518492330054, + "grad_norm": 5.629178047180176, + "learning_rate": 1.6568485285365835e-05, + "loss": 1.6738, + "step": 82010 + }, + { + "epoch": 0.5155147015497025, + "grad_norm": 5.612534523010254, + "learning_rate": 1.6568066184421182e-05, + "loss": 1.5211, + "step": 82020 + }, + { + "epoch": 0.5155775538663996, + "grad_norm": 6.696208477020264, + "learning_rate": 1.656764708347653e-05, + "loss": 1.715, + "step": 82030 + }, + { + "epoch": 0.5156404061830967, + "grad_norm": 7.129432678222656, + "learning_rate": 1.6567227982531876e-05, + "loss": 1.8996, + "step": 82040 + }, + { + "epoch": 0.5157032584997938, + "grad_norm": 6.362168312072754, + "learning_rate": 1.656680888158722e-05, + "loss": 1.7316, + "step": 82050 + }, + { + "epoch": 0.5157661108164909, + "grad_norm": 6.4442925453186035, + "learning_rate": 1.6566389780642567e-05, + "loss": 1.8083, + "step": 82060 + }, + { + "epoch": 0.5158289631331879, + "grad_norm": 6.292686939239502, + "learning_rate": 1.6565970679697914e-05, + "loss": 1.4529, + "step": 82070 + }, + { + "epoch": 0.515891815449885, + "grad_norm": 7.054361820220947, + "learning_rate": 1.656555157875326e-05, + "loss": 1.8437, + "step": 82080 + }, + { + "epoch": 0.5159546677665822, + "grad_norm": 8.293606758117676, + "learning_rate": 1.6565132477808605e-05, + "loss": 1.6266, + "step": 82090 + }, + { + "epoch": 
0.5160175200832793, + "grad_norm": 7.016659259796143, + "learning_rate": 1.6564713376863952e-05, + "loss": 1.5936, + "step": 82100 + }, + { + "epoch": 0.5160803723999764, + "grad_norm": 6.724484443664551, + "learning_rate": 1.65642942759193e-05, + "loss": 1.6597, + "step": 82110 + }, + { + "epoch": 0.5161432247166735, + "grad_norm": 6.709836006164551, + "learning_rate": 1.6563875174974646e-05, + "loss": 1.7444, + "step": 82120 + }, + { + "epoch": 0.5162060770333706, + "grad_norm": 6.972070217132568, + "learning_rate": 1.6563456074029993e-05, + "loss": 1.4151, + "step": 82130 + }, + { + "epoch": 0.5162689293500677, + "grad_norm": 7.710244655609131, + "learning_rate": 1.6563036973085337e-05, + "loss": 1.801, + "step": 82140 + }, + { + "epoch": 0.5163317816667649, + "grad_norm": 6.5838541984558105, + "learning_rate": 1.6562617872140684e-05, + "loss": 1.6151, + "step": 82150 + }, + { + "epoch": 0.516394633983462, + "grad_norm": 7.036131858825684, + "learning_rate": 1.656219877119603e-05, + "loss": 1.6963, + "step": 82160 + }, + { + "epoch": 0.5164574863001591, + "grad_norm": 6.047787189483643, + "learning_rate": 1.6561779670251378e-05, + "loss": 1.8514, + "step": 82170 + }, + { + "epoch": 0.5165203386168562, + "grad_norm": 6.680669784545898, + "learning_rate": 1.6561360569306725e-05, + "loss": 1.7074, + "step": 82180 + }, + { + "epoch": 0.5165831909335533, + "grad_norm": 7.738285541534424, + "learning_rate": 1.6560941468362072e-05, + "loss": 1.5607, + "step": 82190 + }, + { + "epoch": 0.5166460432502504, + "grad_norm": 7.540075302124023, + "learning_rate": 1.656052236741742e-05, + "loss": 1.6826, + "step": 82200 + }, + { + "epoch": 0.5167088955669475, + "grad_norm": 7.239392280578613, + "learning_rate": 1.6560103266472766e-05, + "loss": 1.631, + "step": 82210 + }, + { + "epoch": 0.5167717478836447, + "grad_norm": 5.825963497161865, + "learning_rate": 1.655968416552811e-05, + "loss": 1.7823, + "step": 82220 + }, + { + "epoch": 0.5168346002003418, + "grad_norm": 
6.491040229797363, + "learning_rate": 1.6559265064583457e-05, + "loss": 1.765, + "step": 82230 + }, + { + "epoch": 0.5168974525170389, + "grad_norm": 7.147906303405762, + "learning_rate": 1.6558845963638804e-05, + "loss": 1.7067, + "step": 82240 + }, + { + "epoch": 0.516960304833736, + "grad_norm": 7.335332870483398, + "learning_rate": 1.655842686269415e-05, + "loss": 1.7845, + "step": 82250 + }, + { + "epoch": 0.5170231571504331, + "grad_norm": 6.657925605773926, + "learning_rate": 1.6558007761749498e-05, + "loss": 1.6984, + "step": 82260 + }, + { + "epoch": 0.5170860094671302, + "grad_norm": 6.173869609832764, + "learning_rate": 1.6557588660804842e-05, + "loss": 1.6487, + "step": 82270 + }, + { + "epoch": 0.5171488617838274, + "grad_norm": 6.072544097900391, + "learning_rate": 1.655716955986019e-05, + "loss": 1.5735, + "step": 82280 + }, + { + "epoch": 0.5172117141005245, + "grad_norm": 7.850886344909668, + "learning_rate": 1.6556750458915536e-05, + "loss": 1.7009, + "step": 82290 + }, + { + "epoch": 0.5172745664172216, + "grad_norm": 6.694455623626709, + "learning_rate": 1.6556331357970883e-05, + "loss": 1.7019, + "step": 82300 + }, + { + "epoch": 0.5173374187339187, + "grad_norm": 5.889588356018066, + "learning_rate": 1.6555912257026227e-05, + "loss": 1.7359, + "step": 82310 + }, + { + "epoch": 0.5174002710506157, + "grad_norm": 7.2895002365112305, + "learning_rate": 1.6555493156081574e-05, + "loss": 1.6788, + "step": 82320 + }, + { + "epoch": 0.5174631233673128, + "grad_norm": 7.424088001251221, + "learning_rate": 1.655507405513692e-05, + "loss": 1.6296, + "step": 82330 + }, + { + "epoch": 0.5175259756840099, + "grad_norm": 6.089168548583984, + "learning_rate": 1.6554654954192268e-05, + "loss": 1.5945, + "step": 82340 + }, + { + "epoch": 0.517588828000707, + "grad_norm": 7.069284915924072, + "learning_rate": 1.6554235853247615e-05, + "loss": 1.8223, + "step": 82350 + }, + { + "epoch": 0.5176516803174042, + "grad_norm": 7.048602104187012, + "learning_rate": 
1.655381675230296e-05, + "loss": 1.8519, + "step": 82360 + }, + { + "epoch": 0.5177145326341013, + "grad_norm": 6.624363899230957, + "learning_rate": 1.6553397651358306e-05, + "loss": 1.6981, + "step": 82370 + }, + { + "epoch": 0.5177773849507984, + "grad_norm": 8.180917739868164, + "learning_rate": 1.6552978550413653e-05, + "loss": 1.8844, + "step": 82380 + }, + { + "epoch": 0.5178402372674955, + "grad_norm": 5.791408538818359, + "learning_rate": 1.6552559449469e-05, + "loss": 1.6947, + "step": 82390 + }, + { + "epoch": 0.5179030895841926, + "grad_norm": 6.24849271774292, + "learning_rate": 1.6552140348524347e-05, + "loss": 1.8205, + "step": 82400 + }, + { + "epoch": 0.5179659419008897, + "grad_norm": 7.0733561515808105, + "learning_rate": 1.6551721247579694e-05, + "loss": 1.5899, + "step": 82410 + }, + { + "epoch": 0.5180287942175869, + "grad_norm": 5.714791774749756, + "learning_rate": 1.655130214663504e-05, + "loss": 1.6854, + "step": 82420 + }, + { + "epoch": 0.518091646534284, + "grad_norm": 6.8169965744018555, + "learning_rate": 1.6550883045690388e-05, + "loss": 1.749, + "step": 82430 + }, + { + "epoch": 0.5181544988509811, + "grad_norm": 7.001819133758545, + "learning_rate": 1.6550463944745735e-05, + "loss": 1.5122, + "step": 82440 + }, + { + "epoch": 0.5182173511676782, + "grad_norm": 7.239821910858154, + "learning_rate": 1.655004484380108e-05, + "loss": 1.6865, + "step": 82450 + }, + { + "epoch": 0.5182802034843753, + "grad_norm": 7.571619987487793, + "learning_rate": 1.6549625742856426e-05, + "loss": 1.8882, + "step": 82460 + }, + { + "epoch": 0.5183430558010724, + "grad_norm": 7.204845905303955, + "learning_rate": 1.6549206641911773e-05, + "loss": 1.7303, + "step": 82470 + }, + { + "epoch": 0.5184059081177695, + "grad_norm": 6.98240852355957, + "learning_rate": 1.654878754096712e-05, + "loss": 1.6622, + "step": 82480 + }, + { + "epoch": 0.5184687604344667, + "grad_norm": 7.970244407653809, + "learning_rate": 1.6548368440022464e-05, + "loss": 1.5693, + 
"step": 82490 + }, + { + "epoch": 0.5185316127511638, + "grad_norm": 7.878420352935791, + "learning_rate": 1.654794933907781e-05, + "loss": 1.8428, + "step": 82500 + }, + { + "epoch": 0.5185944650678609, + "grad_norm": 6.624403953552246, + "learning_rate": 1.6547530238133158e-05, + "loss": 1.8425, + "step": 82510 + }, + { + "epoch": 0.518657317384558, + "grad_norm": 6.755704879760742, + "learning_rate": 1.6547111137188505e-05, + "loss": 1.572, + "step": 82520 + }, + { + "epoch": 0.5187201697012551, + "grad_norm": 6.136546611785889, + "learning_rate": 1.6546692036243852e-05, + "loss": 1.6231, + "step": 82530 + }, + { + "epoch": 0.5187830220179522, + "grad_norm": 5.777431488037109, + "learning_rate": 1.6546272935299196e-05, + "loss": 1.4797, + "step": 82540 + }, + { + "epoch": 0.5188458743346493, + "grad_norm": 7.702548980712891, + "learning_rate": 1.6545853834354543e-05, + "loss": 1.7375, + "step": 82550 + }, + { + "epoch": 0.5189087266513465, + "grad_norm": 6.238503456115723, + "learning_rate": 1.654543473340989e-05, + "loss": 1.8284, + "step": 82560 + }, + { + "epoch": 0.5189715789680436, + "grad_norm": 5.85338020324707, + "learning_rate": 1.6545015632465237e-05, + "loss": 1.6781, + "step": 82570 + }, + { + "epoch": 0.5190344312847406, + "grad_norm": 6.3454179763793945, + "learning_rate": 1.6544596531520584e-05, + "loss": 1.496, + "step": 82580 + }, + { + "epoch": 0.5190972836014377, + "grad_norm": 6.02119255065918, + "learning_rate": 1.654417743057593e-05, + "loss": 1.6278, + "step": 82590 + }, + { + "epoch": 0.5191601359181348, + "grad_norm": 6.290589332580566, + "learning_rate": 1.6543758329631275e-05, + "loss": 1.5677, + "step": 82600 + }, + { + "epoch": 0.5192229882348319, + "grad_norm": 5.991605758666992, + "learning_rate": 1.6543339228686622e-05, + "loss": 1.679, + "step": 82610 + }, + { + "epoch": 0.519285840551529, + "grad_norm": 6.47032356262207, + "learning_rate": 1.654292012774197e-05, + "loss": 1.5912, + "step": 82620 + }, + { + "epoch": 
0.5193486928682262, + "grad_norm": 7.9203572273254395, + "learning_rate": 1.6542501026797316e-05, + "loss": 1.8486, + "step": 82630 + }, + { + "epoch": 0.5194115451849233, + "grad_norm": 7.584958553314209, + "learning_rate": 1.6542081925852663e-05, + "loss": 1.7711, + "step": 82640 + }, + { + "epoch": 0.5194743975016204, + "grad_norm": 6.896289348602295, + "learning_rate": 1.654166282490801e-05, + "loss": 1.7535, + "step": 82650 + }, + { + "epoch": 0.5195372498183175, + "grad_norm": 5.558168888092041, + "learning_rate": 1.6541243723963357e-05, + "loss": 1.5942, + "step": 82660 + }, + { + "epoch": 0.5196001021350146, + "grad_norm": 6.410091876983643, + "learning_rate": 1.65408246230187e-05, + "loss": 1.6743, + "step": 82670 + }, + { + "epoch": 0.5196629544517117, + "grad_norm": 6.681957721710205, + "learning_rate": 1.6540405522074048e-05, + "loss": 1.5559, + "step": 82680 + }, + { + "epoch": 0.5197258067684088, + "grad_norm": 7.247389793395996, + "learning_rate": 1.6539986421129395e-05, + "loss": 1.5263, + "step": 82690 + }, + { + "epoch": 0.519788659085106, + "grad_norm": 7.7606916427612305, + "learning_rate": 1.6539567320184742e-05, + "loss": 1.7305, + "step": 82700 + }, + { + "epoch": 0.5198515114018031, + "grad_norm": 5.6875433921813965, + "learning_rate": 1.6539148219240086e-05, + "loss": 1.7554, + "step": 82710 + }, + { + "epoch": 0.5199143637185002, + "grad_norm": 6.5668463706970215, + "learning_rate": 1.6538729118295433e-05, + "loss": 1.6081, + "step": 82720 + }, + { + "epoch": 0.5199772160351973, + "grad_norm": 6.804002285003662, + "learning_rate": 1.653831001735078e-05, + "loss": 1.8088, + "step": 82730 + }, + { + "epoch": 0.5200400683518944, + "grad_norm": 6.785575866699219, + "learning_rate": 1.6537890916406127e-05, + "loss": 1.8513, + "step": 82740 + }, + { + "epoch": 0.5201029206685915, + "grad_norm": 6.491295337677002, + "learning_rate": 1.6537471815461474e-05, + "loss": 1.5833, + "step": 82750 + }, + { + "epoch": 0.5201657729852887, + "grad_norm": 
5.610404968261719, + "learning_rate": 1.6537052714516818e-05, + "loss": 1.5928, + "step": 82760 + }, + { + "epoch": 0.5202286253019858, + "grad_norm": 7.637632846832275, + "learning_rate": 1.6536633613572165e-05, + "loss": 1.7613, + "step": 82770 + }, + { + "epoch": 0.5202914776186829, + "grad_norm": 7.387691020965576, + "learning_rate": 1.6536214512627512e-05, + "loss": 1.6803, + "step": 82780 + }, + { + "epoch": 0.52035432993538, + "grad_norm": 5.746232032775879, + "learning_rate": 1.653579541168286e-05, + "loss": 1.6212, + "step": 82790 + }, + { + "epoch": 0.5204171822520771, + "grad_norm": 7.148149490356445, + "learning_rate": 1.6535376310738206e-05, + "loss": 1.4678, + "step": 82800 + }, + { + "epoch": 0.5204800345687742, + "grad_norm": 6.671700477600098, + "learning_rate": 1.6534957209793553e-05, + "loss": 1.6183, + "step": 82810 + }, + { + "epoch": 0.5205428868854713, + "grad_norm": 6.904843330383301, + "learning_rate": 1.65345381088489e-05, + "loss": 1.7249, + "step": 82820 + }, + { + "epoch": 0.5206057392021683, + "grad_norm": 9.225653648376465, + "learning_rate": 1.6534119007904247e-05, + "loss": 1.9133, + "step": 82830 + }, + { + "epoch": 0.5206685915188655, + "grad_norm": 7.4347147941589355, + "learning_rate": 1.6533699906959594e-05, + "loss": 1.6362, + "step": 82840 + }, + { + "epoch": 0.5207314438355626, + "grad_norm": 6.336967945098877, + "learning_rate": 1.6533280806014938e-05, + "loss": 1.4811, + "step": 82850 + }, + { + "epoch": 0.5207942961522597, + "grad_norm": 6.682900905609131, + "learning_rate": 1.6532861705070285e-05, + "loss": 1.6655, + "step": 82860 + }, + { + "epoch": 0.5208571484689568, + "grad_norm": 5.187028408050537, + "learning_rate": 1.6532442604125632e-05, + "loss": 1.5937, + "step": 82870 + }, + { + "epoch": 0.5209200007856539, + "grad_norm": 6.478296279907227, + "learning_rate": 1.653202350318098e-05, + "loss": 1.8416, + "step": 82880 + }, + { + "epoch": 0.520982853102351, + "grad_norm": 7.833890438079834, + "learning_rate": 
1.6531604402236323e-05, + "loss": 1.9684, + "step": 82890 + }, + { + "epoch": 0.5210457054190482, + "grad_norm": 6.321444988250732, + "learning_rate": 1.653118530129167e-05, + "loss": 1.6874, + "step": 82900 + }, + { + "epoch": 0.5211085577357453, + "grad_norm": 5.837413311004639, + "learning_rate": 1.6530766200347017e-05, + "loss": 1.5814, + "step": 82910 + }, + { + "epoch": 0.5211714100524424, + "grad_norm": 6.95207405090332, + "learning_rate": 1.6530347099402364e-05, + "loss": 1.5639, + "step": 82920 + }, + { + "epoch": 0.5212342623691395, + "grad_norm": 6.530197620391846, + "learning_rate": 1.6529927998457708e-05, + "loss": 1.5572, + "step": 82930 + }, + { + "epoch": 0.5212971146858366, + "grad_norm": 7.431722164154053, + "learning_rate": 1.6529508897513055e-05, + "loss": 1.9132, + "step": 82940 + }, + { + "epoch": 0.5213599670025337, + "grad_norm": 7.086301803588867, + "learning_rate": 1.6529089796568402e-05, + "loss": 1.7854, + "step": 82950 + }, + { + "epoch": 0.5214228193192308, + "grad_norm": 7.270763874053955, + "learning_rate": 1.652867069562375e-05, + "loss": 1.7947, + "step": 82960 + }, + { + "epoch": 0.521485671635928, + "grad_norm": 7.24395751953125, + "learning_rate": 1.6528251594679096e-05, + "loss": 1.8873, + "step": 82970 + }, + { + "epoch": 0.5215485239526251, + "grad_norm": 7.02109956741333, + "learning_rate": 1.652783249373444e-05, + "loss": 1.6871, + "step": 82980 + }, + { + "epoch": 0.5216113762693222, + "grad_norm": 7.27868127822876, + "learning_rate": 1.6527413392789787e-05, + "loss": 1.5118, + "step": 82990 + }, + { + "epoch": 0.5216742285860193, + "grad_norm": 7.981987953186035, + "learning_rate": 1.6526994291845134e-05, + "loss": 1.7591, + "step": 83000 + }, + { + "epoch": 0.5217370809027164, + "grad_norm": 6.476938724517822, + "learning_rate": 1.652657519090048e-05, + "loss": 1.6087, + "step": 83010 + }, + { + "epoch": 0.5217999332194135, + "grad_norm": 6.41933012008667, + "learning_rate": 1.6526156089955828e-05, + "loss": 1.7027, + 
"step": 83020 + }, + { + "epoch": 0.5218627855361107, + "grad_norm": 7.0129828453063965, + "learning_rate": 1.6525736989011175e-05, + "loss": 1.4111, + "step": 83030 + }, + { + "epoch": 0.5219256378528078, + "grad_norm": 6.466144561767578, + "learning_rate": 1.6525317888066522e-05, + "loss": 1.7989, + "step": 83040 + }, + { + "epoch": 0.5219884901695049, + "grad_norm": 6.608112335205078, + "learning_rate": 1.652489878712187e-05, + "loss": 1.6697, + "step": 83050 + }, + { + "epoch": 0.522051342486202, + "grad_norm": 7.534452438354492, + "learning_rate": 1.6524479686177216e-05, + "loss": 1.9912, + "step": 83060 + }, + { + "epoch": 0.5221141948028991, + "grad_norm": 5.754380702972412, + "learning_rate": 1.652406058523256e-05, + "loss": 1.6117, + "step": 83070 + }, + { + "epoch": 0.5221770471195962, + "grad_norm": 6.875207901000977, + "learning_rate": 1.6523641484287907e-05, + "loss": 1.5582, + "step": 83080 + }, + { + "epoch": 0.5222398994362932, + "grad_norm": 8.083128929138184, + "learning_rate": 1.6523222383343254e-05, + "loss": 1.702, + "step": 83090 + }, + { + "epoch": 0.5223027517529903, + "grad_norm": 6.761566638946533, + "learning_rate": 1.65228032823986e-05, + "loss": 1.7788, + "step": 83100 + }, + { + "epoch": 0.5223656040696875, + "grad_norm": 7.703314304351807, + "learning_rate": 1.6522384181453945e-05, + "loss": 1.7674, + "step": 83110 + }, + { + "epoch": 0.5224284563863846, + "grad_norm": 6.8022942543029785, + "learning_rate": 1.6521965080509292e-05, + "loss": 1.6648, + "step": 83120 + }, + { + "epoch": 0.5224913087030817, + "grad_norm": 5.865825176239014, + "learning_rate": 1.652154597956464e-05, + "loss": 1.5966, + "step": 83130 + }, + { + "epoch": 0.5225541610197788, + "grad_norm": 7.486507892608643, + "learning_rate": 1.6521126878619986e-05, + "loss": 1.6999, + "step": 83140 + }, + { + "epoch": 0.5226170133364759, + "grad_norm": 6.923404216766357, + "learning_rate": 1.6520707777675333e-05, + "loss": 1.7957, + "step": 83150 + }, + { + "epoch": 
0.522679865653173, + "grad_norm": 5.716740131378174, + "learning_rate": 1.6520288676730677e-05, + "loss": 1.5891, + "step": 83160 + }, + { + "epoch": 0.5227427179698702, + "grad_norm": 8.053667068481445, + "learning_rate": 1.6519869575786024e-05, + "loss": 1.7406, + "step": 83170 + }, + { + "epoch": 0.5228055702865673, + "grad_norm": 6.939223289489746, + "learning_rate": 1.651945047484137e-05, + "loss": 1.8066, + "step": 83180 + }, + { + "epoch": 0.5228684226032644, + "grad_norm": 6.253711223602295, + "learning_rate": 1.6519031373896718e-05, + "loss": 1.7324, + "step": 83190 + }, + { + "epoch": 0.5229312749199615, + "grad_norm": 7.140364646911621, + "learning_rate": 1.6518612272952065e-05, + "loss": 1.5482, + "step": 83200 + }, + { + "epoch": 0.5229941272366586, + "grad_norm": 5.5362935066223145, + "learning_rate": 1.6518193172007412e-05, + "loss": 1.4722, + "step": 83210 + }, + { + "epoch": 0.5230569795533557, + "grad_norm": 6.500285625457764, + "learning_rate": 1.651777407106276e-05, + "loss": 1.7503, + "step": 83220 + }, + { + "epoch": 0.5231198318700528, + "grad_norm": 6.48651647567749, + "learning_rate": 1.6517354970118103e-05, + "loss": 1.6214, + "step": 83230 + }, + { + "epoch": 0.52318268418675, + "grad_norm": 6.936561107635498, + "learning_rate": 1.651693586917345e-05, + "loss": 1.5603, + "step": 83240 + }, + { + "epoch": 0.5232455365034471, + "grad_norm": 6.254504203796387, + "learning_rate": 1.6516516768228797e-05, + "loss": 1.6045, + "step": 83250 + }, + { + "epoch": 0.5233083888201442, + "grad_norm": 7.0453338623046875, + "learning_rate": 1.6516097667284144e-05, + "loss": 1.7993, + "step": 83260 + }, + { + "epoch": 0.5233712411368413, + "grad_norm": 6.0243611335754395, + "learning_rate": 1.651567856633949e-05, + "loss": 1.9001, + "step": 83270 + }, + { + "epoch": 0.5234340934535384, + "grad_norm": 7.146152019500732, + "learning_rate": 1.651525946539484e-05, + "loss": 1.7487, + "step": 83280 + }, + { + "epoch": 0.5234969457702355, + "grad_norm": 
6.603753566741943, + "learning_rate": 1.6514840364450182e-05, + "loss": 1.5815, + "step": 83290 + }, + { + "epoch": 0.5235597980869326, + "grad_norm": 6.5526204109191895, + "learning_rate": 1.651442126350553e-05, + "loss": 1.802, + "step": 83300 + }, + { + "epoch": 0.5236226504036298, + "grad_norm": 6.115887641906738, + "learning_rate": 1.6514002162560876e-05, + "loss": 1.5732, + "step": 83310 + }, + { + "epoch": 0.5236855027203269, + "grad_norm": 7.639376163482666, + "learning_rate": 1.6513583061616223e-05, + "loss": 1.7702, + "step": 83320 + }, + { + "epoch": 0.523748355037024, + "grad_norm": 6.723336219787598, + "learning_rate": 1.6513163960671567e-05, + "loss": 1.6207, + "step": 83330 + }, + { + "epoch": 0.5238112073537211, + "grad_norm": 6.08930778503418, + "learning_rate": 1.6512744859726914e-05, + "loss": 1.7635, + "step": 83340 + }, + { + "epoch": 0.5238740596704181, + "grad_norm": 7.516968250274658, + "learning_rate": 1.651232575878226e-05, + "loss": 1.6921, + "step": 83350 + }, + { + "epoch": 0.5239369119871152, + "grad_norm": 6.24344539642334, + "learning_rate": 1.6511906657837608e-05, + "loss": 1.7873, + "step": 83360 + }, + { + "epoch": 0.5239997643038123, + "grad_norm": 6.595812797546387, + "learning_rate": 1.6511487556892955e-05, + "loss": 1.5824, + "step": 83370 + }, + { + "epoch": 0.5240626166205095, + "grad_norm": 6.358027458190918, + "learning_rate": 1.65110684559483e-05, + "loss": 1.6681, + "step": 83380 + }, + { + "epoch": 0.5241254689372066, + "grad_norm": 7.562269687652588, + "learning_rate": 1.6510649355003646e-05, + "loss": 1.6935, + "step": 83390 + }, + { + "epoch": 0.5241883212539037, + "grad_norm": 7.25498104095459, + "learning_rate": 1.6510230254058993e-05, + "loss": 1.6679, + "step": 83400 + }, + { + "epoch": 0.5242511735706008, + "grad_norm": 7.283073902130127, + "learning_rate": 1.650981115311434e-05, + "loss": 1.602, + "step": 83410 + }, + { + "epoch": 0.5243140258872979, + "grad_norm": 7.198612689971924, + "learning_rate": 
1.6509392052169687e-05, + "loss": 1.7284, + "step": 83420 + }, + { + "epoch": 0.524376878203995, + "grad_norm": 6.337793827056885, + "learning_rate": 1.6508972951225034e-05, + "loss": 2.0844, + "step": 83430 + }, + { + "epoch": 0.5244397305206921, + "grad_norm": 6.039976596832275, + "learning_rate": 1.650855385028038e-05, + "loss": 1.6695, + "step": 83440 + }, + { + "epoch": 0.5245025828373893, + "grad_norm": 6.3445916175842285, + "learning_rate": 1.650813474933573e-05, + "loss": 1.7213, + "step": 83450 + }, + { + "epoch": 0.5245654351540864, + "grad_norm": 7.413024425506592, + "learning_rate": 1.6507715648391075e-05, + "loss": 1.5914, + "step": 83460 + }, + { + "epoch": 0.5246282874707835, + "grad_norm": 6.500877857208252, + "learning_rate": 1.650729654744642e-05, + "loss": 1.948, + "step": 83470 + }, + { + "epoch": 0.5246911397874806, + "grad_norm": 6.975460052490234, + "learning_rate": 1.6506877446501766e-05, + "loss": 1.6333, + "step": 83480 + }, + { + "epoch": 0.5247539921041777, + "grad_norm": 6.222975730895996, + "learning_rate": 1.6506458345557113e-05, + "loss": 1.6955, + "step": 83490 + }, + { + "epoch": 0.5248168444208748, + "grad_norm": 6.043398857116699, + "learning_rate": 1.650603924461246e-05, + "loss": 1.6554, + "step": 83500 + }, + { + "epoch": 0.524879696737572, + "grad_norm": 6.9041290283203125, + "learning_rate": 1.6505620143667804e-05, + "loss": 1.6828, + "step": 83510 + }, + { + "epoch": 0.5249425490542691, + "grad_norm": 7.749858379364014, + "learning_rate": 1.650520104272315e-05, + "loss": 1.8912, + "step": 83520 + }, + { + "epoch": 0.5250054013709662, + "grad_norm": 6.439682483673096, + "learning_rate": 1.6504781941778498e-05, + "loss": 1.6681, + "step": 83530 + }, + { + "epoch": 0.5250682536876633, + "grad_norm": 6.8574299812316895, + "learning_rate": 1.6504362840833845e-05, + "loss": 1.5521, + "step": 83540 + }, + { + "epoch": 0.5251311060043604, + "grad_norm": 7.462523460388184, + "learning_rate": 1.650394373988919e-05, + "loss": 1.6394, 
+ "step": 83550 + }, + { + "epoch": 0.5251939583210575, + "grad_norm": 7.904224872589111, + "learning_rate": 1.6503524638944536e-05, + "loss": 1.6388, + "step": 83560 + }, + { + "epoch": 0.5252568106377546, + "grad_norm": 5.839818954467773, + "learning_rate": 1.6503105537999883e-05, + "loss": 1.9303, + "step": 83570 + }, + { + "epoch": 0.5253196629544518, + "grad_norm": 6.612687587738037, + "learning_rate": 1.650268643705523e-05, + "loss": 1.685, + "step": 83580 + }, + { + "epoch": 0.5253825152711489, + "grad_norm": 7.205349922180176, + "learning_rate": 1.6502267336110577e-05, + "loss": 1.9504, + "step": 83590 + }, + { + "epoch": 0.5254453675878459, + "grad_norm": 6.072460651397705, + "learning_rate": 1.650184823516592e-05, + "loss": 1.694, + "step": 83600 + }, + { + "epoch": 0.525508219904543, + "grad_norm": 7.299000263214111, + "learning_rate": 1.6501429134221268e-05, + "loss": 1.7088, + "step": 83610 + }, + { + "epoch": 0.5255710722212401, + "grad_norm": 6.392491340637207, + "learning_rate": 1.6501010033276615e-05, + "loss": 1.7243, + "step": 83620 + }, + { + "epoch": 0.5256339245379372, + "grad_norm": 7.418911457061768, + "learning_rate": 1.6500590932331962e-05, + "loss": 1.8272, + "step": 83630 + }, + { + "epoch": 0.5256967768546343, + "grad_norm": 7.277531147003174, + "learning_rate": 1.650017183138731e-05, + "loss": 1.6829, + "step": 83640 + }, + { + "epoch": 0.5257596291713315, + "grad_norm": 6.8729681968688965, + "learning_rate": 1.6499752730442656e-05, + "loss": 1.5724, + "step": 83650 + }, + { + "epoch": 0.5258224814880286, + "grad_norm": 5.070959091186523, + "learning_rate": 1.6499333629498003e-05, + "loss": 1.7617, + "step": 83660 + }, + { + "epoch": 0.5258853338047257, + "grad_norm": 6.0253448486328125, + "learning_rate": 1.649891452855335e-05, + "loss": 1.8828, + "step": 83670 + }, + { + "epoch": 0.5259481861214228, + "grad_norm": 6.4799723625183105, + "learning_rate": 1.6498495427608697e-05, + "loss": 1.6959, + "step": 83680 + }, + { + "epoch": 
0.5260110384381199, + "grad_norm": 6.933954238891602, + "learning_rate": 1.649807632666404e-05, + "loss": 1.7841, + "step": 83690 + }, + { + "epoch": 0.526073890754817, + "grad_norm": 6.568515777587891, + "learning_rate": 1.6497657225719388e-05, + "loss": 1.7374, + "step": 83700 + }, + { + "epoch": 0.5261367430715141, + "grad_norm": 7.105417251586914, + "learning_rate": 1.6497238124774735e-05, + "loss": 1.5518, + "step": 83710 + }, + { + "epoch": 0.5261995953882113, + "grad_norm": 5.699483394622803, + "learning_rate": 1.6496819023830082e-05, + "loss": 1.6673, + "step": 83720 + }, + { + "epoch": 0.5262624477049084, + "grad_norm": 6.796374320983887, + "learning_rate": 1.6496399922885426e-05, + "loss": 1.666, + "step": 83730 + }, + { + "epoch": 0.5263253000216055, + "grad_norm": 6.832988262176514, + "learning_rate": 1.6495980821940773e-05, + "loss": 1.7855, + "step": 83740 + }, + { + "epoch": 0.5263881523383026, + "grad_norm": 7.052558898925781, + "learning_rate": 1.649556172099612e-05, + "loss": 1.5677, + "step": 83750 + }, + { + "epoch": 0.5264510046549997, + "grad_norm": 7.3553924560546875, + "learning_rate": 1.6495142620051467e-05, + "loss": 1.7541, + "step": 83760 + }, + { + "epoch": 0.5265138569716968, + "grad_norm": 6.884090423583984, + "learning_rate": 1.6494723519106814e-05, + "loss": 1.8688, + "step": 83770 + }, + { + "epoch": 0.526576709288394, + "grad_norm": 6.454369068145752, + "learning_rate": 1.6494304418162158e-05, + "loss": 1.6198, + "step": 83780 + }, + { + "epoch": 0.5266395616050911, + "grad_norm": 6.577486515045166, + "learning_rate": 1.6493885317217505e-05, + "loss": 1.4164, + "step": 83790 + }, + { + "epoch": 0.5267024139217882, + "grad_norm": 6.482428550720215, + "learning_rate": 1.6493466216272852e-05, + "loss": 1.6054, + "step": 83800 + }, + { + "epoch": 0.5267652662384853, + "grad_norm": 6.978704929351807, + "learning_rate": 1.64930471153282e-05, + "loss": 1.8539, + "step": 83810 + }, + { + "epoch": 0.5268281185551824, + "grad_norm": 
6.174668788909912, + "learning_rate": 1.6492628014383546e-05, + "loss": 1.6333, + "step": 83820 + }, + { + "epoch": 0.5268909708718795, + "grad_norm": 6.739724636077881, + "learning_rate": 1.6492208913438893e-05, + "loss": 1.5853, + "step": 83830 + }, + { + "epoch": 0.5269538231885766, + "grad_norm": 6.9323272705078125, + "learning_rate": 1.649178981249424e-05, + "loss": 1.7365, + "step": 83840 + }, + { + "epoch": 0.5270166755052738, + "grad_norm": 6.932864665985107, + "learning_rate": 1.6491370711549584e-05, + "loss": 1.8743, + "step": 83850 + }, + { + "epoch": 0.5270795278219708, + "grad_norm": 6.934847354888916, + "learning_rate": 1.649095161060493e-05, + "loss": 1.6195, + "step": 83860 + }, + { + "epoch": 0.5271423801386679, + "grad_norm": 7.109174728393555, + "learning_rate": 1.6490532509660278e-05, + "loss": 1.7509, + "step": 83870 + }, + { + "epoch": 0.527205232455365, + "grad_norm": 7.261030673980713, + "learning_rate": 1.6490113408715625e-05, + "loss": 1.6738, + "step": 83880 + }, + { + "epoch": 0.5272680847720621, + "grad_norm": 6.480356693267822, + "learning_rate": 1.6489694307770972e-05, + "loss": 1.6618, + "step": 83890 + }, + { + "epoch": 0.5273309370887592, + "grad_norm": 7.185296058654785, + "learning_rate": 1.648927520682632e-05, + "loss": 1.9007, + "step": 83900 + }, + { + "epoch": 0.5273937894054563, + "grad_norm": 6.617218494415283, + "learning_rate": 1.6488856105881663e-05, + "loss": 1.6509, + "step": 83910 + }, + { + "epoch": 0.5274566417221535, + "grad_norm": 5.1567888259887695, + "learning_rate": 1.648843700493701e-05, + "loss": 1.53, + "step": 83920 + }, + { + "epoch": 0.5275194940388506, + "grad_norm": 7.2307329177856445, + "learning_rate": 1.6488017903992357e-05, + "loss": 1.6642, + "step": 83930 + }, + { + "epoch": 0.5275823463555477, + "grad_norm": 6.329922199249268, + "learning_rate": 1.6487598803047704e-05, + "loss": 1.6353, + "step": 83940 + }, + { + "epoch": 0.5276451986722448, + "grad_norm": 6.501959323883057, + "learning_rate": 
1.6487179702103048e-05, + "loss": 1.5327, + "step": 83950 + }, + { + "epoch": 0.5277080509889419, + "grad_norm": 5.8954033851623535, + "learning_rate": 1.6486760601158395e-05, + "loss": 1.7865, + "step": 83960 + }, + { + "epoch": 0.527770903305639, + "grad_norm": 6.62356424331665, + "learning_rate": 1.6486341500213742e-05, + "loss": 1.5822, + "step": 83970 + }, + { + "epoch": 0.5278337556223361, + "grad_norm": 7.6494035720825195, + "learning_rate": 1.648592239926909e-05, + "loss": 1.5285, + "step": 83980 + }, + { + "epoch": 0.5278966079390333, + "grad_norm": 6.179942607879639, + "learning_rate": 1.6485503298324436e-05, + "loss": 1.7896, + "step": 83990 + }, + { + "epoch": 0.5279594602557304, + "grad_norm": 7.346772193908691, + "learning_rate": 1.648508419737978e-05, + "loss": 1.5969, + "step": 84000 + }, + { + "epoch": 0.5280223125724275, + "grad_norm": 6.414741039276123, + "learning_rate": 1.6484665096435127e-05, + "loss": 1.6566, + "step": 84010 + }, + { + "epoch": 0.5280851648891246, + "grad_norm": 7.321435451507568, + "learning_rate": 1.6484245995490474e-05, + "loss": 1.7269, + "step": 84020 + }, + { + "epoch": 0.5281480172058217, + "grad_norm": 6.323916912078857, + "learning_rate": 1.648382689454582e-05, + "loss": 1.6105, + "step": 84030 + }, + { + "epoch": 0.5282108695225188, + "grad_norm": 6.770116806030273, + "learning_rate": 1.648340779360117e-05, + "loss": 1.8191, + "step": 84040 + }, + { + "epoch": 0.528273721839216, + "grad_norm": 6.618961334228516, + "learning_rate": 1.6482988692656515e-05, + "loss": 1.6701, + "step": 84050 + }, + { + "epoch": 0.5283365741559131, + "grad_norm": 8.083182334899902, + "learning_rate": 1.6482569591711862e-05, + "loss": 1.8281, + "step": 84060 + }, + { + "epoch": 0.5283994264726102, + "grad_norm": 6.384860038757324, + "learning_rate": 1.648215049076721e-05, + "loss": 1.5342, + "step": 84070 + }, + { + "epoch": 0.5284622787893073, + "grad_norm": 6.723288536071777, + "learning_rate": 1.6481731389822557e-05, + "loss": 1.6023, 
+ "step": 84080 + }, + { + "epoch": 0.5285251311060044, + "grad_norm": 7.6965227127075195, + "learning_rate": 1.64813122888779e-05, + "loss": 1.7709, + "step": 84090 + }, + { + "epoch": 0.5285879834227015, + "grad_norm": 7.332645893096924, + "learning_rate": 1.6480893187933247e-05, + "loss": 1.7903, + "step": 84100 + }, + { + "epoch": 0.5286508357393985, + "grad_norm": 8.037694931030273, + "learning_rate": 1.6480474086988594e-05, + "loss": 1.9066, + "step": 84110 + }, + { + "epoch": 0.5287136880560956, + "grad_norm": 6.930173397064209, + "learning_rate": 1.648005498604394e-05, + "loss": 1.7766, + "step": 84120 + }, + { + "epoch": 0.5287765403727928, + "grad_norm": 6.96900749206543, + "learning_rate": 1.6479635885099285e-05, + "loss": 1.5314, + "step": 84130 + }, + { + "epoch": 0.5288393926894899, + "grad_norm": 6.857990741729736, + "learning_rate": 1.6479216784154632e-05, + "loss": 1.4845, + "step": 84140 + }, + { + "epoch": 0.528902245006187, + "grad_norm": 7.52931022644043, + "learning_rate": 1.647879768320998e-05, + "loss": 1.8119, + "step": 84150 + }, + { + "epoch": 0.5289650973228841, + "grad_norm": 6.831870079040527, + "learning_rate": 1.6478378582265326e-05, + "loss": 1.728, + "step": 84160 + }, + { + "epoch": 0.5290279496395812, + "grad_norm": 7.987659931182861, + "learning_rate": 1.647795948132067e-05, + "loss": 1.7008, + "step": 84170 + }, + { + "epoch": 0.5290908019562783, + "grad_norm": 7.758392810821533, + "learning_rate": 1.6477540380376017e-05, + "loss": 1.5586, + "step": 84180 + }, + { + "epoch": 0.5291536542729754, + "grad_norm": 6.975496768951416, + "learning_rate": 1.6477121279431364e-05, + "loss": 1.8404, + "step": 84190 + }, + { + "epoch": 0.5292165065896726, + "grad_norm": 6.656536102294922, + "learning_rate": 1.647670217848671e-05, + "loss": 1.6575, + "step": 84200 + }, + { + "epoch": 0.5292793589063697, + "grad_norm": 6.5500006675720215, + "learning_rate": 1.647628307754206e-05, + "loss": 1.731, + "step": 84210 + }, + { + "epoch": 
0.5293422112230668, + "grad_norm": 6.406815528869629, + "learning_rate": 1.6475863976597405e-05, + "loss": 1.5357, + "step": 84220 + }, + { + "epoch": 0.5294050635397639, + "grad_norm": 7.080085754394531, + "learning_rate": 1.647544487565275e-05, + "loss": 1.5831, + "step": 84230 + }, + { + "epoch": 0.529467915856461, + "grad_norm": 11.400970458984375, + "learning_rate": 1.6475025774708096e-05, + "loss": 1.7842, + "step": 84240 + }, + { + "epoch": 0.5295307681731581, + "grad_norm": 7.819911479949951, + "learning_rate": 1.6474606673763443e-05, + "loss": 1.6955, + "step": 84250 + }, + { + "epoch": 0.5295936204898553, + "grad_norm": 7.373455047607422, + "learning_rate": 1.647418757281879e-05, + "loss": 1.7008, + "step": 84260 + }, + { + "epoch": 0.5296564728065524, + "grad_norm": 6.902364253997803, + "learning_rate": 1.6473768471874137e-05, + "loss": 1.7394, + "step": 84270 + }, + { + "epoch": 0.5297193251232495, + "grad_norm": 7.253707408905029, + "learning_rate": 1.6473349370929484e-05, + "loss": 1.4435, + "step": 84280 + }, + { + "epoch": 0.5297821774399466, + "grad_norm": 6.589545249938965, + "learning_rate": 1.647293026998483e-05, + "loss": 1.8562, + "step": 84290 + }, + { + "epoch": 0.5298450297566437, + "grad_norm": 6.285915851593018, + "learning_rate": 1.647251116904018e-05, + "loss": 1.7756, + "step": 84300 + }, + { + "epoch": 0.5299078820733408, + "grad_norm": 6.506764888763428, + "learning_rate": 1.6472092068095522e-05, + "loss": 1.6378, + "step": 84310 + }, + { + "epoch": 0.529970734390038, + "grad_norm": 6.875381946563721, + "learning_rate": 1.647167296715087e-05, + "loss": 1.5785, + "step": 84320 + }, + { + "epoch": 0.5300335867067351, + "grad_norm": 7.490484237670898, + "learning_rate": 1.6471253866206216e-05, + "loss": 1.7469, + "step": 84330 + }, + { + "epoch": 0.5300964390234322, + "grad_norm": 5.694377899169922, + "learning_rate": 1.6470834765261563e-05, + "loss": 1.6922, + "step": 84340 + }, + { + "epoch": 0.5301592913401293, + "grad_norm": 
5.9290337562561035, + "learning_rate": 1.6470415664316907e-05, + "loss": 1.6742, + "step": 84350 + }, + { + "epoch": 0.5302221436568264, + "grad_norm": 6.733861923217773, + "learning_rate": 1.6469996563372254e-05, + "loss": 1.6141, + "step": 84360 + }, + { + "epoch": 0.5302849959735234, + "grad_norm": 7.116500377655029, + "learning_rate": 1.64695774624276e-05, + "loss": 1.8392, + "step": 84370 + }, + { + "epoch": 0.5303478482902205, + "grad_norm": 6.626927852630615, + "learning_rate": 1.646915836148295e-05, + "loss": 1.6648, + "step": 84380 + }, + { + "epoch": 0.5304107006069176, + "grad_norm": 7.265458106994629, + "learning_rate": 1.6468739260538295e-05, + "loss": 1.7129, + "step": 84390 + }, + { + "epoch": 0.5304735529236148, + "grad_norm": 7.232100009918213, + "learning_rate": 1.646832015959364e-05, + "loss": 1.6931, + "step": 84400 + }, + { + "epoch": 0.5305364052403119, + "grad_norm": 7.313258647918701, + "learning_rate": 1.6467901058648986e-05, + "loss": 1.5399, + "step": 84410 + }, + { + "epoch": 0.530599257557009, + "grad_norm": 6.102558612823486, + "learning_rate": 1.6467481957704333e-05, + "loss": 1.5518, + "step": 84420 + }, + { + "epoch": 0.5306621098737061, + "grad_norm": 7.185632228851318, + "learning_rate": 1.646706285675968e-05, + "loss": 1.7073, + "step": 84430 + }, + { + "epoch": 0.5307249621904032, + "grad_norm": 7.501678943634033, + "learning_rate": 1.6466643755815027e-05, + "loss": 1.6756, + "step": 84440 + }, + { + "epoch": 0.5307878145071003, + "grad_norm": 6.88824987411499, + "learning_rate": 1.6466224654870374e-05, + "loss": 1.788, + "step": 84450 + }, + { + "epoch": 0.5308506668237974, + "grad_norm": 7.567595481872559, + "learning_rate": 1.646580555392572e-05, + "loss": 1.8341, + "step": 84460 + }, + { + "epoch": 0.5309135191404946, + "grad_norm": 5.518832683563232, + "learning_rate": 1.646538645298107e-05, + "loss": 1.6301, + "step": 84470 + }, + { + "epoch": 0.5309763714571917, + "grad_norm": 6.848931789398193, + "learning_rate": 
1.6464967352036412e-05, + "loss": 1.8876, + "step": 84480 + }, + { + "epoch": 0.5310392237738888, + "grad_norm": 7.0313401222229, + "learning_rate": 1.646454825109176e-05, + "loss": 1.5669, + "step": 84490 + }, + { + "epoch": 0.5311020760905859, + "grad_norm": 6.8558478355407715, + "learning_rate": 1.6464129150147106e-05, + "loss": 1.65, + "step": 84500 + }, + { + "epoch": 0.531164928407283, + "grad_norm": 6.209257125854492, + "learning_rate": 1.6463710049202454e-05, + "loss": 1.7208, + "step": 84510 + }, + { + "epoch": 0.5312277807239801, + "grad_norm": 6.593898296356201, + "learning_rate": 1.64632909482578e-05, + "loss": 1.6445, + "step": 84520 + }, + { + "epoch": 0.5312906330406773, + "grad_norm": 5.721263885498047, + "learning_rate": 1.6462871847313144e-05, + "loss": 1.7591, + "step": 84530 + }, + { + "epoch": 0.5313534853573744, + "grad_norm": 8.044561386108398, + "learning_rate": 1.646245274636849e-05, + "loss": 1.482, + "step": 84540 + }, + { + "epoch": 0.5314163376740715, + "grad_norm": 6.970468521118164, + "learning_rate": 1.646203364542384e-05, + "loss": 1.5991, + "step": 84550 + }, + { + "epoch": 0.5314791899907686, + "grad_norm": 5.181437969207764, + "learning_rate": 1.6461614544479185e-05, + "loss": 1.4328, + "step": 84560 + }, + { + "epoch": 0.5315420423074657, + "grad_norm": 7.308653831481934, + "learning_rate": 1.646119544353453e-05, + "loss": 1.6158, + "step": 84570 + }, + { + "epoch": 0.5316048946241628, + "grad_norm": 6.897716045379639, + "learning_rate": 1.6460776342589876e-05, + "loss": 1.7081, + "step": 84580 + }, + { + "epoch": 0.5316677469408599, + "grad_norm": 6.542665958404541, + "learning_rate": 1.6460357241645223e-05, + "loss": 1.6932, + "step": 84590 + }, + { + "epoch": 0.5317305992575571, + "grad_norm": 4.889917850494385, + "learning_rate": 1.645993814070057e-05, + "loss": 1.589, + "step": 84600 + }, + { + "epoch": 0.5317934515742542, + "grad_norm": 6.633420944213867, + "learning_rate": 1.6459519039755917e-05, + "loss": 1.792, + 
"step": 84610 + }, + { + "epoch": 0.5318563038909512, + "grad_norm": 6.323985576629639, + "learning_rate": 1.645909993881126e-05, + "loss": 1.6676, + "step": 84620 + }, + { + "epoch": 0.5319191562076483, + "grad_norm": 6.963869094848633, + "learning_rate": 1.6458680837866608e-05, + "loss": 1.6703, + "step": 84630 + }, + { + "epoch": 0.5319820085243454, + "grad_norm": 7.031579494476318, + "learning_rate": 1.6458261736921955e-05, + "loss": 1.9484, + "step": 84640 + }, + { + "epoch": 0.5320448608410425, + "grad_norm": 6.2956156730651855, + "learning_rate": 1.6457842635977302e-05, + "loss": 1.6665, + "step": 84650 + }, + { + "epoch": 0.5321077131577396, + "grad_norm": 6.316678047180176, + "learning_rate": 1.645742353503265e-05, + "loss": 1.5173, + "step": 84660 + }, + { + "epoch": 0.5321705654744368, + "grad_norm": 6.737529277801514, + "learning_rate": 1.6457004434087996e-05, + "loss": 1.5123, + "step": 84670 + }, + { + "epoch": 0.5322334177911339, + "grad_norm": 7.0265069007873535, + "learning_rate": 1.6456585333143344e-05, + "loss": 1.759, + "step": 84680 + }, + { + "epoch": 0.532296270107831, + "grad_norm": 6.733026504516602, + "learning_rate": 1.645616623219869e-05, + "loss": 1.9251, + "step": 84690 + }, + { + "epoch": 0.5323591224245281, + "grad_norm": 7.538741588592529, + "learning_rate": 1.6455747131254038e-05, + "loss": 1.6288, + "step": 84700 + }, + { + "epoch": 0.5324219747412252, + "grad_norm": 6.046632766723633, + "learning_rate": 1.645532803030938e-05, + "loss": 1.5321, + "step": 84710 + }, + { + "epoch": 0.5324848270579223, + "grad_norm": 5.785271644592285, + "learning_rate": 1.645490892936473e-05, + "loss": 1.6415, + "step": 84720 + }, + { + "epoch": 0.5325476793746194, + "grad_norm": 7.816107749938965, + "learning_rate": 1.6454489828420076e-05, + "loss": 1.7927, + "step": 84730 + }, + { + "epoch": 0.5326105316913166, + "grad_norm": 7.172451972961426, + "learning_rate": 1.6454070727475423e-05, + "loss": 1.7184, + "step": 84740 + }, + { + "epoch": 
0.5326733840080137, + "grad_norm": 7.187384128570557, + "learning_rate": 1.6453651626530766e-05, + "loss": 1.6256, + "step": 84750 + }, + { + "epoch": 0.5327362363247108, + "grad_norm": 5.67566442489624, + "learning_rate": 1.6453232525586113e-05, + "loss": 1.4756, + "step": 84760 + }, + { + "epoch": 0.5327990886414079, + "grad_norm": 6.156361103057861, + "learning_rate": 1.645281342464146e-05, + "loss": 1.5702, + "step": 84770 + }, + { + "epoch": 0.532861940958105, + "grad_norm": 6.500582218170166, + "learning_rate": 1.6452394323696807e-05, + "loss": 1.8808, + "step": 84780 + }, + { + "epoch": 0.5329247932748021, + "grad_norm": 6.974032878875732, + "learning_rate": 1.6451975222752155e-05, + "loss": 1.5445, + "step": 84790 + }, + { + "epoch": 0.5329876455914992, + "grad_norm": 6.717978000640869, + "learning_rate": 1.6451556121807498e-05, + "loss": 1.842, + "step": 84800 + }, + { + "epoch": 0.5330504979081964, + "grad_norm": 6.938985824584961, + "learning_rate": 1.6451137020862845e-05, + "loss": 1.9241, + "step": 84810 + }, + { + "epoch": 0.5331133502248935, + "grad_norm": 6.2072296142578125, + "learning_rate": 1.6450717919918192e-05, + "loss": 1.7266, + "step": 84820 + }, + { + "epoch": 0.5331762025415906, + "grad_norm": 6.984010219573975, + "learning_rate": 1.645029881897354e-05, + "loss": 1.6843, + "step": 84830 + }, + { + "epoch": 0.5332390548582877, + "grad_norm": 7.583876609802246, + "learning_rate": 1.6449879718028887e-05, + "loss": 1.6457, + "step": 84840 + }, + { + "epoch": 0.5333019071749848, + "grad_norm": 6.7121992111206055, + "learning_rate": 1.6449460617084234e-05, + "loss": 1.4486, + "step": 84850 + }, + { + "epoch": 0.5333647594916819, + "grad_norm": 6.792508602142334, + "learning_rate": 1.6449041516139577e-05, + "loss": 1.5144, + "step": 84860 + }, + { + "epoch": 0.533427611808379, + "grad_norm": 7.822506904602051, + "learning_rate": 1.6448622415194924e-05, + "loss": 1.6888, + "step": 84870 + }, + { + "epoch": 0.5334904641250761, + "grad_norm": 
7.036062240600586, + "learning_rate": 1.644820331425027e-05, + "loss": 1.9722, + "step": 84880 + }, + { + "epoch": 0.5335533164417732, + "grad_norm": 6.179880619049072, + "learning_rate": 1.644778421330562e-05, + "loss": 1.569, + "step": 84890 + }, + { + "epoch": 0.5336161687584703, + "grad_norm": 7.492377281188965, + "learning_rate": 1.6447365112360966e-05, + "loss": 1.6265, + "step": 84900 + }, + { + "epoch": 0.5336790210751674, + "grad_norm": 6.322450160980225, + "learning_rate": 1.6446946011416313e-05, + "loss": 1.4699, + "step": 84910 + }, + { + "epoch": 0.5337418733918645, + "grad_norm": 6.580765247344971, + "learning_rate": 1.644652691047166e-05, + "loss": 1.6407, + "step": 84920 + }, + { + "epoch": 0.5338047257085616, + "grad_norm": 6.971106052398682, + "learning_rate": 1.6446107809527003e-05, + "loss": 1.5293, + "step": 84930 + }, + { + "epoch": 0.5338675780252587, + "grad_norm": 6.923962593078613, + "learning_rate": 1.644568870858235e-05, + "loss": 1.5995, + "step": 84940 + }, + { + "epoch": 0.5339304303419559, + "grad_norm": 8.402005195617676, + "learning_rate": 1.6445269607637698e-05, + "loss": 1.6225, + "step": 84950 + }, + { + "epoch": 0.533993282658653, + "grad_norm": 6.86106538772583, + "learning_rate": 1.6444850506693045e-05, + "loss": 1.6665, + "step": 84960 + }, + { + "epoch": 0.5340561349753501, + "grad_norm": 5.663937091827393, + "learning_rate": 1.6444431405748388e-05, + "loss": 1.5055, + "step": 84970 + }, + { + "epoch": 0.5341189872920472, + "grad_norm": 6.386752605438232, + "learning_rate": 1.6444012304803735e-05, + "loss": 1.8918, + "step": 84980 + }, + { + "epoch": 0.5341818396087443, + "grad_norm": 5.876289367675781, + "learning_rate": 1.6443593203859082e-05, + "loss": 1.6871, + "step": 84990 + }, + { + "epoch": 0.5342446919254414, + "grad_norm": 6.044205188751221, + "learning_rate": 1.644317410291443e-05, + "loss": 1.6087, + "step": 85000 + }, + { + "epoch": 0.5343075442421386, + "grad_norm": 7.124244689941406, + "learning_rate": 
1.6442755001969777e-05, + "loss": 1.8886, + "step": 85010 + }, + { + "epoch": 0.5343703965588357, + "grad_norm": 6.722472190856934, + "learning_rate": 1.644233590102512e-05, + "loss": 1.6599, + "step": 85020 + }, + { + "epoch": 0.5344332488755328, + "grad_norm": 6.756608486175537, + "learning_rate": 1.6441916800080467e-05, + "loss": 1.7099, + "step": 85030 + }, + { + "epoch": 0.5344961011922299, + "grad_norm": 6.803463459014893, + "learning_rate": 1.6441497699135814e-05, + "loss": 1.5995, + "step": 85040 + }, + { + "epoch": 0.534558953508927, + "grad_norm": 7.612119197845459, + "learning_rate": 1.644107859819116e-05, + "loss": 1.788, + "step": 85050 + }, + { + "epoch": 0.5346218058256241, + "grad_norm": 6.436509132385254, + "learning_rate": 1.644065949724651e-05, + "loss": 1.4981, + "step": 85060 + }, + { + "epoch": 0.5346846581423212, + "grad_norm": 6.51688814163208, + "learning_rate": 1.6440240396301856e-05, + "loss": 1.7471, + "step": 85070 + }, + { + "epoch": 0.5347475104590184, + "grad_norm": 6.626704216003418, + "learning_rate": 1.6439821295357203e-05, + "loss": 1.765, + "step": 85080 + }, + { + "epoch": 0.5348103627757155, + "grad_norm": 6.392571449279785, + "learning_rate": 1.643940219441255e-05, + "loss": 1.6547, + "step": 85090 + }, + { + "epoch": 0.5348732150924126, + "grad_norm": 6.908417701721191, + "learning_rate": 1.6438983093467893e-05, + "loss": 1.9132, + "step": 85100 + }, + { + "epoch": 0.5349360674091097, + "grad_norm": 6.740513801574707, + "learning_rate": 1.643856399252324e-05, + "loss": 1.8449, + "step": 85110 + }, + { + "epoch": 0.5349989197258068, + "grad_norm": 6.186057090759277, + "learning_rate": 1.6438144891578588e-05, + "loss": 1.8939, + "step": 85120 + }, + { + "epoch": 0.5350617720425038, + "grad_norm": 6.962738513946533, + "learning_rate": 1.6437725790633935e-05, + "loss": 1.8209, + "step": 85130 + }, + { + "epoch": 0.5351246243592009, + "grad_norm": 6.540321350097656, + "learning_rate": 1.6437306689689282e-05, + "loss": 1.6953, + 
"step": 85140 + }, + { + "epoch": 0.535187476675898, + "grad_norm": 7.460124492645264, + "learning_rate": 1.6436887588744625e-05, + "loss": 1.7276, + "step": 85150 + }, + { + "epoch": 0.5352503289925952, + "grad_norm": 7.499505519866943, + "learning_rate": 1.6436468487799972e-05, + "loss": 1.8471, + "step": 85160 + }, + { + "epoch": 0.5353131813092923, + "grad_norm": 6.599464416503906, + "learning_rate": 1.643604938685532e-05, + "loss": 1.6341, + "step": 85170 + }, + { + "epoch": 0.5353760336259894, + "grad_norm": 6.362654685974121, + "learning_rate": 1.6435630285910667e-05, + "loss": 1.6101, + "step": 85180 + }, + { + "epoch": 0.5354388859426865, + "grad_norm": 6.673609256744385, + "learning_rate": 1.643521118496601e-05, + "loss": 1.7353, + "step": 85190 + }, + { + "epoch": 0.5355017382593836, + "grad_norm": 6.538881778717041, + "learning_rate": 1.6434792084021357e-05, + "loss": 1.6995, + "step": 85200 + }, + { + "epoch": 0.5355645905760807, + "grad_norm": 8.122904777526855, + "learning_rate": 1.6434372983076704e-05, + "loss": 1.6756, + "step": 85210 + }, + { + "epoch": 0.5356274428927779, + "grad_norm": 7.356871604919434, + "learning_rate": 1.643395388213205e-05, + "loss": 1.6809, + "step": 85220 + }, + { + "epoch": 0.535690295209475, + "grad_norm": 5.879265308380127, + "learning_rate": 1.64335347811874e-05, + "loss": 1.7661, + "step": 85230 + }, + { + "epoch": 0.5357531475261721, + "grad_norm": 5.349161148071289, + "learning_rate": 1.6433115680242742e-05, + "loss": 1.654, + "step": 85240 + }, + { + "epoch": 0.5358159998428692, + "grad_norm": 6.181868553161621, + "learning_rate": 1.643269657929809e-05, + "loss": 1.8045, + "step": 85250 + }, + { + "epoch": 0.5358788521595663, + "grad_norm": 6.501641273498535, + "learning_rate": 1.6432277478353436e-05, + "loss": 1.7423, + "step": 85260 + }, + { + "epoch": 0.5359417044762634, + "grad_norm": 6.203233242034912, + "learning_rate": 1.6431900287503248e-05, + "loss": 1.6475, + "step": 85270 + }, + { + "epoch": 
0.5360045567929606, + "grad_norm": 7.076571464538574, + "learning_rate": 1.6431481186558595e-05, + "loss": 1.4486, + "step": 85280 + }, + { + "epoch": 0.5360674091096577, + "grad_norm": 6.285316467285156, + "learning_rate": 1.6431062085613942e-05, + "loss": 1.5754, + "step": 85290 + }, + { + "epoch": 0.5361302614263548, + "grad_norm": 6.930043697357178, + "learning_rate": 1.643064298466929e-05, + "loss": 1.5767, + "step": 85300 + }, + { + "epoch": 0.5361931137430519, + "grad_norm": 6.204666614532471, + "learning_rate": 1.6430223883724633e-05, + "loss": 1.6366, + "step": 85310 + }, + { + "epoch": 0.536255966059749, + "grad_norm": 7.860479831695557, + "learning_rate": 1.642980478277998e-05, + "loss": 1.8423, + "step": 85320 + }, + { + "epoch": 0.5363188183764461, + "grad_norm": 7.237555503845215, + "learning_rate": 1.6429385681835327e-05, + "loss": 1.7368, + "step": 85330 + }, + { + "epoch": 0.5363816706931432, + "grad_norm": 6.343367576599121, + "learning_rate": 1.6428966580890674e-05, + "loss": 1.489, + "step": 85340 + }, + { + "epoch": 0.5364445230098404, + "grad_norm": 6.436690330505371, + "learning_rate": 1.642854747994602e-05, + "loss": 1.8278, + "step": 85350 + }, + { + "epoch": 0.5365073753265375, + "grad_norm": 6.2652153968811035, + "learning_rate": 1.6428128379001368e-05, + "loss": 1.4449, + "step": 85360 + }, + { + "epoch": 0.5365702276432346, + "grad_norm": 6.447495937347412, + "learning_rate": 1.6427709278056715e-05, + "loss": 1.6627, + "step": 85370 + }, + { + "epoch": 0.5366330799599317, + "grad_norm": 5.91556978225708, + "learning_rate": 1.6427290177112062e-05, + "loss": 1.487, + "step": 85380 + }, + { + "epoch": 0.5366959322766287, + "grad_norm": 6.8496222496032715, + "learning_rate": 1.642687107616741e-05, + "loss": 1.5192, + "step": 85390 + }, + { + "epoch": 0.5367587845933258, + "grad_norm": 6.301149845123291, + "learning_rate": 1.6426451975222753e-05, + "loss": 1.6423, + "step": 85400 + }, + { + "epoch": 0.5368216369100229, + "grad_norm": 
8.504805564880371, + "learning_rate": 1.64260328742781e-05, + "loss": 1.818, + "step": 85410 + }, + { + "epoch": 0.53688448922672, + "grad_norm": 6.374795436859131, + "learning_rate": 1.6425613773333447e-05, + "loss": 1.9748, + "step": 85420 + }, + { + "epoch": 0.5369473415434172, + "grad_norm": 6.491119861602783, + "learning_rate": 1.6425194672388794e-05, + "loss": 1.9015, + "step": 85430 + }, + { + "epoch": 0.5370101938601143, + "grad_norm": 7.207005023956299, + "learning_rate": 1.642477557144414e-05, + "loss": 1.7151, + "step": 85440 + }, + { + "epoch": 0.5370730461768114, + "grad_norm": 7.193447113037109, + "learning_rate": 1.6424356470499485e-05, + "loss": 1.8635, + "step": 85450 + }, + { + "epoch": 0.5371358984935085, + "grad_norm": 7.126465320587158, + "learning_rate": 1.6423937369554832e-05, + "loss": 1.8622, + "step": 85460 + }, + { + "epoch": 0.5371987508102056, + "grad_norm": 6.056211948394775, + "learning_rate": 1.642351826861018e-05, + "loss": 1.4906, + "step": 85470 + }, + { + "epoch": 0.5372616031269027, + "grad_norm": 6.354976654052734, + "learning_rate": 1.6423099167665526e-05, + "loss": 1.7693, + "step": 85480 + }, + { + "epoch": 0.5373244554435999, + "grad_norm": 5.696046352386475, + "learning_rate": 1.642268006672087e-05, + "loss": 1.7025, + "step": 85490 + }, + { + "epoch": 0.537387307760297, + "grad_norm": 5.817015171051025, + "learning_rate": 1.6422260965776217e-05, + "loss": 1.7058, + "step": 85500 + }, + { + "epoch": 0.5374501600769941, + "grad_norm": 7.481677532196045, + "learning_rate": 1.6421841864831564e-05, + "loss": 1.6445, + "step": 85510 + }, + { + "epoch": 0.5375130123936912, + "grad_norm": 6.951291084289551, + "learning_rate": 1.642142276388691e-05, + "loss": 1.8054, + "step": 85520 + }, + { + "epoch": 0.5375758647103883, + "grad_norm": 6.017706394195557, + "learning_rate": 1.6421003662942258e-05, + "loss": 1.4114, + "step": 85530 + }, + { + "epoch": 0.5376387170270854, + "grad_norm": 8.327905654907227, + "learning_rate": 
1.6420584561997605e-05, + "loss": 1.6243, + "step": 85540 + }, + { + "epoch": 0.5377015693437825, + "grad_norm": 6.597580909729004, + "learning_rate": 1.642016546105295e-05, + "loss": 1.6225, + "step": 85550 + }, + { + "epoch": 0.5377644216604797, + "grad_norm": 8.693310737609863, + "learning_rate": 1.6419746360108296e-05, + "loss": 1.8426, + "step": 85560 + }, + { + "epoch": 0.5378272739771768, + "grad_norm": 7.165815353393555, + "learning_rate": 1.6419327259163643e-05, + "loss": 1.5574, + "step": 85570 + }, + { + "epoch": 0.5378901262938739, + "grad_norm": 6.516468048095703, + "learning_rate": 1.641890815821899e-05, + "loss": 1.7302, + "step": 85580 + }, + { + "epoch": 0.537952978610571, + "grad_norm": 5.70261812210083, + "learning_rate": 1.6418489057274337e-05, + "loss": 1.7721, + "step": 85590 + }, + { + "epoch": 0.5380158309272681, + "grad_norm": 7.128627300262451, + "learning_rate": 1.6418069956329684e-05, + "loss": 1.6988, + "step": 85600 + }, + { + "epoch": 0.5380786832439652, + "grad_norm": 5.911867618560791, + "learning_rate": 1.641765085538503e-05, + "loss": 1.5781, + "step": 85610 + }, + { + "epoch": 0.5381415355606624, + "grad_norm": 5.839148044586182, + "learning_rate": 1.6417231754440375e-05, + "loss": 1.3336, + "step": 85620 + }, + { + "epoch": 0.5382043878773595, + "grad_norm": 6.416183948516846, + "learning_rate": 1.6416812653495722e-05, + "loss": 1.6185, + "step": 85630 + }, + { + "epoch": 0.5382672401940565, + "grad_norm": 6.362691879272461, + "learning_rate": 1.641639355255107e-05, + "loss": 2.0266, + "step": 85640 + }, + { + "epoch": 0.5383300925107536, + "grad_norm": 6.19903039932251, + "learning_rate": 1.6415974451606416e-05, + "loss": 1.5156, + "step": 85650 + }, + { + "epoch": 0.5383929448274507, + "grad_norm": 6.622096538543701, + "learning_rate": 1.6415555350661763e-05, + "loss": 1.6729, + "step": 85660 + }, + { + "epoch": 0.5384557971441478, + "grad_norm": 7.795952320098877, + "learning_rate": 1.6415136249717107e-05, + "loss": 1.5863, + 
"step": 85670 + }, + { + "epoch": 0.5385186494608449, + "grad_norm": 6.2659831047058105, + "learning_rate": 1.6414717148772454e-05, + "loss": 1.8809, + "step": 85680 + }, + { + "epoch": 0.538581501777542, + "grad_norm": 7.086266994476318, + "learning_rate": 1.64142980478278e-05, + "loss": 1.7379, + "step": 85690 + }, + { + "epoch": 0.5386443540942392, + "grad_norm": 6.989285469055176, + "learning_rate": 1.6413878946883148e-05, + "loss": 2.0259, + "step": 85700 + }, + { + "epoch": 0.5387072064109363, + "grad_norm": 6.347990036010742, + "learning_rate": 1.6413459845938492e-05, + "loss": 1.8471, + "step": 85710 + }, + { + "epoch": 0.5387700587276334, + "grad_norm": 6.735945701599121, + "learning_rate": 1.641304074499384e-05, + "loss": 1.7722, + "step": 85720 + }, + { + "epoch": 0.5388329110443305, + "grad_norm": 6.259716987609863, + "learning_rate": 1.6412621644049186e-05, + "loss": 1.6662, + "step": 85730 + }, + { + "epoch": 0.5388957633610276, + "grad_norm": 7.281030178070068, + "learning_rate": 1.6412202543104533e-05, + "loss": 1.7436, + "step": 85740 + }, + { + "epoch": 0.5389586156777247, + "grad_norm": 6.42799711227417, + "learning_rate": 1.641178344215988e-05, + "loss": 1.6246, + "step": 85750 + }, + { + "epoch": 0.5390214679944219, + "grad_norm": 7.063828945159912, + "learning_rate": 1.6411364341215227e-05, + "loss": 1.642, + "step": 85760 + }, + { + "epoch": 0.539084320311119, + "grad_norm": 6.377913475036621, + "learning_rate": 1.6410945240270574e-05, + "loss": 1.6268, + "step": 85770 + }, + { + "epoch": 0.5391471726278161, + "grad_norm": 6.6367878913879395, + "learning_rate": 1.641052613932592e-05, + "loss": 1.5504, + "step": 85780 + }, + { + "epoch": 0.5392100249445132, + "grad_norm": 6.8672709465026855, + "learning_rate": 1.6410107038381268e-05, + "loss": 1.7591, + "step": 85790 + }, + { + "epoch": 0.5392728772612103, + "grad_norm": 7.131933212280273, + "learning_rate": 1.6409687937436612e-05, + "loss": 1.7588, + "step": 85800 + }, + { + "epoch": 
0.5393357295779074, + "grad_norm": 7.047543525695801, + "learning_rate": 1.640926883649196e-05, + "loss": 1.8538, + "step": 85810 + }, + { + "epoch": 0.5393985818946045, + "grad_norm": 6.190505504608154, + "learning_rate": 1.6408849735547306e-05, + "loss": 1.7086, + "step": 85820 + }, + { + "epoch": 0.5394614342113017, + "grad_norm": 5.566699028015137, + "learning_rate": 1.6408430634602653e-05, + "loss": 1.5023, + "step": 85830 + }, + { + "epoch": 0.5395242865279988, + "grad_norm": 8.045251846313477, + "learning_rate": 1.6408011533658e-05, + "loss": 1.599, + "step": 85840 + }, + { + "epoch": 0.5395871388446959, + "grad_norm": 8.263165473937988, + "learning_rate": 1.6407592432713344e-05, + "loss": 1.7251, + "step": 85850 + }, + { + "epoch": 0.539649991161393, + "grad_norm": 6.604892253875732, + "learning_rate": 1.640717333176869e-05, + "loss": 1.6502, + "step": 85860 + }, + { + "epoch": 0.5397128434780901, + "grad_norm": 7.067389011383057, + "learning_rate": 1.6406754230824038e-05, + "loss": 1.6586, + "step": 85870 + }, + { + "epoch": 0.5397756957947872, + "grad_norm": 6.605795383453369, + "learning_rate": 1.6406335129879385e-05, + "loss": 1.6772, + "step": 85880 + }, + { + "epoch": 0.5398385481114844, + "grad_norm": 6.340599060058594, + "learning_rate": 1.640591602893473e-05, + "loss": 1.6492, + "step": 85890 + }, + { + "epoch": 0.5399014004281814, + "grad_norm": 7.894109725952148, + "learning_rate": 1.6405496927990076e-05, + "loss": 1.6686, + "step": 85900 + }, + { + "epoch": 0.5399642527448785, + "grad_norm": 6.646517753601074, + "learning_rate": 1.6405077827045423e-05, + "loss": 1.8185, + "step": 85910 + }, + { + "epoch": 0.5400271050615756, + "grad_norm": 5.803297519683838, + "learning_rate": 1.640465872610077e-05, + "loss": 1.4937, + "step": 85920 + }, + { + "epoch": 0.5400899573782727, + "grad_norm": 5.791867733001709, + "learning_rate": 1.6404239625156114e-05, + "loss": 1.8236, + "step": 85930 + }, + { + "epoch": 0.5401528096949698, + "grad_norm": 
6.289924144744873, + "learning_rate": 1.640382052421146e-05, + "loss": 1.7929, + "step": 85940 + }, + { + "epoch": 0.5402156620116669, + "grad_norm": 6.742290019989014, + "learning_rate": 1.6403401423266808e-05, + "loss": 1.7026, + "step": 85950 + }, + { + "epoch": 0.540278514328364, + "grad_norm": 6.8754987716674805, + "learning_rate": 1.6402982322322155e-05, + "loss": 1.6969, + "step": 85960 + }, + { + "epoch": 0.5403413666450612, + "grad_norm": 5.435462474822998, + "learning_rate": 1.6402563221377502e-05, + "loss": 1.5332, + "step": 85970 + }, + { + "epoch": 0.5404042189617583, + "grad_norm": 7.776442527770996, + "learning_rate": 1.640214412043285e-05, + "loss": 1.8064, + "step": 85980 + }, + { + "epoch": 0.5404670712784554, + "grad_norm": 7.129558086395264, + "learning_rate": 1.6401725019488196e-05, + "loss": 1.8947, + "step": 85990 + }, + { + "epoch": 0.5405299235951525, + "grad_norm": 6.044556140899658, + "learning_rate": 1.6401305918543543e-05, + "loss": 1.656, + "step": 86000 + }, + { + "epoch": 0.5405927759118496, + "grad_norm": 7.796718120574951, + "learning_rate": 1.640088681759889e-05, + "loss": 1.9945, + "step": 86010 + }, + { + "epoch": 0.5406556282285467, + "grad_norm": 6.118916034698486, + "learning_rate": 1.6400467716654234e-05, + "loss": 1.6391, + "step": 86020 + }, + { + "epoch": 0.5407184805452439, + "grad_norm": 5.638603687286377, + "learning_rate": 1.640004861570958e-05, + "loss": 1.5943, + "step": 86030 + }, + { + "epoch": 0.540781332861941, + "grad_norm": 6.42252254486084, + "learning_rate": 1.6399629514764928e-05, + "loss": 1.5481, + "step": 86040 + }, + { + "epoch": 0.5408441851786381, + "grad_norm": 7.297764778137207, + "learning_rate": 1.6399210413820275e-05, + "loss": 1.7197, + "step": 86050 + }, + { + "epoch": 0.5409070374953352, + "grad_norm": 6.368427753448486, + "learning_rate": 1.6398791312875622e-05, + "loss": 1.6432, + "step": 86060 + }, + { + "epoch": 0.5409698898120323, + "grad_norm": 6.701649188995361, + "learning_rate": 
1.6398372211930966e-05, + "loss": 1.7287, + "step": 86070 + }, + { + "epoch": 0.5410327421287294, + "grad_norm": 6.809452056884766, + "learning_rate": 1.6397953110986313e-05, + "loss": 1.6901, + "step": 86080 + }, + { + "epoch": 0.5410955944454265, + "grad_norm": 7.57839298248291, + "learning_rate": 1.639753401004166e-05, + "loss": 1.6649, + "step": 86090 + }, + { + "epoch": 0.5411584467621237, + "grad_norm": 11.420140266418457, + "learning_rate": 1.6397114909097007e-05, + "loss": 1.6038, + "step": 86100 + }, + { + "epoch": 0.5412212990788208, + "grad_norm": 6.622860431671143, + "learning_rate": 1.639669580815235e-05, + "loss": 1.8323, + "step": 86110 + }, + { + "epoch": 0.5412841513955179, + "grad_norm": 5.812434196472168, + "learning_rate": 1.6396276707207698e-05, + "loss": 1.7367, + "step": 86120 + }, + { + "epoch": 0.541347003712215, + "grad_norm": 7.1364240646362305, + "learning_rate": 1.6395857606263045e-05, + "loss": 1.7987, + "step": 86130 + }, + { + "epoch": 0.5414098560289121, + "grad_norm": 7.4703569412231445, + "learning_rate": 1.6395438505318392e-05, + "loss": 1.8512, + "step": 86140 + }, + { + "epoch": 0.5414727083456091, + "grad_norm": 6.3275651931762695, + "learning_rate": 1.639501940437374e-05, + "loss": 1.7602, + "step": 86150 + }, + { + "epoch": 0.5415355606623062, + "grad_norm": 6.205627918243408, + "learning_rate": 1.6394600303429086e-05, + "loss": 1.6417, + "step": 86160 + }, + { + "epoch": 0.5415984129790034, + "grad_norm": 6.279049396514893, + "learning_rate": 1.639418120248443e-05, + "loss": 1.6631, + "step": 86170 + }, + { + "epoch": 0.5416612652957005, + "grad_norm": 7.833987236022949, + "learning_rate": 1.6393762101539777e-05, + "loss": 1.5057, + "step": 86180 + }, + { + "epoch": 0.5417241176123976, + "grad_norm": 6.630548000335693, + "learning_rate": 1.6393343000595124e-05, + "loss": 1.6445, + "step": 86190 + }, + { + "epoch": 0.5417869699290947, + "grad_norm": 6.6342453956604, + "learning_rate": 1.639292389965047e-05, + "loss": 1.8989, 
+ "step": 86200 + }, + { + "epoch": 0.5418498222457918, + "grad_norm": 6.155070781707764, + "learning_rate": 1.6392504798705818e-05, + "loss": 1.6871, + "step": 86210 + }, + { + "epoch": 0.5419126745624889, + "grad_norm": 6.080020904541016, + "learning_rate": 1.6392085697761165e-05, + "loss": 1.666, + "step": 86220 + }, + { + "epoch": 0.541975526879186, + "grad_norm": 7.034708023071289, + "learning_rate": 1.6391666596816512e-05, + "loss": 1.7103, + "step": 86230 + }, + { + "epoch": 0.5420383791958832, + "grad_norm": 6.662476539611816, + "learning_rate": 1.6391247495871856e-05, + "loss": 1.8543, + "step": 86240 + }, + { + "epoch": 0.5421012315125803, + "grad_norm": 6.4750776290893555, + "learning_rate": 1.6390828394927203e-05, + "loss": 1.8943, + "step": 86250 + }, + { + "epoch": 0.5421640838292774, + "grad_norm": 6.518702983856201, + "learning_rate": 1.639040929398255e-05, + "loss": 1.8582, + "step": 86260 + }, + { + "epoch": 0.5422269361459745, + "grad_norm": 7.378445148468018, + "learning_rate": 1.6389990193037897e-05, + "loss": 1.5996, + "step": 86270 + }, + { + "epoch": 0.5422897884626716, + "grad_norm": 5.998882293701172, + "learning_rate": 1.6389571092093244e-05, + "loss": 1.7373, + "step": 86280 + }, + { + "epoch": 0.5423526407793687, + "grad_norm": 6.869247913360596, + "learning_rate": 1.6389151991148588e-05, + "loss": 1.7492, + "step": 86290 + }, + { + "epoch": 0.5424154930960658, + "grad_norm": 9.03229808807373, + "learning_rate": 1.6388732890203935e-05, + "loss": 1.8033, + "step": 86300 + }, + { + "epoch": 0.542478345412763, + "grad_norm": 7.205885410308838, + "learning_rate": 1.6388313789259282e-05, + "loss": 1.7698, + "step": 86310 + }, + { + "epoch": 0.5425411977294601, + "grad_norm": 6.515379905700684, + "learning_rate": 1.638789468831463e-05, + "loss": 1.5568, + "step": 86320 + }, + { + "epoch": 0.5426040500461572, + "grad_norm": 6.291879177093506, + "learning_rate": 1.6387475587369973e-05, + "loss": 1.7063, + "step": 86330 + }, + { + "epoch": 
0.5426669023628543, + "grad_norm": 6.5558271408081055, + "learning_rate": 1.638705648642532e-05, + "loss": 1.554, + "step": 86340 + }, + { + "epoch": 0.5427297546795514, + "grad_norm": 6.825722694396973, + "learning_rate": 1.6386637385480667e-05, + "loss": 1.8762, + "step": 86350 + }, + { + "epoch": 0.5427926069962485, + "grad_norm": 6.950225830078125, + "learning_rate": 1.6386218284536014e-05, + "loss": 1.4571, + "step": 86360 + }, + { + "epoch": 0.5428554593129457, + "grad_norm": 6.12157678604126, + "learning_rate": 1.638579918359136e-05, + "loss": 1.6073, + "step": 86370 + }, + { + "epoch": 0.5429183116296428, + "grad_norm": 6.891550540924072, + "learning_rate": 1.6385380082646708e-05, + "loss": 1.5565, + "step": 86380 + }, + { + "epoch": 0.5429811639463399, + "grad_norm": 6.586368560791016, + "learning_rate": 1.6384960981702055e-05, + "loss": 1.389, + "step": 86390 + }, + { + "epoch": 0.543044016263037, + "grad_norm": 6.79563570022583, + "learning_rate": 1.6384541880757402e-05, + "loss": 1.6659, + "step": 86400 + }, + { + "epoch": 0.543106868579734, + "grad_norm": 7.297008991241455, + "learning_rate": 1.638412277981275e-05, + "loss": 1.5353, + "step": 86410 + }, + { + "epoch": 0.5431697208964311, + "grad_norm": 6.567941665649414, + "learning_rate": 1.6383703678868093e-05, + "loss": 1.6088, + "step": 86420 + }, + { + "epoch": 0.5432325732131282, + "grad_norm": 6.119392395019531, + "learning_rate": 1.638328457792344e-05, + "loss": 1.5637, + "step": 86430 + }, + { + "epoch": 0.5432954255298253, + "grad_norm": 6.379626750946045, + "learning_rate": 1.6382865476978787e-05, + "loss": 1.7865, + "step": 86440 + }, + { + "epoch": 0.5433582778465225, + "grad_norm": 5.699408531188965, + "learning_rate": 1.6382446376034134e-05, + "loss": 1.7122, + "step": 86450 + }, + { + "epoch": 0.5434211301632196, + "grad_norm": 6.399451732635498, + "learning_rate": 1.638202727508948e-05, + "loss": 1.7304, + "step": 86460 + }, + { + "epoch": 0.5434839824799167, + "grad_norm": 
6.494431495666504, + "learning_rate": 1.6381608174144825e-05, + "loss": 1.4777, + "step": 86470 + }, + { + "epoch": 0.5435468347966138, + "grad_norm": 6.212937355041504, + "learning_rate": 1.6381189073200172e-05, + "loss": 1.5896, + "step": 86480 + }, + { + "epoch": 0.5436096871133109, + "grad_norm": 6.120029449462891, + "learning_rate": 1.638076997225552e-05, + "loss": 1.843, + "step": 86490 + }, + { + "epoch": 0.543672539430008, + "grad_norm": 6.218716144561768, + "learning_rate": 1.6380350871310866e-05, + "loss": 1.6116, + "step": 86500 + }, + { + "epoch": 0.5437353917467052, + "grad_norm": 5.772097587585449, + "learning_rate": 1.637993177036621e-05, + "loss": 1.6283, + "step": 86510 + }, + { + "epoch": 0.5437982440634023, + "grad_norm": 7.5236358642578125, + "learning_rate": 1.6379512669421557e-05, + "loss": 1.6003, + "step": 86520 + }, + { + "epoch": 0.5438610963800994, + "grad_norm": 7.945309638977051, + "learning_rate": 1.6379093568476904e-05, + "loss": 1.701, + "step": 86530 + }, + { + "epoch": 0.5439239486967965, + "grad_norm": 6.487154006958008, + "learning_rate": 1.637867446753225e-05, + "loss": 1.5663, + "step": 86540 + }, + { + "epoch": 0.5439868010134936, + "grad_norm": 7.235054969787598, + "learning_rate": 1.6378255366587598e-05, + "loss": 1.6827, + "step": 86550 + }, + { + "epoch": 0.5440496533301907, + "grad_norm": 6.165349960327148, + "learning_rate": 1.6377836265642942e-05, + "loss": 1.4464, + "step": 86560 + }, + { + "epoch": 0.5441125056468878, + "grad_norm": 7.019979953765869, + "learning_rate": 1.637741716469829e-05, + "loss": 1.5645, + "step": 86570 + }, + { + "epoch": 0.544175357963585, + "grad_norm": 7.073566436767578, + "learning_rate": 1.6376998063753636e-05, + "loss": 1.6864, + "step": 86580 + }, + { + "epoch": 0.5442382102802821, + "grad_norm": 7.4121527671813965, + "learning_rate": 1.6376578962808983e-05, + "loss": 1.8141, + "step": 86590 + }, + { + "epoch": 0.5443010625969792, + "grad_norm": 6.5008463859558105, + "learning_rate": 
1.637615986186433e-05, + "loss": 1.6266, + "step": 86600 + }, + { + "epoch": 0.5443639149136763, + "grad_norm": 7.9003753662109375, + "learning_rate": 1.6375740760919677e-05, + "loss": 1.6775, + "step": 86610 + }, + { + "epoch": 0.5444267672303734, + "grad_norm": 7.4179863929748535, + "learning_rate": 1.6375321659975024e-05, + "loss": 1.6263, + "step": 86620 + }, + { + "epoch": 0.5444896195470705, + "grad_norm": 6.775514602661133, + "learning_rate": 1.637490255903037e-05, + "loss": 1.5732, + "step": 86630 + }, + { + "epoch": 0.5445524718637677, + "grad_norm": 6.417285919189453, + "learning_rate": 1.6374483458085715e-05, + "loss": 1.6495, + "step": 86640 + }, + { + "epoch": 0.5446153241804648, + "grad_norm": 6.63004207611084, + "learning_rate": 1.6374064357141062e-05, + "loss": 1.882, + "step": 86650 + }, + { + "epoch": 0.5446781764971618, + "grad_norm": 6.118006229400635, + "learning_rate": 1.637364525619641e-05, + "loss": 1.6263, + "step": 86660 + }, + { + "epoch": 0.5447410288138589, + "grad_norm": 6.563739776611328, + "learning_rate": 1.6373226155251756e-05, + "loss": 1.6761, + "step": 86670 + }, + { + "epoch": 0.544803881130556, + "grad_norm": 6.6065545082092285, + "learning_rate": 1.6372807054307103e-05, + "loss": 1.4558, + "step": 86680 + }, + { + "epoch": 0.5448667334472531, + "grad_norm": 6.14282751083374, + "learning_rate": 1.6372387953362447e-05, + "loss": 1.6269, + "step": 86690 + }, + { + "epoch": 0.5449295857639502, + "grad_norm": 6.8921098709106445, + "learning_rate": 1.6371968852417794e-05, + "loss": 1.423, + "step": 86700 + }, + { + "epoch": 0.5449924380806473, + "grad_norm": 7.513279438018799, + "learning_rate": 1.637154975147314e-05, + "loss": 1.629, + "step": 86710 + }, + { + "epoch": 0.5450552903973445, + "grad_norm": 5.247377872467041, + "learning_rate": 1.6371130650528488e-05, + "loss": 1.6349, + "step": 86720 + }, + { + "epoch": 0.5451181427140416, + "grad_norm": 7.0045318603515625, + "learning_rate": 1.6370711549583832e-05, + "loss": 1.5124, 
+ "step": 86730 + }, + { + "epoch": 0.5451809950307387, + "grad_norm": 7.852500915527344, + "learning_rate": 1.637029244863918e-05, + "loss": 1.6307, + "step": 86740 + }, + { + "epoch": 0.5452438473474358, + "grad_norm": 7.170313358306885, + "learning_rate": 1.6369873347694526e-05, + "loss": 1.5946, + "step": 86750 + }, + { + "epoch": 0.5453066996641329, + "grad_norm": 7.545778274536133, + "learning_rate": 1.6369454246749873e-05, + "loss": 1.6845, + "step": 86760 + }, + { + "epoch": 0.54536955198083, + "grad_norm": 5.048802375793457, + "learning_rate": 1.636903514580522e-05, + "loss": 1.7877, + "step": 86770 + }, + { + "epoch": 0.5454324042975272, + "grad_norm": 6.210660457611084, + "learning_rate": 1.6368616044860567e-05, + "loss": 1.6917, + "step": 86780 + }, + { + "epoch": 0.5454952566142243, + "grad_norm": 6.029952526092529, + "learning_rate": 1.6368196943915914e-05, + "loss": 1.7503, + "step": 86790 + }, + { + "epoch": 0.5455581089309214, + "grad_norm": 7.362029552459717, + "learning_rate": 1.6367777842971258e-05, + "loss": 1.4296, + "step": 86800 + }, + { + "epoch": 0.5456209612476185, + "grad_norm": 7.046006202697754, + "learning_rate": 1.6367358742026605e-05, + "loss": 1.6642, + "step": 86810 + }, + { + "epoch": 0.5456838135643156, + "grad_norm": 7.1073503494262695, + "learning_rate": 1.6366939641081952e-05, + "loss": 1.6695, + "step": 86820 + }, + { + "epoch": 0.5457466658810127, + "grad_norm": 6.6842522621154785, + "learning_rate": 1.63665205401373e-05, + "loss": 1.5171, + "step": 86830 + }, + { + "epoch": 0.5458095181977098, + "grad_norm": 6.862412452697754, + "learning_rate": 1.6366101439192646e-05, + "loss": 1.6675, + "step": 86840 + }, + { + "epoch": 0.545872370514407, + "grad_norm": 7.693253040313721, + "learning_rate": 1.6365682338247993e-05, + "loss": 1.621, + "step": 86850 + }, + { + "epoch": 0.5459352228311041, + "grad_norm": 5.722990036010742, + "learning_rate": 1.636526323730334e-05, + "loss": 1.6731, + "step": 86860 + }, + { + "epoch": 
0.5459980751478012, + "grad_norm": 6.423402786254883, + "learning_rate": 1.6364844136358684e-05, + "loss": 1.7002, + "step": 86870 + }, + { + "epoch": 0.5460609274644983, + "grad_norm": 7.07973051071167, + "learning_rate": 1.636442503541403e-05, + "loss": 1.7934, + "step": 86880 + }, + { + "epoch": 0.5461237797811954, + "grad_norm": 6.4761643409729, + "learning_rate": 1.6364005934469378e-05, + "loss": 1.8783, + "step": 86890 + }, + { + "epoch": 0.5461866320978925, + "grad_norm": 7.523335933685303, + "learning_rate": 1.6363586833524725e-05, + "loss": 1.7059, + "step": 86900 + }, + { + "epoch": 0.5462494844145896, + "grad_norm": 9.280854225158691, + "learning_rate": 1.636316773258007e-05, + "loss": 1.6908, + "step": 86910 + }, + { + "epoch": 0.5463123367312867, + "grad_norm": 6.539122104644775, + "learning_rate": 1.6362748631635416e-05, + "loss": 1.8692, + "step": 86920 + }, + { + "epoch": 0.5463751890479838, + "grad_norm": 7.5889434814453125, + "learning_rate": 1.6362329530690763e-05, + "loss": 1.7153, + "step": 86930 + }, + { + "epoch": 0.5464380413646809, + "grad_norm": 6.818871974945068, + "learning_rate": 1.636191042974611e-05, + "loss": 1.9356, + "step": 86940 + }, + { + "epoch": 0.546500893681378, + "grad_norm": 6.093920707702637, + "learning_rate": 1.6361491328801454e-05, + "loss": 1.4935, + "step": 86950 + }, + { + "epoch": 0.5465637459980751, + "grad_norm": 5.412710189819336, + "learning_rate": 1.63610722278568e-05, + "loss": 1.6619, + "step": 86960 + }, + { + "epoch": 0.5466265983147722, + "grad_norm": 7.041706085205078, + "learning_rate": 1.6360653126912148e-05, + "loss": 1.7782, + "step": 86970 + }, + { + "epoch": 0.5466894506314693, + "grad_norm": 6.9528961181640625, + "learning_rate": 1.6360234025967495e-05, + "loss": 1.6705, + "step": 86980 + }, + { + "epoch": 0.5467523029481665, + "grad_norm": 6.929596424102783, + "learning_rate": 1.6359814925022842e-05, + "loss": 1.7646, + "step": 86990 + }, + { + "epoch": 0.5468151552648636, + "grad_norm": 
6.04166316986084, + "learning_rate": 1.635939582407819e-05, + "loss": 1.7182, + "step": 87000 + }, + { + "epoch": 0.5468780075815607, + "grad_norm": 7.2244486808776855, + "learning_rate": 1.6358976723133536e-05, + "loss": 1.5931, + "step": 87010 + }, + { + "epoch": 0.5469408598982578, + "grad_norm": 6.446841716766357, + "learning_rate": 1.6358557622188883e-05, + "loss": 1.8676, + "step": 87020 + }, + { + "epoch": 0.5470037122149549, + "grad_norm": 7.067605018615723, + "learning_rate": 1.635813852124423e-05, + "loss": 1.7028, + "step": 87030 + }, + { + "epoch": 0.547066564531652, + "grad_norm": 6.6622724533081055, + "learning_rate": 1.6357719420299574e-05, + "loss": 1.5149, + "step": 87040 + }, + { + "epoch": 0.5471294168483491, + "grad_norm": 7.171753883361816, + "learning_rate": 1.635730031935492e-05, + "loss": 1.6603, + "step": 87050 + }, + { + "epoch": 0.5471922691650463, + "grad_norm": 6.751748561859131, + "learning_rate": 1.635688121841027e-05, + "loss": 1.6392, + "step": 87060 + }, + { + "epoch": 0.5472551214817434, + "grad_norm": 7.492148399353027, + "learning_rate": 1.6356462117465615e-05, + "loss": 1.815, + "step": 87070 + }, + { + "epoch": 0.5473179737984405, + "grad_norm": 7.068755626678467, + "learning_rate": 1.6356043016520962e-05, + "loss": 1.5841, + "step": 87080 + }, + { + "epoch": 0.5473808261151376, + "grad_norm": 6.7877278327941895, + "learning_rate": 1.6355623915576306e-05, + "loss": 1.7141, + "step": 87090 + }, + { + "epoch": 0.5474436784318347, + "grad_norm": 7.786384582519531, + "learning_rate": 1.6355204814631653e-05, + "loss": 1.6763, + "step": 87100 + }, + { + "epoch": 0.5475065307485318, + "grad_norm": 6.93129825592041, + "learning_rate": 1.6354785713687e-05, + "loss": 1.7916, + "step": 87110 + }, + { + "epoch": 0.547569383065229, + "grad_norm": 6.792569637298584, + "learning_rate": 1.6354366612742347e-05, + "loss": 1.7659, + "step": 87120 + }, + { + "epoch": 0.5476322353819261, + "grad_norm": 6.7060394287109375, + "learning_rate": 
1.635394751179769e-05, + "loss": 1.6655, + "step": 87130 + }, + { + "epoch": 0.5476950876986232, + "grad_norm": 7.103468418121338, + "learning_rate": 1.6353528410853038e-05, + "loss": 1.8176, + "step": 87140 + }, + { + "epoch": 0.5477579400153203, + "grad_norm": 6.279816627502441, + "learning_rate": 1.6353109309908385e-05, + "loss": 1.5718, + "step": 87150 + }, + { + "epoch": 0.5478207923320174, + "grad_norm": 6.506233215332031, + "learning_rate": 1.6352690208963732e-05, + "loss": 1.8861, + "step": 87160 + }, + { + "epoch": 0.5478836446487144, + "grad_norm": 8.267227172851562, + "learning_rate": 1.635227110801908e-05, + "loss": 1.6025, + "step": 87170 + }, + { + "epoch": 0.5479464969654115, + "grad_norm": 6.034891605377197, + "learning_rate": 1.6351852007074423e-05, + "loss": 1.7394, + "step": 87180 + }, + { + "epoch": 0.5480093492821086, + "grad_norm": 6.6851043701171875, + "learning_rate": 1.635143290612977e-05, + "loss": 1.4129, + "step": 87190 + }, + { + "epoch": 0.5480722015988058, + "grad_norm": 6.170161724090576, + "learning_rate": 1.6351013805185117e-05, + "loss": 1.7897, + "step": 87200 + }, + { + "epoch": 0.5481350539155029, + "grad_norm": 8.239502906799316, + "learning_rate": 1.6350594704240464e-05, + "loss": 1.6594, + "step": 87210 + }, + { + "epoch": 0.5481979062322, + "grad_norm": 8.009893417358398, + "learning_rate": 1.635017560329581e-05, + "loss": 1.9211, + "step": 87220 + }, + { + "epoch": 0.5482607585488971, + "grad_norm": 5.180368900299072, + "learning_rate": 1.634975650235116e-05, + "loss": 1.6533, + "step": 87230 + }, + { + "epoch": 0.5483236108655942, + "grad_norm": 5.979281425476074, + "learning_rate": 1.6349337401406505e-05, + "loss": 1.5581, + "step": 87240 + }, + { + "epoch": 0.5483864631822913, + "grad_norm": 5.247679233551025, + "learning_rate": 1.6348918300461853e-05, + "loss": 1.4401, + "step": 87250 + }, + { + "epoch": 0.5484493154989885, + "grad_norm": 5.989559173583984, + "learning_rate": 1.6348499199517196e-05, + "loss": 1.4667, + 
"step": 87260 + }, + { + "epoch": 0.5485121678156856, + "grad_norm": 7.64725399017334, + "learning_rate": 1.6348080098572543e-05, + "loss": 1.8236, + "step": 87270 + }, + { + "epoch": 0.5485750201323827, + "grad_norm": 6.5117011070251465, + "learning_rate": 1.634766099762789e-05, + "loss": 1.6047, + "step": 87280 + }, + { + "epoch": 0.5486378724490798, + "grad_norm": 6.34282922744751, + "learning_rate": 1.6347241896683237e-05, + "loss": 1.3005, + "step": 87290 + }, + { + "epoch": 0.5487007247657769, + "grad_norm": 7.494961261749268, + "learning_rate": 1.6346822795738584e-05, + "loss": 1.5657, + "step": 87300 + }, + { + "epoch": 0.548763577082474, + "grad_norm": 7.219995498657227, + "learning_rate": 1.6346403694793928e-05, + "loss": 1.7029, + "step": 87310 + }, + { + "epoch": 0.5488264293991711, + "grad_norm": 5.836889266967773, + "learning_rate": 1.6345984593849275e-05, + "loss": 1.8481, + "step": 87320 + }, + { + "epoch": 0.5488892817158683, + "grad_norm": 8.275524139404297, + "learning_rate": 1.6345565492904622e-05, + "loss": 1.868, + "step": 87330 + }, + { + "epoch": 0.5489521340325654, + "grad_norm": 5.790599822998047, + "learning_rate": 1.634514639195997e-05, + "loss": 1.5417, + "step": 87340 + }, + { + "epoch": 0.5490149863492625, + "grad_norm": 6.310551643371582, + "learning_rate": 1.6344727291015313e-05, + "loss": 1.8961, + "step": 87350 + }, + { + "epoch": 0.5490778386659596, + "grad_norm": 8.10694694519043, + "learning_rate": 1.634430819007066e-05, + "loss": 1.3983, + "step": 87360 + }, + { + "epoch": 0.5491406909826567, + "grad_norm": 6.893750190734863, + "learning_rate": 1.6343889089126007e-05, + "loss": 1.7624, + "step": 87370 + }, + { + "epoch": 0.5492035432993538, + "grad_norm": 7.259228229522705, + "learning_rate": 1.6343469988181354e-05, + "loss": 1.7395, + "step": 87380 + }, + { + "epoch": 0.549266395616051, + "grad_norm": 6.815544128417969, + "learning_rate": 1.63430508872367e-05, + "loss": 1.6337, + "step": 87390 + }, + { + "epoch": 
0.5493292479327481, + "grad_norm": 7.200207233428955, + "learning_rate": 1.634263178629205e-05, + "loss": 1.5261, + "step": 87400 + }, + { + "epoch": 0.5493921002494452, + "grad_norm": 7.834887504577637, + "learning_rate": 1.6342212685347395e-05, + "loss": 1.8426, + "step": 87410 + }, + { + "epoch": 0.5494549525661423, + "grad_norm": 7.845221519470215, + "learning_rate": 1.6341793584402743e-05, + "loss": 1.8138, + "step": 87420 + }, + { + "epoch": 0.5495178048828393, + "grad_norm": 6.522507667541504, + "learning_rate": 1.6341374483458086e-05, + "loss": 1.5218, + "step": 87430 + }, + { + "epoch": 0.5495806571995364, + "grad_norm": 7.382540225982666, + "learning_rate": 1.6340955382513433e-05, + "loss": 1.9934, + "step": 87440 + }, + { + "epoch": 0.5496435095162335, + "grad_norm": 6.693045616149902, + "learning_rate": 1.634053628156878e-05, + "loss": 1.6689, + "step": 87450 + }, + { + "epoch": 0.5497063618329306, + "grad_norm": 6.891082286834717, + "learning_rate": 1.6340117180624127e-05, + "loss": 1.7584, + "step": 87460 + }, + { + "epoch": 0.5497692141496278, + "grad_norm": 8.451995849609375, + "learning_rate": 1.6339698079679475e-05, + "loss": 1.8392, + "step": 87470 + }, + { + "epoch": 0.5498320664663249, + "grad_norm": 6.282824993133545, + "learning_rate": 1.633927897873482e-05, + "loss": 1.6657, + "step": 87480 + }, + { + "epoch": 0.549894918783022, + "grad_norm": 6.958793640136719, + "learning_rate": 1.6338859877790165e-05, + "loss": 1.7762, + "step": 87490 + }, + { + "epoch": 0.5499577710997191, + "grad_norm": 5.975584506988525, + "learning_rate": 1.6338440776845512e-05, + "loss": 1.8833, + "step": 87500 + }, + { + "epoch": 0.5500206234164162, + "grad_norm": 7.92507791519165, + "learning_rate": 1.633802167590086e-05, + "loss": 1.7359, + "step": 87510 + }, + { + "epoch": 0.5500834757331133, + "grad_norm": 7.211165428161621, + "learning_rate": 1.6337602574956206e-05, + "loss": 1.5616, + "step": 87520 + }, + { + "epoch": 0.5501463280498105, + "grad_norm": 
6.03791618347168, + "learning_rate": 1.633718347401155e-05, + "loss": 1.7102, + "step": 87530 + }, + { + "epoch": 0.5502091803665076, + "grad_norm": 6.354442596435547, + "learning_rate": 1.6336764373066897e-05, + "loss": 1.5821, + "step": 87540 + }, + { + "epoch": 0.5502720326832047, + "grad_norm": 5.757572174072266, + "learning_rate": 1.6336345272122244e-05, + "loss": 1.7117, + "step": 87550 + }, + { + "epoch": 0.5503348849999018, + "grad_norm": 6.304782390594482, + "learning_rate": 1.633592617117759e-05, + "loss": 1.7086, + "step": 87560 + }, + { + "epoch": 0.5503977373165989, + "grad_norm": 6.215713977813721, + "learning_rate": 1.6335507070232935e-05, + "loss": 1.4925, + "step": 87570 + }, + { + "epoch": 0.550460589633296, + "grad_norm": 5.681164264678955, + "learning_rate": 1.6335087969288282e-05, + "loss": 1.5901, + "step": 87580 + }, + { + "epoch": 0.5505234419499931, + "grad_norm": 7.626335144042969, + "learning_rate": 1.633466886834363e-05, + "loss": 1.8527, + "step": 87590 + }, + { + "epoch": 0.5505862942666903, + "grad_norm": 6.560548782348633, + "learning_rate": 1.6334249767398976e-05, + "loss": 1.9203, + "step": 87600 + }, + { + "epoch": 0.5506491465833874, + "grad_norm": 7.164431571960449, + "learning_rate": 1.6333830666454323e-05, + "loss": 1.5093, + "step": 87610 + }, + { + "epoch": 0.5507119989000845, + "grad_norm": 6.229931354522705, + "learning_rate": 1.633341156550967e-05, + "loss": 1.6094, + "step": 87620 + }, + { + "epoch": 0.5507748512167816, + "grad_norm": 6.629180908203125, + "learning_rate": 1.6332992464565017e-05, + "loss": 1.7345, + "step": 87630 + }, + { + "epoch": 0.5508377035334787, + "grad_norm": 7.034588813781738, + "learning_rate": 1.6332573363620365e-05, + "loss": 1.7259, + "step": 87640 + }, + { + "epoch": 0.5509005558501758, + "grad_norm": 7.558387279510498, + "learning_rate": 1.633215426267571e-05, + "loss": 1.7842, + "step": 87650 + }, + { + "epoch": 0.550963408166873, + "grad_norm": 7.3063273429870605, + "learning_rate": 
1.6331735161731055e-05, + "loss": 1.8985, + "step": 87660 + }, + { + "epoch": 0.5510262604835701, + "grad_norm": 5.6917266845703125, + "learning_rate": 1.6331316060786402e-05, + "loss": 1.7987, + "step": 87670 + }, + { + "epoch": 0.5510891128002671, + "grad_norm": 6.492855548858643, + "learning_rate": 1.633089695984175e-05, + "loss": 1.9732, + "step": 87680 + }, + { + "epoch": 0.5511519651169642, + "grad_norm": 7.414215087890625, + "learning_rate": 1.6330477858897097e-05, + "loss": 1.7681, + "step": 87690 + }, + { + "epoch": 0.5512148174336613, + "grad_norm": 6.806081771850586, + "learning_rate": 1.6330058757952444e-05, + "loss": 1.642, + "step": 87700 + }, + { + "epoch": 0.5512776697503584, + "grad_norm": 6.732658386230469, + "learning_rate": 1.6329639657007787e-05, + "loss": 1.9905, + "step": 87710 + }, + { + "epoch": 0.5513405220670555, + "grad_norm": 6.367004871368408, + "learning_rate": 1.6329220556063134e-05, + "loss": 1.87, + "step": 87720 + }, + { + "epoch": 0.5514033743837526, + "grad_norm": 6.659083843231201, + "learning_rate": 1.632880145511848e-05, + "loss": 1.6762, + "step": 87730 + }, + { + "epoch": 0.5514662267004498, + "grad_norm": 6.704216480255127, + "learning_rate": 1.632838235417383e-05, + "loss": 1.9091, + "step": 87740 + }, + { + "epoch": 0.5515290790171469, + "grad_norm": 6.720300197601318, + "learning_rate": 1.6327963253229172e-05, + "loss": 1.5238, + "step": 87750 + }, + { + "epoch": 0.551591931333844, + "grad_norm": 8.09980583190918, + "learning_rate": 1.632754415228452e-05, + "loss": 2.0008, + "step": 87760 + }, + { + "epoch": 0.5516547836505411, + "grad_norm": 6.566691875457764, + "learning_rate": 1.6327125051339866e-05, + "loss": 1.7393, + "step": 87770 + }, + { + "epoch": 0.5517176359672382, + "grad_norm": 5.294040203094482, + "learning_rate": 1.6326705950395213e-05, + "loss": 1.6083, + "step": 87780 + }, + { + "epoch": 0.5517804882839353, + "grad_norm": 6.389209747314453, + "learning_rate": 1.632628684945056e-05, + "loss": 1.7037, + 
"step": 87790 + }, + { + "epoch": 0.5518433406006324, + "grad_norm": 7.676865100860596, + "learning_rate": 1.6325867748505908e-05, + "loss": 1.6174, + "step": 87800 + }, + { + "epoch": 0.5519061929173296, + "grad_norm": 6.478692531585693, + "learning_rate": 1.632544864756125e-05, + "loss": 1.5484, + "step": 87810 + }, + { + "epoch": 0.5519690452340267, + "grad_norm": 7.029147148132324, + "learning_rate": 1.6325029546616598e-05, + "loss": 1.6543, + "step": 87820 + }, + { + "epoch": 0.5520318975507238, + "grad_norm": 7.854307651519775, + "learning_rate": 1.6324610445671945e-05, + "loss": 1.6173, + "step": 87830 + }, + { + "epoch": 0.5520947498674209, + "grad_norm": 6.4751877784729, + "learning_rate": 1.6324191344727292e-05, + "loss": 1.7093, + "step": 87840 + }, + { + "epoch": 0.552157602184118, + "grad_norm": 5.035534858703613, + "learning_rate": 1.632377224378264e-05, + "loss": 1.7226, + "step": 87850 + }, + { + "epoch": 0.5522204545008151, + "grad_norm": 6.849821090698242, + "learning_rate": 1.6323353142837987e-05, + "loss": 1.8909, + "step": 87860 + }, + { + "epoch": 0.5522833068175123, + "grad_norm": 7.671096324920654, + "learning_rate": 1.6322934041893334e-05, + "loss": 1.51, + "step": 87870 + }, + { + "epoch": 0.5523461591342094, + "grad_norm": 6.942525386810303, + "learning_rate": 1.6322514940948677e-05, + "loss": 1.7381, + "step": 87880 + }, + { + "epoch": 0.5524090114509065, + "grad_norm": 6.023004531860352, + "learning_rate": 1.6322095840004024e-05, + "loss": 1.7272, + "step": 87890 + }, + { + "epoch": 0.5524718637676036, + "grad_norm": 6.889723300933838, + "learning_rate": 1.632167673905937e-05, + "loss": 1.4791, + "step": 87900 + }, + { + "epoch": 0.5525347160843007, + "grad_norm": 6.181000232696533, + "learning_rate": 1.632125763811472e-05, + "loss": 1.6955, + "step": 87910 + }, + { + "epoch": 0.5525975684009978, + "grad_norm": 5.688227653503418, + "learning_rate": 1.6320838537170066e-05, + "loss": 1.7663, + "step": 87920 + }, + { + "epoch": 
0.552660420717695, + "grad_norm": 6.987216949462891, + "learning_rate": 1.632041943622541e-05, + "loss": 1.5974, + "step": 87930 + }, + { + "epoch": 0.552723273034392, + "grad_norm": 5.845030784606934, + "learning_rate": 1.6320000335280756e-05, + "loss": 1.8496, + "step": 87940 + }, + { + "epoch": 0.5527861253510891, + "grad_norm": 6.893259048461914, + "learning_rate": 1.6319581234336103e-05, + "loss": 1.7192, + "step": 87950 + }, + { + "epoch": 0.5528489776677862, + "grad_norm": 6.025664806365967, + "learning_rate": 1.631916213339145e-05, + "loss": 1.5366, + "step": 87960 + }, + { + "epoch": 0.5529118299844833, + "grad_norm": 7.317282676696777, + "learning_rate": 1.6318743032446794e-05, + "loss": 1.6679, + "step": 87970 + }, + { + "epoch": 0.5529746823011804, + "grad_norm": 6.57348108291626, + "learning_rate": 1.631832393150214e-05, + "loss": 1.842, + "step": 87980 + }, + { + "epoch": 0.5530375346178775, + "grad_norm": 6.035397052764893, + "learning_rate": 1.631790483055749e-05, + "loss": 1.4517, + "step": 87990 + }, + { + "epoch": 0.5531003869345746, + "grad_norm": 6.152946472167969, + "learning_rate": 1.6317485729612835e-05, + "loss": 1.533, + "step": 88000 + }, + { + "epoch": 0.5531632392512718, + "grad_norm": 6.46749210357666, + "learning_rate": 1.6317066628668182e-05, + "loss": 1.5675, + "step": 88010 + }, + { + "epoch": 0.5532260915679689, + "grad_norm": 7.725470542907715, + "learning_rate": 1.631664752772353e-05, + "loss": 1.4354, + "step": 88020 + }, + { + "epoch": 0.553288943884666, + "grad_norm": 6.083590984344482, + "learning_rate": 1.6316228426778877e-05, + "loss": 1.8766, + "step": 88030 + }, + { + "epoch": 0.5533517962013631, + "grad_norm": 6.757826805114746, + "learning_rate": 1.6315809325834224e-05, + "loss": 1.5701, + "step": 88040 + }, + { + "epoch": 0.5534146485180602, + "grad_norm": 6.432161331176758, + "learning_rate": 1.6315390224889567e-05, + "loss": 1.7546, + "step": 88050 + }, + { + "epoch": 0.5534775008347573, + "grad_norm": 
6.68278169631958, + "learning_rate": 1.6314971123944914e-05, + "loss": 1.5692, + "step": 88060 + }, + { + "epoch": 0.5535403531514544, + "grad_norm": 7.217104434967041, + "learning_rate": 1.631455202300026e-05, + "loss": 1.548, + "step": 88070 + }, + { + "epoch": 0.5536032054681516, + "grad_norm": 5.770365238189697, + "learning_rate": 1.6314174832150073e-05, + "loss": 1.5295, + "step": 88080 + }, + { + "epoch": 0.5536660577848487, + "grad_norm": 7.724226951599121, + "learning_rate": 1.631375573120542e-05, + "loss": 1.6381, + "step": 88090 + }, + { + "epoch": 0.5537289101015458, + "grad_norm": 8.098963737487793, + "learning_rate": 1.6313336630260767e-05, + "loss": 1.6756, + "step": 88100 + }, + { + "epoch": 0.5537917624182429, + "grad_norm": 6.411672592163086, + "learning_rate": 1.6312917529316114e-05, + "loss": 1.5901, + "step": 88110 + }, + { + "epoch": 0.55385461473494, + "grad_norm": 7.5321574211120605, + "learning_rate": 1.6312498428371458e-05, + "loss": 1.7832, + "step": 88120 + }, + { + "epoch": 0.5539174670516371, + "grad_norm": 6.153202056884766, + "learning_rate": 1.6312079327426805e-05, + "loss": 1.5185, + "step": 88130 + }, + { + "epoch": 0.5539803193683343, + "grad_norm": 6.171274662017822, + "learning_rate": 1.6311660226482152e-05, + "loss": 1.5899, + "step": 88140 + }, + { + "epoch": 0.5540431716850314, + "grad_norm": 4.977878093719482, + "learning_rate": 1.63112411255375e-05, + "loss": 1.4111, + "step": 88150 + }, + { + "epoch": 0.5541060240017285, + "grad_norm": 6.700924873352051, + "learning_rate": 1.6310822024592846e-05, + "loss": 1.7089, + "step": 88160 + }, + { + "epoch": 0.5541688763184256, + "grad_norm": 6.37979793548584, + "learning_rate": 1.6310402923648193e-05, + "loss": 1.5482, + "step": 88170 + }, + { + "epoch": 0.5542317286351227, + "grad_norm": 6.379642963409424, + "learning_rate": 1.6309983822703537e-05, + "loss": 1.6192, + "step": 88180 + }, + { + "epoch": 0.5542945809518198, + "grad_norm": 6.328685760498047, + "learning_rate": 
1.6309564721758884e-05, + "loss": 1.8287, + "step": 88190 + }, + { + "epoch": 0.5543574332685168, + "grad_norm": 7.2171149253845215, + "learning_rate": 1.630914562081423e-05, + "loss": 1.5507, + "step": 88200 + }, + { + "epoch": 0.554420285585214, + "grad_norm": 6.692591190338135, + "learning_rate": 1.6308726519869578e-05, + "loss": 1.5776, + "step": 88210 + }, + { + "epoch": 0.5544831379019111, + "grad_norm": 6.94549036026001, + "learning_rate": 1.6308307418924925e-05, + "loss": 1.882, + "step": 88220 + }, + { + "epoch": 0.5545459902186082, + "grad_norm": 5.682509899139404, + "learning_rate": 1.630788831798027e-05, + "loss": 1.648, + "step": 88230 + }, + { + "epoch": 0.5546088425353053, + "grad_norm": 6.2639055252075195, + "learning_rate": 1.6307469217035616e-05, + "loss": 1.7571, + "step": 88240 + }, + { + "epoch": 0.5546716948520024, + "grad_norm": 6.274423599243164, + "learning_rate": 1.6307050116090963e-05, + "loss": 1.7917, + "step": 88250 + }, + { + "epoch": 0.5547345471686995, + "grad_norm": 5.863760471343994, + "learning_rate": 1.630663101514631e-05, + "loss": 1.7114, + "step": 88260 + }, + { + "epoch": 0.5547973994853966, + "grad_norm": 6.9797468185424805, + "learning_rate": 1.6306211914201654e-05, + "loss": 1.6295, + "step": 88270 + }, + { + "epoch": 0.5548602518020938, + "grad_norm": 7.441795349121094, + "learning_rate": 1.6305792813257e-05, + "loss": 1.6251, + "step": 88280 + }, + { + "epoch": 0.5549231041187909, + "grad_norm": 6.823574066162109, + "learning_rate": 1.6305373712312348e-05, + "loss": 1.4674, + "step": 88290 + }, + { + "epoch": 0.554985956435488, + "grad_norm": 6.422065258026123, + "learning_rate": 1.6304954611367695e-05, + "loss": 1.765, + "step": 88300 + }, + { + "epoch": 0.5550488087521851, + "grad_norm": 6.555672645568848, + "learning_rate": 1.6304535510423042e-05, + "loss": 1.7314, + "step": 88310 + }, + { + "epoch": 0.5551116610688822, + "grad_norm": 6.252634525299072, + "learning_rate": 1.630411640947839e-05, + "loss": 1.5402, + 
"step": 88320 + }, + { + "epoch": 0.5551745133855793, + "grad_norm": 6.4778594970703125, + "learning_rate": 1.6303697308533736e-05, + "loss": 1.6508, + "step": 88330 + }, + { + "epoch": 0.5552373657022764, + "grad_norm": 6.431690692901611, + "learning_rate": 1.6303278207589083e-05, + "loss": 1.8059, + "step": 88340 + }, + { + "epoch": 0.5553002180189736, + "grad_norm": 6.253882884979248, + "learning_rate": 1.630285910664443e-05, + "loss": 1.7937, + "step": 88350 + }, + { + "epoch": 0.5553630703356707, + "grad_norm": 7.070196628570557, + "learning_rate": 1.6302440005699774e-05, + "loss": 1.9511, + "step": 88360 + }, + { + "epoch": 0.5554259226523678, + "grad_norm": 6.050371170043945, + "learning_rate": 1.630202090475512e-05, + "loss": 1.4563, + "step": 88370 + }, + { + "epoch": 0.5554887749690649, + "grad_norm": 7.460027694702148, + "learning_rate": 1.6301601803810468e-05, + "loss": 1.8283, + "step": 88380 + }, + { + "epoch": 0.555551627285762, + "grad_norm": 6.309556484222412, + "learning_rate": 1.6301182702865815e-05, + "loss": 1.7847, + "step": 88390 + }, + { + "epoch": 0.5556144796024591, + "grad_norm": 7.3537139892578125, + "learning_rate": 1.630076360192116e-05, + "loss": 1.7832, + "step": 88400 + }, + { + "epoch": 0.5556773319191562, + "grad_norm": 6.594983100891113, + "learning_rate": 1.6300344500976506e-05, + "loss": 1.7553, + "step": 88410 + }, + { + "epoch": 0.5557401842358534, + "grad_norm": 7.0119428634643555, + "learning_rate": 1.6299925400031853e-05, + "loss": 1.6977, + "step": 88420 + }, + { + "epoch": 0.5558030365525505, + "grad_norm": 7.39605188369751, + "learning_rate": 1.62995062990872e-05, + "loss": 1.7559, + "step": 88430 + }, + { + "epoch": 0.5558658888692476, + "grad_norm": 5.62230110168457, + "learning_rate": 1.6299087198142547e-05, + "loss": 1.4533, + "step": 88440 + }, + { + "epoch": 0.5559287411859446, + "grad_norm": 5.8085503578186035, + "learning_rate": 1.629866809719789e-05, + "loss": 1.6466, + "step": 88450 + }, + { + "epoch": 
0.5559915935026417, + "grad_norm": 6.4032511711120605, + "learning_rate": 1.6298248996253238e-05, + "loss": 1.7441, + "step": 88460 + }, + { + "epoch": 0.5560544458193388, + "grad_norm": 7.032168388366699, + "learning_rate": 1.6297829895308585e-05, + "loss": 1.9077, + "step": 88470 + }, + { + "epoch": 0.5561172981360359, + "grad_norm": 6.47052001953125, + "learning_rate": 1.6297410794363932e-05, + "loss": 1.6508, + "step": 88480 + }, + { + "epoch": 0.5561801504527331, + "grad_norm": 6.570706844329834, + "learning_rate": 1.629699169341928e-05, + "loss": 1.7408, + "step": 88490 + }, + { + "epoch": 0.5562430027694302, + "grad_norm": 6.553955078125, + "learning_rate": 1.6296572592474623e-05, + "loss": 1.9257, + "step": 88500 + }, + { + "epoch": 0.5563058550861273, + "grad_norm": 6.742047309875488, + "learning_rate": 1.629615349152997e-05, + "loss": 1.6006, + "step": 88510 + }, + { + "epoch": 0.5563687074028244, + "grad_norm": 6.896884441375732, + "learning_rate": 1.6295734390585317e-05, + "loss": 1.6857, + "step": 88520 + }, + { + "epoch": 0.5564315597195215, + "grad_norm": 5.0939202308654785, + "learning_rate": 1.6295315289640664e-05, + "loss": 1.4175, + "step": 88530 + }, + { + "epoch": 0.5564944120362186, + "grad_norm": 7.0617547035217285, + "learning_rate": 1.629489618869601e-05, + "loss": 1.6025, + "step": 88540 + }, + { + "epoch": 0.5565572643529157, + "grad_norm": 6.168315887451172, + "learning_rate": 1.6294477087751358e-05, + "loss": 1.6803, + "step": 88550 + }, + { + "epoch": 0.5566201166696129, + "grad_norm": 6.095003604888916, + "learning_rate": 1.6294057986806705e-05, + "loss": 1.5083, + "step": 88560 + }, + { + "epoch": 0.55668296898631, + "grad_norm": 6.832040309906006, + "learning_rate": 1.6293638885862052e-05, + "loss": 1.5936, + "step": 88570 + }, + { + "epoch": 0.5567458213030071, + "grad_norm": 7.0723557472229, + "learning_rate": 1.6293219784917396e-05, + "loss": 1.5651, + "step": 88580 + }, + { + "epoch": 0.5568086736197042, + "grad_norm": 
6.6852850914001465, + "learning_rate": 1.6292800683972743e-05, + "loss": 1.6691, + "step": 88590 + }, + { + "epoch": 0.5568715259364013, + "grad_norm": 6.6617231369018555, + "learning_rate": 1.629238158302809e-05, + "loss": 1.7027, + "step": 88600 + }, + { + "epoch": 0.5569343782530984, + "grad_norm": 5.725902557373047, + "learning_rate": 1.6291962482083437e-05, + "loss": 1.6941, + "step": 88610 + }, + { + "epoch": 0.5569972305697956, + "grad_norm": 7.879228591918945, + "learning_rate": 1.629154338113878e-05, + "loss": 1.6248, + "step": 88620 + }, + { + "epoch": 0.5570600828864927, + "grad_norm": 6.585484504699707, + "learning_rate": 1.6291124280194128e-05, + "loss": 1.6381, + "step": 88630 + }, + { + "epoch": 0.5571229352031898, + "grad_norm": 6.741808891296387, + "learning_rate": 1.6290705179249475e-05, + "loss": 1.6211, + "step": 88640 + }, + { + "epoch": 0.5571857875198869, + "grad_norm": 6.813026428222656, + "learning_rate": 1.6290286078304822e-05, + "loss": 1.6066, + "step": 88650 + }, + { + "epoch": 0.557248639836584, + "grad_norm": 6.790803909301758, + "learning_rate": 1.628986697736017e-05, + "loss": 1.8055, + "step": 88660 + }, + { + "epoch": 0.5573114921532811, + "grad_norm": 6.520722389221191, + "learning_rate": 1.6289447876415513e-05, + "loss": 1.6868, + "step": 88670 + }, + { + "epoch": 0.5573743444699782, + "grad_norm": 6.6449174880981445, + "learning_rate": 1.628902877547086e-05, + "loss": 1.7712, + "step": 88680 + }, + { + "epoch": 0.5574371967866754, + "grad_norm": 5.842750549316406, + "learning_rate": 1.6288609674526207e-05, + "loss": 1.3447, + "step": 88690 + }, + { + "epoch": 0.5575000491033725, + "grad_norm": 6.170193195343018, + "learning_rate": 1.6288190573581554e-05, + "loss": 1.7034, + "step": 88700 + }, + { + "epoch": 0.5575629014200695, + "grad_norm": 6.261218070983887, + "learning_rate": 1.62877714726369e-05, + "loss": 1.5003, + "step": 88710 + }, + { + "epoch": 0.5576257537367666, + "grad_norm": 6.723862648010254, + "learning_rate": 
1.6287352371692248e-05, + "loss": 1.5344, + "step": 88720 + }, + { + "epoch": 0.5576886060534637, + "grad_norm": 6.398962497711182, + "learning_rate": 1.6286933270747595e-05, + "loss": 1.6921, + "step": 88730 + }, + { + "epoch": 0.5577514583701608, + "grad_norm": 6.833871364593506, + "learning_rate": 1.6286514169802942e-05, + "loss": 1.8374, + "step": 88740 + }, + { + "epoch": 0.5578143106868579, + "grad_norm": 6.804493427276611, + "learning_rate": 1.6286095068858286e-05, + "loss": 1.7986, + "step": 88750 + }, + { + "epoch": 0.557877163003555, + "grad_norm": 6.8638787269592285, + "learning_rate": 1.6285675967913633e-05, + "loss": 1.5772, + "step": 88760 + }, + { + "epoch": 0.5579400153202522, + "grad_norm": 5.638939380645752, + "learning_rate": 1.628525686696898e-05, + "loss": 1.6406, + "step": 88770 + }, + { + "epoch": 0.5580028676369493, + "grad_norm": 9.554666519165039, + "learning_rate": 1.6284837766024327e-05, + "loss": 1.7268, + "step": 88780 + }, + { + "epoch": 0.5580657199536464, + "grad_norm": 6.101341724395752, + "learning_rate": 1.6284418665079674e-05, + "loss": 1.5086, + "step": 88790 + }, + { + "epoch": 0.5581285722703435, + "grad_norm": 6.6777520179748535, + "learning_rate": 1.6283999564135018e-05, + "loss": 1.6133, + "step": 88800 + }, + { + "epoch": 0.5581914245870406, + "grad_norm": 7.209829330444336, + "learning_rate": 1.6283580463190365e-05, + "loss": 1.8295, + "step": 88810 + }, + { + "epoch": 0.5582542769037377, + "grad_norm": 6.874991416931152, + "learning_rate": 1.6283161362245712e-05, + "loss": 1.8068, + "step": 88820 + }, + { + "epoch": 0.5583171292204349, + "grad_norm": 7.330787181854248, + "learning_rate": 1.628274226130106e-05, + "loss": 1.6279, + "step": 88830 + }, + { + "epoch": 0.558379981537132, + "grad_norm": 6.485179901123047, + "learning_rate": 1.6282323160356406e-05, + "loss": 1.6222, + "step": 88840 + }, + { + "epoch": 0.5584428338538291, + "grad_norm": 6.6356329917907715, + "learning_rate": 1.628190405941175e-05, + "loss": 
1.9118, + "step": 88850 + }, + { + "epoch": 0.5585056861705262, + "grad_norm": 6.619465351104736, + "learning_rate": 1.6281484958467097e-05, + "loss": 1.8015, + "step": 88860 + }, + { + "epoch": 0.5585685384872233, + "grad_norm": 6.195246696472168, + "learning_rate": 1.6281065857522444e-05, + "loss": 1.6933, + "step": 88870 + }, + { + "epoch": 0.5586313908039204, + "grad_norm": 6.337784767150879, + "learning_rate": 1.628064675657779e-05, + "loss": 1.4796, + "step": 88880 + }, + { + "epoch": 0.5586942431206176, + "grad_norm": 5.8104448318481445, + "learning_rate": 1.6280227655633135e-05, + "loss": 1.9172, + "step": 88890 + }, + { + "epoch": 0.5587570954373147, + "grad_norm": 5.997087478637695, + "learning_rate": 1.6279808554688482e-05, + "loss": 1.7153, + "step": 88900 + }, + { + "epoch": 0.5588199477540118, + "grad_norm": 5.885451316833496, + "learning_rate": 1.627938945374383e-05, + "loss": 1.5284, + "step": 88910 + }, + { + "epoch": 0.5588828000707089, + "grad_norm": 7.631045818328857, + "learning_rate": 1.6278970352799176e-05, + "loss": 1.7715, + "step": 88920 + }, + { + "epoch": 0.558945652387406, + "grad_norm": 6.923070430755615, + "learning_rate": 1.6278551251854523e-05, + "loss": 1.5471, + "step": 88930 + }, + { + "epoch": 0.5590085047041031, + "grad_norm": 6.820895671844482, + "learning_rate": 1.627813215090987e-05, + "loss": 1.6448, + "step": 88940 + }, + { + "epoch": 0.5590713570208002, + "grad_norm": 6.700250148773193, + "learning_rate": 1.6277713049965217e-05, + "loss": 1.6986, + "step": 88950 + }, + { + "epoch": 0.5591342093374972, + "grad_norm": 8.055480003356934, + "learning_rate": 1.6277293949020564e-05, + "loss": 1.5795, + "step": 88960 + }, + { + "epoch": 0.5591970616541944, + "grad_norm": 6.974710464477539, + "learning_rate": 1.627687484807591e-05, + "loss": 1.816, + "step": 88970 + }, + { + "epoch": 0.5592599139708915, + "grad_norm": 6.205530166625977, + "learning_rate": 1.6276455747131255e-05, + "loss": 1.6085, + "step": 88980 + }, + { + 
"epoch": 0.5593227662875886, + "grad_norm": 6.516695499420166, + "learning_rate": 1.6276036646186602e-05, + "loss": 1.822, + "step": 88990 + }, + { + "epoch": 0.5593856186042857, + "grad_norm": 7.034592151641846, + "learning_rate": 1.627561754524195e-05, + "loss": 1.7036, + "step": 89000 + }, + { + "epoch": 0.5594484709209828, + "grad_norm": 6.519965171813965, + "learning_rate": 1.6275198444297296e-05, + "loss": 1.6152, + "step": 89010 + }, + { + "epoch": 0.5595113232376799, + "grad_norm": 6.692155361175537, + "learning_rate": 1.627477934335264e-05, + "loss": 1.8324, + "step": 89020 + }, + { + "epoch": 0.559574175554377, + "grad_norm": 5.974812030792236, + "learning_rate": 1.6274360242407987e-05, + "loss": 1.6236, + "step": 89030 + }, + { + "epoch": 0.5596370278710742, + "grad_norm": 6.819674491882324, + "learning_rate": 1.6273941141463334e-05, + "loss": 1.7876, + "step": 89040 + }, + { + "epoch": 0.5596998801877713, + "grad_norm": 5.531889915466309, + "learning_rate": 1.627352204051868e-05, + "loss": 1.8333, + "step": 89050 + }, + { + "epoch": 0.5597627325044684, + "grad_norm": 6.38810920715332, + "learning_rate": 1.6273102939574028e-05, + "loss": 1.582, + "step": 89060 + }, + { + "epoch": 0.5598255848211655, + "grad_norm": 6.805556774139404, + "learning_rate": 1.6272683838629372e-05, + "loss": 1.6396, + "step": 89070 + }, + { + "epoch": 0.5598884371378626, + "grad_norm": 6.56950044631958, + "learning_rate": 1.627226473768472e-05, + "loss": 1.4835, + "step": 89080 + }, + { + "epoch": 0.5599512894545597, + "grad_norm": 6.350771903991699, + "learning_rate": 1.6271845636740066e-05, + "loss": 1.5145, + "step": 89090 + }, + { + "epoch": 0.5600141417712569, + "grad_norm": 5.643241882324219, + "learning_rate": 1.6271426535795413e-05, + "loss": 1.728, + "step": 89100 + }, + { + "epoch": 0.560076994087954, + "grad_norm": 6.5237321853637695, + "learning_rate": 1.627100743485076e-05, + "loss": 1.5731, + "step": 89110 + }, + { + "epoch": 0.5601398464046511, + "grad_norm": 
6.321384429931641, + "learning_rate": 1.6270588333906104e-05, + "loss": 1.6608, + "step": 89120 + }, + { + "epoch": 0.5602026987213482, + "grad_norm": 6.6404829025268555, + "learning_rate": 1.627016923296145e-05, + "loss": 1.7645, + "step": 89130 + }, + { + "epoch": 0.5602655510380453, + "grad_norm": 6.8443121910095215, + "learning_rate": 1.6269750132016798e-05, + "loss": 1.8534, + "step": 89140 + }, + { + "epoch": 0.5603284033547424, + "grad_norm": 7.051900863647461, + "learning_rate": 1.6269331031072145e-05, + "loss": 1.4937, + "step": 89150 + }, + { + "epoch": 0.5603912556714395, + "grad_norm": 6.721617221832275, + "learning_rate": 1.6268911930127492e-05, + "loss": 1.635, + "step": 89160 + }, + { + "epoch": 0.5604541079881367, + "grad_norm": 7.16155481338501, + "learning_rate": 1.626849282918284e-05, + "loss": 1.7732, + "step": 89170 + }, + { + "epoch": 0.5605169603048338, + "grad_norm": 9.203920364379883, + "learning_rate": 1.6268073728238186e-05, + "loss": 2.0461, + "step": 89180 + }, + { + "epoch": 0.5605798126215309, + "grad_norm": 6.358094215393066, + "learning_rate": 1.6267654627293533e-05, + "loss": 1.6883, + "step": 89190 + }, + { + "epoch": 0.560642664938228, + "grad_norm": 7.03914737701416, + "learning_rate": 1.6267235526348877e-05, + "loss": 1.6068, + "step": 89200 + }, + { + "epoch": 0.5607055172549251, + "grad_norm": 6.174300670623779, + "learning_rate": 1.6266816425404224e-05, + "loss": 1.7287, + "step": 89210 + }, + { + "epoch": 0.5607683695716221, + "grad_norm": 6.269082069396973, + "learning_rate": 1.626639732445957e-05, + "loss": 1.6998, + "step": 89220 + }, + { + "epoch": 0.5608312218883192, + "grad_norm": 5.566357135772705, + "learning_rate": 1.6265978223514918e-05, + "loss": 1.3758, + "step": 89230 + }, + { + "epoch": 0.5608940742050164, + "grad_norm": 7.715774059295654, + "learning_rate": 1.6265559122570265e-05, + "loss": 1.5952, + "step": 89240 + }, + { + "epoch": 0.5609569265217135, + "grad_norm": 6.553233623504639, + "learning_rate": 
1.626514002162561e-05, + "loss": 1.5755, + "step": 89250 + }, + { + "epoch": 0.5610197788384106, + "grad_norm": 6.500826835632324, + "learning_rate": 1.6264720920680956e-05, + "loss": 1.6818, + "step": 89260 + }, + { + "epoch": 0.5610826311551077, + "grad_norm": 7.091058254241943, + "learning_rate": 1.6264301819736303e-05, + "loss": 1.9517, + "step": 89270 + }, + { + "epoch": 0.5611454834718048, + "grad_norm": 6.283771514892578, + "learning_rate": 1.626388271879165e-05, + "loss": 1.7043, + "step": 89280 + }, + { + "epoch": 0.5612083357885019, + "grad_norm": 6.668854236602783, + "learning_rate": 1.6263463617846994e-05, + "loss": 1.5439, + "step": 89290 + }, + { + "epoch": 0.561271188105199, + "grad_norm": 5.6727614402771, + "learning_rate": 1.626304451690234e-05, + "loss": 1.5474, + "step": 89300 + }, + { + "epoch": 0.5613340404218962, + "grad_norm": 6.778054714202881, + "learning_rate": 1.6262625415957688e-05, + "loss": 1.8569, + "step": 89310 + }, + { + "epoch": 0.5613968927385933, + "grad_norm": 5.522119045257568, + "learning_rate": 1.6262206315013035e-05, + "loss": 1.4112, + "step": 89320 + }, + { + "epoch": 0.5614597450552904, + "grad_norm": 6.095500469207764, + "learning_rate": 1.6261787214068382e-05, + "loss": 1.6091, + "step": 89330 + }, + { + "epoch": 0.5615225973719875, + "grad_norm": 6.398715496063232, + "learning_rate": 1.626136811312373e-05, + "loss": 1.4782, + "step": 89340 + }, + { + "epoch": 0.5615854496886846, + "grad_norm": 6.670832633972168, + "learning_rate": 1.6260949012179076e-05, + "loss": 1.5155, + "step": 89350 + }, + { + "epoch": 0.5616483020053817, + "grad_norm": 6.177496910095215, + "learning_rate": 1.6260529911234423e-05, + "loss": 1.6604, + "step": 89360 + }, + { + "epoch": 0.5617111543220789, + "grad_norm": 6.3406524658203125, + "learning_rate": 1.6260110810289767e-05, + "loss": 1.7501, + "step": 89370 + }, + { + "epoch": 0.561774006638776, + "grad_norm": 6.5295939445495605, + "learning_rate": 1.6259691709345114e-05, + "loss": 1.7353, 
+ "step": 89380 + }, + { + "epoch": 0.5618368589554731, + "grad_norm": 5.897647380828857, + "learning_rate": 1.625927260840046e-05, + "loss": 1.8933, + "step": 89390 + }, + { + "epoch": 0.5618997112721702, + "grad_norm": 4.999483585357666, + "learning_rate": 1.6258853507455808e-05, + "loss": 1.6702, + "step": 89400 + }, + { + "epoch": 0.5619625635888673, + "grad_norm": 6.448941230773926, + "learning_rate": 1.6258434406511155e-05, + "loss": 1.6223, + "step": 89410 + }, + { + "epoch": 0.5620254159055644, + "grad_norm": 6.9571123123168945, + "learning_rate": 1.62580153055665e-05, + "loss": 1.7999, + "step": 89420 + }, + { + "epoch": 0.5620882682222615, + "grad_norm": 6.229642868041992, + "learning_rate": 1.6257596204621846e-05, + "loss": 1.7164, + "step": 89430 + }, + { + "epoch": 0.5621511205389587, + "grad_norm": 6.478581428527832, + "learning_rate": 1.6257177103677193e-05, + "loss": 1.5214, + "step": 89440 + }, + { + "epoch": 0.5622139728556558, + "grad_norm": 5.852070331573486, + "learning_rate": 1.625675800273254e-05, + "loss": 1.5816, + "step": 89450 + }, + { + "epoch": 0.5622768251723529, + "grad_norm": 6.88888692855835, + "learning_rate": 1.6256338901787887e-05, + "loss": 1.6473, + "step": 89460 + }, + { + "epoch": 0.5623396774890499, + "grad_norm": 5.942054271697998, + "learning_rate": 1.625591980084323e-05, + "loss": 1.6023, + "step": 89470 + }, + { + "epoch": 0.562402529805747, + "grad_norm": 7.349301815032959, + "learning_rate": 1.6255500699898578e-05, + "loss": 1.891, + "step": 89480 + }, + { + "epoch": 0.5624653821224441, + "grad_norm": 6.585026741027832, + "learning_rate": 1.6255081598953925e-05, + "loss": 1.6427, + "step": 89490 + }, + { + "epoch": 0.5625282344391412, + "grad_norm": 6.790942668914795, + "learning_rate": 1.6254662498009272e-05, + "loss": 1.671, + "step": 89500 + }, + { + "epoch": 0.5625910867558384, + "grad_norm": 6.546286582946777, + "learning_rate": 1.6254243397064616e-05, + "loss": 1.5507, + "step": 89510 + }, + { + "epoch": 
0.5626539390725355, + "grad_norm": 6.672111511230469, + "learning_rate": 1.6253824296119963e-05, + "loss": 1.6603, + "step": 89520 + }, + { + "epoch": 0.5627167913892326, + "grad_norm": 8.207008361816406, + "learning_rate": 1.625340519517531e-05, + "loss": 1.7917, + "step": 89530 + }, + { + "epoch": 0.5627796437059297, + "grad_norm": 8.121054649353027, + "learning_rate": 1.6252986094230657e-05, + "loss": 1.9549, + "step": 89540 + }, + { + "epoch": 0.5628424960226268, + "grad_norm": 6.844808101654053, + "learning_rate": 1.6252566993286004e-05, + "loss": 1.6826, + "step": 89550 + }, + { + "epoch": 0.5629053483393239, + "grad_norm": 6.727411270141602, + "learning_rate": 1.625214789234135e-05, + "loss": 1.7397, + "step": 89560 + }, + { + "epoch": 0.562968200656021, + "grad_norm": 7.244187355041504, + "learning_rate": 1.6251728791396698e-05, + "loss": 1.7125, + "step": 89570 + }, + { + "epoch": 0.5630310529727182, + "grad_norm": 6.885039806365967, + "learning_rate": 1.6251309690452045e-05, + "loss": 1.4795, + "step": 89580 + }, + { + "epoch": 0.5630939052894153, + "grad_norm": 6.604867458343506, + "learning_rate": 1.6250890589507392e-05, + "loss": 1.6503, + "step": 89590 + }, + { + "epoch": 0.5631567576061124, + "grad_norm": 5.984154224395752, + "learning_rate": 1.6250471488562736e-05, + "loss": 1.8947, + "step": 89600 + }, + { + "epoch": 0.5632196099228095, + "grad_norm": 9.04930591583252, + "learning_rate": 1.6250052387618083e-05, + "loss": 1.7429, + "step": 89610 + }, + { + "epoch": 0.5632824622395066, + "grad_norm": 6.829537391662598, + "learning_rate": 1.624963328667343e-05, + "loss": 1.659, + "step": 89620 + }, + { + "epoch": 0.5633453145562037, + "grad_norm": 6.806103229522705, + "learning_rate": 1.6249214185728777e-05, + "loss": 1.6567, + "step": 89630 + }, + { + "epoch": 0.5634081668729009, + "grad_norm": 6.35833215713501, + "learning_rate": 1.624879508478412e-05, + "loss": 1.8157, + "step": 89640 + }, + { + "epoch": 0.563471019189598, + "grad_norm": 
6.641960620880127, + "learning_rate": 1.6248375983839468e-05, + "loss": 1.6399, + "step": 89650 + }, + { + "epoch": 0.5635338715062951, + "grad_norm": 6.535701751708984, + "learning_rate": 1.6247956882894815e-05, + "loss": 1.5097, + "step": 89660 + }, + { + "epoch": 0.5635967238229922, + "grad_norm": 6.738907814025879, + "learning_rate": 1.6247537781950162e-05, + "loss": 1.6767, + "step": 89670 + }, + { + "epoch": 0.5636595761396893, + "grad_norm": 6.377346515655518, + "learning_rate": 1.624711868100551e-05, + "loss": 1.5594, + "step": 89680 + }, + { + "epoch": 0.5637224284563864, + "grad_norm": 6.713415145874023, + "learning_rate": 1.6246699580060853e-05, + "loss": 1.6798, + "step": 89690 + }, + { + "epoch": 0.5637852807730835, + "grad_norm": 5.800888538360596, + "learning_rate": 1.62462804791162e-05, + "loss": 1.518, + "step": 89700 + }, + { + "epoch": 0.5638481330897807, + "grad_norm": 6.9739670753479, + "learning_rate": 1.6245861378171547e-05, + "loss": 1.6548, + "step": 89710 + }, + { + "epoch": 0.5639109854064778, + "grad_norm": 6.058177947998047, + "learning_rate": 1.6245442277226894e-05, + "loss": 1.7046, + "step": 89720 + }, + { + "epoch": 0.5639738377231748, + "grad_norm": 6.74522590637207, + "learning_rate": 1.624502317628224e-05, + "loss": 1.5815, + "step": 89730 + }, + { + "epoch": 0.5640366900398719, + "grad_norm": 6.15690803527832, + "learning_rate": 1.6244604075337588e-05, + "loss": 1.7372, + "step": 89740 + }, + { + "epoch": 0.564099542356569, + "grad_norm": 5.959741115570068, + "learning_rate": 1.6244184974392932e-05, + "loss": 1.7466, + "step": 89750 + }, + { + "epoch": 0.5641623946732661, + "grad_norm": 8.733022689819336, + "learning_rate": 1.624376587344828e-05, + "loss": 1.5584, + "step": 89760 + }, + { + "epoch": 0.5642252469899632, + "grad_norm": 6.4488677978515625, + "learning_rate": 1.6243346772503626e-05, + "loss": 1.7363, + "step": 89770 + }, + { + "epoch": 0.5642880993066604, + "grad_norm": 6.470970630645752, + "learning_rate": 
1.6242927671558973e-05, + "loss": 1.5498, + "step": 89780 + }, + { + "epoch": 0.5643509516233575, + "grad_norm": 6.511645317077637, + "learning_rate": 1.624250857061432e-05, + "loss": 1.5934, + "step": 89790 + }, + { + "epoch": 0.5644138039400546, + "grad_norm": 6.307367324829102, + "learning_rate": 1.6242089469669667e-05, + "loss": 1.759, + "step": 89800 + }, + { + "epoch": 0.5644766562567517, + "grad_norm": 7.514179706573486, + "learning_rate": 1.6241670368725014e-05, + "loss": 1.8545, + "step": 89810 + }, + { + "epoch": 0.5645395085734488, + "grad_norm": 5.769866943359375, + "learning_rate": 1.6241251267780358e-05, + "loss": 1.8331, + "step": 89820 + }, + { + "epoch": 0.5646023608901459, + "grad_norm": 5.930098056793213, + "learning_rate": 1.6240832166835705e-05, + "loss": 1.713, + "step": 89830 + }, + { + "epoch": 0.564665213206843, + "grad_norm": 7.681197166442871, + "learning_rate": 1.6240413065891052e-05, + "loss": 1.5439, + "step": 89840 + }, + { + "epoch": 0.5647280655235402, + "grad_norm": 7.412057399749756, + "learning_rate": 1.62399939649464e-05, + "loss": 1.5705, + "step": 89850 + }, + { + "epoch": 0.5647909178402373, + "grad_norm": 6.578376293182373, + "learning_rate": 1.6239574864001746e-05, + "loss": 1.7744, + "step": 89860 + }, + { + "epoch": 0.5648537701569344, + "grad_norm": 9.134100914001465, + "learning_rate": 1.623915576305709e-05, + "loss": 1.5815, + "step": 89870 + }, + { + "epoch": 0.5649166224736315, + "grad_norm": 6.486658573150635, + "learning_rate": 1.6238736662112437e-05, + "loss": 1.6856, + "step": 89880 + }, + { + "epoch": 0.5649794747903286, + "grad_norm": 7.928880214691162, + "learning_rate": 1.6238317561167784e-05, + "loss": 1.7607, + "step": 89890 + }, + { + "epoch": 0.5650423271070257, + "grad_norm": 6.201712131500244, + "learning_rate": 1.623789846022313e-05, + "loss": 1.5833, + "step": 89900 + }, + { + "epoch": 0.5651051794237228, + "grad_norm": 4.976857662200928, + "learning_rate": 1.6237479359278475e-05, + "loss": 1.5414, + 
"step": 89910 + }, + { + "epoch": 0.56516803174042, + "grad_norm": 7.770523548126221, + "learning_rate": 1.6237060258333822e-05, + "loss": 1.6504, + "step": 89920 + }, + { + "epoch": 0.5652308840571171, + "grad_norm": 6.774728298187256, + "learning_rate": 1.623664115738917e-05, + "loss": 1.5659, + "step": 89930 + }, + { + "epoch": 0.5652937363738142, + "grad_norm": 7.217031478881836, + "learning_rate": 1.6236222056444516e-05, + "loss": 1.7839, + "step": 89940 + }, + { + "epoch": 0.5653565886905113, + "grad_norm": 6.508360385894775, + "learning_rate": 1.6235802955499863e-05, + "loss": 1.6238, + "step": 89950 + }, + { + "epoch": 0.5654194410072084, + "grad_norm": 6.7677435874938965, + "learning_rate": 1.623538385455521e-05, + "loss": 1.4442, + "step": 89960 + }, + { + "epoch": 0.5654822933239055, + "grad_norm": 6.441218376159668, + "learning_rate": 1.6234964753610557e-05, + "loss": 1.554, + "step": 89970 + }, + { + "epoch": 0.5655451456406025, + "grad_norm": 6.243203163146973, + "learning_rate": 1.6234545652665904e-05, + "loss": 1.8936, + "step": 89980 + }, + { + "epoch": 0.5656079979572997, + "grad_norm": 6.367725849151611, + "learning_rate": 1.623412655172125e-05, + "loss": 1.7235, + "step": 89990 + }, + { + "epoch": 0.5656708502739968, + "grad_norm": 6.914323329925537, + "learning_rate": 1.6233707450776595e-05, + "loss": 1.8123, + "step": 90000 + }, + { + "epoch": 0.5657337025906939, + "grad_norm": 6.8788299560546875, + "learning_rate": 1.6233288349831942e-05, + "loss": 1.5721, + "step": 90010 + }, + { + "epoch": 0.565796554907391, + "grad_norm": 7.315779209136963, + "learning_rate": 1.623286924888729e-05, + "loss": 1.7572, + "step": 90020 + }, + { + "epoch": 0.5658594072240881, + "grad_norm": 6.532388210296631, + "learning_rate": 1.6232450147942636e-05, + "loss": 1.7725, + "step": 90030 + }, + { + "epoch": 0.5659222595407852, + "grad_norm": 7.474238395690918, + "learning_rate": 1.623203104699798e-05, + "loss": 1.8216, + "step": 90040 + }, + { + "epoch": 
0.5659851118574823, + "grad_norm": 6.514171600341797, + "learning_rate": 1.6231611946053327e-05, + "loss": 1.4655, + "step": 90050 + }, + { + "epoch": 0.5660479641741795, + "grad_norm": 7.061287879943848, + "learning_rate": 1.6231192845108674e-05, + "loss": 1.7795, + "step": 90060 + }, + { + "epoch": 0.5661108164908766, + "grad_norm": 6.0305094718933105, + "learning_rate": 1.623077374416402e-05, + "loss": 1.4329, + "step": 90070 + }, + { + "epoch": 0.5661736688075737, + "grad_norm": 6.642032623291016, + "learning_rate": 1.623035464321937e-05, + "loss": 1.6796, + "step": 90080 + }, + { + "epoch": 0.5662365211242708, + "grad_norm": 6.137035846710205, + "learning_rate": 1.6229935542274712e-05, + "loss": 1.5379, + "step": 90090 + }, + { + "epoch": 0.5662993734409679, + "grad_norm": 8.072105407714844, + "learning_rate": 1.622951644133006e-05, + "loss": 1.8794, + "step": 90100 + }, + { + "epoch": 0.566362225757665, + "grad_norm": 5.9996418952941895, + "learning_rate": 1.6229097340385406e-05, + "loss": 1.3494, + "step": 90110 + }, + { + "epoch": 0.5664250780743622, + "grad_norm": 6.265005588531494, + "learning_rate": 1.6228678239440753e-05, + "loss": 1.7446, + "step": 90120 + }, + { + "epoch": 0.5664879303910593, + "grad_norm": 8.894662857055664, + "learning_rate": 1.6228259138496097e-05, + "loss": 1.9204, + "step": 90130 + }, + { + "epoch": 0.5665507827077564, + "grad_norm": 6.5717034339904785, + "learning_rate": 1.6227840037551444e-05, + "loss": 1.6384, + "step": 90140 + }, + { + "epoch": 0.5666136350244535, + "grad_norm": 6.242033004760742, + "learning_rate": 1.622742093660679e-05, + "loss": 1.803, + "step": 90150 + }, + { + "epoch": 0.5666764873411506, + "grad_norm": 6.6735405921936035, + "learning_rate": 1.6227001835662138e-05, + "loss": 1.7524, + "step": 90160 + }, + { + "epoch": 0.5667393396578477, + "grad_norm": 6.145554065704346, + "learning_rate": 1.6226582734717485e-05, + "loss": 1.7816, + "step": 90170 + }, + { + "epoch": 0.5668021919745448, + "grad_norm": 
6.141727447509766, + "learning_rate": 1.6226163633772832e-05, + "loss": 1.8193, + "step": 90180 + }, + { + "epoch": 0.566865044291242, + "grad_norm": 6.712489604949951, + "learning_rate": 1.622574453282818e-05, + "loss": 1.6914, + "step": 90190 + }, + { + "epoch": 0.5669278966079391, + "grad_norm": 6.391608715057373, + "learning_rate": 1.6225325431883526e-05, + "loss": 2.0208, + "step": 90200 + }, + { + "epoch": 0.5669907489246362, + "grad_norm": 6.549454212188721, + "learning_rate": 1.6224906330938874e-05, + "loss": 1.7772, + "step": 90210 + }, + { + "epoch": 0.5670536012413333, + "grad_norm": 7.447284698486328, + "learning_rate": 1.6224487229994217e-05, + "loss": 1.6078, + "step": 90220 + }, + { + "epoch": 0.5671164535580304, + "grad_norm": 6.7304463386535645, + "learning_rate": 1.6224068129049564e-05, + "loss": 1.8029, + "step": 90230 + }, + { + "epoch": 0.5671793058747274, + "grad_norm": 6.139402389526367, + "learning_rate": 1.622364902810491e-05, + "loss": 1.4951, + "step": 90240 + }, + { + "epoch": 0.5672421581914245, + "grad_norm": 6.552315711975098, + "learning_rate": 1.622322992716026e-05, + "loss": 1.6295, + "step": 90250 + }, + { + "epoch": 0.5673050105081217, + "grad_norm": 6.577693939208984, + "learning_rate": 1.6222810826215602e-05, + "loss": 1.7602, + "step": 90260 + }, + { + "epoch": 0.5673678628248188, + "grad_norm": 7.049777507781982, + "learning_rate": 1.622239172527095e-05, + "loss": 1.4879, + "step": 90270 + }, + { + "epoch": 0.5674307151415159, + "grad_norm": 6.171783447265625, + "learning_rate": 1.6221972624326296e-05, + "loss": 1.3778, + "step": 90280 + }, + { + "epoch": 0.567493567458213, + "grad_norm": 7.835484981536865, + "learning_rate": 1.6221553523381643e-05, + "loss": 1.6952, + "step": 90290 + }, + { + "epoch": 0.5675564197749101, + "grad_norm": 6.65958309173584, + "learning_rate": 1.622113442243699e-05, + "loss": 1.6576, + "step": 90300 + }, + { + "epoch": 0.5676192720916072, + "grad_norm": 6.794559001922607, + "learning_rate": 
1.6220715321492334e-05, + "loss": 1.6117, + "step": 90310 + }, + { + "epoch": 0.5676821244083043, + "grad_norm": 7.645313739776611, + "learning_rate": 1.622029622054768e-05, + "loss": 1.7929, + "step": 90320 + }, + { + "epoch": 0.5677449767250015, + "grad_norm": 6.500690460205078, + "learning_rate": 1.6219877119603028e-05, + "loss": 1.5474, + "step": 90330 + }, + { + "epoch": 0.5678078290416986, + "grad_norm": 6.779569625854492, + "learning_rate": 1.6219458018658375e-05, + "loss": 1.7564, + "step": 90340 + }, + { + "epoch": 0.5678706813583957, + "grad_norm": 8.097848892211914, + "learning_rate": 1.6219038917713722e-05, + "loss": 1.6125, + "step": 90350 + }, + { + "epoch": 0.5679335336750928, + "grad_norm": 7.679541110992432, + "learning_rate": 1.621861981676907e-05, + "loss": 1.6187, + "step": 90360 + }, + { + "epoch": 0.5679963859917899, + "grad_norm": 7.951330661773682, + "learning_rate": 1.6218200715824416e-05, + "loss": 1.8312, + "step": 90370 + }, + { + "epoch": 0.568059238308487, + "grad_norm": 6.817192554473877, + "learning_rate": 1.621778161487976e-05, + "loss": 1.3995, + "step": 90380 + }, + { + "epoch": 0.5681220906251842, + "grad_norm": 7.9999308586120605, + "learning_rate": 1.6217362513935107e-05, + "loss": 1.7167, + "step": 90390 + }, + { + "epoch": 0.5681849429418813, + "grad_norm": 7.261706352233887, + "learning_rate": 1.6216943412990454e-05, + "loss": 1.7325, + "step": 90400 + }, + { + "epoch": 0.5682477952585784, + "grad_norm": 6.690458297729492, + "learning_rate": 1.62165243120458e-05, + "loss": 1.8187, + "step": 90410 + }, + { + "epoch": 0.5683106475752755, + "grad_norm": 7.295557498931885, + "learning_rate": 1.621610521110115e-05, + "loss": 1.5807, + "step": 90420 + }, + { + "epoch": 0.5683734998919726, + "grad_norm": 5.740212917327881, + "learning_rate": 1.6215686110156496e-05, + "loss": 1.6022, + "step": 90430 + }, + { + "epoch": 0.5684363522086697, + "grad_norm": 7.598939418792725, + "learning_rate": 1.621526700921184e-05, + "loss": 1.8893, + 
"step": 90440 + }, + { + "epoch": 0.5684992045253668, + "grad_norm": 6.767308235168457, + "learning_rate": 1.6214847908267186e-05, + "loss": 1.6985, + "step": 90450 + }, + { + "epoch": 0.568562056842064, + "grad_norm": 7.9719929695129395, + "learning_rate": 1.6214428807322533e-05, + "loss": 1.6289, + "step": 90460 + }, + { + "epoch": 0.5686249091587611, + "grad_norm": 6.249841690063477, + "learning_rate": 1.621400970637788e-05, + "loss": 1.6309, + "step": 90470 + }, + { + "epoch": 0.5686877614754582, + "grad_norm": 6.916226387023926, + "learning_rate": 1.6213590605433227e-05, + "loss": 1.4816, + "step": 90480 + }, + { + "epoch": 0.5687506137921552, + "grad_norm": 7.875985145568848, + "learning_rate": 1.621317150448857e-05, + "loss": 1.6301, + "step": 90490 + }, + { + "epoch": 0.5688134661088523, + "grad_norm": 6.240650177001953, + "learning_rate": 1.6212752403543918e-05, + "loss": 1.8665, + "step": 90500 + }, + { + "epoch": 0.5688763184255494, + "grad_norm": 7.0537614822387695, + "learning_rate": 1.6212333302599265e-05, + "loss": 1.8948, + "step": 90510 + }, + { + "epoch": 0.5689391707422465, + "grad_norm": 7.471006393432617, + "learning_rate": 1.6211914201654612e-05, + "loss": 1.8681, + "step": 90520 + }, + { + "epoch": 0.5690020230589437, + "grad_norm": 7.404205322265625, + "learning_rate": 1.6211495100709956e-05, + "loss": 1.5513, + "step": 90530 + }, + { + "epoch": 0.5690648753756408, + "grad_norm": 7.2312140464782715, + "learning_rate": 1.6211075999765303e-05, + "loss": 1.8007, + "step": 90540 + }, + { + "epoch": 0.5691277276923379, + "grad_norm": 6.615874290466309, + "learning_rate": 1.621065689882065e-05, + "loss": 1.5656, + "step": 90550 + }, + { + "epoch": 0.569190580009035, + "grad_norm": 3.9499099254608154, + "learning_rate": 1.6210237797875997e-05, + "loss": 1.3342, + "step": 90560 + }, + { + "epoch": 0.5692534323257321, + "grad_norm": 6.043526649475098, + "learning_rate": 1.6209818696931344e-05, + "loss": 1.6464, + "step": 90570 + }, + { + "epoch": 
0.5693162846424292, + "grad_norm": 7.471343517303467, + "learning_rate": 1.620939959598669e-05, + "loss": 1.652, + "step": 90580 + }, + { + "epoch": 0.5693791369591263, + "grad_norm": 6.765534400939941, + "learning_rate": 1.620898049504204e-05, + "loss": 1.3959, + "step": 90590 + }, + { + "epoch": 0.5694419892758235, + "grad_norm": 6.237020015716553, + "learning_rate": 1.6208561394097386e-05, + "loss": 1.6979, + "step": 90600 + }, + { + "epoch": 0.5695048415925206, + "grad_norm": 7.709216117858887, + "learning_rate": 1.6208142293152733e-05, + "loss": 1.8288, + "step": 90610 + }, + { + "epoch": 0.5695676939092177, + "grad_norm": 9.005006790161133, + "learning_rate": 1.6207723192208076e-05, + "loss": 1.6457, + "step": 90620 + }, + { + "epoch": 0.5696305462259148, + "grad_norm": 6.673562526702881, + "learning_rate": 1.6207304091263423e-05, + "loss": 1.7839, + "step": 90630 + }, + { + "epoch": 0.5696933985426119, + "grad_norm": 6.20949649810791, + "learning_rate": 1.620688499031877e-05, + "loss": 1.6451, + "step": 90640 + }, + { + "epoch": 0.569756250859309, + "grad_norm": 7.498359680175781, + "learning_rate": 1.6206465889374118e-05, + "loss": 1.5566, + "step": 90650 + }, + { + "epoch": 0.5698191031760061, + "grad_norm": 6.984977722167969, + "learning_rate": 1.620604678842946e-05, + "loss": 1.7864, + "step": 90660 + }, + { + "epoch": 0.5698819554927033, + "grad_norm": 7.174541473388672, + "learning_rate": 1.6205627687484808e-05, + "loss": 1.6211, + "step": 90670 + }, + { + "epoch": 0.5699448078094004, + "grad_norm": 6.415432453155518, + "learning_rate": 1.6205208586540155e-05, + "loss": 1.7804, + "step": 90680 + }, + { + "epoch": 0.5700076601260975, + "grad_norm": 6.4624223709106445, + "learning_rate": 1.6204789485595502e-05, + "loss": 1.6502, + "step": 90690 + }, + { + "epoch": 0.5700705124427946, + "grad_norm": 5.442400932312012, + "learning_rate": 1.620437038465085e-05, + "loss": 1.4261, + "step": 90700 + }, + { + "epoch": 0.5701333647594917, + "grad_norm": 
7.268397331237793, + "learning_rate": 1.6203951283706193e-05, + "loss": 1.6389, + "step": 90710 + }, + { + "epoch": 0.5701962170761888, + "grad_norm": 6.997256755828857, + "learning_rate": 1.620353218276154e-05, + "loss": 1.7754, + "step": 90720 + }, + { + "epoch": 0.570259069392886, + "grad_norm": 6.781765937805176, + "learning_rate": 1.6203113081816887e-05, + "loss": 1.7912, + "step": 90730 + }, + { + "epoch": 0.5703219217095831, + "grad_norm": 6.821094036102295, + "learning_rate": 1.6202693980872234e-05, + "loss": 1.6553, + "step": 90740 + }, + { + "epoch": 0.5703847740262801, + "grad_norm": 7.494826316833496, + "learning_rate": 1.6202274879927578e-05, + "loss": 1.6582, + "step": 90750 + }, + { + "epoch": 0.5704476263429772, + "grad_norm": 6.2762041091918945, + "learning_rate": 1.6201855778982925e-05, + "loss": 1.7972, + "step": 90760 + }, + { + "epoch": 0.5705104786596743, + "grad_norm": 6.95667839050293, + "learning_rate": 1.6201436678038272e-05, + "loss": 1.7072, + "step": 90770 + }, + { + "epoch": 0.5705733309763714, + "grad_norm": 7.874874114990234, + "learning_rate": 1.620101757709362e-05, + "loss": 1.6645, + "step": 90780 + }, + { + "epoch": 0.5706361832930685, + "grad_norm": 7.321478843688965, + "learning_rate": 1.6200598476148966e-05, + "loss": 1.6508, + "step": 90790 + }, + { + "epoch": 0.5706990356097656, + "grad_norm": 5.930144786834717, + "learning_rate": 1.6200179375204313e-05, + "loss": 1.6602, + "step": 90800 + }, + { + "epoch": 0.5707618879264628, + "grad_norm": 6.673276901245117, + "learning_rate": 1.619976027425966e-05, + "loss": 1.5876, + "step": 90810 + }, + { + "epoch": 0.5708247402431599, + "grad_norm": 6.345154285430908, + "learning_rate": 1.6199341173315008e-05, + "loss": 1.5658, + "step": 90820 + }, + { + "epoch": 0.570887592559857, + "grad_norm": 7.179617404937744, + "learning_rate": 1.6198922072370355e-05, + "loss": 1.6392, + "step": 90830 + }, + { + "epoch": 0.5709504448765541, + "grad_norm": 7.312381744384766, + "learning_rate": 
1.6198502971425698e-05, + "loss": 1.936, + "step": 90840 + }, + { + "epoch": 0.5710132971932512, + "grad_norm": 6.575345993041992, + "learning_rate": 1.6198083870481045e-05, + "loss": 1.5589, + "step": 90850 + }, + { + "epoch": 0.5710761495099483, + "grad_norm": 6.817177772521973, + "learning_rate": 1.6197664769536392e-05, + "loss": 1.626, + "step": 90860 + }, + { + "epoch": 0.5711390018266455, + "grad_norm": 6.790354251861572, + "learning_rate": 1.619724566859174e-05, + "loss": 1.6123, + "step": 90870 + }, + { + "epoch": 0.5712018541433426, + "grad_norm": 5.912313461303711, + "learning_rate": 1.6196826567647083e-05, + "loss": 1.7207, + "step": 90880 + }, + { + "epoch": 0.5712647064600397, + "grad_norm": 6.774677753448486, + "learning_rate": 1.619640746670243e-05, + "loss": 1.933, + "step": 90890 + }, + { + "epoch": 0.5713275587767368, + "grad_norm": 7.587268352508545, + "learning_rate": 1.6195988365757777e-05, + "loss": 1.7738, + "step": 90900 + }, + { + "epoch": 0.5713904110934339, + "grad_norm": 6.267664432525635, + "learning_rate": 1.6195569264813124e-05, + "loss": 1.5228, + "step": 90910 + }, + { + "epoch": 0.571453263410131, + "grad_norm": 5.794346332550049, + "learning_rate": 1.619515016386847e-05, + "loss": 1.6022, + "step": 90920 + }, + { + "epoch": 0.5715161157268281, + "grad_norm": 6.912841796875, + "learning_rate": 1.6194731062923815e-05, + "loss": 1.7417, + "step": 90930 + }, + { + "epoch": 0.5715789680435253, + "grad_norm": 7.007205486297607, + "learning_rate": 1.6194311961979162e-05, + "loss": 1.5764, + "step": 90940 + }, + { + "epoch": 0.5716418203602224, + "grad_norm": 6.86815071105957, + "learning_rate": 1.619389286103451e-05, + "loss": 1.8537, + "step": 90950 + }, + { + "epoch": 0.5717046726769195, + "grad_norm": 5.955638408660889, + "learning_rate": 1.6193473760089856e-05, + "loss": 1.7025, + "step": 90960 + }, + { + "epoch": 0.5717675249936166, + "grad_norm": 6.138908386230469, + "learning_rate": 1.6193054659145203e-05, + "loss": 1.6357, + 
"step": 90970 + }, + { + "epoch": 0.5718303773103137, + "grad_norm": 5.944375514984131, + "learning_rate": 1.619263555820055e-05, + "loss": 1.8174, + "step": 90980 + }, + { + "epoch": 0.5718932296270108, + "grad_norm": 7.687257289886475, + "learning_rate": 1.6192216457255898e-05, + "loss": 1.6123, + "step": 90990 + }, + { + "epoch": 0.5719560819437078, + "grad_norm": 7.10957670211792, + "learning_rate": 1.619179735631124e-05, + "loss": 1.6329, + "step": 91000 + }, + { + "epoch": 0.572018934260405, + "grad_norm": 6.9417853355407715, + "learning_rate": 1.619137825536659e-05, + "loss": 1.5804, + "step": 91010 + }, + { + "epoch": 0.5720817865771021, + "grad_norm": 6.256638526916504, + "learning_rate": 1.6190959154421935e-05, + "loss": 1.7376, + "step": 91020 + }, + { + "epoch": 0.5721446388937992, + "grad_norm": 7.385227203369141, + "learning_rate": 1.6190540053477282e-05, + "loss": 1.4635, + "step": 91030 + }, + { + "epoch": 0.5722074912104963, + "grad_norm": 6.41224479675293, + "learning_rate": 1.619012095253263e-05, + "loss": 1.672, + "step": 91040 + }, + { + "epoch": 0.5722703435271934, + "grad_norm": 6.543980121612549, + "learning_rate": 1.6189701851587977e-05, + "loss": 1.6074, + "step": 91050 + }, + { + "epoch": 0.5723331958438905, + "grad_norm": 6.76102876663208, + "learning_rate": 1.618928275064332e-05, + "loss": 1.7029, + "step": 91060 + }, + { + "epoch": 0.5723960481605876, + "grad_norm": 7.033742904663086, + "learning_rate": 1.6188863649698667e-05, + "loss": 1.783, + "step": 91070 + }, + { + "epoch": 0.5724589004772848, + "grad_norm": 7.322498321533203, + "learning_rate": 1.6188444548754014e-05, + "loss": 1.6939, + "step": 91080 + }, + { + "epoch": 0.5725217527939819, + "grad_norm": 6.82847785949707, + "learning_rate": 1.618802544780936e-05, + "loss": 1.8255, + "step": 91090 + }, + { + "epoch": 0.572584605110679, + "grad_norm": 6.518442153930664, + "learning_rate": 1.618760634686471e-05, + "loss": 1.6425, + "step": 91100 + }, + { + "epoch": 
0.5726474574273761, + "grad_norm": 6.245519638061523, + "learning_rate": 1.6187187245920052e-05, + "loss": 1.6791, + "step": 91110 + }, + { + "epoch": 0.5727103097440732, + "grad_norm": 6.8035359382629395, + "learning_rate": 1.61867681449754e-05, + "loss": 1.5144, + "step": 91120 + }, + { + "epoch": 0.5727731620607703, + "grad_norm": 6.895239353179932, + "learning_rate": 1.6186349044030746e-05, + "loss": 1.6901, + "step": 91130 + }, + { + "epoch": 0.5728360143774675, + "grad_norm": 6.584578514099121, + "learning_rate": 1.6185929943086093e-05, + "loss": 1.7268, + "step": 91140 + }, + { + "epoch": 0.5728988666941646, + "grad_norm": 6.822965621948242, + "learning_rate": 1.6185510842141437e-05, + "loss": 1.8395, + "step": 91150 + }, + { + "epoch": 0.5729617190108617, + "grad_norm": 8.831889152526855, + "learning_rate": 1.6185091741196784e-05, + "loss": 1.5425, + "step": 91160 + }, + { + "epoch": 0.5730245713275588, + "grad_norm": 6.542859077453613, + "learning_rate": 1.618467264025213e-05, + "loss": 1.743, + "step": 91170 + }, + { + "epoch": 0.5730874236442559, + "grad_norm": 5.4479594230651855, + "learning_rate": 1.618425353930748e-05, + "loss": 1.6769, + "step": 91180 + }, + { + "epoch": 0.573150275960953, + "grad_norm": 5.9766998291015625, + "learning_rate": 1.6183834438362825e-05, + "loss": 1.651, + "step": 91190 + }, + { + "epoch": 0.5732131282776501, + "grad_norm": 7.001051425933838, + "learning_rate": 1.6183415337418173e-05, + "loss": 1.4927, + "step": 91200 + }, + { + "epoch": 0.5732759805943473, + "grad_norm": 5.923704624176025, + "learning_rate": 1.618299623647352e-05, + "loss": 1.8324, + "step": 91210 + }, + { + "epoch": 0.5733388329110444, + "grad_norm": 5.801931381225586, + "learning_rate": 1.6182577135528867e-05, + "loss": 1.5198, + "step": 91220 + }, + { + "epoch": 0.5734016852277415, + "grad_norm": 7.863893508911133, + "learning_rate": 1.6182158034584214e-05, + "loss": 1.9263, + "step": 91230 + }, + { + "epoch": 0.5734645375444386, + "grad_norm": 
5.9496612548828125, + "learning_rate": 1.6181738933639557e-05, + "loss": 1.7619, + "step": 91240 + }, + { + "epoch": 0.5735273898611357, + "grad_norm": 5.7954630851745605, + "learning_rate": 1.6181319832694904e-05, + "loss": 1.4125, + "step": 91250 + }, + { + "epoch": 0.5735902421778327, + "grad_norm": 7.1357574462890625, + "learning_rate": 1.618090073175025e-05, + "loss": 1.5966, + "step": 91260 + }, + { + "epoch": 0.5736530944945298, + "grad_norm": 6.261843681335449, + "learning_rate": 1.61804816308056e-05, + "loss": 1.817, + "step": 91270 + }, + { + "epoch": 0.573715946811227, + "grad_norm": 6.486170291900635, + "learning_rate": 1.6180062529860942e-05, + "loss": 1.6445, + "step": 91280 + }, + { + "epoch": 0.5737787991279241, + "grad_norm": 7.065108776092529, + "learning_rate": 1.617964342891629e-05, + "loss": 1.7621, + "step": 91290 + }, + { + "epoch": 0.5738416514446212, + "grad_norm": 5.309915542602539, + "learning_rate": 1.6179224327971636e-05, + "loss": 1.6687, + "step": 91300 + }, + { + "epoch": 0.5739045037613183, + "grad_norm": 6.505954265594482, + "learning_rate": 1.6178805227026984e-05, + "loss": 1.5949, + "step": 91310 + }, + { + "epoch": 0.5739673560780154, + "grad_norm": 7.580758094787598, + "learning_rate": 1.617838612608233e-05, + "loss": 1.6881, + "step": 91320 + }, + { + "epoch": 0.5740302083947125, + "grad_norm": 7.761985778808594, + "learning_rate": 1.6177967025137674e-05, + "loss": 1.9408, + "step": 91330 + }, + { + "epoch": 0.5740930607114096, + "grad_norm": 6.892176151275635, + "learning_rate": 1.617754792419302e-05, + "loss": 1.7251, + "step": 91340 + }, + { + "epoch": 0.5741559130281068, + "grad_norm": 7.198729515075684, + "learning_rate": 1.617712882324837e-05, + "loss": 1.6614, + "step": 91350 + }, + { + "epoch": 0.5742187653448039, + "grad_norm": 6.735234260559082, + "learning_rate": 1.6176709722303715e-05, + "loss": 1.8572, + "step": 91360 + }, + { + "epoch": 0.574281617661501, + "grad_norm": 6.974170207977295, + "learning_rate": 
1.6176290621359063e-05, + "loss": 1.7196, + "step": 91370 + }, + { + "epoch": 0.5743444699781981, + "grad_norm": 6.183765411376953, + "learning_rate": 1.6175871520414406e-05, + "loss": 1.6468, + "step": 91380 + }, + { + "epoch": 0.5744073222948952, + "grad_norm": 6.75325870513916, + "learning_rate": 1.6175452419469753e-05, + "loss": 1.8088, + "step": 91390 + }, + { + "epoch": 0.5744701746115923, + "grad_norm": 6.08672571182251, + "learning_rate": 1.61750333185251e-05, + "loss": 1.6807, + "step": 91400 + }, + { + "epoch": 0.5745330269282894, + "grad_norm": 8.254101753234863, + "learning_rate": 1.6174614217580447e-05, + "loss": 1.5649, + "step": 91410 + }, + { + "epoch": 0.5745958792449866, + "grad_norm": 6.380203723907471, + "learning_rate": 1.6174195116635795e-05, + "loss": 1.4865, + "step": 91420 + }, + { + "epoch": 0.5746587315616837, + "grad_norm": 6.785824775695801, + "learning_rate": 1.617377601569114e-05, + "loss": 1.7694, + "step": 91430 + }, + { + "epoch": 0.5747215838783808, + "grad_norm": 6.6405558586120605, + "learning_rate": 1.617335691474649e-05, + "loss": 1.7948, + "step": 91440 + }, + { + "epoch": 0.5747844361950779, + "grad_norm": 6.775050163269043, + "learning_rate": 1.6172937813801836e-05, + "loss": 1.6598, + "step": 91450 + }, + { + "epoch": 0.574847288511775, + "grad_norm": 6.293248653411865, + "learning_rate": 1.617251871285718e-05, + "loss": 1.7985, + "step": 91460 + }, + { + "epoch": 0.5749101408284721, + "grad_norm": 6.873047351837158, + "learning_rate": 1.6172099611912526e-05, + "loss": 1.5276, + "step": 91470 + }, + { + "epoch": 0.5749729931451693, + "grad_norm": 7.247620105743408, + "learning_rate": 1.6171680510967874e-05, + "loss": 1.6014, + "step": 91480 + }, + { + "epoch": 0.5750358454618664, + "grad_norm": 7.733684062957764, + "learning_rate": 1.6171303320117685e-05, + "loss": 1.615, + "step": 91490 + }, + { + "epoch": 0.5750986977785635, + "grad_norm": 9.061704635620117, + "learning_rate": 1.6170884219173032e-05, + "loss": 1.7163, + 
"step": 91500 + }, + { + "epoch": 0.5751615500952605, + "grad_norm": 6.1667022705078125, + "learning_rate": 1.617046511822838e-05, + "loss": 1.5393, + "step": 91510 + }, + { + "epoch": 0.5752244024119576, + "grad_norm": 6.157415390014648, + "learning_rate": 1.6170046017283726e-05, + "loss": 1.6257, + "step": 91520 + }, + { + "epoch": 0.5752872547286547, + "grad_norm": 7.665525913238525, + "learning_rate": 1.6169626916339073e-05, + "loss": 1.786, + "step": 91530 + }, + { + "epoch": 0.5753501070453518, + "grad_norm": 5.009360313415527, + "learning_rate": 1.6169207815394417e-05, + "loss": 1.6624, + "step": 91540 + }, + { + "epoch": 0.575412959362049, + "grad_norm": 7.433297157287598, + "learning_rate": 1.6168788714449764e-05, + "loss": 1.887, + "step": 91550 + }, + { + "epoch": 0.5754758116787461, + "grad_norm": 6.069151878356934, + "learning_rate": 1.616836961350511e-05, + "loss": 1.6577, + "step": 91560 + }, + { + "epoch": 0.5755386639954432, + "grad_norm": 5.833019733428955, + "learning_rate": 1.6167950512560458e-05, + "loss": 1.7991, + "step": 91570 + }, + { + "epoch": 0.5756015163121403, + "grad_norm": 7.223495006561279, + "learning_rate": 1.6167531411615802e-05, + "loss": 1.7811, + "step": 91580 + }, + { + "epoch": 0.5756643686288374, + "grad_norm": 7.341835975646973, + "learning_rate": 1.616711231067115e-05, + "loss": 1.6356, + "step": 91590 + }, + { + "epoch": 0.5757272209455345, + "grad_norm": 7.553828239440918, + "learning_rate": 1.6166693209726496e-05, + "loss": 1.8357, + "step": 91600 + }, + { + "epoch": 0.5757900732622316, + "grad_norm": 6.463942050933838, + "learning_rate": 1.6166274108781843e-05, + "loss": 1.5733, + "step": 91610 + }, + { + "epoch": 0.5758529255789288, + "grad_norm": 7.367928981781006, + "learning_rate": 1.616585500783719e-05, + "loss": 1.364, + "step": 91620 + }, + { + "epoch": 0.5759157778956259, + "grad_norm": 6.926959991455078, + "learning_rate": 1.6165435906892534e-05, + "loss": 1.8664, + "step": 91630 + }, + { + "epoch": 
0.575978630212323, + "grad_norm": 6.353264808654785, + "learning_rate": 1.616501680594788e-05, + "loss": 1.5886, + "step": 91640 + }, + { + "epoch": 0.5760414825290201, + "grad_norm": 7.789851188659668, + "learning_rate": 1.6164597705003228e-05, + "loss": 1.7212, + "step": 91650 + }, + { + "epoch": 0.5761043348457172, + "grad_norm": 6.7941412925720215, + "learning_rate": 1.6164178604058575e-05, + "loss": 1.345, + "step": 91660 + }, + { + "epoch": 0.5761671871624143, + "grad_norm": 6.814610481262207, + "learning_rate": 1.6163759503113922e-05, + "loss": 1.4953, + "step": 91670 + }, + { + "epoch": 0.5762300394791114, + "grad_norm": 6.6490092277526855, + "learning_rate": 1.616334040216927e-05, + "loss": 1.5953, + "step": 91680 + }, + { + "epoch": 0.5762928917958086, + "grad_norm": 7.200467109680176, + "learning_rate": 1.6162921301224613e-05, + "loss": 1.7973, + "step": 91690 + }, + { + "epoch": 0.5763557441125057, + "grad_norm": 7.003600597381592, + "learning_rate": 1.616250220027996e-05, + "loss": 1.7018, + "step": 91700 + }, + { + "epoch": 0.5764185964292028, + "grad_norm": 5.625233173370361, + "learning_rate": 1.6162083099335307e-05, + "loss": 1.6863, + "step": 91710 + }, + { + "epoch": 0.5764814487458999, + "grad_norm": 6.291018486022949, + "learning_rate": 1.6161663998390654e-05, + "loss": 1.6589, + "step": 91720 + }, + { + "epoch": 0.576544301062597, + "grad_norm": 5.673514366149902, + "learning_rate": 1.6161244897446e-05, + "loss": 1.8417, + "step": 91730 + }, + { + "epoch": 0.5766071533792941, + "grad_norm": 7.698174476623535, + "learning_rate": 1.6160825796501348e-05, + "loss": 1.7826, + "step": 91740 + }, + { + "epoch": 0.5766700056959913, + "grad_norm": 6.972863674163818, + "learning_rate": 1.6160406695556695e-05, + "loss": 1.6353, + "step": 91750 + }, + { + "epoch": 0.5767328580126884, + "grad_norm": 6.908972263336182, + "learning_rate": 1.615998759461204e-05, + "loss": 1.743, + "step": 91760 + }, + { + "epoch": 0.5767957103293854, + "grad_norm": 
5.623591423034668, + "learning_rate": 1.6159568493667386e-05, + "loss": 1.4776, + "step": 91770 + }, + { + "epoch": 0.5768585626460825, + "grad_norm": 7.277279853820801, + "learning_rate": 1.6159149392722733e-05, + "loss": 1.6441, + "step": 91780 + }, + { + "epoch": 0.5769214149627796, + "grad_norm": 7.286955833435059, + "learning_rate": 1.615873029177808e-05, + "loss": 1.6584, + "step": 91790 + }, + { + "epoch": 0.5769842672794767, + "grad_norm": 7.2920732498168945, + "learning_rate": 1.6158311190833424e-05, + "loss": 1.5623, + "step": 91800 + }, + { + "epoch": 0.5770471195961738, + "grad_norm": 7.998461723327637, + "learning_rate": 1.615789208988877e-05, + "loss": 1.83, + "step": 91810 + }, + { + "epoch": 0.577109971912871, + "grad_norm": 6.427152633666992, + "learning_rate": 1.6157514899038586e-05, + "loss": 1.6626, + "step": 91820 + }, + { + "epoch": 0.5771728242295681, + "grad_norm": 6.673247814178467, + "learning_rate": 1.615709579809393e-05, + "loss": 1.7358, + "step": 91830 + }, + { + "epoch": 0.5772356765462652, + "grad_norm": 6.672535419464111, + "learning_rate": 1.6156676697149276e-05, + "loss": 1.7213, + "step": 91840 + }, + { + "epoch": 0.5772985288629623, + "grad_norm": 5.9849419593811035, + "learning_rate": 1.6156257596204623e-05, + "loss": 1.7514, + "step": 91850 + }, + { + "epoch": 0.5773613811796594, + "grad_norm": 8.018648147583008, + "learning_rate": 1.615583849525997e-05, + "loss": 1.8444, + "step": 91860 + }, + { + "epoch": 0.5774242334963565, + "grad_norm": 5.908487796783447, + "learning_rate": 1.6155419394315318e-05, + "loss": 1.5183, + "step": 91870 + }, + { + "epoch": 0.5774870858130536, + "grad_norm": 7.056883335113525, + "learning_rate": 1.615500029337066e-05, + "loss": 1.6636, + "step": 91880 + }, + { + "epoch": 0.5775499381297508, + "grad_norm": 6.9238762855529785, + "learning_rate": 1.6154581192426008e-05, + "loss": 1.5482, + "step": 91890 + }, + { + "epoch": 0.5776127904464479, + "grad_norm": 7.2767181396484375, + "learning_rate": 
1.6154162091481355e-05, + "loss": 1.7856, + "step": 91900 + }, + { + "epoch": 0.577675642763145, + "grad_norm": 6.490273952484131, + "learning_rate": 1.6153742990536702e-05, + "loss": 1.6382, + "step": 91910 + }, + { + "epoch": 0.5777384950798421, + "grad_norm": 7.068709373474121, + "learning_rate": 1.6153323889592046e-05, + "loss": 1.6674, + "step": 91920 + }, + { + "epoch": 0.5778013473965392, + "grad_norm": 6.099994659423828, + "learning_rate": 1.6152904788647393e-05, + "loss": 1.8337, + "step": 91930 + }, + { + "epoch": 0.5778641997132363, + "grad_norm": 6.2828826904296875, + "learning_rate": 1.615248568770274e-05, + "loss": 1.7471, + "step": 91940 + }, + { + "epoch": 0.5779270520299334, + "grad_norm": 7.636531352996826, + "learning_rate": 1.6152066586758087e-05, + "loss": 1.6994, + "step": 91950 + }, + { + "epoch": 0.5779899043466306, + "grad_norm": 6.466481685638428, + "learning_rate": 1.6151647485813434e-05, + "loss": 1.5933, + "step": 91960 + }, + { + "epoch": 0.5780527566633277, + "grad_norm": 8.078164100646973, + "learning_rate": 1.615122838486878e-05, + "loss": 1.3894, + "step": 91970 + }, + { + "epoch": 0.5781156089800248, + "grad_norm": 7.178411483764648, + "learning_rate": 1.615080928392413e-05, + "loss": 1.9705, + "step": 91980 + }, + { + "epoch": 0.5781784612967219, + "grad_norm": 6.905796051025391, + "learning_rate": 1.6150390182979476e-05, + "loss": 1.6892, + "step": 91990 + }, + { + "epoch": 0.578241313613419, + "grad_norm": 7.062839508056641, + "learning_rate": 1.6149971082034823e-05, + "loss": 1.4788, + "step": 92000 + }, + { + "epoch": 0.5783041659301161, + "grad_norm": 7.051177501678467, + "learning_rate": 1.6149551981090166e-05, + "loss": 1.6358, + "step": 92010 + }, + { + "epoch": 0.5783670182468131, + "grad_norm": 7.292420864105225, + "learning_rate": 1.6149132880145513e-05, + "loss": 1.6342, + "step": 92020 + }, + { + "epoch": 0.5784298705635103, + "grad_norm": 6.565810680389404, + "learning_rate": 1.614871377920086e-05, + "loss": 1.7552, 
+ "step": 92030 + }, + { + "epoch": 0.5784927228802074, + "grad_norm": 6.324333190917969, + "learning_rate": 1.6148294678256208e-05, + "loss": 1.7477, + "step": 92040 + }, + { + "epoch": 0.5785555751969045, + "grad_norm": 6.976558208465576, + "learning_rate": 1.6147875577311555e-05, + "loss": 1.9807, + "step": 92050 + }, + { + "epoch": 0.5786184275136016, + "grad_norm": 5.373647689819336, + "learning_rate": 1.61474564763669e-05, + "loss": 1.8094, + "step": 92060 + }, + { + "epoch": 0.5786812798302987, + "grad_norm": 6.824400901794434, + "learning_rate": 1.6147037375422245e-05, + "loss": 1.6915, + "step": 92070 + }, + { + "epoch": 0.5787441321469958, + "grad_norm": 5.881832122802734, + "learning_rate": 1.6146618274477592e-05, + "loss": 1.5915, + "step": 92080 + }, + { + "epoch": 0.5788069844636929, + "grad_norm": 5.642775058746338, + "learning_rate": 1.614619917353294e-05, + "loss": 1.5, + "step": 92090 + }, + { + "epoch": 0.5788698367803901, + "grad_norm": 6.9122443199157715, + "learning_rate": 1.6145780072588283e-05, + "loss": 1.6575, + "step": 92100 + }, + { + "epoch": 0.5789326890970872, + "grad_norm": 6.7870330810546875, + "learning_rate": 1.614536097164363e-05, + "loss": 1.7135, + "step": 92110 + }, + { + "epoch": 0.5789955414137843, + "grad_norm": 6.732898235321045, + "learning_rate": 1.6144941870698977e-05, + "loss": 1.6904, + "step": 92120 + }, + { + "epoch": 0.5790583937304814, + "grad_norm": 6.3459672927856445, + "learning_rate": 1.6144522769754324e-05, + "loss": 1.6486, + "step": 92130 + }, + { + "epoch": 0.5791212460471785, + "grad_norm": 6.703038215637207, + "learning_rate": 1.6144103668809668e-05, + "loss": 1.4906, + "step": 92140 + }, + { + "epoch": 0.5791840983638756, + "grad_norm": 6.147325038909912, + "learning_rate": 1.6143684567865015e-05, + "loss": 1.7452, + "step": 92150 + }, + { + "epoch": 0.5792469506805727, + "grad_norm": 6.5492353439331055, + "learning_rate": 1.6143265466920362e-05, + "loss": 1.6346, + "step": 92160 + }, + { + "epoch": 
0.5793098029972699, + "grad_norm": 6.067101955413818, + "learning_rate": 1.614284636597571e-05, + "loss": 1.9467, + "step": 92170 + }, + { + "epoch": 0.579372655313967, + "grad_norm": 6.0446367263793945, + "learning_rate": 1.6142427265031056e-05, + "loss": 1.6943, + "step": 92180 + }, + { + "epoch": 0.5794355076306641, + "grad_norm": 7.993139743804932, + "learning_rate": 1.6142008164086403e-05, + "loss": 1.5548, + "step": 92190 + }, + { + "epoch": 0.5794983599473612, + "grad_norm": 6.8349609375, + "learning_rate": 1.614158906314175e-05, + "loss": 1.6541, + "step": 92200 + }, + { + "epoch": 0.5795612122640583, + "grad_norm": 7.241196155548096, + "learning_rate": 1.6141169962197098e-05, + "loss": 1.6765, + "step": 92210 + }, + { + "epoch": 0.5796240645807554, + "grad_norm": 6.830817222595215, + "learning_rate": 1.6140750861252445e-05, + "loss": 1.4719, + "step": 92220 + }, + { + "epoch": 0.5796869168974526, + "grad_norm": 6.541594505310059, + "learning_rate": 1.614033176030779e-05, + "loss": 1.7481, + "step": 92230 + }, + { + "epoch": 0.5797497692141497, + "grad_norm": 7.100144386291504, + "learning_rate": 1.6139912659363135e-05, + "loss": 1.6503, + "step": 92240 + }, + { + "epoch": 0.5798126215308468, + "grad_norm": 5.921083927154541, + "learning_rate": 1.6139493558418482e-05, + "loss": 1.7248, + "step": 92250 + }, + { + "epoch": 0.5798754738475439, + "grad_norm": 7.404361724853516, + "learning_rate": 1.613907445747383e-05, + "loss": 1.8192, + "step": 92260 + }, + { + "epoch": 0.579938326164241, + "grad_norm": 6.89762020111084, + "learning_rate": 1.6138655356529177e-05, + "loss": 1.7486, + "step": 92270 + }, + { + "epoch": 0.580001178480938, + "grad_norm": 6.422845363616943, + "learning_rate": 1.613823625558452e-05, + "loss": 1.5891, + "step": 92280 + }, + { + "epoch": 0.5800640307976351, + "grad_norm": 6.47426176071167, + "learning_rate": 1.6137817154639867e-05, + "loss": 1.6937, + "step": 92290 + }, + { + "epoch": 0.5801268831143322, + "grad_norm": 
6.403764724731445, + "learning_rate": 1.6137398053695214e-05, + "loss": 1.3126, + "step": 92300 + }, + { + "epoch": 0.5801897354310294, + "grad_norm": 7.819459438323975, + "learning_rate": 1.613697895275056e-05, + "loss": 1.6318, + "step": 92310 + }, + { + "epoch": 0.5802525877477265, + "grad_norm": 6.455129623413086, + "learning_rate": 1.6136559851805905e-05, + "loss": 1.636, + "step": 92320 + }, + { + "epoch": 0.5803154400644236, + "grad_norm": 7.5750346183776855, + "learning_rate": 1.6136140750861252e-05, + "loss": 1.7812, + "step": 92330 + }, + { + "epoch": 0.5803782923811207, + "grad_norm": 6.966227054595947, + "learning_rate": 1.61357216499166e-05, + "loss": 1.8077, + "step": 92340 + }, + { + "epoch": 0.5804411446978178, + "grad_norm": 5.989964485168457, + "learning_rate": 1.6135302548971946e-05, + "loss": 1.5901, + "step": 92350 + }, + { + "epoch": 0.5805039970145149, + "grad_norm": 6.2141313552856445, + "learning_rate": 1.6134883448027293e-05, + "loss": 1.7269, + "step": 92360 + }, + { + "epoch": 0.580566849331212, + "grad_norm": 7.187416076660156, + "learning_rate": 1.613446434708264e-05, + "loss": 1.5185, + "step": 92370 + }, + { + "epoch": 0.5806297016479092, + "grad_norm": 7.019697666168213, + "learning_rate": 1.6134045246137988e-05, + "loss": 1.609, + "step": 92380 + }, + { + "epoch": 0.5806925539646063, + "grad_norm": 5.8373918533325195, + "learning_rate": 1.613362614519333e-05, + "loss": 1.5661, + "step": 92390 + }, + { + "epoch": 0.5807554062813034, + "grad_norm": 6.427718162536621, + "learning_rate": 1.613320704424868e-05, + "loss": 1.5865, + "step": 92400 + }, + { + "epoch": 0.5808182585980005, + "grad_norm": 6.802637577056885, + "learning_rate": 1.6132787943304025e-05, + "loss": 1.6558, + "step": 92410 + }, + { + "epoch": 0.5808811109146976, + "grad_norm": 8.224874496459961, + "learning_rate": 1.6132368842359373e-05, + "loss": 1.6988, + "step": 92420 + }, + { + "epoch": 0.5809439632313947, + "grad_norm": 6.821275234222412, + "learning_rate": 
1.613194974141472e-05, + "loss": 1.8426, + "step": 92430 + }, + { + "epoch": 0.5810068155480919, + "grad_norm": 7.150043487548828, + "learning_rate": 1.6131530640470067e-05, + "loss": 1.7512, + "step": 92440 + }, + { + "epoch": 0.581069667864789, + "grad_norm": 6.87367057800293, + "learning_rate": 1.613111153952541e-05, + "loss": 1.6349, + "step": 92450 + }, + { + "epoch": 0.5811325201814861, + "grad_norm": 7.385997295379639, + "learning_rate": 1.6130692438580757e-05, + "loss": 1.6678, + "step": 92460 + }, + { + "epoch": 0.5811953724981832, + "grad_norm": 7.349902153015137, + "learning_rate": 1.6130273337636104e-05, + "loss": 1.6892, + "step": 92470 + }, + { + "epoch": 0.5812582248148803, + "grad_norm": 5.5534234046936035, + "learning_rate": 1.612985423669145e-05, + "loss": 1.7053, + "step": 92480 + }, + { + "epoch": 0.5813210771315774, + "grad_norm": 7.304937839508057, + "learning_rate": 1.61294351357468e-05, + "loss": 1.7921, + "step": 92490 + }, + { + "epoch": 0.5813839294482746, + "grad_norm": 5.410800457000732, + "learning_rate": 1.6129016034802142e-05, + "loss": 1.5984, + "step": 92500 + }, + { + "epoch": 0.5814467817649717, + "grad_norm": 6.12760066986084, + "learning_rate": 1.612859693385749e-05, + "loss": 1.6574, + "step": 92510 + }, + { + "epoch": 0.5815096340816688, + "grad_norm": 7.921392440795898, + "learning_rate": 1.6128177832912836e-05, + "loss": 1.8178, + "step": 92520 + }, + { + "epoch": 0.5815724863983658, + "grad_norm": 6.937547206878662, + "learning_rate": 1.6127758731968184e-05, + "loss": 1.7, + "step": 92530 + }, + { + "epoch": 0.5816353387150629, + "grad_norm": 6.305153846740723, + "learning_rate": 1.6127339631023527e-05, + "loss": 1.8536, + "step": 92540 + }, + { + "epoch": 0.58169819103176, + "grad_norm": 6.381948471069336, + "learning_rate": 1.6126920530078874e-05, + "loss": 1.6026, + "step": 92550 + }, + { + "epoch": 0.5817610433484571, + "grad_norm": 6.949012279510498, + "learning_rate": 1.612650142913422e-05, + "loss": 1.7281, + 
"step": 92560 + }, + { + "epoch": 0.5818238956651542, + "grad_norm": 6.717901706695557, + "learning_rate": 1.612608232818957e-05, + "loss": 1.7287, + "step": 92570 + }, + { + "epoch": 0.5818867479818514, + "grad_norm": 6.39068603515625, + "learning_rate": 1.6125663227244915e-05, + "loss": 1.6692, + "step": 92580 + }, + { + "epoch": 0.5819496002985485, + "grad_norm": 7.127432346343994, + "learning_rate": 1.6125244126300263e-05, + "loss": 1.676, + "step": 92590 + }, + { + "epoch": 0.5820124526152456, + "grad_norm": 6.248184680938721, + "learning_rate": 1.612482502535561e-05, + "loss": 1.5538, + "step": 92600 + }, + { + "epoch": 0.5820753049319427, + "grad_norm": 6.269472599029541, + "learning_rate": 1.6124405924410957e-05, + "loss": 1.6206, + "step": 92610 + }, + { + "epoch": 0.5821381572486398, + "grad_norm": 6.745823860168457, + "learning_rate": 1.6123986823466304e-05, + "loss": 1.6224, + "step": 92620 + }, + { + "epoch": 0.5822010095653369, + "grad_norm": 6.95557975769043, + "learning_rate": 1.6123567722521647e-05, + "loss": 1.7648, + "step": 92630 + }, + { + "epoch": 0.582263861882034, + "grad_norm": 5.934445381164551, + "learning_rate": 1.6123148621576995e-05, + "loss": 1.3874, + "step": 92640 + }, + { + "epoch": 0.5823267141987312, + "grad_norm": 6.739503860473633, + "learning_rate": 1.612272952063234e-05, + "loss": 1.6298, + "step": 92650 + }, + { + "epoch": 0.5823895665154283, + "grad_norm": 5.960019588470459, + "learning_rate": 1.612231041968769e-05, + "loss": 1.7667, + "step": 92660 + }, + { + "epoch": 0.5824524188321254, + "grad_norm": 5.629377365112305, + "learning_rate": 1.6121891318743036e-05, + "loss": 1.6309, + "step": 92670 + }, + { + "epoch": 0.5825152711488225, + "grad_norm": 6.700377464294434, + "learning_rate": 1.612147221779838e-05, + "loss": 1.6084, + "step": 92680 + }, + { + "epoch": 0.5825781234655196, + "grad_norm": 7.131255626678467, + "learning_rate": 1.6121053116853726e-05, + "loss": 1.5905, + "step": 92690 + }, + { + "epoch": 
0.5826409757822167, + "grad_norm": 6.773268222808838, + "learning_rate": 1.6120634015909074e-05, + "loss": 1.6531, + "step": 92700 + }, + { + "epoch": 0.5827038280989139, + "grad_norm": 7.491827964782715, + "learning_rate": 1.612021491496442e-05, + "loss": 1.5632, + "step": 92710 + }, + { + "epoch": 0.582766680415611, + "grad_norm": 6.638195037841797, + "learning_rate": 1.6119795814019764e-05, + "loss": 1.6553, + "step": 92720 + }, + { + "epoch": 0.5828295327323081, + "grad_norm": 6.352215766906738, + "learning_rate": 1.611937671307511e-05, + "loss": 1.6827, + "step": 92730 + }, + { + "epoch": 0.5828923850490052, + "grad_norm": 6.999631881713867, + "learning_rate": 1.611895761213046e-05, + "loss": 1.8146, + "step": 92740 + }, + { + "epoch": 0.5829552373657023, + "grad_norm": 6.245862007141113, + "learning_rate": 1.6118538511185806e-05, + "loss": 1.5633, + "step": 92750 + }, + { + "epoch": 0.5830180896823994, + "grad_norm": 6.060407638549805, + "learning_rate": 1.611811941024115e-05, + "loss": 1.5717, + "step": 92760 + }, + { + "epoch": 0.5830809419990965, + "grad_norm": 6.836421966552734, + "learning_rate": 1.6117700309296496e-05, + "loss": 1.7283, + "step": 92770 + }, + { + "epoch": 0.5831437943157937, + "grad_norm": 8.111465454101562, + "learning_rate": 1.6117281208351843e-05, + "loss": 1.886, + "step": 92780 + }, + { + "epoch": 0.5832066466324907, + "grad_norm": 6.941701412200928, + "learning_rate": 1.611686210740719e-05, + "loss": 1.7525, + "step": 92790 + }, + { + "epoch": 0.5832694989491878, + "grad_norm": 7.790984153747559, + "learning_rate": 1.6116443006462537e-05, + "loss": 2.0233, + "step": 92800 + }, + { + "epoch": 0.5833323512658849, + "grad_norm": 6.319355487823486, + "learning_rate": 1.6116023905517885e-05, + "loss": 1.7986, + "step": 92810 + }, + { + "epoch": 0.583395203582582, + "grad_norm": 6.105554580688477, + "learning_rate": 1.611560480457323e-05, + "loss": 1.6814, + "step": 92820 + }, + { + "epoch": 0.5834580558992791, + "grad_norm": 
6.465337753295898, + "learning_rate": 1.611518570362858e-05, + "loss": 1.7728, + "step": 92830 + }, + { + "epoch": 0.5835209082159762, + "grad_norm": 7.23231840133667, + "learning_rate": 1.6114766602683926e-05, + "loss": 1.673, + "step": 92840 + }, + { + "epoch": 0.5835837605326734, + "grad_norm": 6.262798309326172, + "learning_rate": 1.611434750173927e-05, + "loss": 1.5981, + "step": 92850 + }, + { + "epoch": 0.5836466128493705, + "grad_norm": 6.214344024658203, + "learning_rate": 1.6113928400794617e-05, + "loss": 1.7211, + "step": 92860 + }, + { + "epoch": 0.5837094651660676, + "grad_norm": 7.631073474884033, + "learning_rate": 1.6113509299849964e-05, + "loss": 1.6097, + "step": 92870 + }, + { + "epoch": 0.5837723174827647, + "grad_norm": 6.744692802429199, + "learning_rate": 1.611309019890531e-05, + "loss": 1.6152, + "step": 92880 + }, + { + "epoch": 0.5838351697994618, + "grad_norm": 6.149967670440674, + "learning_rate": 1.6112671097960658e-05, + "loss": 1.6721, + "step": 92890 + }, + { + "epoch": 0.5838980221161589, + "grad_norm": 7.210057735443115, + "learning_rate": 1.6112251997016e-05, + "loss": 1.8445, + "step": 92900 + }, + { + "epoch": 0.583960874432856, + "grad_norm": 7.1479387283325195, + "learning_rate": 1.611183289607135e-05, + "loss": 1.5608, + "step": 92910 + }, + { + "epoch": 0.5840237267495532, + "grad_norm": 5.437380790710449, + "learning_rate": 1.6111413795126696e-05, + "loss": 1.6195, + "step": 92920 + }, + { + "epoch": 0.5840865790662503, + "grad_norm": 6.878098011016846, + "learning_rate": 1.6110994694182043e-05, + "loss": 1.7123, + "step": 92930 + }, + { + "epoch": 0.5841494313829474, + "grad_norm": 7.800525665283203, + "learning_rate": 1.6110575593237386e-05, + "loss": 1.7621, + "step": 92940 + }, + { + "epoch": 0.5842122836996445, + "grad_norm": 6.723679065704346, + "learning_rate": 1.6110156492292733e-05, + "loss": 1.5929, + "step": 92950 + }, + { + "epoch": 0.5842751360163416, + "grad_norm": 7.580997943878174, + "learning_rate": 
1.610973739134808e-05, + "loss": 1.6705, + "step": 92960 + }, + { + "epoch": 0.5843379883330387, + "grad_norm": 6.25993537902832, + "learning_rate": 1.6109318290403428e-05, + "loss": 1.5129, + "step": 92970 + }, + { + "epoch": 0.5844008406497359, + "grad_norm": 8.702542304992676, + "learning_rate": 1.6108899189458775e-05, + "loss": 1.8442, + "step": 92980 + }, + { + "epoch": 0.584463692966433, + "grad_norm": 6.514962196350098, + "learning_rate": 1.610848008851412e-05, + "loss": 1.6069, + "step": 92990 + }, + { + "epoch": 0.5845265452831301, + "grad_norm": 5.985245704650879, + "learning_rate": 1.610806098756947e-05, + "loss": 1.6856, + "step": 93000 + }, + { + "epoch": 0.5845893975998272, + "grad_norm": 7.467013835906982, + "learning_rate": 1.6107641886624812e-05, + "loss": 1.7394, + "step": 93010 + }, + { + "epoch": 0.5846522499165243, + "grad_norm": 6.320962429046631, + "learning_rate": 1.610722278568016e-05, + "loss": 1.6715, + "step": 93020 + }, + { + "epoch": 0.5847151022332214, + "grad_norm": 6.738753795623779, + "learning_rate": 1.6106803684735507e-05, + "loss": 1.7898, + "step": 93030 + }, + { + "epoch": 0.5847779545499185, + "grad_norm": 6.880959510803223, + "learning_rate": 1.6106384583790854e-05, + "loss": 1.6081, + "step": 93040 + }, + { + "epoch": 0.5848408068666155, + "grad_norm": 6.13414192199707, + "learning_rate": 1.61059654828462e-05, + "loss": 1.7875, + "step": 93050 + }, + { + "epoch": 0.5849036591833127, + "grad_norm": 6.073904514312744, + "learning_rate": 1.6105546381901548e-05, + "loss": 1.6608, + "step": 93060 + }, + { + "epoch": 0.5849665115000098, + "grad_norm": 5.76240348815918, + "learning_rate": 1.610512728095689e-05, + "loss": 1.7652, + "step": 93070 + }, + { + "epoch": 0.5850293638167069, + "grad_norm": 5.705706596374512, + "learning_rate": 1.610470818001224e-05, + "loss": 1.5047, + "step": 93080 + }, + { + "epoch": 0.585092216133404, + "grad_norm": 7.057455062866211, + "learning_rate": 1.6104289079067586e-05, + "loss": 1.6391, + 
"step": 93090 + }, + { + "epoch": 0.5851550684501011, + "grad_norm": 5.5291748046875, + "learning_rate": 1.6103869978122933e-05, + "loss": 1.3893, + "step": 93100 + }, + { + "epoch": 0.5852179207667982, + "grad_norm": 7.446898460388184, + "learning_rate": 1.610345087717828e-05, + "loss": 1.4326, + "step": 93110 + }, + { + "epoch": 0.5852807730834954, + "grad_norm": 6.40448522567749, + "learning_rate": 1.6103031776233623e-05, + "loss": 1.5855, + "step": 93120 + }, + { + "epoch": 0.5853436254001925, + "grad_norm": 7.22370719909668, + "learning_rate": 1.610261267528897e-05, + "loss": 1.5817, + "step": 93130 + }, + { + "epoch": 0.5854064777168896, + "grad_norm": 5.670430660247803, + "learning_rate": 1.6102193574344318e-05, + "loss": 1.6276, + "step": 93140 + }, + { + "epoch": 0.5854693300335867, + "grad_norm": 7.9834089279174805, + "learning_rate": 1.6101774473399665e-05, + "loss": 1.6822, + "step": 93150 + }, + { + "epoch": 0.5855321823502838, + "grad_norm": 6.9317827224731445, + "learning_rate": 1.610135537245501e-05, + "loss": 1.8368, + "step": 93160 + }, + { + "epoch": 0.5855950346669809, + "grad_norm": 7.470687389373779, + "learning_rate": 1.6100936271510355e-05, + "loss": 1.4557, + "step": 93170 + }, + { + "epoch": 0.585657886983678, + "grad_norm": 8.429841995239258, + "learning_rate": 1.6100517170565702e-05, + "loss": 1.7588, + "step": 93180 + }, + { + "epoch": 0.5857207393003752, + "grad_norm": 6.873202323913574, + "learning_rate": 1.610009806962105e-05, + "loss": 1.6446, + "step": 93190 + }, + { + "epoch": 0.5857835916170723, + "grad_norm": 8.124137878417969, + "learning_rate": 1.6099678968676397e-05, + "loss": 1.7304, + "step": 93200 + }, + { + "epoch": 0.5858464439337694, + "grad_norm": 5.538909435272217, + "learning_rate": 1.6099259867731744e-05, + "loss": 1.7024, + "step": 93210 + }, + { + "epoch": 0.5859092962504665, + "grad_norm": 6.4878034591674805, + "learning_rate": 1.609884076678709e-05, + "loss": 1.7013, + "step": 93220 + }, + { + "epoch": 
0.5859721485671636, + "grad_norm": 8.979625701904297, + "learning_rate": 1.6098421665842438e-05, + "loss": 1.5524, + "step": 93230 + }, + { + "epoch": 0.5860350008838607, + "grad_norm": 7.3275861740112305, + "learning_rate": 1.6098002564897785e-05, + "loss": 1.687, + "step": 93240 + }, + { + "epoch": 0.5860978532005579, + "grad_norm": 7.95927619934082, + "learning_rate": 1.609758346395313e-05, + "loss": 1.6542, + "step": 93250 + }, + { + "epoch": 0.586160705517255, + "grad_norm": 7.133495807647705, + "learning_rate": 1.6097164363008476e-05, + "loss": 1.6232, + "step": 93260 + }, + { + "epoch": 0.5862235578339521, + "grad_norm": 6.226972579956055, + "learning_rate": 1.6096745262063823e-05, + "loss": 1.5965, + "step": 93270 + }, + { + "epoch": 0.5862864101506492, + "grad_norm": 8.452329635620117, + "learning_rate": 1.609632616111917e-05, + "loss": 1.8258, + "step": 93280 + }, + { + "epoch": 0.5863492624673463, + "grad_norm": 7.127265930175781, + "learning_rate": 1.6095907060174517e-05, + "loss": 1.6476, + "step": 93290 + }, + { + "epoch": 0.5864121147840433, + "grad_norm": 7.24031400680542, + "learning_rate": 1.609548795922986e-05, + "loss": 1.8477, + "step": 93300 + }, + { + "epoch": 0.5864749671007404, + "grad_norm": 6.776553630828857, + "learning_rate": 1.6095068858285208e-05, + "loss": 1.5119, + "step": 93310 + }, + { + "epoch": 0.5865378194174375, + "grad_norm": 7.938908100128174, + "learning_rate": 1.6094649757340555e-05, + "loss": 1.7587, + "step": 93320 + }, + { + "epoch": 0.5866006717341347, + "grad_norm": 6.02783727645874, + "learning_rate": 1.6094230656395902e-05, + "loss": 1.6484, + "step": 93330 + }, + { + "epoch": 0.5866635240508318, + "grad_norm": 5.759817600250244, + "learning_rate": 1.6093811555451245e-05, + "loss": 1.5388, + "step": 93340 + }, + { + "epoch": 0.5867263763675289, + "grad_norm": 6.911100387573242, + "learning_rate": 1.6093392454506593e-05, + "loss": 1.5686, + "step": 93350 + }, + { + "epoch": 0.586789228684226, + "grad_norm": 
6.46895170211792, + "learning_rate": 1.609297335356194e-05, + "loss": 1.6428, + "step": 93360 + }, + { + "epoch": 0.5868520810009231, + "grad_norm": 7.600230693817139, + "learning_rate": 1.6092554252617287e-05, + "loss": 1.5964, + "step": 93370 + }, + { + "epoch": 0.5869149333176202, + "grad_norm": 6.355671405792236, + "learning_rate": 1.6092135151672634e-05, + "loss": 1.5837, + "step": 93380 + }, + { + "epoch": 0.5869777856343174, + "grad_norm": 7.148404598236084, + "learning_rate": 1.6091716050727977e-05, + "loss": 1.7842, + "step": 93390 + }, + { + "epoch": 0.5870406379510145, + "grad_norm": 6.771876335144043, + "learning_rate": 1.6091296949783324e-05, + "loss": 1.488, + "step": 93400 + }, + { + "epoch": 0.5871034902677116, + "grad_norm": 8.27407169342041, + "learning_rate": 1.609087784883867e-05, + "loss": 1.4837, + "step": 93410 + }, + { + "epoch": 0.5871663425844087, + "grad_norm": 5.6078972816467285, + "learning_rate": 1.609045874789402e-05, + "loss": 1.6057, + "step": 93420 + }, + { + "epoch": 0.5872291949011058, + "grad_norm": 7.58414888381958, + "learning_rate": 1.6090039646949366e-05, + "loss": 1.5732, + "step": 93430 + }, + { + "epoch": 0.5872920472178029, + "grad_norm": 6.562372207641602, + "learning_rate": 1.6089620546004713e-05, + "loss": 1.8322, + "step": 93440 + }, + { + "epoch": 0.5873548995345, + "grad_norm": 6.939548969268799, + "learning_rate": 1.608920144506006e-05, + "loss": 1.4416, + "step": 93450 + }, + { + "epoch": 0.5874177518511972, + "grad_norm": 6.7253570556640625, + "learning_rate": 1.6088782344115407e-05, + "loss": 1.6307, + "step": 93460 + }, + { + "epoch": 0.5874806041678943, + "grad_norm": 6.343280792236328, + "learning_rate": 1.608836324317075e-05, + "loss": 1.7455, + "step": 93470 + }, + { + "epoch": 0.5875434564845914, + "grad_norm": 6.016113758087158, + "learning_rate": 1.6087944142226098e-05, + "loss": 1.6355, + "step": 93480 + }, + { + "epoch": 0.5876063088012885, + "grad_norm": 7.350913047790527, + "learning_rate": 
1.6087525041281445e-05, + "loss": 1.5951, + "step": 93490 + }, + { + "epoch": 0.5876691611179856, + "grad_norm": 6.3639092445373535, + "learning_rate": 1.6087105940336792e-05, + "loss": 1.6209, + "step": 93500 + }, + { + "epoch": 0.5877320134346827, + "grad_norm": 6.527315616607666, + "learning_rate": 1.608668683939214e-05, + "loss": 1.6205, + "step": 93510 + }, + { + "epoch": 0.5877948657513798, + "grad_norm": 6.714054584503174, + "learning_rate": 1.6086267738447483e-05, + "loss": 1.5192, + "step": 93520 + }, + { + "epoch": 0.587857718068077, + "grad_norm": 6.581670761108398, + "learning_rate": 1.608584863750283e-05, + "loss": 1.7955, + "step": 93530 + }, + { + "epoch": 0.5879205703847741, + "grad_norm": 7.485185623168945, + "learning_rate": 1.6085429536558177e-05, + "loss": 1.6857, + "step": 93540 + }, + { + "epoch": 0.5879834227014712, + "grad_norm": 6.0929765701293945, + "learning_rate": 1.6085010435613524e-05, + "loss": 1.4265, + "step": 93550 + }, + { + "epoch": 0.5880462750181682, + "grad_norm": 6.712284564971924, + "learning_rate": 1.6084591334668867e-05, + "loss": 1.8785, + "step": 93560 + }, + { + "epoch": 0.5881091273348653, + "grad_norm": 8.242375373840332, + "learning_rate": 1.6084172233724215e-05, + "loss": 1.7111, + "step": 93570 + }, + { + "epoch": 0.5881719796515624, + "grad_norm": 7.200745582580566, + "learning_rate": 1.608375313277956e-05, + "loss": 1.6196, + "step": 93580 + }, + { + "epoch": 0.5882348319682595, + "grad_norm": 6.68762731552124, + "learning_rate": 1.608333403183491e-05, + "loss": 1.3562, + "step": 93590 + }, + { + "epoch": 0.5882976842849567, + "grad_norm": 6.237014293670654, + "learning_rate": 1.6082914930890256e-05, + "loss": 1.5764, + "step": 93600 + }, + { + "epoch": 0.5883605366016538, + "grad_norm": 7.350156307220459, + "learning_rate": 1.6082495829945603e-05, + "loss": 1.4858, + "step": 93610 + }, + { + "epoch": 0.5884233889183509, + "grad_norm": 6.49483585357666, + "learning_rate": 1.608207672900095e-05, + "loss": 1.4508, 
+ "step": 93620 + }, + { + "epoch": 0.588486241235048, + "grad_norm": 6.597403049468994, + "learning_rate": 1.6081657628056297e-05, + "loss": 1.7434, + "step": 93630 + }, + { + "epoch": 0.5885490935517451, + "grad_norm": 6.692638397216797, + "learning_rate": 1.608123852711164e-05, + "loss": 1.728, + "step": 93640 + }, + { + "epoch": 0.5886119458684422, + "grad_norm": 6.858451843261719, + "learning_rate": 1.6080819426166988e-05, + "loss": 1.5272, + "step": 93650 + }, + { + "epoch": 0.5886747981851393, + "grad_norm": 7.041205406188965, + "learning_rate": 1.6080400325222335e-05, + "loss": 1.5382, + "step": 93660 + }, + { + "epoch": 0.5887376505018365, + "grad_norm": 5.92672872543335, + "learning_rate": 1.6079981224277682e-05, + "loss": 1.6031, + "step": 93670 + }, + { + "epoch": 0.5888005028185336, + "grad_norm": 6.83441162109375, + "learning_rate": 1.607956212333303e-05, + "loss": 1.7608, + "step": 93680 + }, + { + "epoch": 0.5888633551352307, + "grad_norm": 6.780746936798096, + "learning_rate": 1.6079143022388376e-05, + "loss": 1.6298, + "step": 93690 + }, + { + "epoch": 0.5889262074519278, + "grad_norm": 6.454309940338135, + "learning_rate": 1.607872392144372e-05, + "loss": 1.6783, + "step": 93700 + }, + { + "epoch": 0.5889890597686249, + "grad_norm": 7.5611371994018555, + "learning_rate": 1.6078304820499067e-05, + "loss": 1.5631, + "step": 93710 + }, + { + "epoch": 0.589051912085322, + "grad_norm": 6.402606964111328, + "learning_rate": 1.6077885719554414e-05, + "loss": 1.6421, + "step": 93720 + }, + { + "epoch": 0.5891147644020192, + "grad_norm": 5.505886554718018, + "learning_rate": 1.607746661860976e-05, + "loss": 1.782, + "step": 93730 + }, + { + "epoch": 0.5891776167187163, + "grad_norm": 6.116056442260742, + "learning_rate": 1.6077047517665105e-05, + "loss": 1.7706, + "step": 93740 + }, + { + "epoch": 0.5892404690354134, + "grad_norm": 7.178369045257568, + "learning_rate": 1.607662841672045e-05, + "loss": 1.4497, + "step": 93750 + }, + { + "epoch": 
0.5893033213521105, + "grad_norm": 5.941424369812012, + "learning_rate": 1.60762093157758e-05, + "loss": 1.7454, + "step": 93760 + }, + { + "epoch": 0.5893661736688076, + "grad_norm": 21.873794555664062, + "learning_rate": 1.6075790214831146e-05, + "loss": 1.5727, + "step": 93770 + }, + { + "epoch": 0.5894290259855047, + "grad_norm": 6.942519187927246, + "learning_rate": 1.607537111388649e-05, + "loss": 1.776, + "step": 93780 + }, + { + "epoch": 0.5894918783022018, + "grad_norm": 6.549501419067383, + "learning_rate": 1.6074952012941837e-05, + "loss": 1.5357, + "step": 93790 + }, + { + "epoch": 0.589554730618899, + "grad_norm": 5.837566375732422, + "learning_rate": 1.6074532911997184e-05, + "loss": 1.6878, + "step": 93800 + }, + { + "epoch": 0.589617582935596, + "grad_norm": 7.793712139129639, + "learning_rate": 1.607411381105253e-05, + "loss": 1.6156, + "step": 93810 + }, + { + "epoch": 0.5896804352522931, + "grad_norm": 5.924585819244385, + "learning_rate": 1.6073694710107878e-05, + "loss": 1.5394, + "step": 93820 + }, + { + "epoch": 0.5897432875689902, + "grad_norm": 5.860642433166504, + "learning_rate": 1.6073275609163225e-05, + "loss": 1.7339, + "step": 93830 + }, + { + "epoch": 0.5898061398856873, + "grad_norm": 7.303297519683838, + "learning_rate": 1.6072856508218572e-05, + "loss": 1.5512, + "step": 93840 + }, + { + "epoch": 0.5898689922023844, + "grad_norm": 6.974435806274414, + "learning_rate": 1.607243740727392e-05, + "loss": 1.6649, + "step": 93850 + }, + { + "epoch": 0.5899318445190815, + "grad_norm": 6.702293395996094, + "learning_rate": 1.6072018306329266e-05, + "loss": 1.6807, + "step": 93860 + }, + { + "epoch": 0.5899946968357787, + "grad_norm": 6.116212844848633, + "learning_rate": 1.607159920538461e-05, + "loss": 1.5708, + "step": 93870 + }, + { + "epoch": 0.5900575491524758, + "grad_norm": 6.604289531707764, + "learning_rate": 1.6071180104439957e-05, + "loss": 1.626, + "step": 93880 + }, + { + "epoch": 0.5901204014691729, + "grad_norm": 
6.858876705169678, + "learning_rate": 1.6070761003495304e-05, + "loss": 1.5147, + "step": 93890 + }, + { + "epoch": 0.59018325378587, + "grad_norm": 5.5290446281433105, + "learning_rate": 1.607034190255065e-05, + "loss": 1.7127, + "step": 93900 + }, + { + "epoch": 0.5902461061025671, + "grad_norm": 5.859738349914551, + "learning_rate": 1.6069922801605998e-05, + "loss": 1.4923, + "step": 93910 + }, + { + "epoch": 0.5903089584192642, + "grad_norm": 6.121007442474365, + "learning_rate": 1.606950370066134e-05, + "loss": 1.4182, + "step": 93920 + }, + { + "epoch": 0.5903718107359613, + "grad_norm": 5.847887992858887, + "learning_rate": 1.606908459971669e-05, + "loss": 1.6474, + "step": 93930 + }, + { + "epoch": 0.5904346630526585, + "grad_norm": 6.725071907043457, + "learning_rate": 1.6068665498772036e-05, + "loss": 1.7763, + "step": 93940 + }, + { + "epoch": 0.5904975153693556, + "grad_norm": 6.250272750854492, + "learning_rate": 1.6068246397827383e-05, + "loss": 1.612, + "step": 93950 + }, + { + "epoch": 0.5905603676860527, + "grad_norm": 6.042320251464844, + "learning_rate": 1.6067827296882727e-05, + "loss": 1.7588, + "step": 93960 + }, + { + "epoch": 0.5906232200027498, + "grad_norm": 6.307615756988525, + "learning_rate": 1.6067408195938074e-05, + "loss": 1.843, + "step": 93970 + }, + { + "epoch": 0.5906860723194469, + "grad_norm": 7.286334991455078, + "learning_rate": 1.606698909499342e-05, + "loss": 1.7702, + "step": 93980 + }, + { + "epoch": 0.590748924636144, + "grad_norm": 7.581210136413574, + "learning_rate": 1.6066569994048768e-05, + "loss": 1.6187, + "step": 93990 + }, + { + "epoch": 0.5908117769528412, + "grad_norm": 6.826915740966797, + "learning_rate": 1.6066150893104115e-05, + "loss": 1.7507, + "step": 94000 + }, + { + "epoch": 0.5908746292695383, + "grad_norm": 6.500402450561523, + "learning_rate": 1.6065731792159462e-05, + "loss": 1.7423, + "step": 94010 + }, + { + "epoch": 0.5909374815862354, + "grad_norm": 6.2059197425842285, + "learning_rate": 
1.6065312691214806e-05, + "loss": 1.6294, + "step": 94020 + }, + { + "epoch": 0.5910003339029325, + "grad_norm": 7.754387855529785, + "learning_rate": 1.6064893590270153e-05, + "loss": 2.0808, + "step": 94030 + }, + { + "epoch": 0.5910631862196296, + "grad_norm": 6.315862655639648, + "learning_rate": 1.60644744893255e-05, + "loss": 1.6505, + "step": 94040 + }, + { + "epoch": 0.5911260385363267, + "grad_norm": 5.965827465057373, + "learning_rate": 1.6064055388380847e-05, + "loss": 1.8246, + "step": 94050 + }, + { + "epoch": 0.5911888908530238, + "grad_norm": 6.464879035949707, + "learning_rate": 1.6063636287436194e-05, + "loss": 1.8034, + "step": 94060 + }, + { + "epoch": 0.5912517431697208, + "grad_norm": 5.9997172355651855, + "learning_rate": 1.606321718649154e-05, + "loss": 1.682, + "step": 94070 + }, + { + "epoch": 0.591314595486418, + "grad_norm": 5.9515557289123535, + "learning_rate": 1.6062798085546888e-05, + "loss": 1.5475, + "step": 94080 + }, + { + "epoch": 0.5913774478031151, + "grad_norm": 6.149786949157715, + "learning_rate": 1.606237898460223e-05, + "loss": 1.6599, + "step": 94090 + }, + { + "epoch": 0.5914403001198122, + "grad_norm": 7.075897693634033, + "learning_rate": 1.606195988365758e-05, + "loss": 1.8625, + "step": 94100 + }, + { + "epoch": 0.5915031524365093, + "grad_norm": 5.6168365478515625, + "learning_rate": 1.6061540782712926e-05, + "loss": 1.5913, + "step": 94110 + }, + { + "epoch": 0.5915660047532064, + "grad_norm": 7.652287006378174, + "learning_rate": 1.6061121681768273e-05, + "loss": 1.8451, + "step": 94120 + }, + { + "epoch": 0.5916288570699035, + "grad_norm": 5.249066352844238, + "learning_rate": 1.606070258082362e-05, + "loss": 1.7546, + "step": 94130 + }, + { + "epoch": 0.5916917093866007, + "grad_norm": 7.445679187774658, + "learning_rate": 1.6060283479878964e-05, + "loss": 1.7411, + "step": 94140 + }, + { + "epoch": 0.5917545617032978, + "grad_norm": 7.669356822967529, + "learning_rate": 1.605986437893431e-05, + "loss": 1.7584, 
+ "step": 94150 + }, + { + "epoch": 0.5918174140199949, + "grad_norm": 6.608617305755615, + "learning_rate": 1.6059445277989658e-05, + "loss": 1.7146, + "step": 94160 + }, + { + "epoch": 0.591880266336692, + "grad_norm": 8.506745338439941, + "learning_rate": 1.6059026177045005e-05, + "loss": 1.6163, + "step": 94170 + }, + { + "epoch": 0.5919431186533891, + "grad_norm": 6.032199382781982, + "learning_rate": 1.605860707610035e-05, + "loss": 1.6001, + "step": 94180 + }, + { + "epoch": 0.5920059709700862, + "grad_norm": 7.232632637023926, + "learning_rate": 1.6058187975155696e-05, + "loss": 1.7259, + "step": 94190 + }, + { + "epoch": 0.5920688232867833, + "grad_norm": 6.042525768280029, + "learning_rate": 1.6057768874211043e-05, + "loss": 1.6663, + "step": 94200 + }, + { + "epoch": 0.5921316756034805, + "grad_norm": 7.102494716644287, + "learning_rate": 1.605734977326639e-05, + "loss": 1.5914, + "step": 94210 + }, + { + "epoch": 0.5921945279201776, + "grad_norm": 8.22027587890625, + "learning_rate": 1.6056930672321737e-05, + "loss": 1.6215, + "step": 94220 + }, + { + "epoch": 0.5922573802368747, + "grad_norm": 5.757334232330322, + "learning_rate": 1.6056511571377084e-05, + "loss": 1.4659, + "step": 94230 + }, + { + "epoch": 0.5923202325535718, + "grad_norm": 6.860438346862793, + "learning_rate": 1.605609247043243e-05, + "loss": 1.6507, + "step": 94240 + }, + { + "epoch": 0.5923830848702689, + "grad_norm": 6.159027099609375, + "learning_rate": 1.6055673369487778e-05, + "loss": 1.766, + "step": 94250 + }, + { + "epoch": 0.592445937186966, + "grad_norm": 8.29791259765625, + "learning_rate": 1.6055254268543122e-05, + "loss": 1.7368, + "step": 94260 + }, + { + "epoch": 0.5925087895036631, + "grad_norm": 6.134650707244873, + "learning_rate": 1.605483516759847e-05, + "loss": 2.0518, + "step": 94270 + }, + { + "epoch": 0.5925716418203603, + "grad_norm": 5.710776329040527, + "learning_rate": 1.6054416066653816e-05, + "loss": 1.5536, + "step": 94280 + }, + { + "epoch": 
0.5926344941370574, + "grad_norm": 7.502482891082764, + "learning_rate": 1.6053996965709163e-05, + "loss": 1.5654, + "step": 94290 + }, + { + "epoch": 0.5926973464537545, + "grad_norm": 6.617005348205566, + "learning_rate": 1.605357786476451e-05, + "loss": 1.5087, + "step": 94300 + }, + { + "epoch": 0.5927601987704516, + "grad_norm": 6.882579803466797, + "learning_rate": 1.6053158763819857e-05, + "loss": 1.5574, + "step": 94310 + }, + { + "epoch": 0.5928230510871486, + "grad_norm": 6.2495245933532715, + "learning_rate": 1.60527396628752e-05, + "loss": 1.6714, + "step": 94320 + }, + { + "epoch": 0.5928859034038457, + "grad_norm": 7.2485246658325195, + "learning_rate": 1.6052320561930548e-05, + "loss": 1.6908, + "step": 94330 + }, + { + "epoch": 0.5929487557205428, + "grad_norm": 6.475519180297852, + "learning_rate": 1.6051901460985895e-05, + "loss": 1.8505, + "step": 94340 + }, + { + "epoch": 0.59301160803724, + "grad_norm": 6.150235176086426, + "learning_rate": 1.6051482360041242e-05, + "loss": 1.5998, + "step": 94350 + }, + { + "epoch": 0.5930744603539371, + "grad_norm": 7.224484920501709, + "learning_rate": 1.6051063259096586e-05, + "loss": 1.5895, + "step": 94360 + }, + { + "epoch": 0.5931373126706342, + "grad_norm": 6.305262088775635, + "learning_rate": 1.6050644158151933e-05, + "loss": 1.5623, + "step": 94370 + }, + { + "epoch": 0.5932001649873313, + "grad_norm": 11.27811050415039, + "learning_rate": 1.605022505720728e-05, + "loss": 1.6536, + "step": 94380 + }, + { + "epoch": 0.5932630173040284, + "grad_norm": 5.835448741912842, + "learning_rate": 1.6049805956262627e-05, + "loss": 1.4614, + "step": 94390 + }, + { + "epoch": 0.5933258696207255, + "grad_norm": 6.761912822723389, + "learning_rate": 1.604938685531797e-05, + "loss": 1.6031, + "step": 94400 + }, + { + "epoch": 0.5933887219374226, + "grad_norm": 8.251826286315918, + "learning_rate": 1.6048967754373318e-05, + "loss": 1.8298, + "step": 94410 + }, + { + "epoch": 0.5934515742541198, + "grad_norm": 
7.148452281951904, + "learning_rate": 1.6048548653428665e-05, + "loss": 1.6455, + "step": 94420 + }, + { + "epoch": 0.5935144265708169, + "grad_norm": 6.777230739593506, + "learning_rate": 1.6048129552484012e-05, + "loss": 1.4488, + "step": 94430 + }, + { + "epoch": 0.593577278887514, + "grad_norm": 6.385249137878418, + "learning_rate": 1.604771045153936e-05, + "loss": 1.6662, + "step": 94440 + }, + { + "epoch": 0.5936401312042111, + "grad_norm": 6.66254186630249, + "learning_rate": 1.6047291350594706e-05, + "loss": 1.5689, + "step": 94450 + }, + { + "epoch": 0.5937029835209082, + "grad_norm": 6.666104793548584, + "learning_rate": 1.6046872249650053e-05, + "loss": 1.5781, + "step": 94460 + }, + { + "epoch": 0.5937658358376053, + "grad_norm": 6.386703968048096, + "learning_rate": 1.60464531487054e-05, + "loss": 1.6384, + "step": 94470 + }, + { + "epoch": 0.5938286881543025, + "grad_norm": 6.9321794509887695, + "learning_rate": 1.6046034047760747e-05, + "loss": 1.7958, + "step": 94480 + }, + { + "epoch": 0.5938915404709996, + "grad_norm": 5.967465877532959, + "learning_rate": 1.604561494681609e-05, + "loss": 1.605, + "step": 94490 + }, + { + "epoch": 0.5939543927876967, + "grad_norm": 7.165401458740234, + "learning_rate": 1.6045195845871438e-05, + "loss": 1.6437, + "step": 94500 + }, + { + "epoch": 0.5940172451043938, + "grad_norm": 6.284710884094238, + "learning_rate": 1.6044776744926785e-05, + "loss": 1.6488, + "step": 94510 + }, + { + "epoch": 0.5940800974210909, + "grad_norm": 5.183877944946289, + "learning_rate": 1.6044357643982132e-05, + "loss": 1.5784, + "step": 94520 + }, + { + "epoch": 0.594142949737788, + "grad_norm": 7.364722728729248, + "learning_rate": 1.604393854303748e-05, + "loss": 1.5545, + "step": 94530 + }, + { + "epoch": 0.5942058020544851, + "grad_norm": 6.855818271636963, + "learning_rate": 1.6043519442092823e-05, + "loss": 1.6052, + "step": 94540 + }, + { + "epoch": 0.5942686543711823, + "grad_norm": 6.377351760864258, + "learning_rate": 
1.604310034114817e-05, + "loss": 1.5647, + "step": 94550 + }, + { + "epoch": 0.5943315066878794, + "grad_norm": 5.653633117675781, + "learning_rate": 1.6042681240203517e-05, + "loss": 1.5609, + "step": 94560 + }, + { + "epoch": 0.5943943590045765, + "grad_norm": 7.1275763511657715, + "learning_rate": 1.6042262139258864e-05, + "loss": 1.437, + "step": 94570 + }, + { + "epoch": 0.5944572113212735, + "grad_norm": 6.853404998779297, + "learning_rate": 1.6041843038314208e-05, + "loss": 1.5824, + "step": 94580 + }, + { + "epoch": 0.5945200636379706, + "grad_norm": 8.112689018249512, + "learning_rate": 1.6041423937369555e-05, + "loss": 1.6326, + "step": 94590 + }, + { + "epoch": 0.5945829159546677, + "grad_norm": 5.817140579223633, + "learning_rate": 1.6041004836424902e-05, + "loss": 1.704, + "step": 94600 + }, + { + "epoch": 0.5946457682713648, + "grad_norm": 6.160879135131836, + "learning_rate": 1.604058573548025e-05, + "loss": 1.7157, + "step": 94610 + }, + { + "epoch": 0.594708620588062, + "grad_norm": 6.327479839324951, + "learning_rate": 1.6040166634535596e-05, + "loss": 1.7189, + "step": 94620 + }, + { + "epoch": 0.5947714729047591, + "grad_norm": 6.345967769622803, + "learning_rate": 1.6039747533590943e-05, + "loss": 1.7162, + "step": 94630 + }, + { + "epoch": 0.5948343252214562, + "grad_norm": 7.8932414054870605, + "learning_rate": 1.6039328432646287e-05, + "loss": 1.6821, + "step": 94640 + }, + { + "epoch": 0.5948971775381533, + "grad_norm": 5.700346946716309, + "learning_rate": 1.6038909331701634e-05, + "loss": 1.5322, + "step": 94650 + }, + { + "epoch": 0.5949600298548504, + "grad_norm": 6.52507209777832, + "learning_rate": 1.603849023075698e-05, + "loss": 1.722, + "step": 94660 + }, + { + "epoch": 0.5950228821715475, + "grad_norm": 6.896712303161621, + "learning_rate": 1.6038071129812328e-05, + "loss": 1.7281, + "step": 94670 + }, + { + "epoch": 0.5950857344882446, + "grad_norm": 6.592931270599365, + "learning_rate": 1.6037652028867675e-05, + "loss": 1.714, + 
"step": 94680 + }, + { + "epoch": 0.5951485868049418, + "grad_norm": 6.37777042388916, + "learning_rate": 1.6037232927923022e-05, + "loss": 1.6428, + "step": 94690 + }, + { + "epoch": 0.5952114391216389, + "grad_norm": 5.731248378753662, + "learning_rate": 1.603681382697837e-05, + "loss": 1.7142, + "step": 94700 + }, + { + "epoch": 0.595274291438336, + "grad_norm": 6.787526607513428, + "learning_rate": 1.6036394726033713e-05, + "loss": 1.448, + "step": 94710 + }, + { + "epoch": 0.5953371437550331, + "grad_norm": 7.775476932525635, + "learning_rate": 1.603597562508906e-05, + "loss": 1.5785, + "step": 94720 + }, + { + "epoch": 0.5953999960717302, + "grad_norm": 6.696002006530762, + "learning_rate": 1.6035556524144407e-05, + "loss": 1.6204, + "step": 94730 + }, + { + "epoch": 0.5954628483884273, + "grad_norm": 5.686933994293213, + "learning_rate": 1.6035137423199754e-05, + "loss": 1.4842, + "step": 94740 + }, + { + "epoch": 0.5955257007051244, + "grad_norm": 9.774834632873535, + "learning_rate": 1.60347183222551e-05, + "loss": 1.7541, + "step": 94750 + }, + { + "epoch": 0.5955885530218216, + "grad_norm": 5.845921039581299, + "learning_rate": 1.6034299221310445e-05, + "loss": 1.7277, + "step": 94760 + }, + { + "epoch": 0.5956514053385187, + "grad_norm": 7.821176528930664, + "learning_rate": 1.6033880120365792e-05, + "loss": 1.6146, + "step": 94770 + }, + { + "epoch": 0.5957142576552158, + "grad_norm": 7.6437482833862305, + "learning_rate": 1.603346101942114e-05, + "loss": 1.6718, + "step": 94780 + }, + { + "epoch": 0.5957771099719129, + "grad_norm": 6.291599273681641, + "learning_rate": 1.6033041918476486e-05, + "loss": 1.7452, + "step": 94790 + }, + { + "epoch": 0.59583996228861, + "grad_norm": 7.090104103088379, + "learning_rate": 1.603262281753183e-05, + "loss": 1.9078, + "step": 94800 + }, + { + "epoch": 0.5959028146053071, + "grad_norm": 7.310959815979004, + "learning_rate": 1.6032203716587177e-05, + "loss": 1.8254, + "step": 94810 + }, + { + "epoch": 
0.5959656669220043, + "grad_norm": 6.400932312011719, + "learning_rate": 1.6031784615642524e-05, + "loss": 1.6835, + "step": 94820 + }, + { + "epoch": 0.5960285192387013, + "grad_norm": 6.25166654586792, + "learning_rate": 1.603136551469787e-05, + "loss": 1.8227, + "step": 94830 + }, + { + "epoch": 0.5960913715553984, + "grad_norm": 6.510306358337402, + "learning_rate": 1.6030946413753218e-05, + "loss": 1.6051, + "step": 94840 + }, + { + "epoch": 0.5961542238720955, + "grad_norm": 7.328588485717773, + "learning_rate": 1.6030527312808565e-05, + "loss": 1.7419, + "step": 94850 + }, + { + "epoch": 0.5962170761887926, + "grad_norm": 6.656290054321289, + "learning_rate": 1.6030108211863912e-05, + "loss": 1.4313, + "step": 94860 + }, + { + "epoch": 0.5962799285054897, + "grad_norm": 6.53260612487793, + "learning_rate": 1.602968911091926e-05, + "loss": 1.6663, + "step": 94870 + }, + { + "epoch": 0.5963427808221868, + "grad_norm": 6.379949569702148, + "learning_rate": 1.6029270009974606e-05, + "loss": 1.6955, + "step": 94880 + }, + { + "epoch": 0.596405633138884, + "grad_norm": 7.013059616088867, + "learning_rate": 1.602885090902995e-05, + "loss": 1.6558, + "step": 94890 + }, + { + "epoch": 0.5964684854555811, + "grad_norm": 6.310605049133301, + "learning_rate": 1.6028431808085297e-05, + "loss": 1.6859, + "step": 94900 + }, + { + "epoch": 0.5965313377722782, + "grad_norm": 5.933061599731445, + "learning_rate": 1.6028012707140644e-05, + "loss": 1.6247, + "step": 94910 + }, + { + "epoch": 0.5965941900889753, + "grad_norm": 6.350223064422607, + "learning_rate": 1.602759360619599e-05, + "loss": 1.6963, + "step": 94920 + }, + { + "epoch": 0.5966570424056724, + "grad_norm": 7.618001461029053, + "learning_rate": 1.6027174505251338e-05, + "loss": 1.7173, + "step": 94930 + }, + { + "epoch": 0.5967198947223695, + "grad_norm": 6.708108901977539, + "learning_rate": 1.6026755404306682e-05, + "loss": 1.4079, + "step": 94940 + }, + { + "epoch": 0.5967827470390666, + "grad_norm": 
6.737509727478027, + "learning_rate": 1.602633630336203e-05, + "loss": 1.7482, + "step": 94950 + }, + { + "epoch": 0.5968455993557638, + "grad_norm": 6.866915225982666, + "learning_rate": 1.6025917202417376e-05, + "loss": 1.3871, + "step": 94960 + }, + { + "epoch": 0.5969084516724609, + "grad_norm": 7.664914131164551, + "learning_rate": 1.6025498101472723e-05, + "loss": 1.7733, + "step": 94970 + }, + { + "epoch": 0.596971303989158, + "grad_norm": 6.510303974151611, + "learning_rate": 1.6025079000528067e-05, + "loss": 1.7395, + "step": 94980 + }, + { + "epoch": 0.5970341563058551, + "grad_norm": 6.773441791534424, + "learning_rate": 1.6024659899583414e-05, + "loss": 1.5984, + "step": 94990 + }, + { + "epoch": 0.5970970086225522, + "grad_norm": 6.100302696228027, + "learning_rate": 1.602424079863876e-05, + "loss": 1.5435, + "step": 95000 + }, + { + "epoch": 0.5971598609392493, + "grad_norm": 7.015550136566162, + "learning_rate": 1.6023821697694108e-05, + "loss": 1.8034, + "step": 95010 + }, + { + "epoch": 0.5972227132559464, + "grad_norm": 7.05791711807251, + "learning_rate": 1.602340259674945e-05, + "loss": 1.6437, + "step": 95020 + }, + { + "epoch": 0.5972855655726436, + "grad_norm": 6.620450496673584, + "learning_rate": 1.60229834958048e-05, + "loss": 1.7381, + "step": 95030 + }, + { + "epoch": 0.5973484178893407, + "grad_norm": 6.6620941162109375, + "learning_rate": 1.6022564394860146e-05, + "loss": 1.4741, + "step": 95040 + }, + { + "epoch": 0.5974112702060378, + "grad_norm": 6.4213104248046875, + "learning_rate": 1.602218720400996e-05, + "loss": 1.549, + "step": 95050 + }, + { + "epoch": 0.5974741225227349, + "grad_norm": 6.880028247833252, + "learning_rate": 1.6021768103065304e-05, + "loss": 1.5293, + "step": 95060 + }, + { + "epoch": 0.597536974839432, + "grad_norm": 6.232072830200195, + "learning_rate": 1.602134900212065e-05, + "loss": 1.6536, + "step": 95070 + }, + { + "epoch": 0.5975998271561291, + "grad_norm": 6.660600662231445, + "learning_rate": 
1.6020929901176e-05, + "loss": 1.6193, + "step": 95080 + }, + { + "epoch": 0.5976626794728261, + "grad_norm": 6.547708988189697, + "learning_rate": 1.6020510800231345e-05, + "loss": 1.6307, + "step": 95090 + }, + { + "epoch": 0.5977255317895233, + "grad_norm": 6.666970729827881, + "learning_rate": 1.602009169928669e-05, + "loss": 1.6335, + "step": 95100 + }, + { + "epoch": 0.5977883841062204, + "grad_norm": 7.2162861824035645, + "learning_rate": 1.6019672598342036e-05, + "loss": 1.5506, + "step": 95110 + }, + { + "epoch": 0.5978512364229175, + "grad_norm": 6.301713466644287, + "learning_rate": 1.6019253497397383e-05, + "loss": 1.639, + "step": 95120 + }, + { + "epoch": 0.5979140887396146, + "grad_norm": 6.836359024047852, + "learning_rate": 1.601883439645273e-05, + "loss": 1.7094, + "step": 95130 + }, + { + "epoch": 0.5979769410563117, + "grad_norm": 6.2631144523620605, + "learning_rate": 1.6018415295508077e-05, + "loss": 1.561, + "step": 95140 + }, + { + "epoch": 0.5980397933730088, + "grad_norm": 7.471103668212891, + "learning_rate": 1.6017996194563424e-05, + "loss": 1.8125, + "step": 95150 + }, + { + "epoch": 0.598102645689706, + "grad_norm": 6.798772811889648, + "learning_rate": 1.601757709361877e-05, + "loss": 1.6111, + "step": 95160 + }, + { + "epoch": 0.5981654980064031, + "grad_norm": 6.729697227478027, + "learning_rate": 1.601715799267412e-05, + "loss": 1.6082, + "step": 95170 + }, + { + "epoch": 0.5982283503231002, + "grad_norm": 6.938941955566406, + "learning_rate": 1.6016738891729466e-05, + "loss": 1.299, + "step": 95180 + }, + { + "epoch": 0.5982912026397973, + "grad_norm": 7.27415657043457, + "learning_rate": 1.601631979078481e-05, + "loss": 1.5815, + "step": 95190 + }, + { + "epoch": 0.5983540549564944, + "grad_norm": 7.754593849182129, + "learning_rate": 1.6015900689840156e-05, + "loss": 1.7821, + "step": 95200 + }, + { + "epoch": 0.5984169072731915, + "grad_norm": 7.225983619689941, + "learning_rate": 1.6015481588895503e-05, + "loss": 1.7128, + 
"step": 95210 + }, + { + "epoch": 0.5984797595898886, + "grad_norm": 6.401730537414551, + "learning_rate": 1.601506248795085e-05, + "loss": 1.6446, + "step": 95220 + }, + { + "epoch": 0.5985426119065858, + "grad_norm": 7.39094352722168, + "learning_rate": 1.6014643387006194e-05, + "loss": 1.5911, + "step": 95230 + }, + { + "epoch": 0.5986054642232829, + "grad_norm": 8.610921859741211, + "learning_rate": 1.601422428606154e-05, + "loss": 1.6109, + "step": 95240 + }, + { + "epoch": 0.59866831653998, + "grad_norm": 6.567859649658203, + "learning_rate": 1.601380518511689e-05, + "loss": 1.7104, + "step": 95250 + }, + { + "epoch": 0.5987311688566771, + "grad_norm": 6.0750250816345215, + "learning_rate": 1.6013386084172235e-05, + "loss": 1.568, + "step": 95260 + }, + { + "epoch": 0.5987940211733742, + "grad_norm": 7.271208763122559, + "learning_rate": 1.6012966983227583e-05, + "loss": 1.8688, + "step": 95270 + }, + { + "epoch": 0.5988568734900713, + "grad_norm": 6.858160972595215, + "learning_rate": 1.6012547882282926e-05, + "loss": 1.6456, + "step": 95280 + }, + { + "epoch": 0.5989197258067684, + "grad_norm": 7.072115421295166, + "learning_rate": 1.6012128781338273e-05, + "loss": 1.6997, + "step": 95290 + }, + { + "epoch": 0.5989825781234656, + "grad_norm": 6.856001853942871, + "learning_rate": 1.601170968039362e-05, + "loss": 1.8486, + "step": 95300 + }, + { + "epoch": 0.5990454304401627, + "grad_norm": 6.55513334274292, + "learning_rate": 1.6011290579448967e-05, + "loss": 1.7358, + "step": 95310 + }, + { + "epoch": 0.5991082827568598, + "grad_norm": 6.499383926391602, + "learning_rate": 1.6010871478504314e-05, + "loss": 1.5296, + "step": 95320 + }, + { + "epoch": 0.5991711350735569, + "grad_norm": 6.556888580322266, + "learning_rate": 1.6010452377559658e-05, + "loss": 1.7428, + "step": 95330 + }, + { + "epoch": 0.5992339873902539, + "grad_norm": 6.718261241912842, + "learning_rate": 1.6010033276615005e-05, + "loss": 1.4371, + "step": 95340 + }, + { + "epoch": 
0.599296839706951, + "grad_norm": 6.831932544708252, + "learning_rate": 1.6009614175670352e-05, + "loss": 1.9143, + "step": 95350 + }, + { + "epoch": 0.5993596920236481, + "grad_norm": 7.682587146759033, + "learning_rate": 1.60091950747257e-05, + "loss": 1.6924, + "step": 95360 + }, + { + "epoch": 0.5994225443403453, + "grad_norm": 7.187410354614258, + "learning_rate": 1.6008775973781046e-05, + "loss": 1.5765, + "step": 95370 + }, + { + "epoch": 0.5994853966570424, + "grad_norm": 6.18812894821167, + "learning_rate": 1.6008356872836394e-05, + "loss": 1.5242, + "step": 95380 + }, + { + "epoch": 0.5995482489737395, + "grad_norm": 7.400139808654785, + "learning_rate": 1.600793777189174e-05, + "loss": 1.4055, + "step": 95390 + }, + { + "epoch": 0.5996111012904366, + "grad_norm": 7.379140853881836, + "learning_rate": 1.6007518670947088e-05, + "loss": 1.8885, + "step": 95400 + }, + { + "epoch": 0.5996739536071337, + "grad_norm": 7.497847557067871, + "learning_rate": 1.600709957000243e-05, + "loss": 1.7633, + "step": 95410 + }, + { + "epoch": 0.5997368059238308, + "grad_norm": 9.0486478805542, + "learning_rate": 1.600668046905778e-05, + "loss": 1.6708, + "step": 95420 + }, + { + "epoch": 0.5997996582405279, + "grad_norm": 7.280218124389648, + "learning_rate": 1.6006261368113125e-05, + "loss": 1.6665, + "step": 95430 + }, + { + "epoch": 0.5998625105572251, + "grad_norm": 8.033541679382324, + "learning_rate": 1.6005842267168473e-05, + "loss": 1.4409, + "step": 95440 + }, + { + "epoch": 0.5999253628739222, + "grad_norm": 6.502048969268799, + "learning_rate": 1.600542316622382e-05, + "loss": 1.7335, + "step": 95450 + }, + { + "epoch": 0.5999882151906193, + "grad_norm": 6.807484149932861, + "learning_rate": 1.6005004065279163e-05, + "loss": 1.689, + "step": 95460 + }, + { + "epoch": 0.6000510675073164, + "grad_norm": 7.029139041900635, + "learning_rate": 1.600458496433451e-05, + "loss": 1.674, + "step": 95470 + }, + { + "epoch": 0.6001139198240135, + "grad_norm": 
7.192809581756592, + "learning_rate": 1.6004165863389857e-05, + "loss": 1.8582, + "step": 95480 + }, + { + "epoch": 0.6001767721407106, + "grad_norm": 7.5557661056518555, + "learning_rate": 1.6003746762445205e-05, + "loss": 1.8241, + "step": 95490 + }, + { + "epoch": 0.6002396244574077, + "grad_norm": 7.017920017242432, + "learning_rate": 1.6003327661500548e-05, + "loss": 1.8574, + "step": 95500 + }, + { + "epoch": 0.6003024767741049, + "grad_norm": 7.448575496673584, + "learning_rate": 1.6002908560555895e-05, + "loss": 1.6413, + "step": 95510 + }, + { + "epoch": 0.600365329090802, + "grad_norm": 5.390271186828613, + "learning_rate": 1.6002489459611242e-05, + "loss": 1.4774, + "step": 95520 + }, + { + "epoch": 0.6004281814074991, + "grad_norm": 6.9333062171936035, + "learning_rate": 1.600207035866659e-05, + "loss": 1.6295, + "step": 95530 + }, + { + "epoch": 0.6004910337241962, + "grad_norm": 7.013101577758789, + "learning_rate": 1.6001651257721936e-05, + "loss": 1.464, + "step": 95540 + }, + { + "epoch": 0.6005538860408933, + "grad_norm": 6.239163398742676, + "learning_rate": 1.6001232156777284e-05, + "loss": 2.0033, + "step": 95550 + }, + { + "epoch": 0.6006167383575904, + "grad_norm": 7.4665069580078125, + "learning_rate": 1.600081305583263e-05, + "loss": 1.6143, + "step": 95560 + }, + { + "epoch": 0.6006795906742876, + "grad_norm": 6.792352199554443, + "learning_rate": 1.6000393954887978e-05, + "loss": 1.6973, + "step": 95570 + }, + { + "epoch": 0.6007424429909847, + "grad_norm": 7.63231086730957, + "learning_rate": 1.599997485394332e-05, + "loss": 1.7097, + "step": 95580 + }, + { + "epoch": 0.6008052953076818, + "grad_norm": 5.52643346786499, + "learning_rate": 1.599955575299867e-05, + "loss": 1.4769, + "step": 95590 + }, + { + "epoch": 0.6008681476243788, + "grad_norm": 6.46755313873291, + "learning_rate": 1.5999136652054016e-05, + "loss": 1.5199, + "step": 95600 + }, + { + "epoch": 0.6009309999410759, + "grad_norm": 6.4414896965026855, + "learning_rate": 
1.5998717551109363e-05, + "loss": 1.9104, + "step": 95610 + }, + { + "epoch": 0.600993852257773, + "grad_norm": 7.002188682556152, + "learning_rate": 1.599829845016471e-05, + "loss": 1.5368, + "step": 95620 + }, + { + "epoch": 0.6010567045744701, + "grad_norm": 6.919557094573975, + "learning_rate": 1.5997879349220053e-05, + "loss": 1.8452, + "step": 95630 + }, + { + "epoch": 0.6011195568911673, + "grad_norm": 8.59383487701416, + "learning_rate": 1.59974602482754e-05, + "loss": 2.0311, + "step": 95640 + }, + { + "epoch": 0.6011824092078644, + "grad_norm": 5.873063564300537, + "learning_rate": 1.5997041147330747e-05, + "loss": 1.7351, + "step": 95650 + }, + { + "epoch": 0.6012452615245615, + "grad_norm": 8.141200065612793, + "learning_rate": 1.5996622046386095e-05, + "loss": 1.766, + "step": 95660 + }, + { + "epoch": 0.6013081138412586, + "grad_norm": 6.853367328643799, + "learning_rate": 1.599620294544144e-05, + "loss": 1.8471, + "step": 95670 + }, + { + "epoch": 0.6013709661579557, + "grad_norm": 7.046663761138916, + "learning_rate": 1.5995783844496785e-05, + "loss": 1.8587, + "step": 95680 + }, + { + "epoch": 0.6014338184746528, + "grad_norm": 6.066128730773926, + "learning_rate": 1.5995364743552132e-05, + "loss": 1.4011, + "step": 95690 + }, + { + "epoch": 0.6014966707913499, + "grad_norm": 5.925566673278809, + "learning_rate": 1.599494564260748e-05, + "loss": 1.803, + "step": 95700 + }, + { + "epoch": 0.6015595231080471, + "grad_norm": 6.03173303604126, + "learning_rate": 1.5994526541662827e-05, + "loss": 1.6507, + "step": 95710 + }, + { + "epoch": 0.6016223754247442, + "grad_norm": 6.576630592346191, + "learning_rate": 1.599410744071817e-05, + "loss": 1.5175, + "step": 95720 + }, + { + "epoch": 0.6016852277414413, + "grad_norm": 7.683470726013184, + "learning_rate": 1.5993688339773517e-05, + "loss": 1.6615, + "step": 95730 + }, + { + "epoch": 0.6017480800581384, + "grad_norm": 6.241876602172852, + "learning_rate": 1.5993269238828864e-05, + "loss": 1.6691, + 
"step": 95740 + }, + { + "epoch": 0.6018109323748355, + "grad_norm": 6.379269599914551, + "learning_rate": 1.599285013788421e-05, + "loss": 1.5956, + "step": 95750 + }, + { + "epoch": 0.6018737846915326, + "grad_norm": 7.8299102783203125, + "learning_rate": 1.599243103693956e-05, + "loss": 1.7619, + "step": 95760 + }, + { + "epoch": 0.6019366370082297, + "grad_norm": 5.8008928298950195, + "learning_rate": 1.5992011935994906e-05, + "loss": 1.6117, + "step": 95770 + }, + { + "epoch": 0.6019994893249269, + "grad_norm": 5.89392614364624, + "learning_rate": 1.5991592835050253e-05, + "loss": 1.8185, + "step": 95780 + }, + { + "epoch": 0.602062341641624, + "grad_norm": 6.424180030822754, + "learning_rate": 1.59911737341056e-05, + "loss": 1.6598, + "step": 95790 + }, + { + "epoch": 0.6021251939583211, + "grad_norm": 6.174285411834717, + "learning_rate": 1.5990754633160947e-05, + "loss": 1.6731, + "step": 95800 + }, + { + "epoch": 0.6021880462750182, + "grad_norm": 6.065310001373291, + "learning_rate": 1.599033553221629e-05, + "loss": 1.7175, + "step": 95810 + }, + { + "epoch": 0.6022508985917153, + "grad_norm": 6.311932563781738, + "learning_rate": 1.5989916431271638e-05, + "loss": 1.7119, + "step": 95820 + }, + { + "epoch": 0.6023137509084124, + "grad_norm": 6.33814001083374, + "learning_rate": 1.5989497330326985e-05, + "loss": 1.5467, + "step": 95830 + }, + { + "epoch": 0.6023766032251096, + "grad_norm": 6.47866153717041, + "learning_rate": 1.598907822938233e-05, + "loss": 1.6905, + "step": 95840 + }, + { + "epoch": 0.6024394555418066, + "grad_norm": 6.675044536590576, + "learning_rate": 1.5988659128437675e-05, + "loss": 1.5809, + "step": 95850 + }, + { + "epoch": 0.6025023078585037, + "grad_norm": 5.273915767669678, + "learning_rate": 1.5988240027493022e-05, + "loss": 1.6373, + "step": 95860 + }, + { + "epoch": 0.6025651601752008, + "grad_norm": 7.633127689361572, + "learning_rate": 1.598782092654837e-05, + "loss": 1.6152, + "step": 95870 + }, + { + "epoch": 
0.6026280124918979, + "grad_norm": 6.930589199066162, + "learning_rate": 1.5987401825603717e-05, + "loss": 1.6847, + "step": 95880 + }, + { + "epoch": 0.602690864808595, + "grad_norm": 8.631400108337402, + "learning_rate": 1.5986982724659064e-05, + "loss": 1.5672, + "step": 95890 + }, + { + "epoch": 0.6027537171252921, + "grad_norm": 6.0152106285095215, + "learning_rate": 1.5986563623714407e-05, + "loss": 1.7107, + "step": 95900 + }, + { + "epoch": 0.6028165694419892, + "grad_norm": 6.5257158279418945, + "learning_rate": 1.5986144522769754e-05, + "loss": 1.5632, + "step": 95910 + }, + { + "epoch": 0.6028794217586864, + "grad_norm": 6.976310729980469, + "learning_rate": 1.59857254218251e-05, + "loss": 1.9114, + "step": 95920 + }, + { + "epoch": 0.6029422740753835, + "grad_norm": 6.798217296600342, + "learning_rate": 1.598530632088045e-05, + "loss": 1.6182, + "step": 95930 + }, + { + "epoch": 0.6030051263920806, + "grad_norm": 6.026243209838867, + "learning_rate": 1.5984887219935796e-05, + "loss": 1.6752, + "step": 95940 + }, + { + "epoch": 0.6030679787087777, + "grad_norm": 7.162676811218262, + "learning_rate": 1.5984468118991143e-05, + "loss": 1.7328, + "step": 95950 + }, + { + "epoch": 0.6031308310254748, + "grad_norm": 6.474308490753174, + "learning_rate": 1.5984049018046486e-05, + "loss": 1.6895, + "step": 95960 + }, + { + "epoch": 0.6031936833421719, + "grad_norm": 6.281386375427246, + "learning_rate": 1.5983629917101833e-05, + "loss": 1.6435, + "step": 95970 + }, + { + "epoch": 0.603256535658869, + "grad_norm": 7.172461986541748, + "learning_rate": 1.598321081615718e-05, + "loss": 1.6507, + "step": 95980 + }, + { + "epoch": 0.6033193879755662, + "grad_norm": 6.905472755432129, + "learning_rate": 1.5982791715212528e-05, + "loss": 1.5973, + "step": 95990 + }, + { + "epoch": 0.6033822402922633, + "grad_norm": 7.273667335510254, + "learning_rate": 1.5982372614267875e-05, + "loss": 1.4689, + "step": 96000 + }, + { + "epoch": 0.6034450926089604, + "grad_norm": 
6.713540077209473, + "learning_rate": 1.5981953513323222e-05, + "loss": 1.7759, + "step": 96010 + }, + { + "epoch": 0.6035079449256575, + "grad_norm": 7.233867168426514, + "learning_rate": 1.598153441237857e-05, + "loss": 1.671, + "step": 96020 + }, + { + "epoch": 0.6035707972423546, + "grad_norm": 7.417419910430908, + "learning_rate": 1.5981115311433912e-05, + "loss": 1.836, + "step": 96030 + }, + { + "epoch": 0.6036336495590517, + "grad_norm": 5.484426498413086, + "learning_rate": 1.598069621048926e-05, + "loss": 1.499, + "step": 96040 + }, + { + "epoch": 0.6036965018757489, + "grad_norm": 6.9189534187316895, + "learning_rate": 1.5980277109544607e-05, + "loss": 1.4944, + "step": 96050 + }, + { + "epoch": 0.603759354192446, + "grad_norm": 6.4656805992126465, + "learning_rate": 1.5979858008599954e-05, + "loss": 1.6776, + "step": 96060 + }, + { + "epoch": 0.6038222065091431, + "grad_norm": 7.165610313415527, + "learning_rate": 1.59794389076553e-05, + "loss": 1.911, + "step": 96070 + }, + { + "epoch": 0.6038850588258402, + "grad_norm": 6.205036163330078, + "learning_rate": 1.5979019806710644e-05, + "loss": 1.6976, + "step": 96080 + }, + { + "epoch": 0.6039479111425373, + "grad_norm": 6.048856258392334, + "learning_rate": 1.597860070576599e-05, + "loss": 1.3312, + "step": 96090 + }, + { + "epoch": 0.6040107634592344, + "grad_norm": 5.5724778175354, + "learning_rate": 1.597818160482134e-05, + "loss": 1.6017, + "step": 96100 + }, + { + "epoch": 0.6040736157759314, + "grad_norm": 6.729308605194092, + "learning_rate": 1.5977762503876686e-05, + "loss": 1.6707, + "step": 96110 + }, + { + "epoch": 0.6041364680926286, + "grad_norm": 6.612854957580566, + "learning_rate": 1.597734340293203e-05, + "loss": 1.596, + "step": 96120 + }, + { + "epoch": 0.6041993204093257, + "grad_norm": 7.262803554534912, + "learning_rate": 1.5976924301987376e-05, + "loss": 1.86, + "step": 96130 + }, + { + "epoch": 0.6042621727260228, + "grad_norm": 7.318043231964111, + "learning_rate": 
1.5976505201042723e-05, + "loss": 1.63, + "step": 96140 + }, + { + "epoch": 0.6043250250427199, + "grad_norm": 6.175392150878906, + "learning_rate": 1.597608610009807e-05, + "loss": 1.4979, + "step": 96150 + }, + { + "epoch": 0.604387877359417, + "grad_norm": 6.861042022705078, + "learning_rate": 1.5975666999153418e-05, + "loss": 1.6917, + "step": 96160 + }, + { + "epoch": 0.6044507296761141, + "grad_norm": 6.393370151519775, + "learning_rate": 1.5975247898208765e-05, + "loss": 1.6265, + "step": 96170 + }, + { + "epoch": 0.6045135819928112, + "grad_norm": 8.243154525756836, + "learning_rate": 1.5974828797264112e-05, + "loss": 1.6527, + "step": 96180 + }, + { + "epoch": 0.6045764343095084, + "grad_norm": 7.203034400939941, + "learning_rate": 1.597440969631946e-05, + "loss": 1.786, + "step": 96190 + }, + { + "epoch": 0.6046392866262055, + "grad_norm": 5.90932559967041, + "learning_rate": 1.5973990595374806e-05, + "loss": 1.7499, + "step": 96200 + }, + { + "epoch": 0.6047021389429026, + "grad_norm": 6.472553730010986, + "learning_rate": 1.597357149443015e-05, + "loss": 1.4541, + "step": 96210 + }, + { + "epoch": 0.6047649912595997, + "grad_norm": 6.357101917266846, + "learning_rate": 1.5973152393485497e-05, + "loss": 1.4883, + "step": 96220 + }, + { + "epoch": 0.6048278435762968, + "grad_norm": 6.183920383453369, + "learning_rate": 1.5972733292540844e-05, + "loss": 1.7241, + "step": 96230 + }, + { + "epoch": 0.6048906958929939, + "grad_norm": 7.03749942779541, + "learning_rate": 1.597231419159619e-05, + "loss": 1.624, + "step": 96240 + }, + { + "epoch": 0.604953548209691, + "grad_norm": 7.242808818817139, + "learning_rate": 1.5971895090651534e-05, + "loss": 1.4835, + "step": 96250 + }, + { + "epoch": 0.6050164005263882, + "grad_norm": 6.75843620300293, + "learning_rate": 1.597147598970688e-05, + "loss": 1.7205, + "step": 96260 + }, + { + "epoch": 0.6050792528430853, + "grad_norm": 7.639392852783203, + "learning_rate": 1.597105688876223e-05, + "loss": 1.7237, + "step": 
96270 + }, + { + "epoch": 0.6051421051597824, + "grad_norm": 6.621246814727783, + "learning_rate": 1.5970637787817576e-05, + "loss": 1.4742, + "step": 96280 + }, + { + "epoch": 0.6052049574764795, + "grad_norm": 7.6240458488464355, + "learning_rate": 1.5970218686872923e-05, + "loss": 1.6599, + "step": 96290 + }, + { + "epoch": 0.6052678097931766, + "grad_norm": 6.757303237915039, + "learning_rate": 1.5969799585928266e-05, + "loss": 1.7977, + "step": 96300 + }, + { + "epoch": 0.6053306621098737, + "grad_norm": 7.339069843292236, + "learning_rate": 1.5969380484983613e-05, + "loss": 1.497, + "step": 96310 + }, + { + "epoch": 0.6053935144265709, + "grad_norm": 6.622323513031006, + "learning_rate": 1.596896138403896e-05, + "loss": 1.5652, + "step": 96320 + }, + { + "epoch": 0.605456366743268, + "grad_norm": 7.0324273109436035, + "learning_rate": 1.5968542283094308e-05, + "loss": 1.6223, + "step": 96330 + }, + { + "epoch": 0.6055192190599651, + "grad_norm": 6.163375377655029, + "learning_rate": 1.596812318214965e-05, + "loss": 1.6602, + "step": 96340 + }, + { + "epoch": 0.6055820713766622, + "grad_norm": 6.083081245422363, + "learning_rate": 1.5967704081205e-05, + "loss": 1.6352, + "step": 96350 + }, + { + "epoch": 0.6056449236933592, + "grad_norm": 7.394678592681885, + "learning_rate": 1.5967284980260345e-05, + "loss": 1.6288, + "step": 96360 + }, + { + "epoch": 0.6057077760100563, + "grad_norm": 5.2517523765563965, + "learning_rate": 1.5966865879315693e-05, + "loss": 1.52, + "step": 96370 + }, + { + "epoch": 0.6057706283267534, + "grad_norm": 5.803227424621582, + "learning_rate": 1.596644677837104e-05, + "loss": 1.6651, + "step": 96380 + }, + { + "epoch": 0.6058334806434506, + "grad_norm": 6.1749701499938965, + "learning_rate": 1.5966027677426387e-05, + "loss": 1.4957, + "step": 96390 + }, + { + "epoch": 0.6058963329601477, + "grad_norm": 6.390466690063477, + "learning_rate": 1.5965608576481734e-05, + "loss": 1.4403, + "step": 96400 + }, + { + "epoch": 
0.6059591852768448, + "grad_norm": 7.035869598388672, + "learning_rate": 1.596518947553708e-05, + "loss": 1.5197, + "step": 96410 + }, + { + "epoch": 0.6060220375935419, + "grad_norm": 6.534717082977295, + "learning_rate": 1.5964770374592428e-05, + "loss": 1.9955, + "step": 96420 + }, + { + "epoch": 0.606084889910239, + "grad_norm": 6.712184429168701, + "learning_rate": 1.596435127364777e-05, + "loss": 1.4664, + "step": 96430 + }, + { + "epoch": 0.6061477422269361, + "grad_norm": 7.0679931640625, + "learning_rate": 1.596393217270312e-05, + "loss": 1.7205, + "step": 96440 + }, + { + "epoch": 0.6062105945436332, + "grad_norm": 9.562171936035156, + "learning_rate": 1.5963513071758466e-05, + "loss": 1.7928, + "step": 96450 + }, + { + "epoch": 0.6062734468603304, + "grad_norm": 4.552233695983887, + "learning_rate": 1.5963093970813813e-05, + "loss": 1.5129, + "step": 96460 + }, + { + "epoch": 0.6063362991770275, + "grad_norm": 6.2462263107299805, + "learning_rate": 1.5962674869869156e-05, + "loss": 1.674, + "step": 96470 + }, + { + "epoch": 0.6063991514937246, + "grad_norm": 5.8251237869262695, + "learning_rate": 1.5962255768924504e-05, + "loss": 1.6712, + "step": 96480 + }, + { + "epoch": 0.6064620038104217, + "grad_norm": 6.038680553436279, + "learning_rate": 1.596183666797985e-05, + "loss": 1.6022, + "step": 96490 + }, + { + "epoch": 0.6065248561271188, + "grad_norm": 6.039565086364746, + "learning_rate": 1.5961417567035198e-05, + "loss": 1.5418, + "step": 96500 + }, + { + "epoch": 0.6065877084438159, + "grad_norm": 6.144685745239258, + "learning_rate": 1.5960998466090545e-05, + "loss": 1.6655, + "step": 96510 + }, + { + "epoch": 0.606650560760513, + "grad_norm": 5.955746173858643, + "learning_rate": 1.596057936514589e-05, + "loss": 1.556, + "step": 96520 + }, + { + "epoch": 0.6067134130772102, + "grad_norm": 6.442933082580566, + "learning_rate": 1.5960160264201235e-05, + "loss": 1.5626, + "step": 96530 + }, + { + "epoch": 0.6067762653939073, + "grad_norm": 
6.719789981842041, + "learning_rate": 1.5959741163256583e-05, + "loss": 1.8061, + "step": 96540 + }, + { + "epoch": 0.6068391177106044, + "grad_norm": 7.188895225524902, + "learning_rate": 1.595932206231193e-05, + "loss": 1.6047, + "step": 96550 + }, + { + "epoch": 0.6069019700273015, + "grad_norm": 6.589751720428467, + "learning_rate": 1.5958902961367277e-05, + "loss": 1.5288, + "step": 96560 + }, + { + "epoch": 0.6069648223439986, + "grad_norm": 5.951982021331787, + "learning_rate": 1.5958483860422624e-05, + "loss": 1.6871, + "step": 96570 + }, + { + "epoch": 0.6070276746606957, + "grad_norm": 6.021017551422119, + "learning_rate": 1.595806475947797e-05, + "loss": 1.6616, + "step": 96580 + }, + { + "epoch": 0.6070905269773929, + "grad_norm": 7.280405044555664, + "learning_rate": 1.5957645658533315e-05, + "loss": 1.7106, + "step": 96590 + }, + { + "epoch": 0.60715337929409, + "grad_norm": 6.818925380706787, + "learning_rate": 1.595722655758866e-05, + "loss": 1.6977, + "step": 96600 + }, + { + "epoch": 0.6072162316107871, + "grad_norm": 5.315835475921631, + "learning_rate": 1.595680745664401e-05, + "loss": 1.7101, + "step": 96610 + }, + { + "epoch": 0.6072790839274841, + "grad_norm": 6.7951979637146, + "learning_rate": 1.5956388355699356e-05, + "loss": 1.5392, + "step": 96620 + }, + { + "epoch": 0.6073419362441812, + "grad_norm": 6.182480812072754, + "learning_rate": 1.5955969254754703e-05, + "loss": 1.7713, + "step": 96630 + }, + { + "epoch": 0.6074047885608783, + "grad_norm": 5.1290483474731445, + "learning_rate": 1.595555015381005e-05, + "loss": 1.6297, + "step": 96640 + }, + { + "epoch": 0.6074676408775754, + "grad_norm": 6.374390125274658, + "learning_rate": 1.5955131052865394e-05, + "loss": 1.6293, + "step": 96650 + }, + { + "epoch": 0.6075304931942725, + "grad_norm": 6.011179447174072, + "learning_rate": 1.595471195192074e-05, + "loss": 1.6367, + "step": 96660 + }, + { + "epoch": 0.6075933455109697, + "grad_norm": 6.635353088378906, + "learning_rate": 
1.5954292850976088e-05, + "loss": 1.7165, + "step": 96670 + }, + { + "epoch": 0.6076561978276668, + "grad_norm": 6.711002349853516, + "learning_rate": 1.5953873750031435e-05, + "loss": 1.6859, + "step": 96680 + }, + { + "epoch": 0.6077190501443639, + "grad_norm": 6.37709379196167, + "learning_rate": 1.5953454649086782e-05, + "loss": 1.6371, + "step": 96690 + }, + { + "epoch": 0.607781902461061, + "grad_norm": 7.191039085388184, + "learning_rate": 1.5953035548142126e-05, + "loss": 1.7203, + "step": 96700 + }, + { + "epoch": 0.6078447547777581, + "grad_norm": 6.104044437408447, + "learning_rate": 1.5952616447197473e-05, + "loss": 1.6676, + "step": 96710 + }, + { + "epoch": 0.6079076070944552, + "grad_norm": 6.773703098297119, + "learning_rate": 1.595219734625282e-05, + "loss": 1.9815, + "step": 96720 + }, + { + "epoch": 0.6079704594111524, + "grad_norm": 6.631420135498047, + "learning_rate": 1.5951778245308167e-05, + "loss": 1.5296, + "step": 96730 + }, + { + "epoch": 0.6080333117278495, + "grad_norm": 8.33754825592041, + "learning_rate": 1.595135914436351e-05, + "loss": 1.8381, + "step": 96740 + }, + { + "epoch": 0.6080961640445466, + "grad_norm": 6.7561116218566895, + "learning_rate": 1.5950940043418857e-05, + "loss": 1.7066, + "step": 96750 + }, + { + "epoch": 0.6081590163612437, + "grad_norm": 6.796525955200195, + "learning_rate": 1.5950520942474205e-05, + "loss": 1.7786, + "step": 96760 + }, + { + "epoch": 0.6082218686779408, + "grad_norm": 7.03566837310791, + "learning_rate": 1.595010184152955e-05, + "loss": 1.5446, + "step": 96770 + }, + { + "epoch": 0.6082847209946379, + "grad_norm": 6.7658586502075195, + "learning_rate": 1.59496827405849e-05, + "loss": 1.544, + "step": 96780 + }, + { + "epoch": 0.608347573311335, + "grad_norm": 7.732165813446045, + "learning_rate": 1.5949263639640246e-05, + "loss": 1.6133, + "step": 96790 + }, + { + "epoch": 0.6084104256280322, + "grad_norm": 7.413866996765137, + "learning_rate": 1.5948844538695593e-05, + "loss": 1.8205, + 
"step": 96800 + }, + { + "epoch": 0.6084732779447293, + "grad_norm": 6.353707790374756, + "learning_rate": 1.594842543775094e-05, + "loss": 1.5322, + "step": 96810 + }, + { + "epoch": 0.6085361302614264, + "grad_norm": 6.494779586791992, + "learning_rate": 1.5948006336806287e-05, + "loss": 1.6119, + "step": 96820 + }, + { + "epoch": 0.6085989825781235, + "grad_norm": 6.874396800994873, + "learning_rate": 1.594758723586163e-05, + "loss": 1.6729, + "step": 96830 + }, + { + "epoch": 0.6086618348948206, + "grad_norm": 7.573245048522949, + "learning_rate": 1.5947168134916978e-05, + "loss": 1.43, + "step": 96840 + }, + { + "epoch": 0.6087246872115177, + "grad_norm": 6.823930740356445, + "learning_rate": 1.5946749033972325e-05, + "loss": 1.7342, + "step": 96850 + }, + { + "epoch": 0.6087875395282148, + "grad_norm": 7.079809188842773, + "learning_rate": 1.5946329933027672e-05, + "loss": 1.8017, + "step": 96860 + }, + { + "epoch": 0.6088503918449119, + "grad_norm": 7.149454593658447, + "learning_rate": 1.5945910832083016e-05, + "loss": 1.6538, + "step": 96870 + }, + { + "epoch": 0.608913244161609, + "grad_norm": 6.1625566482543945, + "learning_rate": 1.5945491731138363e-05, + "loss": 1.2721, + "step": 96880 + }, + { + "epoch": 0.6089760964783061, + "grad_norm": 7.1358323097229, + "learning_rate": 1.594507263019371e-05, + "loss": 1.7929, + "step": 96890 + }, + { + "epoch": 0.6090389487950032, + "grad_norm": 6.218572616577148, + "learning_rate": 1.5944653529249057e-05, + "loss": 1.6383, + "step": 96900 + }, + { + "epoch": 0.6091018011117003, + "grad_norm": 6.8250298500061035, + "learning_rate": 1.5944234428304404e-05, + "loss": 1.5742, + "step": 96910 + }, + { + "epoch": 0.6091646534283974, + "grad_norm": 6.283374786376953, + "learning_rate": 1.5943815327359748e-05, + "loss": 1.9339, + "step": 96920 + }, + { + "epoch": 0.6092275057450945, + "grad_norm": 6.209414958953857, + "learning_rate": 1.5943396226415095e-05, + "loss": 1.8073, + "step": 96930 + }, + { + "epoch": 
0.6092903580617917, + "grad_norm": 5.9464826583862305, + "learning_rate": 1.594297712547044e-05, + "loss": 1.7446, + "step": 96940 + }, + { + "epoch": 0.6093532103784888, + "grad_norm": 6.390690803527832, + "learning_rate": 1.594255802452579e-05, + "loss": 1.4427, + "step": 96950 + }, + { + "epoch": 0.6094160626951859, + "grad_norm": 7.447775840759277, + "learning_rate": 1.5942138923581136e-05, + "loss": 1.6181, + "step": 96960 + }, + { + "epoch": 0.609478915011883, + "grad_norm": 6.484543323516846, + "learning_rate": 1.594171982263648e-05, + "loss": 1.694, + "step": 96970 + }, + { + "epoch": 0.6095417673285801, + "grad_norm": 6.508401393890381, + "learning_rate": 1.5941300721691827e-05, + "loss": 1.4989, + "step": 96980 + }, + { + "epoch": 0.6096046196452772, + "grad_norm": 6.89706563949585, + "learning_rate": 1.5940881620747174e-05, + "loss": 1.6738, + "step": 96990 + }, + { + "epoch": 0.6096674719619743, + "grad_norm": 7.620410919189453, + "learning_rate": 1.594046251980252e-05, + "loss": 1.8045, + "step": 97000 + }, + { + "epoch": 0.6097303242786715, + "grad_norm": 7.822571754455566, + "learning_rate": 1.5940043418857868e-05, + "loss": 1.5355, + "step": 97010 + }, + { + "epoch": 0.6097931765953686, + "grad_norm": 5.538037300109863, + "learning_rate": 1.5939624317913215e-05, + "loss": 1.6928, + "step": 97020 + }, + { + "epoch": 0.6098560289120657, + "grad_norm": 6.762993812561035, + "learning_rate": 1.5939205216968562e-05, + "loss": 1.5583, + "step": 97030 + }, + { + "epoch": 0.6099188812287628, + "grad_norm": 10.59797191619873, + "learning_rate": 1.593878611602391e-05, + "loss": 1.7227, + "step": 97040 + }, + { + "epoch": 0.6099817335454599, + "grad_norm": 7.5071702003479, + "learning_rate": 1.5938367015079253e-05, + "loss": 1.7092, + "step": 97050 + }, + { + "epoch": 0.610044585862157, + "grad_norm": 6.896035671234131, + "learning_rate": 1.59379479141346e-05, + "loss": 1.6339, + "step": 97060 + }, + { + "epoch": 0.6101074381788542, + "grad_norm": 
7.5746307373046875, + "learning_rate": 1.5937528813189947e-05, + "loss": 1.6787, + "step": 97070 + }, + { + "epoch": 0.6101702904955513, + "grad_norm": 6.058924198150635, + "learning_rate": 1.5937109712245294e-05, + "loss": 1.6495, + "step": 97080 + }, + { + "epoch": 0.6102331428122484, + "grad_norm": 7.73477840423584, + "learning_rate": 1.5936690611300638e-05, + "loss": 1.5875, + "step": 97090 + }, + { + "epoch": 0.6102959951289455, + "grad_norm": 6.746033668518066, + "learning_rate": 1.5936271510355985e-05, + "loss": 1.6142, + "step": 97100 + }, + { + "epoch": 0.6103588474456426, + "grad_norm": 6.900345802307129, + "learning_rate": 1.5935852409411332e-05, + "loss": 1.889, + "step": 97110 + }, + { + "epoch": 0.6104216997623397, + "grad_norm": 5.7613525390625, + "learning_rate": 1.593543330846668e-05, + "loss": 1.5652, + "step": 97120 + }, + { + "epoch": 0.6104845520790367, + "grad_norm": 6.890136241912842, + "learning_rate": 1.5935014207522026e-05, + "loss": 1.7257, + "step": 97130 + }, + { + "epoch": 0.6105474043957339, + "grad_norm": 6.521233558654785, + "learning_rate": 1.593459510657737e-05, + "loss": 1.6479, + "step": 97140 + }, + { + "epoch": 0.610610256712431, + "grad_norm": 7.145533084869385, + "learning_rate": 1.5934176005632717e-05, + "loss": 1.4238, + "step": 97150 + }, + { + "epoch": 0.6106731090291281, + "grad_norm": 6.435394763946533, + "learning_rate": 1.5933756904688064e-05, + "loss": 1.7891, + "step": 97160 + }, + { + "epoch": 0.6107359613458252, + "grad_norm": 6.097654342651367, + "learning_rate": 1.593333780374341e-05, + "loss": 1.5306, + "step": 97170 + }, + { + "epoch": 0.6107988136625223, + "grad_norm": 7.007704257965088, + "learning_rate": 1.5932918702798758e-05, + "loss": 1.5745, + "step": 97180 + }, + { + "epoch": 0.6108616659792194, + "grad_norm": 6.812733173370361, + "learning_rate": 1.5932499601854105e-05, + "loss": 1.5193, + "step": 97190 + }, + { + "epoch": 0.6109245182959165, + "grad_norm": 6.507319927215576, + "learning_rate": 
1.5932080500909452e-05, + "loss": 1.6541, + "step": 97200 + }, + { + "epoch": 0.6109873706126137, + "grad_norm": 5.9998908042907715, + "learning_rate": 1.5931661399964796e-05, + "loss": 1.506, + "step": 97210 + }, + { + "epoch": 0.6110502229293108, + "grad_norm": 7.0761237144470215, + "learning_rate": 1.5931242299020143e-05, + "loss": 1.5575, + "step": 97220 + }, + { + "epoch": 0.6111130752460079, + "grad_norm": 6.050992488861084, + "learning_rate": 1.593082319807549e-05, + "loss": 1.5718, + "step": 97230 + }, + { + "epoch": 0.611175927562705, + "grad_norm": 6.629901885986328, + "learning_rate": 1.5930404097130837e-05, + "loss": 1.8976, + "step": 97240 + }, + { + "epoch": 0.6112387798794021, + "grad_norm": 6.821048259735107, + "learning_rate": 1.5929984996186184e-05, + "loss": 1.623, + "step": 97250 + }, + { + "epoch": 0.6113016321960992, + "grad_norm": 6.956756114959717, + "learning_rate": 1.592956589524153e-05, + "loss": 1.6399, + "step": 97260 + }, + { + "epoch": 0.6113644845127963, + "grad_norm": 5.7625627517700195, + "learning_rate": 1.5929146794296875e-05, + "loss": 1.6874, + "step": 97270 + }, + { + "epoch": 0.6114273368294935, + "grad_norm": 6.572230339050293, + "learning_rate": 1.5928727693352222e-05, + "loss": 1.8282, + "step": 97280 + }, + { + "epoch": 0.6114901891461906, + "grad_norm": 6.548996925354004, + "learning_rate": 1.592830859240757e-05, + "loss": 1.6299, + "step": 97290 + }, + { + "epoch": 0.6115530414628877, + "grad_norm": 6.3550639152526855, + "learning_rate": 1.5927889491462916e-05, + "loss": 1.4761, + "step": 97300 + }, + { + "epoch": 0.6116158937795848, + "grad_norm": 5.993685722351074, + "learning_rate": 1.5927470390518263e-05, + "loss": 1.618, + "step": 97310 + }, + { + "epoch": 0.6116787460962819, + "grad_norm": 6.1411213874816895, + "learning_rate": 1.5927051289573607e-05, + "loss": 1.5549, + "step": 97320 + }, + { + "epoch": 0.611741598412979, + "grad_norm": 6.283779144287109, + "learning_rate": 1.5926632188628954e-05, + "loss": 
1.5801, + "step": 97330 + }, + { + "epoch": 0.6118044507296762, + "grad_norm": 7.2673845291137695, + "learning_rate": 1.59262130876843e-05, + "loss": 1.6625, + "step": 97340 + }, + { + "epoch": 0.6118673030463733, + "grad_norm": 6.941159725189209, + "learning_rate": 1.5925793986739648e-05, + "loss": 1.5375, + "step": 97350 + }, + { + "epoch": 0.6119301553630704, + "grad_norm": 6.971386432647705, + "learning_rate": 1.592537488579499e-05, + "loss": 1.769, + "step": 97360 + }, + { + "epoch": 0.6119930076797675, + "grad_norm": 6.657928943634033, + "learning_rate": 1.592495578485034e-05, + "loss": 1.8468, + "step": 97370 + }, + { + "epoch": 0.6120558599964645, + "grad_norm": 6.511856555938721, + "learning_rate": 1.5924536683905686e-05, + "loss": 1.5713, + "step": 97380 + }, + { + "epoch": 0.6121187123131616, + "grad_norm": 6.132238864898682, + "learning_rate": 1.5924117582961033e-05, + "loss": 1.6285, + "step": 97390 + }, + { + "epoch": 0.6121815646298587, + "grad_norm": 7.924448013305664, + "learning_rate": 1.592369848201638e-05, + "loss": 1.8515, + "step": 97400 + }, + { + "epoch": 0.6122444169465558, + "grad_norm": 7.811633586883545, + "learning_rate": 1.5923279381071727e-05, + "loss": 1.8305, + "step": 97410 + }, + { + "epoch": 0.612307269263253, + "grad_norm": 6.808788776397705, + "learning_rate": 1.5922860280127074e-05, + "loss": 1.4275, + "step": 97420 + }, + { + "epoch": 0.6123701215799501, + "grad_norm": 7.154580593109131, + "learning_rate": 1.592244117918242e-05, + "loss": 1.7556, + "step": 97430 + }, + { + "epoch": 0.6124329738966472, + "grad_norm": 6.226292133331299, + "learning_rate": 1.5922022078237768e-05, + "loss": 1.594, + "step": 97440 + }, + { + "epoch": 0.6124958262133443, + "grad_norm": 6.5397491455078125, + "learning_rate": 1.5921602977293112e-05, + "loss": 1.8206, + "step": 97450 + }, + { + "epoch": 0.6125586785300414, + "grad_norm": 6.531113147735596, + "learning_rate": 1.592118387634846e-05, + "loss": 1.5511, + "step": 97460 + }, + { + "epoch": 
0.6126215308467385, + "grad_norm": 6.342073440551758, + "learning_rate": 1.5920764775403806e-05, + "loss": 1.5654, + "step": 97470 + }, + { + "epoch": 0.6126843831634357, + "grad_norm": 6.173723220825195, + "learning_rate": 1.5920345674459153e-05, + "loss": 1.6765, + "step": 97480 + }, + { + "epoch": 0.6127472354801328, + "grad_norm": 7.431957244873047, + "learning_rate": 1.5919926573514497e-05, + "loss": 1.6724, + "step": 97490 + }, + { + "epoch": 0.6128100877968299, + "grad_norm": 7.1538801193237305, + "learning_rate": 1.5919507472569844e-05, + "loss": 1.728, + "step": 97500 + }, + { + "epoch": 0.612872940113527, + "grad_norm": 5.422327995300293, + "learning_rate": 1.591908837162519e-05, + "loss": 1.8189, + "step": 97510 + }, + { + "epoch": 0.6129357924302241, + "grad_norm": 6.470592498779297, + "learning_rate": 1.5918669270680538e-05, + "loss": 1.7974, + "step": 97520 + }, + { + "epoch": 0.6129986447469212, + "grad_norm": 8.223944664001465, + "learning_rate": 1.5918250169735885e-05, + "loss": 1.6255, + "step": 97530 + }, + { + "epoch": 0.6130614970636183, + "grad_norm": 6.294012546539307, + "learning_rate": 1.591783106879123e-05, + "loss": 1.7991, + "step": 97540 + }, + { + "epoch": 0.6131243493803155, + "grad_norm": 6.184208393096924, + "learning_rate": 1.5917411967846576e-05, + "loss": 1.789, + "step": 97550 + }, + { + "epoch": 0.6131872016970126, + "grad_norm": 6.930463790893555, + "learning_rate": 1.5916992866901923e-05, + "loss": 1.6868, + "step": 97560 + }, + { + "epoch": 0.6132500540137097, + "grad_norm": 7.4079670906066895, + "learning_rate": 1.591657376595727e-05, + "loss": 1.5491, + "step": 97570 + }, + { + "epoch": 0.6133129063304068, + "grad_norm": 6.005690574645996, + "learning_rate": 1.5916154665012617e-05, + "loss": 1.467, + "step": 97580 + }, + { + "epoch": 0.6133757586471039, + "grad_norm": 6.448835372924805, + "learning_rate": 1.591573556406796e-05, + "loss": 1.7108, + "step": 97590 + }, + { + "epoch": 0.613438610963801, + "grad_norm": 
6.382637977600098, + "learning_rate": 1.5915316463123308e-05, + "loss": 1.7321, + "step": 97600 + }, + { + "epoch": 0.6135014632804981, + "grad_norm": 7.751312732696533, + "learning_rate": 1.5914897362178655e-05, + "loss": 1.5915, + "step": 97610 + }, + { + "epoch": 0.6135643155971953, + "grad_norm": 6.062617778778076, + "learning_rate": 1.5914478261234002e-05, + "loss": 1.6638, + "step": 97620 + }, + { + "epoch": 0.6136271679138924, + "grad_norm": 6.349198818206787, + "learning_rate": 1.591405916028935e-05, + "loss": 1.6401, + "step": 97630 + }, + { + "epoch": 0.6136900202305894, + "grad_norm": 5.5992045402526855, + "learning_rate": 1.5913640059344696e-05, + "loss": 1.5345, + "step": 97640 + }, + { + "epoch": 0.6137528725472865, + "grad_norm": 7.2889838218688965, + "learning_rate": 1.5913220958400043e-05, + "loss": 1.9279, + "step": 97650 + }, + { + "epoch": 0.6138157248639836, + "grad_norm": 6.606493949890137, + "learning_rate": 1.591280185745539e-05, + "loss": 1.6457, + "step": 97660 + }, + { + "epoch": 0.6138785771806807, + "grad_norm": 7.579192161560059, + "learning_rate": 1.5912382756510734e-05, + "loss": 1.6003, + "step": 97670 + }, + { + "epoch": 0.6139414294973778, + "grad_norm": 6.619809627532959, + "learning_rate": 1.591196365556608e-05, + "loss": 1.6042, + "step": 97680 + }, + { + "epoch": 0.614004281814075, + "grad_norm": 5.405503273010254, + "learning_rate": 1.5911544554621428e-05, + "loss": 1.7067, + "step": 97690 + }, + { + "epoch": 0.6140671341307721, + "grad_norm": 6.7340312004089355, + "learning_rate": 1.5911125453676775e-05, + "loss": 1.5633, + "step": 97700 + }, + { + "epoch": 0.6141299864474692, + "grad_norm": 6.256981372833252, + "learning_rate": 1.591070635273212e-05, + "loss": 1.6141, + "step": 97710 + }, + { + "epoch": 0.6141928387641663, + "grad_norm": 7.2453083992004395, + "learning_rate": 1.5910287251787466e-05, + "loss": 1.8788, + "step": 97720 + }, + { + "epoch": 0.6142556910808634, + "grad_norm": 6.870968818664551, + "learning_rate": 
1.5909868150842813e-05, + "loss": 1.3222, + "step": 97730 + }, + { + "epoch": 0.6143185433975605, + "grad_norm": 6.767097473144531, + "learning_rate": 1.590944904989816e-05, + "loss": 1.8889, + "step": 97740 + }, + { + "epoch": 0.6143813957142576, + "grad_norm": 7.072991371154785, + "learning_rate": 1.5909029948953507e-05, + "loss": 1.7936, + "step": 97750 + }, + { + "epoch": 0.6144442480309548, + "grad_norm": 5.585392475128174, + "learning_rate": 1.590861084800885e-05, + "loss": 1.3332, + "step": 97760 + }, + { + "epoch": 0.6145071003476519, + "grad_norm": 5.266753673553467, + "learning_rate": 1.5908191747064198e-05, + "loss": 1.4711, + "step": 97770 + }, + { + "epoch": 0.614569952664349, + "grad_norm": 7.481316089630127, + "learning_rate": 1.5907772646119545e-05, + "loss": 1.5279, + "step": 97780 + }, + { + "epoch": 0.6146328049810461, + "grad_norm": 6.820082187652588, + "learning_rate": 1.5907353545174892e-05, + "loss": 1.5929, + "step": 97790 + }, + { + "epoch": 0.6146956572977432, + "grad_norm": 7.213798999786377, + "learning_rate": 1.590693444423024e-05, + "loss": 1.7012, + "step": 97800 + }, + { + "epoch": 0.6147585096144403, + "grad_norm": 7.327370643615723, + "learning_rate": 1.5906515343285586e-05, + "loss": 1.6506, + "step": 97810 + }, + { + "epoch": 0.6148213619311375, + "grad_norm": 7.922680377960205, + "learning_rate": 1.5906096242340933e-05, + "loss": 1.7617, + "step": 97820 + }, + { + "epoch": 0.6148842142478346, + "grad_norm": 6.1932501792907715, + "learning_rate": 1.590567714139628e-05, + "loss": 1.648, + "step": 97830 + }, + { + "epoch": 0.6149470665645317, + "grad_norm": 8.107322692871094, + "learning_rate": 1.5905258040451624e-05, + "loss": 1.6619, + "step": 97840 + }, + { + "epoch": 0.6150099188812288, + "grad_norm": 7.934214115142822, + "learning_rate": 1.590483893950697e-05, + "loss": 1.8224, + "step": 97850 + }, + { + "epoch": 0.6150727711979259, + "grad_norm": 6.7235517501831055, + "learning_rate": 1.5904419838562318e-05, + "loss": 1.7292, 
+ "step": 97860 + }, + { + "epoch": 0.615135623514623, + "grad_norm": 6.896998405456543, + "learning_rate": 1.5904000737617665e-05, + "loss": 1.6557, + "step": 97870 + }, + { + "epoch": 0.6151984758313201, + "grad_norm": 7.2692999839782715, + "learning_rate": 1.5903581636673012e-05, + "loss": 1.6229, + "step": 97880 + }, + { + "epoch": 0.6152613281480172, + "grad_norm": 6.0252461433410645, + "learning_rate": 1.5903162535728356e-05, + "loss": 1.4469, + "step": 97890 + }, + { + "epoch": 0.6153241804647143, + "grad_norm": 7.354637145996094, + "learning_rate": 1.5902743434783703e-05, + "loss": 1.7811, + "step": 97900 + }, + { + "epoch": 0.6153870327814114, + "grad_norm": 5.593834400177002, + "learning_rate": 1.590232433383905e-05, + "loss": 1.4064, + "step": 97910 + }, + { + "epoch": 0.6154498850981085, + "grad_norm": 7.0428619384765625, + "learning_rate": 1.5901905232894397e-05, + "loss": 1.7334, + "step": 97920 + }, + { + "epoch": 0.6155127374148056, + "grad_norm": 6.362512588500977, + "learning_rate": 1.5901486131949744e-05, + "loss": 1.9084, + "step": 97930 + }, + { + "epoch": 0.6155755897315027, + "grad_norm": 6.2018208503723145, + "learning_rate": 1.5901067031005088e-05, + "loss": 1.5764, + "step": 97940 + }, + { + "epoch": 0.6156384420481998, + "grad_norm": 6.819106578826904, + "learning_rate": 1.5900647930060435e-05, + "loss": 1.6431, + "step": 97950 + }, + { + "epoch": 0.615701294364897, + "grad_norm": 6.307815074920654, + "learning_rate": 1.5900228829115782e-05, + "loss": 1.632, + "step": 97960 + }, + { + "epoch": 0.6157641466815941, + "grad_norm": 6.985607147216797, + "learning_rate": 1.589980972817113e-05, + "loss": 1.8224, + "step": 97970 + }, + { + "epoch": 0.6158269989982912, + "grad_norm": 6.369467735290527, + "learning_rate": 1.5899390627226473e-05, + "loss": 1.7077, + "step": 97980 + }, + { + "epoch": 0.6158898513149883, + "grad_norm": 6.204318046569824, + "learning_rate": 1.589897152628182e-05, + "loss": 1.465, + "step": 97990 + }, + { + "epoch": 
0.6159527036316854, + "grad_norm": 7.323031902313232, + "learning_rate": 1.5898552425337167e-05, + "loss": 1.6542, + "step": 98000 + }, + { + "epoch": 0.6160155559483825, + "grad_norm": 6.1911444664001465, + "learning_rate": 1.5898133324392514e-05, + "loss": 1.4795, + "step": 98010 + }, + { + "epoch": 0.6160784082650796, + "grad_norm": 5.6918110847473145, + "learning_rate": 1.589771422344786e-05, + "loss": 1.52, + "step": 98020 + }, + { + "epoch": 0.6161412605817768, + "grad_norm": 6.485158443450928, + "learning_rate": 1.5897295122503208e-05, + "loss": 1.6236, + "step": 98030 + }, + { + "epoch": 0.6162041128984739, + "grad_norm": 6.915482044219971, + "learning_rate": 1.5896876021558555e-05, + "loss": 1.5711, + "step": 98040 + }, + { + "epoch": 0.616266965215171, + "grad_norm": 7.597339153289795, + "learning_rate": 1.5896456920613902e-05, + "loss": 1.8487, + "step": 98050 + }, + { + "epoch": 0.6163298175318681, + "grad_norm": 7.884457111358643, + "learning_rate": 1.589603781966925e-05, + "loss": 1.6207, + "step": 98060 + }, + { + "epoch": 0.6163926698485652, + "grad_norm": 6.6080002784729, + "learning_rate": 1.5895618718724593e-05, + "loss": 1.5521, + "step": 98070 + }, + { + "epoch": 0.6164555221652623, + "grad_norm": 7.194190502166748, + "learning_rate": 1.589519961777994e-05, + "loss": 1.3555, + "step": 98080 + }, + { + "epoch": 0.6165183744819595, + "grad_norm": 5.772176742553711, + "learning_rate": 1.5894780516835287e-05, + "loss": 1.7029, + "step": 98090 + }, + { + "epoch": 0.6165812267986566, + "grad_norm": 7.119681358337402, + "learning_rate": 1.5894361415890634e-05, + "loss": 1.757, + "step": 98100 + }, + { + "epoch": 0.6166440791153537, + "grad_norm": 6.931814670562744, + "learning_rate": 1.5893942314945978e-05, + "loss": 1.7993, + "step": 98110 + }, + { + "epoch": 0.6167069314320508, + "grad_norm": 6.7501301765441895, + "learning_rate": 1.5893523214001325e-05, + "loss": 1.6404, + "step": 98120 + }, + { + "epoch": 0.6167697837487479, + "grad_norm": 
7.078472137451172, + "learning_rate": 1.5893104113056672e-05, + "loss": 1.611, + "step": 98130 + }, + { + "epoch": 0.616832636065445, + "grad_norm": 6.168127536773682, + "learning_rate": 1.589268501211202e-05, + "loss": 1.5652, + "step": 98140 + }, + { + "epoch": 0.616895488382142, + "grad_norm": 6.353023052215576, + "learning_rate": 1.5892265911167366e-05, + "loss": 1.5572, + "step": 98150 + }, + { + "epoch": 0.6169583406988391, + "grad_norm": 5.670644283294678, + "learning_rate": 1.589184681022271e-05, + "loss": 1.4688, + "step": 98160 + }, + { + "epoch": 0.6170211930155363, + "grad_norm": 7.587639331817627, + "learning_rate": 1.5891427709278057e-05, + "loss": 1.8769, + "step": 98170 + }, + { + "epoch": 0.6170840453322334, + "grad_norm": 6.383285045623779, + "learning_rate": 1.5891008608333404e-05, + "loss": 1.8693, + "step": 98180 + }, + { + "epoch": 0.6171468976489305, + "grad_norm": 7.242428779602051, + "learning_rate": 1.589058950738875e-05, + "loss": 1.7981, + "step": 98190 + }, + { + "epoch": 0.6172097499656276, + "grad_norm": 7.480915546417236, + "learning_rate": 1.5890170406444098e-05, + "loss": 1.8373, + "step": 98200 + }, + { + "epoch": 0.6172726022823247, + "grad_norm": 7.819182395935059, + "learning_rate": 1.5889751305499445e-05, + "loss": 1.5363, + "step": 98210 + }, + { + "epoch": 0.6173354545990218, + "grad_norm": 5.920949935913086, + "learning_rate": 1.588933220455479e-05, + "loss": 1.6519, + "step": 98220 + }, + { + "epoch": 0.617398306915719, + "grad_norm": 6.396178722381592, + "learning_rate": 1.5888913103610136e-05, + "loss": 1.564, + "step": 98230 + }, + { + "epoch": 0.6174611592324161, + "grad_norm": 6.648460388183594, + "learning_rate": 1.5888494002665483e-05, + "loss": 1.7183, + "step": 98240 + }, + { + "epoch": 0.6175240115491132, + "grad_norm": 6.459324359893799, + "learning_rate": 1.588807490172083e-05, + "loss": 1.7778, + "step": 98250 + }, + { + "epoch": 0.6175868638658103, + "grad_norm": 7.500816345214844, + "learning_rate": 
1.5887655800776177e-05, + "loss": 1.5668, + "step": 98260 + }, + { + "epoch": 0.6176497161825074, + "grad_norm": 5.732392311096191, + "learning_rate": 1.5887236699831524e-05, + "loss": 1.6429, + "step": 98270 + }, + { + "epoch": 0.6177125684992045, + "grad_norm": 7.986827373504639, + "learning_rate": 1.588681759888687e-05, + "loss": 1.8901, + "step": 98280 + }, + { + "epoch": 0.6177754208159016, + "grad_norm": 7.052105903625488, + "learning_rate": 1.5886398497942215e-05, + "loss": 1.4003, + "step": 98290 + }, + { + "epoch": 0.6178382731325988, + "grad_norm": 6.326847076416016, + "learning_rate": 1.5885979396997562e-05, + "loss": 1.5663, + "step": 98300 + }, + { + "epoch": 0.6179011254492959, + "grad_norm": 7.511728763580322, + "learning_rate": 1.588556029605291e-05, + "loss": 1.7506, + "step": 98310 + }, + { + "epoch": 0.617963977765993, + "grad_norm": 7.672304630279541, + "learning_rate": 1.5885141195108256e-05, + "loss": 1.8009, + "step": 98320 + }, + { + "epoch": 0.6180268300826901, + "grad_norm": 6.804565906524658, + "learning_rate": 1.58847220941636e-05, + "loss": 1.5392, + "step": 98330 + }, + { + "epoch": 0.6180896823993872, + "grad_norm": 7.108956813812256, + "learning_rate": 1.5884302993218947e-05, + "loss": 1.4983, + "step": 98340 + }, + { + "epoch": 0.6181525347160843, + "grad_norm": 9.570054054260254, + "learning_rate": 1.5883883892274294e-05, + "loss": 1.577, + "step": 98350 + }, + { + "epoch": 0.6182153870327814, + "grad_norm": 7.5288004875183105, + "learning_rate": 1.588346479132964e-05, + "loss": 1.862, + "step": 98360 + }, + { + "epoch": 0.6182782393494786, + "grad_norm": 5.870203018188477, + "learning_rate": 1.5883045690384988e-05, + "loss": 1.8158, + "step": 98370 + }, + { + "epoch": 0.6183410916661757, + "grad_norm": 6.705949783325195, + "learning_rate": 1.5882626589440332e-05, + "loss": 1.6715, + "step": 98380 + }, + { + "epoch": 0.6184039439828728, + "grad_norm": 5.2269978523254395, + "learning_rate": 1.588220748849568e-05, + "loss": 1.5872, + 
"step": 98390 + }, + { + "epoch": 0.6184667962995699, + "grad_norm": 7.804924011230469, + "learning_rate": 1.5881788387551026e-05, + "loss": 1.7103, + "step": 98400 + }, + { + "epoch": 0.6185296486162669, + "grad_norm": 7.38302755355835, + "learning_rate": 1.5881369286606373e-05, + "loss": 1.7875, + "step": 98410 + }, + { + "epoch": 0.618592500932964, + "grad_norm": 6.590409755706787, + "learning_rate": 1.588095018566172e-05, + "loss": 1.733, + "step": 98420 + }, + { + "epoch": 0.6186553532496611, + "grad_norm": 7.335029125213623, + "learning_rate": 1.5880531084717067e-05, + "loss": 1.8107, + "step": 98430 + }, + { + "epoch": 0.6187182055663583, + "grad_norm": 6.710123062133789, + "learning_rate": 1.5880111983772414e-05, + "loss": 1.926, + "step": 98440 + }, + { + "epoch": 0.6187810578830554, + "grad_norm": 6.156457424163818, + "learning_rate": 1.587969288282776e-05, + "loss": 1.7721, + "step": 98450 + }, + { + "epoch": 0.6188439101997525, + "grad_norm": 6.876556396484375, + "learning_rate": 1.587927378188311e-05, + "loss": 1.74, + "step": 98460 + }, + { + "epoch": 0.6189067625164496, + "grad_norm": 7.142808437347412, + "learning_rate": 1.5878854680938452e-05, + "loss": 1.6317, + "step": 98470 + }, + { + "epoch": 0.6189696148331467, + "grad_norm": 6.802892684936523, + "learning_rate": 1.58784355799938e-05, + "loss": 1.6254, + "step": 98480 + }, + { + "epoch": 0.6190324671498438, + "grad_norm": 5.966985702514648, + "learning_rate": 1.5878016479049146e-05, + "loss": 1.7586, + "step": 98490 + }, + { + "epoch": 0.619095319466541, + "grad_norm": 6.2259321212768555, + "learning_rate": 1.5877597378104493e-05, + "loss": 1.7226, + "step": 98500 + }, + { + "epoch": 0.6191581717832381, + "grad_norm": 6.297909259796143, + "learning_rate": 1.5877178277159837e-05, + "loss": 1.7483, + "step": 98510 + }, + { + "epoch": 0.6192210240999352, + "grad_norm": 6.961431980133057, + "learning_rate": 1.5876759176215184e-05, + "loss": 1.606, + "step": 98520 + }, + { + "epoch": 
0.6192838764166323, + "grad_norm": 6.00648307800293, + "learning_rate": 1.587634007527053e-05, + "loss": 1.4804, + "step": 98530 + }, + { + "epoch": 0.6193467287333294, + "grad_norm": 7.5596771240234375, + "learning_rate": 1.5875920974325878e-05, + "loss": 1.7532, + "step": 98540 + }, + { + "epoch": 0.6194095810500265, + "grad_norm": 6.770756244659424, + "learning_rate": 1.5875501873381225e-05, + "loss": 1.6694, + "step": 98550 + }, + { + "epoch": 0.6194724333667236, + "grad_norm": 7.217389106750488, + "learning_rate": 1.587508277243657e-05, + "loss": 1.4545, + "step": 98560 + }, + { + "epoch": 0.6195352856834208, + "grad_norm": 7.36926794052124, + "learning_rate": 1.5874663671491916e-05, + "loss": 1.67, + "step": 98570 + }, + { + "epoch": 0.6195981380001179, + "grad_norm": 6.236382961273193, + "learning_rate": 1.5874244570547263e-05, + "loss": 1.7156, + "step": 98580 + }, + { + "epoch": 0.619660990316815, + "grad_norm": 6.446400165557861, + "learning_rate": 1.587382546960261e-05, + "loss": 1.6656, + "step": 98590 + }, + { + "epoch": 0.6197238426335121, + "grad_norm": 6.381457328796387, + "learning_rate": 1.5873406368657954e-05, + "loss": 1.8684, + "step": 98600 + }, + { + "epoch": 0.6197866949502092, + "grad_norm": 7.383438587188721, + "learning_rate": 1.58729872677133e-05, + "loss": 1.4665, + "step": 98610 + }, + { + "epoch": 0.6198495472669063, + "grad_norm": 6.446352958679199, + "learning_rate": 1.5872568166768648e-05, + "loss": 1.7084, + "step": 98620 + }, + { + "epoch": 0.6199123995836034, + "grad_norm": 6.78460168838501, + "learning_rate": 1.5872149065823995e-05, + "loss": 1.5823, + "step": 98630 + }, + { + "epoch": 0.6199752519003006, + "grad_norm": 6.465644836425781, + "learning_rate": 1.5871729964879342e-05, + "loss": 1.8514, + "step": 98640 + }, + { + "epoch": 0.6200381042169977, + "grad_norm": 6.511555194854736, + "learning_rate": 1.587131086393469e-05, + "loss": 1.6222, + "step": 98650 + }, + { + "epoch": 0.6201009565336947, + "grad_norm": 
7.202386856079102, + "learning_rate": 1.5870891762990036e-05, + "loss": 1.5518, + "step": 98660 + }, + { + "epoch": 0.6201638088503918, + "grad_norm": 6.277887344360352, + "learning_rate": 1.5870472662045383e-05, + "loss": 1.7382, + "step": 98670 + }, + { + "epoch": 0.6202266611670889, + "grad_norm": 7.642271518707275, + "learning_rate": 1.587005356110073e-05, + "loss": 1.5345, + "step": 98680 + }, + { + "epoch": 0.620289513483786, + "grad_norm": 6.236955642700195, + "learning_rate": 1.5869634460156074e-05, + "loss": 1.6613, + "step": 98690 + }, + { + "epoch": 0.6203523658004831, + "grad_norm": 6.624298572540283, + "learning_rate": 1.586921535921142e-05, + "loss": 1.7777, + "step": 98700 + }, + { + "epoch": 0.6204152181171803, + "grad_norm": 6.969363212585449, + "learning_rate": 1.5868796258266768e-05, + "loss": 1.5978, + "step": 98710 + }, + { + "epoch": 0.6204780704338774, + "grad_norm": 6.063605308532715, + "learning_rate": 1.5868377157322115e-05, + "loss": 1.5309, + "step": 98720 + }, + { + "epoch": 0.6205409227505745, + "grad_norm": 8.052339553833008, + "learning_rate": 1.586795805637746e-05, + "loss": 1.7814, + "step": 98730 + }, + { + "epoch": 0.6206037750672716, + "grad_norm": 6.75014066696167, + "learning_rate": 1.5867538955432806e-05, + "loss": 1.6291, + "step": 98740 + }, + { + "epoch": 0.6206666273839687, + "grad_norm": 6.427272319793701, + "learning_rate": 1.5867119854488153e-05, + "loss": 1.5339, + "step": 98750 + }, + { + "epoch": 0.6207294797006658, + "grad_norm": 7.240400791168213, + "learning_rate": 1.58667007535435e-05, + "loss": 1.8073, + "step": 98760 + }, + { + "epoch": 0.620792332017363, + "grad_norm": 7.051139831542969, + "learning_rate": 1.5866281652598847e-05, + "loss": 1.5311, + "step": 98770 + }, + { + "epoch": 0.6208551843340601, + "grad_norm": 7.380163192749023, + "learning_rate": 1.586586255165419e-05, + "loss": 1.6614, + "step": 98780 + }, + { + "epoch": 0.6209180366507572, + "grad_norm": 6.821539402008057, + "learning_rate": 
1.5865443450709538e-05, + "loss": 1.6406, + "step": 98790 + }, + { + "epoch": 0.6209808889674543, + "grad_norm": 6.191500663757324, + "learning_rate": 1.5865024349764885e-05, + "loss": 1.5525, + "step": 98800 + }, + { + "epoch": 0.6210437412841514, + "grad_norm": 6.564124584197998, + "learning_rate": 1.5864605248820232e-05, + "loss": 1.6494, + "step": 98810 + }, + { + "epoch": 0.6211065936008485, + "grad_norm": 7.13649845123291, + "learning_rate": 1.586418614787558e-05, + "loss": 1.6922, + "step": 98820 + }, + { + "epoch": 0.6211694459175456, + "grad_norm": 5.814675331115723, + "learning_rate": 1.5863767046930926e-05, + "loss": 1.7732, + "step": 98830 + }, + { + "epoch": 0.6212322982342428, + "grad_norm": 6.690029144287109, + "learning_rate": 1.5863347945986273e-05, + "loss": 1.5637, + "step": 98840 + }, + { + "epoch": 0.6212951505509399, + "grad_norm": 6.871272087097168, + "learning_rate": 1.5862928845041617e-05, + "loss": 1.5397, + "step": 98850 + }, + { + "epoch": 0.621358002867637, + "grad_norm": 5.294637680053711, + "learning_rate": 1.5862509744096964e-05, + "loss": 1.5828, + "step": 98860 + }, + { + "epoch": 0.6214208551843341, + "grad_norm": 7.080363750457764, + "learning_rate": 1.586209064315231e-05, + "loss": 1.8897, + "step": 98870 + }, + { + "epoch": 0.6214837075010312, + "grad_norm": 6.606733798980713, + "learning_rate": 1.5861671542207658e-05, + "loss": 1.6083, + "step": 98880 + }, + { + "epoch": 0.6215465598177283, + "grad_norm": 6.138832092285156, + "learning_rate": 1.5861252441263005e-05, + "loss": 1.7771, + "step": 98890 + }, + { + "epoch": 0.6216094121344254, + "grad_norm": 5.845148086547852, + "learning_rate": 1.5860833340318352e-05, + "loss": 1.5498, + "step": 98900 + }, + { + "epoch": 0.6216722644511226, + "grad_norm": 7.054131984710693, + "learning_rate": 1.5860414239373696e-05, + "loss": 1.5525, + "step": 98910 + }, + { + "epoch": 0.6217351167678196, + "grad_norm": 6.092016696929932, + "learning_rate": 1.5859995138429043e-05, + "loss": 
1.7413, + "step": 98920 + }, + { + "epoch": 0.6217979690845167, + "grad_norm": 7.949788570404053, + "learning_rate": 1.585957603748439e-05, + "loss": 1.8284, + "step": 98930 + }, + { + "epoch": 0.6218608214012138, + "grad_norm": 6.822504997253418, + "learning_rate": 1.5859156936539737e-05, + "loss": 1.7476, + "step": 98940 + }, + { + "epoch": 0.6219236737179109, + "grad_norm": 6.951657295227051, + "learning_rate": 1.585873783559508e-05, + "loss": 1.7433, + "step": 98950 + }, + { + "epoch": 0.621986526034608, + "grad_norm": 7.07517671585083, + "learning_rate": 1.5858318734650428e-05, + "loss": 1.9471, + "step": 98960 + }, + { + "epoch": 0.6220493783513051, + "grad_norm": 6.086246967315674, + "learning_rate": 1.5857899633705775e-05, + "loss": 1.4152, + "step": 98970 + }, + { + "epoch": 0.6221122306680023, + "grad_norm": 6.832052230834961, + "learning_rate": 1.5857480532761122e-05, + "loss": 1.4304, + "step": 98980 + }, + { + "epoch": 0.6221750829846994, + "grad_norm": 6.731996059417725, + "learning_rate": 1.585706143181647e-05, + "loss": 1.6459, + "step": 98990 + }, + { + "epoch": 0.6222379353013965, + "grad_norm": 6.495983123779297, + "learning_rate": 1.5856642330871813e-05, + "loss": 1.6378, + "step": 99000 + }, + { + "epoch": 0.6223007876180936, + "grad_norm": 6.157533645629883, + "learning_rate": 1.585622322992716e-05, + "loss": 1.7741, + "step": 99010 + }, + { + "epoch": 0.6223636399347907, + "grad_norm": 7.105135440826416, + "learning_rate": 1.5855804128982507e-05, + "loss": 1.8572, + "step": 99020 + }, + { + "epoch": 0.6224264922514878, + "grad_norm": 6.373230457305908, + "learning_rate": 1.5855385028037854e-05, + "loss": 1.7724, + "step": 99030 + }, + { + "epoch": 0.6224893445681849, + "grad_norm": 6.70004415512085, + "learning_rate": 1.58549659270932e-05, + "loss": 1.6024, + "step": 99040 + }, + { + "epoch": 0.6225521968848821, + "grad_norm": 6.644879341125488, + "learning_rate": 1.5854588736243012e-05, + "loss": 1.6875, + "step": 99050 + }, + { + "epoch": 
0.6226150492015792, + "grad_norm": 6.124501705169678, + "learning_rate": 1.585416963529836e-05, + "loss": 1.5633, + "step": 99060 + }, + { + "epoch": 0.6226779015182763, + "grad_norm": 5.908251762390137, + "learning_rate": 1.5853750534353707e-05, + "loss": 1.5969, + "step": 99070 + }, + { + "epoch": 0.6227407538349734, + "grad_norm": 6.524896144866943, + "learning_rate": 1.585333143340905e-05, + "loss": 1.8025, + "step": 99080 + }, + { + "epoch": 0.6228036061516705, + "grad_norm": 5.9396562576293945, + "learning_rate": 1.5852912332464397e-05, + "loss": 1.5912, + "step": 99090 + }, + { + "epoch": 0.6228664584683676, + "grad_norm": 9.76508903503418, + "learning_rate": 1.5852493231519744e-05, + "loss": 1.4613, + "step": 99100 + }, + { + "epoch": 0.6229293107850647, + "grad_norm": 6.710978031158447, + "learning_rate": 1.585207413057509e-05, + "loss": 1.7373, + "step": 99110 + }, + { + "epoch": 0.6229921631017619, + "grad_norm": 6.091090202331543, + "learning_rate": 1.585165502963044e-05, + "loss": 1.4173, + "step": 99120 + }, + { + "epoch": 0.623055015418459, + "grad_norm": 6.581202030181885, + "learning_rate": 1.5851235928685786e-05, + "loss": 1.7805, + "step": 99130 + }, + { + "epoch": 0.6231178677351561, + "grad_norm": 6.508041858673096, + "learning_rate": 1.5850816827741133e-05, + "loss": 1.8844, + "step": 99140 + }, + { + "epoch": 0.6231807200518532, + "grad_norm": 7.098252773284912, + "learning_rate": 1.585039772679648e-05, + "loss": 1.6756, + "step": 99150 + }, + { + "epoch": 0.6232435723685503, + "grad_norm": 6.896302700042725, + "learning_rate": 1.5849978625851823e-05, + "loss": 1.5964, + "step": 99160 + }, + { + "epoch": 0.6233064246852473, + "grad_norm": 7.160173416137695, + "learning_rate": 1.584955952490717e-05, + "loss": 1.5011, + "step": 99170 + }, + { + "epoch": 0.6233692770019444, + "grad_norm": 6.765491008758545, + "learning_rate": 1.5849140423962518e-05, + "loss": 1.7538, + "step": 99180 + }, + { + "epoch": 0.6234321293186416, + "grad_norm": 
7.4504499435424805, + "learning_rate": 1.5848721323017865e-05, + "loss": 1.856, + "step": 99190 + }, + { + "epoch": 0.6234949816353387, + "grad_norm": 7.950074195861816, + "learning_rate": 1.5848302222073212e-05, + "loss": 1.5604, + "step": 99200 + }, + { + "epoch": 0.6235578339520358, + "grad_norm": 6.2002482414245605, + "learning_rate": 1.5847883121128555e-05, + "loss": 1.7324, + "step": 99210 + }, + { + "epoch": 0.6236206862687329, + "grad_norm": 5.960773468017578, + "learning_rate": 1.5847464020183903e-05, + "loss": 1.5907, + "step": 99220 + }, + { + "epoch": 0.62368353858543, + "grad_norm": 5.5091094970703125, + "learning_rate": 1.584704491923925e-05, + "loss": 1.6034, + "step": 99230 + }, + { + "epoch": 0.6237463909021271, + "grad_norm": 6.118611812591553, + "learning_rate": 1.5846625818294597e-05, + "loss": 1.6115, + "step": 99240 + }, + { + "epoch": 0.6238092432188242, + "grad_norm": 6.906420707702637, + "learning_rate": 1.584620671734994e-05, + "loss": 1.7667, + "step": 99250 + }, + { + "epoch": 0.6238720955355214, + "grad_norm": 6.723021507263184, + "learning_rate": 1.5845787616405287e-05, + "loss": 1.4951, + "step": 99260 + }, + { + "epoch": 0.6239349478522185, + "grad_norm": 6.305889129638672, + "learning_rate": 1.5845368515460634e-05, + "loss": 1.5258, + "step": 99270 + }, + { + "epoch": 0.6239978001689156, + "grad_norm": 6.666172981262207, + "learning_rate": 1.584494941451598e-05, + "loss": 1.7402, + "step": 99280 + }, + { + "epoch": 0.6240606524856127, + "grad_norm": 6.768346786499023, + "learning_rate": 1.584453031357133e-05, + "loss": 1.8403, + "step": 99290 + }, + { + "epoch": 0.6241235048023098, + "grad_norm": 5.794680118560791, + "learning_rate": 1.5844111212626672e-05, + "loss": 1.5025, + "step": 99300 + }, + { + "epoch": 0.6241863571190069, + "grad_norm": 7.722352981567383, + "learning_rate": 1.584369211168202e-05, + "loss": 1.5967, + "step": 99310 + }, + { + "epoch": 0.6242492094357041, + "grad_norm": 6.2834906578063965, + "learning_rate": 
1.5843273010737366e-05, + "loss": 1.7963, + "step": 99320 + }, + { + "epoch": 0.6243120617524012, + "grad_norm": 6.741940975189209, + "learning_rate": 1.5842853909792714e-05, + "loss": 1.4822, + "step": 99330 + }, + { + "epoch": 0.6243749140690983, + "grad_norm": 6.804315090179443, + "learning_rate": 1.584243480884806e-05, + "loss": 1.7334, + "step": 99340 + }, + { + "epoch": 0.6244377663857954, + "grad_norm": 7.085864067077637, + "learning_rate": 1.5842015707903408e-05, + "loss": 1.7177, + "step": 99350 + }, + { + "epoch": 0.6245006187024925, + "grad_norm": 7.832089900970459, + "learning_rate": 1.5841596606958755e-05, + "loss": 1.5916, + "step": 99360 + }, + { + "epoch": 0.6245634710191896, + "grad_norm": 6.379815101623535, + "learning_rate": 1.5841177506014102e-05, + "loss": 1.7412, + "step": 99370 + }, + { + "epoch": 0.6246263233358867, + "grad_norm": 6.425118923187256, + "learning_rate": 1.584075840506945e-05, + "loss": 1.5342, + "step": 99380 + }, + { + "epoch": 0.6246891756525839, + "grad_norm": 6.985279083251953, + "learning_rate": 1.5840339304124793e-05, + "loss": 1.8981, + "step": 99390 + }, + { + "epoch": 0.624752027969281, + "grad_norm": 5.208855152130127, + "learning_rate": 1.583992020318014e-05, + "loss": 1.6735, + "step": 99400 + }, + { + "epoch": 0.6248148802859781, + "grad_norm": 5.808994770050049, + "learning_rate": 1.5839501102235487e-05, + "loss": 1.4748, + "step": 99410 + }, + { + "epoch": 0.6248777326026752, + "grad_norm": 6.836221694946289, + "learning_rate": 1.5839082001290834e-05, + "loss": 1.739, + "step": 99420 + }, + { + "epoch": 0.6249405849193722, + "grad_norm": 6.197317123413086, + "learning_rate": 1.5838662900346177e-05, + "loss": 1.5554, + "step": 99430 + }, + { + "epoch": 0.6250034372360693, + "grad_norm": 8.52884578704834, + "learning_rate": 1.5838243799401525e-05, + "loss": 1.7089, + "step": 99440 + }, + { + "epoch": 0.6250662895527664, + "grad_norm": 6.217985153198242, + "learning_rate": 1.583782469845687e-05, + "loss": 1.7057, + 
"step": 99450 + }, + { + "epoch": 0.6251291418694636, + "grad_norm": 6.842314720153809, + "learning_rate": 1.583740559751222e-05, + "loss": 1.776, + "step": 99460 + }, + { + "epoch": 0.6251919941861607, + "grad_norm": 6.270942211151123, + "learning_rate": 1.5836986496567562e-05, + "loss": 1.7268, + "step": 99470 + }, + { + "epoch": 0.6252548465028578, + "grad_norm": 6.600823879241943, + "learning_rate": 1.583656739562291e-05, + "loss": 1.3532, + "step": 99480 + }, + { + "epoch": 0.6253176988195549, + "grad_norm": 6.996237754821777, + "learning_rate": 1.5836148294678256e-05, + "loss": 1.7153, + "step": 99490 + }, + { + "epoch": 0.625380551136252, + "grad_norm": 6.548280239105225, + "learning_rate": 1.5835729193733604e-05, + "loss": 1.8595, + "step": 99500 + }, + { + "epoch": 0.6254434034529491, + "grad_norm": 5.533222198486328, + "learning_rate": 1.583531009278895e-05, + "loss": 1.4936, + "step": 99510 + }, + { + "epoch": 0.6255062557696462, + "grad_norm": 6.712929725646973, + "learning_rate": 1.5834890991844298e-05, + "loss": 1.8575, + "step": 99520 + }, + { + "epoch": 0.6255691080863434, + "grad_norm": 6.34805154800415, + "learning_rate": 1.5834471890899645e-05, + "loss": 1.7679, + "step": 99530 + }, + { + "epoch": 0.6256319604030405, + "grad_norm": 6.5129218101501465, + "learning_rate": 1.583405278995499e-05, + "loss": 1.6556, + "step": 99540 + }, + { + "epoch": 0.6256948127197376, + "grad_norm": 6.358635902404785, + "learning_rate": 1.5833633689010336e-05, + "loss": 1.7738, + "step": 99550 + }, + { + "epoch": 0.6257576650364347, + "grad_norm": 6.160097122192383, + "learning_rate": 1.5833214588065683e-05, + "loss": 1.5183, + "step": 99560 + }, + { + "epoch": 0.6258205173531318, + "grad_norm": 7.927013397216797, + "learning_rate": 1.583279548712103e-05, + "loss": 1.751, + "step": 99570 + }, + { + "epoch": 0.6258833696698289, + "grad_norm": 5.808539390563965, + "learning_rate": 1.5832376386176377e-05, + "loss": 1.6338, + "step": 99580 + }, + { + "epoch": 
0.625946221986526, + "grad_norm": 5.886746883392334, + "learning_rate": 1.5831957285231724e-05, + "loss": 1.8113, + "step": 99590 + }, + { + "epoch": 0.6260090743032232, + "grad_norm": 6.869828224182129, + "learning_rate": 1.583153818428707e-05, + "loss": 1.7536, + "step": 99600 + }, + { + "epoch": 0.6260719266199203, + "grad_norm": 6.9043097496032715, + "learning_rate": 1.5831119083342415e-05, + "loss": 1.5779, + "step": 99610 + }, + { + "epoch": 0.6261347789366174, + "grad_norm": 7.80623197555542, + "learning_rate": 1.583069998239776e-05, + "loss": 1.6508, + "step": 99620 + }, + { + "epoch": 0.6261976312533145, + "grad_norm": 7.624624729156494, + "learning_rate": 1.583028088145311e-05, + "loss": 1.7147, + "step": 99630 + }, + { + "epoch": 0.6262604835700116, + "grad_norm": 7.440316677093506, + "learning_rate": 1.5829861780508456e-05, + "loss": 1.6111, + "step": 99640 + }, + { + "epoch": 0.6263233358867087, + "grad_norm": 7.38603401184082, + "learning_rate": 1.58294426795638e-05, + "loss": 1.5613, + "step": 99650 + }, + { + "epoch": 0.6263861882034059, + "grad_norm": 7.0097246170043945, + "learning_rate": 1.5829023578619147e-05, + "loss": 1.867, + "step": 99660 + }, + { + "epoch": 0.626449040520103, + "grad_norm": 6.874953269958496, + "learning_rate": 1.5828604477674494e-05, + "loss": 1.549, + "step": 99670 + }, + { + "epoch": 0.6265118928368, + "grad_norm": 6.660768508911133, + "learning_rate": 1.582818537672984e-05, + "loss": 1.6114, + "step": 99680 + }, + { + "epoch": 0.6265747451534971, + "grad_norm": 5.996919631958008, + "learning_rate": 1.5827766275785188e-05, + "loss": 1.6485, + "step": 99690 + }, + { + "epoch": 0.6266375974701942, + "grad_norm": 6.453237533569336, + "learning_rate": 1.582734717484053e-05, + "loss": 1.6214, + "step": 99700 + }, + { + "epoch": 0.6267004497868913, + "grad_norm": 8.109010696411133, + "learning_rate": 1.582692807389588e-05, + "loss": 1.769, + "step": 99710 + }, + { + "epoch": 0.6267633021035884, + "grad_norm": 
6.3128228187561035, + "learning_rate": 1.5826508972951226e-05, + "loss": 1.8159, + "step": 99720 + }, + { + "epoch": 0.6268261544202856, + "grad_norm": 6.217896461486816, + "learning_rate": 1.5826089872006573e-05, + "loss": 1.4919, + "step": 99730 + }, + { + "epoch": 0.6268890067369827, + "grad_norm": 7.381162166595459, + "learning_rate": 1.582567077106192e-05, + "loss": 1.7286, + "step": 99740 + }, + { + "epoch": 0.6269518590536798, + "grad_norm": 7.417752265930176, + "learning_rate": 1.5825251670117267e-05, + "loss": 1.6283, + "step": 99750 + }, + { + "epoch": 0.6270147113703769, + "grad_norm": 6.68156099319458, + "learning_rate": 1.5824832569172614e-05, + "loss": 1.6258, + "step": 99760 + }, + { + "epoch": 0.627077563687074, + "grad_norm": 6.632552146911621, + "learning_rate": 1.582441346822796e-05, + "loss": 1.8614, + "step": 99770 + }, + { + "epoch": 0.6271404160037711, + "grad_norm": 7.476672649383545, + "learning_rate": 1.5823994367283305e-05, + "loss": 1.8872, + "step": 99780 + }, + { + "epoch": 0.6272032683204682, + "grad_norm": 6.832566738128662, + "learning_rate": 1.582357526633865e-05, + "loss": 1.7, + "step": 99790 + }, + { + "epoch": 0.6272661206371654, + "grad_norm": 7.108794689178467, + "learning_rate": 1.5823156165394e-05, + "loss": 1.7619, + "step": 99800 + }, + { + "epoch": 0.6273289729538625, + "grad_norm": 9.079567909240723, + "learning_rate": 1.5822737064449346e-05, + "loss": 1.7159, + "step": 99810 + }, + { + "epoch": 0.6273918252705596, + "grad_norm": 6.384576320648193, + "learning_rate": 1.5822317963504693e-05, + "loss": 1.7503, + "step": 99820 + }, + { + "epoch": 0.6274546775872567, + "grad_norm": 7.277607440948486, + "learning_rate": 1.5821898862560037e-05, + "loss": 1.7782, + "step": 99830 + }, + { + "epoch": 0.6275175299039538, + "grad_norm": 6.08124303817749, + "learning_rate": 1.5821479761615384e-05, + "loss": 1.6447, + "step": 99840 + }, + { + "epoch": 0.6275803822206509, + "grad_norm": 8.458341598510742, + "learning_rate": 
1.582106066067073e-05, + "loss": 1.6659, + "step": 99850 + }, + { + "epoch": 0.627643234537348, + "grad_norm": 7.769322872161865, + "learning_rate": 1.5820641559726078e-05, + "loss": 1.612, + "step": 99860 + }, + { + "epoch": 0.6277060868540452, + "grad_norm": 6.998531818389893, + "learning_rate": 1.582022245878142e-05, + "loss": 1.3814, + "step": 99870 + }, + { + "epoch": 0.6277689391707423, + "grad_norm": 6.404359340667725, + "learning_rate": 1.581980335783677e-05, + "loss": 1.6981, + "step": 99880 + }, + { + "epoch": 0.6278317914874394, + "grad_norm": 6.724607944488525, + "learning_rate": 1.5819384256892116e-05, + "loss": 1.6364, + "step": 99890 + }, + { + "epoch": 0.6278946438041365, + "grad_norm": 7.201076030731201, + "learning_rate": 1.5818965155947463e-05, + "loss": 1.7632, + "step": 99900 + }, + { + "epoch": 0.6279574961208336, + "grad_norm": 6.761590003967285, + "learning_rate": 1.581854605500281e-05, + "loss": 1.4493, + "step": 99910 + }, + { + "epoch": 0.6280203484375307, + "grad_norm": 6.595172882080078, + "learning_rate": 1.5818126954058153e-05, + "loss": 1.5363, + "step": 99920 + }, + { + "epoch": 0.6280832007542279, + "grad_norm": 6.319594383239746, + "learning_rate": 1.58177078531135e-05, + "loss": 1.5987, + "step": 99930 + }, + { + "epoch": 0.6281460530709249, + "grad_norm": 5.454696178436279, + "learning_rate": 1.5817288752168848e-05, + "loss": 1.5252, + "step": 99940 + }, + { + "epoch": 0.628208905387622, + "grad_norm": 7.6446919441223145, + "learning_rate": 1.5816869651224195e-05, + "loss": 1.9102, + "step": 99950 + }, + { + "epoch": 0.6282717577043191, + "grad_norm": 9.58559799194336, + "learning_rate": 1.5816450550279542e-05, + "loss": 1.6907, + "step": 99960 + }, + { + "epoch": 0.6283346100210162, + "grad_norm": 6.981563091278076, + "learning_rate": 1.581603144933489e-05, + "loss": 1.7332, + "step": 99970 + }, + { + "epoch": 0.6283974623377133, + "grad_norm": 5.508819580078125, + "learning_rate": 1.5815612348390236e-05, + "loss": 1.6118, + 
"step": 99980 + }, + { + "epoch": 0.6284603146544104, + "grad_norm": 6.374399662017822, + "learning_rate": 1.5815193247445583e-05, + "loss": 1.4557, + "step": 99990 + }, + { + "epoch": 0.6285231669711075, + "grad_norm": 7.851498603820801, + "learning_rate": 1.581477414650093e-05, + "loss": 1.6887, + "step": 100000 + }, + { + "epoch": 0.6285860192878047, + "grad_norm": 6.407346248626709, + "learning_rate": 1.5814355045556274e-05, + "loss": 1.9452, + "step": 100010 + }, + { + "epoch": 0.6286488716045018, + "grad_norm": 5.7401933670043945, + "learning_rate": 1.581393594461162e-05, + "loss": 1.8405, + "step": 100020 + }, + { + "epoch": 0.6287117239211989, + "grad_norm": 9.775955200195312, + "learning_rate": 1.5813516843666968e-05, + "loss": 1.6058, + "step": 100030 + }, + { + "epoch": 0.628774576237896, + "grad_norm": 6.347497463226318, + "learning_rate": 1.5813097742722315e-05, + "loss": 1.8753, + "step": 100040 + }, + { + "epoch": 0.6288374285545931, + "grad_norm": 6.3936052322387695, + "learning_rate": 1.581267864177766e-05, + "loss": 1.4589, + "step": 100050 + }, + { + "epoch": 0.6289002808712902, + "grad_norm": 5.969712257385254, + "learning_rate": 1.5812259540833006e-05, + "loss": 1.4234, + "step": 100060 + }, + { + "epoch": 0.6289631331879874, + "grad_norm": 7.093703269958496, + "learning_rate": 1.5811840439888353e-05, + "loss": 1.7106, + "step": 100070 + }, + { + "epoch": 0.6290259855046845, + "grad_norm": 7.396203517913818, + "learning_rate": 1.58114213389437e-05, + "loss": 1.7015, + "step": 100080 + }, + { + "epoch": 0.6290888378213816, + "grad_norm": 6.682152271270752, + "learning_rate": 1.5811002237999043e-05, + "loss": 1.6464, + "step": 100090 + }, + { + "epoch": 0.6291516901380787, + "grad_norm": 6.390876770019531, + "learning_rate": 1.581058313705439e-05, + "loss": 1.6781, + "step": 100100 + }, + { + "epoch": 0.6292145424547758, + "grad_norm": 6.433990001678467, + "learning_rate": 1.5810164036109738e-05, + "loss": 1.4025, + "step": 100110 + }, + { + 
"epoch": 0.6292773947714729, + "grad_norm": 6.061580181121826, + "learning_rate": 1.5809744935165085e-05, + "loss": 1.5259, + "step": 100120 + }, + { + "epoch": 0.62934024708817, + "grad_norm": 6.715423107147217, + "learning_rate": 1.5809325834220432e-05, + "loss": 1.7404, + "step": 100130 + }, + { + "epoch": 0.6294030994048672, + "grad_norm": 6.707705020904541, + "learning_rate": 1.580890673327578e-05, + "loss": 1.8089, + "step": 100140 + }, + { + "epoch": 0.6294659517215643, + "grad_norm": 6.904034614562988, + "learning_rate": 1.5808487632331126e-05, + "loss": 1.8503, + "step": 100150 + }, + { + "epoch": 0.6295288040382614, + "grad_norm": 6.170143127441406, + "learning_rate": 1.580806853138647e-05, + "loss": 1.4703, + "step": 100160 + }, + { + "epoch": 0.6295916563549585, + "grad_norm": 6.437007427215576, + "learning_rate": 1.5807649430441817e-05, + "loss": 1.4874, + "step": 100170 + }, + { + "epoch": 0.6296545086716556, + "grad_norm": 6.416689395904541, + "learning_rate": 1.5807230329497164e-05, + "loss": 1.9164, + "step": 100180 + }, + { + "epoch": 0.6297173609883526, + "grad_norm": 7.378628730773926, + "learning_rate": 1.580681122855251e-05, + "loss": 1.6999, + "step": 100190 + }, + { + "epoch": 0.6297802133050497, + "grad_norm": 7.50117301940918, + "learning_rate": 1.5806392127607858e-05, + "loss": 1.6007, + "step": 100200 + }, + { + "epoch": 0.6298430656217469, + "grad_norm": 5.30483865737915, + "learning_rate": 1.5805973026663205e-05, + "loss": 1.7694, + "step": 100210 + }, + { + "epoch": 0.629905917938444, + "grad_norm": 5.686487674713135, + "learning_rate": 1.5805553925718552e-05, + "loss": 1.5099, + "step": 100220 + }, + { + "epoch": 0.6299687702551411, + "grad_norm": 7.767488479614258, + "learning_rate": 1.5805134824773896e-05, + "loss": 1.9495, + "step": 100230 + }, + { + "epoch": 0.6300316225718382, + "grad_norm": 8.49234390258789, + "learning_rate": 1.5804715723829243e-05, + "loss": 1.7609, + "step": 100240 + }, + { + "epoch": 0.6300944748885353, + 
"grad_norm": 5.479924201965332, + "learning_rate": 1.580429662288459e-05, + "loss": 1.6703, + "step": 100250 + }, + { + "epoch": 0.6301573272052324, + "grad_norm": 7.043915271759033, + "learning_rate": 1.5803877521939937e-05, + "loss": 1.6396, + "step": 100260 + }, + { + "epoch": 0.6302201795219295, + "grad_norm": 6.031806468963623, + "learning_rate": 1.580345842099528e-05, + "loss": 1.6458, + "step": 100270 + }, + { + "epoch": 0.6302830318386267, + "grad_norm": 7.834958553314209, + "learning_rate": 1.5803039320050628e-05, + "loss": 1.6333, + "step": 100280 + }, + { + "epoch": 0.6303458841553238, + "grad_norm": 6.062341690063477, + "learning_rate": 1.5802620219105975e-05, + "loss": 1.8101, + "step": 100290 + }, + { + "epoch": 0.6304087364720209, + "grad_norm": 7.53291130065918, + "learning_rate": 1.5802201118161322e-05, + "loss": 1.8961, + "step": 100300 + }, + { + "epoch": 0.630471588788718, + "grad_norm": 7.180240154266357, + "learning_rate": 1.580178201721667e-05, + "loss": 1.8702, + "step": 100310 + }, + { + "epoch": 0.6305344411054151, + "grad_norm": 6.761497497558594, + "learning_rate": 1.5801362916272013e-05, + "loss": 1.6263, + "step": 100320 + }, + { + "epoch": 0.6305972934221122, + "grad_norm": 7.409494400024414, + "learning_rate": 1.580094381532736e-05, + "loss": 1.6978, + "step": 100330 + }, + { + "epoch": 0.6306601457388094, + "grad_norm": 7.477789878845215, + "learning_rate": 1.5800524714382707e-05, + "loss": 1.6104, + "step": 100340 + }, + { + "epoch": 0.6307229980555065, + "grad_norm": 6.731983661651611, + "learning_rate": 1.5800105613438054e-05, + "loss": 1.6432, + "step": 100350 + }, + { + "epoch": 0.6307858503722036, + "grad_norm": 7.559552192687988, + "learning_rate": 1.57996865124934e-05, + "loss": 1.6665, + "step": 100360 + }, + { + "epoch": 0.6308487026889007, + "grad_norm": 6.713873386383057, + "learning_rate": 1.5799267411548748e-05, + "loss": 1.6159, + "step": 100370 + }, + { + "epoch": 0.6309115550055978, + "grad_norm": 
6.6399078369140625, + "learning_rate": 1.5798848310604095e-05, + "loss": 1.5398, + "step": 100380 + }, + { + "epoch": 0.6309744073222949, + "grad_norm": 6.55191707611084, + "learning_rate": 1.5798429209659442e-05, + "loss": 1.5245, + "step": 100390 + }, + { + "epoch": 0.631037259638992, + "grad_norm": 6.504763126373291, + "learning_rate": 1.5798010108714786e-05, + "loss": 1.6908, + "step": 100400 + }, + { + "epoch": 0.6311001119556892, + "grad_norm": 5.631613731384277, + "learning_rate": 1.5797591007770133e-05, + "loss": 1.5776, + "step": 100410 + }, + { + "epoch": 0.6311629642723863, + "grad_norm": 6.911037921905518, + "learning_rate": 1.579717190682548e-05, + "loss": 1.6871, + "step": 100420 + }, + { + "epoch": 0.6312258165890834, + "grad_norm": 6.991258144378662, + "learning_rate": 1.5796752805880827e-05, + "loss": 1.7642, + "step": 100430 + }, + { + "epoch": 0.6312886689057805, + "grad_norm": 7.394499778747559, + "learning_rate": 1.5796333704936174e-05, + "loss": 1.7376, + "step": 100440 + }, + { + "epoch": 0.6313515212224775, + "grad_norm": 6.803630828857422, + "learning_rate": 1.5795914603991518e-05, + "loss": 1.6368, + "step": 100450 + }, + { + "epoch": 0.6314143735391746, + "grad_norm": 6.7257914543151855, + "learning_rate": 1.5795495503046865e-05, + "loss": 1.7314, + "step": 100460 + }, + { + "epoch": 0.6314772258558717, + "grad_norm": 7.259373188018799, + "learning_rate": 1.5795076402102212e-05, + "loss": 1.8811, + "step": 100470 + }, + { + "epoch": 0.6315400781725689, + "grad_norm": 7.685807704925537, + "learning_rate": 1.579465730115756e-05, + "loss": 1.6636, + "step": 100480 + }, + { + "epoch": 0.631602930489266, + "grad_norm": 6.555708885192871, + "learning_rate": 1.5794238200212903e-05, + "loss": 1.5783, + "step": 100490 + }, + { + "epoch": 0.6316657828059631, + "grad_norm": 5.685708045959473, + "learning_rate": 1.579381909926825e-05, + "loss": 1.5042, + "step": 100500 + }, + { + "epoch": 0.6317286351226602, + "grad_norm": 6.318507194519043, + 
"learning_rate": 1.5793399998323597e-05, + "loss": 1.5881, + "step": 100510 + }, + { + "epoch": 0.6317914874393573, + "grad_norm": 6.510059356689453, + "learning_rate": 1.5792980897378944e-05, + "loss": 1.629, + "step": 100520 + }, + { + "epoch": 0.6318543397560544, + "grad_norm": 6.250816822052002, + "learning_rate": 1.579256179643429e-05, + "loss": 1.7354, + "step": 100530 + }, + { + "epoch": 0.6319171920727515, + "grad_norm": 7.057291030883789, + "learning_rate": 1.5792142695489635e-05, + "loss": 1.5878, + "step": 100540 + }, + { + "epoch": 0.6319800443894487, + "grad_norm": 6.631834983825684, + "learning_rate": 1.579172359454498e-05, + "loss": 1.6387, + "step": 100550 + }, + { + "epoch": 0.6320428967061458, + "grad_norm": 6.286482810974121, + "learning_rate": 1.579130449360033e-05, + "loss": 1.4187, + "step": 100560 + }, + { + "epoch": 0.6321057490228429, + "grad_norm": 6.1507368087768555, + "learning_rate": 1.5790885392655676e-05, + "loss": 1.681, + "step": 100570 + }, + { + "epoch": 0.63216860133954, + "grad_norm": 6.44645881652832, + "learning_rate": 1.5790466291711023e-05, + "loss": 1.5956, + "step": 100580 + }, + { + "epoch": 0.6322314536562371, + "grad_norm": 6.74025297164917, + "learning_rate": 1.579004719076637e-05, + "loss": 1.5966, + "step": 100590 + }, + { + "epoch": 0.6322943059729342, + "grad_norm": 6.204047203063965, + "learning_rate": 1.5789628089821717e-05, + "loss": 1.5122, + "step": 100600 + }, + { + "epoch": 0.6323571582896313, + "grad_norm": 6.540848731994629, + "learning_rate": 1.5789208988877064e-05, + "loss": 1.7332, + "step": 100610 + }, + { + "epoch": 0.6324200106063285, + "grad_norm": 5.2913737297058105, + "learning_rate": 1.578878988793241e-05, + "loss": 1.7636, + "step": 100620 + }, + { + "epoch": 0.6324828629230256, + "grad_norm": 7.495303630828857, + "learning_rate": 1.5788370786987755e-05, + "loss": 1.7023, + "step": 100630 + }, + { + "epoch": 0.6325457152397227, + "grad_norm": 6.820461750030518, + "learning_rate": 
1.5787951686043102e-05, + "loss": 1.5975, + "step": 100640 + }, + { + "epoch": 0.6326085675564198, + "grad_norm": 5.557898044586182, + "learning_rate": 1.578753258509845e-05, + "loss": 1.3953, + "step": 100650 + }, + { + "epoch": 0.6326714198731169, + "grad_norm": 6.902298927307129, + "learning_rate": 1.5787113484153796e-05, + "loss": 1.7426, + "step": 100660 + }, + { + "epoch": 0.632734272189814, + "grad_norm": 6.433447360992432, + "learning_rate": 1.578669438320914e-05, + "loss": 1.756, + "step": 100670 + }, + { + "epoch": 0.6327971245065112, + "grad_norm": 6.460206508636475, + "learning_rate": 1.5786275282264487e-05, + "loss": 1.7636, + "step": 100680 + }, + { + "epoch": 0.6328599768232083, + "grad_norm": 6.105504512786865, + "learning_rate": 1.5785856181319834e-05, + "loss": 1.7044, + "step": 100690 + }, + { + "epoch": 0.6329228291399053, + "grad_norm": 6.065561294555664, + "learning_rate": 1.578543708037518e-05, + "loss": 1.5518, + "step": 100700 + }, + { + "epoch": 0.6329856814566024, + "grad_norm": 6.046292304992676, + "learning_rate": 1.5785017979430525e-05, + "loss": 1.6447, + "step": 100710 + }, + { + "epoch": 0.6330485337732995, + "grad_norm": 6.256049633026123, + "learning_rate": 1.578459887848587e-05, + "loss": 1.8812, + "step": 100720 + }, + { + "epoch": 0.6331113860899966, + "grad_norm": 5.4630818367004395, + "learning_rate": 1.578417977754122e-05, + "loss": 1.5341, + "step": 100730 + }, + { + "epoch": 0.6331742384066937, + "grad_norm": 7.070973873138428, + "learning_rate": 1.5783760676596566e-05, + "loss": 1.7251, + "step": 100740 + }, + { + "epoch": 0.6332370907233908, + "grad_norm": 9.157733917236328, + "learning_rate": 1.5783341575651913e-05, + "loss": 1.7675, + "step": 100750 + }, + { + "epoch": 0.633299943040088, + "grad_norm": 6.445616722106934, + "learning_rate": 1.578292247470726e-05, + "loss": 1.81, + "step": 100760 + }, + { + "epoch": 0.6333627953567851, + "grad_norm": 6.646481990814209, + "learning_rate": 1.5782503373762607e-05, + "loss": 
1.7616, + "step": 100770 + }, + { + "epoch": 0.6334256476734822, + "grad_norm": 6.220576763153076, + "learning_rate": 1.5782084272817954e-05, + "loss": 1.8266, + "step": 100780 + }, + { + "epoch": 0.6334884999901793, + "grad_norm": 7.9391374588012695, + "learning_rate": 1.5781665171873298e-05, + "loss": 1.8282, + "step": 100790 + }, + { + "epoch": 0.6335513523068764, + "grad_norm": 5.481481552124023, + "learning_rate": 1.5781246070928645e-05, + "loss": 1.6849, + "step": 100800 + }, + { + "epoch": 0.6336142046235735, + "grad_norm": 7.201502323150635, + "learning_rate": 1.5780826969983992e-05, + "loss": 1.8536, + "step": 100810 + }, + { + "epoch": 0.6336770569402707, + "grad_norm": 6.303986072540283, + "learning_rate": 1.578040786903934e-05, + "loss": 1.5363, + "step": 100820 + }, + { + "epoch": 0.6337399092569678, + "grad_norm": 6.826161861419678, + "learning_rate": 1.5779988768094686e-05, + "loss": 1.6907, + "step": 100830 + }, + { + "epoch": 0.6338027615736649, + "grad_norm": 6.250027179718018, + "learning_rate": 1.5779569667150033e-05, + "loss": 1.7413, + "step": 100840 + }, + { + "epoch": 0.633865613890362, + "grad_norm": 5.6930060386657715, + "learning_rate": 1.5779150566205377e-05, + "loss": 1.7811, + "step": 100850 + }, + { + "epoch": 0.6339284662070591, + "grad_norm": 7.0470147132873535, + "learning_rate": 1.5778731465260724e-05, + "loss": 1.5405, + "step": 100860 + }, + { + "epoch": 0.6339913185237562, + "grad_norm": 5.79329776763916, + "learning_rate": 1.577831236431607e-05, + "loss": 1.6439, + "step": 100870 + }, + { + "epoch": 0.6340541708404533, + "grad_norm": 6.484084606170654, + "learning_rate": 1.5777893263371418e-05, + "loss": 1.5257, + "step": 100880 + }, + { + "epoch": 0.6341170231571505, + "grad_norm": 8.19410514831543, + "learning_rate": 1.577747416242676e-05, + "loss": 1.779, + "step": 100890 + }, + { + "epoch": 0.6341798754738476, + "grad_norm": 5.868208885192871, + "learning_rate": 1.577705506148211e-05, + "loss": 1.5361, + "step": 100900 + 
}, + { + "epoch": 0.6342427277905447, + "grad_norm": 6.817616939544678, + "learning_rate": 1.5776635960537456e-05, + "loss": 1.7818, + "step": 100910 + }, + { + "epoch": 0.6343055801072418, + "grad_norm": 6.6354451179504395, + "learning_rate": 1.5776216859592803e-05, + "loss": 1.8738, + "step": 100920 + }, + { + "epoch": 0.6343684324239389, + "grad_norm": 5.966352939605713, + "learning_rate": 1.577579775864815e-05, + "loss": 1.8658, + "step": 100930 + }, + { + "epoch": 0.634431284740636, + "grad_norm": 7.401264667510986, + "learning_rate": 1.5775378657703494e-05, + "loss": 1.8332, + "step": 100940 + }, + { + "epoch": 0.6344941370573332, + "grad_norm": 6.14589262008667, + "learning_rate": 1.577495955675884e-05, + "loss": 1.7332, + "step": 100950 + }, + { + "epoch": 0.6345569893740302, + "grad_norm": 6.752025127410889, + "learning_rate": 1.5774540455814188e-05, + "loss": 1.6043, + "step": 100960 + }, + { + "epoch": 0.6346198416907273, + "grad_norm": 6.3684258460998535, + "learning_rate": 1.5774121354869535e-05, + "loss": 1.5993, + "step": 100970 + }, + { + "epoch": 0.6346826940074244, + "grad_norm": 6.318819522857666, + "learning_rate": 1.5773702253924882e-05, + "loss": 1.6613, + "step": 100980 + }, + { + "epoch": 0.6347455463241215, + "grad_norm": 5.311209678649902, + "learning_rate": 1.577328315298023e-05, + "loss": 1.6114, + "step": 100990 + }, + { + "epoch": 0.6348083986408186, + "grad_norm": 6.803685188293457, + "learning_rate": 1.5772864052035576e-05, + "loss": 1.6155, + "step": 101000 + }, + { + "epoch": 0.6348712509575157, + "grad_norm": 6.295719623565674, + "learning_rate": 1.5772444951090923e-05, + "loss": 1.6225, + "step": 101010 + }, + { + "epoch": 0.6349341032742128, + "grad_norm": 6.521117687225342, + "learning_rate": 1.5772025850146267e-05, + "loss": 1.5671, + "step": 101020 + }, + { + "epoch": 0.63499695559091, + "grad_norm": 6.544680595397949, + "learning_rate": 1.5771606749201614e-05, + "loss": 1.4785, + "step": 101030 + }, + { + "epoch": 
0.6350598079076071, + "grad_norm": 6.518697738647461, + "learning_rate": 1.577118764825696e-05, + "loss": 1.6918, + "step": 101040 + }, + { + "epoch": 0.6351226602243042, + "grad_norm": 7.030117511749268, + "learning_rate": 1.5770768547312308e-05, + "loss": 1.6475, + "step": 101050 + }, + { + "epoch": 0.6351855125410013, + "grad_norm": 6.561397075653076, + "learning_rate": 1.5770349446367655e-05, + "loss": 1.5397, + "step": 101060 + }, + { + "epoch": 0.6352483648576984, + "grad_norm": 6.138189792633057, + "learning_rate": 1.5769930345423e-05, + "loss": 1.7127, + "step": 101070 + }, + { + "epoch": 0.6353112171743955, + "grad_norm": 6.9077277183532715, + "learning_rate": 1.5769511244478346e-05, + "loss": 1.7314, + "step": 101080 + }, + { + "epoch": 0.6353740694910927, + "grad_norm": 6.724200248718262, + "learning_rate": 1.5769092143533693e-05, + "loss": 1.6853, + "step": 101090 + }, + { + "epoch": 0.6354369218077898, + "grad_norm": 7.189093589782715, + "learning_rate": 1.576867304258904e-05, + "loss": 1.6719, + "step": 101100 + }, + { + "epoch": 0.6354997741244869, + "grad_norm": 6.973060607910156, + "learning_rate": 1.5768253941644384e-05, + "loss": 1.5184, + "step": 101110 + }, + { + "epoch": 0.635562626441184, + "grad_norm": 7.763575553894043, + "learning_rate": 1.576783484069973e-05, + "loss": 1.6132, + "step": 101120 + }, + { + "epoch": 0.6356254787578811, + "grad_norm": 5.495080471038818, + "learning_rate": 1.5767415739755078e-05, + "loss": 1.694, + "step": 101130 + }, + { + "epoch": 0.6356883310745782, + "grad_norm": 6.750556468963623, + "learning_rate": 1.5766996638810425e-05, + "loss": 1.5549, + "step": 101140 + }, + { + "epoch": 0.6357511833912753, + "grad_norm": 6.03955078125, + "learning_rate": 1.5766577537865772e-05, + "loss": 1.716, + "step": 101150 + }, + { + "epoch": 0.6358140357079725, + "grad_norm": 4.914484024047852, + "learning_rate": 1.576615843692112e-05, + "loss": 1.5294, + "step": 101160 + }, + { + "epoch": 0.6358768880246696, + "grad_norm": 
6.851123809814453, + "learning_rate": 1.5765739335976463e-05, + "loss": 1.582, + "step": 101170 + }, + { + "epoch": 0.6359397403413667, + "grad_norm": 6.215464115142822, + "learning_rate": 1.576532023503181e-05, + "loss": 1.5753, + "step": 101180 + }, + { + "epoch": 0.6360025926580638, + "grad_norm": 7.140315532684326, + "learning_rate": 1.5764901134087157e-05, + "loss": 1.7042, + "step": 101190 + }, + { + "epoch": 0.6360654449747609, + "grad_norm": 6.683956146240234, + "learning_rate": 1.5764482033142504e-05, + "loss": 1.4798, + "step": 101200 + }, + { + "epoch": 0.6361282972914579, + "grad_norm": 6.934081077575684, + "learning_rate": 1.576406293219785e-05, + "loss": 1.6671, + "step": 101210 + }, + { + "epoch": 0.636191149608155, + "grad_norm": 6.101939678192139, + "learning_rate": 1.5763643831253198e-05, + "loss": 1.5294, + "step": 101220 + }, + { + "epoch": 0.6362540019248522, + "grad_norm": 6.583488941192627, + "learning_rate": 1.5763224730308545e-05, + "loss": 1.6551, + "step": 101230 + }, + { + "epoch": 0.6363168542415493, + "grad_norm": 5.876173973083496, + "learning_rate": 1.5762805629363892e-05, + "loss": 1.6241, + "step": 101240 + }, + { + "epoch": 0.6363797065582464, + "grad_norm": 7.093881130218506, + "learning_rate": 1.5762386528419236e-05, + "loss": 1.5319, + "step": 101250 + }, + { + "epoch": 0.6364425588749435, + "grad_norm": 6.061024188995361, + "learning_rate": 1.5761967427474583e-05, + "loss": 1.6872, + "step": 101260 + }, + { + "epoch": 0.6365054111916406, + "grad_norm": 6.763847351074219, + "learning_rate": 1.576154832652993e-05, + "loss": 1.7427, + "step": 101270 + }, + { + "epoch": 0.6365682635083377, + "grad_norm": 6.708096027374268, + "learning_rate": 1.5761129225585277e-05, + "loss": 1.7042, + "step": 101280 + }, + { + "epoch": 0.6366311158250348, + "grad_norm": 6.899614334106445, + "learning_rate": 1.576071012464062e-05, + "loss": 1.7199, + "step": 101290 + }, + { + "epoch": 0.636693968141732, + "grad_norm": 6.982869625091553, + 
"learning_rate": 1.5760291023695968e-05, + "loss": 1.4169, + "step": 101300 + }, + { + "epoch": 0.6367568204584291, + "grad_norm": 5.419250011444092, + "learning_rate": 1.5759871922751315e-05, + "loss": 1.4193, + "step": 101310 + }, + { + "epoch": 0.6368196727751262, + "grad_norm": 6.4072041511535645, + "learning_rate": 1.5759452821806662e-05, + "loss": 1.724, + "step": 101320 + }, + { + "epoch": 0.6368825250918233, + "grad_norm": 6.207793235778809, + "learning_rate": 1.575903372086201e-05, + "loss": 1.648, + "step": 101330 + }, + { + "epoch": 0.6369453774085204, + "grad_norm": 6.272695064544678, + "learning_rate": 1.5758614619917353e-05, + "loss": 1.4553, + "step": 101340 + }, + { + "epoch": 0.6370082297252175, + "grad_norm": 7.17915678024292, + "learning_rate": 1.57581955189727e-05, + "loss": 1.6907, + "step": 101350 + }, + { + "epoch": 0.6370710820419146, + "grad_norm": 7.949512958526611, + "learning_rate": 1.5757776418028047e-05, + "loss": 1.4746, + "step": 101360 + }, + { + "epoch": 0.6371339343586118, + "grad_norm": 7.116039276123047, + "learning_rate": 1.5757357317083394e-05, + "loss": 1.6751, + "step": 101370 + }, + { + "epoch": 0.6371967866753089, + "grad_norm": 6.2278923988342285, + "learning_rate": 1.575693821613874e-05, + "loss": 1.528, + "step": 101380 + }, + { + "epoch": 0.637259638992006, + "grad_norm": 6.393247127532959, + "learning_rate": 1.5756519115194088e-05, + "loss": 1.664, + "step": 101390 + }, + { + "epoch": 0.6373224913087031, + "grad_norm": 7.355311393737793, + "learning_rate": 1.5756100014249435e-05, + "loss": 1.6343, + "step": 101400 + }, + { + "epoch": 0.6373853436254002, + "grad_norm": 5.815723896026611, + "learning_rate": 1.5755680913304782e-05, + "loss": 1.637, + "step": 101410 + }, + { + "epoch": 0.6374481959420973, + "grad_norm": 6.428781032562256, + "learning_rate": 1.5755261812360126e-05, + "loss": 1.7356, + "step": 101420 + }, + { + "epoch": 0.6375110482587945, + "grad_norm": 6.101195335388184, + "learning_rate": 
1.5754842711415473e-05, + "loss": 1.514, + "step": 101430 + }, + { + "epoch": 0.6375739005754916, + "grad_norm": 6.362186908721924, + "learning_rate": 1.575442361047082e-05, + "loss": 1.7776, + "step": 101440 + }, + { + "epoch": 0.6376367528921887, + "grad_norm": 7.063998222351074, + "learning_rate": 1.5754004509526167e-05, + "loss": 1.6367, + "step": 101450 + }, + { + "epoch": 0.6376996052088858, + "grad_norm": 7.601218223571777, + "learning_rate": 1.5753585408581514e-05, + "loss": 1.6837, + "step": 101460 + }, + { + "epoch": 0.6377624575255828, + "grad_norm": 6.573344707489014, + "learning_rate": 1.5753166307636858e-05, + "loss": 1.5347, + "step": 101470 + }, + { + "epoch": 0.6378253098422799, + "grad_norm": 5.948842525482178, + "learning_rate": 1.5752747206692205e-05, + "loss": 1.528, + "step": 101480 + }, + { + "epoch": 0.637888162158977, + "grad_norm": 6.568027019500732, + "learning_rate": 1.5752328105747552e-05, + "loss": 1.4821, + "step": 101490 + }, + { + "epoch": 0.6379510144756741, + "grad_norm": 5.7263875007629395, + "learning_rate": 1.57519090048029e-05, + "loss": 1.4768, + "step": 101500 + }, + { + "epoch": 0.6380138667923713, + "grad_norm": 7.680974960327148, + "learning_rate": 1.5751489903858243e-05, + "loss": 1.8568, + "step": 101510 + }, + { + "epoch": 0.6380767191090684, + "grad_norm": 6.494784355163574, + "learning_rate": 1.575107080291359e-05, + "loss": 1.6616, + "step": 101520 + }, + { + "epoch": 0.6381395714257655, + "grad_norm": 6.3536224365234375, + "learning_rate": 1.5750651701968937e-05, + "loss": 1.6196, + "step": 101530 + }, + { + "epoch": 0.6382024237424626, + "grad_norm": 5.7544145584106445, + "learning_rate": 1.5750232601024284e-05, + "loss": 1.5604, + "step": 101540 + }, + { + "epoch": 0.6382652760591597, + "grad_norm": 7.191973686218262, + "learning_rate": 1.574981350007963e-05, + "loss": 1.6539, + "step": 101550 + }, + { + "epoch": 0.6383281283758568, + "grad_norm": 7.5846028327941895, + "learning_rate": 1.5749394399134975e-05, + 
"loss": 1.6648, + "step": 101560 + }, + { + "epoch": 0.638390980692554, + "grad_norm": 6.780168056488037, + "learning_rate": 1.5748975298190322e-05, + "loss": 1.8323, + "step": 101570 + }, + { + "epoch": 0.6384538330092511, + "grad_norm": 7.254518508911133, + "learning_rate": 1.574855619724567e-05, + "loss": 1.6612, + "step": 101580 + }, + { + "epoch": 0.6385166853259482, + "grad_norm": 7.408272743225098, + "learning_rate": 1.5748137096301016e-05, + "loss": 1.6778, + "step": 101590 + }, + { + "epoch": 0.6385795376426453, + "grad_norm": 6.666190147399902, + "learning_rate": 1.5747717995356363e-05, + "loss": 1.6954, + "step": 101600 + }, + { + "epoch": 0.6386423899593424, + "grad_norm": 7.656550407409668, + "learning_rate": 1.574729889441171e-05, + "loss": 1.8318, + "step": 101610 + }, + { + "epoch": 0.6387052422760395, + "grad_norm": 7.480816841125488, + "learning_rate": 1.5746879793467057e-05, + "loss": 1.4682, + "step": 101620 + }, + { + "epoch": 0.6387680945927366, + "grad_norm": 7.185154914855957, + "learning_rate": 1.5746460692522404e-05, + "loss": 1.5891, + "step": 101630 + }, + { + "epoch": 0.6388309469094338, + "grad_norm": 6.208061695098877, + "learning_rate": 1.574604159157775e-05, + "loss": 1.6934, + "step": 101640 + }, + { + "epoch": 0.6388937992261309, + "grad_norm": 6.467697620391846, + "learning_rate": 1.5745622490633095e-05, + "loss": 1.5176, + "step": 101650 + }, + { + "epoch": 0.638956651542828, + "grad_norm": 6.263512134552002, + "learning_rate": 1.5745203389688442e-05, + "loss": 1.8768, + "step": 101660 + }, + { + "epoch": 0.6390195038595251, + "grad_norm": 5.4175872802734375, + "learning_rate": 1.574478428874379e-05, + "loss": 1.65, + "step": 101670 + }, + { + "epoch": 0.6390823561762222, + "grad_norm": 5.825191020965576, + "learning_rate": 1.5744365187799136e-05, + "loss": 1.7513, + "step": 101680 + }, + { + "epoch": 0.6391452084929193, + "grad_norm": 5.984965801239014, + "learning_rate": 1.574394608685448e-05, + "loss": 1.8157, + "step": 
101690 + }, + { + "epoch": 0.6392080608096165, + "grad_norm": 7.22231388092041, + "learning_rate": 1.5743526985909827e-05, + "loss": 1.6587, + "step": 101700 + }, + { + "epoch": 0.6392709131263136, + "grad_norm": 6.928992748260498, + "learning_rate": 1.5743107884965174e-05, + "loss": 1.7003, + "step": 101710 + }, + { + "epoch": 0.6393337654430106, + "grad_norm": 7.389366626739502, + "learning_rate": 1.574268878402052e-05, + "loss": 1.4202, + "step": 101720 + }, + { + "epoch": 0.6393966177597077, + "grad_norm": 5.17368745803833, + "learning_rate": 1.5742269683075865e-05, + "loss": 1.6994, + "step": 101730 + }, + { + "epoch": 0.6394594700764048, + "grad_norm": 6.973698139190674, + "learning_rate": 1.5741850582131212e-05, + "loss": 1.7086, + "step": 101740 + }, + { + "epoch": 0.6395223223931019, + "grad_norm": 6.596184730529785, + "learning_rate": 1.574143148118656e-05, + "loss": 1.7873, + "step": 101750 + }, + { + "epoch": 0.639585174709799, + "grad_norm": 6.920825481414795, + "learning_rate": 1.5741012380241906e-05, + "loss": 1.6646, + "step": 101760 + }, + { + "epoch": 0.6396480270264961, + "grad_norm": 6.680747985839844, + "learning_rate": 1.5740593279297253e-05, + "loss": 1.67, + "step": 101770 + }, + { + "epoch": 0.6397108793431933, + "grad_norm": 6.86875581741333, + "learning_rate": 1.57401741783526e-05, + "loss": 1.8445, + "step": 101780 + }, + { + "epoch": 0.6397737316598904, + "grad_norm": 7.263949394226074, + "learning_rate": 1.5739755077407944e-05, + "loss": 1.6791, + "step": 101790 + }, + { + "epoch": 0.6398365839765875, + "grad_norm": 6.624402046203613, + "learning_rate": 1.573933597646329e-05, + "loss": 1.7059, + "step": 101800 + }, + { + "epoch": 0.6398994362932846, + "grad_norm": 5.630288124084473, + "learning_rate": 1.5738916875518638e-05, + "loss": 1.4169, + "step": 101810 + }, + { + "epoch": 0.6399622886099817, + "grad_norm": 7.361122131347656, + "learning_rate": 1.5738497774573985e-05, + "loss": 1.6202, + "step": 101820 + }, + { + "epoch": 
0.6400251409266788, + "grad_norm": 6.6272430419921875, + "learning_rate": 1.5738078673629332e-05, + "loss": 1.5626, + "step": 101830 + }, + { + "epoch": 0.640087993243376, + "grad_norm": 5.221563339233398, + "learning_rate": 1.573765957268468e-05, + "loss": 1.4445, + "step": 101840 + }, + { + "epoch": 0.6401508455600731, + "grad_norm": 7.25012731552124, + "learning_rate": 1.5737240471740026e-05, + "loss": 1.494, + "step": 101850 + }, + { + "epoch": 0.6402136978767702, + "grad_norm": 7.599103927612305, + "learning_rate": 1.5736821370795373e-05, + "loss": 1.6157, + "step": 101860 + }, + { + "epoch": 0.6402765501934673, + "grad_norm": 6.354990005493164, + "learning_rate": 1.5736402269850717e-05, + "loss": 1.7489, + "step": 101870 + }, + { + "epoch": 0.6403394025101644, + "grad_norm": 5.892938613891602, + "learning_rate": 1.5735983168906064e-05, + "loss": 1.4894, + "step": 101880 + }, + { + "epoch": 0.6404022548268615, + "grad_norm": 6.195607662200928, + "learning_rate": 1.573556406796141e-05, + "loss": 1.7119, + "step": 101890 + }, + { + "epoch": 0.6404651071435586, + "grad_norm": 6.346561431884766, + "learning_rate": 1.5735144967016758e-05, + "loss": 1.5715, + "step": 101900 + }, + { + "epoch": 0.6405279594602558, + "grad_norm": 7.593139171600342, + "learning_rate": 1.5734725866072102e-05, + "loss": 1.742, + "step": 101910 + }, + { + "epoch": 0.6405908117769529, + "grad_norm": 5.391120433807373, + "learning_rate": 1.573430676512745e-05, + "loss": 1.5101, + "step": 101920 + }, + { + "epoch": 0.64065366409365, + "grad_norm": 8.564781188964844, + "learning_rate": 1.5733887664182796e-05, + "loss": 1.6782, + "step": 101930 + }, + { + "epoch": 0.6407165164103471, + "grad_norm": 6.898707866668701, + "learning_rate": 1.5733468563238143e-05, + "loss": 1.8672, + "step": 101940 + }, + { + "epoch": 0.6407793687270442, + "grad_norm": 6.528625965118408, + "learning_rate": 1.573304946229349e-05, + "loss": 1.7425, + "step": 101950 + }, + { + "epoch": 0.6408422210437413, + 
"grad_norm": null, + "learning_rate": 1.5732630361348834e-05, + "loss": 1.5653, + "step": 101960 + }, + { + "epoch": 0.6409050733604384, + "grad_norm": 7.259417533874512, + "learning_rate": 1.573225317049865e-05, + "loss": 1.67, + "step": 101970 + }, + { + "epoch": 0.6409679256771355, + "grad_norm": 6.333691596984863, + "learning_rate": 1.5731834069553996e-05, + "loss": 1.4398, + "step": 101980 + }, + { + "epoch": 0.6410307779938326, + "grad_norm": 6.996382713317871, + "learning_rate": 1.573141496860934e-05, + "loss": 1.6902, + "step": 101990 + }, + { + "epoch": 0.6410936303105297, + "grad_norm": 6.314414024353027, + "learning_rate": 1.5730995867664686e-05, + "loss": 1.6564, + "step": 102000 + }, + { + "epoch": 0.6411564826272268, + "grad_norm": 6.953320503234863, + "learning_rate": 1.5730576766720033e-05, + "loss": 1.7557, + "step": 102010 + }, + { + "epoch": 0.6412193349439239, + "grad_norm": 5.498722553253174, + "learning_rate": 1.573015766577538e-05, + "loss": 1.5373, + "step": 102020 + }, + { + "epoch": 0.641282187260621, + "grad_norm": 6.309254169464111, + "learning_rate": 1.5729738564830724e-05, + "loss": 1.6494, + "step": 102030 + }, + { + "epoch": 0.6413450395773181, + "grad_norm": 6.082536697387695, + "learning_rate": 1.572931946388607e-05, + "loss": 1.4346, + "step": 102040 + }, + { + "epoch": 0.6414078918940153, + "grad_norm": 6.319396018981934, + "learning_rate": 1.572890036294142e-05, + "loss": 1.5892, + "step": 102050 + }, + { + "epoch": 0.6414707442107124, + "grad_norm": 6.706999778747559, + "learning_rate": 1.5728481261996765e-05, + "loss": 1.7324, + "step": 102060 + }, + { + "epoch": 0.6415335965274095, + "grad_norm": 6.6370649337768555, + "learning_rate": 1.5728062161052113e-05, + "loss": 1.7239, + "step": 102070 + }, + { + "epoch": 0.6415964488441066, + "grad_norm": 6.0964274406433105, + "learning_rate": 1.572764306010746e-05, + "loss": 1.7505, + "step": 102080 + }, + { + "epoch": 0.6416593011608037, + "grad_norm": 7.073576927185059, + 
"learning_rate": 1.5727223959162807e-05, + "loss": 1.6065, + "step": 102090 + }, + { + "epoch": 0.6417221534775008, + "grad_norm": 6.858996391296387, + "learning_rate": 1.5726804858218154e-05, + "loss": 1.6198, + "step": 102100 + }, + { + "epoch": 0.641785005794198, + "grad_norm": 6.487966537475586, + "learning_rate": 1.5726385757273497e-05, + "loss": 1.7107, + "step": 102110 + }, + { + "epoch": 0.6418478581108951, + "grad_norm": 6.950842380523682, + "learning_rate": 1.5725966656328844e-05, + "loss": 1.6784, + "step": 102120 + }, + { + "epoch": 0.6419107104275922, + "grad_norm": 6.250916957855225, + "learning_rate": 1.572554755538419e-05, + "loss": 1.6726, + "step": 102130 + }, + { + "epoch": 0.6419735627442893, + "grad_norm": 6.4210028648376465, + "learning_rate": 1.572512845443954e-05, + "loss": 1.6568, + "step": 102140 + }, + { + "epoch": 0.6420364150609864, + "grad_norm": 6.273014068603516, + "learning_rate": 1.5724709353494886e-05, + "loss": 1.5464, + "step": 102150 + }, + { + "epoch": 0.6420992673776835, + "grad_norm": 6.026818752288818, + "learning_rate": 1.572429025255023e-05, + "loss": 1.5868, + "step": 102160 + }, + { + "epoch": 0.6421621196943806, + "grad_norm": 7.14760160446167, + "learning_rate": 1.5723871151605576e-05, + "loss": 1.7496, + "step": 102170 + }, + { + "epoch": 0.6422249720110778, + "grad_norm": 6.712249755859375, + "learning_rate": 1.5723452050660924e-05, + "loss": 1.7964, + "step": 102180 + }, + { + "epoch": 0.6422878243277749, + "grad_norm": 7.693052291870117, + "learning_rate": 1.572303294971627e-05, + "loss": 1.5236, + "step": 102190 + }, + { + "epoch": 0.642350676644472, + "grad_norm": 6.573236465454102, + "learning_rate": 1.5722613848771618e-05, + "loss": 1.4585, + "step": 102200 + }, + { + "epoch": 0.6424135289611691, + "grad_norm": 6.692652702331543, + "learning_rate": 1.572219474782696e-05, + "loss": 1.5833, + "step": 102210 + }, + { + "epoch": 0.6424763812778662, + "grad_norm": 6.677134037017822, + "learning_rate": 
1.572177564688231e-05, + "loss": 1.6617, + "step": 102220 + }, + { + "epoch": 0.6425392335945632, + "grad_norm": 6.623659610748291, + "learning_rate": 1.5721356545937655e-05, + "loss": 1.9046, + "step": 102230 + }, + { + "epoch": 0.6426020859112603, + "grad_norm": 5.9338812828063965, + "learning_rate": 1.5720937444993003e-05, + "loss": 1.5888, + "step": 102240 + }, + { + "epoch": 0.6426649382279574, + "grad_norm": 6.421466827392578, + "learning_rate": 1.5720518344048346e-05, + "loss": 1.5223, + "step": 102250 + }, + { + "epoch": 0.6427277905446546, + "grad_norm": 6.812142372131348, + "learning_rate": 1.5720099243103693e-05, + "loss": 1.8059, + "step": 102260 + }, + { + "epoch": 0.6427906428613517, + "grad_norm": 6.643022060394287, + "learning_rate": 1.571968014215904e-05, + "loss": 1.5783, + "step": 102270 + }, + { + "epoch": 0.6428534951780488, + "grad_norm": 7.271981716156006, + "learning_rate": 1.5719261041214387e-05, + "loss": 1.5519, + "step": 102280 + }, + { + "epoch": 0.6429163474947459, + "grad_norm": 6.32779598236084, + "learning_rate": 1.5718841940269735e-05, + "loss": 1.602, + "step": 102290 + }, + { + "epoch": 0.642979199811443, + "grad_norm": 6.529623985290527, + "learning_rate": 1.571842283932508e-05, + "loss": 1.6133, + "step": 102300 + }, + { + "epoch": 0.6430420521281401, + "grad_norm": 7.365950584411621, + "learning_rate": 1.571800373838043e-05, + "loss": 1.6673, + "step": 102310 + }, + { + "epoch": 0.6431049044448373, + "grad_norm": 6.111841678619385, + "learning_rate": 1.5717584637435776e-05, + "loss": 1.484, + "step": 102320 + }, + { + "epoch": 0.6431677567615344, + "grad_norm": 7.701827049255371, + "learning_rate": 1.5717165536491123e-05, + "loss": 1.7046, + "step": 102330 + }, + { + "epoch": 0.6432306090782315, + "grad_norm": 6.581363201141357, + "learning_rate": 1.5716746435546466e-05, + "loss": 1.7479, + "step": 102340 + }, + { + "epoch": 0.6432934613949286, + "grad_norm": 6.214954853057861, + "learning_rate": 1.5716327334601814e-05, + 
"loss": 1.5456, + "step": 102350 + }, + { + "epoch": 0.6433563137116257, + "grad_norm": 6.552243709564209, + "learning_rate": 1.571590823365716e-05, + "loss": 1.4382, + "step": 102360 + }, + { + "epoch": 0.6434191660283228, + "grad_norm": 5.998539447784424, + "learning_rate": 1.5715489132712508e-05, + "loss": 1.5903, + "step": 102370 + }, + { + "epoch": 0.64348201834502, + "grad_norm": 8.151195526123047, + "learning_rate": 1.5715070031767855e-05, + "loss": 1.8683, + "step": 102380 + }, + { + "epoch": 0.6435448706617171, + "grad_norm": 6.959873676300049, + "learning_rate": 1.57146509308232e-05, + "loss": 1.6179, + "step": 102390 + }, + { + "epoch": 0.6436077229784142, + "grad_norm": 7.281445026397705, + "learning_rate": 1.5714231829878546e-05, + "loss": 1.7423, + "step": 102400 + }, + { + "epoch": 0.6436705752951113, + "grad_norm": 6.711086750030518, + "learning_rate": 1.5713812728933893e-05, + "loss": 1.5798, + "step": 102410 + }, + { + "epoch": 0.6437334276118084, + "grad_norm": 6.808206081390381, + "learning_rate": 1.571339362798924e-05, + "loss": 1.5961, + "step": 102420 + }, + { + "epoch": 0.6437962799285055, + "grad_norm": 6.403003692626953, + "learning_rate": 1.5712974527044583e-05, + "loss": 1.7072, + "step": 102430 + }, + { + "epoch": 0.6438591322452026, + "grad_norm": 6.481310844421387, + "learning_rate": 1.571255542609993e-05, + "loss": 1.758, + "step": 102440 + }, + { + "epoch": 0.6439219845618998, + "grad_norm": 7.031172275543213, + "learning_rate": 1.5712136325155277e-05, + "loss": 1.7948, + "step": 102450 + }, + { + "epoch": 0.6439848368785969, + "grad_norm": 6.796384334564209, + "learning_rate": 1.5711717224210625e-05, + "loss": 1.5936, + "step": 102460 + }, + { + "epoch": 0.644047689195294, + "grad_norm": 7.230562686920166, + "learning_rate": 1.571129812326597e-05, + "loss": 1.6324, + "step": 102470 + }, + { + "epoch": 0.6441105415119911, + "grad_norm": 7.450069427490234, + "learning_rate": 1.571087902232132e-05, + "loss": 1.5685, + "step": 102480 + 
}, + { + "epoch": 0.6441733938286881, + "grad_norm": 5.21614933013916, + "learning_rate": 1.5710459921376662e-05, + "loss": 1.654, + "step": 102490 + }, + { + "epoch": 0.6442362461453852, + "grad_norm": 6.1010332107543945, + "learning_rate": 1.571004082043201e-05, + "loss": 1.4736, + "step": 102500 + }, + { + "epoch": 0.6442990984620823, + "grad_norm": 6.872131824493408, + "learning_rate": 1.5709621719487357e-05, + "loss": 1.9806, + "step": 102510 + }, + { + "epoch": 0.6443619507787794, + "grad_norm": 6.7846760749816895, + "learning_rate": 1.5709202618542704e-05, + "loss": 1.8059, + "step": 102520 + }, + { + "epoch": 0.6444248030954766, + "grad_norm": 6.188438415527344, + "learning_rate": 1.570878351759805e-05, + "loss": 1.5085, + "step": 102530 + }, + { + "epoch": 0.6444876554121737, + "grad_norm": 6.045825004577637, + "learning_rate": 1.5708364416653398e-05, + "loss": 1.613, + "step": 102540 + }, + { + "epoch": 0.6445505077288708, + "grad_norm": 7.195934295654297, + "learning_rate": 1.5707945315708745e-05, + "loss": 1.7093, + "step": 102550 + }, + { + "epoch": 0.6446133600455679, + "grad_norm": 7.038455009460449, + "learning_rate": 1.570752621476409e-05, + "loss": 1.7824, + "step": 102560 + }, + { + "epoch": 0.644676212362265, + "grad_norm": 7.097109317779541, + "learning_rate": 1.5707107113819436e-05, + "loss": 1.571, + "step": 102570 + }, + { + "epoch": 0.6447390646789621, + "grad_norm": 6.5821757316589355, + "learning_rate": 1.5706688012874783e-05, + "loss": 1.6042, + "step": 102580 + }, + { + "epoch": 0.6448019169956593, + "grad_norm": 6.128716945648193, + "learning_rate": 1.570626891193013e-05, + "loss": 1.5795, + "step": 102590 + }, + { + "epoch": 0.6448647693123564, + "grad_norm": 7.007327556610107, + "learning_rate": 1.5705849810985477e-05, + "loss": 1.6193, + "step": 102600 + }, + { + "epoch": 0.6449276216290535, + "grad_norm": 7.820053577423096, + "learning_rate": 1.570543071004082e-05, + "loss": 1.6681, + "step": 102610 + }, + { + "epoch": 
0.6449904739457506, + "grad_norm": 5.884525299072266, + "learning_rate": 1.5705011609096168e-05, + "loss": 1.4219, + "step": 102620 + }, + { + "epoch": 0.6450533262624477, + "grad_norm": 6.69332218170166, + "learning_rate": 1.5704592508151515e-05, + "loss": 1.6222, + "step": 102630 + }, + { + "epoch": 0.6451161785791448, + "grad_norm": 5.9894938468933105, + "learning_rate": 1.570417340720686e-05, + "loss": 1.5548, + "step": 102640 + }, + { + "epoch": 0.6451790308958419, + "grad_norm": 7.076059341430664, + "learning_rate": 1.5703754306262205e-05, + "loss": 1.6576, + "step": 102650 + }, + { + "epoch": 0.6452418832125391, + "grad_norm": 7.720175743103027, + "learning_rate": 1.5703335205317552e-05, + "loss": 1.6047, + "step": 102660 + }, + { + "epoch": 0.6453047355292362, + "grad_norm": 5.937623500823975, + "learning_rate": 1.57029161043729e-05, + "loss": 1.5105, + "step": 102670 + }, + { + "epoch": 0.6453675878459333, + "grad_norm": 7.381980895996094, + "learning_rate": 1.5702497003428247e-05, + "loss": 1.4033, + "step": 102680 + }, + { + "epoch": 0.6454304401626304, + "grad_norm": 6.697049140930176, + "learning_rate": 1.5702077902483594e-05, + "loss": 1.7542, + "step": 102690 + }, + { + "epoch": 0.6454932924793275, + "grad_norm": 6.8227972984313965, + "learning_rate": 1.570165880153894e-05, + "loss": 1.6011, + "step": 102700 + }, + { + "epoch": 0.6455561447960246, + "grad_norm": 7.084945201873779, + "learning_rate": 1.5701239700594288e-05, + "loss": 1.7403, + "step": 102710 + }, + { + "epoch": 0.6456189971127217, + "grad_norm": 6.482322692871094, + "learning_rate": 1.5700820599649635e-05, + "loss": 1.657, + "step": 102720 + }, + { + "epoch": 0.6456818494294189, + "grad_norm": 6.782051086425781, + "learning_rate": 1.570040149870498e-05, + "loss": 1.6043, + "step": 102730 + }, + { + "epoch": 0.6457447017461159, + "grad_norm": 7.116504192352295, + "learning_rate": 1.5699982397760326e-05, + "loss": 1.6849, + "step": 102740 + }, + { + "epoch": 0.645807554062813, + 
"grad_norm": 5.7617506980896, + "learning_rate": 1.5699563296815673e-05, + "loss": 1.5332, + "step": 102750 + }, + { + "epoch": 0.6458704063795101, + "grad_norm": 6.336673259735107, + "learning_rate": 1.569914419587102e-05, + "loss": 1.7876, + "step": 102760 + }, + { + "epoch": 0.6459332586962072, + "grad_norm": 7.29729700088501, + "learning_rate": 1.5698725094926367e-05, + "loss": 1.7959, + "step": 102770 + }, + { + "epoch": 0.6459961110129043, + "grad_norm": 6.527895450592041, + "learning_rate": 1.569830599398171e-05, + "loss": 1.8514, + "step": 102780 + }, + { + "epoch": 0.6460589633296014, + "grad_norm": 6.2483744621276855, + "learning_rate": 1.5697886893037058e-05, + "loss": 1.7979, + "step": 102790 + }, + { + "epoch": 0.6461218156462986, + "grad_norm": 7.259784698486328, + "learning_rate": 1.5697467792092405e-05, + "loss": 1.6971, + "step": 102800 + }, + { + "epoch": 0.6461846679629957, + "grad_norm": 6.024012088775635, + "learning_rate": 1.5697048691147752e-05, + "loss": 1.6908, + "step": 102810 + }, + { + "epoch": 0.6462475202796928, + "grad_norm": 6.680114269256592, + "learning_rate": 1.56966295902031e-05, + "loss": 1.7216, + "step": 102820 + }, + { + "epoch": 0.6463103725963899, + "grad_norm": 7.588250160217285, + "learning_rate": 1.5696210489258442e-05, + "loss": 1.5704, + "step": 102830 + }, + { + "epoch": 0.646373224913087, + "grad_norm": 5.725822925567627, + "learning_rate": 1.569579138831379e-05, + "loss": 1.8783, + "step": 102840 + }, + { + "epoch": 0.6464360772297841, + "grad_norm": 6.266225337982178, + "learning_rate": 1.5695372287369137e-05, + "loss": 1.33, + "step": 102850 + }, + { + "epoch": 0.6464989295464812, + "grad_norm": 6.439865589141846, + "learning_rate": 1.5694953186424484e-05, + "loss": 1.7816, + "step": 102860 + }, + { + "epoch": 0.6465617818631784, + "grad_norm": 8.650877952575684, + "learning_rate": 1.5694534085479827e-05, + "loss": 1.7552, + "step": 102870 + }, + { + "epoch": 0.6466246341798755, + "grad_norm": 6.840614318847656, + 
"learning_rate": 1.5694114984535174e-05, + "loss": 1.6594, + "step": 102880 + }, + { + "epoch": 0.6466874864965726, + "grad_norm": 6.866539478302002, + "learning_rate": 1.569369588359052e-05, + "loss": 1.6927, + "step": 102890 + }, + { + "epoch": 0.6467503388132697, + "grad_norm": 6.2211785316467285, + "learning_rate": 1.569327678264587e-05, + "loss": 1.6892, + "step": 102900 + }, + { + "epoch": 0.6468131911299668, + "grad_norm": 6.687888145446777, + "learning_rate": 1.5692857681701216e-05, + "loss": 1.6261, + "step": 102910 + }, + { + "epoch": 0.6468760434466639, + "grad_norm": 6.953901290893555, + "learning_rate": 1.5692438580756563e-05, + "loss": 1.6963, + "step": 102920 + }, + { + "epoch": 0.646938895763361, + "grad_norm": 7.242424488067627, + "learning_rate": 1.569201947981191e-05, + "loss": 1.7554, + "step": 102930 + }, + { + "epoch": 0.6470017480800582, + "grad_norm": 7.113307476043701, + "learning_rate": 1.5691600378867257e-05, + "loss": 1.8331, + "step": 102940 + }, + { + "epoch": 0.6470646003967553, + "grad_norm": 6.461259365081787, + "learning_rate": 1.5691181277922604e-05, + "loss": 1.6479, + "step": 102950 + }, + { + "epoch": 0.6471274527134524, + "grad_norm": 7.612675666809082, + "learning_rate": 1.5690762176977948e-05, + "loss": 1.7094, + "step": 102960 + }, + { + "epoch": 0.6471903050301495, + "grad_norm": 7.381922721862793, + "learning_rate": 1.5690343076033295e-05, + "loss": 1.5518, + "step": 102970 + }, + { + "epoch": 0.6472531573468466, + "grad_norm": 5.682344436645508, + "learning_rate": 1.5689923975088642e-05, + "loss": 1.4742, + "step": 102980 + }, + { + "epoch": 0.6473160096635437, + "grad_norm": 6.8673272132873535, + "learning_rate": 1.568950487414399e-05, + "loss": 1.5129, + "step": 102990 + }, + { + "epoch": 0.6473788619802407, + "grad_norm": 6.724424839019775, + "learning_rate": 1.5689085773199336e-05, + "loss": 1.5424, + "step": 103000 + }, + { + "epoch": 0.6474417142969379, + "grad_norm": 6.3348212242126465, + "learning_rate": 
1.568866667225468e-05, + "loss": 1.6309, + "step": 103010 + }, + { + "epoch": 0.647504566613635, + "grad_norm": 6.291509628295898, + "learning_rate": 1.5688247571310027e-05, + "loss": 1.5779, + "step": 103020 + }, + { + "epoch": 0.6475674189303321, + "grad_norm": 6.038863182067871, + "learning_rate": 1.5687828470365374e-05, + "loss": 1.7678, + "step": 103030 + }, + { + "epoch": 0.6476302712470292, + "grad_norm": 8.962211608886719, + "learning_rate": 1.568740936942072e-05, + "loss": 1.6524, + "step": 103040 + }, + { + "epoch": 0.6476931235637263, + "grad_norm": 6.943170070648193, + "learning_rate": 1.5686990268476064e-05, + "loss": 1.5671, + "step": 103050 + }, + { + "epoch": 0.6477559758804234, + "grad_norm": 6.481675148010254, + "learning_rate": 1.568657116753141e-05, + "loss": 1.551, + "step": 103060 + }, + { + "epoch": 0.6478188281971206, + "grad_norm": 6.505732536315918, + "learning_rate": 1.568615206658676e-05, + "loss": 1.763, + "step": 103070 + }, + { + "epoch": 0.6478816805138177, + "grad_norm": 7.271378040313721, + "learning_rate": 1.5685732965642106e-05, + "loss": 1.7787, + "step": 103080 + }, + { + "epoch": 0.6479445328305148, + "grad_norm": 8.255097389221191, + "learning_rate": 1.5685313864697453e-05, + "loss": 1.6224, + "step": 103090 + }, + { + "epoch": 0.6480073851472119, + "grad_norm": 5.565671443939209, + "learning_rate": 1.56848947637528e-05, + "loss": 1.5235, + "step": 103100 + }, + { + "epoch": 0.648070237463909, + "grad_norm": 6.7757134437561035, + "learning_rate": 1.5684475662808143e-05, + "loss": 1.5838, + "step": 103110 + }, + { + "epoch": 0.6481330897806061, + "grad_norm": 7.188182353973389, + "learning_rate": 1.568405656186349e-05, + "loss": 1.7577, + "step": 103120 + }, + { + "epoch": 0.6481959420973032, + "grad_norm": 6.870747089385986, + "learning_rate": 1.5683637460918838e-05, + "loss": 1.7028, + "step": 103130 + }, + { + "epoch": 0.6482587944140004, + "grad_norm": 5.249590873718262, + "learning_rate": 1.5683218359974185e-05, + "loss": 
1.7051, + "step": 103140 + }, + { + "epoch": 0.6483216467306975, + "grad_norm": 5.709137916564941, + "learning_rate": 1.5682799259029532e-05, + "loss": 1.6306, + "step": 103150 + }, + { + "epoch": 0.6483844990473946, + "grad_norm": 6.573286056518555, + "learning_rate": 1.568238015808488e-05, + "loss": 1.6941, + "step": 103160 + }, + { + "epoch": 0.6484473513640917, + "grad_norm": 6.8041558265686035, + "learning_rate": 1.5681961057140226e-05, + "loss": 1.662, + "step": 103170 + }, + { + "epoch": 0.6485102036807888, + "grad_norm": 6.927123546600342, + "learning_rate": 1.568154195619557e-05, + "loss": 1.8207, + "step": 103180 + }, + { + "epoch": 0.6485730559974859, + "grad_norm": 5.384753704071045, + "learning_rate": 1.5681122855250917e-05, + "loss": 1.6552, + "step": 103190 + }, + { + "epoch": 0.648635908314183, + "grad_norm": 7.417147159576416, + "learning_rate": 1.5680703754306264e-05, + "loss": 1.649, + "step": 103200 + }, + { + "epoch": 0.6486987606308802, + "grad_norm": 7.085951805114746, + "learning_rate": 1.568028465336161e-05, + "loss": 1.4693, + "step": 103210 + }, + { + "epoch": 0.6487616129475773, + "grad_norm": 7.058738708496094, + "learning_rate": 1.5679865552416958e-05, + "loss": 1.7497, + "step": 103220 + }, + { + "epoch": 0.6488244652642744, + "grad_norm": 6.559217929840088, + "learning_rate": 1.56794464514723e-05, + "loss": 1.6933, + "step": 103230 + }, + { + "epoch": 0.6488873175809715, + "grad_norm": 7.10174036026001, + "learning_rate": 1.567902735052765e-05, + "loss": 1.744, + "step": 103240 + }, + { + "epoch": 0.6489501698976686, + "grad_norm": 8.040226936340332, + "learning_rate": 1.5678608249582996e-05, + "loss": 1.7605, + "step": 103250 + }, + { + "epoch": 0.6490130222143656, + "grad_norm": 7.494002342224121, + "learning_rate": 1.5678189148638343e-05, + "loss": 1.4333, + "step": 103260 + }, + { + "epoch": 0.6490758745310627, + "grad_norm": 8.5501708984375, + "learning_rate": 1.5677770047693686e-05, + "loss": 1.5655, + "step": 103270 + }, + { + 
"epoch": 0.6491387268477599, + "grad_norm": 6.449906826019287, + "learning_rate": 1.5677350946749034e-05, + "loss": 1.6743, + "step": 103280 + }, + { + "epoch": 0.649201579164457, + "grad_norm": 6.372710227966309, + "learning_rate": 1.567693184580438e-05, + "loss": 1.8011, + "step": 103290 + }, + { + "epoch": 0.6492644314811541, + "grad_norm": 5.954245090484619, + "learning_rate": 1.5676512744859728e-05, + "loss": 1.595, + "step": 103300 + }, + { + "epoch": 0.6493272837978512, + "grad_norm": 7.211219787597656, + "learning_rate": 1.5676093643915075e-05, + "loss": 1.7222, + "step": 103310 + }, + { + "epoch": 0.6493901361145483, + "grad_norm": 6.185349464416504, + "learning_rate": 1.5675674542970422e-05, + "loss": 1.4639, + "step": 103320 + }, + { + "epoch": 0.6494529884312454, + "grad_norm": 7.957022666931152, + "learning_rate": 1.567525544202577e-05, + "loss": 1.4498, + "step": 103330 + }, + { + "epoch": 0.6495158407479426, + "grad_norm": 7.452658176422119, + "learning_rate": 1.5674836341081116e-05, + "loss": 1.7506, + "step": 103340 + }, + { + "epoch": 0.6495786930646397, + "grad_norm": 7.014586448669434, + "learning_rate": 1.5674417240136463e-05, + "loss": 1.6089, + "step": 103350 + }, + { + "epoch": 0.6496415453813368, + "grad_norm": 5.949844837188721, + "learning_rate": 1.5673998139191807e-05, + "loss": 1.6528, + "step": 103360 + }, + { + "epoch": 0.6497043976980339, + "grad_norm": 7.606826305389404, + "learning_rate": 1.5673579038247154e-05, + "loss": 1.6338, + "step": 103370 + }, + { + "epoch": 0.649767250014731, + "grad_norm": 6.81550407409668, + "learning_rate": 1.56731599373025e-05, + "loss": 1.4132, + "step": 103380 + }, + { + "epoch": 0.6498301023314281, + "grad_norm": 5.845203876495361, + "learning_rate": 1.5672740836357848e-05, + "loss": 1.665, + "step": 103390 + }, + { + "epoch": 0.6498929546481252, + "grad_norm": 7.393462181091309, + "learning_rate": 1.567232173541319e-05, + "loss": 1.5338, + "step": 103400 + }, + { + "epoch": 0.6499558069648224, + 
"grad_norm": 6.444361686706543, + "learning_rate": 1.567190263446854e-05, + "loss": 1.6614, + "step": 103410 + }, + { + "epoch": 0.6500186592815195, + "grad_norm": 6.161467552185059, + "learning_rate": 1.5671483533523886e-05, + "loss": 1.5387, + "step": 103420 + }, + { + "epoch": 0.6500815115982166, + "grad_norm": 5.715375900268555, + "learning_rate": 1.5671064432579233e-05, + "loss": 1.708, + "step": 103430 + }, + { + "epoch": 0.6501443639149137, + "grad_norm": 6.9999284744262695, + "learning_rate": 1.567064533163458e-05, + "loss": 1.5844, + "step": 103440 + }, + { + "epoch": 0.6502072162316108, + "grad_norm": 6.0578765869140625, + "learning_rate": 1.5670226230689924e-05, + "loss": 1.5845, + "step": 103450 + }, + { + "epoch": 0.6502700685483079, + "grad_norm": 7.019611358642578, + "learning_rate": 1.566980712974527e-05, + "loss": 1.6105, + "step": 103460 + }, + { + "epoch": 0.650332920865005, + "grad_norm": 7.245494842529297, + "learning_rate": 1.5669388028800618e-05, + "loss": 1.739, + "step": 103470 + }, + { + "epoch": 0.6503957731817022, + "grad_norm": 7.221858501434326, + "learning_rate": 1.5668968927855965e-05, + "loss": 1.669, + "step": 103480 + }, + { + "epoch": 0.6504586254983993, + "grad_norm": 6.329747200012207, + "learning_rate": 1.566854982691131e-05, + "loss": 1.5785, + "step": 103490 + }, + { + "epoch": 0.6505214778150964, + "grad_norm": 6.927884578704834, + "learning_rate": 1.5668130725966656e-05, + "loss": 1.5369, + "step": 103500 + }, + { + "epoch": 0.6505843301317934, + "grad_norm": 6.898622035980225, + "learning_rate": 1.5667711625022003e-05, + "loss": 1.4242, + "step": 103510 + }, + { + "epoch": 0.6506471824484905, + "grad_norm": 6.615671157836914, + "learning_rate": 1.566729252407735e-05, + "loss": 1.6846, + "step": 103520 + }, + { + "epoch": 0.6507100347651876, + "grad_norm": 6.230980396270752, + "learning_rate": 1.5666873423132697e-05, + "loss": 1.6307, + "step": 103530 + }, + { + "epoch": 0.6507728870818847, + "grad_norm": 
7.339495658874512, + "learning_rate": 1.5666454322188044e-05, + "loss": 1.5946, + "step": 103540 + }, + { + "epoch": 0.6508357393985819, + "grad_norm": 6.651918411254883, + "learning_rate": 1.566603522124339e-05, + "loss": 1.6696, + "step": 103550 + }, + { + "epoch": 0.650898591715279, + "grad_norm": 7.0048980712890625, + "learning_rate": 1.5665616120298738e-05, + "loss": 1.6126, + "step": 103560 + }, + { + "epoch": 0.6509614440319761, + "grad_norm": 6.759527683258057, + "learning_rate": 1.5665197019354085e-05, + "loss": 1.7289, + "step": 103570 + }, + { + "epoch": 0.6510242963486732, + "grad_norm": 7.07916259765625, + "learning_rate": 1.566477791840943e-05, + "loss": 1.7425, + "step": 103580 + }, + { + "epoch": 0.6510871486653703, + "grad_norm": 5.696152210235596, + "learning_rate": 1.5664358817464776e-05, + "loss": 1.4232, + "step": 103590 + }, + { + "epoch": 0.6511500009820674, + "grad_norm": 6.269619941711426, + "learning_rate": 1.5663939716520123e-05, + "loss": 1.7295, + "step": 103600 + }, + { + "epoch": 0.6512128532987645, + "grad_norm": 6.450016498565674, + "learning_rate": 1.566352061557547e-05, + "loss": 1.6174, + "step": 103610 + }, + { + "epoch": 0.6512757056154617, + "grad_norm": 6.65458345413208, + "learning_rate": 1.5663101514630817e-05, + "loss": 1.9206, + "step": 103620 + }, + { + "epoch": 0.6513385579321588, + "grad_norm": 5.862672328948975, + "learning_rate": 1.566268241368616e-05, + "loss": 1.5831, + "step": 103630 + }, + { + "epoch": 0.6514014102488559, + "grad_norm": 5.754400253295898, + "learning_rate": 1.5662263312741508e-05, + "loss": 1.6507, + "step": 103640 + }, + { + "epoch": 0.651464262565553, + "grad_norm": 7.132193565368652, + "learning_rate": 1.5661844211796855e-05, + "loss": 1.8128, + "step": 103650 + }, + { + "epoch": 0.6515271148822501, + "grad_norm": 7.32619571685791, + "learning_rate": 1.5661425110852202e-05, + "loss": 1.5302, + "step": 103660 + }, + { + "epoch": 0.6515899671989472, + "grad_norm": 5.260908126831055, + 
"learning_rate": 1.5661006009907546e-05, + "loss": 1.8892, + "step": 103670 + }, + { + "epoch": 0.6516528195156444, + "grad_norm": 6.902319431304932, + "learning_rate": 1.5660586908962893e-05, + "loss": 1.7225, + "step": 103680 + }, + { + "epoch": 0.6517156718323415, + "grad_norm": 6.186056137084961, + "learning_rate": 1.566016780801824e-05, + "loss": 1.8432, + "step": 103690 + }, + { + "epoch": 0.6517785241490386, + "grad_norm": 7.764537811279297, + "learning_rate": 1.5659748707073587e-05, + "loss": 1.7789, + "step": 103700 + }, + { + "epoch": 0.6518413764657357, + "grad_norm": 6.304036617279053, + "learning_rate": 1.5659329606128934e-05, + "loss": 1.7729, + "step": 103710 + }, + { + "epoch": 0.6519042287824328, + "grad_norm": 6.370176315307617, + "learning_rate": 1.565891050518428e-05, + "loss": 1.5822, + "step": 103720 + }, + { + "epoch": 0.6519670810991299, + "grad_norm": 6.075143337249756, + "learning_rate": 1.5658491404239628e-05, + "loss": 1.4602, + "step": 103730 + }, + { + "epoch": 0.652029933415827, + "grad_norm": 7.602111339569092, + "learning_rate": 1.565807230329497e-05, + "loss": 1.86, + "step": 103740 + }, + { + "epoch": 0.6520927857325242, + "grad_norm": 5.180424690246582, + "learning_rate": 1.565765320235032e-05, + "loss": 1.3146, + "step": 103750 + }, + { + "epoch": 0.6521556380492213, + "grad_norm": 7.499391555786133, + "learning_rate": 1.5657234101405666e-05, + "loss": 1.6131, + "step": 103760 + }, + { + "epoch": 0.6522184903659183, + "grad_norm": 6.732677459716797, + "learning_rate": 1.5656815000461013e-05, + "loss": 1.4125, + "step": 103770 + }, + { + "epoch": 0.6522813426826154, + "grad_norm": 5.746530055999756, + "learning_rate": 1.565639589951636e-05, + "loss": 1.7176, + "step": 103780 + }, + { + "epoch": 0.6523441949993125, + "grad_norm": 5.902767658233643, + "learning_rate": 1.5655976798571707e-05, + "loss": 1.64, + "step": 103790 + }, + { + "epoch": 0.6524070473160096, + "grad_norm": 5.808832168579102, + "learning_rate": 
1.565555769762705e-05, + "loss": 1.5475, + "step": 103800 + }, + { + "epoch": 0.6524698996327067, + "grad_norm": 6.253108978271484, + "learning_rate": 1.5655138596682398e-05, + "loss": 1.6154, + "step": 103810 + }, + { + "epoch": 0.6525327519494039, + "grad_norm": 6.812736511230469, + "learning_rate": 1.5654719495737745e-05, + "loss": 1.6477, + "step": 103820 + }, + { + "epoch": 0.652595604266101, + "grad_norm": 7.912046909332275, + "learning_rate": 1.5654300394793092e-05, + "loss": 1.7127, + "step": 103830 + }, + { + "epoch": 0.6526584565827981, + "grad_norm": 6.249855995178223, + "learning_rate": 1.565388129384844e-05, + "loss": 1.6528, + "step": 103840 + }, + { + "epoch": 0.6527213088994952, + "grad_norm": 7.224602699279785, + "learning_rate": 1.5653462192903783e-05, + "loss": 1.5417, + "step": 103850 + }, + { + "epoch": 0.6527841612161923, + "grad_norm": 6.1871161460876465, + "learning_rate": 1.565304309195913e-05, + "loss": 1.5821, + "step": 103860 + }, + { + "epoch": 0.6528470135328894, + "grad_norm": 5.854806900024414, + "learning_rate": 1.5652623991014477e-05, + "loss": 1.5883, + "step": 103870 + }, + { + "epoch": 0.6529098658495865, + "grad_norm": 6.067279815673828, + "learning_rate": 1.5652204890069824e-05, + "loss": 1.7265, + "step": 103880 + }, + { + "epoch": 0.6529727181662837, + "grad_norm": 6.640414714813232, + "learning_rate": 1.5651785789125168e-05, + "loss": 1.7412, + "step": 103890 + }, + { + "epoch": 0.6530355704829808, + "grad_norm": 6.987613201141357, + "learning_rate": 1.5651366688180515e-05, + "loss": 1.608, + "step": 103900 + }, + { + "epoch": 0.6530984227996779, + "grad_norm": 8.038615226745605, + "learning_rate": 1.5650947587235862e-05, + "loss": 1.5996, + "step": 103910 + }, + { + "epoch": 0.653161275116375, + "grad_norm": 6.497643947601318, + "learning_rate": 1.565052848629121e-05, + "loss": 1.5211, + "step": 103920 + }, + { + "epoch": 0.6532241274330721, + "grad_norm": 7.015405654907227, + "learning_rate": 1.5650109385346556e-05, + 
"loss": 1.6312, + "step": 103930 + }, + { + "epoch": 0.6532869797497692, + "grad_norm": 6.187511920928955, + "learning_rate": 1.5649690284401903e-05, + "loss": 1.5689, + "step": 103940 + }, + { + "epoch": 0.6533498320664664, + "grad_norm": 7.025246620178223, + "learning_rate": 1.564927118345725e-05, + "loss": 1.5311, + "step": 103950 + }, + { + "epoch": 0.6534126843831635, + "grad_norm": 6.445618629455566, + "learning_rate": 1.5648852082512597e-05, + "loss": 1.8203, + "step": 103960 + }, + { + "epoch": 0.6534755366998606, + "grad_norm": 6.659660816192627, + "learning_rate": 1.5648432981567944e-05, + "loss": 1.6104, + "step": 103970 + }, + { + "epoch": 0.6535383890165577, + "grad_norm": 6.287054061889648, + "learning_rate": 1.5648013880623288e-05, + "loss": 1.4779, + "step": 103980 + }, + { + "epoch": 0.6536012413332548, + "grad_norm": 6.669844150543213, + "learning_rate": 1.5647594779678635e-05, + "loss": 1.5118, + "step": 103990 + }, + { + "epoch": 0.6536640936499519, + "grad_norm": 7.070347309112549, + "learning_rate": 1.5647175678733982e-05, + "loss": 1.7626, + "step": 104000 + }, + { + "epoch": 0.653726945966649, + "grad_norm": 6.286337852478027, + "learning_rate": 1.564675657778933e-05, + "loss": 1.5305, + "step": 104010 + }, + { + "epoch": 0.653789798283346, + "grad_norm": 6.582416534423828, + "learning_rate": 1.5646337476844676e-05, + "loss": 1.6839, + "step": 104020 + }, + { + "epoch": 0.6538526506000432, + "grad_norm": 7.97673225402832, + "learning_rate": 1.564591837590002e-05, + "loss": 1.8633, + "step": 104030 + }, + { + "epoch": 0.6539155029167403, + "grad_norm": 10.233292579650879, + "learning_rate": 1.5645499274955367e-05, + "loss": 1.6929, + "step": 104040 + }, + { + "epoch": 0.6539783552334374, + "grad_norm": 6.176262378692627, + "learning_rate": 1.5645080174010714e-05, + "loss": 1.3382, + "step": 104050 + }, + { + "epoch": 0.6540412075501345, + "grad_norm": 6.435325622558594, + "learning_rate": 1.564466107306606e-05, + "loss": 1.8239, + "step": 
104060 + }, + { + "epoch": 0.6541040598668316, + "grad_norm": 6.884663105010986, + "learning_rate": 1.5644241972121405e-05, + "loss": 1.6751, + "step": 104070 + }, + { + "epoch": 0.6541669121835287, + "grad_norm": 6.993259429931641, + "learning_rate": 1.5643822871176752e-05, + "loss": 1.7147, + "step": 104080 + }, + { + "epoch": 0.6542297645002259, + "grad_norm": 6.018107891082764, + "learning_rate": 1.56434037702321e-05, + "loss": 1.751, + "step": 104090 + }, + { + "epoch": 0.654292616816923, + "grad_norm": 5.587553024291992, + "learning_rate": 1.5642984669287446e-05, + "loss": 1.6907, + "step": 104100 + }, + { + "epoch": 0.6543554691336201, + "grad_norm": 6.716568470001221, + "learning_rate": 1.5642565568342793e-05, + "loss": 1.7155, + "step": 104110 + }, + { + "epoch": 0.6544183214503172, + "grad_norm": 6.479062080383301, + "learning_rate": 1.5642146467398137e-05, + "loss": 1.7387, + "step": 104120 + }, + { + "epoch": 0.6544811737670143, + "grad_norm": 7.63808536529541, + "learning_rate": 1.5641727366453484e-05, + "loss": 1.624, + "step": 104130 + }, + { + "epoch": 0.6545440260837114, + "grad_norm": 8.376218795776367, + "learning_rate": 1.564130826550883e-05, + "loss": 1.9181, + "step": 104140 + }, + { + "epoch": 0.6546068784004085, + "grad_norm": 5.5728864669799805, + "learning_rate": 1.5640889164564178e-05, + "loss": 1.6911, + "step": 104150 + }, + { + "epoch": 0.6546697307171057, + "grad_norm": 6.79200553894043, + "learning_rate": 1.5640470063619525e-05, + "loss": 1.596, + "step": 104160 + }, + { + "epoch": 0.6547325830338028, + "grad_norm": 6.636775970458984, + "learning_rate": 1.5640050962674872e-05, + "loss": 1.5336, + "step": 104170 + }, + { + "epoch": 0.6547954353504999, + "grad_norm": 7.076252460479736, + "learning_rate": 1.563963186173022e-05, + "loss": 1.7462, + "step": 104180 + }, + { + "epoch": 0.654858287667197, + "grad_norm": 6.61713981628418, + "learning_rate": 1.5639212760785566e-05, + "loss": 1.5213, + "step": 104190 + }, + { + "epoch": 
0.6549211399838941, + "grad_norm": 6.114788055419922, + "learning_rate": 1.563879365984091e-05, + "loss": 1.5887, + "step": 104200 + }, + { + "epoch": 0.6549839923005912, + "grad_norm": 5.9853596687316895, + "learning_rate": 1.5638374558896257e-05, + "loss": 1.6344, + "step": 104210 + }, + { + "epoch": 0.6550468446172883, + "grad_norm": 5.697851657867432, + "learning_rate": 1.5637955457951604e-05, + "loss": 1.7218, + "step": 104220 + }, + { + "epoch": 0.6551096969339855, + "grad_norm": 5.63599967956543, + "learning_rate": 1.563753635700695e-05, + "loss": 1.7532, + "step": 104230 + }, + { + "epoch": 0.6551725492506826, + "grad_norm": 6.949346542358398, + "learning_rate": 1.5637117256062298e-05, + "loss": 1.5974, + "step": 104240 + }, + { + "epoch": 0.6552354015673797, + "grad_norm": 6.5578694343566895, + "learning_rate": 1.5636698155117642e-05, + "loss": 1.6962, + "step": 104250 + }, + { + "epoch": 0.6552982538840768, + "grad_norm": 6.383918285369873, + "learning_rate": 1.563627905417299e-05, + "loss": 1.5176, + "step": 104260 + }, + { + "epoch": 0.6553611062007739, + "grad_norm": 5.7543487548828125, + "learning_rate": 1.5635859953228336e-05, + "loss": 1.5252, + "step": 104270 + }, + { + "epoch": 0.6554239585174709, + "grad_norm": 6.0621771812438965, + "learning_rate": 1.5635440852283683e-05, + "loss": 1.6052, + "step": 104280 + }, + { + "epoch": 0.655486810834168, + "grad_norm": 6.2976393699646, + "learning_rate": 1.5635021751339027e-05, + "loss": 1.6621, + "step": 104290 + }, + { + "epoch": 0.6555496631508652, + "grad_norm": 7.7909088134765625, + "learning_rate": 1.5634602650394374e-05, + "loss": 1.8192, + "step": 104300 + }, + { + "epoch": 0.6556125154675623, + "grad_norm": 6.488101482391357, + "learning_rate": 1.563418354944972e-05, + "loss": 1.4934, + "step": 104310 + }, + { + "epoch": 0.6556753677842594, + "grad_norm": 6.674446105957031, + "learning_rate": 1.5633764448505068e-05, + "loss": 1.546, + "step": 104320 + }, + { + "epoch": 0.6557382201009565, + 
"grad_norm": 6.641434192657471, + "learning_rate": 1.5633345347560415e-05, + "loss": 1.6793, + "step": 104330 + }, + { + "epoch": 0.6558010724176536, + "grad_norm": 6.575596332550049, + "learning_rate": 1.5632926246615762e-05, + "loss": 1.7416, + "step": 104340 + }, + { + "epoch": 0.6558639247343507, + "grad_norm": 5.703613758087158, + "learning_rate": 1.563250714567111e-05, + "loss": 1.5926, + "step": 104350 + }, + { + "epoch": 0.6559267770510478, + "grad_norm": 7.271982669830322, + "learning_rate": 1.5632088044726453e-05, + "loss": 1.6012, + "step": 104360 + }, + { + "epoch": 0.655989629367745, + "grad_norm": 7.279525279998779, + "learning_rate": 1.56316689437818e-05, + "loss": 1.8358, + "step": 104370 + }, + { + "epoch": 0.6560524816844421, + "grad_norm": 7.8478779792785645, + "learning_rate": 1.5631249842837147e-05, + "loss": 1.5804, + "step": 104380 + }, + { + "epoch": 0.6561153340011392, + "grad_norm": 7.450168609619141, + "learning_rate": 1.5630830741892494e-05, + "loss": 1.6337, + "step": 104390 + }, + { + "epoch": 0.6561781863178363, + "grad_norm": 6.73850679397583, + "learning_rate": 1.563041164094784e-05, + "loss": 1.6636, + "step": 104400 + }, + { + "epoch": 0.6562410386345334, + "grad_norm": 6.8981852531433105, + "learning_rate": 1.5629992540003188e-05, + "loss": 1.4187, + "step": 104410 + }, + { + "epoch": 0.6563038909512305, + "grad_norm": 6.81155252456665, + "learning_rate": 1.5629573439058532e-05, + "loss": 1.5495, + "step": 104420 + }, + { + "epoch": 0.6563667432679277, + "grad_norm": 7.1588544845581055, + "learning_rate": 1.562915433811388e-05, + "loss": 1.5536, + "step": 104430 + }, + { + "epoch": 0.6564295955846248, + "grad_norm": 5.820721626281738, + "learning_rate": 1.5628735237169226e-05, + "loss": 1.4353, + "step": 104440 + }, + { + "epoch": 0.6564924479013219, + "grad_norm": 7.019535064697266, + "learning_rate": 1.5628316136224573e-05, + "loss": 1.6098, + "step": 104450 + }, + { + "epoch": 0.656555300218019, + "grad_norm": 
5.81046724319458, + "learning_rate": 1.562789703527992e-05, + "loss": 1.4722, + "step": 104460 + }, + { + "epoch": 0.6566181525347161, + "grad_norm": 6.771027088165283, + "learning_rate": 1.5627477934335264e-05, + "loss": 1.5313, + "step": 104470 + }, + { + "epoch": 0.6566810048514132, + "grad_norm": 6.619336128234863, + "learning_rate": 1.562705883339061e-05, + "loss": 1.7025, + "step": 104480 + }, + { + "epoch": 0.6567438571681103, + "grad_norm": 6.592316627502441, + "learning_rate": 1.5626639732445958e-05, + "loss": 1.6013, + "step": 104490 + }, + { + "epoch": 0.6568067094848075, + "grad_norm": 6.460874080657959, + "learning_rate": 1.5626220631501305e-05, + "loss": 1.653, + "step": 104500 + }, + { + "epoch": 0.6568695618015046, + "grad_norm": 6.1259074211120605, + "learning_rate": 1.562580153055665e-05, + "loss": 1.5678, + "step": 104510 + }, + { + "epoch": 0.6569324141182017, + "grad_norm": 5.821610927581787, + "learning_rate": 1.5625382429611996e-05, + "loss": 1.514, + "step": 104520 + }, + { + "epoch": 0.6569952664348987, + "grad_norm": 7.125027656555176, + "learning_rate": 1.5624963328667343e-05, + "loss": 1.5448, + "step": 104530 + }, + { + "epoch": 0.6570581187515958, + "grad_norm": 7.306406021118164, + "learning_rate": 1.562454422772269e-05, + "loss": 1.7979, + "step": 104540 + }, + { + "epoch": 0.6571209710682929, + "grad_norm": 6.649415493011475, + "learning_rate": 1.5624125126778037e-05, + "loss": 1.6668, + "step": 104550 + }, + { + "epoch": 0.65718382338499, + "grad_norm": 6.722707271575928, + "learning_rate": 1.5623706025833384e-05, + "loss": 1.4713, + "step": 104560 + }, + { + "epoch": 0.6572466757016872, + "grad_norm": 7.2857666015625, + "learning_rate": 1.562328692488873e-05, + "loss": 1.6118, + "step": 104570 + }, + { + "epoch": 0.6573095280183843, + "grad_norm": 7.040681838989258, + "learning_rate": 1.5622867823944078e-05, + "loss": 1.8266, + "step": 104580 + }, + { + "epoch": 0.6573723803350814, + "grad_norm": 6.947454929351807, + 
"learning_rate": 1.5622448722999425e-05, + "loss": 1.6549, + "step": 104590 + }, + { + "epoch": 0.6574352326517785, + "grad_norm": 7.445709228515625, + "learning_rate": 1.562202962205477e-05, + "loss": 1.7482, + "step": 104600 + }, + { + "epoch": 0.6574980849684756, + "grad_norm": 6.887095928192139, + "learning_rate": 1.5621610521110116e-05, + "loss": 1.4813, + "step": 104610 + }, + { + "epoch": 0.6575609372851727, + "grad_norm": 5.7909159660339355, + "learning_rate": 1.5621191420165463e-05, + "loss": 1.7325, + "step": 104620 + }, + { + "epoch": 0.6576237896018698, + "grad_norm": 5.971845626831055, + "learning_rate": 1.562077231922081e-05, + "loss": 1.849, + "step": 104630 + }, + { + "epoch": 0.657686641918567, + "grad_norm": 6.557778358459473, + "learning_rate": 1.5620353218276157e-05, + "loss": 1.6898, + "step": 104640 + }, + { + "epoch": 0.6577494942352641, + "grad_norm": 7.787708759307861, + "learning_rate": 1.56199341173315e-05, + "loss": 1.6621, + "step": 104650 + }, + { + "epoch": 0.6578123465519612, + "grad_norm": 5.77874755859375, + "learning_rate": 1.5619515016386848e-05, + "loss": 1.5902, + "step": 104660 + }, + { + "epoch": 0.6578751988686583, + "grad_norm": 6.337566375732422, + "learning_rate": 1.5619095915442195e-05, + "loss": 1.7225, + "step": 104670 + }, + { + "epoch": 0.6579380511853554, + "grad_norm": 6.183952331542969, + "learning_rate": 1.5618676814497542e-05, + "loss": 1.6201, + "step": 104680 + }, + { + "epoch": 0.6580009035020525, + "grad_norm": 5.826647758483887, + "learning_rate": 1.5618257713552886e-05, + "loss": 1.4433, + "step": 104690 + }, + { + "epoch": 0.6580637558187497, + "grad_norm": 5.762514591217041, + "learning_rate": 1.5617838612608233e-05, + "loss": 1.611, + "step": 104700 + }, + { + "epoch": 0.6581266081354468, + "grad_norm": 5.980803966522217, + "learning_rate": 1.561741951166358e-05, + "loss": 1.6396, + "step": 104710 + }, + { + "epoch": 0.6581894604521439, + "grad_norm": 7.105878829956055, + "learning_rate": 
1.5617000410718927e-05, + "loss": 1.5972, + "step": 104720 + }, + { + "epoch": 0.658252312768841, + "grad_norm": 7.1127495765686035, + "learning_rate": 1.5616581309774274e-05, + "loss": 1.6492, + "step": 104730 + }, + { + "epoch": 0.6583151650855381, + "grad_norm": 8.201815605163574, + "learning_rate": 1.5616162208829618e-05, + "loss": 1.612, + "step": 104740 + }, + { + "epoch": 0.6583780174022352, + "grad_norm": 5.925051212310791, + "learning_rate": 1.5615743107884965e-05, + "loss": 1.8186, + "step": 104750 + }, + { + "epoch": 0.6584408697189323, + "grad_norm": 6.60505485534668, + "learning_rate": 1.5615324006940312e-05, + "loss": 1.6606, + "step": 104760 + }, + { + "epoch": 0.6585037220356295, + "grad_norm": 5.959009170532227, + "learning_rate": 1.561490490599566e-05, + "loss": 1.6531, + "step": 104770 + }, + { + "epoch": 0.6585665743523266, + "grad_norm": 6.67812442779541, + "learning_rate": 1.5614485805051006e-05, + "loss": 1.7244, + "step": 104780 + }, + { + "epoch": 0.6586294266690236, + "grad_norm": 6.7957234382629395, + "learning_rate": 1.5614066704106353e-05, + "loss": 1.6334, + "step": 104790 + }, + { + "epoch": 0.6586922789857207, + "grad_norm": 5.9729390144348145, + "learning_rate": 1.56136476031617e-05, + "loss": 1.6634, + "step": 104800 + }, + { + "epoch": 0.6587551313024178, + "grad_norm": 6.336520671844482, + "learning_rate": 1.5613228502217047e-05, + "loss": 1.6001, + "step": 104810 + }, + { + "epoch": 0.6588179836191149, + "grad_norm": 7.992360591888428, + "learning_rate": 1.561280940127239e-05, + "loss": 1.5651, + "step": 104820 + }, + { + "epoch": 0.658880835935812, + "grad_norm": 6.051232814788818, + "learning_rate": 1.5612390300327738e-05, + "loss": 1.5507, + "step": 104830 + }, + { + "epoch": 0.6589436882525092, + "grad_norm": 6.314977169036865, + "learning_rate": 1.5611971199383085e-05, + "loss": 1.5137, + "step": 104840 + }, + { + "epoch": 0.6590065405692063, + "grad_norm": 7.841394424438477, + "learning_rate": 1.5611552098438432e-05, + 
"loss": 1.8054, + "step": 104850 + }, + { + "epoch": 0.6590693928859034, + "grad_norm": 6.797749042510986, + "learning_rate": 1.561113299749378e-05, + "loss": 1.7056, + "step": 104860 + }, + { + "epoch": 0.6591322452026005, + "grad_norm": 7.025812149047852, + "learning_rate": 1.5610713896549123e-05, + "loss": 1.6572, + "step": 104870 + }, + { + "epoch": 0.6591950975192976, + "grad_norm": 6.532901287078857, + "learning_rate": 1.561029479560447e-05, + "loss": 1.4581, + "step": 104880 + }, + { + "epoch": 0.6592579498359947, + "grad_norm": 6.782949924468994, + "learning_rate": 1.5609875694659817e-05, + "loss": 1.6079, + "step": 104890 + }, + { + "epoch": 0.6593208021526918, + "grad_norm": 6.53534460067749, + "learning_rate": 1.5609456593715164e-05, + "loss": 1.3991, + "step": 104900 + }, + { + "epoch": 0.659383654469389, + "grad_norm": 6.3602213859558105, + "learning_rate": 1.5609037492770508e-05, + "loss": 1.6587, + "step": 104910 + }, + { + "epoch": 0.6594465067860861, + "grad_norm": 7.155568599700928, + "learning_rate": 1.5608618391825855e-05, + "loss": 1.5366, + "step": 104920 + }, + { + "epoch": 0.6595093591027832, + "grad_norm": 6.3102264404296875, + "learning_rate": 1.5608199290881202e-05, + "loss": 1.8082, + "step": 104930 + }, + { + "epoch": 0.6595722114194803, + "grad_norm": 6.084749698638916, + "learning_rate": 1.560778018993655e-05, + "loss": 1.6722, + "step": 104940 + }, + { + "epoch": 0.6596350637361774, + "grad_norm": 5.354098320007324, + "learning_rate": 1.5607361088991896e-05, + "loss": 1.6343, + "step": 104950 + }, + { + "epoch": 0.6596979160528745, + "grad_norm": 6.635079383850098, + "learning_rate": 1.5606941988047243e-05, + "loss": 1.6978, + "step": 104960 + }, + { + "epoch": 0.6597607683695716, + "grad_norm": 6.3366875648498535, + "learning_rate": 1.560652288710259e-05, + "loss": 1.7198, + "step": 104970 + }, + { + "epoch": 0.6598236206862688, + "grad_norm": 6.161338806152344, + "learning_rate": 1.5606103786157937e-05, + "loss": 1.7174, + "step": 
104980 + }, + { + "epoch": 0.6598864730029659, + "grad_norm": 6.67751407623291, + "learning_rate": 1.560568468521328e-05, + "loss": 1.5836, + "step": 104990 + }, + { + "epoch": 0.659949325319663, + "grad_norm": 6.224991798400879, + "learning_rate": 1.5605265584268628e-05, + "loss": 1.7559, + "step": 105000 + }, + { + "epoch": 0.6600121776363601, + "grad_norm": 6.425436496734619, + "learning_rate": 1.5604846483323975e-05, + "loss": 1.8432, + "step": 105010 + }, + { + "epoch": 0.6600750299530572, + "grad_norm": 7.239255905151367, + "learning_rate": 1.5604427382379322e-05, + "loss": 1.7081, + "step": 105020 + }, + { + "epoch": 0.6601378822697543, + "grad_norm": 6.133124351501465, + "learning_rate": 1.560400828143467e-05, + "loss": 1.5968, + "step": 105030 + }, + { + "epoch": 0.6602007345864513, + "grad_norm": 6.762413024902344, + "learning_rate": 1.5603589180490013e-05, + "loss": 1.9224, + "step": 105040 + }, + { + "epoch": 0.6602635869031485, + "grad_norm": 6.529871463775635, + "learning_rate": 1.560317007954536e-05, + "loss": 1.6763, + "step": 105050 + }, + { + "epoch": 0.6603264392198456, + "grad_norm": 6.822028160095215, + "learning_rate": 1.5602750978600707e-05, + "loss": 1.7081, + "step": 105060 + }, + { + "epoch": 0.6603892915365427, + "grad_norm": 6.54036808013916, + "learning_rate": 1.5602331877656054e-05, + "loss": 1.6399, + "step": 105070 + }, + { + "epoch": 0.6604521438532398, + "grad_norm": 5.844644069671631, + "learning_rate": 1.56019127767114e-05, + "loss": 1.466, + "step": 105080 + }, + { + "epoch": 0.6605149961699369, + "grad_norm": 7.746768951416016, + "learning_rate": 1.5601493675766745e-05, + "loss": 1.664, + "step": 105090 + }, + { + "epoch": 0.660577848486634, + "grad_norm": 7.402467727661133, + "learning_rate": 1.5601074574822092e-05, + "loss": 1.564, + "step": 105100 + }, + { + "epoch": 0.6606407008033311, + "grad_norm": 6.537257671356201, + "learning_rate": 1.560065547387744e-05, + "loss": 1.5916, + "step": 105110 + }, + { + "epoch": 
0.6607035531200283, + "grad_norm": 6.990403652191162, + "learning_rate": 1.5600236372932786e-05, + "loss": 1.7585, + "step": 105120 + }, + { + "epoch": 0.6607664054367254, + "grad_norm": 6.598796844482422, + "learning_rate": 1.559981727198813e-05, + "loss": 1.7664, + "step": 105130 + }, + { + "epoch": 0.6608292577534225, + "grad_norm": 6.536150932312012, + "learning_rate": 1.5599398171043477e-05, + "loss": 1.6822, + "step": 105140 + }, + { + "epoch": 0.6608921100701196, + "grad_norm": 6.739705562591553, + "learning_rate": 1.5598979070098824e-05, + "loss": 1.6046, + "step": 105150 + }, + { + "epoch": 0.6609549623868167, + "grad_norm": 6.427530288696289, + "learning_rate": 1.559855996915417e-05, + "loss": 1.7359, + "step": 105160 + }, + { + "epoch": 0.6610178147035138, + "grad_norm": 6.731106758117676, + "learning_rate": 1.5598140868209518e-05, + "loss": 1.6558, + "step": 105170 + }, + { + "epoch": 0.661080667020211, + "grad_norm": 6.3343400955200195, + "learning_rate": 1.5597721767264865e-05, + "loss": 1.477, + "step": 105180 + }, + { + "epoch": 0.6611435193369081, + "grad_norm": 5.260409832000732, + "learning_rate": 1.5597302666320212e-05, + "loss": 1.7898, + "step": 105190 + }, + { + "epoch": 0.6612063716536052, + "grad_norm": 6.937628269195557, + "learning_rate": 1.559688356537556e-05, + "loss": 1.522, + "step": 105200 + }, + { + "epoch": 0.6612692239703023, + "grad_norm": 6.389298915863037, + "learning_rate": 1.5596464464430906e-05, + "loss": 1.8267, + "step": 105210 + }, + { + "epoch": 0.6613320762869994, + "grad_norm": 6.643794536590576, + "learning_rate": 1.559604536348625e-05, + "loss": 1.4692, + "step": 105220 + }, + { + "epoch": 0.6613949286036965, + "grad_norm": 8.866166114807129, + "learning_rate": 1.5595626262541597e-05, + "loss": 1.7742, + "step": 105230 + }, + { + "epoch": 0.6614577809203936, + "grad_norm": 5.795926094055176, + "learning_rate": 1.5595207161596944e-05, + "loss": 1.7647, + "step": 105240 + }, + { + "epoch": 0.6615206332370908, + 
"grad_norm": 6.502231597900391, + "learning_rate": 1.559478806065229e-05, + "loss": 1.8489, + "step": 105250 + }, + { + "epoch": 0.6615834855537879, + "grad_norm": 5.95989465713501, + "learning_rate": 1.559436895970764e-05, + "loss": 1.4893, + "step": 105260 + }, + { + "epoch": 0.661646337870485, + "grad_norm": 6.643709659576416, + "learning_rate": 1.5593949858762982e-05, + "loss": 1.6102, + "step": 105270 + }, + { + "epoch": 0.6617091901871821, + "grad_norm": 6.032772064208984, + "learning_rate": 1.559353075781833e-05, + "loss": 1.6731, + "step": 105280 + }, + { + "epoch": 0.6617720425038792, + "grad_norm": 6.644000053405762, + "learning_rate": 1.5593111656873676e-05, + "loss": 1.4855, + "step": 105290 + }, + { + "epoch": 0.6618348948205762, + "grad_norm": 7.02887487411499, + "learning_rate": 1.5592692555929023e-05, + "loss": 1.5104, + "step": 105300 + }, + { + "epoch": 0.6618977471372733, + "grad_norm": 6.866332530975342, + "learning_rate": 1.5592273454984367e-05, + "loss": 1.5726, + "step": 105310 + }, + { + "epoch": 0.6619605994539705, + "grad_norm": 6.947324752807617, + "learning_rate": 1.5591854354039714e-05, + "loss": 1.545, + "step": 105320 + }, + { + "epoch": 0.6620234517706676, + "grad_norm": 7.434155464172363, + "learning_rate": 1.559143525309506e-05, + "loss": 1.642, + "step": 105330 + }, + { + "epoch": 0.6620863040873647, + "grad_norm": 6.633470058441162, + "learning_rate": 1.5591016152150408e-05, + "loss": 1.6313, + "step": 105340 + }, + { + "epoch": 0.6621491564040618, + "grad_norm": 7.02971887588501, + "learning_rate": 1.5590597051205755e-05, + "loss": 1.4788, + "step": 105350 + }, + { + "epoch": 0.6622120087207589, + "grad_norm": 5.845401763916016, + "learning_rate": 1.5590177950261102e-05, + "loss": 1.6663, + "step": 105360 + }, + { + "epoch": 0.662274861037456, + "grad_norm": 6.136750221252441, + "learning_rate": 1.5589758849316446e-05, + "loss": 1.781, + "step": 105370 + }, + { + "epoch": 0.6623377133541531, + "grad_norm": 5.939883708953857, + 
"learning_rate": 1.5589339748371793e-05, + "loss": 1.8151, + "step": 105380 + }, + { + "epoch": 0.6624005656708503, + "grad_norm": 7.601027965545654, + "learning_rate": 1.558892064742714e-05, + "loss": 1.5708, + "step": 105390 + }, + { + "epoch": 0.6624634179875474, + "grad_norm": 6.943202018737793, + "learning_rate": 1.5588501546482487e-05, + "loss": 1.7305, + "step": 105400 + }, + { + "epoch": 0.6625262703042445, + "grad_norm": 6.401005744934082, + "learning_rate": 1.5588082445537834e-05, + "loss": 1.8093, + "step": 105410 + }, + { + "epoch": 0.6625891226209416, + "grad_norm": 6.7155070304870605, + "learning_rate": 1.558766334459318e-05, + "loss": 1.7965, + "step": 105420 + }, + { + "epoch": 0.6626519749376387, + "grad_norm": 8.192904472351074, + "learning_rate": 1.558724424364853e-05, + "loss": 1.7007, + "step": 105430 + }, + { + "epoch": 0.6627148272543358, + "grad_norm": 6.841559410095215, + "learning_rate": 1.5586825142703872e-05, + "loss": 1.6327, + "step": 105440 + }, + { + "epoch": 0.662777679571033, + "grad_norm": 7.527811050415039, + "learning_rate": 1.558640604175922e-05, + "loss": 1.7499, + "step": 105450 + }, + { + "epoch": 0.6628405318877301, + "grad_norm": 5.782719135284424, + "learning_rate": 1.5585986940814566e-05, + "loss": 1.5712, + "step": 105460 + }, + { + "epoch": 0.6629033842044272, + "grad_norm": 6.337703227996826, + "learning_rate": 1.5585567839869913e-05, + "loss": 1.7707, + "step": 105470 + }, + { + "epoch": 0.6629662365211243, + "grad_norm": 5.9730544090271, + "learning_rate": 1.558514873892526e-05, + "loss": 1.5823, + "step": 105480 + }, + { + "epoch": 0.6630290888378214, + "grad_norm": 6.3289666175842285, + "learning_rate": 1.5584729637980604e-05, + "loss": 1.5429, + "step": 105490 + }, + { + "epoch": 0.6630919411545185, + "grad_norm": 6.009237289428711, + "learning_rate": 1.558431053703595e-05, + "loss": 1.6163, + "step": 105500 + }, + { + "epoch": 0.6631547934712156, + "grad_norm": 7.3155436515808105, + "learning_rate": 
1.5583891436091298e-05, + "loss": 1.7001, + "step": 105510 + }, + { + "epoch": 0.6632176457879128, + "grad_norm": 7.042203426361084, + "learning_rate": 1.5583472335146645e-05, + "loss": 1.8785, + "step": 105520 + }, + { + "epoch": 0.6632804981046099, + "grad_norm": 6.458547115325928, + "learning_rate": 1.558305323420199e-05, + "loss": 1.4179, + "step": 105530 + }, + { + "epoch": 0.663343350421307, + "grad_norm": 6.338866233825684, + "learning_rate": 1.5582634133257336e-05, + "loss": 1.6736, + "step": 105540 + }, + { + "epoch": 0.663406202738004, + "grad_norm": 6.325279712677002, + "learning_rate": 1.5582215032312683e-05, + "loss": 1.5982, + "step": 105550 + }, + { + "epoch": 0.6634690550547011, + "grad_norm": 6.828125953674316, + "learning_rate": 1.558179593136803e-05, + "loss": 1.951, + "step": 105560 + }, + { + "epoch": 0.6635319073713982, + "grad_norm": 6.712599277496338, + "learning_rate": 1.5581376830423377e-05, + "loss": 1.534, + "step": 105570 + }, + { + "epoch": 0.6635947596880953, + "grad_norm": 6.4551262855529785, + "learning_rate": 1.5580957729478724e-05, + "loss": 1.5952, + "step": 105580 + }, + { + "epoch": 0.6636576120047925, + "grad_norm": 6.0207977294921875, + "learning_rate": 1.558053862853407e-05, + "loss": 1.6773, + "step": 105590 + }, + { + "epoch": 0.6637204643214896, + "grad_norm": 7.634629249572754, + "learning_rate": 1.558011952758942e-05, + "loss": 1.752, + "step": 105600 + }, + { + "epoch": 0.6637833166381867, + "grad_norm": 6.328338623046875, + "learning_rate": 1.5579700426644765e-05, + "loss": 1.5793, + "step": 105610 + }, + { + "epoch": 0.6638461689548838, + "grad_norm": 5.922032356262207, + "learning_rate": 1.557928132570011e-05, + "loss": 1.4599, + "step": 105620 + }, + { + "epoch": 0.6639090212715809, + "grad_norm": 6.268474102020264, + "learning_rate": 1.5578862224755456e-05, + "loss": 1.6257, + "step": 105630 + }, + { + "epoch": 0.663971873588278, + "grad_norm": 6.577045917510986, + "learning_rate": 1.5578443123810803e-05, + 
"loss": 1.5157, + "step": 105640 + }, + { + "epoch": 0.6640347259049751, + "grad_norm": 5.638345241546631, + "learning_rate": 1.557802402286615e-05, + "loss": 1.578, + "step": 105650 + }, + { + "epoch": 0.6640975782216723, + "grad_norm": 6.140465259552002, + "learning_rate": 1.5577604921921494e-05, + "loss": 1.5578, + "step": 105660 + }, + { + "epoch": 0.6641604305383694, + "grad_norm": 6.0519843101501465, + "learning_rate": 1.557718582097684e-05, + "loss": 1.5569, + "step": 105670 + }, + { + "epoch": 0.6642232828550665, + "grad_norm": 7.764479160308838, + "learning_rate": 1.5576766720032188e-05, + "loss": 1.5647, + "step": 105680 + }, + { + "epoch": 0.6642861351717636, + "grad_norm": 6.630612373352051, + "learning_rate": 1.5576347619087535e-05, + "loss": 1.6348, + "step": 105690 + }, + { + "epoch": 0.6643489874884607, + "grad_norm": 6.928365707397461, + "learning_rate": 1.5575928518142882e-05, + "loss": 1.5322, + "step": 105700 + }, + { + "epoch": 0.6644118398051578, + "grad_norm": 6.307169437408447, + "learning_rate": 1.5575509417198226e-05, + "loss": 1.6617, + "step": 105710 + }, + { + "epoch": 0.664474692121855, + "grad_norm": 6.200599193572998, + "learning_rate": 1.5575090316253573e-05, + "loss": 1.862, + "step": 105720 + }, + { + "epoch": 0.6645375444385521, + "grad_norm": 6.4765167236328125, + "learning_rate": 1.557467121530892e-05, + "loss": 1.7521, + "step": 105730 + }, + { + "epoch": 0.6646003967552492, + "grad_norm": 8.003438949584961, + "learning_rate": 1.5574252114364267e-05, + "loss": 1.591, + "step": 105740 + }, + { + "epoch": 0.6646632490719463, + "grad_norm": 5.496792793273926, + "learning_rate": 1.557383301341961e-05, + "loss": 1.4939, + "step": 105750 + }, + { + "epoch": 0.6647261013886434, + "grad_norm": 7.4300713539123535, + "learning_rate": 1.5573413912474958e-05, + "loss": 1.728, + "step": 105760 + }, + { + "epoch": 0.6647889537053405, + "grad_norm": 7.0992431640625, + "learning_rate": 1.5572994811530305e-05, + "loss": 1.5779, + "step": 
105770 + }, + { + "epoch": 0.6648518060220376, + "grad_norm": 6.628957748413086, + "learning_rate": 1.5572575710585652e-05, + "loss": 1.4178, + "step": 105780 + }, + { + "epoch": 0.6649146583387348, + "grad_norm": 7.31161642074585, + "learning_rate": 1.5572156609641e-05, + "loss": 1.7454, + "step": 105790 + }, + { + "epoch": 0.6649775106554319, + "grad_norm": 7.662426471710205, + "learning_rate": 1.5571737508696346e-05, + "loss": 1.6708, + "step": 105800 + }, + { + "epoch": 0.6650403629721289, + "grad_norm": 6.7243781089782715, + "learning_rate": 1.5571318407751693e-05, + "loss": 1.619, + "step": 105810 + }, + { + "epoch": 0.665103215288826, + "grad_norm": 6.5975260734558105, + "learning_rate": 1.557089930680704e-05, + "loss": 1.649, + "step": 105820 + }, + { + "epoch": 0.6651660676055231, + "grad_norm": 7.845924377441406, + "learning_rate": 1.5570480205862387e-05, + "loss": 1.6569, + "step": 105830 + }, + { + "epoch": 0.6652289199222202, + "grad_norm": 7.023687839508057, + "learning_rate": 1.557006110491773e-05, + "loss": 1.5158, + "step": 105840 + }, + { + "epoch": 0.6652917722389173, + "grad_norm": 6.436861991882324, + "learning_rate": 1.5569642003973078e-05, + "loss": 1.3093, + "step": 105850 + }, + { + "epoch": 0.6653546245556144, + "grad_norm": 6.995415210723877, + "learning_rate": 1.5569222903028425e-05, + "loss": 1.8804, + "step": 105860 + }, + { + "epoch": 0.6654174768723116, + "grad_norm": 6.78193998336792, + "learning_rate": 1.5568803802083772e-05, + "loss": 1.9103, + "step": 105870 + }, + { + "epoch": 0.6654803291890087, + "grad_norm": 7.750466346740723, + "learning_rate": 1.556838470113912e-05, + "loss": 1.573, + "step": 105880 + }, + { + "epoch": 0.6655431815057058, + "grad_norm": 8.08763313293457, + "learning_rate": 1.5567965600194463e-05, + "loss": 1.6636, + "step": 105890 + }, + { + "epoch": 0.6656060338224029, + "grad_norm": 6.756246089935303, + "learning_rate": 1.556754649924981e-05, + "loss": 1.5212, + "step": 105900 + }, + { + "epoch": 
0.6656688861391, + "grad_norm": 6.320174217224121, + "learning_rate": 1.5567127398305157e-05, + "loss": 1.6462, + "step": 105910 + }, + { + "epoch": 0.6657317384557971, + "grad_norm": 6.737569808959961, + "learning_rate": 1.5566708297360504e-05, + "loss": 1.6862, + "step": 105920 + }, + { + "epoch": 0.6657945907724943, + "grad_norm": 7.224164009094238, + "learning_rate": 1.5566289196415848e-05, + "loss": 1.7514, + "step": 105930 + }, + { + "epoch": 0.6658574430891914, + "grad_norm": 6.790672302246094, + "learning_rate": 1.5565870095471195e-05, + "loss": 1.6229, + "step": 105940 + }, + { + "epoch": 0.6659202954058885, + "grad_norm": 5.7221269607543945, + "learning_rate": 1.5565450994526542e-05, + "loss": 1.7798, + "step": 105950 + }, + { + "epoch": 0.6659831477225856, + "grad_norm": 6.954379558563232, + "learning_rate": 1.556503189358189e-05, + "loss": 1.6222, + "step": 105960 + }, + { + "epoch": 0.6660460000392827, + "grad_norm": 6.151113033294678, + "learning_rate": 1.55646547027317e-05, + "loss": 1.3968, + "step": 105970 + }, + { + "epoch": 0.6661088523559798, + "grad_norm": 6.866469860076904, + "learning_rate": 1.5564235601787048e-05, + "loss": 1.4886, + "step": 105980 + }, + { + "epoch": 0.666171704672677, + "grad_norm": 6.896879196166992, + "learning_rate": 1.5563816500842395e-05, + "loss": 1.7967, + "step": 105990 + }, + { + "epoch": 0.6662345569893741, + "grad_norm": 5.111074924468994, + "learning_rate": 1.5563397399897742e-05, + "loss": 1.4791, + "step": 106000 + }, + { + "epoch": 0.6662974093060712, + "grad_norm": 6.863555908203125, + "learning_rate": 1.5562978298953085e-05, + "loss": 1.6016, + "step": 106010 + }, + { + "epoch": 0.6663602616227683, + "grad_norm": 6.542895793914795, + "learning_rate": 1.5562559198008433e-05, + "loss": 1.6089, + "step": 106020 + }, + { + "epoch": 0.6664231139394654, + "grad_norm": 7.639867305755615, + "learning_rate": 1.556214009706378e-05, + "loss": 1.8336, + "step": 106030 + }, + { + "epoch": 0.6664859662561625, + 
"grad_norm": 6.384363651275635, + "learning_rate": 1.5561720996119127e-05, + "loss": 1.7023, + "step": 106040 + }, + { + "epoch": 0.6665488185728596, + "grad_norm": 6.245970249176025, + "learning_rate": 1.5561301895174474e-05, + "loss": 1.6612, + "step": 106050 + }, + { + "epoch": 0.6666116708895566, + "grad_norm": 5.9655442237854, + "learning_rate": 1.5560882794229817e-05, + "loss": 1.5802, + "step": 106060 + }, + { + "epoch": 0.6666745232062538, + "grad_norm": 6.154887676239014, + "learning_rate": 1.5560463693285164e-05, + "loss": 1.5052, + "step": 106070 + }, + { + "epoch": 0.6667373755229509, + "grad_norm": 7.505038738250732, + "learning_rate": 1.556004459234051e-05, + "loss": 1.5395, + "step": 106080 + }, + { + "epoch": 0.666800227839648, + "grad_norm": 5.876123428344727, + "learning_rate": 1.5559667401490323e-05, + "loss": 1.5256, + "step": 106090 + }, + { + "epoch": 0.6668630801563451, + "grad_norm": 6.18739652633667, + "learning_rate": 1.555924830054567e-05, + "loss": 1.5564, + "step": 106100 + }, + { + "epoch": 0.6669259324730422, + "grad_norm": 7.498041152954102, + "learning_rate": 1.5558829199601017e-05, + "loss": 1.5418, + "step": 106110 + }, + { + "epoch": 0.6669887847897393, + "grad_norm": 6.704996109008789, + "learning_rate": 1.5558410098656364e-05, + "loss": 1.64, + "step": 106120 + }, + { + "epoch": 0.6670516371064364, + "grad_norm": 6.852806091308594, + "learning_rate": 1.5557990997711708e-05, + "loss": 1.6613, + "step": 106130 + }, + { + "epoch": 0.6671144894231336, + "grad_norm": 6.0546650886535645, + "learning_rate": 1.5557571896767055e-05, + "loss": 1.6197, + "step": 106140 + }, + { + "epoch": 0.6671773417398307, + "grad_norm": 7.7422356605529785, + "learning_rate": 1.5557152795822402e-05, + "loss": 1.7779, + "step": 106150 + }, + { + "epoch": 0.6672401940565278, + "grad_norm": 6.542478561401367, + "learning_rate": 1.555673369487775e-05, + "loss": 1.6599, + "step": 106160 + }, + { + "epoch": 0.6673030463732249, + "grad_norm": 6.31635046005249, 
+ "learning_rate": 1.5556314593933096e-05, + "loss": 1.4869, + "step": 106170 + }, + { + "epoch": 0.667365898689922, + "grad_norm": 5.072801113128662, + "learning_rate": 1.5555895492988443e-05, + "loss": 1.4812, + "step": 106180 + }, + { + "epoch": 0.6674287510066191, + "grad_norm": 7.292965412139893, + "learning_rate": 1.555547639204379e-05, + "loss": 1.5579, + "step": 106190 + }, + { + "epoch": 0.6674916033233163, + "grad_norm": 8.103798866271973, + "learning_rate": 1.5555057291099137e-05, + "loss": 1.7417, + "step": 106200 + }, + { + "epoch": 0.6675544556400134, + "grad_norm": 6.62891960144043, + "learning_rate": 1.5554638190154484e-05, + "loss": 1.8203, + "step": 106210 + }, + { + "epoch": 0.6676173079567105, + "grad_norm": 6.723920822143555, + "learning_rate": 1.5554219089209828e-05, + "loss": 1.5491, + "step": 106220 + }, + { + "epoch": 0.6676801602734076, + "grad_norm": 5.83394718170166, + "learning_rate": 1.5553799988265175e-05, + "loss": 1.7686, + "step": 106230 + }, + { + "epoch": 0.6677430125901047, + "grad_norm": 6.1524810791015625, + "learning_rate": 1.5553380887320522e-05, + "loss": 1.651, + "step": 106240 + }, + { + "epoch": 0.6678058649068018, + "grad_norm": 5.9737372398376465, + "learning_rate": 1.555296178637587e-05, + "loss": 1.8207, + "step": 106250 + }, + { + "epoch": 0.6678687172234989, + "grad_norm": 7.384658336639404, + "learning_rate": 1.5552542685431213e-05, + "loss": 1.7481, + "step": 106260 + }, + { + "epoch": 0.6679315695401961, + "grad_norm": 6.399549961090088, + "learning_rate": 1.555212358448656e-05, + "loss": 1.7386, + "step": 106270 + }, + { + "epoch": 0.6679944218568932, + "grad_norm": 6.9837727546691895, + "learning_rate": 1.5551704483541907e-05, + "loss": 1.7751, + "step": 106280 + }, + { + "epoch": 0.6680572741735903, + "grad_norm": 5.711150169372559, + "learning_rate": 1.5551285382597254e-05, + "loss": 1.577, + "step": 106290 + }, + { + "epoch": 0.6681201264902874, + "grad_norm": 5.359866142272949, + "learning_rate": 
1.5550866281652598e-05, + "loss": 1.6143, + "step": 106300 + }, + { + "epoch": 0.6681829788069845, + "grad_norm": 6.565360069274902, + "learning_rate": 1.5550447180707945e-05, + "loss": 1.5832, + "step": 106310 + }, + { + "epoch": 0.6682458311236815, + "grad_norm": 6.1983771324157715, + "learning_rate": 1.5550028079763292e-05, + "loss": 1.7735, + "step": 106320 + }, + { + "epoch": 0.6683086834403786, + "grad_norm": 6.371673107147217, + "learning_rate": 1.554960897881864e-05, + "loss": 1.4869, + "step": 106330 + }, + { + "epoch": 0.6683715357570758, + "grad_norm": 5.785719394683838, + "learning_rate": 1.5549189877873986e-05, + "loss": 1.7531, + "step": 106340 + }, + { + "epoch": 0.6684343880737729, + "grad_norm": 7.046372890472412, + "learning_rate": 1.5548770776929333e-05, + "loss": 1.6951, + "step": 106350 + }, + { + "epoch": 0.66849724039047, + "grad_norm": 7.223507404327393, + "learning_rate": 1.554835167598468e-05, + "loss": 1.6526, + "step": 106360 + }, + { + "epoch": 0.6685600927071671, + "grad_norm": 6.702063083648682, + "learning_rate": 1.5547932575040024e-05, + "loss": 1.476, + "step": 106370 + }, + { + "epoch": 0.6686229450238642, + "grad_norm": 5.8352370262146, + "learning_rate": 1.554751347409537e-05, + "loss": 1.5178, + "step": 106380 + }, + { + "epoch": 0.6686857973405613, + "grad_norm": 6.32943058013916, + "learning_rate": 1.5547094373150718e-05, + "loss": 1.577, + "step": 106390 + }, + { + "epoch": 0.6687486496572584, + "grad_norm": 6.566135406494141, + "learning_rate": 1.5546675272206065e-05, + "loss": 1.5621, + "step": 106400 + }, + { + "epoch": 0.6688115019739556, + "grad_norm": 6.144376754760742, + "learning_rate": 1.5546256171261412e-05, + "loss": 1.6911, + "step": 106410 + }, + { + "epoch": 0.6688743542906527, + "grad_norm": 6.189729690551758, + "learning_rate": 1.554583707031676e-05, + "loss": 1.4312, + "step": 106420 + }, + { + "epoch": 0.6689372066073498, + "grad_norm": 6.6637773513793945, + "learning_rate": 1.5545417969372106e-05, + 
"loss": 1.6125, + "step": 106430 + }, + { + "epoch": 0.6690000589240469, + "grad_norm": 6.491280555725098, + "learning_rate": 1.554499886842745e-05, + "loss": 1.7233, + "step": 106440 + }, + { + "epoch": 0.669062911240744, + "grad_norm": 7.1595354080200195, + "learning_rate": 1.5544579767482797e-05, + "loss": 1.7563, + "step": 106450 + }, + { + "epoch": 0.6691257635574411, + "grad_norm": 6.155887603759766, + "learning_rate": 1.5544160666538144e-05, + "loss": 1.8459, + "step": 106460 + }, + { + "epoch": 0.6691886158741382, + "grad_norm": 7.118120193481445, + "learning_rate": 1.554374156559349e-05, + "loss": 1.6809, + "step": 106470 + }, + { + "epoch": 0.6692514681908354, + "grad_norm": 5.1893486976623535, + "learning_rate": 1.5543322464648835e-05, + "loss": 1.6264, + "step": 106480 + }, + { + "epoch": 0.6693143205075325, + "grad_norm": 6.951686382293701, + "learning_rate": 1.5542903363704182e-05, + "loss": 1.6391, + "step": 106490 + }, + { + "epoch": 0.6693771728242296, + "grad_norm": 6.366652488708496, + "learning_rate": 1.554248426275953e-05, + "loss": 1.4176, + "step": 106500 + }, + { + "epoch": 0.6694400251409267, + "grad_norm": 6.808743953704834, + "learning_rate": 1.5542065161814876e-05, + "loss": 1.7209, + "step": 106510 + }, + { + "epoch": 0.6695028774576238, + "grad_norm": 5.900963306427002, + "learning_rate": 1.5541646060870223e-05, + "loss": 1.5499, + "step": 106520 + }, + { + "epoch": 0.6695657297743209, + "grad_norm": 7.198376178741455, + "learning_rate": 1.5541226959925567e-05, + "loss": 1.5471, + "step": 106530 + }, + { + "epoch": 0.669628582091018, + "grad_norm": 7.565441131591797, + "learning_rate": 1.5540807858980914e-05, + "loss": 1.6335, + "step": 106540 + }, + { + "epoch": 0.6696914344077152, + "grad_norm": 7.076322078704834, + "learning_rate": 1.554038875803626e-05, + "loss": 1.8162, + "step": 106550 + }, + { + "epoch": 0.6697542867244123, + "grad_norm": 5.171063423156738, + "learning_rate": 1.5539969657091608e-05, + "loss": 1.7026, + "step": 
106560 + }, + { + "epoch": 0.6698171390411093, + "grad_norm": 5.83885383605957, + "learning_rate": 1.5539550556146955e-05, + "loss": 1.3154, + "step": 106570 + }, + { + "epoch": 0.6698799913578064, + "grad_norm": 5.799517631530762, + "learning_rate": 1.5539131455202302e-05, + "loss": 1.5434, + "step": 106580 + }, + { + "epoch": 0.6699428436745035, + "grad_norm": 6.511415958404541, + "learning_rate": 1.553871235425765e-05, + "loss": 1.6854, + "step": 106590 + }, + { + "epoch": 0.6700056959912006, + "grad_norm": 6.911275863647461, + "learning_rate": 1.5538293253312996e-05, + "loss": 1.8594, + "step": 106600 + }, + { + "epoch": 0.6700685483078977, + "grad_norm": 6.210766315460205, + "learning_rate": 1.553787415236834e-05, + "loss": 1.6294, + "step": 106610 + }, + { + "epoch": 0.6701314006245949, + "grad_norm": 5.71303653717041, + "learning_rate": 1.5537455051423687e-05, + "loss": 1.7189, + "step": 106620 + }, + { + "epoch": 0.670194252941292, + "grad_norm": 6.001368045806885, + "learning_rate": 1.5537035950479034e-05, + "loss": 1.6858, + "step": 106630 + }, + { + "epoch": 0.6702571052579891, + "grad_norm": 6.6295695304870605, + "learning_rate": 1.553661684953438e-05, + "loss": 1.7003, + "step": 106640 + }, + { + "epoch": 0.6703199575746862, + "grad_norm": 7.705511093139648, + "learning_rate": 1.553619774858973e-05, + "loss": 1.5779, + "step": 106650 + }, + { + "epoch": 0.6703828098913833, + "grad_norm": 7.167219638824463, + "learning_rate": 1.5535778647645072e-05, + "loss": 1.7056, + "step": 106660 + }, + { + "epoch": 0.6704456622080804, + "grad_norm": 4.897042751312256, + "learning_rate": 1.553535954670042e-05, + "loss": 1.574, + "step": 106670 + }, + { + "epoch": 0.6705085145247776, + "grad_norm": 6.355395317077637, + "learning_rate": 1.5534940445755766e-05, + "loss": 1.8371, + "step": 106680 + }, + { + "epoch": 0.6705713668414747, + "grad_norm": 5.6377482414245605, + "learning_rate": 1.5534521344811113e-05, + "loss": 1.6737, + "step": 106690 + }, + { + "epoch": 
0.6706342191581718, + "grad_norm": 6.316112518310547, + "learning_rate": 1.5534102243866457e-05, + "loss": 1.6156, + "step": 106700 + }, + { + "epoch": 0.6706970714748689, + "grad_norm": 6.347810745239258, + "learning_rate": 1.5533683142921804e-05, + "loss": 1.9162, + "step": 106710 + }, + { + "epoch": 0.670759923791566, + "grad_norm": 6.955513000488281, + "learning_rate": 1.553326404197715e-05, + "loss": 1.7304, + "step": 106720 + }, + { + "epoch": 0.6708227761082631, + "grad_norm": 6.361878871917725, + "learning_rate": 1.5532844941032498e-05, + "loss": 1.6015, + "step": 106730 + }, + { + "epoch": 0.6708856284249602, + "grad_norm": 7.454643249511719, + "learning_rate": 1.5532425840087845e-05, + "loss": 1.7404, + "step": 106740 + }, + { + "epoch": 0.6709484807416574, + "grad_norm": 7.591629981994629, + "learning_rate": 1.553200673914319e-05, + "loss": 1.5899, + "step": 106750 + }, + { + "epoch": 0.6710113330583545, + "grad_norm": 5.682295799255371, + "learning_rate": 1.5531587638198536e-05, + "loss": 1.7101, + "step": 106760 + }, + { + "epoch": 0.6710741853750516, + "grad_norm": 7.851317882537842, + "learning_rate": 1.5531168537253883e-05, + "loss": 1.8985, + "step": 106770 + }, + { + "epoch": 0.6711370376917487, + "grad_norm": 6.232215881347656, + "learning_rate": 1.553074943630923e-05, + "loss": 1.7223, + "step": 106780 + }, + { + "epoch": 0.6711998900084458, + "grad_norm": 5.8570146560668945, + "learning_rate": 1.5530330335364577e-05, + "loss": 1.6537, + "step": 106790 + }, + { + "epoch": 0.6712627423251429, + "grad_norm": 7.223216533660889, + "learning_rate": 1.5529911234419924e-05, + "loss": 1.7117, + "step": 106800 + }, + { + "epoch": 0.67132559464184, + "grad_norm": 7.296375751495361, + "learning_rate": 1.552949213347527e-05, + "loss": 1.4305, + "step": 106810 + }, + { + "epoch": 0.6713884469585372, + "grad_norm": 6.399417400360107, + "learning_rate": 1.552907303253062e-05, + "loss": 1.6044, + "step": 106820 + }, + { + "epoch": 0.6714512992752342, + 
"grad_norm": 5.918104648590088, + "learning_rate": 1.5528653931585965e-05, + "loss": 1.8545, + "step": 106830 + }, + { + "epoch": 0.6715141515919313, + "grad_norm": 6.380157947540283, + "learning_rate": 1.552823483064131e-05, + "loss": 1.7125, + "step": 106840 + }, + { + "epoch": 0.6715770039086284, + "grad_norm": 6.100411415100098, + "learning_rate": 1.5527815729696656e-05, + "loss": 1.5642, + "step": 106850 + }, + { + "epoch": 0.6716398562253255, + "grad_norm": 6.3617730140686035, + "learning_rate": 1.5527396628752003e-05, + "loss": 1.6316, + "step": 106860 + }, + { + "epoch": 0.6717027085420226, + "grad_norm": 6.027811527252197, + "learning_rate": 1.552697752780735e-05, + "loss": 1.5886, + "step": 106870 + }, + { + "epoch": 0.6717655608587197, + "grad_norm": 6.4630537033081055, + "learning_rate": 1.5526558426862694e-05, + "loss": 1.5688, + "step": 106880 + }, + { + "epoch": 0.6718284131754169, + "grad_norm": 6.114879608154297, + "learning_rate": 1.552613932591804e-05, + "loss": 1.4731, + "step": 106890 + }, + { + "epoch": 0.671891265492114, + "grad_norm": 6.567522048950195, + "learning_rate": 1.5525720224973388e-05, + "loss": 1.6373, + "step": 106900 + }, + { + "epoch": 0.6719541178088111, + "grad_norm": 7.470694065093994, + "learning_rate": 1.5525301124028735e-05, + "loss": 1.8952, + "step": 106910 + }, + { + "epoch": 0.6720169701255082, + "grad_norm": 6.166024208068848, + "learning_rate": 1.552488202308408e-05, + "loss": 1.8146, + "step": 106920 + }, + { + "epoch": 0.6720798224422053, + "grad_norm": 6.314844608306885, + "learning_rate": 1.5524462922139426e-05, + "loss": 1.5766, + "step": 106930 + }, + { + "epoch": 0.6721426747589024, + "grad_norm": 6.448526859283447, + "learning_rate": 1.5524043821194773e-05, + "loss": 1.7526, + "step": 106940 + }, + { + "epoch": 0.6722055270755996, + "grad_norm": 5.572268009185791, + "learning_rate": 1.552362472025012e-05, + "loss": 1.6016, + "step": 106950 + }, + { + "epoch": 0.6722683793922967, + "grad_norm": 
7.105794429779053, + "learning_rate": 1.5523205619305467e-05, + "loss": 1.6695, + "step": 106960 + }, + { + "epoch": 0.6723312317089938, + "grad_norm": 8.571908950805664, + "learning_rate": 1.5522786518360814e-05, + "loss": 1.9172, + "step": 106970 + }, + { + "epoch": 0.6723940840256909, + "grad_norm": 6.695889949798584, + "learning_rate": 1.552236741741616e-05, + "loss": 1.4954, + "step": 106980 + }, + { + "epoch": 0.672456936342388, + "grad_norm": 6.310705184936523, + "learning_rate": 1.552194831647151e-05, + "loss": 1.8722, + "step": 106990 + }, + { + "epoch": 0.6725197886590851, + "grad_norm": 6.111076354980469, + "learning_rate": 1.5521529215526852e-05, + "loss": 1.8558, + "step": 107000 + }, + { + "epoch": 0.6725826409757822, + "grad_norm": 6.492177963256836, + "learning_rate": 1.55211101145822e-05, + "loss": 1.3883, + "step": 107010 + }, + { + "epoch": 0.6726454932924794, + "grad_norm": 6.950572490692139, + "learning_rate": 1.5520691013637546e-05, + "loss": 1.6791, + "step": 107020 + }, + { + "epoch": 0.6727083456091765, + "grad_norm": 7.7460103034973145, + "learning_rate": 1.5520271912692893e-05, + "loss": 1.6308, + "step": 107030 + }, + { + "epoch": 0.6727711979258736, + "grad_norm": 6.830924987792969, + "learning_rate": 1.551985281174824e-05, + "loss": 1.7531, + "step": 107040 + }, + { + "epoch": 0.6728340502425707, + "grad_norm": 6.992501258850098, + "learning_rate": 1.5519433710803587e-05, + "loss": 1.7381, + "step": 107050 + }, + { + "epoch": 0.6728969025592678, + "grad_norm": 6.135435581207275, + "learning_rate": 1.551901460985893e-05, + "loss": 1.5797, + "step": 107060 + }, + { + "epoch": 0.6729597548759649, + "grad_norm": 5.973193645477295, + "learning_rate": 1.5518595508914278e-05, + "loss": 1.7572, + "step": 107070 + }, + { + "epoch": 0.6730226071926619, + "grad_norm": 7.702726364135742, + "learning_rate": 1.5518176407969625e-05, + "loss": 1.598, + "step": 107080 + }, + { + "epoch": 0.673085459509359, + "grad_norm": 6.913188934326172, + 
"learning_rate": 1.5517757307024972e-05, + "loss": 1.4042, + "step": 107090 + }, + { + "epoch": 0.6731483118260562, + "grad_norm": 7.558887004852295, + "learning_rate": 1.5517338206080316e-05, + "loss": 1.7481, + "step": 107100 + }, + { + "epoch": 0.6732111641427533, + "grad_norm": 7.296529293060303, + "learning_rate": 1.5516919105135663e-05, + "loss": 1.4847, + "step": 107110 + }, + { + "epoch": 0.6732740164594504, + "grad_norm": 6.7288289070129395, + "learning_rate": 1.551650000419101e-05, + "loss": 1.6209, + "step": 107120 + }, + { + "epoch": 0.6733368687761475, + "grad_norm": 7.862184524536133, + "learning_rate": 1.5516080903246357e-05, + "loss": 1.6374, + "step": 107130 + }, + { + "epoch": 0.6733997210928446, + "grad_norm": 7.252203464508057, + "learning_rate": 1.5515661802301704e-05, + "loss": 1.5912, + "step": 107140 + }, + { + "epoch": 0.6734625734095417, + "grad_norm": 6.187528133392334, + "learning_rate": 1.5515242701357048e-05, + "loss": 1.3804, + "step": 107150 + }, + { + "epoch": 0.6735254257262389, + "grad_norm": 6.828664779663086, + "learning_rate": 1.5514823600412395e-05, + "loss": 1.6811, + "step": 107160 + }, + { + "epoch": 0.673588278042936, + "grad_norm": 6.430910587310791, + "learning_rate": 1.5514404499467742e-05, + "loss": 1.6207, + "step": 107170 + }, + { + "epoch": 0.6736511303596331, + "grad_norm": 7.479432106018066, + "learning_rate": 1.551398539852309e-05, + "loss": 1.7641, + "step": 107180 + }, + { + "epoch": 0.6737139826763302, + "grad_norm": 7.683620929718018, + "learning_rate": 1.5513566297578436e-05, + "loss": 1.5278, + "step": 107190 + }, + { + "epoch": 0.6737768349930273, + "grad_norm": 6.850433349609375, + "learning_rate": 1.5513147196633783e-05, + "loss": 1.6015, + "step": 107200 + }, + { + "epoch": 0.6738396873097244, + "grad_norm": 6.360098361968994, + "learning_rate": 1.551272809568913e-05, + "loss": 1.5139, + "step": 107210 + }, + { + "epoch": 0.6739025396264215, + "grad_norm": 7.004693031311035, + "learning_rate": 
1.5512308994744478e-05, + "loss": 1.7587, + "step": 107220 + }, + { + "epoch": 0.6739653919431187, + "grad_norm": 6.2940778732299805, + "learning_rate": 1.551188989379982e-05, + "loss": 1.5376, + "step": 107230 + }, + { + "epoch": 0.6740282442598158, + "grad_norm": 6.842554092407227, + "learning_rate": 1.5511470792855168e-05, + "loss": 1.6649, + "step": 107240 + }, + { + "epoch": 0.6740910965765129, + "grad_norm": 7.063696384429932, + "learning_rate": 1.5511051691910515e-05, + "loss": 1.5728, + "step": 107250 + }, + { + "epoch": 0.67415394889321, + "grad_norm": 5.923314094543457, + "learning_rate": 1.5510632590965862e-05, + "loss": 1.8809, + "step": 107260 + }, + { + "epoch": 0.6742168012099071, + "grad_norm": 5.884190082550049, + "learning_rate": 1.551021349002121e-05, + "loss": 1.6403, + "step": 107270 + }, + { + "epoch": 0.6742796535266042, + "grad_norm": 6.68449592590332, + "learning_rate": 1.5509794389076553e-05, + "loss": 1.7118, + "step": 107280 + }, + { + "epoch": 0.6743425058433014, + "grad_norm": 6.421838283538818, + "learning_rate": 1.55093752881319e-05, + "loss": 1.6536, + "step": 107290 + }, + { + "epoch": 0.6744053581599985, + "grad_norm": 5.841495990753174, + "learning_rate": 1.5508956187187247e-05, + "loss": 1.6271, + "step": 107300 + }, + { + "epoch": 0.6744682104766956, + "grad_norm": 7.0780720710754395, + "learning_rate": 1.5508537086242594e-05, + "loss": 1.7483, + "step": 107310 + }, + { + "epoch": 0.6745310627933927, + "grad_norm": 6.905803680419922, + "learning_rate": 1.5508117985297938e-05, + "loss": 1.9378, + "step": 107320 + }, + { + "epoch": 0.6745939151100898, + "grad_norm": 6.076014518737793, + "learning_rate": 1.5507698884353285e-05, + "loss": 1.6559, + "step": 107330 + }, + { + "epoch": 0.6746567674267868, + "grad_norm": 6.738631725311279, + "learning_rate": 1.5507279783408632e-05, + "loss": 1.6098, + "step": 107340 + }, + { + "epoch": 0.6747196197434839, + "grad_norm": 7.530428409576416, + "learning_rate": 1.550686068246398e-05, + 
"loss": 1.9081, + "step": 107350 + }, + { + "epoch": 0.674782472060181, + "grad_norm": 6.086501121520996, + "learning_rate": 1.5506441581519326e-05, + "loss": 1.5324, + "step": 107360 + }, + { + "epoch": 0.6748453243768782, + "grad_norm": 6.828165054321289, + "learning_rate": 1.5506022480574673e-05, + "loss": 1.6179, + "step": 107370 + }, + { + "epoch": 0.6749081766935753, + "grad_norm": 7.415923118591309, + "learning_rate": 1.5505603379630017e-05, + "loss": 1.6776, + "step": 107380 + }, + { + "epoch": 0.6749710290102724, + "grad_norm": 5.763005256652832, + "learning_rate": 1.5505184278685364e-05, + "loss": 1.6505, + "step": 107390 + }, + { + "epoch": 0.6750338813269695, + "grad_norm": 7.107841491699219, + "learning_rate": 1.550476517774071e-05, + "loss": 1.5215, + "step": 107400 + }, + { + "epoch": 0.6750967336436666, + "grad_norm": 5.866076469421387, + "learning_rate": 1.5504346076796058e-05, + "loss": 1.4398, + "step": 107410 + }, + { + "epoch": 0.6751595859603637, + "grad_norm": 6.280481815338135, + "learning_rate": 1.5503926975851405e-05, + "loss": 1.8825, + "step": 107420 + }, + { + "epoch": 0.6752224382770609, + "grad_norm": 6.061824798583984, + "learning_rate": 1.5503507874906752e-05, + "loss": 1.4968, + "step": 107430 + }, + { + "epoch": 0.675285290593758, + "grad_norm": 5.807056427001953, + "learning_rate": 1.55030887739621e-05, + "loss": 1.8132, + "step": 107440 + }, + { + "epoch": 0.6753481429104551, + "grad_norm": 6.263597011566162, + "learning_rate": 1.5502669673017447e-05, + "loss": 1.5875, + "step": 107450 + }, + { + "epoch": 0.6754109952271522, + "grad_norm": 6.837041854858398, + "learning_rate": 1.550225057207279e-05, + "loss": 1.7016, + "step": 107460 + }, + { + "epoch": 0.6754738475438493, + "grad_norm": 7.085463047027588, + "learning_rate": 1.5501831471128137e-05, + "loss": 1.7122, + "step": 107470 + }, + { + "epoch": 0.6755366998605464, + "grad_norm": 5.949290752410889, + "learning_rate": 1.5501412370183484e-05, + "loss": 1.6366, + "step": 
107480 + }, + { + "epoch": 0.6755995521772435, + "grad_norm": 6.12795877456665, + "learning_rate": 1.550099326923883e-05, + "loss": 1.5677, + "step": 107490 + }, + { + "epoch": 0.6756624044939407, + "grad_norm": 6.4892778396606445, + "learning_rate": 1.5500574168294175e-05, + "loss": 1.5485, + "step": 107500 + }, + { + "epoch": 0.6757252568106378, + "grad_norm": 7.078492164611816, + "learning_rate": 1.5500155067349522e-05, + "loss": 1.6116, + "step": 107510 + }, + { + "epoch": 0.6757881091273349, + "grad_norm": 6.096056938171387, + "learning_rate": 1.549973596640487e-05, + "loss": 1.5864, + "step": 107520 + }, + { + "epoch": 0.675850961444032, + "grad_norm": 6.814564228057861, + "learning_rate": 1.5499316865460216e-05, + "loss": 1.4588, + "step": 107530 + }, + { + "epoch": 0.6759138137607291, + "grad_norm": 7.397793292999268, + "learning_rate": 1.549889776451556e-05, + "loss": 1.6345, + "step": 107540 + }, + { + "epoch": 0.6759766660774262, + "grad_norm": 6.652068138122559, + "learning_rate": 1.5498478663570907e-05, + "loss": 1.7347, + "step": 107550 + }, + { + "epoch": 0.6760395183941234, + "grad_norm": 7.2200822830200195, + "learning_rate": 1.5498059562626254e-05, + "loss": 1.598, + "step": 107560 + }, + { + "epoch": 0.6761023707108205, + "grad_norm": 5.757396221160889, + "learning_rate": 1.54976404616816e-05, + "loss": 1.6958, + "step": 107570 + }, + { + "epoch": 0.6761652230275176, + "grad_norm": 6.409972667694092, + "learning_rate": 1.549722136073695e-05, + "loss": 1.6045, + "step": 107580 + }, + { + "epoch": 0.6762280753442146, + "grad_norm": 5.316153526306152, + "learning_rate": 1.5496802259792295e-05, + "loss": 1.4484, + "step": 107590 + }, + { + "epoch": 0.6762909276609117, + "grad_norm": 6.884145259857178, + "learning_rate": 1.5496383158847642e-05, + "loss": 1.8094, + "step": 107600 + }, + { + "epoch": 0.6763537799776088, + "grad_norm": 6.612267017364502, + "learning_rate": 1.549596405790299e-05, + "loss": 1.6474, + "step": 107610 + }, + { + "epoch": 
0.6764166322943059, + "grad_norm": 6.848570823669434, + "learning_rate": 1.5495544956958337e-05, + "loss": 1.6618, + "step": 107620 + }, + { + "epoch": 0.676479484611003, + "grad_norm": 6.698587417602539, + "learning_rate": 1.549512585601368e-05, + "loss": 1.681, + "step": 107630 + }, + { + "epoch": 0.6765423369277002, + "grad_norm": 6.573361873626709, + "learning_rate": 1.5494706755069027e-05, + "loss": 1.6353, + "step": 107640 + }, + { + "epoch": 0.6766051892443973, + "grad_norm": 6.124057769775391, + "learning_rate": 1.5494287654124374e-05, + "loss": 1.5539, + "step": 107650 + }, + { + "epoch": 0.6766680415610944, + "grad_norm": 7.117899417877197, + "learning_rate": 1.549386855317972e-05, + "loss": 1.4745, + "step": 107660 + }, + { + "epoch": 0.6767308938777915, + "grad_norm": 6.080042839050293, + "learning_rate": 1.549344945223507e-05, + "loss": 1.805, + "step": 107670 + }, + { + "epoch": 0.6767937461944886, + "grad_norm": 7.0961174964904785, + "learning_rate": 1.5493030351290412e-05, + "loss": 1.6328, + "step": 107680 + }, + { + "epoch": 0.6768565985111857, + "grad_norm": 7.08535623550415, + "learning_rate": 1.549261125034576e-05, + "loss": 1.7396, + "step": 107690 + }, + { + "epoch": 0.6769194508278829, + "grad_norm": 6.2137837409973145, + "learning_rate": 1.5492192149401106e-05, + "loss": 1.7535, + "step": 107700 + }, + { + "epoch": 0.67698230314458, + "grad_norm": 6.663417339324951, + "learning_rate": 1.5491773048456453e-05, + "loss": 1.8152, + "step": 107710 + }, + { + "epoch": 0.6770451554612771, + "grad_norm": 7.089684963226318, + "learning_rate": 1.5491353947511797e-05, + "loss": 1.7627, + "step": 107720 + }, + { + "epoch": 0.6771080077779742, + "grad_norm": 7.1028218269348145, + "learning_rate": 1.5490934846567144e-05, + "loss": 1.5991, + "step": 107730 + }, + { + "epoch": 0.6771708600946713, + "grad_norm": 6.861858367919922, + "learning_rate": 1.549051574562249e-05, + "loss": 1.4481, + "step": 107740 + }, + { + "epoch": 0.6772337124113684, + 
"grad_norm": 7.217982292175293, + "learning_rate": 1.549009664467784e-05, + "loss": 1.6189, + "step": 107750 + }, + { + "epoch": 0.6772965647280655, + "grad_norm": 6.6078972816467285, + "learning_rate": 1.5489677543733185e-05, + "loss": 1.7096, + "step": 107760 + }, + { + "epoch": 0.6773594170447627, + "grad_norm": 6.024550914764404, + "learning_rate": 1.548925844278853e-05, + "loss": 1.777, + "step": 107770 + }, + { + "epoch": 0.6774222693614598, + "grad_norm": 5.867654323577881, + "learning_rate": 1.5488839341843876e-05, + "loss": 1.4508, + "step": 107780 + }, + { + "epoch": 0.6774851216781569, + "grad_norm": 6.194303035736084, + "learning_rate": 1.5488420240899223e-05, + "loss": 1.7473, + "step": 107790 + }, + { + "epoch": 0.677547973994854, + "grad_norm": 7.7232160568237305, + "learning_rate": 1.548800113995457e-05, + "loss": 1.4863, + "step": 107800 + }, + { + "epoch": 0.6776108263115511, + "grad_norm": 6.032736778259277, + "learning_rate": 1.5487582039009917e-05, + "loss": 1.5798, + "step": 107810 + }, + { + "epoch": 0.6776736786282482, + "grad_norm": 7.0337114334106445, + "learning_rate": 1.5487162938065264e-05, + "loss": 1.5471, + "step": 107820 + }, + { + "epoch": 0.6777365309449453, + "grad_norm": 6.021357536315918, + "learning_rate": 1.548674383712061e-05, + "loss": 1.4817, + "step": 107830 + }, + { + "epoch": 0.6777993832616425, + "grad_norm": 5.410083770751953, + "learning_rate": 1.548632473617596e-05, + "loss": 1.6508, + "step": 107840 + }, + { + "epoch": 0.6778622355783395, + "grad_norm": 7.354559421539307, + "learning_rate": 1.5485905635231302e-05, + "loss": 1.5473, + "step": 107850 + }, + { + "epoch": 0.6779250878950366, + "grad_norm": 6.202705383300781, + "learning_rate": 1.548548653428665e-05, + "loss": 1.609, + "step": 107860 + }, + { + "epoch": 0.6779879402117337, + "grad_norm": 6.383668899536133, + "learning_rate": 1.5485067433341996e-05, + "loss": 1.8578, + "step": 107870 + }, + { + "epoch": 0.6780507925284308, + "grad_norm": 
7.749847888946533, + "learning_rate": 1.5484648332397344e-05, + "loss": 1.6429, + "step": 107880 + }, + { + "epoch": 0.6781136448451279, + "grad_norm": 5.802626132965088, + "learning_rate": 1.548422923145269e-05, + "loss": 1.5985, + "step": 107890 + }, + { + "epoch": 0.678176497161825, + "grad_norm": 5.312656402587891, + "learning_rate": 1.5483810130508034e-05, + "loss": 1.702, + "step": 107900 + }, + { + "epoch": 0.6782393494785222, + "grad_norm": 6.155538082122803, + "learning_rate": 1.548339102956338e-05, + "loss": 1.6247, + "step": 107910 + }, + { + "epoch": 0.6783022017952193, + "grad_norm": 6.4692816734313965, + "learning_rate": 1.548297192861873e-05, + "loss": 1.6866, + "step": 107920 + }, + { + "epoch": 0.6783650541119164, + "grad_norm": 6.633361339569092, + "learning_rate": 1.5482552827674075e-05, + "loss": 1.8328, + "step": 107930 + }, + { + "epoch": 0.6784279064286135, + "grad_norm": 6.464073181152344, + "learning_rate": 1.548213372672942e-05, + "loss": 1.585, + "step": 107940 + }, + { + "epoch": 0.6784907587453106, + "grad_norm": 7.110156059265137, + "learning_rate": 1.5481714625784766e-05, + "loss": 1.6967, + "step": 107950 + }, + { + "epoch": 0.6785536110620077, + "grad_norm": 6.3033318519592285, + "learning_rate": 1.5481295524840113e-05, + "loss": 1.5713, + "step": 107960 + }, + { + "epoch": 0.6786164633787048, + "grad_norm": 6.375494480133057, + "learning_rate": 1.548087642389546e-05, + "loss": 1.6137, + "step": 107970 + }, + { + "epoch": 0.678679315695402, + "grad_norm": 7.028976917266846, + "learning_rate": 1.5480457322950807e-05, + "loss": 1.5442, + "step": 107980 + }, + { + "epoch": 0.6787421680120991, + "grad_norm": 6.737597465515137, + "learning_rate": 1.5480038222006155e-05, + "loss": 1.448, + "step": 107990 + }, + { + "epoch": 0.6788050203287962, + "grad_norm": 5.59172248840332, + "learning_rate": 1.54796191210615e-05, + "loss": 1.545, + "step": 108000 + }, + { + "epoch": 0.6788678726454933, + "grad_norm": 6.734803199768066, + 
"learning_rate": 1.5479200020116845e-05, + "loss": 1.7618, + "step": 108010 + }, + { + "epoch": 0.6789307249621904, + "grad_norm": 6.651988983154297, + "learning_rate": 1.5478780919172192e-05, + "loss": 1.575, + "step": 108020 + }, + { + "epoch": 0.6789935772788875, + "grad_norm": 7.12288236618042, + "learning_rate": 1.547836181822754e-05, + "loss": 1.5762, + "step": 108030 + }, + { + "epoch": 0.6790564295955847, + "grad_norm": 5.929279804229736, + "learning_rate": 1.5477942717282886e-05, + "loss": 1.6749, + "step": 108040 + }, + { + "epoch": 0.6791192819122818, + "grad_norm": 6.111502647399902, + "learning_rate": 1.5477523616338234e-05, + "loss": 1.5812, + "step": 108050 + }, + { + "epoch": 0.6791821342289789, + "grad_norm": 5.756034851074219, + "learning_rate": 1.547710451539358e-05, + "loss": 1.6907, + "step": 108060 + }, + { + "epoch": 0.679244986545676, + "grad_norm": 5.800210952758789, + "learning_rate": 1.5476685414448928e-05, + "loss": 1.7198, + "step": 108070 + }, + { + "epoch": 0.6793078388623731, + "grad_norm": 6.871427536010742, + "learning_rate": 1.547626631350427e-05, + "loss": 1.72, + "step": 108080 + }, + { + "epoch": 0.6793706911790702, + "grad_norm": 5.799198150634766, + "learning_rate": 1.547584721255962e-05, + "loss": 1.7499, + "step": 108090 + }, + { + "epoch": 0.6794335434957673, + "grad_norm": 6.329927921295166, + "learning_rate": 1.5475428111614966e-05, + "loss": 1.6051, + "step": 108100 + }, + { + "epoch": 0.6794963958124643, + "grad_norm": 7.542634963989258, + "learning_rate": 1.5475009010670313e-05, + "loss": 1.5641, + "step": 108110 + }, + { + "epoch": 0.6795592481291615, + "grad_norm": 7.232003211975098, + "learning_rate": 1.5474589909725656e-05, + "loss": 1.6142, + "step": 108120 + }, + { + "epoch": 0.6796221004458586, + "grad_norm": 5.938115119934082, + "learning_rate": 1.5474170808781003e-05, + "loss": 1.7776, + "step": 108130 + }, + { + "epoch": 0.6796849527625557, + "grad_norm": 6.24160099029541, + "learning_rate": 
1.547375170783635e-05, + "loss": 1.5408, + "step": 108140 + }, + { + "epoch": 0.6797478050792528, + "grad_norm": 6.338478088378906, + "learning_rate": 1.5473332606891697e-05, + "loss": 1.5938, + "step": 108150 + }, + { + "epoch": 0.6798106573959499, + "grad_norm": 6.602304935455322, + "learning_rate": 1.5472913505947045e-05, + "loss": 1.5456, + "step": 108160 + }, + { + "epoch": 0.679873509712647, + "grad_norm": 6.956899166107178, + "learning_rate": 1.5472494405002388e-05, + "loss": 1.6943, + "step": 108170 + }, + { + "epoch": 0.6799363620293442, + "grad_norm": 7.508603096008301, + "learning_rate": 1.5472075304057735e-05, + "loss": 1.5053, + "step": 108180 + }, + { + "epoch": 0.6799992143460413, + "grad_norm": 7.045706272125244, + "learning_rate": 1.5471656203113082e-05, + "loss": 1.6037, + "step": 108190 + }, + { + "epoch": 0.6800620666627384, + "grad_norm": 7.3159871101379395, + "learning_rate": 1.547123710216843e-05, + "loss": 1.6272, + "step": 108200 + }, + { + "epoch": 0.6801249189794355, + "grad_norm": 6.823150634765625, + "learning_rate": 1.5470818001223777e-05, + "loss": 1.6574, + "step": 108210 + }, + { + "epoch": 0.6801877712961326, + "grad_norm": 6.396905422210693, + "learning_rate": 1.5470398900279124e-05, + "loss": 1.7211, + "step": 108220 + }, + { + "epoch": 0.6802506236128297, + "grad_norm": 5.912825107574463, + "learning_rate": 1.546997979933447e-05, + "loss": 1.5153, + "step": 108230 + }, + { + "epoch": 0.6803134759295268, + "grad_norm": 6.832945346832275, + "learning_rate": 1.5469560698389818e-05, + "loss": 1.4702, + "step": 108240 + }, + { + "epoch": 0.680376328246224, + "grad_norm": 6.325376987457275, + "learning_rate": 1.546914159744516e-05, + "loss": 1.6689, + "step": 108250 + }, + { + "epoch": 0.6804391805629211, + "grad_norm": 6.241364479064941, + "learning_rate": 1.546872249650051e-05, + "loss": 1.6209, + "step": 108260 + }, + { + "epoch": 0.6805020328796182, + "grad_norm": 6.7398905754089355, + "learning_rate": 1.5468303395555856e-05, + 
"loss": 1.6831, + "step": 108270 + }, + { + "epoch": 0.6805648851963153, + "grad_norm": 6.899637699127197, + "learning_rate": 1.5467884294611203e-05, + "loss": 1.7488, + "step": 108280 + }, + { + "epoch": 0.6806277375130124, + "grad_norm": 6.30224609375, + "learning_rate": 1.546746519366655e-05, + "loss": 1.6354, + "step": 108290 + }, + { + "epoch": 0.6806905898297095, + "grad_norm": 6.6264495849609375, + "learning_rate": 1.5467046092721893e-05, + "loss": 1.7231, + "step": 108300 + }, + { + "epoch": 0.6807534421464067, + "grad_norm": 6.702651500701904, + "learning_rate": 1.546662699177724e-05, + "loss": 1.5692, + "step": 108310 + }, + { + "epoch": 0.6808162944631038, + "grad_norm": 7.259090423583984, + "learning_rate": 1.5466207890832588e-05, + "loss": 1.7154, + "step": 108320 + }, + { + "epoch": 0.6808791467798009, + "grad_norm": 7.181676387786865, + "learning_rate": 1.5465788789887935e-05, + "loss": 1.7076, + "step": 108330 + }, + { + "epoch": 0.680941999096498, + "grad_norm": 7.554019927978516, + "learning_rate": 1.5465369688943278e-05, + "loss": 1.5567, + "step": 108340 + }, + { + "epoch": 0.6810048514131951, + "grad_norm": 6.8995137214660645, + "learning_rate": 1.5464950587998625e-05, + "loss": 1.6748, + "step": 108350 + }, + { + "epoch": 0.6810677037298921, + "grad_norm": 6.453468322753906, + "learning_rate": 1.5464531487053972e-05, + "loss": 1.4256, + "step": 108360 + }, + { + "epoch": 0.6811305560465892, + "grad_norm": 7.019639015197754, + "learning_rate": 1.546411238610932e-05, + "loss": 1.451, + "step": 108370 + }, + { + "epoch": 0.6811934083632863, + "grad_norm": 7.011302947998047, + "learning_rate": 1.5463693285164667e-05, + "loss": 1.4287, + "step": 108380 + }, + { + "epoch": 0.6812562606799835, + "grad_norm": 6.763635635375977, + "learning_rate": 1.546327418422001e-05, + "loss": 1.6974, + "step": 108390 + }, + { + "epoch": 0.6813191129966806, + "grad_norm": 6.361536026000977, + "learning_rate": 1.5462855083275357e-05, + "loss": 1.6616, + "step": 
108400 + }, + { + "epoch": 0.6813819653133777, + "grad_norm": 7.214605808258057, + "learning_rate": 1.5462435982330704e-05, + "loss": 1.4605, + "step": 108410 + }, + { + "epoch": 0.6814448176300748, + "grad_norm": 7.430278778076172, + "learning_rate": 1.546201688138605e-05, + "loss": 1.5119, + "step": 108420 + }, + { + "epoch": 0.6815076699467719, + "grad_norm": 5.081109046936035, + "learning_rate": 1.54615977804414e-05, + "loss": 1.7151, + "step": 108430 + }, + { + "epoch": 0.681570522263469, + "grad_norm": 6.070030212402344, + "learning_rate": 1.5461178679496746e-05, + "loss": 1.617, + "step": 108440 + }, + { + "epoch": 0.6816333745801662, + "grad_norm": 7.476914405822754, + "learning_rate": 1.5460759578552093e-05, + "loss": 1.921, + "step": 108450 + }, + { + "epoch": 0.6816962268968633, + "grad_norm": 5.63602352142334, + "learning_rate": 1.546034047760744e-05, + "loss": 1.575, + "step": 108460 + }, + { + "epoch": 0.6817590792135604, + "grad_norm": 6.046362400054932, + "learning_rate": 1.5459921376662787e-05, + "loss": 1.6911, + "step": 108470 + }, + { + "epoch": 0.6818219315302575, + "grad_norm": 7.255295276641846, + "learning_rate": 1.545950227571813e-05, + "loss": 1.6588, + "step": 108480 + }, + { + "epoch": 0.6818847838469546, + "grad_norm": 6.208819389343262, + "learning_rate": 1.5459083174773478e-05, + "loss": 1.5616, + "step": 108490 + }, + { + "epoch": 0.6819476361636517, + "grad_norm": 6.300037384033203, + "learning_rate": 1.5458664073828825e-05, + "loss": 1.5962, + "step": 108500 + }, + { + "epoch": 0.6820104884803488, + "grad_norm": 6.827779769897461, + "learning_rate": 1.545824497288417e-05, + "loss": 1.7844, + "step": 108510 + }, + { + "epoch": 0.682073340797046, + "grad_norm": 6.917608737945557, + "learning_rate": 1.5457825871939515e-05, + "loss": 1.5409, + "step": 108520 + }, + { + "epoch": 0.6821361931137431, + "grad_norm": 6.608980655670166, + "learning_rate": 1.5457406770994862e-05, + "loss": 1.4922, + "step": 108530 + }, + { + "epoch": 
0.6821990454304402, + "grad_norm": 7.339383125305176, + "learning_rate": 1.545698767005021e-05, + "loss": 1.4428, + "step": 108540 + }, + { + "epoch": 0.6822618977471373, + "grad_norm": 6.288963794708252, + "learning_rate": 1.5456568569105557e-05, + "loss": 1.4886, + "step": 108550 + }, + { + "epoch": 0.6823247500638344, + "grad_norm": 7.163339138031006, + "learning_rate": 1.54561494681609e-05, + "loss": 1.6897, + "step": 108560 + }, + { + "epoch": 0.6823876023805315, + "grad_norm": 6.777314186096191, + "learning_rate": 1.5455730367216247e-05, + "loss": 1.6044, + "step": 108570 + }, + { + "epoch": 0.6824504546972286, + "grad_norm": 5.41243839263916, + "learning_rate": 1.5455311266271594e-05, + "loss": 1.8293, + "step": 108580 + }, + { + "epoch": 0.6825133070139258, + "grad_norm": 6.540939807891846, + "learning_rate": 1.545489216532694e-05, + "loss": 1.6995, + "step": 108590 + }, + { + "epoch": 0.6825761593306229, + "grad_norm": 8.392694473266602, + "learning_rate": 1.545447306438229e-05, + "loss": 1.6126, + "step": 108600 + }, + { + "epoch": 0.68263901164732, + "grad_norm": 5.375041961669922, + "learning_rate": 1.5454053963437636e-05, + "loss": 1.3655, + "step": 108610 + }, + { + "epoch": 0.682701863964017, + "grad_norm": 6.172969341278076, + "learning_rate": 1.5453634862492983e-05, + "loss": 1.7403, + "step": 108620 + }, + { + "epoch": 0.6827647162807141, + "grad_norm": 6.109135627746582, + "learning_rate": 1.5453215761548326e-05, + "loss": 1.3881, + "step": 108630 + }, + { + "epoch": 0.6828275685974112, + "grad_norm": 5.76917839050293, + "learning_rate": 1.5452796660603673e-05, + "loss": 1.5411, + "step": 108640 + }, + { + "epoch": 0.6828904209141083, + "grad_norm": 6.600244522094727, + "learning_rate": 1.545237755965902e-05, + "loss": 1.8191, + "step": 108650 + }, + { + "epoch": 0.6829532732308055, + "grad_norm": 6.708452224731445, + "learning_rate": 1.5451958458714368e-05, + "loss": 1.64, + "step": 108660 + }, + { + "epoch": 0.6830161255475026, + "grad_norm": 
7.054609775543213, + "learning_rate": 1.5451539357769715e-05, + "loss": 1.6014, + "step": 108670 + }, + { + "epoch": 0.6830789778641997, + "grad_norm": 7.061050891876221, + "learning_rate": 1.5451120256825062e-05, + "loss": 1.504, + "step": 108680 + }, + { + "epoch": 0.6831418301808968, + "grad_norm": 6.879281520843506, + "learning_rate": 1.545070115588041e-05, + "loss": 1.7542, + "step": 108690 + }, + { + "epoch": 0.6832046824975939, + "grad_norm": 6.8982343673706055, + "learning_rate": 1.5450282054935752e-05, + "loss": 1.7823, + "step": 108700 + }, + { + "epoch": 0.683267534814291, + "grad_norm": 5.512999057769775, + "learning_rate": 1.54498629539911e-05, + "loss": 1.6196, + "step": 108710 + }, + { + "epoch": 0.6833303871309881, + "grad_norm": 6.384003639221191, + "learning_rate": 1.5449443853046447e-05, + "loss": 1.5271, + "step": 108720 + }, + { + "epoch": 0.6833932394476853, + "grad_norm": 7.060078144073486, + "learning_rate": 1.5449024752101794e-05, + "loss": 1.4296, + "step": 108730 + }, + { + "epoch": 0.6834560917643824, + "grad_norm": 6.49919319152832, + "learning_rate": 1.5448605651157137e-05, + "loss": 1.4782, + "step": 108740 + }, + { + "epoch": 0.6835189440810795, + "grad_norm": 4.964798927307129, + "learning_rate": 1.5448186550212484e-05, + "loss": 1.3391, + "step": 108750 + }, + { + "epoch": 0.6835817963977766, + "grad_norm": 6.3799729347229, + "learning_rate": 1.544776744926783e-05, + "loss": 1.3969, + "step": 108760 + }, + { + "epoch": 0.6836446487144737, + "grad_norm": 6.024362564086914, + "learning_rate": 1.544734834832318e-05, + "loss": 1.5368, + "step": 108770 + }, + { + "epoch": 0.6837075010311708, + "grad_norm": 6.578910827636719, + "learning_rate": 1.5446929247378526e-05, + "loss": 1.7696, + "step": 108780 + }, + { + "epoch": 0.683770353347868, + "grad_norm": 6.558101177215576, + "learning_rate": 1.544651014643387e-05, + "loss": 1.686, + "step": 108790 + }, + { + "epoch": 0.6838332056645651, + "grad_norm": 6.501033306121826, + 
"learning_rate": 1.5446091045489216e-05, + "loss": 1.6505, + "step": 108800 + }, + { + "epoch": 0.6838960579812622, + "grad_norm": 6.418697834014893, + "learning_rate": 1.5445671944544563e-05, + "loss": 1.5026, + "step": 108810 + }, + { + "epoch": 0.6839589102979593, + "grad_norm": 7.684604167938232, + "learning_rate": 1.544525284359991e-05, + "loss": 1.7611, + "step": 108820 + }, + { + "epoch": 0.6840217626146564, + "grad_norm": 5.792769432067871, + "learning_rate": 1.5444833742655258e-05, + "loss": 1.7072, + "step": 108830 + }, + { + "epoch": 0.6840846149313535, + "grad_norm": 6.451221942901611, + "learning_rate": 1.5444414641710605e-05, + "loss": 1.5386, + "step": 108840 + }, + { + "epoch": 0.6841474672480506, + "grad_norm": 6.220157146453857, + "learning_rate": 1.5443995540765952e-05, + "loss": 1.6949, + "step": 108850 + }, + { + "epoch": 0.6842103195647478, + "grad_norm": 6.748058319091797, + "learning_rate": 1.54435764398213e-05, + "loss": 1.5862, + "step": 108860 + }, + { + "epoch": 0.6842731718814448, + "grad_norm": 6.666525363922119, + "learning_rate": 1.5443157338876643e-05, + "loss": 1.4896, + "step": 108870 + }, + { + "epoch": 0.6843360241981419, + "grad_norm": 5.935793876647949, + "learning_rate": 1.544273823793199e-05, + "loss": 1.5257, + "step": 108880 + }, + { + "epoch": 0.684398876514839, + "grad_norm": 5.911357879638672, + "learning_rate": 1.5442319136987337e-05, + "loss": 1.6697, + "step": 108890 + }, + { + "epoch": 0.6844617288315361, + "grad_norm": 6.261645317077637, + "learning_rate": 1.5441900036042684e-05, + "loss": 1.4149, + "step": 108900 + }, + { + "epoch": 0.6845245811482332, + "grad_norm": 7.229828357696533, + "learning_rate": 1.544148093509803e-05, + "loss": 1.5759, + "step": 108910 + }, + { + "epoch": 0.6845874334649303, + "grad_norm": 6.908501625061035, + "learning_rate": 1.5441061834153374e-05, + "loss": 1.7641, + "step": 108920 + }, + { + "epoch": 0.6846502857816275, + "grad_norm": 6.935072898864746, + "learning_rate": 
1.544064273320872e-05, + "loss": 1.6499, + "step": 108930 + }, + { + "epoch": 0.6847131380983246, + "grad_norm": 6.077419757843018, + "learning_rate": 1.544022363226407e-05, + "loss": 1.7182, + "step": 108940 + }, + { + "epoch": 0.6847759904150217, + "grad_norm": 8.297808647155762, + "learning_rate": 1.5439804531319416e-05, + "loss": 1.6095, + "step": 108950 + }, + { + "epoch": 0.6848388427317188, + "grad_norm": 6.749807834625244, + "learning_rate": 1.543938543037476e-05, + "loss": 1.7973, + "step": 108960 + }, + { + "epoch": 0.6849016950484159, + "grad_norm": 6.1888837814331055, + "learning_rate": 1.5438966329430106e-05, + "loss": 1.5648, + "step": 108970 + }, + { + "epoch": 0.684964547365113, + "grad_norm": 5.637599945068359, + "learning_rate": 1.5438547228485454e-05, + "loss": 1.485, + "step": 108980 + }, + { + "epoch": 0.6850273996818101, + "grad_norm": 5.449834823608398, + "learning_rate": 1.54381281275408e-05, + "loss": 1.412, + "step": 108990 + }, + { + "epoch": 0.6850902519985073, + "grad_norm": 6.458430767059326, + "learning_rate": 1.5437709026596148e-05, + "loss": 1.7153, + "step": 109000 + }, + { + "epoch": 0.6851531043152044, + "grad_norm": 7.78114652633667, + "learning_rate": 1.543728992565149e-05, + "loss": 1.4838, + "step": 109010 + }, + { + "epoch": 0.6852159566319015, + "grad_norm": 6.260158538818359, + "learning_rate": 1.543687082470684e-05, + "loss": 1.5003, + "step": 109020 + }, + { + "epoch": 0.6852788089485986, + "grad_norm": 6.7112507820129395, + "learning_rate": 1.5436451723762185e-05, + "loss": 1.4536, + "step": 109030 + }, + { + "epoch": 0.6853416612652957, + "grad_norm": 7.102040767669678, + "learning_rate": 1.5436032622817533e-05, + "loss": 1.5841, + "step": 109040 + }, + { + "epoch": 0.6854045135819928, + "grad_norm": 7.388706684112549, + "learning_rate": 1.543561352187288e-05, + "loss": 1.7225, + "step": 109050 + }, + { + "epoch": 0.68546736589869, + "grad_norm": 6.066342830657959, + "learning_rate": 1.5435194420928227e-05, + "loss": 
1.434, + "step": 109060 + }, + { + "epoch": 0.6855302182153871, + "grad_norm": 6.081060409545898, + "learning_rate": 1.5434775319983574e-05, + "loss": 1.6384, + "step": 109070 + }, + { + "epoch": 0.6855930705320842, + "grad_norm": 7.088042259216309, + "learning_rate": 1.543435621903892e-05, + "loss": 1.7854, + "step": 109080 + }, + { + "epoch": 0.6856559228487813, + "grad_norm": 6.102912425994873, + "learning_rate": 1.5433937118094268e-05, + "loss": 1.8214, + "step": 109090 + }, + { + "epoch": 0.6857187751654784, + "grad_norm": 6.492491722106934, + "learning_rate": 1.543351801714961e-05, + "loss": 1.56, + "step": 109100 + }, + { + "epoch": 0.6857816274821755, + "grad_norm": 7.235417366027832, + "learning_rate": 1.543309891620496e-05, + "loss": 1.7355, + "step": 109110 + }, + { + "epoch": 0.6858444797988726, + "grad_norm": 6.384990692138672, + "learning_rate": 1.5432679815260306e-05, + "loss": 1.2698, + "step": 109120 + }, + { + "epoch": 0.6859073321155696, + "grad_norm": 6.30357027053833, + "learning_rate": 1.5432260714315653e-05, + "loss": 1.4585, + "step": 109130 + }, + { + "epoch": 0.6859701844322668, + "grad_norm": 7.86502742767334, + "learning_rate": 1.5431841613370996e-05, + "loss": 1.6503, + "step": 109140 + }, + { + "epoch": 0.6860330367489639, + "grad_norm": 7.20648193359375, + "learning_rate": 1.5431422512426344e-05, + "loss": 1.5368, + "step": 109150 + }, + { + "epoch": 0.686095889065661, + "grad_norm": 6.791904449462891, + "learning_rate": 1.543100341148169e-05, + "loss": 1.7353, + "step": 109160 + }, + { + "epoch": 0.6861587413823581, + "grad_norm": 6.168340682983398, + "learning_rate": 1.5430584310537038e-05, + "loss": 1.6341, + "step": 109170 + }, + { + "epoch": 0.6862215936990552, + "grad_norm": 6.9182000160217285, + "learning_rate": 1.543016520959238e-05, + "loss": 1.6674, + "step": 109180 + }, + { + "epoch": 0.6862844460157523, + "grad_norm": 6.65117883682251, + "learning_rate": 1.542974610864773e-05, + "loss": 1.8268, + "step": 109190 + }, + { + 
"epoch": 0.6863472983324495, + "grad_norm": 4.9110236167907715, + "learning_rate": 1.5429327007703076e-05, + "loss": 1.5559, + "step": 109200 + }, + { + "epoch": 0.6864101506491466, + "grad_norm": 6.228017807006836, + "learning_rate": 1.5428907906758423e-05, + "loss": 1.4989, + "step": 109210 + }, + { + "epoch": 0.6864730029658437, + "grad_norm": 7.477842807769775, + "learning_rate": 1.542848880581377e-05, + "loss": 1.5137, + "step": 109220 + }, + { + "epoch": 0.6865358552825408, + "grad_norm": 6.381414413452148, + "learning_rate": 1.5428069704869117e-05, + "loss": 1.4476, + "step": 109230 + }, + { + "epoch": 0.6865987075992379, + "grad_norm": 7.051868438720703, + "learning_rate": 1.5427650603924464e-05, + "loss": 1.6207, + "step": 109240 + }, + { + "epoch": 0.686661559915935, + "grad_norm": 6.752513408660889, + "learning_rate": 1.542723150297981e-05, + "loss": 1.6355, + "step": 109250 + }, + { + "epoch": 0.6867244122326321, + "grad_norm": 6.904550075531006, + "learning_rate": 1.5426812402035155e-05, + "loss": 1.8425, + "step": 109260 + }, + { + "epoch": 0.6867872645493293, + "grad_norm": 7.052554607391357, + "learning_rate": 1.54263933010905e-05, + "loss": 1.7125, + "step": 109270 + }, + { + "epoch": 0.6868501168660264, + "grad_norm": 7.331358909606934, + "learning_rate": 1.542597420014585e-05, + "loss": 1.6841, + "step": 109280 + }, + { + "epoch": 0.6869129691827235, + "grad_norm": 7.771301746368408, + "learning_rate": 1.5425555099201196e-05, + "loss": 1.7112, + "step": 109290 + }, + { + "epoch": 0.6869758214994206, + "grad_norm": 5.575971603393555, + "learning_rate": 1.5425135998256543e-05, + "loss": 1.4082, + "step": 109300 + }, + { + "epoch": 0.6870386738161177, + "grad_norm": 6.857352256774902, + "learning_rate": 1.542471689731189e-05, + "loss": 1.6333, + "step": 109310 + }, + { + "epoch": 0.6871015261328148, + "grad_norm": 6.6807732582092285, + "learning_rate": 1.5424297796367234e-05, + "loss": 1.6542, + "step": 109320 + }, + { + "epoch": 0.687164378449512, 
+ "grad_norm": 7.070662498474121, + "learning_rate": 1.542387869542258e-05, + "loss": 1.7512, + "step": 109330 + }, + { + "epoch": 0.6872272307662091, + "grad_norm": 5.746772289276123, + "learning_rate": 1.5423459594477928e-05, + "loss": 1.7782, + "step": 109340 + }, + { + "epoch": 0.6872900830829062, + "grad_norm": 6.7605390548706055, + "learning_rate": 1.5423040493533275e-05, + "loss": 1.6884, + "step": 109350 + }, + { + "epoch": 0.6873529353996033, + "grad_norm": 7.406123161315918, + "learning_rate": 1.542262139258862e-05, + "loss": 1.8441, + "step": 109360 + }, + { + "epoch": 0.6874157877163004, + "grad_norm": 5.762146472930908, + "learning_rate": 1.5422202291643966e-05, + "loss": 1.4661, + "step": 109370 + }, + { + "epoch": 0.6874786400329974, + "grad_norm": 6.332161903381348, + "learning_rate": 1.5421783190699313e-05, + "loss": 1.3427, + "step": 109380 + }, + { + "epoch": 0.6875414923496945, + "grad_norm": 7.25848388671875, + "learning_rate": 1.542136408975466e-05, + "loss": 1.7529, + "step": 109390 + }, + { + "epoch": 0.6876043446663916, + "grad_norm": 7.368528842926025, + "learning_rate": 1.5420944988810007e-05, + "loss": 2.0065, + "step": 109400 + }, + { + "epoch": 0.6876671969830888, + "grad_norm": 7.377408027648926, + "learning_rate": 1.542052588786535e-05, + "loss": 1.7156, + "step": 109410 + }, + { + "epoch": 0.6877300492997859, + "grad_norm": 6.124571323394775, + "learning_rate": 1.5420106786920698e-05, + "loss": 1.505, + "step": 109420 + }, + { + "epoch": 0.687792901616483, + "grad_norm": 6.970246315002441, + "learning_rate": 1.5419687685976045e-05, + "loss": 1.5657, + "step": 109430 + }, + { + "epoch": 0.6878557539331801, + "grad_norm": 7.313261032104492, + "learning_rate": 1.541926858503139e-05, + "loss": 1.6251, + "step": 109440 + }, + { + "epoch": 0.6879186062498772, + "grad_norm": 7.775289058685303, + "learning_rate": 1.541884948408674e-05, + "loss": 1.6768, + "step": 109450 + }, + { + "epoch": 0.6879814585665743, + "grad_norm": 
8.135762214660645, + "learning_rate": 1.5418430383142086e-05, + "loss": 1.8218, + "step": 109460 + }, + { + "epoch": 0.6880443108832714, + "grad_norm": 6.805774688720703, + "learning_rate": 1.5418011282197433e-05, + "loss": 1.3224, + "step": 109470 + }, + { + "epoch": 0.6881071631999686, + "grad_norm": 6.121889591217041, + "learning_rate": 1.541759218125278e-05, + "loss": 1.63, + "step": 109480 + }, + { + "epoch": 0.6881700155166657, + "grad_norm": 7.373378753662109, + "learning_rate": 1.5417173080308124e-05, + "loss": 1.6781, + "step": 109490 + }, + { + "epoch": 0.6882328678333628, + "grad_norm": 6.9453864097595215, + "learning_rate": 1.541675397936347e-05, + "loss": 1.6407, + "step": 109500 + }, + { + "epoch": 0.6882957201500599, + "grad_norm": 7.068073272705078, + "learning_rate": 1.5416334878418818e-05, + "loss": 1.6273, + "step": 109510 + }, + { + "epoch": 0.688358572466757, + "grad_norm": 8.614026069641113, + "learning_rate": 1.5415915777474165e-05, + "loss": 1.7811, + "step": 109520 + }, + { + "epoch": 0.6884214247834541, + "grad_norm": 7.80880069732666, + "learning_rate": 1.5415496676529512e-05, + "loss": 1.5961, + "step": 109530 + }, + { + "epoch": 0.6884842771001513, + "grad_norm": 6.941601753234863, + "learning_rate": 1.5415077575584856e-05, + "loss": 1.7483, + "step": 109540 + }, + { + "epoch": 0.6885471294168484, + "grad_norm": 6.181019306182861, + "learning_rate": 1.5414658474640203e-05, + "loss": 1.601, + "step": 109550 + }, + { + "epoch": 0.6886099817335455, + "grad_norm": 6.1155219078063965, + "learning_rate": 1.541423937369555e-05, + "loss": 1.7136, + "step": 109560 + }, + { + "epoch": 0.6886728340502426, + "grad_norm": 7.124586582183838, + "learning_rate": 1.5413820272750897e-05, + "loss": 1.5453, + "step": 109570 + }, + { + "epoch": 0.6887356863669397, + "grad_norm": 6.8544511795043945, + "learning_rate": 1.541340117180624e-05, + "loss": 1.5531, + "step": 109580 + }, + { + "epoch": 0.6887985386836368, + "grad_norm": 7.167695999145508, + 
"learning_rate": 1.5412982070861588e-05, + "loss": 1.6281, + "step": 109590 + }, + { + "epoch": 0.688861391000334, + "grad_norm": 6.573579788208008, + "learning_rate": 1.5412562969916935e-05, + "loss": 1.6064, + "step": 109600 + }, + { + "epoch": 0.6889242433170311, + "grad_norm": 6.685288429260254, + "learning_rate": 1.541214386897228e-05, + "loss": 1.5467, + "step": 109610 + }, + { + "epoch": 0.6889870956337282, + "grad_norm": 6.076452255249023, + "learning_rate": 1.541172476802763e-05, + "loss": 1.554, + "step": 109620 + }, + { + "epoch": 0.6890499479504253, + "grad_norm": 5.2713623046875, + "learning_rate": 1.5411305667082976e-05, + "loss": 1.489, + "step": 109630 + }, + { + "epoch": 0.6891128002671223, + "grad_norm": 5.7808966636657715, + "learning_rate": 1.541088656613832e-05, + "loss": 1.5726, + "step": 109640 + }, + { + "epoch": 0.6891756525838194, + "grad_norm": 7.051931858062744, + "learning_rate": 1.5410467465193667e-05, + "loss": 1.7635, + "step": 109650 + }, + { + "epoch": 0.6892385049005165, + "grad_norm": 7.634355545043945, + "learning_rate": 1.5410048364249014e-05, + "loss": 1.663, + "step": 109660 + }, + { + "epoch": 0.6893013572172136, + "grad_norm": 7.906497955322266, + "learning_rate": 1.540962926330436e-05, + "loss": 1.6142, + "step": 109670 + }, + { + "epoch": 0.6893642095339108, + "grad_norm": 7.191876411437988, + "learning_rate": 1.5409210162359708e-05, + "loss": 1.55, + "step": 109680 + }, + { + "epoch": 0.6894270618506079, + "grad_norm": 7.026230812072754, + "learning_rate": 1.5408791061415055e-05, + "loss": 1.6485, + "step": 109690 + }, + { + "epoch": 0.689489914167305, + "grad_norm": 6.550210475921631, + "learning_rate": 1.5408371960470402e-05, + "loss": 1.5969, + "step": 109700 + }, + { + "epoch": 0.6895527664840021, + "grad_norm": 6.096058368682861, + "learning_rate": 1.540795285952575e-05, + "loss": 1.6915, + "step": 109710 + }, + { + "epoch": 0.6896156188006992, + "grad_norm": 7.495835304260254, + "learning_rate": 
1.5407533758581093e-05, + "loss": 1.7327, + "step": 109720 + }, + { + "epoch": 0.6896784711173963, + "grad_norm": 6.865203380584717, + "learning_rate": 1.540711465763644e-05, + "loss": 1.6517, + "step": 109730 + }, + { + "epoch": 0.6897413234340934, + "grad_norm": 5.886893272399902, + "learning_rate": 1.5406695556691787e-05, + "loss": 1.6512, + "step": 109740 + }, + { + "epoch": 0.6898041757507906, + "grad_norm": 6.635234832763672, + "learning_rate": 1.5406276455747134e-05, + "loss": 1.6374, + "step": 109750 + }, + { + "epoch": 0.6898670280674877, + "grad_norm": 7.9032816886901855, + "learning_rate": 1.5405857354802478e-05, + "loss": 1.5856, + "step": 109760 + }, + { + "epoch": 0.6899298803841848, + "grad_norm": 5.6491379737854, + "learning_rate": 1.5405438253857825e-05, + "loss": 1.5709, + "step": 109770 + }, + { + "epoch": 0.6899927327008819, + "grad_norm": 5.710926532745361, + "learning_rate": 1.5405019152913172e-05, + "loss": 1.4562, + "step": 109780 + }, + { + "epoch": 0.690055585017579, + "grad_norm": 6.611181259155273, + "learning_rate": 1.540460005196852e-05, + "loss": 1.7436, + "step": 109790 + }, + { + "epoch": 0.6901184373342761, + "grad_norm": 6.378403663635254, + "learning_rate": 1.5404180951023862e-05, + "loss": 1.618, + "step": 109800 + }, + { + "epoch": 0.6901812896509733, + "grad_norm": 7.318225860595703, + "learning_rate": 1.540376185007921e-05, + "loss": 1.4601, + "step": 109810 + }, + { + "epoch": 0.6902441419676704, + "grad_norm": 6.113973140716553, + "learning_rate": 1.5403342749134557e-05, + "loss": 1.4868, + "step": 109820 + }, + { + "epoch": 0.6903069942843675, + "grad_norm": 6.777023792266846, + "learning_rate": 1.5402923648189904e-05, + "loss": 1.5545, + "step": 109830 + }, + { + "epoch": 0.6903698466010646, + "grad_norm": 7.185040473937988, + "learning_rate": 1.540250454724525e-05, + "loss": 1.7578, + "step": 109840 + }, + { + "epoch": 0.6904326989177617, + "grad_norm": 6.249362468719482, + "learning_rate": 1.5402085446300598e-05, + 
"loss": 1.7373, + "step": 109850 + }, + { + "epoch": 0.6904955512344588, + "grad_norm": 6.4665608406066895, + "learning_rate": 1.5401666345355945e-05, + "loss": 1.7623, + "step": 109860 + }, + { + "epoch": 0.6905584035511559, + "grad_norm": 7.5931267738342285, + "learning_rate": 1.5401247244411292e-05, + "loss": 1.5157, + "step": 109870 + }, + { + "epoch": 0.6906212558678531, + "grad_norm": 6.92786169052124, + "learning_rate": 1.5400828143466636e-05, + "loss": 1.569, + "step": 109880 + }, + { + "epoch": 0.6906841081845501, + "grad_norm": 6.552675724029541, + "learning_rate": 1.5400409042521983e-05, + "loss": 1.5763, + "step": 109890 + }, + { + "epoch": 0.6907469605012472, + "grad_norm": 8.306509017944336, + "learning_rate": 1.539998994157733e-05, + "loss": 1.7155, + "step": 109900 + }, + { + "epoch": 0.6908098128179443, + "grad_norm": 6.897550106048584, + "learning_rate": 1.5399570840632677e-05, + "loss": 1.684, + "step": 109910 + }, + { + "epoch": 0.6908726651346414, + "grad_norm": 6.713376522064209, + "learning_rate": 1.5399151739688024e-05, + "loss": 1.6465, + "step": 109920 + }, + { + "epoch": 0.6909355174513385, + "grad_norm": 6.562190532684326, + "learning_rate": 1.539873263874337e-05, + "loss": 1.5514, + "step": 109930 + }, + { + "epoch": 0.6909983697680356, + "grad_norm": 5.659005165100098, + "learning_rate": 1.5398313537798715e-05, + "loss": 1.5351, + "step": 109940 + }, + { + "epoch": 0.6910612220847328, + "grad_norm": 6.241729259490967, + "learning_rate": 1.5397894436854062e-05, + "loss": 1.7717, + "step": 109950 + }, + { + "epoch": 0.6911240744014299, + "grad_norm": 6.490688800811768, + "learning_rate": 1.539747533590941e-05, + "loss": 1.6363, + "step": 109960 + }, + { + "epoch": 0.691186926718127, + "grad_norm": 6.120362281799316, + "learning_rate": 1.5397056234964756e-05, + "loss": 1.4055, + "step": 109970 + }, + { + "epoch": 0.6912497790348241, + "grad_norm": 6.8912034034729, + "learning_rate": 1.53966371340201e-05, + "loss": 1.4607, + "step": 109980 
+ }, + { + "epoch": 0.6913126313515212, + "grad_norm": 6.576463222503662, + "learning_rate": 1.5396218033075447e-05, + "loss": 1.8473, + "step": 109990 + }, + { + "epoch": 0.6913754836682183, + "grad_norm": 6.136288642883301, + "learning_rate": 1.5395798932130794e-05, + "loss": 1.7056, + "step": 110000 + }, + { + "epoch": 0.6914383359849154, + "grad_norm": 7.206757068634033, + "learning_rate": 1.539537983118614e-05, + "loss": 1.5955, + "step": 110010 + }, + { + "epoch": 0.6915011883016126, + "grad_norm": 6.368089199066162, + "learning_rate": 1.5394960730241488e-05, + "loss": 1.6546, + "step": 110020 + }, + { + "epoch": 0.6915640406183097, + "grad_norm": 6.267004013061523, + "learning_rate": 1.539454162929683e-05, + "loss": 1.6784, + "step": 110030 + }, + { + "epoch": 0.6916268929350068, + "grad_norm": 6.834345817565918, + "learning_rate": 1.539412252835218e-05, + "loss": 1.775, + "step": 110040 + }, + { + "epoch": 0.6916897452517039, + "grad_norm": 5.859621524810791, + "learning_rate": 1.5393703427407526e-05, + "loss": 1.4838, + "step": 110050 + }, + { + "epoch": 0.691752597568401, + "grad_norm": 5.676794052124023, + "learning_rate": 1.5393284326462873e-05, + "loss": 1.5864, + "step": 110060 + }, + { + "epoch": 0.6918154498850981, + "grad_norm": 6.597995758056641, + "learning_rate": 1.539286522551822e-05, + "loss": 1.5039, + "step": 110070 + }, + { + "epoch": 0.6918783022017952, + "grad_norm": 5.67252779006958, + "learning_rate": 1.5392446124573567e-05, + "loss": 1.6945, + "step": 110080 + }, + { + "epoch": 0.6919411545184924, + "grad_norm": 6.437417030334473, + "learning_rate": 1.5392027023628914e-05, + "loss": 1.6711, + "step": 110090 + }, + { + "epoch": 0.6920040068351895, + "grad_norm": 5.708282947540283, + "learning_rate": 1.539160792268426e-05, + "loss": 1.5275, + "step": 110100 + }, + { + "epoch": 0.6920668591518866, + "grad_norm": 6.925989151000977, + "learning_rate": 1.5391188821739605e-05, + "loss": 1.6202, + "step": 110110 + }, + { + "epoch": 
0.6921297114685837, + "grad_norm": 5.939941883087158, + "learning_rate": 1.5390769720794952e-05, + "loss": 1.7532, + "step": 110120 + }, + { + "epoch": 0.6921925637852808, + "grad_norm": 6.466796875, + "learning_rate": 1.5390392529944763e-05, + "loss": 1.6942, + "step": 110130 + }, + { + "epoch": 0.6922554161019779, + "grad_norm": 6.570621490478516, + "learning_rate": 1.538997342900011e-05, + "loss": 1.4633, + "step": 110140 + }, + { + "epoch": 0.6923182684186749, + "grad_norm": 7.082062721252441, + "learning_rate": 1.5389554328055457e-05, + "loss": 1.6256, + "step": 110150 + }, + { + "epoch": 0.6923811207353721, + "grad_norm": 6.98456335067749, + "learning_rate": 1.5389135227110804e-05, + "loss": 1.7756, + "step": 110160 + }, + { + "epoch": 0.6924439730520692, + "grad_norm": 6.79068660736084, + "learning_rate": 1.538871612616615e-05, + "loss": 1.5717, + "step": 110170 + }, + { + "epoch": 0.6925068253687663, + "grad_norm": 7.054384231567383, + "learning_rate": 1.53882970252215e-05, + "loss": 1.5873, + "step": 110180 + }, + { + "epoch": 0.6925696776854634, + "grad_norm": 5.819924354553223, + "learning_rate": 1.5387877924276842e-05, + "loss": 1.6075, + "step": 110190 + }, + { + "epoch": 0.6926325300021605, + "grad_norm": 6.592485427856445, + "learning_rate": 1.538745882333219e-05, + "loss": 1.8599, + "step": 110200 + }, + { + "epoch": 0.6926953823188576, + "grad_norm": 7.693783283233643, + "learning_rate": 1.5387039722387536e-05, + "loss": 1.4913, + "step": 110210 + }, + { + "epoch": 0.6927582346355547, + "grad_norm": 7.944255352020264, + "learning_rate": 1.5386620621442883e-05, + "loss": 1.4727, + "step": 110220 + }, + { + "epoch": 0.6928210869522519, + "grad_norm": 6.045512676239014, + "learning_rate": 1.538620152049823e-05, + "loss": 1.7314, + "step": 110230 + }, + { + "epoch": 0.692883939268949, + "grad_norm": 7.270375728607178, + "learning_rate": 1.5385782419553574e-05, + "loss": 1.5352, + "step": 110240 + }, + { + "epoch": 0.6929467915856461, + "grad_norm": 
7.211243152618408, + "learning_rate": 1.538536331860892e-05, + "loss": 1.7192, + "step": 110250 + }, + { + "epoch": 0.6930096439023432, + "grad_norm": 6.737878322601318, + "learning_rate": 1.5384944217664268e-05, + "loss": 1.5612, + "step": 110260 + }, + { + "epoch": 0.6930724962190403, + "grad_norm": 6.604902267456055, + "learning_rate": 1.5384525116719615e-05, + "loss": 1.7959, + "step": 110270 + }, + { + "epoch": 0.6931353485357374, + "grad_norm": 6.718039035797119, + "learning_rate": 1.538410601577496e-05, + "loss": 1.6063, + "step": 110280 + }, + { + "epoch": 0.6931982008524346, + "grad_norm": 6.575676918029785, + "learning_rate": 1.5383686914830306e-05, + "loss": 1.4797, + "step": 110290 + }, + { + "epoch": 0.6932610531691317, + "grad_norm": 6.538565158843994, + "learning_rate": 1.5383267813885653e-05, + "loss": 1.5373, + "step": 110300 + }, + { + "epoch": 0.6933239054858288, + "grad_norm": 6.900051116943359, + "learning_rate": 1.5382848712941e-05, + "loss": 1.5909, + "step": 110310 + }, + { + "epoch": 0.6933867578025259, + "grad_norm": 6.351840496063232, + "learning_rate": 1.5382429611996347e-05, + "loss": 1.4559, + "step": 110320 + }, + { + "epoch": 0.693449610119223, + "grad_norm": 6.0556817054748535, + "learning_rate": 1.538201051105169e-05, + "loss": 1.7691, + "step": 110330 + }, + { + "epoch": 0.6935124624359201, + "grad_norm": 6.344932556152344, + "learning_rate": 1.5381591410107038e-05, + "loss": 1.4947, + "step": 110340 + }, + { + "epoch": 0.6935753147526172, + "grad_norm": 7.3224358558654785, + "learning_rate": 1.5381172309162385e-05, + "loss": 1.7113, + "step": 110350 + }, + { + "epoch": 0.6936381670693144, + "grad_norm": 7.070436954498291, + "learning_rate": 1.5380753208217732e-05, + "loss": 1.6708, + "step": 110360 + }, + { + "epoch": 0.6937010193860115, + "grad_norm": 5.970061779022217, + "learning_rate": 1.538033410727308e-05, + "loss": 1.3929, + "step": 110370 + }, + { + "epoch": 0.6937638717027086, + "grad_norm": 7.096200466156006, + 
"learning_rate": 1.5379915006328426e-05, + "loss": 1.5439, + "step": 110380 + }, + { + "epoch": 0.6938267240194057, + "grad_norm": 6.618981838226318, + "learning_rate": 1.5379495905383773e-05, + "loss": 1.7176, + "step": 110390 + }, + { + "epoch": 0.6938895763361027, + "grad_norm": 6.2274909019470215, + "learning_rate": 1.537907680443912e-05, + "loss": 1.6271, + "step": 110400 + }, + { + "epoch": 0.6939524286527998, + "grad_norm": 6.041452407836914, + "learning_rate": 1.5378657703494464e-05, + "loss": 1.655, + "step": 110410 + }, + { + "epoch": 0.6940152809694969, + "grad_norm": 6.7422261238098145, + "learning_rate": 1.537823860254981e-05, + "loss": 1.6391, + "step": 110420 + }, + { + "epoch": 0.694078133286194, + "grad_norm": 6.288084506988525, + "learning_rate": 1.537781950160516e-05, + "loss": 1.8139, + "step": 110430 + }, + { + "epoch": 0.6941409856028912, + "grad_norm": 6.951841831207275, + "learning_rate": 1.5377400400660505e-05, + "loss": 1.5908, + "step": 110440 + }, + { + "epoch": 0.6942038379195883, + "grad_norm": 5.775060176849365, + "learning_rate": 1.5376981299715852e-05, + "loss": 1.6025, + "step": 110450 + }, + { + "epoch": 0.6942666902362854, + "grad_norm": 8.272947311401367, + "learning_rate": 1.5376562198771196e-05, + "loss": 1.3625, + "step": 110460 + }, + { + "epoch": 0.6943295425529825, + "grad_norm": 7.3116230964660645, + "learning_rate": 1.5376143097826543e-05, + "loss": 1.6746, + "step": 110470 + }, + { + "epoch": 0.6943923948696796, + "grad_norm": 7.830728054046631, + "learning_rate": 1.537572399688189e-05, + "loss": 1.6529, + "step": 110480 + }, + { + "epoch": 0.6944552471863767, + "grad_norm": 6.397856712341309, + "learning_rate": 1.5375304895937237e-05, + "loss": 1.5174, + "step": 110490 + }, + { + "epoch": 0.6945180995030739, + "grad_norm": 7.6474480628967285, + "learning_rate": 1.537488579499258e-05, + "loss": 1.6356, + "step": 110500 + }, + { + "epoch": 0.694580951819771, + "grad_norm": 7.2004218101501465, + "learning_rate": 
1.5374466694047928e-05, + "loss": 1.8332, + "step": 110510 + }, + { + "epoch": 0.6946438041364681, + "grad_norm": 6.119325637817383, + "learning_rate": 1.5374047593103275e-05, + "loss": 1.6274, + "step": 110520 + }, + { + "epoch": 0.6947066564531652, + "grad_norm": 6.005640029907227, + "learning_rate": 1.5373628492158622e-05, + "loss": 1.3396, + "step": 110530 + }, + { + "epoch": 0.6947695087698623, + "grad_norm": 6.580428123474121, + "learning_rate": 1.537320939121397e-05, + "loss": 1.5653, + "step": 110540 + }, + { + "epoch": 0.6948323610865594, + "grad_norm": 6.162042140960693, + "learning_rate": 1.5372790290269316e-05, + "loss": 1.4884, + "step": 110550 + }, + { + "epoch": 0.6948952134032566, + "grad_norm": 5.7658371925354, + "learning_rate": 1.5372371189324663e-05, + "loss": 1.4138, + "step": 110560 + }, + { + "epoch": 0.6949580657199537, + "grad_norm": 6.4251861572265625, + "learning_rate": 1.537195208838001e-05, + "loss": 1.7541, + "step": 110570 + }, + { + "epoch": 0.6950209180366508, + "grad_norm": 5.379714488983154, + "learning_rate": 1.5371532987435354e-05, + "loss": 1.743, + "step": 110580 + }, + { + "epoch": 0.6950837703533479, + "grad_norm": 7.091882228851318, + "learning_rate": 1.53711138864907e-05, + "loss": 1.9623, + "step": 110590 + }, + { + "epoch": 0.695146622670045, + "grad_norm": 7.727228164672852, + "learning_rate": 1.537069478554605e-05, + "loss": 1.8519, + "step": 110600 + }, + { + "epoch": 0.6952094749867421, + "grad_norm": 7.1329450607299805, + "learning_rate": 1.5370275684601395e-05, + "loss": 1.6731, + "step": 110610 + }, + { + "epoch": 0.6952723273034392, + "grad_norm": 5.80506706237793, + "learning_rate": 1.5369856583656743e-05, + "loss": 1.6767, + "step": 110620 + }, + { + "epoch": 0.6953351796201364, + "grad_norm": 5.10790491104126, + "learning_rate": 1.5369437482712086e-05, + "loss": 1.6245, + "step": 110630 + }, + { + "epoch": 0.6953980319368335, + "grad_norm": 5.793562889099121, + "learning_rate": 1.5369018381767433e-05, + 
"loss": 1.7881, + "step": 110640 + }, + { + "epoch": 0.6954608842535306, + "grad_norm": 6.144549369812012, + "learning_rate": 1.536859928082278e-05, + "loss": 1.7062, + "step": 110650 + }, + { + "epoch": 0.6955237365702276, + "grad_norm": 5.768095970153809, + "learning_rate": 1.5368180179878127e-05, + "loss": 1.5482, + "step": 110660 + }, + { + "epoch": 0.6955865888869247, + "grad_norm": 6.386692047119141, + "learning_rate": 1.5367761078933474e-05, + "loss": 1.3336, + "step": 110670 + }, + { + "epoch": 0.6956494412036218, + "grad_norm": 6.446012020111084, + "learning_rate": 1.5367341977988818e-05, + "loss": 1.6281, + "step": 110680 + }, + { + "epoch": 0.6957122935203189, + "grad_norm": 6.755799293518066, + "learning_rate": 1.5366922877044165e-05, + "loss": 1.4132, + "step": 110690 + }, + { + "epoch": 0.695775145837016, + "grad_norm": 6.42385721206665, + "learning_rate": 1.5366503776099512e-05, + "loss": 1.6404, + "step": 110700 + }, + { + "epoch": 0.6958379981537132, + "grad_norm": 7.034458160400391, + "learning_rate": 1.536608467515486e-05, + "loss": 1.8111, + "step": 110710 + }, + { + "epoch": 0.6959008504704103, + "grad_norm": 6.789862632751465, + "learning_rate": 1.5365665574210203e-05, + "loss": 1.5472, + "step": 110720 + }, + { + "epoch": 0.6959637027871074, + "grad_norm": 7.90257453918457, + "learning_rate": 1.536524647326555e-05, + "loss": 1.4989, + "step": 110730 + }, + { + "epoch": 0.6960265551038045, + "grad_norm": 5.922754764556885, + "learning_rate": 1.5364827372320897e-05, + "loss": 1.5503, + "step": 110740 + }, + { + "epoch": 0.6960894074205016, + "grad_norm": 6.36511754989624, + "learning_rate": 1.5364408271376244e-05, + "loss": 1.4924, + "step": 110750 + }, + { + "epoch": 0.6961522597371987, + "grad_norm": 7.440944194793701, + "learning_rate": 1.536398917043159e-05, + "loss": 1.6945, + "step": 110760 + }, + { + "epoch": 0.6962151120538959, + "grad_norm": 6.556360721588135, + "learning_rate": 1.536357006948694e-05, + "loss": 1.7309, + "step": 110770 
+ }, + { + "epoch": 0.696277964370593, + "grad_norm": 6.454293727874756, + "learning_rate": 1.5363150968542285e-05, + "loss": 1.4156, + "step": 110780 + }, + { + "epoch": 0.6963408166872901, + "grad_norm": 7.992779731750488, + "learning_rate": 1.5362731867597633e-05, + "loss": 1.5207, + "step": 110790 + }, + { + "epoch": 0.6964036690039872, + "grad_norm": 5.340725898742676, + "learning_rate": 1.536231276665298e-05, + "loss": 1.7143, + "step": 110800 + }, + { + "epoch": 0.6964665213206843, + "grad_norm": 8.281492233276367, + "learning_rate": 1.5361893665708323e-05, + "loss": 1.8827, + "step": 110810 + }, + { + "epoch": 0.6965293736373814, + "grad_norm": 5.960738658905029, + "learning_rate": 1.536147456476367e-05, + "loss": 1.4878, + "step": 110820 + }, + { + "epoch": 0.6965922259540785, + "grad_norm": 7.015256404876709, + "learning_rate": 1.5361055463819017e-05, + "loss": 1.4828, + "step": 110830 + }, + { + "epoch": 0.6966550782707757, + "grad_norm": 5.641343116760254, + "learning_rate": 1.5360636362874365e-05, + "loss": 1.4729, + "step": 110840 + }, + { + "epoch": 0.6967179305874728, + "grad_norm": 7.063246250152588, + "learning_rate": 1.536021726192971e-05, + "loss": 1.635, + "step": 110850 + }, + { + "epoch": 0.6967807829041699, + "grad_norm": 6.208061695098877, + "learning_rate": 1.5359798160985055e-05, + "loss": 1.6084, + "step": 110860 + }, + { + "epoch": 0.696843635220867, + "grad_norm": 6.916558265686035, + "learning_rate": 1.5359379060040402e-05, + "loss": 1.7269, + "step": 110870 + }, + { + "epoch": 0.6969064875375641, + "grad_norm": 6.103330135345459, + "learning_rate": 1.535895995909575e-05, + "loss": 1.5001, + "step": 110880 + }, + { + "epoch": 0.6969693398542612, + "grad_norm": 6.57139253616333, + "learning_rate": 1.5358540858151096e-05, + "loss": 1.9416, + "step": 110890 + }, + { + "epoch": 0.6970321921709584, + "grad_norm": 6.957958221435547, + "learning_rate": 1.535812175720644e-05, + "loss": 1.822, + "step": 110900 + }, + { + "epoch": 
0.6970950444876554, + "grad_norm": 5.670475006103516, + "learning_rate": 1.5357702656261787e-05, + "loss": 1.4883, + "step": 110910 + }, + { + "epoch": 0.6971578968043525, + "grad_norm": 6.034177780151367, + "learning_rate": 1.5357283555317134e-05, + "loss": 1.7143, + "step": 110920 + }, + { + "epoch": 0.6972207491210496, + "grad_norm": 6.624166011810303, + "learning_rate": 1.535686445437248e-05, + "loss": 1.5443, + "step": 110930 + }, + { + "epoch": 0.6972836014377467, + "grad_norm": 6.640042781829834, + "learning_rate": 1.535644535342783e-05, + "loss": 1.6603, + "step": 110940 + }, + { + "epoch": 0.6973464537544438, + "grad_norm": 5.989957809448242, + "learning_rate": 1.5356026252483172e-05, + "loss": 1.4335, + "step": 110950 + }, + { + "epoch": 0.6974093060711409, + "grad_norm": 5.535582542419434, + "learning_rate": 1.535560715153852e-05, + "loss": 1.554, + "step": 110960 + }, + { + "epoch": 0.697472158387838, + "grad_norm": 7.012519359588623, + "learning_rate": 1.5355188050593866e-05, + "loss": 1.7497, + "step": 110970 + }, + { + "epoch": 0.6975350107045352, + "grad_norm": 6.441524028778076, + "learning_rate": 1.5354768949649213e-05, + "loss": 1.4893, + "step": 110980 + }, + { + "epoch": 0.6975978630212323, + "grad_norm": 6.06854248046875, + "learning_rate": 1.535434984870456e-05, + "loss": 1.5386, + "step": 110990 + }, + { + "epoch": 0.6976607153379294, + "grad_norm": 6.238610744476318, + "learning_rate": 1.5353930747759907e-05, + "loss": 1.8803, + "step": 111000 + }, + { + "epoch": 0.6977235676546265, + "grad_norm": 5.91594934463501, + "learning_rate": 1.5353511646815255e-05, + "loss": 1.7155, + "step": 111010 + }, + { + "epoch": 0.6977864199713236, + "grad_norm": 6.111907482147217, + "learning_rate": 1.53530925458706e-05, + "loss": 1.6746, + "step": 111020 + }, + { + "epoch": 0.6978492722880207, + "grad_norm": 5.388387680053711, + "learning_rate": 1.5352673444925945e-05, + "loss": 1.657, + "step": 111030 + }, + { + "epoch": 0.6979121246047179, + "grad_norm": 
5.682615280151367, + "learning_rate": 1.5352254343981292e-05, + "loss": 1.6807, + "step": 111040 + }, + { + "epoch": 0.697974976921415, + "grad_norm": 6.166755676269531, + "learning_rate": 1.535183524303664e-05, + "loss": 1.6375, + "step": 111050 + }, + { + "epoch": 0.6980378292381121, + "grad_norm": 5.698094367980957, + "learning_rate": 1.5351416142091987e-05, + "loss": 1.5027, + "step": 111060 + }, + { + "epoch": 0.6981006815548092, + "grad_norm": 6.914948463439941, + "learning_rate": 1.5350997041147334e-05, + "loss": 1.4712, + "step": 111070 + }, + { + "epoch": 0.6981635338715063, + "grad_norm": 7.217332363128662, + "learning_rate": 1.5350577940202677e-05, + "loss": 1.65, + "step": 111080 + }, + { + "epoch": 0.6982263861882034, + "grad_norm": 7.417016983032227, + "learning_rate": 1.5350158839258024e-05, + "loss": 1.7447, + "step": 111090 + }, + { + "epoch": 0.6982892385049005, + "grad_norm": 6.69106388092041, + "learning_rate": 1.534973973831337e-05, + "loss": 1.5877, + "step": 111100 + }, + { + "epoch": 0.6983520908215977, + "grad_norm": 7.806684494018555, + "learning_rate": 1.534932063736872e-05, + "loss": 1.6986, + "step": 111110 + }, + { + "epoch": 0.6984149431382948, + "grad_norm": 5.269705772399902, + "learning_rate": 1.5348901536424062e-05, + "loss": 1.4404, + "step": 111120 + }, + { + "epoch": 0.6984777954549919, + "grad_norm": 7.750637531280518, + "learning_rate": 1.534848243547941e-05, + "loss": 1.4203, + "step": 111130 + }, + { + "epoch": 0.698540647771689, + "grad_norm": 6.469650745391846, + "learning_rate": 1.5348063334534756e-05, + "loss": 1.9374, + "step": 111140 + }, + { + "epoch": 0.6986035000883861, + "grad_norm": 6.945512294769287, + "learning_rate": 1.5347644233590103e-05, + "loss": 1.541, + "step": 111150 + }, + { + "epoch": 0.6986663524050832, + "grad_norm": 7.122017860412598, + "learning_rate": 1.534722513264545e-05, + "loss": 1.6571, + "step": 111160 + }, + { + "epoch": 0.6987292047217802, + "grad_norm": 7.253368377685547, + 
"learning_rate": 1.5346806031700798e-05, + "loss": 1.849, + "step": 111170 + }, + { + "epoch": 0.6987920570384774, + "grad_norm": 6.511105060577393, + "learning_rate": 1.5346386930756145e-05, + "loss": 1.9503, + "step": 111180 + }, + { + "epoch": 0.6988549093551745, + "grad_norm": 6.037867069244385, + "learning_rate": 1.534596782981149e-05, + "loss": 1.6691, + "step": 111190 + }, + { + "epoch": 0.6989177616718716, + "grad_norm": 7.932125091552734, + "learning_rate": 1.5345548728866835e-05, + "loss": 1.6816, + "step": 111200 + }, + { + "epoch": 0.6989806139885687, + "grad_norm": 7.179452419281006, + "learning_rate": 1.5345129627922182e-05, + "loss": 1.5332, + "step": 111210 + }, + { + "epoch": 0.6990434663052658, + "grad_norm": 5.7000732421875, + "learning_rate": 1.534471052697753e-05, + "loss": 1.5691, + "step": 111220 + }, + { + "epoch": 0.6991063186219629, + "grad_norm": 6.47969913482666, + "learning_rate": 1.5344291426032877e-05, + "loss": 1.5694, + "step": 111230 + }, + { + "epoch": 0.69916917093866, + "grad_norm": 7.466422080993652, + "learning_rate": 1.5343872325088224e-05, + "loss": 1.7279, + "step": 111240 + }, + { + "epoch": 0.6992320232553572, + "grad_norm": 7.074092864990234, + "learning_rate": 1.5343453224143567e-05, + "loss": 1.6808, + "step": 111250 + }, + { + "epoch": 0.6992948755720543, + "grad_norm": 6.891863822937012, + "learning_rate": 1.5343034123198914e-05, + "loss": 1.5505, + "step": 111260 + }, + { + "epoch": 0.6993577278887514, + "grad_norm": 5.30508279800415, + "learning_rate": 1.534261502225426e-05, + "loss": 1.6563, + "step": 111270 + }, + { + "epoch": 0.6994205802054485, + "grad_norm": 6.897408962249756, + "learning_rate": 1.534219592130961e-05, + "loss": 1.7412, + "step": 111280 + }, + { + "epoch": 0.6994834325221456, + "grad_norm": 7.533118724822998, + "learning_rate": 1.5341776820364956e-05, + "loss": 1.7652, + "step": 111290 + }, + { + "epoch": 0.6995462848388427, + "grad_norm": 5.944263458251953, + "learning_rate": 
1.53413577194203e-05, + "loss": 1.5952, + "step": 111300 + }, + { + "epoch": 0.6996091371555399, + "grad_norm": 7.44816255569458, + "learning_rate": 1.5340938618475646e-05, + "loss": 1.7116, + "step": 111310 + }, + { + "epoch": 0.699671989472237, + "grad_norm": 6.531190872192383, + "learning_rate": 1.5340519517530993e-05, + "loss": 1.5211, + "step": 111320 + }, + { + "epoch": 0.6997348417889341, + "grad_norm": 7.204296588897705, + "learning_rate": 1.534010041658634e-05, + "loss": 1.6991, + "step": 111330 + }, + { + "epoch": 0.6997976941056312, + "grad_norm": 6.985568523406982, + "learning_rate": 1.5339681315641684e-05, + "loss": 1.5637, + "step": 111340 + }, + { + "epoch": 0.6998605464223283, + "grad_norm": 5.216329574584961, + "learning_rate": 1.533926221469703e-05, + "loss": 1.3214, + "step": 111350 + }, + { + "epoch": 0.6999233987390254, + "grad_norm": 6.626350402832031, + "learning_rate": 1.5338843113752378e-05, + "loss": 1.7145, + "step": 111360 + }, + { + "epoch": 0.6999862510557225, + "grad_norm": 7.1996965408325195, + "learning_rate": 1.5338424012807725e-05, + "loss": 1.71, + "step": 111370 + }, + { + "epoch": 0.7000491033724197, + "grad_norm": 6.929688930511475, + "learning_rate": 1.5338004911863072e-05, + "loss": 1.7993, + "step": 111380 + }, + { + "epoch": 0.7001119556891168, + "grad_norm": 6.898329257965088, + "learning_rate": 1.533758581091842e-05, + "loss": 1.615, + "step": 111390 + }, + { + "epoch": 0.7001748080058139, + "grad_norm": 6.943667411804199, + "learning_rate": 1.5337166709973767e-05, + "loss": 1.6869, + "step": 111400 + }, + { + "epoch": 0.700237660322511, + "grad_norm": 6.866006374359131, + "learning_rate": 1.5336747609029114e-05, + "loss": 1.5386, + "step": 111410 + }, + { + "epoch": 0.700300512639208, + "grad_norm": 7.109818935394287, + "learning_rate": 1.533632850808446e-05, + "loss": 1.7766, + "step": 111420 + }, + { + "epoch": 0.7003633649559051, + "grad_norm": 5.786563873291016, + "learning_rate": 1.5335909407139804e-05, + "loss": 
1.5939, + "step": 111430 + }, + { + "epoch": 0.7004262172726022, + "grad_norm": 5.846156120300293, + "learning_rate": 1.533549030619515e-05, + "loss": 1.3397, + "step": 111440 + }, + { + "epoch": 0.7004890695892994, + "grad_norm": 7.736020565032959, + "learning_rate": 1.53350712052505e-05, + "loss": 1.6997, + "step": 111450 + }, + { + "epoch": 0.7005519219059965, + "grad_norm": 7.803137302398682, + "learning_rate": 1.5334652104305846e-05, + "loss": 1.7629, + "step": 111460 + }, + { + "epoch": 0.7006147742226936, + "grad_norm": 6.8571906089782715, + "learning_rate": 1.5334233003361193e-05, + "loss": 1.6626, + "step": 111470 + }, + { + "epoch": 0.7006776265393907, + "grad_norm": 6.03222131729126, + "learning_rate": 1.5333813902416536e-05, + "loss": 1.6276, + "step": 111480 + }, + { + "epoch": 0.7007404788560878, + "grad_norm": 7.02052640914917, + "learning_rate": 1.5333394801471883e-05, + "loss": 1.5892, + "step": 111490 + }, + { + "epoch": 0.7008033311727849, + "grad_norm": 5.841487407684326, + "learning_rate": 1.5333017610621698e-05, + "loss": 1.5184, + "step": 111500 + }, + { + "epoch": 0.700866183489482, + "grad_norm": 6.02025842666626, + "learning_rate": 1.5332598509677042e-05, + "loss": 1.7335, + "step": 111510 + }, + { + "epoch": 0.7009290358061792, + "grad_norm": 7.783817768096924, + "learning_rate": 1.533217940873239e-05, + "loss": 1.7158, + "step": 111520 + }, + { + "epoch": 0.7009918881228763, + "grad_norm": 6.168801307678223, + "learning_rate": 1.5331760307787736e-05, + "loss": 1.5162, + "step": 111530 + }, + { + "epoch": 0.7010547404395734, + "grad_norm": 6.441208362579346, + "learning_rate": 1.5331341206843083e-05, + "loss": 1.7334, + "step": 111540 + }, + { + "epoch": 0.7011175927562705, + "grad_norm": 6.061811447143555, + "learning_rate": 1.5330922105898427e-05, + "loss": 1.5543, + "step": 111550 + }, + { + "epoch": 0.7011804450729676, + "grad_norm": 7.365804195404053, + "learning_rate": 1.5330503004953774e-05, + "loss": 1.6776, + "step": 111560 + }, 
+ { + "epoch": 0.7012432973896647, + "grad_norm": 5.934698104858398, + "learning_rate": 1.533008390400912e-05, + "loss": 1.5696, + "step": 111570 + }, + { + "epoch": 0.7013061497063618, + "grad_norm": 6.140161037445068, + "learning_rate": 1.5329664803064468e-05, + "loss": 1.8776, + "step": 111580 + }, + { + "epoch": 0.701369002023059, + "grad_norm": 6.963189125061035, + "learning_rate": 1.5329245702119815e-05, + "loss": 1.6721, + "step": 111590 + }, + { + "epoch": 0.7014318543397561, + "grad_norm": 7.292041778564453, + "learning_rate": 1.532882660117516e-05, + "loss": 1.8092, + "step": 111600 + }, + { + "epoch": 0.7014947066564532, + "grad_norm": 7.88812255859375, + "learning_rate": 1.5328407500230506e-05, + "loss": 1.4997, + "step": 111610 + }, + { + "epoch": 0.7015575589731503, + "grad_norm": 7.066206932067871, + "learning_rate": 1.5327988399285853e-05, + "loss": 1.8031, + "step": 111620 + }, + { + "epoch": 0.7016204112898474, + "grad_norm": 6.974310398101807, + "learning_rate": 1.53275692983412e-05, + "loss": 1.57, + "step": 111630 + }, + { + "epoch": 0.7016832636065445, + "grad_norm": 5.402825832366943, + "learning_rate": 1.5327150197396547e-05, + "loss": 1.6978, + "step": 111640 + }, + { + "epoch": 0.7017461159232417, + "grad_norm": 5.739223003387451, + "learning_rate": 1.532673109645189e-05, + "loss": 1.4454, + "step": 111650 + }, + { + "epoch": 0.7018089682399388, + "grad_norm": 6.70404577255249, + "learning_rate": 1.5326311995507238e-05, + "loss": 1.6804, + "step": 111660 + }, + { + "epoch": 0.7018718205566359, + "grad_norm": 8.330659866333008, + "learning_rate": 1.5325892894562585e-05, + "loss": 1.7975, + "step": 111670 + }, + { + "epoch": 0.7019346728733329, + "grad_norm": 6.400068283081055, + "learning_rate": 1.5325473793617932e-05, + "loss": 1.6265, + "step": 111680 + }, + { + "epoch": 0.70199752519003, + "grad_norm": 7.074087619781494, + "learning_rate": 1.532505469267328e-05, + "loss": 1.483, + "step": 111690 + }, + { + "epoch": 0.7020603775067271, + 
"grad_norm": 7.001039505004883, + "learning_rate": 1.5324635591728626e-05, + "loss": 1.5174, + "step": 111700 + }, + { + "epoch": 0.7021232298234242, + "grad_norm": 6.503770351409912, + "learning_rate": 1.5324216490783973e-05, + "loss": 1.5599, + "step": 111710 + }, + { + "epoch": 0.7021860821401213, + "grad_norm": 6.884154796600342, + "learning_rate": 1.532379738983932e-05, + "loss": 1.547, + "step": 111720 + }, + { + "epoch": 0.7022489344568185, + "grad_norm": 6.284861087799072, + "learning_rate": 1.5323378288894664e-05, + "loss": 1.6342, + "step": 111730 + }, + { + "epoch": 0.7023117867735156, + "grad_norm": 7.665338516235352, + "learning_rate": 1.532295918795001e-05, + "loss": 1.6949, + "step": 111740 + }, + { + "epoch": 0.7023746390902127, + "grad_norm": 6.341705322265625, + "learning_rate": 1.5322540087005358e-05, + "loss": 1.334, + "step": 111750 + }, + { + "epoch": 0.7024374914069098, + "grad_norm": 5.66460657119751, + "learning_rate": 1.5322120986060705e-05, + "loss": 1.6776, + "step": 111760 + }, + { + "epoch": 0.7025003437236069, + "grad_norm": 6.004974842071533, + "learning_rate": 1.532170188511605e-05, + "loss": 1.6779, + "step": 111770 + }, + { + "epoch": 0.702563196040304, + "grad_norm": 6.916309356689453, + "learning_rate": 1.5321282784171396e-05, + "loss": 1.6225, + "step": 111780 + }, + { + "epoch": 0.7026260483570012, + "grad_norm": 6.033321857452393, + "learning_rate": 1.5320863683226743e-05, + "loss": 1.6739, + "step": 111790 + }, + { + "epoch": 0.7026889006736983, + "grad_norm": 6.991604804992676, + "learning_rate": 1.532044458228209e-05, + "loss": 1.8158, + "step": 111800 + }, + { + "epoch": 0.7027517529903954, + "grad_norm": 6.926016807556152, + "learning_rate": 1.5320025481337437e-05, + "loss": 1.5259, + "step": 111810 + }, + { + "epoch": 0.7028146053070925, + "grad_norm": 7.260146141052246, + "learning_rate": 1.531960638039278e-05, + "loss": 1.642, + "step": 111820 + }, + { + "epoch": 0.7028774576237896, + "grad_norm": 5.637368202209473, + 
"learning_rate": 1.5319187279448128e-05, + "loss": 1.4252, + "step": 111830 + }, + { + "epoch": 0.7029403099404867, + "grad_norm": 6.351644515991211, + "learning_rate": 1.5318768178503475e-05, + "loss": 1.9248, + "step": 111840 + }, + { + "epoch": 0.7030031622571838, + "grad_norm": 6.875118732452393, + "learning_rate": 1.5318349077558822e-05, + "loss": 1.6857, + "step": 111850 + }, + { + "epoch": 0.703066014573881, + "grad_norm": 5.7353515625, + "learning_rate": 1.531792997661417e-05, + "loss": 1.4319, + "step": 111860 + }, + { + "epoch": 0.7031288668905781, + "grad_norm": 5.102963924407959, + "learning_rate": 1.5317510875669516e-05, + "loss": 1.6871, + "step": 111870 + }, + { + "epoch": 0.7031917192072752, + "grad_norm": 6.1228837966918945, + "learning_rate": 1.5317091774724863e-05, + "loss": 1.7169, + "step": 111880 + }, + { + "epoch": 0.7032545715239723, + "grad_norm": 7.117494583129883, + "learning_rate": 1.5316672673780207e-05, + "loss": 1.7236, + "step": 111890 + }, + { + "epoch": 0.7033174238406694, + "grad_norm": 7.3301167488098145, + "learning_rate": 1.5316253572835554e-05, + "loss": 1.6067, + "step": 111900 + }, + { + "epoch": 0.7033802761573665, + "grad_norm": 6.822005271911621, + "learning_rate": 1.53158344718909e-05, + "loss": 1.4782, + "step": 111910 + }, + { + "epoch": 0.7034431284740637, + "grad_norm": 5.827635765075684, + "learning_rate": 1.5315415370946248e-05, + "loss": 1.6321, + "step": 111920 + }, + { + "epoch": 0.7035059807907607, + "grad_norm": 6.484631538391113, + "learning_rate": 1.5314996270001595e-05, + "loss": 1.7872, + "step": 111930 + }, + { + "epoch": 0.7035688331074578, + "grad_norm": 6.064381122589111, + "learning_rate": 1.5314577169056942e-05, + "loss": 1.4691, + "step": 111940 + }, + { + "epoch": 0.7036316854241549, + "grad_norm": 6.730330944061279, + "learning_rate": 1.5314158068112286e-05, + "loss": 1.531, + "step": 111950 + }, + { + "epoch": 0.703694537740852, + "grad_norm": 6.129236698150635, + "learning_rate": 
1.5313738967167633e-05, + "loss": 1.7533, + "step": 111960 + }, + { + "epoch": 0.7037573900575491, + "grad_norm": 5.432794570922852, + "learning_rate": 1.531331986622298e-05, + "loss": 1.4477, + "step": 111970 + }, + { + "epoch": 0.7038202423742462, + "grad_norm": 6.189945697784424, + "learning_rate": 1.5312900765278327e-05, + "loss": 1.7233, + "step": 111980 + }, + { + "epoch": 0.7038830946909433, + "grad_norm": 6.620745658874512, + "learning_rate": 1.531248166433367e-05, + "loss": 1.5436, + "step": 111990 + }, + { + "epoch": 0.7039459470076405, + "grad_norm": 5.968117713928223, + "learning_rate": 1.5312062563389018e-05, + "loss": 1.622, + "step": 112000 + }, + { + "epoch": 0.7040087993243376, + "grad_norm": 7.784635066986084, + "learning_rate": 1.5311643462444365e-05, + "loss": 1.7669, + "step": 112010 + }, + { + "epoch": 0.7040716516410347, + "grad_norm": 6.146176338195801, + "learning_rate": 1.5311224361499712e-05, + "loss": 1.4989, + "step": 112020 + }, + { + "epoch": 0.7041345039577318, + "grad_norm": 5.737129211425781, + "learning_rate": 1.531080526055506e-05, + "loss": 1.4719, + "step": 112030 + }, + { + "epoch": 0.7041973562744289, + "grad_norm": 6.738218307495117, + "learning_rate": 1.5310386159610403e-05, + "loss": 1.5552, + "step": 112040 + }, + { + "epoch": 0.704260208591126, + "grad_norm": 6.180079460144043, + "learning_rate": 1.530996705866575e-05, + "loss": 1.6742, + "step": 112050 + }, + { + "epoch": 0.7043230609078232, + "grad_norm": 6.585474491119385, + "learning_rate": 1.5309547957721097e-05, + "loss": 1.9372, + "step": 112060 + }, + { + "epoch": 0.7043859132245203, + "grad_norm": 5.886623382568359, + "learning_rate": 1.5309128856776444e-05, + "loss": 1.6076, + "step": 112070 + }, + { + "epoch": 0.7044487655412174, + "grad_norm": 6.316711902618408, + "learning_rate": 1.530870975583179e-05, + "loss": 1.5736, + "step": 112080 + }, + { + "epoch": 0.7045116178579145, + "grad_norm": 5.902610778808594, + "learning_rate": 1.5308290654887138e-05, + 
"loss": 1.4446, + "step": 112090 + }, + { + "epoch": 0.7045744701746116, + "grad_norm": 6.4041643142700195, + "learning_rate": 1.5307871553942485e-05, + "loss": 1.8072, + "step": 112100 + }, + { + "epoch": 0.7046373224913087, + "grad_norm": 7.238096237182617, + "learning_rate": 1.5307452452997832e-05, + "loss": 1.6929, + "step": 112110 + }, + { + "epoch": 0.7047001748080058, + "grad_norm": 5.849660396575928, + "learning_rate": 1.530703335205318e-05, + "loss": 1.327, + "step": 112120 + }, + { + "epoch": 0.704763027124703, + "grad_norm": 5.922614574432373, + "learning_rate": 1.5306614251108523e-05, + "loss": 1.4867, + "step": 112130 + }, + { + "epoch": 0.7048258794414001, + "grad_norm": 7.565813064575195, + "learning_rate": 1.530619515016387e-05, + "loss": 1.8852, + "step": 112140 + }, + { + "epoch": 0.7048887317580972, + "grad_norm": 6.607747554779053, + "learning_rate": 1.5305776049219217e-05, + "loss": 1.6111, + "step": 112150 + }, + { + "epoch": 0.7049515840747943, + "grad_norm": 5.848597526550293, + "learning_rate": 1.5305356948274564e-05, + "loss": 1.6545, + "step": 112160 + }, + { + "epoch": 0.7050144363914914, + "grad_norm": 6.902886867523193, + "learning_rate": 1.5304937847329908e-05, + "loss": 1.7418, + "step": 112170 + }, + { + "epoch": 0.7050772887081885, + "grad_norm": 6.819500923156738, + "learning_rate": 1.5304518746385255e-05, + "loss": 1.7065, + "step": 112180 + }, + { + "epoch": 0.7051401410248855, + "grad_norm": 6.405013561248779, + "learning_rate": 1.5304099645440602e-05, + "loss": 1.5845, + "step": 112190 + }, + { + "epoch": 0.7052029933415827, + "grad_norm": 6.885857582092285, + "learning_rate": 1.530368054449595e-05, + "loss": 1.546, + "step": 112200 + }, + { + "epoch": 0.7052658456582798, + "grad_norm": 6.973810195922852, + "learning_rate": 1.5303261443551296e-05, + "loss": 1.6993, + "step": 112210 + }, + { + "epoch": 0.7053286979749769, + "grad_norm": 6.513957500457764, + "learning_rate": 1.530284234260664e-05, + "loss": 1.3631, + "step": 
112220 + }, + { + "epoch": 0.705391550291674, + "grad_norm": 5.68147087097168, + "learning_rate": 1.5302423241661987e-05, + "loss": 1.6635, + "step": 112230 + }, + { + "epoch": 0.7054544026083711, + "grad_norm": 7.037917613983154, + "learning_rate": 1.5302004140717334e-05, + "loss": 1.6377, + "step": 112240 + }, + { + "epoch": 0.7055172549250682, + "grad_norm": 6.520129680633545, + "learning_rate": 1.530158503977268e-05, + "loss": 1.7185, + "step": 112250 + }, + { + "epoch": 0.7055801072417653, + "grad_norm": 5.36246919631958, + "learning_rate": 1.5301165938828028e-05, + "loss": 1.76, + "step": 112260 + }, + { + "epoch": 0.7056429595584625, + "grad_norm": 6.547013282775879, + "learning_rate": 1.5300746837883372e-05, + "loss": 1.6227, + "step": 112270 + }, + { + "epoch": 0.7057058118751596, + "grad_norm": 6.3188581466674805, + "learning_rate": 1.530032773693872e-05, + "loss": 1.5041, + "step": 112280 + }, + { + "epoch": 0.7057686641918567, + "grad_norm": 6.86865234375, + "learning_rate": 1.5299908635994066e-05, + "loss": 1.5423, + "step": 112290 + }, + { + "epoch": 0.7058315165085538, + "grad_norm": 5.936668872833252, + "learning_rate": 1.5299489535049413e-05, + "loss": 1.6973, + "step": 112300 + }, + { + "epoch": 0.7058943688252509, + "grad_norm": 6.411205291748047, + "learning_rate": 1.529907043410476e-05, + "loss": 1.6422, + "step": 112310 + }, + { + "epoch": 0.705957221141948, + "grad_norm": 5.711047649383545, + "learning_rate": 1.5298651333160107e-05, + "loss": 1.4986, + "step": 112320 + }, + { + "epoch": 0.7060200734586451, + "grad_norm": 7.482402324676514, + "learning_rate": 1.5298232232215454e-05, + "loss": 1.5474, + "step": 112330 + }, + { + "epoch": 0.7060829257753423, + "grad_norm": 7.547204971313477, + "learning_rate": 1.52978131312708e-05, + "loss": 1.6577, + "step": 112340 + }, + { + "epoch": 0.7061457780920394, + "grad_norm": 6.965823173522949, + "learning_rate": 1.5297394030326145e-05, + "loss": 1.6739, + "step": 112350 + }, + { + "epoch": 
0.7062086304087365, + "grad_norm": 7.063565254211426, + "learning_rate": 1.5296974929381492e-05, + "loss": 1.647, + "step": 112360 + }, + { + "epoch": 0.7062714827254336, + "grad_norm": 7.207581520080566, + "learning_rate": 1.529655582843684e-05, + "loss": 1.7837, + "step": 112370 + }, + { + "epoch": 0.7063343350421307, + "grad_norm": 7.736453056335449, + "learning_rate": 1.5296136727492186e-05, + "loss": 1.5431, + "step": 112380 + }, + { + "epoch": 0.7063971873588278, + "grad_norm": 6.191543102264404, + "learning_rate": 1.529571762654753e-05, + "loss": 1.5448, + "step": 112390 + }, + { + "epoch": 0.706460039675525, + "grad_norm": 6.08800745010376, + "learning_rate": 1.5295298525602877e-05, + "loss": 1.5879, + "step": 112400 + }, + { + "epoch": 0.7065228919922221, + "grad_norm": 6.827498435974121, + "learning_rate": 1.5294879424658224e-05, + "loss": 1.695, + "step": 112410 + }, + { + "epoch": 0.7065857443089192, + "grad_norm": 6.868002891540527, + "learning_rate": 1.529446032371357e-05, + "loss": 1.6397, + "step": 112420 + }, + { + "epoch": 0.7066485966256163, + "grad_norm": 6.5036468505859375, + "learning_rate": 1.5294041222768918e-05, + "loss": 1.4834, + "step": 112430 + }, + { + "epoch": 0.7067114489423133, + "grad_norm": 7.184835910797119, + "learning_rate": 1.5293622121824262e-05, + "loss": 1.4564, + "step": 112440 + }, + { + "epoch": 0.7067743012590104, + "grad_norm": 6.301793575286865, + "learning_rate": 1.529320302087961e-05, + "loss": 1.4185, + "step": 112450 + }, + { + "epoch": 0.7068371535757075, + "grad_norm": 7.015898704528809, + "learning_rate": 1.5292783919934956e-05, + "loss": 1.6782, + "step": 112460 + }, + { + "epoch": 0.7069000058924046, + "grad_norm": 7.5571699142456055, + "learning_rate": 1.5292364818990303e-05, + "loss": 1.8978, + "step": 112470 + }, + { + "epoch": 0.7069628582091018, + "grad_norm": 7.434046745300293, + "learning_rate": 1.529194571804565e-05, + "loss": 1.6505, + "step": 112480 + }, + { + "epoch": 0.7070257105257989, + 
"grad_norm": 6.994179725646973, + "learning_rate": 1.5291526617100997e-05, + "loss": 1.5477, + "step": 112490 + }, + { + "epoch": 0.707088562842496, + "grad_norm": 6.415890693664551, + "learning_rate": 1.5291107516156344e-05, + "loss": 1.4122, + "step": 112500 + }, + { + "epoch": 0.7071514151591931, + "grad_norm": 6.995442867279053, + "learning_rate": 1.529068841521169e-05, + "loss": 1.4985, + "step": 112510 + }, + { + "epoch": 0.7072142674758902, + "grad_norm": 7.962581634521484, + "learning_rate": 1.5290269314267035e-05, + "loss": 1.8319, + "step": 112520 + }, + { + "epoch": 0.7072771197925873, + "grad_norm": 5.557328701019287, + "learning_rate": 1.5289850213322382e-05, + "loss": 1.5311, + "step": 112530 + }, + { + "epoch": 0.7073399721092845, + "grad_norm": 6.887127876281738, + "learning_rate": 1.528943111237773e-05, + "loss": 1.682, + "step": 112540 + }, + { + "epoch": 0.7074028244259816, + "grad_norm": 7.099719524383545, + "learning_rate": 1.5289012011433076e-05, + "loss": 1.5395, + "step": 112550 + }, + { + "epoch": 0.7074656767426787, + "grad_norm": 6.0728559494018555, + "learning_rate": 1.5288592910488423e-05, + "loss": 1.6686, + "step": 112560 + }, + { + "epoch": 0.7075285290593758, + "grad_norm": 7.208877086639404, + "learning_rate": 1.5288173809543767e-05, + "loss": 1.7004, + "step": 112570 + }, + { + "epoch": 0.7075913813760729, + "grad_norm": 5.693336486816406, + "learning_rate": 1.5287754708599114e-05, + "loss": 1.4082, + "step": 112580 + }, + { + "epoch": 0.70765423369277, + "grad_norm": 6.975170612335205, + "learning_rate": 1.528733560765446e-05, + "loss": 1.6212, + "step": 112590 + }, + { + "epoch": 0.7077170860094671, + "grad_norm": 6.518259048461914, + "learning_rate": 1.5286916506709808e-05, + "loss": 1.5739, + "step": 112600 + }, + { + "epoch": 0.7077799383261643, + "grad_norm": 7.409762382507324, + "learning_rate": 1.5286497405765155e-05, + "loss": 1.7499, + "step": 112610 + }, + { + "epoch": 0.7078427906428614, + "grad_norm": 
6.29955530166626, + "learning_rate": 1.52860783048205e-05, + "loss": 1.5993, + "step": 112620 + }, + { + "epoch": 0.7079056429595585, + "grad_norm": 6.510418891906738, + "learning_rate": 1.5285659203875846e-05, + "loss": 1.6247, + "step": 112630 + }, + { + "epoch": 0.7079684952762556, + "grad_norm": 6.022556781768799, + "learning_rate": 1.5285240102931193e-05, + "loss": 1.7137, + "step": 112640 + }, + { + "epoch": 0.7080313475929527, + "grad_norm": 6.4454827308654785, + "learning_rate": 1.528482100198654e-05, + "loss": 1.6566, + "step": 112650 + }, + { + "epoch": 0.7080941999096498, + "grad_norm": 6.457005977630615, + "learning_rate": 1.5284401901041884e-05, + "loss": 1.5795, + "step": 112660 + }, + { + "epoch": 0.708157052226347, + "grad_norm": 6.971104621887207, + "learning_rate": 1.528398280009723e-05, + "loss": 1.6491, + "step": 112670 + }, + { + "epoch": 0.7082199045430441, + "grad_norm": 7.521648406982422, + "learning_rate": 1.5283563699152578e-05, + "loss": 1.5301, + "step": 112680 + }, + { + "epoch": 0.7082827568597412, + "grad_norm": 4.746102809906006, + "learning_rate": 1.5283144598207925e-05, + "loss": 1.4043, + "step": 112690 + }, + { + "epoch": 0.7083456091764382, + "grad_norm": 6.2397050857543945, + "learning_rate": 1.5282725497263272e-05, + "loss": 1.8456, + "step": 112700 + }, + { + "epoch": 0.7084084614931353, + "grad_norm": 6.495884895324707, + "learning_rate": 1.528230639631862e-05, + "loss": 1.8223, + "step": 112710 + }, + { + "epoch": 0.7084713138098324, + "grad_norm": 6.451614856719971, + "learning_rate": 1.5281887295373966e-05, + "loss": 1.7124, + "step": 112720 + }, + { + "epoch": 0.7085341661265295, + "grad_norm": 5.592561721801758, + "learning_rate": 1.5281468194429313e-05, + "loss": 1.516, + "step": 112730 + }, + { + "epoch": 0.7085970184432266, + "grad_norm": 7.746678829193115, + "learning_rate": 1.528104909348466e-05, + "loss": 1.7306, + "step": 112740 + }, + { + "epoch": 0.7086598707599238, + "grad_norm": 6.638991832733154, + 
"learning_rate": 1.5280629992540004e-05, + "loss": 1.6608, + "step": 112750 + }, + { + "epoch": 0.7087227230766209, + "grad_norm": 6.159857749938965, + "learning_rate": 1.528021089159535e-05, + "loss": 1.8309, + "step": 112760 + }, + { + "epoch": 0.708785575393318, + "grad_norm": 6.521195411682129, + "learning_rate": 1.5279791790650698e-05, + "loss": 1.4588, + "step": 112770 + }, + { + "epoch": 0.7088484277100151, + "grad_norm": 6.476180076599121, + "learning_rate": 1.5279372689706045e-05, + "loss": 1.5082, + "step": 112780 + }, + { + "epoch": 0.7089112800267122, + "grad_norm": 6.665990352630615, + "learning_rate": 1.527895358876139e-05, + "loss": 1.7222, + "step": 112790 + }, + { + "epoch": 0.7089741323434093, + "grad_norm": 5.634333610534668, + "learning_rate": 1.5278534487816736e-05, + "loss": 1.6938, + "step": 112800 + }, + { + "epoch": 0.7090369846601065, + "grad_norm": 5.693087100982666, + "learning_rate": 1.5278115386872083e-05, + "loss": 1.7999, + "step": 112810 + }, + { + "epoch": 0.7090998369768036, + "grad_norm": 6.429813861846924, + "learning_rate": 1.527769628592743e-05, + "loss": 1.4559, + "step": 112820 + }, + { + "epoch": 0.7091626892935007, + "grad_norm": 6.750729084014893, + "learning_rate": 1.5277277184982777e-05, + "loss": 1.4969, + "step": 112830 + }, + { + "epoch": 0.7092255416101978, + "grad_norm": 5.89532470703125, + "learning_rate": 1.527685808403812e-05, + "loss": 1.785, + "step": 112840 + }, + { + "epoch": 0.7092883939268949, + "grad_norm": 6.74790096282959, + "learning_rate": 1.5276438983093468e-05, + "loss": 1.6283, + "step": 112850 + }, + { + "epoch": 0.709351246243592, + "grad_norm": 6.837890148162842, + "learning_rate": 1.5276019882148815e-05, + "loss": 1.7477, + "step": 112860 + }, + { + "epoch": 0.7094140985602891, + "grad_norm": 5.5891008377075195, + "learning_rate": 1.5275600781204162e-05, + "loss": 1.6749, + "step": 112870 + }, + { + "epoch": 0.7094769508769863, + "grad_norm": 6.491587162017822, + "learning_rate": 
1.527518168025951e-05, + "loss": 1.7476, + "step": 112880 + }, + { + "epoch": 0.7095398031936834, + "grad_norm": 6.33069372177124, + "learning_rate": 1.5274762579314856e-05, + "loss": 1.5088, + "step": 112890 + }, + { + "epoch": 0.7096026555103805, + "grad_norm": 6.677149295806885, + "learning_rate": 1.52743434783702e-05, + "loss": 1.7174, + "step": 112900 + }, + { + "epoch": 0.7096655078270776, + "grad_norm": 5.773202419281006, + "learning_rate": 1.5273924377425547e-05, + "loss": 1.5749, + "step": 112910 + }, + { + "epoch": 0.7097283601437747, + "grad_norm": 6.393325328826904, + "learning_rate": 1.5273505276480894e-05, + "loss": 1.5115, + "step": 112920 + }, + { + "epoch": 0.7097912124604718, + "grad_norm": 7.29564094543457, + "learning_rate": 1.527308617553624e-05, + "loss": 1.5899, + "step": 112930 + }, + { + "epoch": 0.709854064777169, + "grad_norm": 6.721494674682617, + "learning_rate": 1.5272667074591588e-05, + "loss": 1.3894, + "step": 112940 + }, + { + "epoch": 0.709916917093866, + "grad_norm": 6.804461479187012, + "learning_rate": 1.5272247973646935e-05, + "loss": 1.7747, + "step": 112950 + }, + { + "epoch": 0.7099797694105631, + "grad_norm": 6.622951984405518, + "learning_rate": 1.5271828872702282e-05, + "loss": 1.6428, + "step": 112960 + }, + { + "epoch": 0.7100426217272602, + "grad_norm": 5.707070350646973, + "learning_rate": 1.5271409771757626e-05, + "loss": 1.5076, + "step": 112970 + }, + { + "epoch": 0.7101054740439573, + "grad_norm": 6.101596832275391, + "learning_rate": 1.5270990670812973e-05, + "loss": 1.7525, + "step": 112980 + }, + { + "epoch": 0.7101683263606544, + "grad_norm": 7.57112455368042, + "learning_rate": 1.527057156986832e-05, + "loss": 1.7756, + "step": 112990 + }, + { + "epoch": 0.7102311786773515, + "grad_norm": 6.470608711242676, + "learning_rate": 1.5270152468923667e-05, + "loss": 1.6362, + "step": 113000 + }, + { + "epoch": 0.7102940309940486, + "grad_norm": 6.327256202697754, + "learning_rate": 1.526973336797901e-05, + "loss": 
1.5283, + "step": 113010 + }, + { + "epoch": 0.7103568833107458, + "grad_norm": 6.72871208190918, + "learning_rate": 1.5269314267034358e-05, + "loss": 1.3429, + "step": 113020 + }, + { + "epoch": 0.7104197356274429, + "grad_norm": 7.564001083374023, + "learning_rate": 1.5268895166089705e-05, + "loss": 1.7246, + "step": 113030 + }, + { + "epoch": 0.71048258794414, + "grad_norm": 6.237943172454834, + "learning_rate": 1.5268476065145052e-05, + "loss": 1.6596, + "step": 113040 + }, + { + "epoch": 0.7105454402608371, + "grad_norm": 6.461047172546387, + "learning_rate": 1.52680569642004e-05, + "loss": 1.5815, + "step": 113050 + }, + { + "epoch": 0.7106082925775342, + "grad_norm": 6.568318843841553, + "learning_rate": 1.5267637863255743e-05, + "loss": 1.7275, + "step": 113060 + }, + { + "epoch": 0.7106711448942313, + "grad_norm": 7.531859397888184, + "learning_rate": 1.526721876231109e-05, + "loss": 1.5079, + "step": 113070 + }, + { + "epoch": 0.7107339972109284, + "grad_norm": 5.017735004425049, + "learning_rate": 1.5266799661366437e-05, + "loss": 1.353, + "step": 113080 + }, + { + "epoch": 0.7107968495276256, + "grad_norm": 6.397959232330322, + "learning_rate": 1.5266380560421784e-05, + "loss": 1.6848, + "step": 113090 + }, + { + "epoch": 0.7108597018443227, + "grad_norm": 7.199172019958496, + "learning_rate": 1.526596145947713e-05, + "loss": 1.6795, + "step": 113100 + }, + { + "epoch": 0.7109225541610198, + "grad_norm": 5.098223686218262, + "learning_rate": 1.5265542358532478e-05, + "loss": 1.3205, + "step": 113110 + }, + { + "epoch": 0.7109854064777169, + "grad_norm": 5.997243404388428, + "learning_rate": 1.5265123257587825e-05, + "loss": 1.642, + "step": 113120 + }, + { + "epoch": 0.711048258794414, + "grad_norm": 6.338582515716553, + "learning_rate": 1.5264704156643172e-05, + "loss": 1.503, + "step": 113130 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 5.719226360321045, + "learning_rate": 1.526428505569852e-05, + "loss": 1.5103, + "step": 113140 + }, + { + 
"epoch": 0.7111739634278083, + "grad_norm": 6.2346014976501465, + "learning_rate": 1.5263865954753863e-05, + "loss": 1.545, + "step": 113150 + }, + { + "epoch": 0.7112368157445054, + "grad_norm": 6.652457237243652, + "learning_rate": 1.526344685380921e-05, + "loss": 1.5949, + "step": 113160 + }, + { + "epoch": 0.7112996680612025, + "grad_norm": 6.260744571685791, + "learning_rate": 1.5263027752864557e-05, + "loss": 1.7655, + "step": 113170 + }, + { + "epoch": 0.7113625203778996, + "grad_norm": 6.24083137512207, + "learning_rate": 1.5262608651919904e-05, + "loss": 1.4542, + "step": 113180 + }, + { + "epoch": 0.7114253726945967, + "grad_norm": 7.184473037719727, + "learning_rate": 1.5262189550975248e-05, + "loss": 1.6336, + "step": 113190 + }, + { + "epoch": 0.7114882250112938, + "grad_norm": 5.504934310913086, + "learning_rate": 1.5261770450030595e-05, + "loss": 1.5596, + "step": 113200 + }, + { + "epoch": 0.7115510773279908, + "grad_norm": 6.840089797973633, + "learning_rate": 1.5261351349085942e-05, + "loss": 1.619, + "step": 113210 + }, + { + "epoch": 0.711613929644688, + "grad_norm": 6.864090442657471, + "learning_rate": 1.526093224814129e-05, + "loss": 1.4888, + "step": 113220 + }, + { + "epoch": 0.7116767819613851, + "grad_norm": 6.57045316696167, + "learning_rate": 1.5260513147196636e-05, + "loss": 1.6587, + "step": 113230 + }, + { + "epoch": 0.7117396342780822, + "grad_norm": 6.458637714385986, + "learning_rate": 1.526009404625198e-05, + "loss": 1.5771, + "step": 113240 + }, + { + "epoch": 0.7118024865947793, + "grad_norm": 6.163811206817627, + "learning_rate": 1.5259674945307327e-05, + "loss": 1.3506, + "step": 113250 + }, + { + "epoch": 0.7118653389114764, + "grad_norm": 6.221703052520752, + "learning_rate": 1.5259255844362674e-05, + "loss": 1.4874, + "step": 113260 + }, + { + "epoch": 0.7119281912281735, + "grad_norm": 6.887861251831055, + "learning_rate": 1.525883674341802e-05, + "loss": 1.4691, + "step": 113270 + }, + { + "epoch": 0.7119910435448706, + 
"grad_norm": 5.505577087402344, + "learning_rate": 1.5258417642473367e-05, + "loss": 1.6029, + "step": 113280 + }, + { + "epoch": 0.7120538958615678, + "grad_norm": 6.861028671264648, + "learning_rate": 1.5257998541528714e-05, + "loss": 1.532, + "step": 113290 + }, + { + "epoch": 0.7121167481782649, + "grad_norm": 7.401561260223389, + "learning_rate": 1.525757944058406e-05, + "loss": 1.5277, + "step": 113300 + }, + { + "epoch": 0.712179600494962, + "grad_norm": 5.729288578033447, + "learning_rate": 1.5257160339639408e-05, + "loss": 1.3577, + "step": 113310 + }, + { + "epoch": 0.7122424528116591, + "grad_norm": 5.505568981170654, + "learning_rate": 1.5256741238694752e-05, + "loss": 1.3835, + "step": 113320 + }, + { + "epoch": 0.7123053051283562, + "grad_norm": 6.931628227233887, + "learning_rate": 1.5256322137750099e-05, + "loss": 1.6917, + "step": 113330 + }, + { + "epoch": 0.7123681574450533, + "grad_norm": 6.793735027313232, + "learning_rate": 1.5255903036805446e-05, + "loss": 1.486, + "step": 113340 + }, + { + "epoch": 0.7124310097617504, + "grad_norm": 8.003320693969727, + "learning_rate": 1.5255483935860793e-05, + "loss": 1.8749, + "step": 113350 + }, + { + "epoch": 0.7124938620784476, + "grad_norm": 5.8099870681762695, + "learning_rate": 1.525506483491614e-05, + "loss": 1.422, + "step": 113360 + }, + { + "epoch": 0.7125567143951447, + "grad_norm": 5.143426418304443, + "learning_rate": 1.5254645733971485e-05, + "loss": 1.661, + "step": 113370 + }, + { + "epoch": 0.7126195667118418, + "grad_norm": 6.967090606689453, + "learning_rate": 1.5254226633026832e-05, + "loss": 1.4827, + "step": 113380 + }, + { + "epoch": 0.7126824190285389, + "grad_norm": 5.961867332458496, + "learning_rate": 1.525380753208218e-05, + "loss": 1.4611, + "step": 113390 + }, + { + "epoch": 0.712745271345236, + "grad_norm": 7.57195520401001, + "learning_rate": 1.5253388431137526e-05, + "loss": 1.6686, + "step": 113400 + }, + { + "epoch": 0.7128081236619331, + "grad_norm": 5.611908912658691, 
+ "learning_rate": 1.525296933019287e-05, + "loss": 1.5144, + "step": 113410 + }, + { + "epoch": 0.7128709759786303, + "grad_norm": 5.720475673675537, + "learning_rate": 1.5252550229248217e-05, + "loss": 1.6377, + "step": 113420 + }, + { + "epoch": 0.7129338282953274, + "grad_norm": 6.286452770233154, + "learning_rate": 1.5252131128303564e-05, + "loss": 1.6228, + "step": 113430 + }, + { + "epoch": 0.7129966806120245, + "grad_norm": 5.332391738891602, + "learning_rate": 1.5251712027358911e-05, + "loss": 1.3138, + "step": 113440 + }, + { + "epoch": 0.7130595329287216, + "grad_norm": 7.41482400894165, + "learning_rate": 1.5251292926414258e-05, + "loss": 1.6605, + "step": 113450 + }, + { + "epoch": 0.7131223852454187, + "grad_norm": 6.277690887451172, + "learning_rate": 1.5250873825469604e-05, + "loss": 1.6301, + "step": 113460 + }, + { + "epoch": 0.7131852375621157, + "grad_norm": 6.387878894805908, + "learning_rate": 1.525045472452495e-05, + "loss": 1.4176, + "step": 113470 + }, + { + "epoch": 0.7132480898788128, + "grad_norm": 7.576438903808594, + "learning_rate": 1.5250035623580298e-05, + "loss": 1.6814, + "step": 113480 + }, + { + "epoch": 0.71331094219551, + "grad_norm": 6.340225696563721, + "learning_rate": 1.5249616522635643e-05, + "loss": 1.6373, + "step": 113490 + }, + { + "epoch": 0.7133737945122071, + "grad_norm": 5.660751819610596, + "learning_rate": 1.5249197421690989e-05, + "loss": 1.5514, + "step": 113500 + }, + { + "epoch": 0.7134366468289042, + "grad_norm": 7.049087047576904, + "learning_rate": 1.5248778320746336e-05, + "loss": 1.7007, + "step": 113510 + }, + { + "epoch": 0.7134994991456013, + "grad_norm": 7.369273662567139, + "learning_rate": 1.5248359219801683e-05, + "loss": 1.7085, + "step": 113520 + }, + { + "epoch": 0.7135623514622984, + "grad_norm": 6.360713005065918, + "learning_rate": 1.524794011885703e-05, + "loss": 1.5136, + "step": 113530 + }, + { + "epoch": 0.7136252037789955, + "grad_norm": 6.789496898651123, + "learning_rate": 
1.5247521017912377e-05, + "loss": 1.8079, + "step": 113540 + }, + { + "epoch": 0.7136880560956926, + "grad_norm": 6.46088171005249, + "learning_rate": 1.5247101916967722e-05, + "loss": 1.6976, + "step": 113550 + }, + { + "epoch": 0.7137509084123898, + "grad_norm": 5.728837490081787, + "learning_rate": 1.5246682816023068e-05, + "loss": 1.4974, + "step": 113560 + }, + { + "epoch": 0.7138137607290869, + "grad_norm": 7.043184757232666, + "learning_rate": 1.5246263715078415e-05, + "loss": 1.7299, + "step": 113570 + }, + { + "epoch": 0.713876613045784, + "grad_norm": 7.5252203941345215, + "learning_rate": 1.5245844614133762e-05, + "loss": 1.7266, + "step": 113580 + }, + { + "epoch": 0.7139394653624811, + "grad_norm": 6.560543060302734, + "learning_rate": 1.5245425513189107e-05, + "loss": 1.8253, + "step": 113590 + }, + { + "epoch": 0.7140023176791782, + "grad_norm": 6.411479473114014, + "learning_rate": 1.5245006412244454e-05, + "loss": 1.6435, + "step": 113600 + }, + { + "epoch": 0.7140651699958753, + "grad_norm": 6.163437843322754, + "learning_rate": 1.5244587311299801e-05, + "loss": 1.7005, + "step": 113610 + }, + { + "epoch": 0.7141280223125724, + "grad_norm": 7.1745381355285645, + "learning_rate": 1.5244168210355148e-05, + "loss": 1.5691, + "step": 113620 + }, + { + "epoch": 0.7141908746292696, + "grad_norm": 6.296079635620117, + "learning_rate": 1.5243749109410492e-05, + "loss": 1.6268, + "step": 113630 + }, + { + "epoch": 0.7142537269459667, + "grad_norm": 5.797415733337402, + "learning_rate": 1.5243330008465839e-05, + "loss": 1.6974, + "step": 113640 + }, + { + "epoch": 0.7143165792626638, + "grad_norm": 6.801391124725342, + "learning_rate": 1.5242910907521186e-05, + "loss": 1.4754, + "step": 113650 + }, + { + "epoch": 0.7143794315793609, + "grad_norm": 6.8657026290893555, + "learning_rate": 1.5242491806576533e-05, + "loss": 1.7169, + "step": 113660 + }, + { + "epoch": 0.714442283896058, + "grad_norm": 6.456601619720459, + "learning_rate": 1.524207270563188e-05, 
+ "loss": 1.618, + "step": 113670 + }, + { + "epoch": 0.7145051362127551, + "grad_norm": 6.933406829833984, + "learning_rate": 1.5241653604687226e-05, + "loss": 1.6887, + "step": 113680 + }, + { + "epoch": 0.7145679885294522, + "grad_norm": 6.51193380355835, + "learning_rate": 1.5241234503742573e-05, + "loss": 1.6549, + "step": 113690 + }, + { + "epoch": 0.7146308408461494, + "grad_norm": 7.263623237609863, + "learning_rate": 1.524081540279792e-05, + "loss": 1.3845, + "step": 113700 + }, + { + "epoch": 0.7146936931628465, + "grad_norm": 6.034303665161133, + "learning_rate": 1.5240396301853267e-05, + "loss": 1.7341, + "step": 113710 + }, + { + "epoch": 0.7147565454795435, + "grad_norm": 6.4354143142700195, + "learning_rate": 1.523997720090861e-05, + "loss": 1.4352, + "step": 113720 + }, + { + "epoch": 0.7148193977962406, + "grad_norm": 7.792895793914795, + "learning_rate": 1.5239558099963958e-05, + "loss": 1.5413, + "step": 113730 + }, + { + "epoch": 0.7148822501129377, + "grad_norm": 7.444430828094482, + "learning_rate": 1.5239138999019305e-05, + "loss": 1.5746, + "step": 113740 + }, + { + "epoch": 0.7149451024296348, + "grad_norm": 6.952875137329102, + "learning_rate": 1.5238719898074652e-05, + "loss": 1.6593, + "step": 113750 + }, + { + "epoch": 0.7150079547463319, + "grad_norm": 6.499113082885742, + "learning_rate": 1.5238300797129999e-05, + "loss": 1.7686, + "step": 113760 + }, + { + "epoch": 0.7150708070630291, + "grad_norm": 7.119709491729736, + "learning_rate": 1.5237881696185344e-05, + "loss": 1.6387, + "step": 113770 + }, + { + "epoch": 0.7151336593797262, + "grad_norm": 6.934809684753418, + "learning_rate": 1.5237462595240691e-05, + "loss": 1.5899, + "step": 113780 + }, + { + "epoch": 0.7151965116964233, + "grad_norm": 5.7645440101623535, + "learning_rate": 1.5237043494296038e-05, + "loss": 1.8011, + "step": 113790 + }, + { + "epoch": 0.7152593640131204, + "grad_norm": 6.047173500061035, + "learning_rate": 1.5236624393351385e-05, + "loss": 1.5378, + 
"step": 113800 + }, + { + "epoch": 0.7153222163298175, + "grad_norm": 5.505337715148926, + "learning_rate": 1.523620529240673e-05, + "loss": 1.7135, + "step": 113810 + }, + { + "epoch": 0.7153850686465146, + "grad_norm": 6.926814079284668, + "learning_rate": 1.5235786191462076e-05, + "loss": 1.9873, + "step": 113820 + }, + { + "epoch": 0.7154479209632117, + "grad_norm": 6.95925760269165, + "learning_rate": 1.5235367090517423e-05, + "loss": 1.8136, + "step": 113830 + }, + { + "epoch": 0.7155107732799089, + "grad_norm": 7.495611190795898, + "learning_rate": 1.523494798957277e-05, + "loss": 1.5423, + "step": 113840 + }, + { + "epoch": 0.715573625596606, + "grad_norm": 6.385137557983398, + "learning_rate": 1.5234528888628117e-05, + "loss": 1.7796, + "step": 113850 + }, + { + "epoch": 0.7156364779133031, + "grad_norm": 6.669556140899658, + "learning_rate": 1.5234109787683463e-05, + "loss": 1.5417, + "step": 113860 + }, + { + "epoch": 0.7156993302300002, + "grad_norm": 7.049135208129883, + "learning_rate": 1.5233690686738808e-05, + "loss": 1.4325, + "step": 113870 + }, + { + "epoch": 0.7157621825466973, + "grad_norm": 6.6187825202941895, + "learning_rate": 1.5233271585794155e-05, + "loss": 1.5089, + "step": 113880 + }, + { + "epoch": 0.7158250348633944, + "grad_norm": 5.775775909423828, + "learning_rate": 1.5232852484849502e-05, + "loss": 1.725, + "step": 113890 + }, + { + "epoch": 0.7158878871800916, + "grad_norm": 6.5200395584106445, + "learning_rate": 1.5232433383904848e-05, + "loss": 1.6679, + "step": 113900 + }, + { + "epoch": 0.7159507394967887, + "grad_norm": 6.957881927490234, + "learning_rate": 1.5232014282960195e-05, + "loss": 1.5921, + "step": 113910 + }, + { + "epoch": 0.7160135918134858, + "grad_norm": 6.852334976196289, + "learning_rate": 1.5231595182015542e-05, + "loss": 1.7087, + "step": 113920 + }, + { + "epoch": 0.7160764441301829, + "grad_norm": 6.447397708892822, + "learning_rate": 1.5231176081070889e-05, + "loss": 1.7099, + "step": 113930 + }, + { + 
"epoch": 0.71613929644688, + "grad_norm": 6.0002121925354, + "learning_rate": 1.5230756980126233e-05, + "loss": 1.845, + "step": 113940 + }, + { + "epoch": 0.7162021487635771, + "grad_norm": 6.524419784545898, + "learning_rate": 1.523033787918158e-05, + "loss": 1.6429, + "step": 113950 + }, + { + "epoch": 0.7162650010802742, + "grad_norm": 5.7728400230407715, + "learning_rate": 1.5229918778236927e-05, + "loss": 1.5089, + "step": 113960 + }, + { + "epoch": 0.7163278533969714, + "grad_norm": 7.010677814483643, + "learning_rate": 1.5229499677292274e-05, + "loss": 1.3965, + "step": 113970 + }, + { + "epoch": 0.7163907057136684, + "grad_norm": 6.472847938537598, + "learning_rate": 1.5229080576347621e-05, + "loss": 1.6774, + "step": 113980 + }, + { + "epoch": 0.7164535580303655, + "grad_norm": 6.118325233459473, + "learning_rate": 1.5228661475402966e-05, + "loss": 1.7889, + "step": 113990 + }, + { + "epoch": 0.7165164103470626, + "grad_norm": 6.4376678466796875, + "learning_rate": 1.5228242374458313e-05, + "loss": 1.515, + "step": 114000 + }, + { + "epoch": 0.7165792626637597, + "grad_norm": 6.637792587280273, + "learning_rate": 1.522782327351366e-05, + "loss": 1.7016, + "step": 114010 + }, + { + "epoch": 0.7166421149804568, + "grad_norm": 6.195915699005127, + "learning_rate": 1.5227404172569007e-05, + "loss": 1.7905, + "step": 114020 + }, + { + "epoch": 0.7167049672971539, + "grad_norm": 6.304981708526611, + "learning_rate": 1.5226985071624351e-05, + "loss": 1.7846, + "step": 114030 + }, + { + "epoch": 0.716767819613851, + "grad_norm": 7.737209320068359, + "learning_rate": 1.5226565970679698e-05, + "loss": 1.3662, + "step": 114040 + }, + { + "epoch": 0.7168306719305482, + "grad_norm": 6.3326497077941895, + "learning_rate": 1.5226146869735045e-05, + "loss": 1.5719, + "step": 114050 + }, + { + "epoch": 0.7168935242472453, + "grad_norm": 6.7617669105529785, + "learning_rate": 1.5225727768790392e-05, + "loss": 1.4707, + "step": 114060 + }, + { + "epoch": 0.7169563765639424, 
+ "grad_norm": 5.8286237716674805, + "learning_rate": 1.522530866784574e-05, + "loss": 1.3576, + "step": 114070 + }, + { + "epoch": 0.7170192288806395, + "grad_norm": 7.066000938415527, + "learning_rate": 1.5224889566901085e-05, + "loss": 1.7826, + "step": 114080 + }, + { + "epoch": 0.7170820811973366, + "grad_norm": 6.6880717277526855, + "learning_rate": 1.5224470465956432e-05, + "loss": 1.4572, + "step": 114090 + }, + { + "epoch": 0.7171449335140337, + "grad_norm": 7.817972660064697, + "learning_rate": 1.5224051365011779e-05, + "loss": 1.6042, + "step": 114100 + }, + { + "epoch": 0.7172077858307309, + "grad_norm": 6.40554141998291, + "learning_rate": 1.5223632264067126e-05, + "loss": 1.5907, + "step": 114110 + }, + { + "epoch": 0.717270638147428, + "grad_norm": 7.366568088531494, + "learning_rate": 1.522321316312247e-05, + "loss": 1.6764, + "step": 114120 + }, + { + "epoch": 0.7173334904641251, + "grad_norm": 7.0088725090026855, + "learning_rate": 1.5222794062177817e-05, + "loss": 1.7399, + "step": 114130 + }, + { + "epoch": 0.7173963427808222, + "grad_norm": 6.872783184051514, + "learning_rate": 1.5222374961233164e-05, + "loss": 1.5902, + "step": 114140 + }, + { + "epoch": 0.7174591950975193, + "grad_norm": 6.1382341384887695, + "learning_rate": 1.5221955860288511e-05, + "loss": 1.6572, + "step": 114150 + }, + { + "epoch": 0.7175220474142164, + "grad_norm": 6.142192363739014, + "learning_rate": 1.5221536759343858e-05, + "loss": 1.4925, + "step": 114160 + }, + { + "epoch": 0.7175848997309136, + "grad_norm": 5.902562618255615, + "learning_rate": 1.5221117658399203e-05, + "loss": 1.5269, + "step": 114170 + }, + { + "epoch": 0.7176477520476107, + "grad_norm": 6.210169315338135, + "learning_rate": 1.5220698557454549e-05, + "loss": 1.6386, + "step": 114180 + }, + { + "epoch": 0.7177106043643078, + "grad_norm": 6.398752212524414, + "learning_rate": 1.5220279456509896e-05, + "loss": 1.5583, + "step": 114190 + }, + { + "epoch": 0.7177734566810049, + "grad_norm": 
6.617867469787598, + "learning_rate": 1.5219860355565243e-05, + "loss": 1.6815, + "step": 114200 + }, + { + "epoch": 0.717836308997702, + "grad_norm": 5.834376811981201, + "learning_rate": 1.5219441254620588e-05, + "loss": 1.4698, + "step": 114210 + }, + { + "epoch": 0.7178991613143991, + "grad_norm": 6.115474700927734, + "learning_rate": 1.5219022153675935e-05, + "loss": 1.7216, + "step": 114220 + }, + { + "epoch": 0.7179620136310961, + "grad_norm": 6.723456859588623, + "learning_rate": 1.5218603052731282e-05, + "loss": 1.6805, + "step": 114230 + }, + { + "epoch": 0.7180248659477932, + "grad_norm": 5.87972354888916, + "learning_rate": 1.521818395178663e-05, + "loss": 1.6633, + "step": 114240 + }, + { + "epoch": 0.7180877182644904, + "grad_norm": 6.3831634521484375, + "learning_rate": 1.5217764850841973e-05, + "loss": 1.732, + "step": 114250 + }, + { + "epoch": 0.7181505705811875, + "grad_norm": 6.4843974113464355, + "learning_rate": 1.521734574989732e-05, + "loss": 1.7449, + "step": 114260 + }, + { + "epoch": 0.7182134228978846, + "grad_norm": 6.328886985778809, + "learning_rate": 1.5216926648952667e-05, + "loss": 1.7362, + "step": 114270 + }, + { + "epoch": 0.7182762752145817, + "grad_norm": 5.957115650177002, + "learning_rate": 1.5216507548008014e-05, + "loss": 1.6076, + "step": 114280 + }, + { + "epoch": 0.7183391275312788, + "grad_norm": 5.09821891784668, + "learning_rate": 1.5216088447063361e-05, + "loss": 1.6868, + "step": 114290 + }, + { + "epoch": 0.7184019798479759, + "grad_norm": 6.222084045410156, + "learning_rate": 1.5215669346118707e-05, + "loss": 1.5835, + "step": 114300 + }, + { + "epoch": 0.718464832164673, + "grad_norm": 7.457278728485107, + "learning_rate": 1.5215250245174054e-05, + "loss": 1.4989, + "step": 114310 + }, + { + "epoch": 0.7185276844813702, + "grad_norm": 6.377392768859863, + "learning_rate": 1.5214831144229401e-05, + "loss": 1.6346, + "step": 114320 + }, + { + "epoch": 0.7185905367980673, + "grad_norm": 7.023126125335693, + 
"learning_rate": 1.5214412043284748e-05, + "loss": 1.4804, + "step": 114330 + }, + { + "epoch": 0.7186533891147644, + "grad_norm": 6.258632183074951, + "learning_rate": 1.5213992942340092e-05, + "loss": 1.8114, + "step": 114340 + }, + { + "epoch": 0.7187162414314615, + "grad_norm": 6.734646797180176, + "learning_rate": 1.5213573841395439e-05, + "loss": 1.5873, + "step": 114350 + }, + { + "epoch": 0.7187790937481586, + "grad_norm": 6.044432640075684, + "learning_rate": 1.5213154740450786e-05, + "loss": 1.7029, + "step": 114360 + }, + { + "epoch": 0.7188419460648557, + "grad_norm": 6.947794437408447, + "learning_rate": 1.5212735639506133e-05, + "loss": 1.6726, + "step": 114370 + }, + { + "epoch": 0.7189047983815529, + "grad_norm": 6.909889221191406, + "learning_rate": 1.521231653856148e-05, + "loss": 1.6194, + "step": 114380 + }, + { + "epoch": 0.71896765069825, + "grad_norm": 6.474206447601318, + "learning_rate": 1.5211897437616825e-05, + "loss": 1.5683, + "step": 114390 + }, + { + "epoch": 0.7190305030149471, + "grad_norm": 6.785498142242432, + "learning_rate": 1.5211478336672172e-05, + "loss": 1.8991, + "step": 114400 + }, + { + "epoch": 0.7190933553316442, + "grad_norm": 6.298401832580566, + "learning_rate": 1.521105923572752e-05, + "loss": 1.61, + "step": 114410 + }, + { + "epoch": 0.7191562076483413, + "grad_norm": 5.573666095733643, + "learning_rate": 1.5210640134782867e-05, + "loss": 1.6632, + "step": 114420 + }, + { + "epoch": 0.7192190599650384, + "grad_norm": 7.085525035858154, + "learning_rate": 1.521022103383821e-05, + "loss": 1.5195, + "step": 114430 + }, + { + "epoch": 0.7192819122817355, + "grad_norm": 6.482182025909424, + "learning_rate": 1.5209801932893557e-05, + "loss": 1.6174, + "step": 114440 + }, + { + "epoch": 0.7193447645984327, + "grad_norm": 6.55352783203125, + "learning_rate": 1.5209382831948904e-05, + "loss": 1.5703, + "step": 114450 + }, + { + "epoch": 0.7194076169151298, + "grad_norm": 7.438021659851074, + "learning_rate": 
1.5208963731004252e-05, + "loss": 1.8293, + "step": 114460 + }, + { + "epoch": 0.7194704692318269, + "grad_norm": 6.282613277435303, + "learning_rate": 1.5208544630059599e-05, + "loss": 1.6461, + "step": 114470 + }, + { + "epoch": 0.719533321548524, + "grad_norm": 7.057091236114502, + "learning_rate": 1.5208125529114944e-05, + "loss": 1.5788, + "step": 114480 + }, + { + "epoch": 0.719596173865221, + "grad_norm": 6.355682373046875, + "learning_rate": 1.5207706428170291e-05, + "loss": 1.7209, + "step": 114490 + }, + { + "epoch": 0.7196590261819181, + "grad_norm": 6.700733184814453, + "learning_rate": 1.5207287327225636e-05, + "loss": 1.4957, + "step": 114500 + }, + { + "epoch": 0.7197218784986152, + "grad_norm": 6.967093467712402, + "learning_rate": 1.5206868226280983e-05, + "loss": 1.5567, + "step": 114510 + }, + { + "epoch": 0.7197847308153124, + "grad_norm": 5.767993927001953, + "learning_rate": 1.5206449125336329e-05, + "loss": 1.6891, + "step": 114520 + }, + { + "epoch": 0.7198475831320095, + "grad_norm": 5.1797895431518555, + "learning_rate": 1.5206030024391676e-05, + "loss": 1.3634, + "step": 114530 + }, + { + "epoch": 0.7199104354487066, + "grad_norm": 6.135411739349365, + "learning_rate": 1.5205610923447023e-05, + "loss": 1.7077, + "step": 114540 + }, + { + "epoch": 0.7199732877654037, + "grad_norm": 7.227076530456543, + "learning_rate": 1.520519182250237e-05, + "loss": 1.7955, + "step": 114550 + }, + { + "epoch": 0.7200361400821008, + "grad_norm": 5.616551399230957, + "learning_rate": 1.520481463165218e-05, + "loss": 1.7635, + "step": 114560 + }, + { + "epoch": 0.7200989923987979, + "grad_norm": 5.438570022583008, + "learning_rate": 1.5204395530707527e-05, + "loss": 1.4608, + "step": 114570 + }, + { + "epoch": 0.720161844715495, + "grad_norm": 6.289920330047607, + "learning_rate": 1.5203976429762874e-05, + "loss": 1.3727, + "step": 114580 + }, + { + "epoch": 0.7202246970321922, + "grad_norm": 6.528743743896484, + "learning_rate": 1.5203557328818221e-05, + 
"loss": 1.6865, + "step": 114590 + }, + { + "epoch": 0.7202875493488893, + "grad_norm": 6.531278133392334, + "learning_rate": 1.5203138227873566e-05, + "loss": 1.639, + "step": 114600 + }, + { + "epoch": 0.7203504016655864, + "grad_norm": 7.219261646270752, + "learning_rate": 1.5202719126928913e-05, + "loss": 1.7829, + "step": 114610 + }, + { + "epoch": 0.7204132539822835, + "grad_norm": 7.336671352386475, + "learning_rate": 1.520230002598426e-05, + "loss": 1.3718, + "step": 114620 + }, + { + "epoch": 0.7204761062989806, + "grad_norm": 6.335206985473633, + "learning_rate": 1.5201880925039608e-05, + "loss": 1.5391, + "step": 114630 + }, + { + "epoch": 0.7205389586156777, + "grad_norm": 7.5218505859375, + "learning_rate": 1.5201461824094951e-05, + "loss": 1.5025, + "step": 114640 + }, + { + "epoch": 0.7206018109323749, + "grad_norm": 6.53808069229126, + "learning_rate": 1.5201042723150298e-05, + "loss": 1.5093, + "step": 114650 + }, + { + "epoch": 0.720664663249072, + "grad_norm": 7.6231303215026855, + "learning_rate": 1.5200623622205645e-05, + "loss": 1.6067, + "step": 114660 + }, + { + "epoch": 0.7207275155657691, + "grad_norm": 6.8570637702941895, + "learning_rate": 1.5200204521260992e-05, + "loss": 1.7287, + "step": 114670 + }, + { + "epoch": 0.7207903678824662, + "grad_norm": 7.506631374359131, + "learning_rate": 1.519978542031634e-05, + "loss": 1.5974, + "step": 114680 + }, + { + "epoch": 0.7208532201991633, + "grad_norm": 7.381155490875244, + "learning_rate": 1.5199366319371685e-05, + "loss": 1.7563, + "step": 114690 + }, + { + "epoch": 0.7209160725158604, + "grad_norm": 6.632617473602295, + "learning_rate": 1.5198947218427032e-05, + "loss": 1.5796, + "step": 114700 + }, + { + "epoch": 0.7209789248325575, + "grad_norm": 6.989343166351318, + "learning_rate": 1.5198528117482379e-05, + "loss": 1.6069, + "step": 114710 + }, + { + "epoch": 0.7210417771492547, + "grad_norm": 7.608463287353516, + "learning_rate": 1.5198109016537726e-05, + "loss": 1.6091, + "step": 
114720 + }, + { + "epoch": 0.7211046294659518, + "grad_norm": 6.86716365814209, + "learning_rate": 1.519768991559307e-05, + "loss": 1.4456, + "step": 114730 + }, + { + "epoch": 0.7211674817826488, + "grad_norm": 6.527399063110352, + "learning_rate": 1.5197270814648417e-05, + "loss": 1.6957, + "step": 114740 + }, + { + "epoch": 0.7212303340993459, + "grad_norm": 8.976424217224121, + "learning_rate": 1.5196851713703764e-05, + "loss": 1.557, + "step": 114750 + }, + { + "epoch": 0.721293186416043, + "grad_norm": 7.675748348236084, + "learning_rate": 1.5196432612759111e-05, + "loss": 1.8114, + "step": 114760 + }, + { + "epoch": 0.7213560387327401, + "grad_norm": 7.14664363861084, + "learning_rate": 1.5196013511814456e-05, + "loss": 1.6201, + "step": 114770 + }, + { + "epoch": 0.7214188910494372, + "grad_norm": 6.664819240570068, + "learning_rate": 1.5195594410869803e-05, + "loss": 1.7189, + "step": 114780 + }, + { + "epoch": 0.7214817433661344, + "grad_norm": 8.908856391906738, + "learning_rate": 1.519517530992515e-05, + "loss": 1.7485, + "step": 114790 + }, + { + "epoch": 0.7215445956828315, + "grad_norm": 5.326156139373779, + "learning_rate": 1.5194756208980498e-05, + "loss": 1.4227, + "step": 114800 + }, + { + "epoch": 0.7216074479995286, + "grad_norm": 7.644719123840332, + "learning_rate": 1.5194337108035843e-05, + "loss": 1.7497, + "step": 114810 + }, + { + "epoch": 0.7216703003162257, + "grad_norm": 5.888164520263672, + "learning_rate": 1.5193918007091188e-05, + "loss": 1.8029, + "step": 114820 + }, + { + "epoch": 0.7217331526329228, + "grad_norm": 5.382526874542236, + "learning_rate": 1.5193498906146535e-05, + "loss": 1.5768, + "step": 114830 + }, + { + "epoch": 0.7217960049496199, + "grad_norm": 7.742189407348633, + "learning_rate": 1.5193079805201882e-05, + "loss": 1.5128, + "step": 114840 + }, + { + "epoch": 0.721858857266317, + "grad_norm": 7.856515884399414, + "learning_rate": 1.519266070425723e-05, + "loss": 1.5368, + "step": 114850 + }, + { + "epoch": 
0.7219217095830142, + "grad_norm": 7.214084148406982, + "learning_rate": 1.5192241603312575e-05, + "loss": 1.6149, + "step": 114860 + }, + { + "epoch": 0.7219845618997113, + "grad_norm": 5.069043159484863, + "learning_rate": 1.5191822502367922e-05, + "loss": 1.4039, + "step": 114870 + }, + { + "epoch": 0.7220474142164084, + "grad_norm": 6.424645900726318, + "learning_rate": 1.5191403401423267e-05, + "loss": 1.639, + "step": 114880 + }, + { + "epoch": 0.7221102665331055, + "grad_norm": 7.57219123840332, + "learning_rate": 1.5190984300478614e-05, + "loss": 1.7926, + "step": 114890 + }, + { + "epoch": 0.7221731188498026, + "grad_norm": 6.031521320343018, + "learning_rate": 1.5190565199533961e-05, + "loss": 1.784, + "step": 114900 + }, + { + "epoch": 0.7222359711664997, + "grad_norm": 6.382801532745361, + "learning_rate": 1.5190146098589307e-05, + "loss": 1.6667, + "step": 114910 + }, + { + "epoch": 0.7222988234831969, + "grad_norm": 6.570066452026367, + "learning_rate": 1.5189726997644654e-05, + "loss": 1.7855, + "step": 114920 + }, + { + "epoch": 0.722361675799894, + "grad_norm": 6.704894065856934, + "learning_rate": 1.5189307896700001e-05, + "loss": 1.4182, + "step": 114930 + }, + { + "epoch": 0.7224245281165911, + "grad_norm": 6.024482250213623, + "learning_rate": 1.5188888795755348e-05, + "loss": 1.5817, + "step": 114940 + }, + { + "epoch": 0.7224873804332882, + "grad_norm": 8.764252662658691, + "learning_rate": 1.5188469694810692e-05, + "loss": 1.7421, + "step": 114950 + }, + { + "epoch": 0.7225502327499853, + "grad_norm": 6.126072883605957, + "learning_rate": 1.5188050593866039e-05, + "loss": 1.6377, + "step": 114960 + }, + { + "epoch": 0.7226130850666824, + "grad_norm": 5.632742404937744, + "learning_rate": 1.5187631492921386e-05, + "loss": 1.692, + "step": 114970 + }, + { + "epoch": 0.7226759373833795, + "grad_norm": 5.550741672515869, + "learning_rate": 1.5187212391976733e-05, + "loss": 1.4384, + "step": 114980 + }, + { + "epoch": 0.7227387897000767, + 
"grad_norm": 6.033601760864258, + "learning_rate": 1.518679329103208e-05, + "loss": 1.7988, + "step": 114990 + }, + { + "epoch": 0.7228016420167737, + "grad_norm": 6.537342548370361, + "learning_rate": 1.5186374190087425e-05, + "loss": 1.64, + "step": 115000 + }, + { + "epoch": 0.7228644943334708, + "grad_norm": 6.672082424163818, + "learning_rate": 1.5185955089142772e-05, + "loss": 1.2782, + "step": 115010 + }, + { + "epoch": 0.7229273466501679, + "grad_norm": 6.4160075187683105, + "learning_rate": 1.518553598819812e-05, + "loss": 1.5656, + "step": 115020 + }, + { + "epoch": 0.722990198966865, + "grad_norm": 6.561112403869629, + "learning_rate": 1.5185116887253467e-05, + "loss": 1.8092, + "step": 115030 + }, + { + "epoch": 0.7230530512835621, + "grad_norm": 6.481167316436768, + "learning_rate": 1.518469778630881e-05, + "loss": 1.7376, + "step": 115040 + }, + { + "epoch": 0.7231159036002592, + "grad_norm": 5.722866058349609, + "learning_rate": 1.5184278685364157e-05, + "loss": 1.431, + "step": 115050 + }, + { + "epoch": 0.7231787559169564, + "grad_norm": 6.623687267303467, + "learning_rate": 1.5183859584419504e-05, + "loss": 1.5459, + "step": 115060 + }, + { + "epoch": 0.7232416082336535, + "grad_norm": 6.44396448135376, + "learning_rate": 1.5183440483474852e-05, + "loss": 1.4087, + "step": 115070 + }, + { + "epoch": 0.7233044605503506, + "grad_norm": 5.95437479019165, + "learning_rate": 1.5183021382530197e-05, + "loss": 1.6956, + "step": 115080 + }, + { + "epoch": 0.7233673128670477, + "grad_norm": 5.641931056976318, + "learning_rate": 1.5182602281585544e-05, + "loss": 1.452, + "step": 115090 + }, + { + "epoch": 0.7234301651837448, + "grad_norm": 7.695308208465576, + "learning_rate": 1.5182183180640891e-05, + "loss": 1.5653, + "step": 115100 + }, + { + "epoch": 0.7234930175004419, + "grad_norm": 6.782965660095215, + "learning_rate": 1.5181764079696238e-05, + "loss": 1.5719, + "step": 115110 + }, + { + "epoch": 0.723555869817139, + "grad_norm": 6.327223300933838, + 
"learning_rate": 1.5181344978751583e-05, + "loss": 1.5051, + "step": 115120 + }, + { + "epoch": 0.7236187221338362, + "grad_norm": 6.862222194671631, + "learning_rate": 1.5180925877806929e-05, + "loss": 1.7584, + "step": 115130 + }, + { + "epoch": 0.7236815744505333, + "grad_norm": 6.9344635009765625, + "learning_rate": 1.5180506776862276e-05, + "loss": 1.7514, + "step": 115140 + }, + { + "epoch": 0.7237444267672304, + "grad_norm": 6.590836048126221, + "learning_rate": 1.5180087675917623e-05, + "loss": 1.5174, + "step": 115150 + }, + { + "epoch": 0.7238072790839275, + "grad_norm": 6.448301792144775, + "learning_rate": 1.517966857497297e-05, + "loss": 1.7572, + "step": 115160 + }, + { + "epoch": 0.7238701314006246, + "grad_norm": 7.644365310668945, + "learning_rate": 1.5179249474028315e-05, + "loss": 1.4495, + "step": 115170 + }, + { + "epoch": 0.7239329837173217, + "grad_norm": 7.2462663650512695, + "learning_rate": 1.5178830373083663e-05, + "loss": 1.7079, + "step": 115180 + }, + { + "epoch": 0.7239958360340188, + "grad_norm": 6.857507228851318, + "learning_rate": 1.5178411272139008e-05, + "loss": 1.5334, + "step": 115190 + }, + { + "epoch": 0.724058688350716, + "grad_norm": 7.825425624847412, + "learning_rate": 1.5177992171194355e-05, + "loss": 1.486, + "step": 115200 + }, + { + "epoch": 0.7241215406674131, + "grad_norm": 7.303027629852295, + "learning_rate": 1.5177573070249702e-05, + "loss": 1.6061, + "step": 115210 + }, + { + "epoch": 0.7241843929841102, + "grad_norm": 6.9621262550354, + "learning_rate": 1.5177153969305047e-05, + "loss": 1.8309, + "step": 115220 + }, + { + "epoch": 0.7242472453008073, + "grad_norm": 5.888366222381592, + "learning_rate": 1.5176734868360394e-05, + "loss": 1.5802, + "step": 115230 + }, + { + "epoch": 0.7243100976175044, + "grad_norm": 7.329326629638672, + "learning_rate": 1.5176315767415742e-05, + "loss": 1.7735, + "step": 115240 + }, + { + "epoch": 0.7243729499342014, + "grad_norm": 7.154123306274414, + "learning_rate": 
1.5175896666471089e-05, + "loss": 1.6783, + "step": 115250 + }, + { + "epoch": 0.7244358022508985, + "grad_norm": 6.366217613220215, + "learning_rate": 1.5175477565526432e-05, + "loss": 1.6962, + "step": 115260 + }, + { + "epoch": 0.7244986545675957, + "grad_norm": 7.0050177574157715, + "learning_rate": 1.517505846458178e-05, + "loss": 1.6533, + "step": 115270 + }, + { + "epoch": 0.7245615068842928, + "grad_norm": 6.864089488983154, + "learning_rate": 1.5174639363637126e-05, + "loss": 1.6909, + "step": 115280 + }, + { + "epoch": 0.7246243592009899, + "grad_norm": 6.635605335235596, + "learning_rate": 1.5174220262692474e-05, + "loss": 1.4402, + "step": 115290 + }, + { + "epoch": 0.724687211517687, + "grad_norm": 6.629925727844238, + "learning_rate": 1.517380116174782e-05, + "loss": 1.5788, + "step": 115300 + }, + { + "epoch": 0.7247500638343841, + "grad_norm": 6.296230316162109, + "learning_rate": 1.5173382060803166e-05, + "loss": 1.5403, + "step": 115310 + }, + { + "epoch": 0.7248129161510812, + "grad_norm": 6.700946807861328, + "learning_rate": 1.5172962959858513e-05, + "loss": 1.8149, + "step": 115320 + }, + { + "epoch": 0.7248757684677783, + "grad_norm": 6.950066566467285, + "learning_rate": 1.517254385891386e-05, + "loss": 1.498, + "step": 115330 + }, + { + "epoch": 0.7249386207844755, + "grad_norm": 6.15997314453125, + "learning_rate": 1.5172124757969207e-05, + "loss": 1.4936, + "step": 115340 + }, + { + "epoch": 0.7250014731011726, + "grad_norm": 6.174920082092285, + "learning_rate": 1.5171705657024551e-05, + "loss": 1.9595, + "step": 115350 + }, + { + "epoch": 0.7250643254178697, + "grad_norm": 6.02477502822876, + "learning_rate": 1.5171286556079898e-05, + "loss": 1.5151, + "step": 115360 + }, + { + "epoch": 0.7251271777345668, + "grad_norm": 7.505553245544434, + "learning_rate": 1.5170867455135245e-05, + "loss": 1.6931, + "step": 115370 + }, + { + "epoch": 0.7251900300512639, + "grad_norm": 6.075516700744629, + "learning_rate": 1.5170448354190592e-05, + 
"loss": 1.5881, + "step": 115380 + }, + { + "epoch": 0.725252882367961, + "grad_norm": 7.066200256347656, + "learning_rate": 1.5170029253245937e-05, + "loss": 1.6859, + "step": 115390 + }, + { + "epoch": 0.7253157346846582, + "grad_norm": 6.309739112854004, + "learning_rate": 1.5169610152301285e-05, + "loss": 1.67, + "step": 115400 + }, + { + "epoch": 0.7253785870013553, + "grad_norm": 6.822290420532227, + "learning_rate": 1.5169191051356632e-05, + "loss": 1.6519, + "step": 115410 + }, + { + "epoch": 0.7254414393180524, + "grad_norm": 6.343250274658203, + "learning_rate": 1.5168771950411979e-05, + "loss": 1.725, + "step": 115420 + }, + { + "epoch": 0.7255042916347495, + "grad_norm": 6.2186689376831055, + "learning_rate": 1.5168352849467326e-05, + "loss": 1.5672, + "step": 115430 + }, + { + "epoch": 0.7255671439514466, + "grad_norm": 6.9470367431640625, + "learning_rate": 1.516793374852267e-05, + "loss": 1.6098, + "step": 115440 + }, + { + "epoch": 0.7256299962681437, + "grad_norm": 6.316976070404053, + "learning_rate": 1.5167514647578016e-05, + "loss": 1.4436, + "step": 115450 + }, + { + "epoch": 0.7256928485848408, + "grad_norm": 6.690842628479004, + "learning_rate": 1.5167095546633364e-05, + "loss": 1.5943, + "step": 115460 + }, + { + "epoch": 0.725755700901538, + "grad_norm": 7.384859085083008, + "learning_rate": 1.516667644568871e-05, + "loss": 1.6767, + "step": 115470 + }, + { + "epoch": 0.7258185532182351, + "grad_norm": 5.442703723907471, + "learning_rate": 1.5166257344744056e-05, + "loss": 1.4782, + "step": 115480 + }, + { + "epoch": 0.7258814055349322, + "grad_norm": 7.1188130378723145, + "learning_rate": 1.5165838243799403e-05, + "loss": 1.7413, + "step": 115490 + }, + { + "epoch": 0.7259442578516293, + "grad_norm": 6.402884483337402, + "learning_rate": 1.5165419142854748e-05, + "loss": 1.4255, + "step": 115500 + }, + { + "epoch": 0.7260071101683263, + "grad_norm": 6.383247375488281, + "learning_rate": 1.5165000041910096e-05, + "loss": 1.8272, + "step": 
115510 + }, + { + "epoch": 0.7260699624850234, + "grad_norm": 5.225858211517334, + "learning_rate": 1.5164580940965443e-05, + "loss": 1.489, + "step": 115520 + }, + { + "epoch": 0.7261328148017205, + "grad_norm": 7.787111759185791, + "learning_rate": 1.5164161840020788e-05, + "loss": 1.6115, + "step": 115530 + }, + { + "epoch": 0.7261956671184177, + "grad_norm": 6.8042802810668945, + "learning_rate": 1.5163742739076135e-05, + "loss": 1.7215, + "step": 115540 + }, + { + "epoch": 0.7262585194351148, + "grad_norm": 7.336939811706543, + "learning_rate": 1.5163323638131482e-05, + "loss": 1.6679, + "step": 115550 + }, + { + "epoch": 0.7263213717518119, + "grad_norm": 5.344560146331787, + "learning_rate": 1.516290453718683e-05, + "loss": 1.6706, + "step": 115560 + }, + { + "epoch": 0.726384224068509, + "grad_norm": 7.378252029418945, + "learning_rate": 1.5162485436242173e-05, + "loss": 1.6384, + "step": 115570 + }, + { + "epoch": 0.7264470763852061, + "grad_norm": 5.6592254638671875, + "learning_rate": 1.516206633529752e-05, + "loss": 1.7509, + "step": 115580 + }, + { + "epoch": 0.7265099287019032, + "grad_norm": 6.7056403160095215, + "learning_rate": 1.5161647234352867e-05, + "loss": 1.6448, + "step": 115590 + }, + { + "epoch": 0.7265727810186003, + "grad_norm": 7.759737014770508, + "learning_rate": 1.5161228133408214e-05, + "loss": 1.4573, + "step": 115600 + }, + { + "epoch": 0.7266356333352975, + "grad_norm": 7.28309965133667, + "learning_rate": 1.5160809032463561e-05, + "loss": 1.6218, + "step": 115610 + }, + { + "epoch": 0.7266984856519946, + "grad_norm": 6.890913963317871, + "learning_rate": 1.5160389931518907e-05, + "loss": 1.8592, + "step": 115620 + }, + { + "epoch": 0.7267613379686917, + "grad_norm": 7.0204362869262695, + "learning_rate": 1.5159970830574254e-05, + "loss": 1.8612, + "step": 115630 + }, + { + "epoch": 0.7268241902853888, + "grad_norm": 5.921641826629639, + "learning_rate": 1.51595517296296e-05, + "loss": 1.5616, + "step": 115640 + }, + { + "epoch": 
0.7268870426020859, + "grad_norm": 6.536056041717529, + "learning_rate": 1.5159132628684948e-05, + "loss": 1.6371, + "step": 115650 + }, + { + "epoch": 0.726949894918783, + "grad_norm": 6.2884063720703125, + "learning_rate": 1.5158713527740291e-05, + "loss": 1.8226, + "step": 115660 + }, + { + "epoch": 0.7270127472354802, + "grad_norm": 5.8661699295043945, + "learning_rate": 1.5158294426795638e-05, + "loss": 1.4365, + "step": 115670 + }, + { + "epoch": 0.7270755995521773, + "grad_norm": 7.0693254470825195, + "learning_rate": 1.5157875325850986e-05, + "loss": 1.6888, + "step": 115680 + }, + { + "epoch": 0.7271384518688744, + "grad_norm": 6.45350980758667, + "learning_rate": 1.5157456224906333e-05, + "loss": 1.6169, + "step": 115690 + }, + { + "epoch": 0.7272013041855715, + "grad_norm": 5.505177021026611, + "learning_rate": 1.5157037123961678e-05, + "loss": 1.4611, + "step": 115700 + }, + { + "epoch": 0.7272641565022686, + "grad_norm": 6.742366313934326, + "learning_rate": 1.5156618023017025e-05, + "loss": 1.6813, + "step": 115710 + }, + { + "epoch": 0.7273270088189657, + "grad_norm": 6.366575717926025, + "learning_rate": 1.5156198922072372e-05, + "loss": 1.5547, + "step": 115720 + }, + { + "epoch": 0.7273898611356628, + "grad_norm": 5.64211368560791, + "learning_rate": 1.515577982112772e-05, + "loss": 1.4973, + "step": 115730 + }, + { + "epoch": 0.72745271345236, + "grad_norm": 6.600712299346924, + "learning_rate": 1.5155360720183066e-05, + "loss": 1.5888, + "step": 115740 + }, + { + "epoch": 0.7275155657690571, + "grad_norm": 6.504429817199707, + "learning_rate": 1.515494161923841e-05, + "loss": 1.6392, + "step": 115750 + }, + { + "epoch": 0.7275784180857541, + "grad_norm": 5.690182685852051, + "learning_rate": 1.5154522518293757e-05, + "loss": 1.4292, + "step": 115760 + }, + { + "epoch": 0.7276412704024512, + "grad_norm": 7.890905380249023, + "learning_rate": 1.5154103417349104e-05, + "loss": 1.8914, + "step": 115770 + }, + { + "epoch": 0.7277041227191483, + 
"grad_norm": 5.442069053649902, + "learning_rate": 1.5153684316404451e-05, + "loss": 1.4884, + "step": 115780 + }, + { + "epoch": 0.7277669750358454, + "grad_norm": 7.099849700927734, + "learning_rate": 1.5153265215459797e-05, + "loss": 1.7281, + "step": 115790 + }, + { + "epoch": 0.7278298273525425, + "grad_norm": 7.283675670623779, + "learning_rate": 1.5152846114515144e-05, + "loss": 1.6031, + "step": 115800 + }, + { + "epoch": 0.7278926796692397, + "grad_norm": 5.814444541931152, + "learning_rate": 1.5152427013570489e-05, + "loss": 1.5404, + "step": 115810 + }, + { + "epoch": 0.7279555319859368, + "grad_norm": 6.222223281860352, + "learning_rate": 1.5152007912625836e-05, + "loss": 1.4462, + "step": 115820 + }, + { + "epoch": 0.7280183843026339, + "grad_norm": 8.399040222167969, + "learning_rate": 1.5151588811681183e-05, + "loss": 1.7329, + "step": 115830 + }, + { + "epoch": 0.728081236619331, + "grad_norm": 7.937182903289795, + "learning_rate": 1.5151169710736529e-05, + "loss": 1.581, + "step": 115840 + }, + { + "epoch": 0.7281440889360281, + "grad_norm": 6.623231410980225, + "learning_rate": 1.5150750609791876e-05, + "loss": 1.7026, + "step": 115850 + }, + { + "epoch": 0.7282069412527252, + "grad_norm": 6.111772060394287, + "learning_rate": 1.5150331508847223e-05, + "loss": 1.4106, + "step": 115860 + }, + { + "epoch": 0.7282697935694223, + "grad_norm": 5.90593147277832, + "learning_rate": 1.514991240790257e-05, + "loss": 1.7615, + "step": 115870 + }, + { + "epoch": 0.7283326458861195, + "grad_norm": 5.499425888061523, + "learning_rate": 1.5149493306957913e-05, + "loss": 1.7468, + "step": 115880 + }, + { + "epoch": 0.7283954982028166, + "grad_norm": 6.455041408538818, + "learning_rate": 1.514907420601326e-05, + "loss": 1.7094, + "step": 115890 + }, + { + "epoch": 0.7284583505195137, + "grad_norm": 6.752415657043457, + "learning_rate": 1.5148655105068608e-05, + "loss": 1.6646, + "step": 115900 + }, + { + "epoch": 0.7285212028362108, + "grad_norm": 
7.045289039611816, + "learning_rate": 1.5148236004123955e-05, + "loss": 1.8899, + "step": 115910 + }, + { + "epoch": 0.7285840551529079, + "grad_norm": 7.456486225128174, + "learning_rate": 1.5147816903179302e-05, + "loss": 1.7274, + "step": 115920 + }, + { + "epoch": 0.728646907469605, + "grad_norm": 5.679928779602051, + "learning_rate": 1.5147397802234647e-05, + "loss": 1.7484, + "step": 115930 + }, + { + "epoch": 0.7287097597863021, + "grad_norm": 6.834784984588623, + "learning_rate": 1.5146978701289994e-05, + "loss": 1.5582, + "step": 115940 + }, + { + "epoch": 0.7287726121029993, + "grad_norm": 6.277487277984619, + "learning_rate": 1.5146559600345341e-05, + "loss": 1.6715, + "step": 115950 + }, + { + "epoch": 0.7288354644196964, + "grad_norm": 7.477523326873779, + "learning_rate": 1.5146140499400688e-05, + "loss": 1.559, + "step": 115960 + }, + { + "epoch": 0.7288983167363935, + "grad_norm": 6.518509864807129, + "learning_rate": 1.5145721398456032e-05, + "loss": 1.4773, + "step": 115970 + }, + { + "epoch": 0.7289611690530906, + "grad_norm": 6.6805572509765625, + "learning_rate": 1.5145302297511379e-05, + "loss": 1.6848, + "step": 115980 + }, + { + "epoch": 0.7290240213697877, + "grad_norm": 5.526790618896484, + "learning_rate": 1.5144883196566726e-05, + "loss": 1.5908, + "step": 115990 + }, + { + "epoch": 0.7290868736864848, + "grad_norm": 6.3358588218688965, + "learning_rate": 1.5144464095622073e-05, + "loss": 1.6823, + "step": 116000 + }, + { + "epoch": 0.729149726003182, + "grad_norm": 6.603145122528076, + "learning_rate": 1.5144044994677419e-05, + "loss": 1.569, + "step": 116010 + }, + { + "epoch": 0.729212578319879, + "grad_norm": 7.549184799194336, + "learning_rate": 1.5143625893732766e-05, + "loss": 1.677, + "step": 116020 + }, + { + "epoch": 0.7292754306365761, + "grad_norm": 5.670623779296875, + "learning_rate": 1.5143206792788113e-05, + "loss": 1.6699, + "step": 116030 + }, + { + "epoch": 0.7293382829532732, + "grad_norm": 6.9575371742248535, + 
"learning_rate": 1.514278769184346e-05, + "loss": 1.4866, + "step": 116040 + }, + { + "epoch": 0.7294011352699703, + "grad_norm": 6.037784576416016, + "learning_rate": 1.5142368590898807e-05, + "loss": 1.7322, + "step": 116050 + }, + { + "epoch": 0.7294639875866674, + "grad_norm": 7.633106231689453, + "learning_rate": 1.514194948995415e-05, + "loss": 1.5799, + "step": 116060 + }, + { + "epoch": 0.7295268399033645, + "grad_norm": 7.580491065979004, + "learning_rate": 1.5141530389009498e-05, + "loss": 1.7369, + "step": 116070 + }, + { + "epoch": 0.7295896922200616, + "grad_norm": 5.560009956359863, + "learning_rate": 1.5141111288064845e-05, + "loss": 1.5047, + "step": 116080 + }, + { + "epoch": 0.7296525445367588, + "grad_norm": 6.160469055175781, + "learning_rate": 1.5140692187120192e-05, + "loss": 1.5171, + "step": 116090 + }, + { + "epoch": 0.7297153968534559, + "grad_norm": 6.144357204437256, + "learning_rate": 1.5140273086175537e-05, + "loss": 1.4165, + "step": 116100 + }, + { + "epoch": 0.729778249170153, + "grad_norm": 7.0018815994262695, + "learning_rate": 1.5139853985230884e-05, + "loss": 1.4202, + "step": 116110 + }, + { + "epoch": 0.7298411014868501, + "grad_norm": 6.264974594116211, + "learning_rate": 1.5139434884286231e-05, + "loss": 1.5731, + "step": 116120 + }, + { + "epoch": 0.7299039538035472, + "grad_norm": 6.4756550788879395, + "learning_rate": 1.5139015783341577e-05, + "loss": 1.7918, + "step": 116130 + }, + { + "epoch": 0.7299668061202443, + "grad_norm": 6.437472820281982, + "learning_rate": 1.5138596682396924e-05, + "loss": 1.9329, + "step": 116140 + }, + { + "epoch": 0.7300296584369415, + "grad_norm": 5.813361644744873, + "learning_rate": 1.5138177581452269e-05, + "loss": 1.6144, + "step": 116150 + }, + { + "epoch": 0.7300925107536386, + "grad_norm": 7.2174787521362305, + "learning_rate": 1.5137758480507616e-05, + "loss": 1.7227, + "step": 116160 + }, + { + "epoch": 0.7301553630703357, + "grad_norm": 7.056854248046875, + "learning_rate": 
1.5137339379562963e-05, + "loss": 1.6733, + "step": 116170 + }, + { + "epoch": 0.7302182153870328, + "grad_norm": 6.974052906036377, + "learning_rate": 1.513692027861831e-05, + "loss": 1.6408, + "step": 116180 + }, + { + "epoch": 0.7302810677037299, + "grad_norm": 7.3612518310546875, + "learning_rate": 1.5136501177673654e-05, + "loss": 1.569, + "step": 116190 + }, + { + "epoch": 0.730343920020427, + "grad_norm": 7.1358323097229, + "learning_rate": 1.5136082076729001e-05, + "loss": 1.6857, + "step": 116200 + }, + { + "epoch": 0.7304067723371241, + "grad_norm": 5.359313011169434, + "learning_rate": 1.5135662975784348e-05, + "loss": 1.475, + "step": 116210 + }, + { + "epoch": 0.7304696246538213, + "grad_norm": 6.692825794219971, + "learning_rate": 1.5135243874839695e-05, + "loss": 1.5207, + "step": 116220 + }, + { + "epoch": 0.7305324769705184, + "grad_norm": 6.0015363693237305, + "learning_rate": 1.5134824773895042e-05, + "loss": 1.6661, + "step": 116230 + }, + { + "epoch": 0.7305953292872155, + "grad_norm": 5.568943500518799, + "learning_rate": 1.5134405672950388e-05, + "loss": 1.5029, + "step": 116240 + }, + { + "epoch": 0.7306581816039126, + "grad_norm": 6.180572032928467, + "learning_rate": 1.5133986572005735e-05, + "loss": 1.6326, + "step": 116250 + }, + { + "epoch": 0.7307210339206097, + "grad_norm": 6.116061687469482, + "learning_rate": 1.5133567471061082e-05, + "loss": 1.5924, + "step": 116260 + }, + { + "epoch": 0.7307838862373067, + "grad_norm": 5.452944755554199, + "learning_rate": 1.5133148370116429e-05, + "loss": 1.5732, + "step": 116270 + }, + { + "epoch": 0.7308467385540038, + "grad_norm": 5.4776787757873535, + "learning_rate": 1.5132729269171773e-05, + "loss": 1.5846, + "step": 116280 + }, + { + "epoch": 0.730909590870701, + "grad_norm": 7.125429153442383, + "learning_rate": 1.513231016822712e-05, + "loss": 1.6662, + "step": 116290 + }, + { + "epoch": 0.7309724431873981, + "grad_norm": 6.490479946136475, + "learning_rate": 1.5131891067282467e-05, + 
"loss": 1.5348, + "step": 116300 + }, + { + "epoch": 0.7310352955040952, + "grad_norm": 5.33502197265625, + "learning_rate": 1.5131471966337814e-05, + "loss": 1.6706, + "step": 116310 + }, + { + "epoch": 0.7310981478207923, + "grad_norm": 6.439328670501709, + "learning_rate": 1.5131052865393159e-05, + "loss": 1.9088, + "step": 116320 + }, + { + "epoch": 0.7311610001374894, + "grad_norm": 6.690783500671387, + "learning_rate": 1.5130633764448506e-05, + "loss": 1.8729, + "step": 116330 + }, + { + "epoch": 0.7312238524541865, + "grad_norm": 6.990403652191162, + "learning_rate": 1.5130214663503853e-05, + "loss": 1.5862, + "step": 116340 + }, + { + "epoch": 0.7312867047708836, + "grad_norm": 7.408207416534424, + "learning_rate": 1.51297955625592e-05, + "loss": 1.6638, + "step": 116350 + }, + { + "epoch": 0.7313495570875808, + "grad_norm": 6.472781658172607, + "learning_rate": 1.5129376461614547e-05, + "loss": 1.6145, + "step": 116360 + }, + { + "epoch": 0.7314124094042779, + "grad_norm": 5.800587177276611, + "learning_rate": 1.5128957360669891e-05, + "loss": 1.7008, + "step": 116370 + }, + { + "epoch": 0.731475261720975, + "grad_norm": 5.939591407775879, + "learning_rate": 1.5128538259725238e-05, + "loss": 1.6694, + "step": 116380 + }, + { + "epoch": 0.7315381140376721, + "grad_norm": 6.2899956703186035, + "learning_rate": 1.5128119158780585e-05, + "loss": 1.6128, + "step": 116390 + }, + { + "epoch": 0.7316009663543692, + "grad_norm": 6.833620548248291, + "learning_rate": 1.5127700057835932e-05, + "loss": 1.7073, + "step": 116400 + }, + { + "epoch": 0.7316638186710663, + "grad_norm": 5.662091255187988, + "learning_rate": 1.5127280956891278e-05, + "loss": 1.6163, + "step": 116410 + }, + { + "epoch": 0.7317266709877635, + "grad_norm": 6.265413284301758, + "learning_rate": 1.5126861855946625e-05, + "loss": 1.6779, + "step": 116420 + }, + { + "epoch": 0.7317895233044606, + "grad_norm": 6.575082778930664, + "learning_rate": 1.5126442755001972e-05, + "loss": 1.6361, + "step": 
116430 + }, + { + "epoch": 0.7318523756211577, + "grad_norm": 5.75390100479126, + "learning_rate": 1.5126023654057317e-05, + "loss": 1.6087, + "step": 116440 + }, + { + "epoch": 0.7319152279378548, + "grad_norm": 6.973175525665283, + "learning_rate": 1.5125604553112664e-05, + "loss": 1.6973, + "step": 116450 + }, + { + "epoch": 0.7319780802545519, + "grad_norm": 5.981052875518799, + "learning_rate": 1.512518545216801e-05, + "loss": 1.6895, + "step": 116460 + }, + { + "epoch": 0.732040932571249, + "grad_norm": 5.5195746421813965, + "learning_rate": 1.5124766351223357e-05, + "loss": 1.4089, + "step": 116470 + }, + { + "epoch": 0.7321037848879461, + "grad_norm": 7.317570209503174, + "learning_rate": 1.5124347250278704e-05, + "loss": 1.729, + "step": 116480 + }, + { + "epoch": 0.7321666372046433, + "grad_norm": 6.834884166717529, + "learning_rate": 1.512392814933405e-05, + "loss": 1.5518, + "step": 116490 + }, + { + "epoch": 0.7322294895213404, + "grad_norm": 6.461305141448975, + "learning_rate": 1.5123509048389396e-05, + "loss": 1.5368, + "step": 116500 + }, + { + "epoch": 0.7322923418380375, + "grad_norm": 6.329416751861572, + "learning_rate": 1.5123089947444742e-05, + "loss": 1.6686, + "step": 116510 + }, + { + "epoch": 0.7323551941547346, + "grad_norm": 5.427972316741943, + "learning_rate": 1.5122670846500089e-05, + "loss": 1.7472, + "step": 116520 + }, + { + "epoch": 0.7324180464714316, + "grad_norm": 7.150508880615234, + "learning_rate": 1.5122251745555436e-05, + "loss": 1.3206, + "step": 116530 + }, + { + "epoch": 0.7324808987881287, + "grad_norm": 7.156881809234619, + "learning_rate": 1.5121832644610783e-05, + "loss": 1.5712, + "step": 116540 + }, + { + "epoch": 0.7325437511048258, + "grad_norm": 6.0363335609436035, + "learning_rate": 1.5121413543666128e-05, + "loss": 1.6576, + "step": 116550 + }, + { + "epoch": 0.732606603421523, + "grad_norm": 5.923098087310791, + "learning_rate": 1.5120994442721475e-05, + "loss": 1.568, + "step": 116560 + }, + { + "epoch": 
0.7326694557382201, + "grad_norm": 6.134315490722656, + "learning_rate": 1.5120575341776822e-05, + "loss": 1.3467, + "step": 116570 + }, + { + "epoch": 0.7327323080549172, + "grad_norm": 5.646820068359375, + "learning_rate": 1.512015624083217e-05, + "loss": 1.672, + "step": 116580 + }, + { + "epoch": 0.7327951603716143, + "grad_norm": 6.210283279418945, + "learning_rate": 1.5119737139887513e-05, + "loss": 1.6509, + "step": 116590 + }, + { + "epoch": 0.7328580126883114, + "grad_norm": 6.5223469734191895, + "learning_rate": 1.511931803894286e-05, + "loss": 1.7307, + "step": 116600 + }, + { + "epoch": 0.7329208650050085, + "grad_norm": 5.664274215698242, + "learning_rate": 1.5118898937998207e-05, + "loss": 1.5186, + "step": 116610 + }, + { + "epoch": 0.7329837173217056, + "grad_norm": 6.588070392608643, + "learning_rate": 1.5118479837053554e-05, + "loss": 1.4739, + "step": 116620 + }, + { + "epoch": 0.7330465696384028, + "grad_norm": 7.984457492828369, + "learning_rate": 1.51180607361089e-05, + "loss": 1.5591, + "step": 116630 + }, + { + "epoch": 0.7331094219550999, + "grad_norm": 6.535252094268799, + "learning_rate": 1.5117641635164247e-05, + "loss": 1.5877, + "step": 116640 + }, + { + "epoch": 0.733172274271797, + "grad_norm": 8.361675262451172, + "learning_rate": 1.5117222534219594e-05, + "loss": 1.5656, + "step": 116650 + }, + { + "epoch": 0.7332351265884941, + "grad_norm": 5.582570552825928, + "learning_rate": 1.5116803433274941e-05, + "loss": 1.4592, + "step": 116660 + }, + { + "epoch": 0.7332979789051912, + "grad_norm": 6.3676533699035645, + "learning_rate": 1.5116384332330288e-05, + "loss": 1.6096, + "step": 116670 + }, + { + "epoch": 0.7333608312218883, + "grad_norm": 5.959659576416016, + "learning_rate": 1.5115965231385632e-05, + "loss": 1.6329, + "step": 116680 + }, + { + "epoch": 0.7334236835385854, + "grad_norm": 6.887051105499268, + "learning_rate": 1.5115546130440979e-05, + "loss": 1.8432, + "step": 116690 + }, + { + "epoch": 0.7334865358552826, + 
"grad_norm": 7.40312385559082, + "learning_rate": 1.5115127029496326e-05, + "loss": 1.5736, + "step": 116700 + }, + { + "epoch": 0.7335493881719797, + "grad_norm": 5.859505653381348, + "learning_rate": 1.5114707928551673e-05, + "loss": 1.4872, + "step": 116710 + }, + { + "epoch": 0.7336122404886768, + "grad_norm": 6.143748760223389, + "learning_rate": 1.5114288827607018e-05, + "loss": 1.5369, + "step": 116720 + }, + { + "epoch": 0.7336750928053739, + "grad_norm": 6.233851432800293, + "learning_rate": 1.5113869726662365e-05, + "loss": 1.3973, + "step": 116730 + }, + { + "epoch": 0.733737945122071, + "grad_norm": 6.276238441467285, + "learning_rate": 1.5113450625717712e-05, + "loss": 1.5194, + "step": 116740 + }, + { + "epoch": 0.7338007974387681, + "grad_norm": 7.023739814758301, + "learning_rate": 1.5113031524773058e-05, + "loss": 1.5849, + "step": 116750 + }, + { + "epoch": 0.7338636497554653, + "grad_norm": 6.218929767608643, + "learning_rate": 1.5112612423828405e-05, + "loss": 1.4176, + "step": 116760 + }, + { + "epoch": 0.7339265020721624, + "grad_norm": 6.902184963226318, + "learning_rate": 1.511219332288375e-05, + "loss": 1.5564, + "step": 116770 + }, + { + "epoch": 0.7339893543888594, + "grad_norm": 6.392886638641357, + "learning_rate": 1.5111774221939097e-05, + "loss": 1.674, + "step": 116780 + }, + { + "epoch": 0.7340522067055565, + "grad_norm": 6.936563014984131, + "learning_rate": 1.5111355120994444e-05, + "loss": 1.6001, + "step": 116790 + }, + { + "epoch": 0.7341150590222536, + "grad_norm": 7.19557523727417, + "learning_rate": 1.5110936020049791e-05, + "loss": 1.646, + "step": 116800 + }, + { + "epoch": 0.7341779113389507, + "grad_norm": 7.987978935241699, + "learning_rate": 1.5110516919105137e-05, + "loss": 1.5771, + "step": 116810 + }, + { + "epoch": 0.7342407636556478, + "grad_norm": 5.6326189041137695, + "learning_rate": 1.5110097818160482e-05, + "loss": 1.6144, + "step": 116820 + }, + { + "epoch": 0.734303615972345, + "grad_norm": 
8.156567573547363, + "learning_rate": 1.510967871721583e-05, + "loss": 1.568, + "step": 116830 + }, + { + "epoch": 0.7343664682890421, + "grad_norm": 5.975371360778809, + "learning_rate": 1.5109259616271176e-05, + "loss": 1.5546, + "step": 116840 + }, + { + "epoch": 0.7344293206057392, + "grad_norm": 6.602675437927246, + "learning_rate": 1.5108840515326523e-05, + "loss": 1.661, + "step": 116850 + }, + { + "epoch": 0.7344921729224363, + "grad_norm": 5.703989028930664, + "learning_rate": 1.5108421414381869e-05, + "loss": 1.5246, + "step": 116860 + }, + { + "epoch": 0.7345550252391334, + "grad_norm": 6.698914051055908, + "learning_rate": 1.5108002313437216e-05, + "loss": 1.6645, + "step": 116870 + }, + { + "epoch": 0.7346178775558305, + "grad_norm": 5.767265796661377, + "learning_rate": 1.5107583212492563e-05, + "loss": 1.478, + "step": 116880 + }, + { + "epoch": 0.7346807298725276, + "grad_norm": 6.4239821434021, + "learning_rate": 1.510716411154791e-05, + "loss": 1.6222, + "step": 116890 + }, + { + "epoch": 0.7347435821892248, + "grad_norm": 6.624935150146484, + "learning_rate": 1.5106745010603254e-05, + "loss": 1.8779, + "step": 116900 + }, + { + "epoch": 0.7348064345059219, + "grad_norm": 6.570802211761475, + "learning_rate": 1.51063259096586e-05, + "loss": 1.6508, + "step": 116910 + }, + { + "epoch": 0.734869286822619, + "grad_norm": 6.047118663787842, + "learning_rate": 1.5105906808713948e-05, + "loss": 1.9309, + "step": 116920 + }, + { + "epoch": 0.7349321391393161, + "grad_norm": 6.4321699142456055, + "learning_rate": 1.5105487707769295e-05, + "loss": 1.6348, + "step": 116930 + }, + { + "epoch": 0.7349949914560132, + "grad_norm": 7.588624477386475, + "learning_rate": 1.510506860682464e-05, + "loss": 1.6869, + "step": 116940 + }, + { + "epoch": 0.7350578437727103, + "grad_norm": 8.491658210754395, + "learning_rate": 1.5104649505879987e-05, + "loss": 1.9243, + "step": 116950 + }, + { + "epoch": 0.7351206960894074, + "grad_norm": 5.338583946228027, + 
"learning_rate": 1.5104230404935334e-05, + "loss": 1.6569, + "step": 116960 + }, + { + "epoch": 0.7351835484061046, + "grad_norm": 5.248413562774658, + "learning_rate": 1.5103811303990681e-05, + "loss": 1.6199, + "step": 116970 + }, + { + "epoch": 0.7352464007228017, + "grad_norm": 6.071453094482422, + "learning_rate": 1.5103392203046028e-05, + "loss": 1.6443, + "step": 116980 + }, + { + "epoch": 0.7353092530394988, + "grad_norm": 5.989220142364502, + "learning_rate": 1.5102973102101372e-05, + "loss": 1.4356, + "step": 116990 + }, + { + "epoch": 0.7353721053561959, + "grad_norm": 6.5885210037231445, + "learning_rate": 1.510255400115672e-05, + "loss": 1.4912, + "step": 117000 + }, + { + "epoch": 0.735434957672893, + "grad_norm": 8.531759262084961, + "learning_rate": 1.5102134900212066e-05, + "loss": 1.5257, + "step": 117010 + }, + { + "epoch": 0.7354978099895901, + "grad_norm": 6.679140567779541, + "learning_rate": 1.5101715799267413e-05, + "loss": 1.6421, + "step": 117020 + }, + { + "epoch": 0.7355606623062872, + "grad_norm": 5.821613311767578, + "learning_rate": 1.5101296698322759e-05, + "loss": 1.4254, + "step": 117030 + }, + { + "epoch": 0.7356235146229843, + "grad_norm": 6.059935569763184, + "learning_rate": 1.5100877597378106e-05, + "loss": 1.5685, + "step": 117040 + }, + { + "epoch": 0.7356863669396814, + "grad_norm": 7.621307373046875, + "learning_rate": 1.5100458496433453e-05, + "loss": 1.6196, + "step": 117050 + }, + { + "epoch": 0.7357492192563785, + "grad_norm": 6.805684566497803, + "learning_rate": 1.51000393954888e-05, + "loss": 1.9584, + "step": 117060 + }, + { + "epoch": 0.7358120715730756, + "grad_norm": 5.543520450592041, + "learning_rate": 1.5099620294544145e-05, + "loss": 1.5554, + "step": 117070 + }, + { + "epoch": 0.7358749238897727, + "grad_norm": 6.204122066497803, + "learning_rate": 1.509920119359949e-05, + "loss": 1.7006, + "step": 117080 + }, + { + "epoch": 0.7359377762064698, + "grad_norm": 5.589944362640381, + "learning_rate": 
1.5098782092654838e-05, + "loss": 1.7825, + "step": 117090 + }, + { + "epoch": 0.7360006285231669, + "grad_norm": 7.761539459228516, + "learning_rate": 1.5098362991710185e-05, + "loss": 1.4863, + "step": 117100 + }, + { + "epoch": 0.7360634808398641, + "grad_norm": 8.452686309814453, + "learning_rate": 1.5097943890765532e-05, + "loss": 1.5597, + "step": 117110 + }, + { + "epoch": 0.7361263331565612, + "grad_norm": 6.617719650268555, + "learning_rate": 1.5097524789820877e-05, + "loss": 1.7412, + "step": 117120 + }, + { + "epoch": 0.7361891854732583, + "grad_norm": 5.494043350219727, + "learning_rate": 1.5097105688876223e-05, + "loss": 1.6649, + "step": 117130 + }, + { + "epoch": 0.7362520377899554, + "grad_norm": 7.095428943634033, + "learning_rate": 1.509668658793157e-05, + "loss": 1.4653, + "step": 117140 + }, + { + "epoch": 0.7363148901066525, + "grad_norm": 6.8330254554748535, + "learning_rate": 1.5096267486986917e-05, + "loss": 1.6437, + "step": 117150 + }, + { + "epoch": 0.7363777424233496, + "grad_norm": 7.541816711425781, + "learning_rate": 1.5095848386042264e-05, + "loss": 1.9871, + "step": 117160 + }, + { + "epoch": 0.7364405947400468, + "grad_norm": 6.407535552978516, + "learning_rate": 1.509542928509761e-05, + "loss": 1.5611, + "step": 117170 + }, + { + "epoch": 0.7365034470567439, + "grad_norm": 6.419076442718506, + "learning_rate": 1.5095010184152956e-05, + "loss": 1.6267, + "step": 117180 + }, + { + "epoch": 0.736566299373441, + "grad_norm": 7.079204082489014, + "learning_rate": 1.5094591083208303e-05, + "loss": 1.5027, + "step": 117190 + }, + { + "epoch": 0.7366291516901381, + "grad_norm": 5.94259786605835, + "learning_rate": 1.509417198226365e-05, + "loss": 1.5696, + "step": 117200 + }, + { + "epoch": 0.7366920040068352, + "grad_norm": 6.831979274749756, + "learning_rate": 1.5093752881318994e-05, + "loss": 1.8869, + "step": 117210 + }, + { + "epoch": 0.7367548563235323, + "grad_norm": 6.6600751876831055, + "learning_rate": 1.5093333780374341e-05, + 
"loss": 1.5593, + "step": 117220 + }, + { + "epoch": 0.7368177086402294, + "grad_norm": 6.9768524169921875, + "learning_rate": 1.5092914679429688e-05, + "loss": 1.5743, + "step": 117230 + }, + { + "epoch": 0.7368805609569266, + "grad_norm": 7.4975104331970215, + "learning_rate": 1.5092495578485035e-05, + "loss": 1.7093, + "step": 117240 + }, + { + "epoch": 0.7369434132736237, + "grad_norm": 7.1950507164001465, + "learning_rate": 1.509207647754038e-05, + "loss": 1.486, + "step": 117250 + }, + { + "epoch": 0.7370062655903208, + "grad_norm": 6.349445343017578, + "learning_rate": 1.5091657376595728e-05, + "loss": 1.6197, + "step": 117260 + }, + { + "epoch": 0.7370691179070179, + "grad_norm": 6.088489532470703, + "learning_rate": 1.5091238275651075e-05, + "loss": 1.5926, + "step": 117270 + }, + { + "epoch": 0.737131970223715, + "grad_norm": 6.358327388763428, + "learning_rate": 1.5090819174706422e-05, + "loss": 1.752, + "step": 117280 + }, + { + "epoch": 0.737194822540412, + "grad_norm": 7.0250372886657715, + "learning_rate": 1.5090400073761769e-05, + "loss": 1.7792, + "step": 117290 + }, + { + "epoch": 0.7372576748571091, + "grad_norm": 6.476198196411133, + "learning_rate": 1.5089980972817113e-05, + "loss": 1.5934, + "step": 117300 + }, + { + "epoch": 0.7373205271738063, + "grad_norm": 5.442954063415527, + "learning_rate": 1.508956187187246e-05, + "loss": 1.3738, + "step": 117310 + }, + { + "epoch": 0.7373833794905034, + "grad_norm": 6.9679412841796875, + "learning_rate": 1.5089142770927807e-05, + "loss": 1.7288, + "step": 117320 + }, + { + "epoch": 0.7374462318072005, + "grad_norm": 8.034341812133789, + "learning_rate": 1.5088723669983154e-05, + "loss": 1.6302, + "step": 117330 + }, + { + "epoch": 0.7375090841238976, + "grad_norm": 7.061643600463867, + "learning_rate": 1.50883045690385e-05, + "loss": 1.6806, + "step": 117340 + }, + { + "epoch": 0.7375719364405947, + "grad_norm": 7.586759090423584, + "learning_rate": 1.5087885468093846e-05, + "loss": 1.6545, + "step": 
117350 + }, + { + "epoch": 0.7376347887572918, + "grad_norm": 7.27079439163208, + "learning_rate": 1.5087466367149193e-05, + "loss": 1.6971, + "step": 117360 + }, + { + "epoch": 0.7376976410739889, + "grad_norm": 6.415982246398926, + "learning_rate": 1.508704726620454e-05, + "loss": 1.5838, + "step": 117370 + }, + { + "epoch": 0.7377604933906861, + "grad_norm": 5.629652976989746, + "learning_rate": 1.5086628165259886e-05, + "loss": 1.5687, + "step": 117380 + }, + { + "epoch": 0.7378233457073832, + "grad_norm": 6.433467388153076, + "learning_rate": 1.5086209064315231e-05, + "loss": 1.6912, + "step": 117390 + }, + { + "epoch": 0.7378861980240803, + "grad_norm": 6.706539154052734, + "learning_rate": 1.5085789963370578e-05, + "loss": 1.7466, + "step": 117400 + }, + { + "epoch": 0.7379490503407774, + "grad_norm": 5.629751682281494, + "learning_rate": 1.5085370862425925e-05, + "loss": 1.5597, + "step": 117410 + }, + { + "epoch": 0.7380119026574745, + "grad_norm": 6.2336249351501465, + "learning_rate": 1.5084951761481272e-05, + "loss": 1.5987, + "step": 117420 + }, + { + "epoch": 0.7380747549741716, + "grad_norm": 6.122241497039795, + "learning_rate": 1.5084532660536618e-05, + "loss": 1.5728, + "step": 117430 + }, + { + "epoch": 0.7381376072908687, + "grad_norm": 7.282014846801758, + "learning_rate": 1.5084113559591965e-05, + "loss": 1.7043, + "step": 117440 + }, + { + "epoch": 0.7382004596075659, + "grad_norm": 6.606590270996094, + "learning_rate": 1.508369445864731e-05, + "loss": 1.6365, + "step": 117450 + }, + { + "epoch": 0.738263311924263, + "grad_norm": 6.349709987640381, + "learning_rate": 1.5083275357702657e-05, + "loss": 1.6036, + "step": 117460 + }, + { + "epoch": 0.7383261642409601, + "grad_norm": 5.677004814147949, + "learning_rate": 1.5082856256758004e-05, + "loss": 1.5725, + "step": 117470 + }, + { + "epoch": 0.7383890165576572, + "grad_norm": 6.100164890289307, + "learning_rate": 1.508243715581335e-05, + "loss": 1.5149, + "step": 117480 + }, + { + "epoch": 
0.7384518688743543, + "grad_norm": 5.808437824249268, + "learning_rate": 1.5082018054868697e-05, + "loss": 1.6419, + "step": 117490 + }, + { + "epoch": 0.7385147211910514, + "grad_norm": 6.135100364685059, + "learning_rate": 1.5081598953924044e-05, + "loss": 1.6567, + "step": 117500 + }, + { + "epoch": 0.7385775735077486, + "grad_norm": 6.83665132522583, + "learning_rate": 1.5081179852979391e-05, + "loss": 1.7174, + "step": 117510 + }, + { + "epoch": 0.7386404258244457, + "grad_norm": 6.4246087074279785, + "learning_rate": 1.5080760752034735e-05, + "loss": 1.6823, + "step": 117520 + }, + { + "epoch": 0.7387032781411428, + "grad_norm": 6.082372665405273, + "learning_rate": 1.5080341651090082e-05, + "loss": 1.862, + "step": 117530 + }, + { + "epoch": 0.7387661304578399, + "grad_norm": 6.764336109161377, + "learning_rate": 1.5079922550145429e-05, + "loss": 1.6196, + "step": 117540 + }, + { + "epoch": 0.7388289827745369, + "grad_norm": 7.09053373336792, + "learning_rate": 1.5079503449200776e-05, + "loss": 1.6565, + "step": 117550 + }, + { + "epoch": 0.738891835091234, + "grad_norm": 6.333735466003418, + "learning_rate": 1.5079084348256121e-05, + "loss": 1.6949, + "step": 117560 + }, + { + "epoch": 0.7389546874079311, + "grad_norm": 5.893936634063721, + "learning_rate": 1.5078665247311468e-05, + "loss": 1.6049, + "step": 117570 + }, + { + "epoch": 0.7390175397246282, + "grad_norm": 6.619492530822754, + "learning_rate": 1.5078246146366815e-05, + "loss": 1.5969, + "step": 117580 + }, + { + "epoch": 0.7390803920413254, + "grad_norm": 5.940426349639893, + "learning_rate": 1.5077827045422163e-05, + "loss": 1.8313, + "step": 117590 + }, + { + "epoch": 0.7391432443580225, + "grad_norm": 6.792811870574951, + "learning_rate": 1.507740794447751e-05, + "loss": 1.7358, + "step": 117600 + }, + { + "epoch": 0.7392060966747196, + "grad_norm": 7.276131629943848, + "learning_rate": 1.5076988843532853e-05, + "loss": 1.7425, + "step": 117610 + }, + { + "epoch": 0.7392689489914167, + 
"grad_norm": 6.933396816253662, + "learning_rate": 1.50765697425882e-05, + "loss": 1.5703, + "step": 117620 + }, + { + "epoch": 0.7393318013081138, + "grad_norm": 7.595689296722412, + "learning_rate": 1.5076150641643547e-05, + "loss": 1.4989, + "step": 117630 + }, + { + "epoch": 0.7393946536248109, + "grad_norm": 5.8264288902282715, + "learning_rate": 1.5075731540698894e-05, + "loss": 1.7436, + "step": 117640 + }, + { + "epoch": 0.739457505941508, + "grad_norm": 7.674973487854004, + "learning_rate": 1.507531243975424e-05, + "loss": 1.665, + "step": 117650 + }, + { + "epoch": 0.7395203582582052, + "grad_norm": 7.688501834869385, + "learning_rate": 1.5074893338809587e-05, + "loss": 1.5542, + "step": 117660 + }, + { + "epoch": 0.7395832105749023, + "grad_norm": 6.045027732849121, + "learning_rate": 1.5074474237864934e-05, + "loss": 1.4263, + "step": 117670 + }, + { + "epoch": 0.7396460628915994, + "grad_norm": 7.995920181274414, + "learning_rate": 1.5074055136920281e-05, + "loss": 1.8176, + "step": 117680 + }, + { + "epoch": 0.7397089152082965, + "grad_norm": 6.833772659301758, + "learning_rate": 1.5073636035975626e-05, + "loss": 1.6143, + "step": 117690 + }, + { + "epoch": 0.7397717675249936, + "grad_norm": 6.990723609924316, + "learning_rate": 1.5073216935030972e-05, + "loss": 1.845, + "step": 117700 + }, + { + "epoch": 0.7398346198416907, + "grad_norm": 5.987464904785156, + "learning_rate": 1.5072797834086319e-05, + "loss": 1.535, + "step": 117710 + }, + { + "epoch": 0.7398974721583879, + "grad_norm": 6.514626979827881, + "learning_rate": 1.5072378733141666e-05, + "loss": 1.7832, + "step": 117720 + }, + { + "epoch": 0.739960324475085, + "grad_norm": 8.164240837097168, + "learning_rate": 1.5071959632197013e-05, + "loss": 1.6897, + "step": 117730 + }, + { + "epoch": 0.7400231767917821, + "grad_norm": 6.876088619232178, + "learning_rate": 1.5071540531252358e-05, + "loss": 1.5586, + "step": 117740 + }, + { + "epoch": 0.7400860291084792, + "grad_norm": 
6.978507041931152, + "learning_rate": 1.5071121430307705e-05, + "loss": 1.6416, + "step": 117750 + }, + { + "epoch": 0.7401488814251763, + "grad_norm": 6.801281452178955, + "learning_rate": 1.5070702329363051e-05, + "loss": 1.5543, + "step": 117760 + }, + { + "epoch": 0.7402117337418734, + "grad_norm": 7.367517948150635, + "learning_rate": 1.5070283228418398e-05, + "loss": 1.66, + "step": 117770 + }, + { + "epoch": 0.7402745860585705, + "grad_norm": 6.92649507522583, + "learning_rate": 1.5069864127473745e-05, + "loss": 1.564, + "step": 117780 + }, + { + "epoch": 0.7403374383752677, + "grad_norm": 6.037821292877197, + "learning_rate": 1.506944502652909e-05, + "loss": 1.5933, + "step": 117790 + }, + { + "epoch": 0.7404002906919647, + "grad_norm": 6.205532550811768, + "learning_rate": 1.5069025925584437e-05, + "loss": 1.4872, + "step": 117800 + }, + { + "epoch": 0.7404631430086618, + "grad_norm": 5.795130252838135, + "learning_rate": 1.5068606824639785e-05, + "loss": 1.5236, + "step": 117810 + }, + { + "epoch": 0.7405259953253589, + "grad_norm": 7.018015384674072, + "learning_rate": 1.5068187723695132e-05, + "loss": 1.585, + "step": 117820 + }, + { + "epoch": 0.740588847642056, + "grad_norm": 6.7126946449279785, + "learning_rate": 1.5067768622750475e-05, + "loss": 1.6935, + "step": 117830 + }, + { + "epoch": 0.7406516999587531, + "grad_norm": 5.657297134399414, + "learning_rate": 1.5067349521805822e-05, + "loss": 1.5176, + "step": 117840 + }, + { + "epoch": 0.7407145522754502, + "grad_norm": 5.1493144035339355, + "learning_rate": 1.506693042086117e-05, + "loss": 1.8007, + "step": 117850 + }, + { + "epoch": 0.7407774045921474, + "grad_norm": 7.843067646026611, + "learning_rate": 1.5066511319916516e-05, + "loss": 1.6827, + "step": 117860 + }, + { + "epoch": 0.7408402569088445, + "grad_norm": 5.899585247039795, + "learning_rate": 1.5066092218971862e-05, + "loss": 1.5232, + "step": 117870 + }, + { + "epoch": 0.7409031092255416, + "grad_norm": 6.4450483322143555, + 
"learning_rate": 1.5065673118027209e-05, + "loss": 1.8019, + "step": 117880 + }, + { + "epoch": 0.7409659615422387, + "grad_norm": 5.604441165924072, + "learning_rate": 1.5065254017082556e-05, + "loss": 1.3844, + "step": 117890 + }, + { + "epoch": 0.7410288138589358, + "grad_norm": 6.067689895629883, + "learning_rate": 1.5064834916137903e-05, + "loss": 1.4794, + "step": 117900 + }, + { + "epoch": 0.7410916661756329, + "grad_norm": 7.927680015563965, + "learning_rate": 1.506441581519325e-05, + "loss": 1.7118, + "step": 117910 + }, + { + "epoch": 0.74115451849233, + "grad_norm": 5.2320404052734375, + "learning_rate": 1.5063996714248594e-05, + "loss": 1.5452, + "step": 117920 + }, + { + "epoch": 0.7412173708090272, + "grad_norm": 6.294632911682129, + "learning_rate": 1.5063577613303941e-05, + "loss": 1.5066, + "step": 117930 + }, + { + "epoch": 0.7412802231257243, + "grad_norm": 7.3671417236328125, + "learning_rate": 1.5063158512359288e-05, + "loss": 1.6631, + "step": 117940 + }, + { + "epoch": 0.7413430754424214, + "grad_norm": 5.318902015686035, + "learning_rate": 1.5062739411414635e-05, + "loss": 1.5723, + "step": 117950 + }, + { + "epoch": 0.7414059277591185, + "grad_norm": 5.740302085876465, + "learning_rate": 1.506232031046998e-05, + "loss": 1.6862, + "step": 117960 + }, + { + "epoch": 0.7414687800758156, + "grad_norm": 6.682391166687012, + "learning_rate": 1.5061901209525327e-05, + "loss": 1.5793, + "step": 117970 + }, + { + "epoch": 0.7415316323925127, + "grad_norm": 6.816469669342041, + "learning_rate": 1.5061482108580675e-05, + "loss": 1.5008, + "step": 117980 + }, + { + "epoch": 0.7415944847092099, + "grad_norm": 5.641001224517822, + "learning_rate": 1.5061063007636022e-05, + "loss": 1.6833, + "step": 117990 + }, + { + "epoch": 0.741657337025907, + "grad_norm": 5.387622833251953, + "learning_rate": 1.5060643906691369e-05, + "loss": 1.5852, + "step": 118000 + }, + { + "epoch": 0.7417201893426041, + "grad_norm": 6.281401634216309, + "learning_rate": 
1.5060224805746712e-05, + "loss": 1.6492, + "step": 118010 + }, + { + "epoch": 0.7417830416593012, + "grad_norm": 5.9567413330078125, + "learning_rate": 1.505980570480206e-05, + "loss": 1.6401, + "step": 118020 + }, + { + "epoch": 0.7418458939759983, + "grad_norm": 7.386930465698242, + "learning_rate": 1.5059386603857407e-05, + "loss": 1.748, + "step": 118030 + }, + { + "epoch": 0.7419087462926954, + "grad_norm": 7.229539394378662, + "learning_rate": 1.5058967502912754e-05, + "loss": 1.646, + "step": 118040 + }, + { + "epoch": 0.7419715986093925, + "grad_norm": 6.037184715270996, + "learning_rate": 1.5058548401968099e-05, + "loss": 1.4495, + "step": 118050 + }, + { + "epoch": 0.7420344509260896, + "grad_norm": 6.56737756729126, + "learning_rate": 1.5058129301023446e-05, + "loss": 1.8005, + "step": 118060 + }, + { + "epoch": 0.7420973032427867, + "grad_norm": 5.887204647064209, + "learning_rate": 1.5057710200078791e-05, + "loss": 1.5325, + "step": 118070 + }, + { + "epoch": 0.7421601555594838, + "grad_norm": 5.897393703460693, + "learning_rate": 1.5057291099134138e-05, + "loss": 1.6846, + "step": 118080 + }, + { + "epoch": 0.7422230078761809, + "grad_norm": 6.135063648223877, + "learning_rate": 1.5056871998189486e-05, + "loss": 1.4418, + "step": 118090 + }, + { + "epoch": 0.742285860192878, + "grad_norm": 6.761139392852783, + "learning_rate": 1.5056452897244831e-05, + "loss": 1.7583, + "step": 118100 + }, + { + "epoch": 0.7423487125095751, + "grad_norm": 6.599857807159424, + "learning_rate": 1.5056033796300178e-05, + "loss": 1.5573, + "step": 118110 + }, + { + "epoch": 0.7424115648262722, + "grad_norm": 6.822390556335449, + "learning_rate": 1.5055614695355525e-05, + "loss": 1.5575, + "step": 118120 + }, + { + "epoch": 0.7424744171429694, + "grad_norm": 6.472692966461182, + "learning_rate": 1.5055195594410872e-05, + "loss": 1.6348, + "step": 118130 + }, + { + "epoch": 0.7425372694596665, + "grad_norm": 6.222887992858887, + "learning_rate": 1.5054776493466216e-05, + 
"loss": 1.6309, + "step": 118140 + }, + { + "epoch": 0.7426001217763636, + "grad_norm": 8.920105934143066, + "learning_rate": 1.5054357392521563e-05, + "loss": 1.9112, + "step": 118150 + }, + { + "epoch": 0.7426629740930607, + "grad_norm": 7.522040843963623, + "learning_rate": 1.505393829157691e-05, + "loss": 1.8268, + "step": 118160 + }, + { + "epoch": 0.7427258264097578, + "grad_norm": 6.264855861663818, + "learning_rate": 1.5053519190632257e-05, + "loss": 1.6398, + "step": 118170 + }, + { + "epoch": 0.7427886787264549, + "grad_norm": 6.111952781677246, + "learning_rate": 1.5053100089687602e-05, + "loss": 1.6049, + "step": 118180 + }, + { + "epoch": 0.742851531043152, + "grad_norm": 6.81397008895874, + "learning_rate": 1.505268098874295e-05, + "loss": 1.532, + "step": 118190 + }, + { + "epoch": 0.7429143833598492, + "grad_norm": 6.4752302169799805, + "learning_rate": 1.5052261887798297e-05, + "loss": 1.7698, + "step": 118200 + }, + { + "epoch": 0.7429772356765463, + "grad_norm": 6.936573028564453, + "learning_rate": 1.5051842786853644e-05, + "loss": 1.6254, + "step": 118210 + }, + { + "epoch": 0.7430400879932434, + "grad_norm": 7.293984889984131, + "learning_rate": 1.505142368590899e-05, + "loss": 1.511, + "step": 118220 + }, + { + "epoch": 0.7431029403099405, + "grad_norm": 6.396081447601318, + "learning_rate": 1.5051004584964334e-05, + "loss": 1.5334, + "step": 118230 + }, + { + "epoch": 0.7431657926266376, + "grad_norm": 6.191473484039307, + "learning_rate": 1.5050585484019681e-05, + "loss": 1.6911, + "step": 118240 + }, + { + "epoch": 0.7432286449433347, + "grad_norm": 7.289321422576904, + "learning_rate": 1.5050166383075029e-05, + "loss": 1.8485, + "step": 118250 + }, + { + "epoch": 0.7432914972600319, + "grad_norm": 6.679356098175049, + "learning_rate": 1.5049747282130376e-05, + "loss": 1.7827, + "step": 118260 + }, + { + "epoch": 0.743354349576729, + "grad_norm": 6.273317337036133, + "learning_rate": 1.5049328181185721e-05, + "loss": 1.5529, + "step": 
118270 + }, + { + "epoch": 0.7434172018934261, + "grad_norm": 5.995965957641602, + "learning_rate": 1.5048909080241068e-05, + "loss": 1.4336, + "step": 118280 + }, + { + "epoch": 0.7434800542101232, + "grad_norm": 6.3854522705078125, + "learning_rate": 1.5048489979296415e-05, + "loss": 1.5366, + "step": 118290 + }, + { + "epoch": 0.7435429065268203, + "grad_norm": 5.601490020751953, + "learning_rate": 1.5048070878351762e-05, + "loss": 1.6345, + "step": 118300 + }, + { + "epoch": 0.7436057588435174, + "grad_norm": 6.8942461013793945, + "learning_rate": 1.504765177740711e-05, + "loss": 1.8003, + "step": 118310 + }, + { + "epoch": 0.7436686111602144, + "grad_norm": 5.988004684448242, + "learning_rate": 1.5047232676462453e-05, + "loss": 1.8467, + "step": 118320 + }, + { + "epoch": 0.7437314634769115, + "grad_norm": 5.545806884765625, + "learning_rate": 1.50468135755178e-05, + "loss": 1.3995, + "step": 118330 + }, + { + "epoch": 0.7437943157936087, + "grad_norm": 6.340879917144775, + "learning_rate": 1.5046394474573147e-05, + "loss": 1.5756, + "step": 118340 + }, + { + "epoch": 0.7438571681103058, + "grad_norm": 5.379518508911133, + "learning_rate": 1.5045975373628494e-05, + "loss": 1.6435, + "step": 118350 + }, + { + "epoch": 0.7439200204270029, + "grad_norm": 6.786842346191406, + "learning_rate": 1.504555627268384e-05, + "loss": 1.6649, + "step": 118360 + }, + { + "epoch": 0.7439828727437, + "grad_norm": 5.5345988273620605, + "learning_rate": 1.5045137171739187e-05, + "loss": 1.5497, + "step": 118370 + }, + { + "epoch": 0.7440457250603971, + "grad_norm": 6.6930437088012695, + "learning_rate": 1.5044718070794534e-05, + "loss": 1.4766, + "step": 118380 + }, + { + "epoch": 0.7441085773770942, + "grad_norm": 6.040542125701904, + "learning_rate": 1.5044298969849879e-05, + "loss": 1.647, + "step": 118390 + }, + { + "epoch": 0.7441714296937914, + "grad_norm": 6.675861358642578, + "learning_rate": 1.5043879868905226e-05, + "loss": 1.5709, + "step": 118400 + }, + { + "epoch": 
0.7442342820104885, + "grad_norm": 6.339999675750732, + "learning_rate": 1.5043460767960571e-05, + "loss": 1.6079, + "step": 118410 + }, + { + "epoch": 0.7442971343271856, + "grad_norm": 5.432798862457275, + "learning_rate": 1.5043041667015919e-05, + "loss": 1.6047, + "step": 118420 + }, + { + "epoch": 0.7443599866438827, + "grad_norm": 6.235202312469482, + "learning_rate": 1.5042622566071266e-05, + "loss": 1.6056, + "step": 118430 + }, + { + "epoch": 0.7444228389605798, + "grad_norm": 5.5712995529174805, + "learning_rate": 1.5042203465126613e-05, + "loss": 1.4556, + "step": 118440 + }, + { + "epoch": 0.7444856912772769, + "grad_norm": 6.592348098754883, + "learning_rate": 1.5041784364181956e-05, + "loss": 1.7613, + "step": 118450 + }, + { + "epoch": 0.744548543593974, + "grad_norm": 6.759244441986084, + "learning_rate": 1.5041365263237303e-05, + "loss": 1.739, + "step": 118460 + }, + { + "epoch": 0.7446113959106712, + "grad_norm": 8.218925476074219, + "learning_rate": 1.504094616229265e-05, + "loss": 1.6422, + "step": 118470 + }, + { + "epoch": 0.7446742482273683, + "grad_norm": 6.363652229309082, + "learning_rate": 1.5040527061347998e-05, + "loss": 1.5126, + "step": 118480 + }, + { + "epoch": 0.7447371005440654, + "grad_norm": 7.040666103363037, + "learning_rate": 1.5040107960403345e-05, + "loss": 1.6639, + "step": 118490 + }, + { + "epoch": 0.7447999528607625, + "grad_norm": 6.67605447769165, + "learning_rate": 1.503968885945869e-05, + "loss": 1.3955, + "step": 118500 + }, + { + "epoch": 0.7448628051774596, + "grad_norm": 7.312140464782715, + "learning_rate": 1.5039269758514037e-05, + "loss": 1.4657, + "step": 118510 + }, + { + "epoch": 0.7449256574941567, + "grad_norm": 7.492226600646973, + "learning_rate": 1.5038850657569384e-05, + "loss": 1.8691, + "step": 118520 + }, + { + "epoch": 0.7449885098108538, + "grad_norm": 5.2623186111450195, + "learning_rate": 1.5038431556624731e-05, + "loss": 1.4467, + "step": 118530 + }, + { + "epoch": 0.745051362127551, + 
"grad_norm": 6.508378505706787, + "learning_rate": 1.5038012455680075e-05, + "loss": 1.5585, + "step": 118540 + }, + { + "epoch": 0.7451142144442481, + "grad_norm": 6.904928684234619, + "learning_rate": 1.5037593354735422e-05, + "loss": 1.7736, + "step": 118550 + }, + { + "epoch": 0.7451770667609452, + "grad_norm": 5.544427394866943, + "learning_rate": 1.5037174253790769e-05, + "loss": 1.3645, + "step": 118560 + }, + { + "epoch": 0.7452399190776422, + "grad_norm": 5.96975564956665, + "learning_rate": 1.5036755152846116e-05, + "loss": 1.5651, + "step": 118570 + }, + { + "epoch": 0.7453027713943393, + "grad_norm": 6.607202053070068, + "learning_rate": 1.5036336051901462e-05, + "loss": 1.5868, + "step": 118580 + }, + { + "epoch": 0.7453656237110364, + "grad_norm": 6.926027774810791, + "learning_rate": 1.5035916950956809e-05, + "loss": 1.6423, + "step": 118590 + }, + { + "epoch": 0.7454284760277335, + "grad_norm": 7.431971549987793, + "learning_rate": 1.5035497850012156e-05, + "loss": 1.7106, + "step": 118600 + }, + { + "epoch": 0.7454913283444307, + "grad_norm": 6.554296493530273, + "learning_rate": 1.5035078749067503e-05, + "loss": 1.5635, + "step": 118610 + }, + { + "epoch": 0.7455541806611278, + "grad_norm": 8.098177909851074, + "learning_rate": 1.5034701558217312e-05, + "loss": 2.0012, + "step": 118620 + }, + { + "epoch": 0.7456170329778249, + "grad_norm": 6.262833595275879, + "learning_rate": 1.503428245727266e-05, + "loss": 1.6464, + "step": 118630 + }, + { + "epoch": 0.745679885294522, + "grad_norm": 6.295077800750732, + "learning_rate": 1.5033863356328007e-05, + "loss": 1.6451, + "step": 118640 + }, + { + "epoch": 0.7457427376112191, + "grad_norm": 7.018307685852051, + "learning_rate": 1.5033444255383354e-05, + "loss": 1.4181, + "step": 118650 + }, + { + "epoch": 0.7458055899279162, + "grad_norm": 6.842716693878174, + "learning_rate": 1.5033025154438699e-05, + "loss": 1.5012, + "step": 118660 + }, + { + "epoch": 0.7458684422446133, + "grad_norm": 
5.80836820602417, + "learning_rate": 1.5032606053494046e-05, + "loss": 1.5672, + "step": 118670 + }, + { + "epoch": 0.7459312945613105, + "grad_norm": 6.2121100425720215, + "learning_rate": 1.5032186952549393e-05, + "loss": 1.8919, + "step": 118680 + }, + { + "epoch": 0.7459941468780076, + "grad_norm": 7.220552444458008, + "learning_rate": 1.503176785160474e-05, + "loss": 1.4593, + "step": 118690 + }, + { + "epoch": 0.7460569991947047, + "grad_norm": 6.2886552810668945, + "learning_rate": 1.5031348750660084e-05, + "loss": 1.6792, + "step": 118700 + }, + { + "epoch": 0.7461198515114018, + "grad_norm": 5.707417964935303, + "learning_rate": 1.5030929649715431e-05, + "loss": 1.4881, + "step": 118710 + }, + { + "epoch": 0.7461827038280989, + "grad_norm": 5.562893390655518, + "learning_rate": 1.5030510548770778e-05, + "loss": 1.6741, + "step": 118720 + }, + { + "epoch": 0.746245556144796, + "grad_norm": 4.820675849914551, + "learning_rate": 1.5030091447826125e-05, + "loss": 1.5545, + "step": 118730 + }, + { + "epoch": 0.7463084084614932, + "grad_norm": 6.513208389282227, + "learning_rate": 1.5029672346881472e-05, + "loss": 1.7086, + "step": 118740 + }, + { + "epoch": 0.7463712607781903, + "grad_norm": 5.914416790008545, + "learning_rate": 1.5029253245936818e-05, + "loss": 1.6179, + "step": 118750 + }, + { + "epoch": 0.7464341130948874, + "grad_norm": 6.555986404418945, + "learning_rate": 1.5028834144992163e-05, + "loss": 1.4677, + "step": 118760 + }, + { + "epoch": 0.7464969654115845, + "grad_norm": 7.121492385864258, + "learning_rate": 1.502841504404751e-05, + "loss": 1.6739, + "step": 118770 + }, + { + "epoch": 0.7465598177282816, + "grad_norm": 6.156549453735352, + "learning_rate": 1.5027995943102857e-05, + "loss": 1.5818, + "step": 118780 + }, + { + "epoch": 0.7466226700449787, + "grad_norm": 5.9737067222595215, + "learning_rate": 1.5027576842158202e-05, + "loss": 1.3584, + "step": 118790 + }, + { + "epoch": 0.7466855223616758, + "grad_norm": 6.020236492156982, + 
"learning_rate": 1.502715774121355e-05, + "loss": 1.654, + "step": 118800 + }, + { + "epoch": 0.746748374678373, + "grad_norm": 7.43925142288208, + "learning_rate": 1.5026738640268897e-05, + "loss": 1.6574, + "step": 118810 + }, + { + "epoch": 0.7468112269950701, + "grad_norm": 7.011662006378174, + "learning_rate": 1.5026319539324244e-05, + "loss": 1.5776, + "step": 118820 + }, + { + "epoch": 0.7468740793117671, + "grad_norm": 6.51852560043335, + "learning_rate": 1.502590043837959e-05, + "loss": 1.7336, + "step": 118830 + }, + { + "epoch": 0.7469369316284642, + "grad_norm": 5.864762306213379, + "learning_rate": 1.5025481337434934e-05, + "loss": 1.5665, + "step": 118840 + }, + { + "epoch": 0.7469997839451613, + "grad_norm": 6.758666038513184, + "learning_rate": 1.5025062236490281e-05, + "loss": 1.6257, + "step": 118850 + }, + { + "epoch": 0.7470626362618584, + "grad_norm": 6.533952713012695, + "learning_rate": 1.5024643135545629e-05, + "loss": 1.512, + "step": 118860 + }, + { + "epoch": 0.7471254885785555, + "grad_norm": 5.908234596252441, + "learning_rate": 1.5024224034600976e-05, + "loss": 1.5964, + "step": 118870 + }, + { + "epoch": 0.7471883408952527, + "grad_norm": 7.8281049728393555, + "learning_rate": 1.5023804933656321e-05, + "loss": 1.6632, + "step": 118880 + }, + { + "epoch": 0.7472511932119498, + "grad_norm": 6.972465991973877, + "learning_rate": 1.5023385832711668e-05, + "loss": 1.7618, + "step": 118890 + }, + { + "epoch": 0.7473140455286469, + "grad_norm": 6.552032947540283, + "learning_rate": 1.5022966731767015e-05, + "loss": 1.6267, + "step": 118900 + }, + { + "epoch": 0.747376897845344, + "grad_norm": 5.361602783203125, + "learning_rate": 1.5022547630822362e-05, + "loss": 1.8534, + "step": 118910 + }, + { + "epoch": 0.7474397501620411, + "grad_norm": 7.634783744812012, + "learning_rate": 1.502212852987771e-05, + "loss": 1.691, + "step": 118920 + }, + { + "epoch": 0.7475026024787382, + "grad_norm": 6.120603084564209, + "learning_rate": 
1.5021709428933053e-05, + "loss": 1.5766, + "step": 118930 + }, + { + "epoch": 0.7475654547954353, + "grad_norm": 5.479430675506592, + "learning_rate": 1.50212903279884e-05, + "loss": 1.3384, + "step": 118940 + }, + { + "epoch": 0.7476283071121325, + "grad_norm": 6.822821617126465, + "learning_rate": 1.5020871227043747e-05, + "loss": 1.3687, + "step": 118950 + }, + { + "epoch": 0.7476911594288296, + "grad_norm": 7.230306148529053, + "learning_rate": 1.5020452126099094e-05, + "loss": 1.532, + "step": 118960 + }, + { + "epoch": 0.7477540117455267, + "grad_norm": 8.492520332336426, + "learning_rate": 1.502003302515444e-05, + "loss": 1.6345, + "step": 118970 + }, + { + "epoch": 0.7478168640622238, + "grad_norm": 7.006572723388672, + "learning_rate": 1.5019613924209787e-05, + "loss": 1.5281, + "step": 118980 + }, + { + "epoch": 0.7478797163789209, + "grad_norm": 6.370782375335693, + "learning_rate": 1.5019194823265134e-05, + "loss": 1.5704, + "step": 118990 + }, + { + "epoch": 0.747942568695618, + "grad_norm": 6.308587074279785, + "learning_rate": 1.501877572232048e-05, + "loss": 1.3109, + "step": 119000 + }, + { + "epoch": 0.7480054210123152, + "grad_norm": 5.765300273895264, + "learning_rate": 1.5018356621375824e-05, + "loss": 1.5817, + "step": 119010 + }, + { + "epoch": 0.7480682733290123, + "grad_norm": 6.795331001281738, + "learning_rate": 1.5017937520431172e-05, + "loss": 1.5964, + "step": 119020 + }, + { + "epoch": 0.7481311256457094, + "grad_norm": 7.549683570861816, + "learning_rate": 1.5017518419486519e-05, + "loss": 1.5406, + "step": 119030 + }, + { + "epoch": 0.7481939779624065, + "grad_norm": 5.843475341796875, + "learning_rate": 1.5017099318541866e-05, + "loss": 1.3048, + "step": 119040 + }, + { + "epoch": 0.7482568302791036, + "grad_norm": 6.7381486892700195, + "learning_rate": 1.5016680217597213e-05, + "loss": 1.7444, + "step": 119050 + }, + { + "epoch": 0.7483196825958007, + "grad_norm": 7.63314962387085, + "learning_rate": 1.5016261116652558e-05, + 
"loss": 1.6775, + "step": 119060 + }, + { + "epoch": 0.7483825349124978, + "grad_norm": 6.3762946128845215, + "learning_rate": 1.5015842015707905e-05, + "loss": 1.5391, + "step": 119070 + }, + { + "epoch": 0.7484453872291948, + "grad_norm": 6.232476234436035, + "learning_rate": 1.501542291476325e-05, + "loss": 1.5976, + "step": 119080 + }, + { + "epoch": 0.748508239545892, + "grad_norm": 6.704739093780518, + "learning_rate": 1.5015003813818598e-05, + "loss": 1.6854, + "step": 119090 + }, + { + "epoch": 0.7485710918625891, + "grad_norm": 6.483764171600342, + "learning_rate": 1.5014584712873943e-05, + "loss": 1.7323, + "step": 119100 + }, + { + "epoch": 0.7486339441792862, + "grad_norm": 5.834470272064209, + "learning_rate": 1.501416561192929e-05, + "loss": 1.6401, + "step": 119110 + }, + { + "epoch": 0.7486967964959833, + "grad_norm": 7.356104850769043, + "learning_rate": 1.5013746510984637e-05, + "loss": 1.5479, + "step": 119120 + }, + { + "epoch": 0.7487596488126804, + "grad_norm": 5.908942222595215, + "learning_rate": 1.5013327410039984e-05, + "loss": 1.6411, + "step": 119130 + }, + { + "epoch": 0.7488225011293775, + "grad_norm": 6.813505172729492, + "learning_rate": 1.5012908309095331e-05, + "loss": 1.4182, + "step": 119140 + }, + { + "epoch": 0.7488853534460747, + "grad_norm": 5.873258113861084, + "learning_rate": 1.5012489208150675e-05, + "loss": 1.5243, + "step": 119150 + }, + { + "epoch": 0.7489482057627718, + "grad_norm": 6.333630084991455, + "learning_rate": 1.5012070107206022e-05, + "loss": 1.605, + "step": 119160 + }, + { + "epoch": 0.7490110580794689, + "grad_norm": 6.718225002288818, + "learning_rate": 1.5011651006261369e-05, + "loss": 1.6831, + "step": 119170 + }, + { + "epoch": 0.749073910396166, + "grad_norm": 6.953636169433594, + "learning_rate": 1.5011231905316716e-05, + "loss": 1.6459, + "step": 119180 + }, + { + "epoch": 0.7491367627128631, + "grad_norm": 6.216082572937012, + "learning_rate": 1.5010812804372062e-05, + "loss": 1.5992, + "step": 
119190 + }, + { + "epoch": 0.7491996150295602, + "grad_norm": 7.1688385009765625, + "learning_rate": 1.5010393703427409e-05, + "loss": 1.6605, + "step": 119200 + }, + { + "epoch": 0.7492624673462573, + "grad_norm": 5.808324337005615, + "learning_rate": 1.5009974602482756e-05, + "loss": 1.6583, + "step": 119210 + }, + { + "epoch": 0.7493253196629545, + "grad_norm": 6.335639953613281, + "learning_rate": 1.5009555501538103e-05, + "loss": 1.655, + "step": 119220 + }, + { + "epoch": 0.7493881719796516, + "grad_norm": 7.026367664337158, + "learning_rate": 1.500913640059345e-05, + "loss": 1.6521, + "step": 119230 + }, + { + "epoch": 0.7494510242963487, + "grad_norm": 6.402663707733154, + "learning_rate": 1.5008717299648794e-05, + "loss": 1.5132, + "step": 119240 + }, + { + "epoch": 0.7495138766130458, + "grad_norm": 7.0340070724487305, + "learning_rate": 1.500829819870414e-05, + "loss": 1.4258, + "step": 119250 + }, + { + "epoch": 0.7495767289297429, + "grad_norm": 6.382259845733643, + "learning_rate": 1.5007879097759488e-05, + "loss": 1.6958, + "step": 119260 + }, + { + "epoch": 0.74963958124644, + "grad_norm": 6.875700950622559, + "learning_rate": 1.5007459996814835e-05, + "loss": 1.6692, + "step": 119270 + }, + { + "epoch": 0.7497024335631371, + "grad_norm": 6.259668350219727, + "learning_rate": 1.500704089587018e-05, + "loss": 1.84, + "step": 119280 + }, + { + "epoch": 0.7497652858798343, + "grad_norm": 6.386197090148926, + "learning_rate": 1.5006621794925527e-05, + "loss": 1.5187, + "step": 119290 + }, + { + "epoch": 0.7498281381965314, + "grad_norm": 6.529397010803223, + "learning_rate": 1.5006202693980874e-05, + "loss": 1.6688, + "step": 119300 + }, + { + "epoch": 0.7498909905132285, + "grad_norm": 5.699202537536621, + "learning_rate": 1.5005783593036221e-05, + "loss": 1.6539, + "step": 119310 + }, + { + "epoch": 0.7499538428299256, + "grad_norm": 6.4420366287231445, + "learning_rate": 1.5005364492091565e-05, + "loss": 1.6209, + "step": 119320 + }, + { + "epoch": 
0.7500166951466227, + "grad_norm": 7.651759147644043, + "learning_rate": 1.5004945391146912e-05, + "loss": 1.6806, + "step": 119330 + }, + { + "epoch": 0.7500795474633197, + "grad_norm": 6.842735767364502, + "learning_rate": 1.5004526290202259e-05, + "loss": 1.7389, + "step": 119340 + }, + { + "epoch": 0.7501423997800168, + "grad_norm": 6.2523956298828125, + "learning_rate": 1.5004107189257606e-05, + "loss": 1.6368, + "step": 119350 + }, + { + "epoch": 0.750205252096714, + "grad_norm": 6.198376178741455, + "learning_rate": 1.5003688088312953e-05, + "loss": 1.8505, + "step": 119360 + }, + { + "epoch": 0.7502681044134111, + "grad_norm": 6.618760108947754, + "learning_rate": 1.5003268987368299e-05, + "loss": 1.9014, + "step": 119370 + }, + { + "epoch": 0.7503309567301082, + "grad_norm": 6.725455284118652, + "learning_rate": 1.5002849886423646e-05, + "loss": 1.6399, + "step": 119380 + }, + { + "epoch": 0.7503938090468053, + "grad_norm": 6.938900947570801, + "learning_rate": 1.5002430785478991e-05, + "loss": 1.3884, + "step": 119390 + }, + { + "epoch": 0.7504566613635024, + "grad_norm": 6.5777997970581055, + "learning_rate": 1.5002011684534338e-05, + "loss": 1.7021, + "step": 119400 + }, + { + "epoch": 0.7505195136801995, + "grad_norm": 6.52875280380249, + "learning_rate": 1.5001592583589684e-05, + "loss": 1.6928, + "step": 119410 + }, + { + "epoch": 0.7505823659968966, + "grad_norm": 5.5812225341796875, + "learning_rate": 1.500117348264503e-05, + "loss": 1.5077, + "step": 119420 + }, + { + "epoch": 0.7506452183135938, + "grad_norm": 6.043531894683838, + "learning_rate": 1.5000754381700378e-05, + "loss": 1.4386, + "step": 119430 + }, + { + "epoch": 0.7507080706302909, + "grad_norm": 7.530872821807861, + "learning_rate": 1.5000335280755725e-05, + "loss": 1.7013, + "step": 119440 + }, + { + "epoch": 0.750770922946988, + "grad_norm": 7.171182632446289, + "learning_rate": 1.4999916179811072e-05, + "loss": 1.7723, + "step": 119450 + }, + { + "epoch": 0.7508337752636851, + 
"grad_norm": 5.6413469314575195, + "learning_rate": 1.4999497078866416e-05, + "loss": 1.6987, + "step": 119460 + }, + { + "epoch": 0.7508966275803822, + "grad_norm": 7.155804634094238, + "learning_rate": 1.4999077977921763e-05, + "loss": 1.4665, + "step": 119470 + }, + { + "epoch": 0.7509594798970793, + "grad_norm": 7.0509562492370605, + "learning_rate": 1.499865887697711e-05, + "loss": 1.6415, + "step": 119480 + }, + { + "epoch": 0.7510223322137765, + "grad_norm": 7.219264030456543, + "learning_rate": 1.4998239776032457e-05, + "loss": 1.497, + "step": 119490 + }, + { + "epoch": 0.7510851845304736, + "grad_norm": 6.636020660400391, + "learning_rate": 1.4997820675087802e-05, + "loss": 1.8566, + "step": 119500 + }, + { + "epoch": 0.7511480368471707, + "grad_norm": 6.3326311111450195, + "learning_rate": 1.499740157414315e-05, + "loss": 1.5311, + "step": 119510 + }, + { + "epoch": 0.7512108891638678, + "grad_norm": 7.0651445388793945, + "learning_rate": 1.4996982473198496e-05, + "loss": 1.5763, + "step": 119520 + }, + { + "epoch": 0.7512737414805649, + "grad_norm": 5.846452713012695, + "learning_rate": 1.4996563372253843e-05, + "loss": 1.4839, + "step": 119530 + }, + { + "epoch": 0.751336593797262, + "grad_norm": 5.834648132324219, + "learning_rate": 1.499614427130919e-05, + "loss": 1.4879, + "step": 119540 + }, + { + "epoch": 0.7513994461139591, + "grad_norm": 6.99166202545166, + "learning_rate": 1.4995725170364534e-05, + "loss": 1.5047, + "step": 119550 + }, + { + "epoch": 0.7514622984306563, + "grad_norm": 6.823217391967773, + "learning_rate": 1.4995306069419881e-05, + "loss": 1.6005, + "step": 119560 + }, + { + "epoch": 0.7515251507473534, + "grad_norm": 6.487685203552246, + "learning_rate": 1.4994886968475228e-05, + "loss": 1.4634, + "step": 119570 + }, + { + "epoch": 0.7515880030640505, + "grad_norm": 6.586220741271973, + "learning_rate": 1.4994467867530575e-05, + "loss": 1.8928, + "step": 119580 + }, + { + "epoch": 0.7516508553807475, + "grad_norm": 
8.22240161895752, + "learning_rate": 1.499404876658592e-05, + "loss": 1.5348, + "step": 119590 + }, + { + "epoch": 0.7517137076974446, + "grad_norm": 6.863974571228027, + "learning_rate": 1.4993629665641268e-05, + "loss": 1.5618, + "step": 119600 + }, + { + "epoch": 0.7517765600141417, + "grad_norm": 6.535792350769043, + "learning_rate": 1.4993210564696615e-05, + "loss": 1.6731, + "step": 119610 + }, + { + "epoch": 0.7518394123308388, + "grad_norm": 5.779008865356445, + "learning_rate": 1.4992791463751962e-05, + "loss": 1.7479, + "step": 119620 + }, + { + "epoch": 0.751902264647536, + "grad_norm": 6.919078350067139, + "learning_rate": 1.4992372362807306e-05, + "loss": 1.6353, + "step": 119630 + }, + { + "epoch": 0.7519651169642331, + "grad_norm": 5.948938369750977, + "learning_rate": 1.4991953261862653e-05, + "loss": 1.4725, + "step": 119640 + }, + { + "epoch": 0.7520279692809302, + "grad_norm": 7.857065200805664, + "learning_rate": 1.4991534160918e-05, + "loss": 1.7719, + "step": 119650 + }, + { + "epoch": 0.7520908215976273, + "grad_norm": 5.382479667663574, + "learning_rate": 1.4991115059973347e-05, + "loss": 1.6679, + "step": 119660 + }, + { + "epoch": 0.7521536739143244, + "grad_norm": 6.866490364074707, + "learning_rate": 1.4990695959028694e-05, + "loss": 1.685, + "step": 119670 + }, + { + "epoch": 0.7522165262310215, + "grad_norm": 6.799786567687988, + "learning_rate": 1.499027685808404e-05, + "loss": 1.7707, + "step": 119680 + }, + { + "epoch": 0.7522793785477186, + "grad_norm": 7.421419620513916, + "learning_rate": 1.4989857757139386e-05, + "loss": 1.4406, + "step": 119690 + }, + { + "epoch": 0.7523422308644158, + "grad_norm": 6.190274715423584, + "learning_rate": 1.4989438656194732e-05, + "loss": 1.4512, + "step": 119700 + }, + { + "epoch": 0.7524050831811129, + "grad_norm": 7.217263698577881, + "learning_rate": 1.4989019555250079e-05, + "loss": 1.3324, + "step": 119710 + }, + { + "epoch": 0.75246793549781, + "grad_norm": 5.917891502380371, + 
"learning_rate": 1.4988600454305424e-05, + "loss": 1.4482, + "step": 119720 + }, + { + "epoch": 0.7525307878145071, + "grad_norm": 6.145440578460693, + "learning_rate": 1.4988181353360771e-05, + "loss": 1.4712, + "step": 119730 + }, + { + "epoch": 0.7525936401312042, + "grad_norm": 5.475290775299072, + "learning_rate": 1.4987762252416118e-05, + "loss": 1.4071, + "step": 119740 + }, + { + "epoch": 0.7526564924479013, + "grad_norm": 8.057433128356934, + "learning_rate": 1.4987343151471465e-05, + "loss": 1.7007, + "step": 119750 + }, + { + "epoch": 0.7527193447645985, + "grad_norm": 6.021699905395508, + "learning_rate": 1.4986924050526812e-05, + "loss": 1.5067, + "step": 119760 + }, + { + "epoch": 0.7527821970812956, + "grad_norm": 6.090139865875244, + "learning_rate": 1.4986504949582156e-05, + "loss": 1.5609, + "step": 119770 + }, + { + "epoch": 0.7528450493979927, + "grad_norm": 6.361364841461182, + "learning_rate": 1.4986085848637503e-05, + "loss": 1.5204, + "step": 119780 + }, + { + "epoch": 0.7529079017146898, + "grad_norm": 7.001276016235352, + "learning_rate": 1.498566674769285e-05, + "loss": 1.4831, + "step": 119790 + }, + { + "epoch": 0.7529707540313869, + "grad_norm": 6.353245258331299, + "learning_rate": 1.4985247646748197e-05, + "loss": 1.5124, + "step": 119800 + }, + { + "epoch": 0.753033606348084, + "grad_norm": 5.2376790046691895, + "learning_rate": 1.4984828545803543e-05, + "loss": 1.4544, + "step": 119810 + }, + { + "epoch": 0.7530964586647811, + "grad_norm": 5.625237941741943, + "learning_rate": 1.498440944485889e-05, + "loss": 1.4652, + "step": 119820 + }, + { + "epoch": 0.7531593109814783, + "grad_norm": 5.442669868469238, + "learning_rate": 1.4983990343914237e-05, + "loss": 1.5617, + "step": 119830 + }, + { + "epoch": 0.7532221632981754, + "grad_norm": 6.807565212249756, + "learning_rate": 1.4983571242969584e-05, + "loss": 1.6231, + "step": 119840 + }, + { + "epoch": 0.7532850156148724, + "grad_norm": 6.105103969573975, + "learning_rate": 
1.4983152142024931e-05, + "loss": 1.5512, + "step": 119850 + }, + { + "epoch": 0.7533478679315695, + "grad_norm": 5.972720623016357, + "learning_rate": 1.4982733041080275e-05, + "loss": 1.6881, + "step": 119860 + }, + { + "epoch": 0.7534107202482666, + "grad_norm": 6.814130783081055, + "learning_rate": 1.4982313940135622e-05, + "loss": 1.5766, + "step": 119870 + }, + { + "epoch": 0.7534735725649637, + "grad_norm": 7.6490559577941895, + "learning_rate": 1.4981894839190969e-05, + "loss": 1.4441, + "step": 119880 + }, + { + "epoch": 0.7535364248816608, + "grad_norm": 6.740894317626953, + "learning_rate": 1.4981475738246316e-05, + "loss": 1.5884, + "step": 119890 + }, + { + "epoch": 0.753599277198358, + "grad_norm": 5.973379135131836, + "learning_rate": 1.4981056637301661e-05, + "loss": 1.7099, + "step": 119900 + }, + { + "epoch": 0.7536621295150551, + "grad_norm": 5.714463710784912, + "learning_rate": 1.4980637536357008e-05, + "loss": 1.4491, + "step": 119910 + }, + { + "epoch": 0.7537249818317522, + "grad_norm": 7.492783069610596, + "learning_rate": 1.4980218435412355e-05, + "loss": 1.6315, + "step": 119920 + }, + { + "epoch": 0.7537878341484493, + "grad_norm": 5.984746932983398, + "learning_rate": 1.4979799334467702e-05, + "loss": 1.551, + "step": 119930 + }, + { + "epoch": 0.7538506864651464, + "grad_norm": 5.90654182434082, + "learning_rate": 1.4979380233523046e-05, + "loss": 1.502, + "step": 119940 + }, + { + "epoch": 0.7539135387818435, + "grad_norm": 6.398062705993652, + "learning_rate": 1.4978961132578393e-05, + "loss": 1.5511, + "step": 119950 + }, + { + "epoch": 0.7539763910985406, + "grad_norm": 6.428770065307617, + "learning_rate": 1.497854203163374e-05, + "loss": 1.4763, + "step": 119960 + }, + { + "epoch": 0.7540392434152378, + "grad_norm": 6.965630531311035, + "learning_rate": 1.4978122930689087e-05, + "loss": 1.34, + "step": 119970 + }, + { + "epoch": 0.7541020957319349, + "grad_norm": 6.642297744750977, + "learning_rate": 1.4977703829744434e-05, + 
"loss": 1.7019, + "step": 119980 + }, + { + "epoch": 0.754164948048632, + "grad_norm": 6.695052623748779, + "learning_rate": 1.497728472879978e-05, + "loss": 1.6922, + "step": 119990 + }, + { + "epoch": 0.7542278003653291, + "grad_norm": 5.795705318450928, + "learning_rate": 1.4976865627855127e-05, + "loss": 1.5766, + "step": 120000 + }, + { + "epoch": 0.7542906526820262, + "grad_norm": 6.503615856170654, + "learning_rate": 1.4976446526910474e-05, + "loss": 1.4383, + "step": 120010 + }, + { + "epoch": 0.7543535049987233, + "grad_norm": 7.517498016357422, + "learning_rate": 1.497602742596582e-05, + "loss": 1.6784, + "step": 120020 + }, + { + "epoch": 0.7544163573154204, + "grad_norm": 6.060889720916748, + "learning_rate": 1.4975608325021165e-05, + "loss": 1.7626, + "step": 120030 + }, + { + "epoch": 0.7544792096321176, + "grad_norm": 7.414714336395264, + "learning_rate": 1.4975189224076512e-05, + "loss": 1.4736, + "step": 120040 + }, + { + "epoch": 0.7545420619488147, + "grad_norm": 8.387202262878418, + "learning_rate": 1.4974770123131859e-05, + "loss": 1.7192, + "step": 120050 + }, + { + "epoch": 0.7546049142655118, + "grad_norm": 5.452805042266846, + "learning_rate": 1.4974351022187206e-05, + "loss": 1.6026, + "step": 120060 + }, + { + "epoch": 0.7546677665822089, + "grad_norm": 5.210594654083252, + "learning_rate": 1.4973931921242553e-05, + "loss": 1.4089, + "step": 120070 + }, + { + "epoch": 0.754730618898906, + "grad_norm": 6.504967212677002, + "learning_rate": 1.4973512820297897e-05, + "loss": 1.5778, + "step": 120080 + }, + { + "epoch": 0.7547934712156031, + "grad_norm": 8.21863079071045, + "learning_rate": 1.4973093719353244e-05, + "loss": 1.6274, + "step": 120090 + }, + { + "epoch": 0.7548563235323001, + "grad_norm": 5.795107364654541, + "learning_rate": 1.497267461840859e-05, + "loss": 1.5769, + "step": 120100 + }, + { + "epoch": 0.7549191758489973, + "grad_norm": 7.059028625488281, + "learning_rate": 1.4972255517463938e-05, + "loss": 1.5492, + "step": 
120110 + }, + { + "epoch": 0.7549820281656944, + "grad_norm": 7.099276542663574, + "learning_rate": 1.4971836416519283e-05, + "loss": 1.607, + "step": 120120 + }, + { + "epoch": 0.7550448804823915, + "grad_norm": 6.764489650726318, + "learning_rate": 1.497141731557463e-05, + "loss": 1.638, + "step": 120130 + }, + { + "epoch": 0.7551077327990886, + "grad_norm": 6.1144609451293945, + "learning_rate": 1.4970998214629977e-05, + "loss": 1.6679, + "step": 120140 + }, + { + "epoch": 0.7551705851157857, + "grad_norm": 7.087555408477783, + "learning_rate": 1.4970579113685324e-05, + "loss": 1.7779, + "step": 120150 + }, + { + "epoch": 0.7552334374324828, + "grad_norm": 6.355130672454834, + "learning_rate": 1.4970160012740671e-05, + "loss": 1.5195, + "step": 120160 + }, + { + "epoch": 0.75529628974918, + "grad_norm": 5.404919624328613, + "learning_rate": 1.4969740911796015e-05, + "loss": 1.526, + "step": 120170 + }, + { + "epoch": 0.7553591420658771, + "grad_norm": 7.248736381530762, + "learning_rate": 1.4969321810851362e-05, + "loss": 1.8076, + "step": 120180 + }, + { + "epoch": 0.7554219943825742, + "grad_norm": 5.894480228424072, + "learning_rate": 1.496890270990671e-05, + "loss": 1.4873, + "step": 120190 + }, + { + "epoch": 0.7554848466992713, + "grad_norm": 6.088755130767822, + "learning_rate": 1.4968483608962056e-05, + "loss": 1.9181, + "step": 120200 + }, + { + "epoch": 0.7555476990159684, + "grad_norm": 6.1901068687438965, + "learning_rate": 1.4968064508017402e-05, + "loss": 1.5689, + "step": 120210 + }, + { + "epoch": 0.7556105513326655, + "grad_norm": 6.730910301208496, + "learning_rate": 1.4967645407072749e-05, + "loss": 1.8808, + "step": 120220 + }, + { + "epoch": 0.7556734036493626, + "grad_norm": 5.562646865844727, + "learning_rate": 1.4967226306128096e-05, + "loss": 1.4225, + "step": 120230 + }, + { + "epoch": 0.7557362559660598, + "grad_norm": 6.61494779586792, + "learning_rate": 1.4966807205183443e-05, + "loss": 1.8968, + "step": 120240 + }, + { + "epoch": 
0.7557991082827569, + "grad_norm": 7.165287971496582, + "learning_rate": 1.4966388104238787e-05, + "loss": 1.8284, + "step": 120250 + }, + { + "epoch": 0.755861960599454, + "grad_norm": 7.3369598388671875, + "learning_rate": 1.4965969003294134e-05, + "loss": 1.6924, + "step": 120260 + }, + { + "epoch": 0.7559248129161511, + "grad_norm": 7.2790632247924805, + "learning_rate": 1.496554990234948e-05, + "loss": 1.7475, + "step": 120270 + }, + { + "epoch": 0.7559876652328482, + "grad_norm": 6.322650909423828, + "learning_rate": 1.4965130801404828e-05, + "loss": 1.6261, + "step": 120280 + }, + { + "epoch": 0.7560505175495453, + "grad_norm": 5.456332683563232, + "learning_rate": 1.4964711700460175e-05, + "loss": 1.4581, + "step": 120290 + }, + { + "epoch": 0.7561133698662424, + "grad_norm": 6.35498046875, + "learning_rate": 1.496429259951552e-05, + "loss": 1.6147, + "step": 120300 + }, + { + "epoch": 0.7561762221829396, + "grad_norm": 6.338553428649902, + "learning_rate": 1.4963873498570867e-05, + "loss": 1.611, + "step": 120310 + }, + { + "epoch": 0.7562390744996367, + "grad_norm": 6.472460746765137, + "learning_rate": 1.4963454397626214e-05, + "loss": 1.5694, + "step": 120320 + }, + { + "epoch": 0.7563019268163338, + "grad_norm": 6.855090141296387, + "learning_rate": 1.496303529668156e-05, + "loss": 1.6434, + "step": 120330 + }, + { + "epoch": 0.7563647791330309, + "grad_norm": 6.079153537750244, + "learning_rate": 1.4962616195736905e-05, + "loss": 1.6676, + "step": 120340 + }, + { + "epoch": 0.756427631449728, + "grad_norm": 6.099603176116943, + "learning_rate": 1.4962197094792252e-05, + "loss": 1.6, + "step": 120350 + }, + { + "epoch": 0.756490483766425, + "grad_norm": 6.089348793029785, + "learning_rate": 1.49617779938476e-05, + "loss": 1.4022, + "step": 120360 + }, + { + "epoch": 0.7565533360831221, + "grad_norm": 6.879031181335449, + "learning_rate": 1.4961358892902946e-05, + "loss": 1.6911, + "step": 120370 + }, + { + "epoch": 0.7566161883998193, + "grad_norm": 
6.240353107452393, + "learning_rate": 1.4960939791958293e-05, + "loss": 1.7932, + "step": 120380 + }, + { + "epoch": 0.7566790407165164, + "grad_norm": 6.377612590789795, + "learning_rate": 1.4960520691013639e-05, + "loss": 1.5124, + "step": 120390 + }, + { + "epoch": 0.7567418930332135, + "grad_norm": 7.6781415939331055, + "learning_rate": 1.496014350016345e-05, + "loss": 1.5271, + "step": 120400 + }, + { + "epoch": 0.7568047453499106, + "grad_norm": 7.081296920776367, + "learning_rate": 1.4959724399218797e-05, + "loss": 1.6089, + "step": 120410 + }, + { + "epoch": 0.7568675976666077, + "grad_norm": 6.226444721221924, + "learning_rate": 1.4959305298274143e-05, + "loss": 1.4901, + "step": 120420 + }, + { + "epoch": 0.7569304499833048, + "grad_norm": 6.075373649597168, + "learning_rate": 1.495888619732949e-05, + "loss": 1.7908, + "step": 120430 + }, + { + "epoch": 0.756993302300002, + "grad_norm": 6.699268817901611, + "learning_rate": 1.4958467096384837e-05, + "loss": 1.7134, + "step": 120440 + }, + { + "epoch": 0.7570561546166991, + "grad_norm": 6.797584533691406, + "learning_rate": 1.4958047995440184e-05, + "loss": 1.7966, + "step": 120450 + }, + { + "epoch": 0.7571190069333962, + "grad_norm": 6.222202777862549, + "learning_rate": 1.4957628894495528e-05, + "loss": 1.5158, + "step": 120460 + }, + { + "epoch": 0.7571818592500933, + "grad_norm": 5.914909839630127, + "learning_rate": 1.4957209793550875e-05, + "loss": 1.5251, + "step": 120470 + }, + { + "epoch": 0.7572447115667904, + "grad_norm": 5.490307807922363, + "learning_rate": 1.4956790692606222e-05, + "loss": 1.6727, + "step": 120480 + }, + { + "epoch": 0.7573075638834875, + "grad_norm": 6.202897548675537, + "learning_rate": 1.4956371591661569e-05, + "loss": 1.7193, + "step": 120490 + }, + { + "epoch": 0.7573704162001846, + "grad_norm": 6.767843246459961, + "learning_rate": 1.4955952490716916e-05, + "loss": 1.6305, + "step": 120500 + }, + { + "epoch": 0.7574332685168818, + "grad_norm": 6.597473621368408, + 
"learning_rate": 1.4955533389772261e-05, + "loss": 1.6484, + "step": 120510 + }, + { + "epoch": 0.7574961208335789, + "grad_norm": 5.778598308563232, + "learning_rate": 1.4955114288827608e-05, + "loss": 1.6532, + "step": 120520 + }, + { + "epoch": 0.757558973150276, + "grad_norm": 6.088018417358398, + "learning_rate": 1.4954695187882955e-05, + "loss": 1.7389, + "step": 120530 + }, + { + "epoch": 0.7576218254669731, + "grad_norm": 6.116697788238525, + "learning_rate": 1.4954276086938302e-05, + "loss": 1.69, + "step": 120540 + }, + { + "epoch": 0.7576846777836702, + "grad_norm": 5.867340087890625, + "learning_rate": 1.4953856985993646e-05, + "loss": 1.4857, + "step": 120550 + }, + { + "epoch": 0.7577475301003673, + "grad_norm": 6.752843379974365, + "learning_rate": 1.4953437885048993e-05, + "loss": 1.5705, + "step": 120560 + }, + { + "epoch": 0.7578103824170644, + "grad_norm": 5.58306884765625, + "learning_rate": 1.495301878410434e-05, + "loss": 1.4287, + "step": 120570 + }, + { + "epoch": 0.7578732347337616, + "grad_norm": 6.616302490234375, + "learning_rate": 1.4952599683159687e-05, + "loss": 1.7193, + "step": 120580 + }, + { + "epoch": 0.7579360870504587, + "grad_norm": 7.261279106140137, + "learning_rate": 1.4952180582215034e-05, + "loss": 1.7444, + "step": 120590 + }, + { + "epoch": 0.7579989393671558, + "grad_norm": 6.934172630310059, + "learning_rate": 1.495176148127038e-05, + "loss": 1.602, + "step": 120600 + }, + { + "epoch": 0.7580617916838528, + "grad_norm": 6.805965900421143, + "learning_rate": 1.4951342380325727e-05, + "loss": 1.6606, + "step": 120610 + }, + { + "epoch": 0.7581246440005499, + "grad_norm": 6.8719282150268555, + "learning_rate": 1.4950923279381074e-05, + "loss": 1.5321, + "step": 120620 + }, + { + "epoch": 0.758187496317247, + "grad_norm": 6.2738189697265625, + "learning_rate": 1.4950504178436421e-05, + "loss": 1.7606, + "step": 120630 + }, + { + "epoch": 0.7582503486339441, + "grad_norm": 6.845386028289795, + "learning_rate": 
1.4950085077491765e-05, + "loss": 1.5743, + "step": 120640 + }, + { + "epoch": 0.7583132009506413, + "grad_norm": 6.946133136749268, + "learning_rate": 1.4949665976547112e-05, + "loss": 1.5849, + "step": 120650 + }, + { + "epoch": 0.7583760532673384, + "grad_norm": 6.396969318389893, + "learning_rate": 1.4949246875602459e-05, + "loss": 1.5044, + "step": 120660 + }, + { + "epoch": 0.7584389055840355, + "grad_norm": 5.696493148803711, + "learning_rate": 1.4948827774657806e-05, + "loss": 1.5822, + "step": 120670 + }, + { + "epoch": 0.7585017579007326, + "grad_norm": 6.477611064910889, + "learning_rate": 1.4948408673713153e-05, + "loss": 1.6271, + "step": 120680 + }, + { + "epoch": 0.7585646102174297, + "grad_norm": 6.455690860748291, + "learning_rate": 1.4947989572768498e-05, + "loss": 1.5837, + "step": 120690 + }, + { + "epoch": 0.7586274625341268, + "grad_norm": 6.538627624511719, + "learning_rate": 1.4947570471823845e-05, + "loss": 1.7551, + "step": 120700 + }, + { + "epoch": 0.7586903148508239, + "grad_norm": 5.95809268951416, + "learning_rate": 1.494715137087919e-05, + "loss": 1.7291, + "step": 120710 + }, + { + "epoch": 0.7587531671675211, + "grad_norm": 6.785862445831299, + "learning_rate": 1.4946732269934538e-05, + "loss": 1.5176, + "step": 120720 + }, + { + "epoch": 0.7588160194842182, + "grad_norm": 7.060803413391113, + "learning_rate": 1.4946313168989883e-05, + "loss": 1.714, + "step": 120730 + }, + { + "epoch": 0.7588788718009153, + "grad_norm": 6.920099258422852, + "learning_rate": 1.494589406804523e-05, + "loss": 1.5574, + "step": 120740 + }, + { + "epoch": 0.7589417241176124, + "grad_norm": 6.748926639556885, + "learning_rate": 1.4945474967100577e-05, + "loss": 1.6358, + "step": 120750 + }, + { + "epoch": 0.7590045764343095, + "grad_norm": 6.599475860595703, + "learning_rate": 1.4945055866155924e-05, + "loss": 1.7025, + "step": 120760 + }, + { + "epoch": 0.7590674287510066, + "grad_norm": 7.630424976348877, + "learning_rate": 1.4944636765211268e-05, + 
"loss": 1.5113, + "step": 120770 + }, + { + "epoch": 0.7591302810677037, + "grad_norm": 6.237452983856201, + "learning_rate": 1.4944217664266615e-05, + "loss": 1.5949, + "step": 120780 + }, + { + "epoch": 0.7591931333844009, + "grad_norm": 6.928414821624756, + "learning_rate": 1.4943798563321962e-05, + "loss": 1.4555, + "step": 120790 + }, + { + "epoch": 0.759255985701098, + "grad_norm": 5.862486839294434, + "learning_rate": 1.494337946237731e-05, + "loss": 1.742, + "step": 120800 + }, + { + "epoch": 0.7593188380177951, + "grad_norm": 5.778768539428711, + "learning_rate": 1.4942960361432656e-05, + "loss": 1.4755, + "step": 120810 + }, + { + "epoch": 0.7593816903344922, + "grad_norm": 6.168492794036865, + "learning_rate": 1.4942541260488002e-05, + "loss": 1.6806, + "step": 120820 + }, + { + "epoch": 0.7594445426511893, + "grad_norm": 6.794562339782715, + "learning_rate": 1.4942122159543349e-05, + "loss": 1.6003, + "step": 120830 + }, + { + "epoch": 0.7595073949678864, + "grad_norm": 5.526988506317139, + "learning_rate": 1.4941703058598696e-05, + "loss": 1.6219, + "step": 120840 + }, + { + "epoch": 0.7595702472845836, + "grad_norm": 6.265909671783447, + "learning_rate": 1.4941283957654043e-05, + "loss": 1.7621, + "step": 120850 + }, + { + "epoch": 0.7596330996012807, + "grad_norm": 6.518588066101074, + "learning_rate": 1.4940864856709387e-05, + "loss": 1.4799, + "step": 120860 + }, + { + "epoch": 0.7596959519179777, + "grad_norm": 7.25003719329834, + "learning_rate": 1.4940445755764734e-05, + "loss": 1.6973, + "step": 120870 + }, + { + "epoch": 0.7597588042346748, + "grad_norm": 7.085589408874512, + "learning_rate": 1.494002665482008e-05, + "loss": 1.4295, + "step": 120880 + }, + { + "epoch": 0.7598216565513719, + "grad_norm": 7.0398406982421875, + "learning_rate": 1.4939607553875428e-05, + "loss": 1.6731, + "step": 120890 + }, + { + "epoch": 0.759884508868069, + "grad_norm": 5.306856155395508, + "learning_rate": 1.4939188452930775e-05, + "loss": 1.6904, + "step": 
120900 + }, + { + "epoch": 0.7599473611847661, + "grad_norm": 7.085124492645264, + "learning_rate": 1.493876935198612e-05, + "loss": 1.4914, + "step": 120910 + }, + { + "epoch": 0.7600102135014632, + "grad_norm": 6.8141069412231445, + "learning_rate": 1.4938350251041467e-05, + "loss": 1.7216, + "step": 120920 + }, + { + "epoch": 0.7600730658181604, + "grad_norm": 7.061438083648682, + "learning_rate": 1.4937931150096814e-05, + "loss": 1.6401, + "step": 120930 + }, + { + "epoch": 0.7601359181348575, + "grad_norm": 6.064600467681885, + "learning_rate": 1.4937512049152162e-05, + "loss": 1.9376, + "step": 120940 + }, + { + "epoch": 0.7601987704515546, + "grad_norm": 6.246726989746094, + "learning_rate": 1.4937092948207505e-05, + "loss": 1.8237, + "step": 120950 + }, + { + "epoch": 0.7602616227682517, + "grad_norm": 10.821596145629883, + "learning_rate": 1.4936673847262852e-05, + "loss": 1.7269, + "step": 120960 + }, + { + "epoch": 0.7603244750849488, + "grad_norm": 5.70432710647583, + "learning_rate": 1.49362547463182e-05, + "loss": 1.817, + "step": 120970 + }, + { + "epoch": 0.7603873274016459, + "grad_norm": 6.571020126342773, + "learning_rate": 1.4935835645373546e-05, + "loss": 1.8675, + "step": 120980 + }, + { + "epoch": 0.7604501797183431, + "grad_norm": 6.246038913726807, + "learning_rate": 1.4935416544428893e-05, + "loss": 1.5206, + "step": 120990 + }, + { + "epoch": 0.7605130320350402, + "grad_norm": 5.737758636474609, + "learning_rate": 1.4934997443484239e-05, + "loss": 1.6251, + "step": 121000 + }, + { + "epoch": 0.7605758843517373, + "grad_norm": 6.515652656555176, + "learning_rate": 1.4934578342539586e-05, + "loss": 1.4381, + "step": 121010 + }, + { + "epoch": 0.7606387366684344, + "grad_norm": 6.005425930023193, + "learning_rate": 1.4934159241594931e-05, + "loss": 1.652, + "step": 121020 + }, + { + "epoch": 0.7607015889851315, + "grad_norm": 6.146503925323486, + "learning_rate": 1.4933740140650278e-05, + "loss": 1.5105, + "step": 121030 + }, + { + "epoch": 
0.7607644413018286, + "grad_norm": 6.9523749351501465, + "learning_rate": 1.4933321039705624e-05, + "loss": 1.7115, + "step": 121040 + }, + { + "epoch": 0.7608272936185257, + "grad_norm": 6.038151264190674, + "learning_rate": 1.493290193876097e-05, + "loss": 1.5377, + "step": 121050 + }, + { + "epoch": 0.7608901459352229, + "grad_norm": 6.48395299911499, + "learning_rate": 1.4932482837816318e-05, + "loss": 1.7367, + "step": 121060 + }, + { + "epoch": 0.76095299825192, + "grad_norm": 5.40743350982666, + "learning_rate": 1.4932063736871665e-05, + "loss": 1.6873, + "step": 121070 + }, + { + "epoch": 0.7610158505686171, + "grad_norm": 5.547600269317627, + "learning_rate": 1.493164463592701e-05, + "loss": 1.546, + "step": 121080 + }, + { + "epoch": 0.7610787028853142, + "grad_norm": 6.854653358459473, + "learning_rate": 1.4931225534982356e-05, + "loss": 1.6727, + "step": 121090 + }, + { + "epoch": 0.7611415552020113, + "grad_norm": 7.525597095489502, + "learning_rate": 1.4930806434037703e-05, + "loss": 1.6834, + "step": 121100 + }, + { + "epoch": 0.7612044075187084, + "grad_norm": 5.9096879959106445, + "learning_rate": 1.493038733309305e-05, + "loss": 1.9445, + "step": 121110 + }, + { + "epoch": 0.7612672598354054, + "grad_norm": 6.041938304901123, + "learning_rate": 1.4929968232148397e-05, + "loss": 1.7379, + "step": 121120 + }, + { + "epoch": 0.7613301121521026, + "grad_norm": 6.086507320404053, + "learning_rate": 1.4929549131203742e-05, + "loss": 1.6905, + "step": 121130 + }, + { + "epoch": 0.7613929644687997, + "grad_norm": 6.364861965179443, + "learning_rate": 1.492913003025909e-05, + "loss": 1.6181, + "step": 121140 + }, + { + "epoch": 0.7614558167854968, + "grad_norm": 5.678732395172119, + "learning_rate": 1.4928710929314436e-05, + "loss": 1.6366, + "step": 121150 + }, + { + "epoch": 0.7615186691021939, + "grad_norm": 6.15526819229126, + "learning_rate": 1.4928291828369784e-05, + "loss": 1.6084, + "step": 121160 + }, + { + "epoch": 0.761581521418891, + 
"grad_norm": 6.950185775756836, + "learning_rate": 1.4927872727425127e-05, + "loss": 1.6141, + "step": 121170 + }, + { + "epoch": 0.7616443737355881, + "grad_norm": 7.115409851074219, + "learning_rate": 1.4927453626480474e-05, + "loss": 1.5466, + "step": 121180 + }, + { + "epoch": 0.7617072260522852, + "grad_norm": 7.771196365356445, + "learning_rate": 1.4927034525535821e-05, + "loss": 1.633, + "step": 121190 + }, + { + "epoch": 0.7617700783689824, + "grad_norm": 6.16953706741333, + "learning_rate": 1.4926615424591168e-05, + "loss": 1.725, + "step": 121200 + }, + { + "epoch": 0.7618329306856795, + "grad_norm": 6.843245983123779, + "learning_rate": 1.4926196323646515e-05, + "loss": 1.595, + "step": 121210 + }, + { + "epoch": 0.7618957830023766, + "grad_norm": 6.203823566436768, + "learning_rate": 1.4925777222701861e-05, + "loss": 1.4425, + "step": 121220 + }, + { + "epoch": 0.7619586353190737, + "grad_norm": 6.376070976257324, + "learning_rate": 1.4925358121757208e-05, + "loss": 1.6897, + "step": 121230 + }, + { + "epoch": 0.7620214876357708, + "grad_norm": 6.103188991546631, + "learning_rate": 1.4924939020812555e-05, + "loss": 1.6413, + "step": 121240 + }, + { + "epoch": 0.7620843399524679, + "grad_norm": 5.701595783233643, + "learning_rate": 1.4924519919867902e-05, + "loss": 1.6432, + "step": 121250 + }, + { + "epoch": 0.762147192269165, + "grad_norm": 7.023159980773926, + "learning_rate": 1.4924100818923246e-05, + "loss": 1.5428, + "step": 121260 + }, + { + "epoch": 0.7622100445858622, + "grad_norm": 7.466994285583496, + "learning_rate": 1.4923681717978593e-05, + "loss": 1.9101, + "step": 121270 + }, + { + "epoch": 0.7622728969025593, + "grad_norm": 5.344905853271484, + "learning_rate": 1.492326261703394e-05, + "loss": 1.2814, + "step": 121280 + }, + { + "epoch": 0.7623357492192564, + "grad_norm": 6.5035719871521, + "learning_rate": 1.4922843516089287e-05, + "loss": 1.5778, + "step": 121290 + }, + { + "epoch": 0.7623986015359535, + "grad_norm": 6.424167156219482, 
+ "learning_rate": 1.4922424415144634e-05, + "loss": 1.7851, + "step": 121300 + }, + { + "epoch": 0.7624614538526506, + "grad_norm": 7.77842903137207, + "learning_rate": 1.492200531419998e-05, + "loss": 1.6441, + "step": 121310 + }, + { + "epoch": 0.7625243061693477, + "grad_norm": 5.498336315155029, + "learning_rate": 1.4921586213255326e-05, + "loss": 1.5566, + "step": 121320 + }, + { + "epoch": 0.7625871584860449, + "grad_norm": 6.364030838012695, + "learning_rate": 1.4921167112310672e-05, + "loss": 1.9447, + "step": 121330 + }, + { + "epoch": 0.762650010802742, + "grad_norm": 5.463682651519775, + "learning_rate": 1.4920748011366019e-05, + "loss": 1.482, + "step": 121340 + }, + { + "epoch": 0.7627128631194391, + "grad_norm": 6.897846698760986, + "learning_rate": 1.4920328910421364e-05, + "loss": 1.4963, + "step": 121350 + }, + { + "epoch": 0.7627757154361362, + "grad_norm": 5.895562171936035, + "learning_rate": 1.4919909809476711e-05, + "loss": 1.7759, + "step": 121360 + }, + { + "epoch": 0.7628385677528333, + "grad_norm": 6.170264720916748, + "learning_rate": 1.4919490708532058e-05, + "loss": 1.6302, + "step": 121370 + }, + { + "epoch": 0.7629014200695303, + "grad_norm": 6.082094192504883, + "learning_rate": 1.4919071607587406e-05, + "loss": 1.5568, + "step": 121380 + }, + { + "epoch": 0.7629642723862274, + "grad_norm": 6.674420356750488, + "learning_rate": 1.4918652506642751e-05, + "loss": 1.7198, + "step": 121390 + }, + { + "epoch": 0.7630271247029246, + "grad_norm": 6.145451068878174, + "learning_rate": 1.4918233405698096e-05, + "loss": 1.4018, + "step": 121400 + }, + { + "epoch": 0.7630899770196217, + "grad_norm": 7.611517906188965, + "learning_rate": 1.4917814304753443e-05, + "loss": 1.8585, + "step": 121410 + }, + { + "epoch": 0.7631528293363188, + "grad_norm": 6.481110572814941, + "learning_rate": 1.491739520380879e-05, + "loss": 1.6318, + "step": 121420 + }, + { + "epoch": 0.7632156816530159, + "grad_norm": 6.759619235992432, + "learning_rate": 
1.4916976102864137e-05, + "loss": 1.7117, + "step": 121430 + }, + { + "epoch": 0.763278533969713, + "grad_norm": 6.643283367156982, + "learning_rate": 1.4916557001919483e-05, + "loss": 1.5947, + "step": 121440 + }, + { + "epoch": 0.7633413862864101, + "grad_norm": 6.595210075378418, + "learning_rate": 1.491613790097483e-05, + "loss": 1.604, + "step": 121450 + }, + { + "epoch": 0.7634042386031072, + "grad_norm": 6.565183639526367, + "learning_rate": 1.4915718800030177e-05, + "loss": 1.6944, + "step": 121460 + }, + { + "epoch": 0.7634670909198044, + "grad_norm": 5.534493923187256, + "learning_rate": 1.4915299699085524e-05, + "loss": 1.7532, + "step": 121470 + }, + { + "epoch": 0.7635299432365015, + "grad_norm": 6.0962443351745605, + "learning_rate": 1.4914880598140868e-05, + "loss": 1.6787, + "step": 121480 + }, + { + "epoch": 0.7635927955531986, + "grad_norm": 6.684833526611328, + "learning_rate": 1.4914461497196215e-05, + "loss": 1.6874, + "step": 121490 + }, + { + "epoch": 0.7636556478698957, + "grad_norm": 6.234200477600098, + "learning_rate": 1.4914042396251562e-05, + "loss": 1.4909, + "step": 121500 + }, + { + "epoch": 0.7637185001865928, + "grad_norm": 6.338442802429199, + "learning_rate": 1.4913623295306909e-05, + "loss": 1.5812, + "step": 121510 + }, + { + "epoch": 0.7637813525032899, + "grad_norm": 6.367171287536621, + "learning_rate": 1.4913204194362256e-05, + "loss": 1.6601, + "step": 121520 + }, + { + "epoch": 0.763844204819987, + "grad_norm": 5.642130374908447, + "learning_rate": 1.4912785093417601e-05, + "loss": 1.6207, + "step": 121530 + }, + { + "epoch": 0.7639070571366842, + "grad_norm": 6.396183967590332, + "learning_rate": 1.4912365992472949e-05, + "loss": 1.5957, + "step": 121540 + }, + { + "epoch": 0.7639699094533813, + "grad_norm": 7.066701412200928, + "learning_rate": 1.4911946891528296e-05, + "loss": 1.7143, + "step": 121550 + }, + { + "epoch": 0.7640327617700784, + "grad_norm": 5.673250675201416, + "learning_rate": 1.4911527790583643e-05, + 
"loss": 1.5213, + "step": 121560 + }, + { + "epoch": 0.7640956140867755, + "grad_norm": 6.310756683349609, + "learning_rate": 1.4911108689638986e-05, + "loss": 1.9226, + "step": 121570 + }, + { + "epoch": 0.7641584664034726, + "grad_norm": 7.087517261505127, + "learning_rate": 1.4910689588694333e-05, + "loss": 1.6778, + "step": 121580 + }, + { + "epoch": 0.7642213187201697, + "grad_norm": 7.2179694175720215, + "learning_rate": 1.491027048774968e-05, + "loss": 1.7289, + "step": 121590 + }, + { + "epoch": 0.7642841710368669, + "grad_norm": 7.069692611694336, + "learning_rate": 1.4909851386805028e-05, + "loss": 1.7558, + "step": 121600 + }, + { + "epoch": 0.764347023353564, + "grad_norm": 7.267943859100342, + "learning_rate": 1.4909432285860375e-05, + "loss": 1.7601, + "step": 121610 + }, + { + "epoch": 0.7644098756702611, + "grad_norm": 7.698588848114014, + "learning_rate": 1.490901318491572e-05, + "loss": 1.5349, + "step": 121620 + }, + { + "epoch": 0.7644727279869581, + "grad_norm": 7.444325923919678, + "learning_rate": 1.4908594083971067e-05, + "loss": 1.6643, + "step": 121630 + }, + { + "epoch": 0.7645355803036552, + "grad_norm": 6.691740989685059, + "learning_rate": 1.4908174983026414e-05, + "loss": 1.7189, + "step": 121640 + }, + { + "epoch": 0.7645984326203523, + "grad_norm": 6.304153919219971, + "learning_rate": 1.490775588208176e-05, + "loss": 1.5654, + "step": 121650 + }, + { + "epoch": 0.7646612849370494, + "grad_norm": 6.096138954162598, + "learning_rate": 1.4907336781137105e-05, + "loss": 1.6734, + "step": 121660 + }, + { + "epoch": 0.7647241372537465, + "grad_norm": 6.932074069976807, + "learning_rate": 1.4906917680192452e-05, + "loss": 1.9373, + "step": 121670 + }, + { + "epoch": 0.7647869895704437, + "grad_norm": 6.2231221199035645, + "learning_rate": 1.4906498579247799e-05, + "loss": 1.4769, + "step": 121680 + }, + { + "epoch": 0.7648498418871408, + "grad_norm": 4.926756381988525, + "learning_rate": 1.4906079478303146e-05, + "loss": 1.5477, + "step": 
121690 + }, + { + "epoch": 0.7649126942038379, + "grad_norm": 7.400648593902588, + "learning_rate": 1.4905660377358491e-05, + "loss": 1.7029, + "step": 121700 + }, + { + "epoch": 0.764975546520535, + "grad_norm": 6.2708210945129395, + "learning_rate": 1.4905241276413837e-05, + "loss": 1.5563, + "step": 121710 + }, + { + "epoch": 0.7650383988372321, + "grad_norm": 5.93113899230957, + "learning_rate": 1.4904822175469184e-05, + "loss": 1.536, + "step": 121720 + }, + { + "epoch": 0.7651012511539292, + "grad_norm": 6.963664531707764, + "learning_rate": 1.4904403074524531e-05, + "loss": 1.6805, + "step": 121730 + }, + { + "epoch": 0.7651641034706264, + "grad_norm": 6.695322036743164, + "learning_rate": 1.4903983973579878e-05, + "loss": 1.7509, + "step": 121740 + }, + { + "epoch": 0.7652269557873235, + "grad_norm": 7.450059413909912, + "learning_rate": 1.4903564872635223e-05, + "loss": 1.7181, + "step": 121750 + }, + { + "epoch": 0.7652898081040206, + "grad_norm": 7.113367080688477, + "learning_rate": 1.490314577169057e-05, + "loss": 1.6927, + "step": 121760 + }, + { + "epoch": 0.7653526604207177, + "grad_norm": 8.61163330078125, + "learning_rate": 1.4902726670745918e-05, + "loss": 1.7477, + "step": 121770 + }, + { + "epoch": 0.7654155127374148, + "grad_norm": 6.27892541885376, + "learning_rate": 1.4902307569801265e-05, + "loss": 1.6296, + "step": 121780 + }, + { + "epoch": 0.7654783650541119, + "grad_norm": 7.385900497436523, + "learning_rate": 1.4901888468856608e-05, + "loss": 1.4883, + "step": 121790 + }, + { + "epoch": 0.765541217370809, + "grad_norm": 6.854226112365723, + "learning_rate": 1.4901469367911955e-05, + "loss": 1.8356, + "step": 121800 + }, + { + "epoch": 0.7656040696875062, + "grad_norm": 7.135695457458496, + "learning_rate": 1.4901050266967302e-05, + "loss": 1.5556, + "step": 121810 + }, + { + "epoch": 0.7656669220042033, + "grad_norm": 6.962805271148682, + "learning_rate": 1.490063116602265e-05, + "loss": 1.522, + "step": 121820 + }, + { + "epoch": 
0.7657297743209004, + "grad_norm": 6.554911136627197, + "learning_rate": 1.4900212065077997e-05, + "loss": 1.7597, + "step": 121830 + }, + { + "epoch": 0.7657926266375975, + "grad_norm": 6.736742973327637, + "learning_rate": 1.4899792964133342e-05, + "loss": 1.7353, + "step": 121840 + }, + { + "epoch": 0.7658554789542946, + "grad_norm": 6.078730583190918, + "learning_rate": 1.4899373863188689e-05, + "loss": 1.5013, + "step": 121850 + }, + { + "epoch": 0.7659183312709917, + "grad_norm": 6.418303489685059, + "learning_rate": 1.4898954762244036e-05, + "loss": 1.5643, + "step": 121860 + }, + { + "epoch": 0.7659811835876889, + "grad_norm": 6.1626996994018555, + "learning_rate": 1.4898535661299383e-05, + "loss": 1.6183, + "step": 121870 + }, + { + "epoch": 0.766044035904386, + "grad_norm": 6.378403186798096, + "learning_rate": 1.4898116560354727e-05, + "loss": 1.4926, + "step": 121880 + }, + { + "epoch": 0.766106888221083, + "grad_norm": 6.052713394165039, + "learning_rate": 1.4897697459410074e-05, + "loss": 1.7107, + "step": 121890 + }, + { + "epoch": 0.7661697405377801, + "grad_norm": 5.871499061584473, + "learning_rate": 1.4897278358465421e-05, + "loss": 1.6582, + "step": 121900 + }, + { + "epoch": 0.7662325928544772, + "grad_norm": 7.141749858856201, + "learning_rate": 1.4896859257520768e-05, + "loss": 1.647, + "step": 121910 + }, + { + "epoch": 0.7662954451711743, + "grad_norm": 6.619952201843262, + "learning_rate": 1.4896440156576115e-05, + "loss": 1.5802, + "step": 121920 + }, + { + "epoch": 0.7663582974878714, + "grad_norm": 6.620635509490967, + "learning_rate": 1.489602105563146e-05, + "loss": 1.5038, + "step": 121930 + }, + { + "epoch": 0.7664211498045685, + "grad_norm": 6.27406644821167, + "learning_rate": 1.4895601954686808e-05, + "loss": 1.7344, + "step": 121940 + }, + { + "epoch": 0.7664840021212657, + "grad_norm": 6.9420270919799805, + "learning_rate": 1.4895182853742155e-05, + "loss": 1.6119, + "step": 121950 + }, + { + "epoch": 0.7665468544379628, + 
"grad_norm": 5.954476833343506, + "learning_rate": 1.48947637527975e-05, + "loss": 1.3455, + "step": 121960 + }, + { + "epoch": 0.7666097067546599, + "grad_norm": 5.4925127029418945, + "learning_rate": 1.4894344651852845e-05, + "loss": 1.3755, + "step": 121970 + }, + { + "epoch": 0.766672559071357, + "grad_norm": 6.041146755218506, + "learning_rate": 1.4893925550908193e-05, + "loss": 1.5138, + "step": 121980 + }, + { + "epoch": 0.7667354113880541, + "grad_norm": 6.702816963195801, + "learning_rate": 1.489350644996354e-05, + "loss": 1.6429, + "step": 121990 + }, + { + "epoch": 0.7667982637047512, + "grad_norm": 5.840723037719727, + "learning_rate": 1.4893087349018887e-05, + "loss": 1.5491, + "step": 122000 + }, + { + "epoch": 0.7668611160214484, + "grad_norm": 6.275969505310059, + "learning_rate": 1.4892668248074232e-05, + "loss": 1.6287, + "step": 122010 + }, + { + "epoch": 0.7669239683381455, + "grad_norm": 5.27000093460083, + "learning_rate": 1.4892249147129579e-05, + "loss": 1.4055, + "step": 122020 + }, + { + "epoch": 0.7669868206548426, + "grad_norm": 7.497596740722656, + "learning_rate": 1.4891830046184924e-05, + "loss": 1.6989, + "step": 122030 + }, + { + "epoch": 0.7670496729715397, + "grad_norm": 6.702113151550293, + "learning_rate": 1.4891410945240272e-05, + "loss": 1.7854, + "step": 122040 + }, + { + "epoch": 0.7671125252882368, + "grad_norm": 6.365345478057861, + "learning_rate": 1.4890991844295619e-05, + "loss": 1.5324, + "step": 122050 + }, + { + "epoch": 0.7671753776049339, + "grad_norm": 6.774776458740234, + "learning_rate": 1.4890572743350964e-05, + "loss": 1.493, + "step": 122060 + }, + { + "epoch": 0.767238229921631, + "grad_norm": 7.4185285568237305, + "learning_rate": 1.4890153642406311e-05, + "loss": 1.7349, + "step": 122070 + }, + { + "epoch": 0.7673010822383282, + "grad_norm": 6.927353382110596, + "learning_rate": 1.4889734541461658e-05, + "loss": 1.6425, + "step": 122080 + }, + { + "epoch": 0.7673639345550253, + "grad_norm": 
5.199800491333008, + "learning_rate": 1.4889315440517005e-05, + "loss": 1.4486, + "step": 122090 + }, + { + "epoch": 0.7674267868717224, + "grad_norm": 6.803194999694824, + "learning_rate": 1.4888938249666815e-05, + "loss": 1.7102, + "step": 122100 + }, + { + "epoch": 0.7674896391884195, + "grad_norm": 6.105362415313721, + "learning_rate": 1.4888519148722162e-05, + "loss": 1.4752, + "step": 122110 + }, + { + "epoch": 0.7675524915051166, + "grad_norm": 7.666367530822754, + "learning_rate": 1.4888100047777509e-05, + "loss": 1.6505, + "step": 122120 + }, + { + "epoch": 0.7676153438218137, + "grad_norm": 6.379772186279297, + "learning_rate": 1.4887680946832856e-05, + "loss": 1.5318, + "step": 122130 + }, + { + "epoch": 0.7676781961385107, + "grad_norm": 6.910738945007324, + "learning_rate": 1.4887261845888201e-05, + "loss": 1.5936, + "step": 122140 + }, + { + "epoch": 0.7677410484552079, + "grad_norm": 5.412835597991943, + "learning_rate": 1.4886842744943549e-05, + "loss": 1.6097, + "step": 122150 + }, + { + "epoch": 0.767803900771905, + "grad_norm": 6.722506046295166, + "learning_rate": 1.4886423643998896e-05, + "loss": 1.6577, + "step": 122160 + }, + { + "epoch": 0.7678667530886021, + "grad_norm": 6.645995616912842, + "learning_rate": 1.4886004543054243e-05, + "loss": 1.4969, + "step": 122170 + }, + { + "epoch": 0.7679296054052992, + "grad_norm": 5.618885517120361, + "learning_rate": 1.4885585442109586e-05, + "loss": 1.6188, + "step": 122180 + }, + { + "epoch": 0.7679924577219963, + "grad_norm": 6.013381481170654, + "learning_rate": 1.4885166341164933e-05, + "loss": 1.5751, + "step": 122190 + }, + { + "epoch": 0.7680553100386934, + "grad_norm": 6.78753137588501, + "learning_rate": 1.488474724022028e-05, + "loss": 1.6351, + "step": 122200 + }, + { + "epoch": 0.7681181623553905, + "grad_norm": 6.380014419555664, + "learning_rate": 1.4884328139275628e-05, + "loss": 1.741, + "step": 122210 + }, + { + "epoch": 0.7681810146720877, + "grad_norm": 6.304038047790527, + 
"learning_rate": 1.4883909038330973e-05, + "loss": 1.8592, + "step": 122220 + }, + { + "epoch": 0.7682438669887848, + "grad_norm": 5.757291316986084, + "learning_rate": 1.488348993738632e-05, + "loss": 1.4222, + "step": 122230 + }, + { + "epoch": 0.7683067193054819, + "grad_norm": 7.418156147003174, + "learning_rate": 1.4883070836441667e-05, + "loss": 1.5522, + "step": 122240 + }, + { + "epoch": 0.768369571622179, + "grad_norm": 6.327040195465088, + "learning_rate": 1.4882651735497014e-05, + "loss": 1.8053, + "step": 122250 + }, + { + "epoch": 0.7684324239388761, + "grad_norm": 5.413448810577393, + "learning_rate": 1.4882232634552361e-05, + "loss": 1.4365, + "step": 122260 + }, + { + "epoch": 0.7684952762555732, + "grad_norm": 7.084179878234863, + "learning_rate": 1.4881813533607705e-05, + "loss": 1.5816, + "step": 122270 + }, + { + "epoch": 0.7685581285722703, + "grad_norm": 6.731881141662598, + "learning_rate": 1.4881394432663052e-05, + "loss": 1.4995, + "step": 122280 + }, + { + "epoch": 0.7686209808889675, + "grad_norm": 7.3260498046875, + "learning_rate": 1.4880975331718399e-05, + "loss": 1.361, + "step": 122290 + }, + { + "epoch": 0.7686838332056646, + "grad_norm": 6.583715915679932, + "learning_rate": 1.4880556230773746e-05, + "loss": 1.752, + "step": 122300 + }, + { + "epoch": 0.7687466855223617, + "grad_norm": 6.189996242523193, + "learning_rate": 1.4880137129829091e-05, + "loss": 1.7227, + "step": 122310 + }, + { + "epoch": 0.7688095378390588, + "grad_norm": 7.296090602874756, + "learning_rate": 1.4879718028884439e-05, + "loss": 1.8045, + "step": 122320 + }, + { + "epoch": 0.7688723901557559, + "grad_norm": 6.088799953460693, + "learning_rate": 1.4879298927939786e-05, + "loss": 1.5412, + "step": 122330 + }, + { + "epoch": 0.768935242472453, + "grad_norm": 6.83470344543457, + "learning_rate": 1.4878879826995131e-05, + "loss": 1.5028, + "step": 122340 + }, + { + "epoch": 0.7689980947891502, + "grad_norm": 6.843228340148926, + "learning_rate": 
1.4878460726050478e-05, + "loss": 1.4771, + "step": 122350 + }, + { + "epoch": 0.7690609471058473, + "grad_norm": 6.270680904388428, + "learning_rate": 1.4878041625105823e-05, + "loss": 1.6067, + "step": 122360 + }, + { + "epoch": 0.7691237994225444, + "grad_norm": 23.329750061035156, + "learning_rate": 1.487762252416117e-05, + "loss": 1.5755, + "step": 122370 + }, + { + "epoch": 0.7691866517392415, + "grad_norm": 6.056442737579346, + "learning_rate": 1.4877203423216518e-05, + "loss": 1.6251, + "step": 122380 + }, + { + "epoch": 0.7692495040559386, + "grad_norm": 5.918461322784424, + "learning_rate": 1.4876784322271865e-05, + "loss": 1.5211, + "step": 122390 + }, + { + "epoch": 0.7693123563726356, + "grad_norm": 5.789246559143066, + "learning_rate": 1.4876365221327208e-05, + "loss": 1.8626, + "step": 122400 + }, + { + "epoch": 0.7693752086893327, + "grad_norm": 6.11323356628418, + "learning_rate": 1.4875946120382555e-05, + "loss": 1.4109, + "step": 122410 + }, + { + "epoch": 0.7694380610060298, + "grad_norm": 7.065954208374023, + "learning_rate": 1.4875527019437902e-05, + "loss": 1.6697, + "step": 122420 + }, + { + "epoch": 0.769500913322727, + "grad_norm": 5.824702739715576, + "learning_rate": 1.487510791849325e-05, + "loss": 1.6175, + "step": 122430 + }, + { + "epoch": 0.7695637656394241, + "grad_norm": 6.183588981628418, + "learning_rate": 1.4874688817548597e-05, + "loss": 1.5873, + "step": 122440 + }, + { + "epoch": 0.7696266179561212, + "grad_norm": 6.583012580871582, + "learning_rate": 1.4874269716603942e-05, + "loss": 1.7353, + "step": 122450 + }, + { + "epoch": 0.7696894702728183, + "grad_norm": 6.278700828552246, + "learning_rate": 1.4873850615659289e-05, + "loss": 1.5125, + "step": 122460 + }, + { + "epoch": 0.7697523225895154, + "grad_norm": 6.665764808654785, + "learning_rate": 1.4873431514714636e-05, + "loss": 1.6511, + "step": 122470 + }, + { + "epoch": 0.7698151749062125, + "grad_norm": 7.0793657302856445, + "learning_rate": 1.4873012413769983e-05, + 
"loss": 1.4307, + "step": 122480 + }, + { + "epoch": 0.7698780272229097, + "grad_norm": 6.780641078948975, + "learning_rate": 1.4872593312825327e-05, + "loss": 1.72, + "step": 122490 + }, + { + "epoch": 0.7699408795396068, + "grad_norm": 6.398270606994629, + "learning_rate": 1.4872174211880674e-05, + "loss": 1.4372, + "step": 122500 + }, + { + "epoch": 0.7700037318563039, + "grad_norm": 7.318230152130127, + "learning_rate": 1.4871755110936021e-05, + "loss": 1.6557, + "step": 122510 + }, + { + "epoch": 0.770066584173001, + "grad_norm": 4.567630290985107, + "learning_rate": 1.4871336009991368e-05, + "loss": 1.5319, + "step": 122520 + }, + { + "epoch": 0.7701294364896981, + "grad_norm": 6.257627964019775, + "learning_rate": 1.4870916909046713e-05, + "loss": 1.7295, + "step": 122530 + }, + { + "epoch": 0.7701922888063952, + "grad_norm": 6.4811201095581055, + "learning_rate": 1.487049780810206e-05, + "loss": 1.533, + "step": 122540 + }, + { + "epoch": 0.7702551411230923, + "grad_norm": 6.295751571655273, + "learning_rate": 1.4870078707157408e-05, + "loss": 1.5123, + "step": 122550 + }, + { + "epoch": 0.7703179934397895, + "grad_norm": 6.236231327056885, + "learning_rate": 1.4869659606212755e-05, + "loss": 1.6454, + "step": 122560 + }, + { + "epoch": 0.7703808457564866, + "grad_norm": 6.155300140380859, + "learning_rate": 1.4869240505268102e-05, + "loss": 1.7777, + "step": 122570 + }, + { + "epoch": 0.7704436980731837, + "grad_norm": 7.4999680519104, + "learning_rate": 1.4868821404323445e-05, + "loss": 1.8179, + "step": 122580 + }, + { + "epoch": 0.7705065503898808, + "grad_norm": 6.615935325622559, + "learning_rate": 1.4868402303378793e-05, + "loss": 1.7054, + "step": 122590 + }, + { + "epoch": 0.7705694027065779, + "grad_norm": 6.052435874938965, + "learning_rate": 1.486798320243414e-05, + "loss": 1.3716, + "step": 122600 + }, + { + "epoch": 0.770632255023275, + "grad_norm": 6.248154640197754, + "learning_rate": 1.4867564101489487e-05, + "loss": 1.6163, + "step": 
122610 + }, + { + "epoch": 0.7706951073399722, + "grad_norm": 6.153942108154297, + "learning_rate": 1.4867145000544832e-05, + "loss": 1.5133, + "step": 122620 + }, + { + "epoch": 0.7707579596566693, + "grad_norm": 6.781783103942871, + "learning_rate": 1.4866725899600179e-05, + "loss": 1.5677, + "step": 122630 + }, + { + "epoch": 0.7708208119733664, + "grad_norm": 6.606148719787598, + "learning_rate": 1.4866306798655526e-05, + "loss": 1.5422, + "step": 122640 + }, + { + "epoch": 0.7708836642900634, + "grad_norm": 6.399189472198486, + "learning_rate": 1.4865887697710872e-05, + "loss": 1.7016, + "step": 122650 + }, + { + "epoch": 0.7709465166067605, + "grad_norm": 6.381160259246826, + "learning_rate": 1.4865468596766219e-05, + "loss": 1.5201, + "step": 122660 + }, + { + "epoch": 0.7710093689234576, + "grad_norm": 5.457291126251221, + "learning_rate": 1.4865049495821564e-05, + "loss": 1.626, + "step": 122670 + }, + { + "epoch": 0.7710722212401547, + "grad_norm": 7.976124286651611, + "learning_rate": 1.4864630394876911e-05, + "loss": 1.3152, + "step": 122680 + }, + { + "epoch": 0.7711350735568518, + "grad_norm": 6.42151403427124, + "learning_rate": 1.4864211293932258e-05, + "loss": 1.5678, + "step": 122690 + }, + { + "epoch": 0.771197925873549, + "grad_norm": 10.819112777709961, + "learning_rate": 1.4863792192987605e-05, + "loss": 1.5978, + "step": 122700 + }, + { + "epoch": 0.7712607781902461, + "grad_norm": 6.606426239013672, + "learning_rate": 1.486337309204295e-05, + "loss": 1.6483, + "step": 122710 + }, + { + "epoch": 0.7713236305069432, + "grad_norm": 6.718689441680908, + "learning_rate": 1.4862953991098296e-05, + "loss": 1.6981, + "step": 122720 + }, + { + "epoch": 0.7713864828236403, + "grad_norm": 7.290799617767334, + "learning_rate": 1.4862534890153643e-05, + "loss": 1.558, + "step": 122730 + }, + { + "epoch": 0.7714493351403374, + "grad_norm": 6.368934154510498, + "learning_rate": 1.486211578920899e-05, + "loss": 1.7425, + "step": 122740 + }, + { + "epoch": 
0.7715121874570345, + "grad_norm": 6.9713897705078125, + "learning_rate": 1.4861696688264337e-05, + "loss": 1.9851, + "step": 122750 + }, + { + "epoch": 0.7715750397737317, + "grad_norm": 7.266341686248779, + "learning_rate": 1.4861277587319683e-05, + "loss": 1.619, + "step": 122760 + }, + { + "epoch": 0.7716378920904288, + "grad_norm": 6.003509998321533, + "learning_rate": 1.486085848637503e-05, + "loss": 1.665, + "step": 122770 + }, + { + "epoch": 0.7717007444071259, + "grad_norm": 6.83775520324707, + "learning_rate": 1.4860439385430377e-05, + "loss": 1.6166, + "step": 122780 + }, + { + "epoch": 0.771763596723823, + "grad_norm": 7.0539422035217285, + "learning_rate": 1.4860020284485724e-05, + "loss": 1.454, + "step": 122790 + }, + { + "epoch": 0.7718264490405201, + "grad_norm": 5.957339763641357, + "learning_rate": 1.4859601183541067e-05, + "loss": 1.6903, + "step": 122800 + }, + { + "epoch": 0.7718893013572172, + "grad_norm": 5.7905192375183105, + "learning_rate": 1.4859182082596415e-05, + "loss": 1.3768, + "step": 122810 + }, + { + "epoch": 0.7719521536739143, + "grad_norm": 7.8568010330200195, + "learning_rate": 1.4858762981651762e-05, + "loss": 1.6476, + "step": 122820 + }, + { + "epoch": 0.7720150059906115, + "grad_norm": 4.974095344543457, + "learning_rate": 1.4858343880707109e-05, + "loss": 1.4265, + "step": 122830 + }, + { + "epoch": 0.7720778583073086, + "grad_norm": 6.108072280883789, + "learning_rate": 1.4857924779762454e-05, + "loss": 2.0263, + "step": 122840 + }, + { + "epoch": 0.7721407106240057, + "grad_norm": 6.359543323516846, + "learning_rate": 1.4857505678817801e-05, + "loss": 1.6471, + "step": 122850 + }, + { + "epoch": 0.7722035629407028, + "grad_norm": 8.34288215637207, + "learning_rate": 1.4857086577873148e-05, + "loss": 1.6121, + "step": 122860 + }, + { + "epoch": 0.7722664152573999, + "grad_norm": 6.9957275390625, + "learning_rate": 1.4856667476928495e-05, + "loss": 1.8025, + "step": 122870 + }, + { + "epoch": 0.772329267574097, + 
"grad_norm": 7.1086201667785645, + "learning_rate": 1.4856248375983842e-05, + "loss": 1.6208, + "step": 122880 + }, + { + "epoch": 0.7723921198907941, + "grad_norm": 6.970386981964111, + "learning_rate": 1.4855829275039186e-05, + "loss": 1.6963, + "step": 122890 + }, + { + "epoch": 0.7724549722074913, + "grad_norm": 6.1792378425598145, + "learning_rate": 1.4855410174094533e-05, + "loss": 1.6584, + "step": 122900 + }, + { + "epoch": 0.7725178245241883, + "grad_norm": 7.270213603973389, + "learning_rate": 1.485499107314988e-05, + "loss": 1.6117, + "step": 122910 + }, + { + "epoch": 0.7725806768408854, + "grad_norm": 6.2892351150512695, + "learning_rate": 1.4854571972205227e-05, + "loss": 1.5591, + "step": 122920 + }, + { + "epoch": 0.7726435291575825, + "grad_norm": 6.083685874938965, + "learning_rate": 1.4854152871260573e-05, + "loss": 1.5936, + "step": 122930 + }, + { + "epoch": 0.7727063814742796, + "grad_norm": 6.550130844116211, + "learning_rate": 1.485373377031592e-05, + "loss": 1.6718, + "step": 122940 + }, + { + "epoch": 0.7727692337909767, + "grad_norm": 5.467560768127441, + "learning_rate": 1.4853314669371267e-05, + "loss": 1.3637, + "step": 122950 + }, + { + "epoch": 0.7728320861076738, + "grad_norm": 6.171477794647217, + "learning_rate": 1.4852895568426614e-05, + "loss": 1.7116, + "step": 122960 + }, + { + "epoch": 0.772894938424371, + "grad_norm": 6.136664867401123, + "learning_rate": 1.485247646748196e-05, + "loss": 1.5895, + "step": 122970 + }, + { + "epoch": 0.7729577907410681, + "grad_norm": 7.264563083648682, + "learning_rate": 1.4852057366537305e-05, + "loss": 1.7034, + "step": 122980 + }, + { + "epoch": 0.7730206430577652, + "grad_norm": 6.602540016174316, + "learning_rate": 1.4851638265592652e-05, + "loss": 1.648, + "step": 122990 + }, + { + "epoch": 0.7730834953744623, + "grad_norm": 8.665788650512695, + "learning_rate": 1.4851219164647999e-05, + "loss": 1.6093, + "step": 123000 + }, + { + "epoch": 0.7731463476911594, + "grad_norm": 
7.170557975769043, + "learning_rate": 1.4850800063703346e-05, + "loss": 1.5464, + "step": 123010 + }, + { + "epoch": 0.7732092000078565, + "grad_norm": 6.581963539123535, + "learning_rate": 1.4850380962758691e-05, + "loss": 1.6343, + "step": 123020 + }, + { + "epoch": 0.7732720523245536, + "grad_norm": 6.883604526519775, + "learning_rate": 1.4849961861814037e-05, + "loss": 1.4948, + "step": 123030 + }, + { + "epoch": 0.7733349046412508, + "grad_norm": 6.112658977508545, + "learning_rate": 1.4849542760869384e-05, + "loss": 1.3697, + "step": 123040 + }, + { + "epoch": 0.7733977569579479, + "grad_norm": 7.500575542449951, + "learning_rate": 1.484912365992473e-05, + "loss": 1.8255, + "step": 123050 + }, + { + "epoch": 0.773460609274645, + "grad_norm": 7.116211891174316, + "learning_rate": 1.4848704558980078e-05, + "loss": 1.5943, + "step": 123060 + }, + { + "epoch": 0.7735234615913421, + "grad_norm": 6.220788478851318, + "learning_rate": 1.4848285458035423e-05, + "loss": 1.4484, + "step": 123070 + }, + { + "epoch": 0.7735863139080392, + "grad_norm": 6.182985305786133, + "learning_rate": 1.484786635709077e-05, + "loss": 1.4905, + "step": 123080 + }, + { + "epoch": 0.7736491662247363, + "grad_norm": 6.292008876800537, + "learning_rate": 1.4847447256146117e-05, + "loss": 1.6471, + "step": 123090 + }, + { + "epoch": 0.7737120185414335, + "grad_norm": 5.59975004196167, + "learning_rate": 1.4847028155201464e-05, + "loss": 1.6638, + "step": 123100 + }, + { + "epoch": 0.7737748708581306, + "grad_norm": 6.636292457580566, + "learning_rate": 1.4846609054256808e-05, + "loss": 1.6325, + "step": 123110 + }, + { + "epoch": 0.7738377231748277, + "grad_norm": 5.51218318939209, + "learning_rate": 1.4846189953312155e-05, + "loss": 1.7696, + "step": 123120 + }, + { + "epoch": 0.7739005754915248, + "grad_norm": 6.089381694793701, + "learning_rate": 1.4845770852367502e-05, + "loss": 1.5208, + "step": 123130 + }, + { + "epoch": 0.7739634278082219, + "grad_norm": 6.221317768096924, + 
"learning_rate": 1.484535175142285e-05, + "loss": 1.5267, + "step": 123140 + }, + { + "epoch": 0.774026280124919, + "grad_norm": 5.6198601722717285, + "learning_rate": 1.4844932650478195e-05, + "loss": 1.5002, + "step": 123150 + }, + { + "epoch": 0.7740891324416161, + "grad_norm": 7.691695690155029, + "learning_rate": 1.4844513549533542e-05, + "loss": 1.6479, + "step": 123160 + }, + { + "epoch": 0.7741519847583131, + "grad_norm": 6.768416404724121, + "learning_rate": 1.4844094448588889e-05, + "loss": 1.2711, + "step": 123170 + }, + { + "epoch": 0.7742148370750103, + "grad_norm": 7.375920295715332, + "learning_rate": 1.4843675347644236e-05, + "loss": 1.5467, + "step": 123180 + }, + { + "epoch": 0.7742776893917074, + "grad_norm": 6.8250346183776855, + "learning_rate": 1.4843256246699583e-05, + "loss": 1.561, + "step": 123190 + }, + { + "epoch": 0.7743405417084045, + "grad_norm": 7.026341438293457, + "learning_rate": 1.4842837145754927e-05, + "loss": 1.823, + "step": 123200 + }, + { + "epoch": 0.7744033940251016, + "grad_norm": 6.942145347595215, + "learning_rate": 1.4842418044810274e-05, + "loss": 1.6228, + "step": 123210 + }, + { + "epoch": 0.7744662463417987, + "grad_norm": 6.909519672393799, + "learning_rate": 1.484199894386562e-05, + "loss": 1.6935, + "step": 123220 + }, + { + "epoch": 0.7745290986584958, + "grad_norm": 6.394974708557129, + "learning_rate": 1.4841579842920968e-05, + "loss": 1.7623, + "step": 123230 + }, + { + "epoch": 0.774591950975193, + "grad_norm": 6.175522804260254, + "learning_rate": 1.4841160741976313e-05, + "loss": 1.4766, + "step": 123240 + }, + { + "epoch": 0.7746548032918901, + "grad_norm": 6.601080417633057, + "learning_rate": 1.484074164103166e-05, + "loss": 1.6036, + "step": 123250 + }, + { + "epoch": 0.7747176556085872, + "grad_norm": 6.722240924835205, + "learning_rate": 1.4840322540087007e-05, + "loss": 1.3793, + "step": 123260 + }, + { + "epoch": 0.7747805079252843, + "grad_norm": 6.2471232414245605, + "learning_rate": 
1.4839903439142354e-05, + "loss": 1.4916, + "step": 123270 + }, + { + "epoch": 0.7748433602419814, + "grad_norm": 5.863661289215088, + "learning_rate": 1.48394843381977e-05, + "loss": 1.817, + "step": 123280 + }, + { + "epoch": 0.7749062125586785, + "grad_norm": 6.506509780883789, + "learning_rate": 1.4839065237253045e-05, + "loss": 1.4565, + "step": 123290 + }, + { + "epoch": 0.7749690648753756, + "grad_norm": 7.0432963371276855, + "learning_rate": 1.4838646136308392e-05, + "loss": 1.5648, + "step": 123300 + }, + { + "epoch": 0.7750319171920728, + "grad_norm": 6.736435890197754, + "learning_rate": 1.483822703536374e-05, + "loss": 1.646, + "step": 123310 + }, + { + "epoch": 0.7750947695087699, + "grad_norm": 6.402501106262207, + "learning_rate": 1.4837807934419086e-05, + "loss": 1.6051, + "step": 123320 + }, + { + "epoch": 0.775157621825467, + "grad_norm": 6.6225409507751465, + "learning_rate": 1.4837388833474432e-05, + "loss": 1.7524, + "step": 123330 + }, + { + "epoch": 0.7752204741421641, + "grad_norm": 5.071566104888916, + "learning_rate": 1.4836969732529777e-05, + "loss": 1.4562, + "step": 123340 + }, + { + "epoch": 0.7752833264588612, + "grad_norm": 5.7586188316345215, + "learning_rate": 1.4836550631585124e-05, + "loss": 1.6082, + "step": 123350 + }, + { + "epoch": 0.7753461787755583, + "grad_norm": 8.436548233032227, + "learning_rate": 1.4836131530640471e-05, + "loss": 1.4547, + "step": 123360 + }, + { + "epoch": 0.7754090310922555, + "grad_norm": 7.204708099365234, + "learning_rate": 1.4835712429695818e-05, + "loss": 1.5851, + "step": 123370 + }, + { + "epoch": 0.7754718834089526, + "grad_norm": 6.138442516326904, + "learning_rate": 1.4835293328751164e-05, + "loss": 1.669, + "step": 123380 + }, + { + "epoch": 0.7755347357256497, + "grad_norm": 6.008743762969971, + "learning_rate": 1.483487422780651e-05, + "loss": 1.5813, + "step": 123390 + }, + { + "epoch": 0.7755975880423468, + "grad_norm": 5.30795955657959, + "learning_rate": 1.4834455126861858e-05, + 
"loss": 1.6783, + "step": 123400 + }, + { + "epoch": 0.7756604403590439, + "grad_norm": 6.605019569396973, + "learning_rate": 1.4834036025917205e-05, + "loss": 1.768, + "step": 123410 + }, + { + "epoch": 0.7757232926757409, + "grad_norm": 6.18769645690918, + "learning_rate": 1.4833616924972549e-05, + "loss": 1.4924, + "step": 123420 + }, + { + "epoch": 0.775786144992438, + "grad_norm": 6.021730422973633, + "learning_rate": 1.4833197824027896e-05, + "loss": 1.446, + "step": 123430 + }, + { + "epoch": 0.7758489973091351, + "grad_norm": 6.385734558105469, + "learning_rate": 1.4832778723083243e-05, + "loss": 1.6865, + "step": 123440 + }, + { + "epoch": 0.7759118496258323, + "grad_norm": 5.815408229827881, + "learning_rate": 1.483235962213859e-05, + "loss": 1.5873, + "step": 123450 + }, + { + "epoch": 0.7759747019425294, + "grad_norm": 6.392560005187988, + "learning_rate": 1.4831940521193935e-05, + "loss": 1.5407, + "step": 123460 + }, + { + "epoch": 0.7760375542592265, + "grad_norm": 6.754059791564941, + "learning_rate": 1.4831521420249282e-05, + "loss": 1.7748, + "step": 123470 + }, + { + "epoch": 0.7761004065759236, + "grad_norm": 5.6090312004089355, + "learning_rate": 1.483110231930463e-05, + "loss": 1.6933, + "step": 123480 + }, + { + "epoch": 0.7761632588926207, + "grad_norm": 6.376821517944336, + "learning_rate": 1.4830683218359976e-05, + "loss": 1.709, + "step": 123490 + }, + { + "epoch": 0.7762261112093178, + "grad_norm": 6.824679851531982, + "learning_rate": 1.4830264117415323e-05, + "loss": 1.7707, + "step": 123500 + }, + { + "epoch": 0.776288963526015, + "grad_norm": 6.89393949508667, + "learning_rate": 1.4829845016470667e-05, + "loss": 1.7062, + "step": 123510 + }, + { + "epoch": 0.7763518158427121, + "grad_norm": 7.074446678161621, + "learning_rate": 1.4829425915526014e-05, + "loss": 1.9918, + "step": 123520 + }, + { + "epoch": 0.7764146681594092, + "grad_norm": 6.103003978729248, + "learning_rate": 1.4829006814581361e-05, + "loss": 1.3948, + "step": 
123530 + }, + { + "epoch": 0.7764775204761063, + "grad_norm": 6.617283344268799, + "learning_rate": 1.4828587713636708e-05, + "loss": 1.585, + "step": 123540 + }, + { + "epoch": 0.7765403727928034, + "grad_norm": 6.929607391357422, + "learning_rate": 1.4828168612692054e-05, + "loss": 1.4903, + "step": 123550 + }, + { + "epoch": 0.7766032251095005, + "grad_norm": 5.60888147354126, + "learning_rate": 1.48277495117474e-05, + "loss": 1.6008, + "step": 123560 + }, + { + "epoch": 0.7766660774261976, + "grad_norm": 6.84712553024292, + "learning_rate": 1.4827330410802748e-05, + "loss": 1.5952, + "step": 123570 + }, + { + "epoch": 0.7767289297428948, + "grad_norm": 5.9359846115112305, + "learning_rate": 1.4826911309858095e-05, + "loss": 1.7788, + "step": 123580 + }, + { + "epoch": 0.7767917820595919, + "grad_norm": 5.723293304443359, + "learning_rate": 1.482649220891344e-05, + "loss": 1.6442, + "step": 123590 + }, + { + "epoch": 0.776854634376289, + "grad_norm": 7.651445388793945, + "learning_rate": 1.4826073107968786e-05, + "loss": 1.5432, + "step": 123600 + }, + { + "epoch": 0.7769174866929861, + "grad_norm": 6.02742862701416, + "learning_rate": 1.4825654007024133e-05, + "loss": 1.5669, + "step": 123610 + }, + { + "epoch": 0.7769803390096832, + "grad_norm": 4.787092685699463, + "learning_rate": 1.482523490607948e-05, + "loss": 1.7363, + "step": 123620 + }, + { + "epoch": 0.7770431913263803, + "grad_norm": 5.4203362464904785, + "learning_rate": 1.4824815805134827e-05, + "loss": 1.5502, + "step": 123630 + }, + { + "epoch": 0.7771060436430774, + "grad_norm": 6.5744853019714355, + "learning_rate": 1.4824396704190172e-05, + "loss": 1.5197, + "step": 123640 + }, + { + "epoch": 0.7771688959597746, + "grad_norm": 7.488888263702393, + "learning_rate": 1.482397760324552e-05, + "loss": 1.5051, + "step": 123650 + }, + { + "epoch": 0.7772317482764717, + "grad_norm": 7.3996124267578125, + "learning_rate": 1.4823558502300865e-05, + "loss": 1.7411, + "step": 123660 + }, + { + "epoch": 
0.7772946005931688, + "grad_norm": 6.990914344787598, + "learning_rate": 1.4823139401356212e-05, + "loss": 1.6559, + "step": 123670 + }, + { + "epoch": 0.7773574529098658, + "grad_norm": 4.7599968910217285, + "learning_rate": 1.4822720300411559e-05, + "loss": 1.3768, + "step": 123680 + }, + { + "epoch": 0.7774203052265629, + "grad_norm": 6.931900501251221, + "learning_rate": 1.4822301199466904e-05, + "loss": 1.5697, + "step": 123690 + }, + { + "epoch": 0.77748315754326, + "grad_norm": 7.484622001647949, + "learning_rate": 1.4821882098522251e-05, + "loss": 1.7696, + "step": 123700 + }, + { + "epoch": 0.7775460098599571, + "grad_norm": 6.000463008880615, + "learning_rate": 1.4821462997577598e-05, + "loss": 1.4683, + "step": 123710 + }, + { + "epoch": 0.7776088621766543, + "grad_norm": 6.402760982513428, + "learning_rate": 1.4821043896632945e-05, + "loss": 1.3881, + "step": 123720 + }, + { + "epoch": 0.7776717144933514, + "grad_norm": 6.27096700668335, + "learning_rate": 1.4820624795688289e-05, + "loss": 1.5401, + "step": 123730 + }, + { + "epoch": 0.7777345668100485, + "grad_norm": 7.476364612579346, + "learning_rate": 1.4820205694743636e-05, + "loss": 1.6563, + "step": 123740 + }, + { + "epoch": 0.7777974191267456, + "grad_norm": 7.78745698928833, + "learning_rate": 1.4819786593798983e-05, + "loss": 1.5223, + "step": 123750 + }, + { + "epoch": 0.7778602714434427, + "grad_norm": 6.533060550689697, + "learning_rate": 1.481936749285433e-05, + "loss": 1.6728, + "step": 123760 + }, + { + "epoch": 0.7779231237601398, + "grad_norm": 6.133152484893799, + "learning_rate": 1.4818948391909676e-05, + "loss": 1.5037, + "step": 123770 + }, + { + "epoch": 0.777985976076837, + "grad_norm": 5.495619773864746, + "learning_rate": 1.4818529290965023e-05, + "loss": 1.6131, + "step": 123780 + }, + { + "epoch": 0.7780488283935341, + "grad_norm": 6.892193794250488, + "learning_rate": 1.481811019002037e-05, + "loss": 1.4951, + "step": 123790 + }, + { + "epoch": 0.7781116807102312, + 
"grad_norm": 8.555862426757812, + "learning_rate": 1.4817691089075717e-05, + "loss": 1.4632, + "step": 123800 + }, + { + "epoch": 0.7781745330269283, + "grad_norm": 7.705578804016113, + "learning_rate": 1.4817271988131064e-05, + "loss": 1.817, + "step": 123810 + }, + { + "epoch": 0.7782373853436254, + "grad_norm": 6.521873474121094, + "learning_rate": 1.4816852887186408e-05, + "loss": 1.4848, + "step": 123820 + }, + { + "epoch": 0.7783002376603225, + "grad_norm": 6.840022087097168, + "learning_rate": 1.4816433786241755e-05, + "loss": 1.6988, + "step": 123830 + }, + { + "epoch": 0.7783630899770196, + "grad_norm": 7.270923137664795, + "learning_rate": 1.4816014685297102e-05, + "loss": 1.5247, + "step": 123840 + }, + { + "epoch": 0.7784259422937168, + "grad_norm": 6.0143256187438965, + "learning_rate": 1.4815595584352449e-05, + "loss": 1.5974, + "step": 123850 + }, + { + "epoch": 0.7784887946104139, + "grad_norm": 6.783865928649902, + "learning_rate": 1.4815176483407794e-05, + "loss": 1.6884, + "step": 123860 + }, + { + "epoch": 0.778551646927111, + "grad_norm": 7.086910247802734, + "learning_rate": 1.4814757382463141e-05, + "loss": 1.7329, + "step": 123870 + }, + { + "epoch": 0.7786144992438081, + "grad_norm": 6.956778049468994, + "learning_rate": 1.4814338281518488e-05, + "loss": 1.6973, + "step": 123880 + }, + { + "epoch": 0.7786773515605052, + "grad_norm": 6.142144680023193, + "learning_rate": 1.4813919180573835e-05, + "loss": 1.7874, + "step": 123890 + }, + { + "epoch": 0.7787402038772023, + "grad_norm": 6.882087230682373, + "learning_rate": 1.4813500079629183e-05, + "loss": 1.6705, + "step": 123900 + }, + { + "epoch": 0.7788030561938994, + "grad_norm": 5.532877445220947, + "learning_rate": 1.4813080978684526e-05, + "loss": 1.7642, + "step": 123910 + }, + { + "epoch": 0.7788659085105966, + "grad_norm": 8.262584686279297, + "learning_rate": 1.4812661877739873e-05, + "loss": 1.6743, + "step": 123920 + }, + { + "epoch": 0.7789287608272936, + "grad_norm": 
6.192286014556885, + "learning_rate": 1.481224277679522e-05, + "loss": 1.5542, + "step": 123930 + }, + { + "epoch": 0.7789916131439907, + "grad_norm": 6.131222248077393, + "learning_rate": 1.4811823675850567e-05, + "loss": 1.5012, + "step": 123940 + }, + { + "epoch": 0.7790544654606878, + "grad_norm": 6.869941234588623, + "learning_rate": 1.4811404574905913e-05, + "loss": 1.7849, + "step": 123950 + }, + { + "epoch": 0.7791173177773849, + "grad_norm": 6.112270832061768, + "learning_rate": 1.481098547396126e-05, + "loss": 1.3971, + "step": 123960 + }, + { + "epoch": 0.779180170094082, + "grad_norm": 8.105659484863281, + "learning_rate": 1.4810566373016605e-05, + "loss": 1.8585, + "step": 123970 + }, + { + "epoch": 0.7792430224107791, + "grad_norm": 6.909567356109619, + "learning_rate": 1.4810147272071952e-05, + "loss": 1.5651, + "step": 123980 + }, + { + "epoch": 0.7793058747274763, + "grad_norm": 6.206493377685547, + "learning_rate": 1.48097281711273e-05, + "loss": 1.6405, + "step": 123990 + }, + { + "epoch": 0.7793687270441734, + "grad_norm": 6.888452529907227, + "learning_rate": 1.4809309070182645e-05, + "loss": 1.5659, + "step": 124000 + }, + { + "epoch": 0.7794315793608705, + "grad_norm": 5.350739479064941, + "learning_rate": 1.4808889969237992e-05, + "loss": 1.4354, + "step": 124010 + }, + { + "epoch": 0.7794944316775676, + "grad_norm": 6.917699813842773, + "learning_rate": 1.4808470868293339e-05, + "loss": 1.5024, + "step": 124020 + }, + { + "epoch": 0.7795572839942647, + "grad_norm": 6.004730224609375, + "learning_rate": 1.4808051767348686e-05, + "loss": 1.8792, + "step": 124030 + }, + { + "epoch": 0.7796201363109618, + "grad_norm": 6.729684352874756, + "learning_rate": 1.480763266640403e-05, + "loss": 1.7051, + "step": 124040 + }, + { + "epoch": 0.779682988627659, + "grad_norm": 6.808758735656738, + "learning_rate": 1.4807213565459377e-05, + "loss": 1.6628, + "step": 124050 + }, + { + "epoch": 0.7797458409443561, + "grad_norm": 7.212730407714844, + 
"learning_rate": 1.4806794464514724e-05, + "loss": 1.601, + "step": 124060 + }, + { + "epoch": 0.7798086932610532, + "grad_norm": 7.3029375076293945, + "learning_rate": 1.4806375363570071e-05, + "loss": 1.4577, + "step": 124070 + }, + { + "epoch": 0.7798715455777503, + "grad_norm": 5.240572929382324, + "learning_rate": 1.4805956262625416e-05, + "loss": 1.4778, + "step": 124080 + }, + { + "epoch": 0.7799343978944474, + "grad_norm": 6.397128582000732, + "learning_rate": 1.4805537161680763e-05, + "loss": 1.6948, + "step": 124090 + }, + { + "epoch": 0.7799972502111445, + "grad_norm": 5.794658660888672, + "learning_rate": 1.480511806073611e-05, + "loss": 1.6997, + "step": 124100 + }, + { + "epoch": 0.7800601025278416, + "grad_norm": 7.616403102874756, + "learning_rate": 1.4804698959791457e-05, + "loss": 1.8753, + "step": 124110 + }, + { + "epoch": 0.7801229548445388, + "grad_norm": 7.255417823791504, + "learning_rate": 1.4804279858846805e-05, + "loss": 1.653, + "step": 124120 + }, + { + "epoch": 0.7801858071612359, + "grad_norm": 6.627250671386719, + "learning_rate": 1.4803860757902148e-05, + "loss": 1.7013, + "step": 124130 + }, + { + "epoch": 0.780248659477933, + "grad_norm": 6.056670665740967, + "learning_rate": 1.4803441656957495e-05, + "loss": 1.7038, + "step": 124140 + }, + { + "epoch": 0.7803115117946301, + "grad_norm": 6.512842655181885, + "learning_rate": 1.4803022556012842e-05, + "loss": 1.5134, + "step": 124150 + }, + { + "epoch": 0.7803743641113272, + "grad_norm": 6.630873203277588, + "learning_rate": 1.480260345506819e-05, + "loss": 1.5681, + "step": 124160 + }, + { + "epoch": 0.7804372164280243, + "grad_norm": 7.094987869262695, + "learning_rate": 1.4802184354123535e-05, + "loss": 1.5769, + "step": 124170 + }, + { + "epoch": 0.7805000687447214, + "grad_norm": 6.1788554191589355, + "learning_rate": 1.4801765253178882e-05, + "loss": 1.7874, + "step": 124180 + }, + { + "epoch": 0.7805629210614184, + "grad_norm": 5.971871376037598, + "learning_rate": 
1.4801346152234229e-05, + "loss": 1.5268, + "step": 124190 + }, + { + "epoch": 0.7806257733781156, + "grad_norm": 6.372351169586182, + "learning_rate": 1.4800927051289576e-05, + "loss": 1.6221, + "step": 124200 + }, + { + "epoch": 0.7806886256948127, + "grad_norm": 6.5118279457092285, + "learning_rate": 1.4800507950344923e-05, + "loss": 1.7938, + "step": 124210 + }, + { + "epoch": 0.7807514780115098, + "grad_norm": 6.687560558319092, + "learning_rate": 1.4800088849400267e-05, + "loss": 1.9373, + "step": 124220 + }, + { + "epoch": 0.7808143303282069, + "grad_norm": 5.854617118835449, + "learning_rate": 1.4799669748455614e-05, + "loss": 1.6248, + "step": 124230 + }, + { + "epoch": 0.780877182644904, + "grad_norm": 7.229498386383057, + "learning_rate": 1.4799250647510961e-05, + "loss": 1.6186, + "step": 124240 + }, + { + "epoch": 0.7809400349616011, + "grad_norm": 6.641530513763428, + "learning_rate": 1.4798831546566308e-05, + "loss": 1.578, + "step": 124250 + }, + { + "epoch": 0.7810028872782983, + "grad_norm": 6.960888385772705, + "learning_rate": 1.4798412445621653e-05, + "loss": 1.7438, + "step": 124260 + }, + { + "epoch": 0.7810657395949954, + "grad_norm": 6.048774719238281, + "learning_rate": 1.4797993344677e-05, + "loss": 1.4759, + "step": 124270 + }, + { + "epoch": 0.7811285919116925, + "grad_norm": 6.7953104972839355, + "learning_rate": 1.4797574243732346e-05, + "loss": 1.5316, + "step": 124280 + }, + { + "epoch": 0.7811914442283896, + "grad_norm": 6.214657306671143, + "learning_rate": 1.4797155142787693e-05, + "loss": 1.6755, + "step": 124290 + }, + { + "epoch": 0.7812542965450867, + "grad_norm": 6.976639747619629, + "learning_rate": 1.479673604184304e-05, + "loss": 1.7267, + "step": 124300 + }, + { + "epoch": 0.7813171488617838, + "grad_norm": 7.6554694175720215, + "learning_rate": 1.4796316940898385e-05, + "loss": 1.5184, + "step": 124310 + }, + { + "epoch": 0.7813800011784809, + "grad_norm": 7.21309232711792, + "learning_rate": 1.4795897839953732e-05, + 
"loss": 1.702, + "step": 124320 + }, + { + "epoch": 0.7814428534951781, + "grad_norm": 6.164888381958008, + "learning_rate": 1.479547873900908e-05, + "loss": 1.4974, + "step": 124330 + }, + { + "epoch": 0.7815057058118752, + "grad_norm": 6.911844730377197, + "learning_rate": 1.4795059638064427e-05, + "loss": 1.7752, + "step": 124340 + }, + { + "epoch": 0.7815685581285723, + "grad_norm": 5.653620719909668, + "learning_rate": 1.479464053711977e-05, + "loss": 1.6673, + "step": 124350 + }, + { + "epoch": 0.7816314104452694, + "grad_norm": 6.437528133392334, + "learning_rate": 1.4794221436175117e-05, + "loss": 1.8561, + "step": 124360 + }, + { + "epoch": 0.7816942627619665, + "grad_norm": 6.037184238433838, + "learning_rate": 1.4793802335230464e-05, + "loss": 1.5835, + "step": 124370 + }, + { + "epoch": 0.7817571150786636, + "grad_norm": 5.882865905761719, + "learning_rate": 1.4793383234285811e-05, + "loss": 1.7838, + "step": 124380 + }, + { + "epoch": 0.7818199673953607, + "grad_norm": 6.115137577056885, + "learning_rate": 1.4792964133341157e-05, + "loss": 1.625, + "step": 124390 + }, + { + "epoch": 0.7818828197120579, + "grad_norm": 7.6811347007751465, + "learning_rate": 1.4792545032396504e-05, + "loss": 1.5176, + "step": 124400 + }, + { + "epoch": 0.781945672028755, + "grad_norm": 8.001575469970703, + "learning_rate": 1.4792125931451851e-05, + "loss": 1.7655, + "step": 124410 + }, + { + "epoch": 0.7820085243454521, + "grad_norm": 5.630826473236084, + "learning_rate": 1.4791706830507198e-05, + "loss": 1.601, + "step": 124420 + }, + { + "epoch": 0.7820713766621492, + "grad_norm": 5.482722759246826, + "learning_rate": 1.4791287729562545e-05, + "loss": 1.4681, + "step": 124430 + }, + { + "epoch": 0.7821342289788462, + "grad_norm": 5.712696552276611, + "learning_rate": 1.4790868628617889e-05, + "loss": 1.5048, + "step": 124440 + }, + { + "epoch": 0.7821970812955433, + "grad_norm": 6.567152500152588, + "learning_rate": 1.4790449527673236e-05, + "loss": 1.6015, + "step": 
124450 + }, + { + "epoch": 0.7822599336122404, + "grad_norm": 5.875940322875977, + "learning_rate": 1.4790030426728583e-05, + "loss": 1.3869, + "step": 124460 + }, + { + "epoch": 0.7823227859289376, + "grad_norm": 7.083212852478027, + "learning_rate": 1.478961132578393e-05, + "loss": 1.9333, + "step": 124470 + }, + { + "epoch": 0.7823856382456347, + "grad_norm": 6.764925003051758, + "learning_rate": 1.4789192224839275e-05, + "loss": 1.6252, + "step": 124480 + }, + { + "epoch": 0.7824484905623318, + "grad_norm": 6.155638217926025, + "learning_rate": 1.4788773123894622e-05, + "loss": 1.6241, + "step": 124490 + }, + { + "epoch": 0.7825113428790289, + "grad_norm": 7.117341995239258, + "learning_rate": 1.478835402294997e-05, + "loss": 1.362, + "step": 124500 + }, + { + "epoch": 0.782574195195726, + "grad_norm": 6.133358478546143, + "learning_rate": 1.4787934922005317e-05, + "loss": 1.3876, + "step": 124510 + }, + { + "epoch": 0.7826370475124231, + "grad_norm": 7.302773475646973, + "learning_rate": 1.4787515821060664e-05, + "loss": 1.5911, + "step": 124520 + }, + { + "epoch": 0.7826998998291202, + "grad_norm": 6.74974250793457, + "learning_rate": 1.4787096720116007e-05, + "loss": 1.9227, + "step": 124530 + }, + { + "epoch": 0.7827627521458174, + "grad_norm": 6.843846321105957, + "learning_rate": 1.4786677619171354e-05, + "loss": 1.5501, + "step": 124540 + }, + { + "epoch": 0.7828256044625145, + "grad_norm": 7.022478103637695, + "learning_rate": 1.4786258518226701e-05, + "loss": 1.6664, + "step": 124550 + }, + { + "epoch": 0.7828884567792116, + "grad_norm": 6.756307125091553, + "learning_rate": 1.4785839417282049e-05, + "loss": 1.9876, + "step": 124560 + }, + { + "epoch": 0.7829513090959087, + "grad_norm": 6.578350067138672, + "learning_rate": 1.4785420316337394e-05, + "loss": 1.4891, + "step": 124570 + }, + { + "epoch": 0.7830141614126058, + "grad_norm": 6.477244853973389, + "learning_rate": 1.4785001215392741e-05, + "loss": 1.6783, + "step": 124580 + }, + { + "epoch": 
0.7830770137293029, + "grad_norm": 6.223769187927246, + "learning_rate": 1.4784582114448088e-05, + "loss": 1.6556, + "step": 124590 + }, + { + "epoch": 0.783139866046, + "grad_norm": 4.901460647583008, + "learning_rate": 1.4784163013503433e-05, + "loss": 1.5048, + "step": 124600 + }, + { + "epoch": 0.7832027183626972, + "grad_norm": 7.089406490325928, + "learning_rate": 1.478374391255878e-05, + "loss": 1.714, + "step": 124610 + }, + { + "epoch": 0.7832655706793943, + "grad_norm": 6.149925231933594, + "learning_rate": 1.4783324811614126e-05, + "loss": 1.5439, + "step": 124620 + }, + { + "epoch": 0.7833284229960914, + "grad_norm": 6.35594367980957, + "learning_rate": 1.4782905710669473e-05, + "loss": 1.6053, + "step": 124630 + }, + { + "epoch": 0.7833912753127885, + "grad_norm": 7.102924823760986, + "learning_rate": 1.478248660972482e-05, + "loss": 1.5252, + "step": 124640 + }, + { + "epoch": 0.7834541276294856, + "grad_norm": 6.171487331390381, + "learning_rate": 1.4782067508780167e-05, + "loss": 1.5834, + "step": 124650 + }, + { + "epoch": 0.7835169799461827, + "grad_norm": 5.921186447143555, + "learning_rate": 1.478164840783551e-05, + "loss": 1.3824, + "step": 124660 + }, + { + "epoch": 0.7835798322628799, + "grad_norm": 5.365461826324463, + "learning_rate": 1.4781229306890858e-05, + "loss": 1.4792, + "step": 124670 + }, + { + "epoch": 0.783642684579577, + "grad_norm": 6.067823886871338, + "learning_rate": 1.4780810205946205e-05, + "loss": 1.5823, + "step": 124680 + }, + { + "epoch": 0.7837055368962741, + "grad_norm": 5.599748134613037, + "learning_rate": 1.4780391105001552e-05, + "loss": 1.7637, + "step": 124690 + }, + { + "epoch": 0.7837683892129711, + "grad_norm": 5.318389415740967, + "learning_rate": 1.4779972004056897e-05, + "loss": 1.5311, + "step": 124700 + }, + { + "epoch": 0.7838312415296682, + "grad_norm": 6.72338342666626, + "learning_rate": 1.4779552903112244e-05, + "loss": 1.5609, + "step": 124710 + }, + { + "epoch": 0.7838940938463653, + "grad_norm": 
6.061426639556885, + "learning_rate": 1.4779133802167591e-05, + "loss": 1.7527, + "step": 124720 + }, + { + "epoch": 0.7839569461630624, + "grad_norm": 5.294411659240723, + "learning_rate": 1.4778714701222939e-05, + "loss": 1.3349, + "step": 124730 + }, + { + "epoch": 0.7840197984797596, + "grad_norm": 6.9191765785217285, + "learning_rate": 1.4778295600278286e-05, + "loss": 1.3523, + "step": 124740 + }, + { + "epoch": 0.7840826507964567, + "grad_norm": 5.913338661193848, + "learning_rate": 1.477787649933363e-05, + "loss": 1.5144, + "step": 124750 + }, + { + "epoch": 0.7841455031131538, + "grad_norm": 5.615120887756348, + "learning_rate": 1.4777457398388976e-05, + "loss": 1.4487, + "step": 124760 + }, + { + "epoch": 0.7842083554298509, + "grad_norm": 7.352620601654053, + "learning_rate": 1.4777038297444323e-05, + "loss": 1.4897, + "step": 124770 + }, + { + "epoch": 0.784271207746548, + "grad_norm": 6.531876564025879, + "learning_rate": 1.477661919649967e-05, + "loss": 1.5274, + "step": 124780 + }, + { + "epoch": 0.7843340600632451, + "grad_norm": 5.982143402099609, + "learning_rate": 1.4776200095555016e-05, + "loss": 1.5094, + "step": 124790 + }, + { + "epoch": 0.7843969123799422, + "grad_norm": 5.861140727996826, + "learning_rate": 1.4775780994610363e-05, + "loss": 1.795, + "step": 124800 + }, + { + "epoch": 0.7844597646966394, + "grad_norm": 6.9778289794921875, + "learning_rate": 1.477536189366571e-05, + "loss": 1.5421, + "step": 124810 + }, + { + "epoch": 0.7845226170133365, + "grad_norm": 5.670233249664307, + "learning_rate": 1.4774942792721057e-05, + "loss": 1.3728, + "step": 124820 + }, + { + "epoch": 0.7845854693300336, + "grad_norm": 6.536532402038574, + "learning_rate": 1.4774523691776404e-05, + "loss": 1.4614, + "step": 124830 + }, + { + "epoch": 0.7846483216467307, + "grad_norm": 6.704484462738037, + "learning_rate": 1.4774104590831748e-05, + "loss": 1.6172, + "step": 124840 + }, + { + "epoch": 0.7847111739634278, + "grad_norm": 7.304074764251709, + 
"learning_rate": 1.4773685489887095e-05, + "loss": 1.7006, + "step": 124850 + }, + { + "epoch": 0.7847740262801249, + "grad_norm": 7.6201395988464355, + "learning_rate": 1.4773266388942442e-05, + "loss": 1.6694, + "step": 124860 + }, + { + "epoch": 0.784836878596822, + "grad_norm": 7.4897284507751465, + "learning_rate": 1.4772847287997789e-05, + "loss": 1.5719, + "step": 124870 + }, + { + "epoch": 0.7848997309135192, + "grad_norm": 5.951974868774414, + "learning_rate": 1.4772428187053134e-05, + "loss": 1.6502, + "step": 124880 + }, + { + "epoch": 0.7849625832302163, + "grad_norm": 4.7976837158203125, + "learning_rate": 1.4772009086108482e-05, + "loss": 1.4351, + "step": 124890 + }, + { + "epoch": 0.7850254355469134, + "grad_norm": 6.078033447265625, + "learning_rate": 1.4771589985163829e-05, + "loss": 1.6329, + "step": 124900 + }, + { + "epoch": 0.7850882878636105, + "grad_norm": 6.36521053314209, + "learning_rate": 1.4771170884219174e-05, + "loss": 1.4933, + "step": 124910 + }, + { + "epoch": 0.7851511401803076, + "grad_norm": 6.84323787689209, + "learning_rate": 1.4770751783274521e-05, + "loss": 1.6187, + "step": 124920 + }, + { + "epoch": 0.7852139924970047, + "grad_norm": 6.295436859130859, + "learning_rate": 1.4770332682329866e-05, + "loss": 1.5152, + "step": 124930 + }, + { + "epoch": 0.7852768448137019, + "grad_norm": 6.235681056976318, + "learning_rate": 1.4769913581385213e-05, + "loss": 1.3081, + "step": 124940 + }, + { + "epoch": 0.7853396971303989, + "grad_norm": 5.9656476974487305, + "learning_rate": 1.476949448044056e-05, + "loss": 1.549, + "step": 124950 + }, + { + "epoch": 0.785402549447096, + "grad_norm": 6.388768196105957, + "learning_rate": 1.4769075379495908e-05, + "loss": 1.5615, + "step": 124960 + }, + { + "epoch": 0.7854654017637931, + "grad_norm": 5.354004383087158, + "learning_rate": 1.4768656278551253e-05, + "loss": 1.6545, + "step": 124970 + }, + { + "epoch": 0.7855282540804902, + "grad_norm": 6.536776542663574, + "learning_rate": 
1.4768237177606598e-05, + "loss": 1.4357, + "step": 124980 + }, + { + "epoch": 0.7855911063971873, + "grad_norm": 6.764211654663086, + "learning_rate": 1.4767818076661945e-05, + "loss": 1.638, + "step": 124990 + }, + { + "epoch": 0.7856539587138844, + "grad_norm": 7.431994438171387, + "learning_rate": 1.4767398975717293e-05, + "loss": 1.7454, + "step": 125000 + }, + { + "epoch": 0.7857168110305816, + "grad_norm": 7.752243518829346, + "learning_rate": 1.476697987477264e-05, + "loss": 1.6082, + "step": 125010 + }, + { + "epoch": 0.7857796633472787, + "grad_norm": 8.0689058303833, + "learning_rate": 1.4766560773827985e-05, + "loss": 1.3525, + "step": 125020 + }, + { + "epoch": 0.7858425156639758, + "grad_norm": 6.900312900543213, + "learning_rate": 1.4766141672883332e-05, + "loss": 1.524, + "step": 125030 + }, + { + "epoch": 0.7859053679806729, + "grad_norm": 6.975441932678223, + "learning_rate": 1.4765722571938679e-05, + "loss": 1.5605, + "step": 125040 + }, + { + "epoch": 0.78596822029737, + "grad_norm": 7.432410717010498, + "learning_rate": 1.4765303470994026e-05, + "loss": 1.6976, + "step": 125050 + }, + { + "epoch": 0.7860310726140671, + "grad_norm": 5.973094463348389, + "learning_rate": 1.476488437004937e-05, + "loss": 1.5579, + "step": 125060 + }, + { + "epoch": 0.7860939249307642, + "grad_norm": 6.02004337310791, + "learning_rate": 1.4764465269104717e-05, + "loss": 1.6811, + "step": 125070 + }, + { + "epoch": 0.7861567772474614, + "grad_norm": 6.000905513763428, + "learning_rate": 1.4764046168160064e-05, + "loss": 1.3433, + "step": 125080 + }, + { + "epoch": 0.7862196295641585, + "grad_norm": 7.1472249031066895, + "learning_rate": 1.4763627067215411e-05, + "loss": 1.5057, + "step": 125090 + }, + { + "epoch": 0.7862824818808556, + "grad_norm": 6.639767646789551, + "learning_rate": 1.4763207966270756e-05, + "loss": 1.4208, + "step": 125100 + }, + { + "epoch": 0.7863453341975527, + "grad_norm": 6.315929889678955, + "learning_rate": 1.4762788865326104e-05, + 
"loss": 1.7368, + "step": 125110 + }, + { + "epoch": 0.7864081865142498, + "grad_norm": 6.559476375579834, + "learning_rate": 1.476236976438145e-05, + "loss": 1.5957, + "step": 125120 + }, + { + "epoch": 0.7864710388309469, + "grad_norm": 6.75954532623291, + "learning_rate": 1.4761950663436798e-05, + "loss": 1.6511, + "step": 125130 + }, + { + "epoch": 0.786533891147644, + "grad_norm": 7.152742385864258, + "learning_rate": 1.4761531562492145e-05, + "loss": 1.3636, + "step": 125140 + }, + { + "epoch": 0.7865967434643412, + "grad_norm": 6.649284839630127, + "learning_rate": 1.4761112461547488e-05, + "loss": 1.5758, + "step": 125150 + }, + { + "epoch": 0.7866595957810383, + "grad_norm": 6.113519191741943, + "learning_rate": 1.4760693360602835e-05, + "loss": 1.5074, + "step": 125160 + }, + { + "epoch": 0.7867224480977354, + "grad_norm": 6.174967288970947, + "learning_rate": 1.4760274259658183e-05, + "loss": 1.7528, + "step": 125170 + }, + { + "epoch": 0.7867853004144325, + "grad_norm": 6.821481704711914, + "learning_rate": 1.475985515871353e-05, + "loss": 1.5735, + "step": 125180 + }, + { + "epoch": 0.7868481527311296, + "grad_norm": 6.076359272003174, + "learning_rate": 1.4759436057768875e-05, + "loss": 1.9036, + "step": 125190 + }, + { + "epoch": 0.7869110050478267, + "grad_norm": 5.820423126220703, + "learning_rate": 1.4759016956824222e-05, + "loss": 1.4209, + "step": 125200 + }, + { + "epoch": 0.7869738573645237, + "grad_norm": 6.18796968460083, + "learning_rate": 1.475859785587957e-05, + "loss": 1.6042, + "step": 125210 + }, + { + "epoch": 0.7870367096812209, + "grad_norm": 6.335443019866943, + "learning_rate": 1.4758178754934915e-05, + "loss": 1.5408, + "step": 125220 + }, + { + "epoch": 0.787099561997918, + "grad_norm": 7.0418596267700195, + "learning_rate": 1.4757759653990262e-05, + "loss": 1.4753, + "step": 125230 + }, + { + "epoch": 0.7871624143146151, + "grad_norm": 6.072787284851074, + "learning_rate": 1.4757340553045607e-05, + "loss": 1.7044, + "step": 
125240 + }, + { + "epoch": 0.7872252666313122, + "grad_norm": 6.614512920379639, + "learning_rate": 1.4756921452100954e-05, + "loss": 1.5064, + "step": 125250 + }, + { + "epoch": 0.7872881189480093, + "grad_norm": 5.504802703857422, + "learning_rate": 1.4756502351156301e-05, + "loss": 1.4929, + "step": 125260 + }, + { + "epoch": 0.7873509712647064, + "grad_norm": 6.460712909698486, + "learning_rate": 1.4756083250211648e-05, + "loss": 1.6838, + "step": 125270 + }, + { + "epoch": 0.7874138235814035, + "grad_norm": 5.779965877532959, + "learning_rate": 1.4755664149266994e-05, + "loss": 1.7015, + "step": 125280 + }, + { + "epoch": 0.7874766758981007, + "grad_norm": 7.005980014801025, + "learning_rate": 1.4755245048322339e-05, + "loss": 1.7211, + "step": 125290 + }, + { + "epoch": 0.7875395282147978, + "grad_norm": 6.12052583694458, + "learning_rate": 1.4754825947377686e-05, + "loss": 1.7615, + "step": 125300 + }, + { + "epoch": 0.7876023805314949, + "grad_norm": 6.373183727264404, + "learning_rate": 1.4754406846433033e-05, + "loss": 1.5446, + "step": 125310 + }, + { + "epoch": 0.787665232848192, + "grad_norm": 6.870295524597168, + "learning_rate": 1.475398774548838e-05, + "loss": 1.5771, + "step": 125320 + }, + { + "epoch": 0.7877280851648891, + "grad_norm": 6.176692485809326, + "learning_rate": 1.4753568644543726e-05, + "loss": 1.4018, + "step": 125330 + }, + { + "epoch": 0.7877909374815862, + "grad_norm": 6.8330607414245605, + "learning_rate": 1.4753149543599073e-05, + "loss": 1.289, + "step": 125340 + }, + { + "epoch": 0.7878537897982834, + "grad_norm": 6.328117847442627, + "learning_rate": 1.475273044265442e-05, + "loss": 1.4145, + "step": 125350 + }, + { + "epoch": 0.7879166421149805, + "grad_norm": 6.612305641174316, + "learning_rate": 1.4752311341709767e-05, + "loss": 1.6084, + "step": 125360 + }, + { + "epoch": 0.7879794944316776, + "grad_norm": 6.249629020690918, + "learning_rate": 1.475189224076511e-05, + "loss": 1.6003, + "step": 125370 + }, + { + "epoch": 
0.7880423467483747, + "grad_norm": 6.391242027282715, + "learning_rate": 1.4751473139820457e-05, + "loss": 1.6501, + "step": 125380 + }, + { + "epoch": 0.7881051990650718, + "grad_norm": 6.851719379425049, + "learning_rate": 1.4751054038875805e-05, + "loss": 1.6964, + "step": 125390 + }, + { + "epoch": 0.7881680513817689, + "grad_norm": 7.299596309661865, + "learning_rate": 1.4750634937931152e-05, + "loss": 1.6259, + "step": 125400 + }, + { + "epoch": 0.788230903698466, + "grad_norm": 6.680347919464111, + "learning_rate": 1.4750215836986497e-05, + "loss": 1.601, + "step": 125410 + }, + { + "epoch": 0.7882937560151632, + "grad_norm": 5.847686290740967, + "learning_rate": 1.4749796736041844e-05, + "loss": 1.7636, + "step": 125420 + }, + { + "epoch": 0.7883566083318603, + "grad_norm": 6.313074588775635, + "learning_rate": 1.4749377635097191e-05, + "loss": 1.7038, + "step": 125430 + }, + { + "epoch": 0.7884194606485574, + "grad_norm": 6.959484577178955, + "learning_rate": 1.4748958534152538e-05, + "loss": 1.6529, + "step": 125440 + }, + { + "epoch": 0.7884823129652545, + "grad_norm": 7.320756912231445, + "learning_rate": 1.4748539433207885e-05, + "loss": 1.505, + "step": 125450 + }, + { + "epoch": 0.7885451652819515, + "grad_norm": 7.231621265411377, + "learning_rate": 1.4748120332263229e-05, + "loss": 1.5173, + "step": 125460 + }, + { + "epoch": 0.7886080175986486, + "grad_norm": 7.906031131744385, + "learning_rate": 1.4747701231318576e-05, + "loss": 1.6583, + "step": 125470 + }, + { + "epoch": 0.7886708699153457, + "grad_norm": 6.090366363525391, + "learning_rate": 1.4747282130373923e-05, + "loss": 1.5974, + "step": 125480 + }, + { + "epoch": 0.7887337222320429, + "grad_norm": 6.336813926696777, + "learning_rate": 1.474686302942927e-05, + "loss": 1.5699, + "step": 125490 + }, + { + "epoch": 0.78879657454874, + "grad_norm": 6.168365478515625, + "learning_rate": 1.4746443928484616e-05, + "loss": 1.7668, + "step": 125500 + }, + { + "epoch": 0.7888594268654371, + 
"grad_norm": 6.6962714195251465, + "learning_rate": 1.4746024827539963e-05, + "loss": 1.7307, + "step": 125510 + }, + { + "epoch": 0.7889222791821342, + "grad_norm": 6.099548816680908, + "learning_rate": 1.474560572659531e-05, + "loss": 1.7463, + "step": 125520 + }, + { + "epoch": 0.7889851314988313, + "grad_norm": 6.7555437088012695, + "learning_rate": 1.4745186625650657e-05, + "loss": 1.636, + "step": 125530 + }, + { + "epoch": 0.7890479838155284, + "grad_norm": 7.971730709075928, + "learning_rate": 1.4744767524706002e-05, + "loss": 1.6252, + "step": 125540 + }, + { + "epoch": 0.7891108361322255, + "grad_norm": 6.346310138702393, + "learning_rate": 1.4744348423761348e-05, + "loss": 1.6358, + "step": 125550 + }, + { + "epoch": 0.7891736884489227, + "grad_norm": 6.743438720703125, + "learning_rate": 1.4743929322816695e-05, + "loss": 1.5607, + "step": 125560 + }, + { + "epoch": 0.7892365407656198, + "grad_norm": 7.675486087799072, + "learning_rate": 1.4743510221872042e-05, + "loss": 1.5282, + "step": 125570 + }, + { + "epoch": 0.7892993930823169, + "grad_norm": 6.808964729309082, + "learning_rate": 1.4743091120927389e-05, + "loss": 1.3673, + "step": 125580 + }, + { + "epoch": 0.789362245399014, + "grad_norm": 5.577764987945557, + "learning_rate": 1.4742672019982734e-05, + "loss": 1.5494, + "step": 125590 + }, + { + "epoch": 0.7894250977157111, + "grad_norm": 5.431988716125488, + "learning_rate": 1.474225291903808e-05, + "loss": 1.3762, + "step": 125600 + }, + { + "epoch": 0.7894879500324082, + "grad_norm": 5.761918067932129, + "learning_rate": 1.4741833818093427e-05, + "loss": 1.3322, + "step": 125610 + }, + { + "epoch": 0.7895508023491054, + "grad_norm": 6.490145206451416, + "learning_rate": 1.4741414717148774e-05, + "loss": 1.8115, + "step": 125620 + }, + { + "epoch": 0.7896136546658025, + "grad_norm": 6.952646732330322, + "learning_rate": 1.474099561620412e-05, + "loss": 1.6042, + "step": 125630 + }, + { + "epoch": 0.7896765069824996, + "grad_norm": 
6.986286640167236, + "learning_rate": 1.4740576515259466e-05, + "loss": 1.5033, + "step": 125640 + }, + { + "epoch": 0.7897393592991967, + "grad_norm": 5.9131951332092285, + "learning_rate": 1.4740157414314813e-05, + "loss": 1.4577, + "step": 125650 + }, + { + "epoch": 0.7898022116158938, + "grad_norm": 7.052176475524902, + "learning_rate": 1.473973831337016e-05, + "loss": 1.5199, + "step": 125660 + }, + { + "epoch": 0.7898650639325909, + "grad_norm": 7.309354305267334, + "learning_rate": 1.4739319212425507e-05, + "loss": 1.7211, + "step": 125670 + }, + { + "epoch": 0.789927916249288, + "grad_norm": 5.919618606567383, + "learning_rate": 1.4738900111480851e-05, + "loss": 1.6385, + "step": 125680 + }, + { + "epoch": 0.7899907685659852, + "grad_norm": 6.132586479187012, + "learning_rate": 1.4738481010536198e-05, + "loss": 1.68, + "step": 125690 + }, + { + "epoch": 0.7900536208826823, + "grad_norm": 6.035669326782227, + "learning_rate": 1.4738061909591545e-05, + "loss": 1.5735, + "step": 125700 + }, + { + "epoch": 0.7901164731993794, + "grad_norm": 6.01634407043457, + "learning_rate": 1.4737642808646892e-05, + "loss": 1.5432, + "step": 125710 + }, + { + "epoch": 0.7901793255160764, + "grad_norm": 7.341683387756348, + "learning_rate": 1.4737223707702238e-05, + "loss": 1.5898, + "step": 125720 + }, + { + "epoch": 0.7902421778327735, + "grad_norm": 7.209320068359375, + "learning_rate": 1.4736804606757585e-05, + "loss": 1.7538, + "step": 125730 + }, + { + "epoch": 0.7903050301494706, + "grad_norm": 5.579822063446045, + "learning_rate": 1.4736385505812932e-05, + "loss": 1.493, + "step": 125740 + }, + { + "epoch": 0.7903678824661677, + "grad_norm": 6.049619674682617, + "learning_rate": 1.4735966404868279e-05, + "loss": 1.5774, + "step": 125750 + }, + { + "epoch": 0.7904307347828649, + "grad_norm": 6.2716851234436035, + "learning_rate": 1.4735547303923626e-05, + "loss": 1.5059, + "step": 125760 + }, + { + "epoch": 0.790493587099562, + "grad_norm": 6.8997483253479, + 
"learning_rate": 1.473512820297897e-05, + "loss": 1.6502, + "step": 125770 + }, + { + "epoch": 0.7905564394162591, + "grad_norm": 6.963823318481445, + "learning_rate": 1.4734709102034317e-05, + "loss": 1.5679, + "step": 125780 + }, + { + "epoch": 0.7906192917329562, + "grad_norm": 7.256715297698975, + "learning_rate": 1.4734290001089664e-05, + "loss": 1.9256, + "step": 125790 + }, + { + "epoch": 0.7906821440496533, + "grad_norm": 6.907870292663574, + "learning_rate": 1.473387090014501e-05, + "loss": 1.4705, + "step": 125800 + }, + { + "epoch": 0.7907449963663504, + "grad_norm": 5.856288909912109, + "learning_rate": 1.4733451799200356e-05, + "loss": 1.4704, + "step": 125810 + }, + { + "epoch": 0.7908078486830475, + "grad_norm": 6.106492519378662, + "learning_rate": 1.4733032698255703e-05, + "loss": 1.6956, + "step": 125820 + }, + { + "epoch": 0.7908707009997447, + "grad_norm": 7.309217929840088, + "learning_rate": 1.473261359731105e-05, + "loss": 1.8171, + "step": 125830 + }, + { + "epoch": 0.7909335533164418, + "grad_norm": 6.840932369232178, + "learning_rate": 1.4732194496366397e-05, + "loss": 1.6197, + "step": 125840 + }, + { + "epoch": 0.7909964056331389, + "grad_norm": 5.846725940704346, + "learning_rate": 1.4731775395421743e-05, + "loss": 1.6506, + "step": 125850 + }, + { + "epoch": 0.791059257949836, + "grad_norm": 7.0063605308532715, + "learning_rate": 1.4731356294477088e-05, + "loss": 1.9317, + "step": 125860 + }, + { + "epoch": 0.7911221102665331, + "grad_norm": 5.581164360046387, + "learning_rate": 1.4730937193532435e-05, + "loss": 1.5135, + "step": 125870 + }, + { + "epoch": 0.7911849625832302, + "grad_norm": 7.6870598793029785, + "learning_rate": 1.4730518092587782e-05, + "loss": 1.8118, + "step": 125880 + }, + { + "epoch": 0.7912478148999273, + "grad_norm": 5.364130973815918, + "learning_rate": 1.473009899164313e-05, + "loss": 1.4011, + "step": 125890 + }, + { + "epoch": 0.7913106672166245, + "grad_norm": 7.236602306365967, + "learning_rate": 
1.4729679890698475e-05, + "loss": 1.557, + "step": 125900 + }, + { + "epoch": 0.7913735195333216, + "grad_norm": 6.039979457855225, + "learning_rate": 1.472926078975382e-05, + "loss": 1.6451, + "step": 125910 + }, + { + "epoch": 0.7914363718500187, + "grad_norm": 5.8714823722839355, + "learning_rate": 1.4728841688809167e-05, + "loss": 1.6081, + "step": 125920 + }, + { + "epoch": 0.7914992241667158, + "grad_norm": 7.167581558227539, + "learning_rate": 1.4728422587864514e-05, + "loss": 1.85, + "step": 125930 + }, + { + "epoch": 0.7915620764834129, + "grad_norm": 7.117419242858887, + "learning_rate": 1.4728003486919861e-05, + "loss": 1.4089, + "step": 125940 + }, + { + "epoch": 0.79162492880011, + "grad_norm": 6.153970241546631, + "learning_rate": 1.4727584385975207e-05, + "loss": 1.4535, + "step": 125950 + }, + { + "epoch": 0.7916877811168072, + "grad_norm": 7.502902030944824, + "learning_rate": 1.4727165285030554e-05, + "loss": 1.6855, + "step": 125960 + }, + { + "epoch": 0.7917506334335042, + "grad_norm": 5.559210300445557, + "learning_rate": 1.47267461840859e-05, + "loss": 1.4268, + "step": 125970 + }, + { + "epoch": 0.7918134857502013, + "grad_norm": 5.9198527336120605, + "learning_rate": 1.4726327083141248e-05, + "loss": 1.5697, + "step": 125980 + }, + { + "epoch": 0.7918763380668984, + "grad_norm": 6.3073601722717285, + "learning_rate": 1.4725907982196592e-05, + "loss": 1.6166, + "step": 125990 + }, + { + "epoch": 0.7919391903835955, + "grad_norm": 6.572141170501709, + "learning_rate": 1.4725488881251939e-05, + "loss": 1.5899, + "step": 126000 + }, + { + "epoch": 0.7920020427002926, + "grad_norm": 6.698461532592773, + "learning_rate": 1.4725069780307286e-05, + "loss": 1.4765, + "step": 126010 + }, + { + "epoch": 0.7920648950169897, + "grad_norm": 7.350159168243408, + "learning_rate": 1.4724650679362633e-05, + "loss": 1.5179, + "step": 126020 + }, + { + "epoch": 0.7921277473336868, + "grad_norm": 7.004650592803955, + "learning_rate": 1.4724231578417978e-05, + 
"loss": 1.5526, + "step": 126030 + }, + { + "epoch": 0.792190599650384, + "grad_norm": 6.276726245880127, + "learning_rate": 1.4723812477473325e-05, + "loss": 1.5309, + "step": 126040 + }, + { + "epoch": 0.7922534519670811, + "grad_norm": 7.76663064956665, + "learning_rate": 1.4723393376528672e-05, + "loss": 1.6354, + "step": 126050 + }, + { + "epoch": 0.7923163042837782, + "grad_norm": 6.963621139526367, + "learning_rate": 1.472297427558402e-05, + "loss": 1.453, + "step": 126060 + }, + { + "epoch": 0.7923791566004753, + "grad_norm": 5.757785797119141, + "learning_rate": 1.4722555174639366e-05, + "loss": 1.6305, + "step": 126070 + }, + { + "epoch": 0.7924420089171724, + "grad_norm": 6.344888687133789, + "learning_rate": 1.472213607369471e-05, + "loss": 1.5846, + "step": 126080 + }, + { + "epoch": 0.7925048612338695, + "grad_norm": 5.937010288238525, + "learning_rate": 1.4721716972750057e-05, + "loss": 1.5592, + "step": 126090 + }, + { + "epoch": 0.7925677135505667, + "grad_norm": 6.348926544189453, + "learning_rate": 1.4721297871805404e-05, + "loss": 1.6011, + "step": 126100 + }, + { + "epoch": 0.7926305658672638, + "grad_norm": 6.579949855804443, + "learning_rate": 1.4720878770860751e-05, + "loss": 1.7442, + "step": 126110 + }, + { + "epoch": 0.7926934181839609, + "grad_norm": 5.932890892028809, + "learning_rate": 1.4720459669916097e-05, + "loss": 1.6936, + "step": 126120 + }, + { + "epoch": 0.792756270500658, + "grad_norm": 6.792901992797852, + "learning_rate": 1.4720040568971444e-05, + "loss": 1.8082, + "step": 126130 + }, + { + "epoch": 0.7928191228173551, + "grad_norm": 5.802411079406738, + "learning_rate": 1.471962146802679e-05, + "loss": 1.654, + "step": 126140 + }, + { + "epoch": 0.7928819751340522, + "grad_norm": 7.658969879150391, + "learning_rate": 1.4719202367082138e-05, + "loss": 1.8559, + "step": 126150 + }, + { + "epoch": 0.7929448274507493, + "grad_norm": 6.637668132781982, + "learning_rate": 1.4718783266137483e-05, + "loss": 1.6167, + "step": 
126160 + }, + { + "epoch": 0.7930076797674465, + "grad_norm": 6.326990127563477, + "learning_rate": 1.4718364165192829e-05, + "loss": 1.7941, + "step": 126170 + }, + { + "epoch": 0.7930705320841436, + "grad_norm": 6.690548419952393, + "learning_rate": 1.4717945064248176e-05, + "loss": 1.7078, + "step": 126180 + }, + { + "epoch": 0.7931333844008407, + "grad_norm": 6.911334037780762, + "learning_rate": 1.4717525963303523e-05, + "loss": 1.5828, + "step": 126190 + }, + { + "epoch": 0.7931962367175378, + "grad_norm": 7.0854926109313965, + "learning_rate": 1.471710686235887e-05, + "loss": 1.4639, + "step": 126200 + }, + { + "epoch": 0.7932590890342349, + "grad_norm": 6.275550842285156, + "learning_rate": 1.4716687761414215e-05, + "loss": 1.7028, + "step": 126210 + }, + { + "epoch": 0.793321941350932, + "grad_norm": 6.527076244354248, + "learning_rate": 1.4716268660469562e-05, + "loss": 1.6216, + "step": 126220 + }, + { + "epoch": 0.793384793667629, + "grad_norm": 7.024774074554443, + "learning_rate": 1.4715849559524908e-05, + "loss": 1.609, + "step": 126230 + }, + { + "epoch": 0.7934476459843262, + "grad_norm": 5.79679536819458, + "learning_rate": 1.4715430458580255e-05, + "loss": 1.6176, + "step": 126240 + }, + { + "epoch": 0.7935104983010233, + "grad_norm": 6.3160247802734375, + "learning_rate": 1.4715011357635602e-05, + "loss": 1.6761, + "step": 126250 + }, + { + "epoch": 0.7935733506177204, + "grad_norm": 6.258047103881836, + "learning_rate": 1.4714592256690947e-05, + "loss": 1.4803, + "step": 126260 + }, + { + "epoch": 0.7936362029344175, + "grad_norm": 7.629281997680664, + "learning_rate": 1.4714173155746294e-05, + "loss": 1.5293, + "step": 126270 + }, + { + "epoch": 0.7936990552511146, + "grad_norm": 6.507336139678955, + "learning_rate": 1.4713754054801641e-05, + "loss": 1.8054, + "step": 126280 + }, + { + "epoch": 0.7937619075678117, + "grad_norm": 6.551023483276367, + "learning_rate": 1.4713334953856988e-05, + "loss": 1.616, + "step": 126290 + }, + { + "epoch": 
0.7938247598845088, + "grad_norm": 6.801031112670898, + "learning_rate": 1.4712915852912332e-05, + "loss": 1.5809, + "step": 126300 + }, + { + "epoch": 0.793887612201206, + "grad_norm": 7.758546352386475, + "learning_rate": 1.471249675196768e-05, + "loss": 1.7554, + "step": 126310 + }, + { + "epoch": 0.7939504645179031, + "grad_norm": 5.507966041564941, + "learning_rate": 1.4712077651023026e-05, + "loss": 1.595, + "step": 126320 + }, + { + "epoch": 0.7940133168346002, + "grad_norm": 6.165992259979248, + "learning_rate": 1.4711658550078373e-05, + "loss": 1.5877, + "step": 126330 + }, + { + "epoch": 0.7940761691512973, + "grad_norm": 5.525192737579346, + "learning_rate": 1.4711239449133719e-05, + "loss": 1.7429, + "step": 126340 + }, + { + "epoch": 0.7941390214679944, + "grad_norm": 5.700862407684326, + "learning_rate": 1.4710820348189066e-05, + "loss": 1.5167, + "step": 126350 + }, + { + "epoch": 0.7942018737846915, + "grad_norm": 5.368999004364014, + "learning_rate": 1.4710401247244413e-05, + "loss": 1.5961, + "step": 126360 + }, + { + "epoch": 0.7942647261013887, + "grad_norm": 7.041489124298096, + "learning_rate": 1.470998214629976e-05, + "loss": 1.5955, + "step": 126370 + }, + { + "epoch": 0.7943275784180858, + "grad_norm": 6.7174506187438965, + "learning_rate": 1.4709563045355107e-05, + "loss": 1.461, + "step": 126380 + }, + { + "epoch": 0.7943904307347829, + "grad_norm": 6.197600364685059, + "learning_rate": 1.470914394441045e-05, + "loss": 1.6482, + "step": 126390 + }, + { + "epoch": 0.79445328305148, + "grad_norm": 7.42714786529541, + "learning_rate": 1.4708724843465798e-05, + "loss": 1.5775, + "step": 126400 + }, + { + "epoch": 0.7945161353681771, + "grad_norm": 7.02616548538208, + "learning_rate": 1.4708305742521145e-05, + "loss": 1.495, + "step": 126410 + }, + { + "epoch": 0.7945789876848742, + "grad_norm": 5.982561111450195, + "learning_rate": 1.4707886641576492e-05, + "loss": 1.4115, + "step": 126420 + }, + { + "epoch": 0.7946418400015713, + 
"grad_norm": 6.265834331512451, + "learning_rate": 1.4707467540631837e-05, + "loss": 1.7257, + "step": 126430 + }, + { + "epoch": 0.7947046923182685, + "grad_norm": 6.405435562133789, + "learning_rate": 1.4707048439687184e-05, + "loss": 1.4261, + "step": 126440 + }, + { + "epoch": 0.7947675446349656, + "grad_norm": 5.93803596496582, + "learning_rate": 1.4706629338742531e-05, + "loss": 1.471, + "step": 126450 + }, + { + "epoch": 0.7948303969516627, + "grad_norm": 6.864293098449707, + "learning_rate": 1.4706210237797878e-05, + "loss": 1.5623, + "step": 126460 + }, + { + "epoch": 0.7948932492683598, + "grad_norm": 7.847011566162109, + "learning_rate": 1.4705791136853226e-05, + "loss": 1.6047, + "step": 126470 + }, + { + "epoch": 0.7949561015850568, + "grad_norm": 7.684126853942871, + "learning_rate": 1.470537203590857e-05, + "loss": 1.809, + "step": 126480 + }, + { + "epoch": 0.7950189539017539, + "grad_norm": 6.885912895202637, + "learning_rate": 1.4704952934963916e-05, + "loss": 1.6312, + "step": 126490 + }, + { + "epoch": 0.795081806218451, + "grad_norm": 6.973878860473633, + "learning_rate": 1.4704533834019263e-05, + "loss": 1.6712, + "step": 126500 + }, + { + "epoch": 0.7951446585351482, + "grad_norm": 5.735217094421387, + "learning_rate": 1.470411473307461e-05, + "loss": 1.7193, + "step": 126510 + }, + { + "epoch": 0.7952075108518453, + "grad_norm": 6.294557094573975, + "learning_rate": 1.4703695632129956e-05, + "loss": 1.6743, + "step": 126520 + }, + { + "epoch": 0.7952703631685424, + "grad_norm": 7.289360046386719, + "learning_rate": 1.4703276531185303e-05, + "loss": 1.5616, + "step": 126530 + }, + { + "epoch": 0.7953332154852395, + "grad_norm": 6.456623077392578, + "learning_rate": 1.4702857430240648e-05, + "loss": 1.642, + "step": 126540 + }, + { + "epoch": 0.7953960678019366, + "grad_norm": 7.75649881362915, + "learning_rate": 1.4702438329295995e-05, + "loss": 1.523, + "step": 126550 + }, + { + "epoch": 0.7954589201186337, + "grad_norm": 7.781533241271973, 
+ "learning_rate": 1.4702019228351342e-05, + "loss": 1.7725, + "step": 126560 + }, + { + "epoch": 0.7955217724353308, + "grad_norm": 6.417851448059082, + "learning_rate": 1.4701600127406688e-05, + "loss": 1.5826, + "step": 126570 + }, + { + "epoch": 0.795584624752028, + "grad_norm": 7.097602844238281, + "learning_rate": 1.4701181026462035e-05, + "loss": 1.6525, + "step": 126580 + }, + { + "epoch": 0.7956474770687251, + "grad_norm": 6.180857181549072, + "learning_rate": 1.4700761925517382e-05, + "loss": 1.4011, + "step": 126590 + }, + { + "epoch": 0.7957103293854222, + "grad_norm": 5.929043292999268, + "learning_rate": 1.4700342824572729e-05, + "loss": 1.5811, + "step": 126600 + }, + { + "epoch": 0.7957731817021193, + "grad_norm": 6.822539806365967, + "learning_rate": 1.4699923723628073e-05, + "loss": 1.6045, + "step": 126610 + }, + { + "epoch": 0.7958360340188164, + "grad_norm": 6.456976413726807, + "learning_rate": 1.469950462268342e-05, + "loss": 1.5757, + "step": 126620 + }, + { + "epoch": 0.7958988863355135, + "grad_norm": 5.858137130737305, + "learning_rate": 1.4699085521738767e-05, + "loss": 1.4446, + "step": 126630 + }, + { + "epoch": 0.7959617386522106, + "grad_norm": 6.887723445892334, + "learning_rate": 1.4698666420794114e-05, + "loss": 1.5294, + "step": 126640 + }, + { + "epoch": 0.7960245909689078, + "grad_norm": 6.020218372344971, + "learning_rate": 1.469824731984946e-05, + "loss": 1.5737, + "step": 126650 + }, + { + "epoch": 0.7960874432856049, + "grad_norm": 6.273682117462158, + "learning_rate": 1.4697828218904806e-05, + "loss": 1.595, + "step": 126660 + }, + { + "epoch": 0.796150295602302, + "grad_norm": 5.848662376403809, + "learning_rate": 1.4697409117960153e-05, + "loss": 1.4215, + "step": 126670 + }, + { + "epoch": 0.7962131479189991, + "grad_norm": 6.93732213973999, + "learning_rate": 1.46969900170155e-05, + "loss": 1.4694, + "step": 126680 + }, + { + "epoch": 0.7962760002356962, + "grad_norm": 5.907258033752441, + "learning_rate": 
1.4696570916070848e-05, + "loss": 1.4548, + "step": 126690 + }, + { + "epoch": 0.7963388525523933, + "grad_norm": 6.12294864654541, + "learning_rate": 1.4696151815126191e-05, + "loss": 1.5239, + "step": 126700 + }, + { + "epoch": 0.7964017048690905, + "grad_norm": 5.543499946594238, + "learning_rate": 1.4695732714181538e-05, + "loss": 1.399, + "step": 126710 + }, + { + "epoch": 0.7964645571857876, + "grad_norm": 5.761287212371826, + "learning_rate": 1.4695313613236885e-05, + "loss": 1.5, + "step": 126720 + }, + { + "epoch": 0.7965274095024847, + "grad_norm": 6.135310173034668, + "learning_rate": 1.4694894512292232e-05, + "loss": 1.7174, + "step": 126730 + }, + { + "epoch": 0.7965902618191817, + "grad_norm": 6.743239879608154, + "learning_rate": 1.4694475411347578e-05, + "loss": 1.5867, + "step": 126740 + }, + { + "epoch": 0.7966531141358788, + "grad_norm": 6.6734466552734375, + "learning_rate": 1.4694056310402925e-05, + "loss": 1.7619, + "step": 126750 + }, + { + "epoch": 0.7967159664525759, + "grad_norm": 6.193628787994385, + "learning_rate": 1.4693637209458272e-05, + "loss": 1.6588, + "step": 126760 + }, + { + "epoch": 0.796778818769273, + "grad_norm": 5.738262176513672, + "learning_rate": 1.4693218108513619e-05, + "loss": 1.3049, + "step": 126770 + }, + { + "epoch": 0.7968416710859701, + "grad_norm": 5.9922003746032715, + "learning_rate": 1.4692799007568966e-05, + "loss": 1.5419, + "step": 126780 + }, + { + "epoch": 0.7969045234026673, + "grad_norm": 6.955031871795654, + "learning_rate": 1.469237990662431e-05, + "loss": 1.661, + "step": 126790 + }, + { + "epoch": 0.7969673757193644, + "grad_norm": 4.9513421058654785, + "learning_rate": 1.4691960805679657e-05, + "loss": 1.5246, + "step": 126800 + }, + { + "epoch": 0.7970302280360615, + "grad_norm": 7.062002182006836, + "learning_rate": 1.4691541704735004e-05, + "loss": 1.5576, + "step": 126810 + }, + { + "epoch": 0.7970930803527586, + "grad_norm": 5.826161861419678, + "learning_rate": 1.4691122603790351e-05, + 
"loss": 1.5153, + "step": 126820 + }, + { + "epoch": 0.7971559326694557, + "grad_norm": 6.426275730133057, + "learning_rate": 1.4690703502845696e-05, + "loss": 1.4776, + "step": 126830 + }, + { + "epoch": 0.7972187849861528, + "grad_norm": 6.657223224639893, + "learning_rate": 1.4690284401901043e-05, + "loss": 1.704, + "step": 126840 + }, + { + "epoch": 0.79728163730285, + "grad_norm": 7.099383354187012, + "learning_rate": 1.4689865300956389e-05, + "loss": 1.7583, + "step": 126850 + }, + { + "epoch": 0.7973444896195471, + "grad_norm": 8.146724700927734, + "learning_rate": 1.4689446200011736e-05, + "loss": 1.7484, + "step": 126860 + }, + { + "epoch": 0.7974073419362442, + "grad_norm": 6.495960235595703, + "learning_rate": 1.4689027099067083e-05, + "loss": 1.4996, + "step": 126870 + }, + { + "epoch": 0.7974701942529413, + "grad_norm": 6.567388534545898, + "learning_rate": 1.4688607998122428e-05, + "loss": 1.6315, + "step": 126880 + }, + { + "epoch": 0.7975330465696384, + "grad_norm": 5.827784538269043, + "learning_rate": 1.4688188897177775e-05, + "loss": 1.6036, + "step": 126890 + }, + { + "epoch": 0.7975958988863355, + "grad_norm": 7.262020587921143, + "learning_rate": 1.4687769796233122e-05, + "loss": 1.5515, + "step": 126900 + }, + { + "epoch": 0.7976587512030326, + "grad_norm": 6.969066619873047, + "learning_rate": 1.468735069528847e-05, + "loss": 1.4091, + "step": 126910 + }, + { + "epoch": 0.7977216035197298, + "grad_norm": 8.006285667419434, + "learning_rate": 1.468697350443828e-05, + "loss": 1.5577, + "step": 126920 + }, + { + "epoch": 0.7977844558364269, + "grad_norm": 6.737123966217041, + "learning_rate": 1.4686554403493626e-05, + "loss": 1.6531, + "step": 126930 + }, + { + "epoch": 0.797847308153124, + "grad_norm": 6.720739364624023, + "learning_rate": 1.4686135302548973e-05, + "loss": 1.6536, + "step": 126940 + }, + { + "epoch": 0.7979101604698211, + "grad_norm": 6.321871757507324, + "learning_rate": 1.4685716201604319e-05, + "loss": 1.6377, + "step": 
126950 + }, + { + "epoch": 0.7979730127865182, + "grad_norm": 5.901580810546875, + "learning_rate": 1.4685297100659666e-05, + "loss": 1.4752, + "step": 126960 + }, + { + "epoch": 0.7980358651032153, + "grad_norm": 7.815974712371826, + "learning_rate": 1.4684877999715013e-05, + "loss": 1.6385, + "step": 126970 + }, + { + "epoch": 0.7980987174199125, + "grad_norm": 7.375422954559326, + "learning_rate": 1.468445889877036e-05, + "loss": 1.3614, + "step": 126980 + }, + { + "epoch": 0.7981615697366095, + "grad_norm": 7.201641082763672, + "learning_rate": 1.4684039797825707e-05, + "loss": 1.5465, + "step": 126990 + }, + { + "epoch": 0.7982244220533066, + "grad_norm": 6.602486610412598, + "learning_rate": 1.468362069688105e-05, + "loss": 1.5143, + "step": 127000 + }, + { + "epoch": 0.7982872743700037, + "grad_norm": 6.114508152008057, + "learning_rate": 1.4683201595936398e-05, + "loss": 1.7175, + "step": 127010 + }, + { + "epoch": 0.7983501266867008, + "grad_norm": 5.410009384155273, + "learning_rate": 1.4682782494991745e-05, + "loss": 1.4378, + "step": 127020 + }, + { + "epoch": 0.7984129790033979, + "grad_norm": 7.354188919067383, + "learning_rate": 1.4682363394047092e-05, + "loss": 1.5651, + "step": 127030 + }, + { + "epoch": 0.798475831320095, + "grad_norm": 7.028811931610107, + "learning_rate": 1.4681944293102437e-05, + "loss": 1.6209, + "step": 127040 + }, + { + "epoch": 0.7985386836367921, + "grad_norm": 5.333646774291992, + "learning_rate": 1.4681525192157784e-05, + "loss": 1.6728, + "step": 127050 + }, + { + "epoch": 0.7986015359534893, + "grad_norm": 6.855044364929199, + "learning_rate": 1.4681106091213131e-05, + "loss": 1.5802, + "step": 127060 + }, + { + "epoch": 0.7986643882701864, + "grad_norm": 4.967195987701416, + "learning_rate": 1.4680686990268478e-05, + "loss": 1.6561, + "step": 127070 + }, + { + "epoch": 0.7987272405868835, + "grad_norm": 6.009040832519531, + "learning_rate": 1.4680267889323822e-05, + "loss": 1.4554, + "step": 127080 + }, + { + "epoch": 
0.7987900929035806, + "grad_norm": 7.387078285217285, + "learning_rate": 1.467984878837917e-05, + "loss": 1.5842, + "step": 127090 + }, + { + "epoch": 0.7988529452202777, + "grad_norm": 7.210890769958496, + "learning_rate": 1.4679429687434516e-05, + "loss": 1.7337, + "step": 127100 + }, + { + "epoch": 0.7989157975369748, + "grad_norm": 5.606622695922852, + "learning_rate": 1.4679010586489863e-05, + "loss": 1.4004, + "step": 127110 + }, + { + "epoch": 0.798978649853672, + "grad_norm": 6.040121555328369, + "learning_rate": 1.467859148554521e-05, + "loss": 1.7046, + "step": 127120 + }, + { + "epoch": 0.7990415021703691, + "grad_norm": 6.001274585723877, + "learning_rate": 1.4678172384600556e-05, + "loss": 1.6245, + "step": 127130 + }, + { + "epoch": 0.7991043544870662, + "grad_norm": 6.918370246887207, + "learning_rate": 1.4677753283655903e-05, + "loss": 1.6022, + "step": 127140 + }, + { + "epoch": 0.7991672068037633, + "grad_norm": 6.731177806854248, + "learning_rate": 1.467733418271125e-05, + "loss": 1.8094, + "step": 127150 + }, + { + "epoch": 0.7992300591204604, + "grad_norm": 6.004929065704346, + "learning_rate": 1.4676915081766597e-05, + "loss": 1.742, + "step": 127160 + }, + { + "epoch": 0.7992929114371575, + "grad_norm": 6.8932976722717285, + "learning_rate": 1.467649598082194e-05, + "loss": 1.5241, + "step": 127170 + }, + { + "epoch": 0.7993557637538546, + "grad_norm": 5.122315406799316, + "learning_rate": 1.4676076879877288e-05, + "loss": 1.5932, + "step": 127180 + }, + { + "epoch": 0.7994186160705518, + "grad_norm": 5.804356098175049, + "learning_rate": 1.4675657778932635e-05, + "loss": 1.564, + "step": 127190 + }, + { + "epoch": 0.7994814683872489, + "grad_norm": 5.46740198135376, + "learning_rate": 1.4675238677987982e-05, + "loss": 1.5938, + "step": 127200 + }, + { + "epoch": 0.799544320703946, + "grad_norm": 7.471740245819092, + "learning_rate": 1.4674819577043329e-05, + "loss": 1.5785, + "step": 127210 + }, + { + "epoch": 0.7996071730206431, + 
"grad_norm": 5.829692840576172, + "learning_rate": 1.4674400476098674e-05, + "loss": 1.613, + "step": 127220 + }, + { + "epoch": 0.7996700253373402, + "grad_norm": 7.80814790725708, + "learning_rate": 1.467398137515402e-05, + "loss": 1.6094, + "step": 127230 + }, + { + "epoch": 0.7997328776540373, + "grad_norm": 6.667564868927002, + "learning_rate": 1.4673562274209367e-05, + "loss": 1.7065, + "step": 127240 + }, + { + "epoch": 0.7997957299707343, + "grad_norm": 6.570542812347412, + "learning_rate": 1.4673143173264714e-05, + "loss": 1.7314, + "step": 127250 + }, + { + "epoch": 0.7998585822874315, + "grad_norm": 7.139489650726318, + "learning_rate": 1.467272407232006e-05, + "loss": 1.6567, + "step": 127260 + }, + { + "epoch": 0.7999214346041286, + "grad_norm": 6.807154655456543, + "learning_rate": 1.4672304971375406e-05, + "loss": 1.6603, + "step": 127270 + }, + { + "epoch": 0.7999842869208257, + "grad_norm": 7.466335296630859, + "learning_rate": 1.4671885870430753e-05, + "loss": 1.5509, + "step": 127280 + }, + { + "epoch": 0.8000471392375228, + "grad_norm": 5.848179340362549, + "learning_rate": 1.46714667694861e-05, + "loss": 1.5739, + "step": 127290 + }, + { + "epoch": 0.8001099915542199, + "grad_norm": 5.600815296173096, + "learning_rate": 1.4671047668541448e-05, + "loss": 1.5581, + "step": 127300 + }, + { + "epoch": 0.800172843870917, + "grad_norm": 4.473421096801758, + "learning_rate": 1.4670628567596791e-05, + "loss": 1.2654, + "step": 127310 + }, + { + "epoch": 0.8002356961876141, + "grad_norm": 5.2405829429626465, + "learning_rate": 1.4670209466652138e-05, + "loss": 1.5498, + "step": 127320 + }, + { + "epoch": 0.8002985485043113, + "grad_norm": 5.992538928985596, + "learning_rate": 1.4669790365707485e-05, + "loss": 1.5347, + "step": 127330 + }, + { + "epoch": 0.8003614008210084, + "grad_norm": 7.98852014541626, + "learning_rate": 1.4669371264762832e-05, + "loss": 1.8067, + "step": 127340 + }, + { + "epoch": 0.8004242531377055, + "grad_norm": 
6.850035190582275, + "learning_rate": 1.4668952163818178e-05, + "loss": 1.8199, + "step": 127350 + }, + { + "epoch": 0.8004871054544026, + "grad_norm": 5.654129981994629, + "learning_rate": 1.4668533062873525e-05, + "loss": 1.4623, + "step": 127360 + }, + { + "epoch": 0.8005499577710997, + "grad_norm": 6.825001239776611, + "learning_rate": 1.4668113961928872e-05, + "loss": 1.5293, + "step": 127370 + }, + { + "epoch": 0.8006128100877968, + "grad_norm": 7.440295219421387, + "learning_rate": 1.4667694860984219e-05, + "loss": 1.4749, + "step": 127380 + }, + { + "epoch": 0.800675662404494, + "grad_norm": 5.969831466674805, + "learning_rate": 1.4667275760039566e-05, + "loss": 1.5684, + "step": 127390 + }, + { + "epoch": 0.8007385147211911, + "grad_norm": 6.326897621154785, + "learning_rate": 1.466685665909491e-05, + "loss": 1.5869, + "step": 127400 + }, + { + "epoch": 0.8008013670378882, + "grad_norm": 6.353309154510498, + "learning_rate": 1.4666437558150257e-05, + "loss": 1.6948, + "step": 127410 + }, + { + "epoch": 0.8008642193545853, + "grad_norm": 6.800481796264648, + "learning_rate": 1.4666018457205604e-05, + "loss": 1.7255, + "step": 127420 + }, + { + "epoch": 0.8009270716712824, + "grad_norm": 7.093092918395996, + "learning_rate": 1.4665599356260951e-05, + "loss": 1.8261, + "step": 127430 + }, + { + "epoch": 0.8009899239879795, + "grad_norm": 6.4873881340026855, + "learning_rate": 1.4665180255316296e-05, + "loss": 1.7308, + "step": 127440 + }, + { + "epoch": 0.8010527763046766, + "grad_norm": 5.64680814743042, + "learning_rate": 1.4664761154371643e-05, + "loss": 1.4815, + "step": 127450 + }, + { + "epoch": 0.8011156286213738, + "grad_norm": 7.599221706390381, + "learning_rate": 1.466434205342699e-05, + "loss": 1.4035, + "step": 127460 + }, + { + "epoch": 0.8011784809380709, + "grad_norm": 6.603479862213135, + "learning_rate": 1.4663922952482338e-05, + "loss": 1.5161, + "step": 127470 + }, + { + "epoch": 0.801241333254768, + "grad_norm": 9.680336952209473, + 
"learning_rate": 1.4663503851537681e-05, + "loss": 1.5093, + "step": 127480 + }, + { + "epoch": 0.8013041855714651, + "grad_norm": 7.357080936431885, + "learning_rate": 1.4663084750593028e-05, + "loss": 1.4461, + "step": 127490 + }, + { + "epoch": 0.8013670378881621, + "grad_norm": 6.3268914222717285, + "learning_rate": 1.4662665649648375e-05, + "loss": 1.6055, + "step": 127500 + }, + { + "epoch": 0.8014298902048592, + "grad_norm": 5.655613422393799, + "learning_rate": 1.4662246548703722e-05, + "loss": 1.3888, + "step": 127510 + }, + { + "epoch": 0.8014927425215563, + "grad_norm": 6.709163665771484, + "learning_rate": 1.466182744775907e-05, + "loss": 1.522, + "step": 127520 + }, + { + "epoch": 0.8015555948382534, + "grad_norm": 5.913792610168457, + "learning_rate": 1.4661408346814415e-05, + "loss": 1.4601, + "step": 127530 + }, + { + "epoch": 0.8016184471549506, + "grad_norm": 6.938288688659668, + "learning_rate": 1.4660989245869762e-05, + "loss": 1.4908, + "step": 127540 + }, + { + "epoch": 0.8016812994716477, + "grad_norm": 5.863032341003418, + "learning_rate": 1.4660570144925107e-05, + "loss": 1.4513, + "step": 127550 + }, + { + "epoch": 0.8017441517883448, + "grad_norm": 6.508087635040283, + "learning_rate": 1.4660151043980454e-05, + "loss": 1.6434, + "step": 127560 + }, + { + "epoch": 0.8018070041050419, + "grad_norm": 7.035425662994385, + "learning_rate": 1.46597319430358e-05, + "loss": 1.7906, + "step": 127570 + }, + { + "epoch": 0.801869856421739, + "grad_norm": 5.500781536102295, + "learning_rate": 1.4659312842091147e-05, + "loss": 1.7829, + "step": 127580 + }, + { + "epoch": 0.8019327087384361, + "grad_norm": 6.251988887786865, + "learning_rate": 1.4658893741146494e-05, + "loss": 1.6834, + "step": 127590 + }, + { + "epoch": 0.8019955610551333, + "grad_norm": 6.672684192657471, + "learning_rate": 1.4658474640201841e-05, + "loss": 1.8085, + "step": 127600 + }, + { + "epoch": 0.8020584133718304, + "grad_norm": 7.56199836730957, + "learning_rate": 
1.4658055539257188e-05, + "loss": 1.7226, + "step": 127610 + }, + { + "epoch": 0.8021212656885275, + "grad_norm": 5.457205772399902, + "learning_rate": 1.4657636438312532e-05, + "loss": 1.7512, + "step": 127620 + }, + { + "epoch": 0.8021841180052246, + "grad_norm": 6.948217868804932, + "learning_rate": 1.4657217337367879e-05, + "loss": 1.4926, + "step": 127630 + }, + { + "epoch": 0.8022469703219217, + "grad_norm": 5.721208095550537, + "learning_rate": 1.4656798236423226e-05, + "loss": 1.5069, + "step": 127640 + }, + { + "epoch": 0.8023098226386188, + "grad_norm": 6.864531517028809, + "learning_rate": 1.4656379135478573e-05, + "loss": 1.5754, + "step": 127650 + }, + { + "epoch": 0.802372674955316, + "grad_norm": 5.888504505157471, + "learning_rate": 1.4655960034533918e-05, + "loss": 1.5975, + "step": 127660 + }, + { + "epoch": 0.8024355272720131, + "grad_norm": 6.1465888023376465, + "learning_rate": 1.4655540933589265e-05, + "loss": 1.5006, + "step": 127670 + }, + { + "epoch": 0.8024983795887102, + "grad_norm": 5.462531089782715, + "learning_rate": 1.4655121832644612e-05, + "loss": 1.6284, + "step": 127680 + }, + { + "epoch": 0.8025612319054073, + "grad_norm": 5.80675745010376, + "learning_rate": 1.465470273169996e-05, + "loss": 1.4707, + "step": 127690 + }, + { + "epoch": 0.8026240842221044, + "grad_norm": 5.968596935272217, + "learning_rate": 1.4654283630755307e-05, + "loss": 1.6834, + "step": 127700 + }, + { + "epoch": 0.8026869365388015, + "grad_norm": 6.94851541519165, + "learning_rate": 1.465386452981065e-05, + "loss": 1.7579, + "step": 127710 + }, + { + "epoch": 0.8027497888554986, + "grad_norm": 6.325634002685547, + "learning_rate": 1.4653445428865997e-05, + "loss": 1.6801, + "step": 127720 + }, + { + "epoch": 0.8028126411721958, + "grad_norm": 7.044205665588379, + "learning_rate": 1.4653026327921344e-05, + "loss": 1.6491, + "step": 127730 + }, + { + "epoch": 0.8028754934888929, + "grad_norm": 6.051846504211426, + "learning_rate": 1.4652607226976692e-05, + 
"loss": 1.3398, + "step": 127740 + }, + { + "epoch": 0.80293834580559, + "grad_norm": 7.053153038024902, + "learning_rate": 1.4652188126032037e-05, + "loss": 1.6287, + "step": 127750 + }, + { + "epoch": 0.803001198122287, + "grad_norm": 6.5534234046936035, + "learning_rate": 1.4651769025087384e-05, + "loss": 1.4482, + "step": 127760 + }, + { + "epoch": 0.8030640504389841, + "grad_norm": 5.681098461151123, + "learning_rate": 1.4651349924142731e-05, + "loss": 1.4801, + "step": 127770 + }, + { + "epoch": 0.8031269027556812, + "grad_norm": 5.653262615203857, + "learning_rate": 1.4650930823198078e-05, + "loss": 1.4962, + "step": 127780 + }, + { + "epoch": 0.8031897550723783, + "grad_norm": 6.983116626739502, + "learning_rate": 1.4650511722253422e-05, + "loss": 1.6625, + "step": 127790 + }, + { + "epoch": 0.8032526073890754, + "grad_norm": 6.244621276855469, + "learning_rate": 1.4650092621308769e-05, + "loss": 1.7325, + "step": 127800 + }, + { + "epoch": 0.8033154597057726, + "grad_norm": 7.330923080444336, + "learning_rate": 1.4649673520364116e-05, + "loss": 1.4747, + "step": 127810 + }, + { + "epoch": 0.8033783120224697, + "grad_norm": 6.687651634216309, + "learning_rate": 1.4649254419419463e-05, + "loss": 1.593, + "step": 127820 + }, + { + "epoch": 0.8034411643391668, + "grad_norm": 5.817078590393066, + "learning_rate": 1.464883531847481e-05, + "loss": 1.491, + "step": 127830 + }, + { + "epoch": 0.8035040166558639, + "grad_norm": 6.812237739562988, + "learning_rate": 1.4648416217530155e-05, + "loss": 1.8285, + "step": 127840 + }, + { + "epoch": 0.803566868972561, + "grad_norm": 6.4954423904418945, + "learning_rate": 1.4647997116585503e-05, + "loss": 1.603, + "step": 127850 + }, + { + "epoch": 0.8036297212892581, + "grad_norm": 6.715160369873047, + "learning_rate": 1.4647578015640848e-05, + "loss": 1.6452, + "step": 127860 + }, + { + "epoch": 0.8036925736059553, + "grad_norm": 7.735368251800537, + "learning_rate": 1.4647158914696195e-05, + "loss": 1.8229, + "step": 
127870 + }, + { + "epoch": 0.8037554259226524, + "grad_norm": 6.043717384338379, + "learning_rate": 1.464673981375154e-05, + "loss": 1.5291, + "step": 127880 + }, + { + "epoch": 0.8038182782393495, + "grad_norm": 7.275433540344238, + "learning_rate": 1.4646320712806887e-05, + "loss": 1.5868, + "step": 127890 + }, + { + "epoch": 0.8038811305560466, + "grad_norm": 6.811432838439941, + "learning_rate": 1.4645901611862234e-05, + "loss": 1.6473, + "step": 127900 + }, + { + "epoch": 0.8039439828727437, + "grad_norm": 7.094412803649902, + "learning_rate": 1.4645482510917582e-05, + "loss": 1.5549, + "step": 127910 + }, + { + "epoch": 0.8040068351894408, + "grad_norm": 7.31378173828125, + "learning_rate": 1.4645063409972929e-05, + "loss": 1.5162, + "step": 127920 + }, + { + "epoch": 0.8040696875061379, + "grad_norm": 6.261271953582764, + "learning_rate": 1.4644644309028272e-05, + "loss": 1.6291, + "step": 127930 + }, + { + "epoch": 0.8041325398228351, + "grad_norm": 6.118953704833984, + "learning_rate": 1.464422520808362e-05, + "loss": 1.5994, + "step": 127940 + }, + { + "epoch": 0.8041953921395322, + "grad_norm": 5.418634414672852, + "learning_rate": 1.4643806107138966e-05, + "loss": 1.589, + "step": 127950 + }, + { + "epoch": 0.8042582444562293, + "grad_norm": 6.648963451385498, + "learning_rate": 1.4643387006194314e-05, + "loss": 1.6008, + "step": 127960 + }, + { + "epoch": 0.8043210967729264, + "grad_norm": 7.5236382484436035, + "learning_rate": 1.4642967905249659e-05, + "loss": 1.6685, + "step": 127970 + }, + { + "epoch": 0.8043839490896235, + "grad_norm": 5.9270477294921875, + "learning_rate": 1.4642548804305006e-05, + "loss": 1.6547, + "step": 127980 + }, + { + "epoch": 0.8044468014063206, + "grad_norm": 7.353875637054443, + "learning_rate": 1.4642129703360353e-05, + "loss": 1.4871, + "step": 127990 + }, + { + "epoch": 0.8045096537230177, + "grad_norm": 7.18826961517334, + "learning_rate": 1.46417106024157e-05, + "loss": 1.8931, + "step": 128000 + }, + { + "epoch": 
0.8045725060397148, + "grad_norm": 5.180515766143799, + "learning_rate": 1.4641291501471047e-05, + "loss": 1.5294, + "step": 128010 + }, + { + "epoch": 0.8046353583564119, + "grad_norm": 5.289478778839111, + "learning_rate": 1.4640872400526391e-05, + "loss": 1.4159, + "step": 128020 + }, + { + "epoch": 0.804698210673109, + "grad_norm": 6.481279373168945, + "learning_rate": 1.4640453299581738e-05, + "loss": 1.9098, + "step": 128030 + }, + { + "epoch": 0.8047610629898061, + "grad_norm": 6.723080635070801, + "learning_rate": 1.4640034198637085e-05, + "loss": 1.5707, + "step": 128040 + }, + { + "epoch": 0.8048239153065032, + "grad_norm": 5.434211254119873, + "learning_rate": 1.4639615097692432e-05, + "loss": 1.3818, + "step": 128050 + }, + { + "epoch": 0.8048867676232003, + "grad_norm": 7.487259387969971, + "learning_rate": 1.4639195996747777e-05, + "loss": 1.3277, + "step": 128060 + }, + { + "epoch": 0.8049496199398974, + "grad_norm": 6.7534942626953125, + "learning_rate": 1.4638776895803125e-05, + "loss": 1.6875, + "step": 128070 + }, + { + "epoch": 0.8050124722565946, + "grad_norm": 6.124451160430908, + "learning_rate": 1.4638357794858472e-05, + "loss": 1.6727, + "step": 128080 + }, + { + "epoch": 0.8050753245732917, + "grad_norm": 6.262798309326172, + "learning_rate": 1.4637938693913819e-05, + "loss": 1.5039, + "step": 128090 + }, + { + "epoch": 0.8051381768899888, + "grad_norm": 6.658420562744141, + "learning_rate": 1.4637519592969162e-05, + "loss": 1.5885, + "step": 128100 + }, + { + "epoch": 0.8052010292066859, + "grad_norm": 7.250082015991211, + "learning_rate": 1.463710049202451e-05, + "loss": 1.4267, + "step": 128110 + }, + { + "epoch": 0.805263881523383, + "grad_norm": 6.2989420890808105, + "learning_rate": 1.4636681391079856e-05, + "loss": 1.4532, + "step": 128120 + }, + { + "epoch": 0.8053267338400801, + "grad_norm": 7.128006935119629, + "learning_rate": 1.4636262290135204e-05, + "loss": 1.7059, + "step": 128130 + }, + { + "epoch": 0.8053895861567772, + 
"grad_norm": 6.768283367156982, + "learning_rate": 1.463584318919055e-05, + "loss": 1.7792, + "step": 128140 + }, + { + "epoch": 0.8054524384734744, + "grad_norm": 7.063636302947998, + "learning_rate": 1.4635424088245896e-05, + "loss": 1.4833, + "step": 128150 + }, + { + "epoch": 0.8055152907901715, + "grad_norm": 4.943026542663574, + "learning_rate": 1.4635004987301243e-05, + "loss": 1.4822, + "step": 128160 + }, + { + "epoch": 0.8055781431068686, + "grad_norm": 6.337063312530518, + "learning_rate": 1.4634585886356588e-05, + "loss": 1.4695, + "step": 128170 + }, + { + "epoch": 0.8056409954235657, + "grad_norm": 6.414618492126465, + "learning_rate": 1.4634166785411936e-05, + "loss": 1.5663, + "step": 128180 + }, + { + "epoch": 0.8057038477402628, + "grad_norm": 6.596250534057617, + "learning_rate": 1.4633747684467281e-05, + "loss": 1.6292, + "step": 128190 + }, + { + "epoch": 0.8057667000569599, + "grad_norm": 7.159796237945557, + "learning_rate": 1.4633328583522628e-05, + "loss": 1.6192, + "step": 128200 + }, + { + "epoch": 0.805829552373657, + "grad_norm": 5.642171382904053, + "learning_rate": 1.4632909482577975e-05, + "loss": 1.3166, + "step": 128210 + }, + { + "epoch": 0.8058924046903542, + "grad_norm": 6.385321140289307, + "learning_rate": 1.4632490381633322e-05, + "loss": 1.5261, + "step": 128220 + }, + { + "epoch": 0.8059552570070513, + "grad_norm": 7.153017997741699, + "learning_rate": 1.463207128068867e-05, + "loss": 1.5968, + "step": 128230 + }, + { + "epoch": 0.8060181093237484, + "grad_norm": 4.993671894073486, + "learning_rate": 1.4631652179744013e-05, + "loss": 1.6347, + "step": 128240 + }, + { + "epoch": 0.8060809616404455, + "grad_norm": 6.2314982414245605, + "learning_rate": 1.463123307879936e-05, + "loss": 1.4873, + "step": 128250 + }, + { + "epoch": 0.8061438139571426, + "grad_norm": 5.749682903289795, + "learning_rate": 1.4630813977854707e-05, + "loss": 1.2734, + "step": 128260 + }, + { + "epoch": 0.8062066662738396, + "grad_norm": 
6.511654376983643, + "learning_rate": 1.4630394876910054e-05, + "loss": 1.7047, + "step": 128270 + }, + { + "epoch": 0.8062695185905367, + "grad_norm": 6.978699684143066, + "learning_rate": 1.46299757759654e-05, + "loss": 1.6343, + "step": 128280 + }, + { + "epoch": 0.8063323709072339, + "grad_norm": 6.761673450469971, + "learning_rate": 1.4629556675020747e-05, + "loss": 1.5412, + "step": 128290 + }, + { + "epoch": 0.806395223223931, + "grad_norm": 6.2292914390563965, + "learning_rate": 1.4629137574076094e-05, + "loss": 1.7083, + "step": 128300 + }, + { + "epoch": 0.8064580755406281, + "grad_norm": 6.495267868041992, + "learning_rate": 1.462871847313144e-05, + "loss": 1.5973, + "step": 128310 + }, + { + "epoch": 0.8065209278573252, + "grad_norm": 6.518203258514404, + "learning_rate": 1.4628299372186788e-05, + "loss": 1.5758, + "step": 128320 + }, + { + "epoch": 0.8065837801740223, + "grad_norm": 5.8363566398620605, + "learning_rate": 1.4627880271242131e-05, + "loss": 1.2377, + "step": 128330 + }, + { + "epoch": 0.8066466324907194, + "grad_norm": 7.506755352020264, + "learning_rate": 1.4627461170297478e-05, + "loss": 1.4138, + "step": 128340 + }, + { + "epoch": 0.8067094848074166, + "grad_norm": 6.106832027435303, + "learning_rate": 1.4627042069352826e-05, + "loss": 1.6611, + "step": 128350 + }, + { + "epoch": 0.8067723371241137, + "grad_norm": 5.887393474578857, + "learning_rate": 1.4626622968408173e-05, + "loss": 1.6214, + "step": 128360 + }, + { + "epoch": 0.8068351894408108, + "grad_norm": 5.715047836303711, + "learning_rate": 1.4626203867463518e-05, + "loss": 1.7647, + "step": 128370 + }, + { + "epoch": 0.8068980417575079, + "grad_norm": 4.8202080726623535, + "learning_rate": 1.4625784766518865e-05, + "loss": 1.6554, + "step": 128380 + }, + { + "epoch": 0.806960894074205, + "grad_norm": 6.009983539581299, + "learning_rate": 1.4625365665574212e-05, + "loss": 1.4481, + "step": 128390 + }, + { + "epoch": 0.8070237463909021, + "grad_norm": 6.740313529968262, + 
"learning_rate": 1.462494656462956e-05, + "loss": 1.6375, + "step": 128400 + }, + { + "epoch": 0.8070865987075992, + "grad_norm": 6.739039421081543, + "learning_rate": 1.4624527463684903e-05, + "loss": 1.4724, + "step": 128410 + }, + { + "epoch": 0.8071494510242964, + "grad_norm": 6.603867053985596, + "learning_rate": 1.462410836274025e-05, + "loss": 1.462, + "step": 128420 + }, + { + "epoch": 0.8072123033409935, + "grad_norm": 6.111894130706787, + "learning_rate": 1.4623689261795597e-05, + "loss": 1.3688, + "step": 128430 + }, + { + "epoch": 0.8072751556576906, + "grad_norm": 5.348015308380127, + "learning_rate": 1.4623270160850944e-05, + "loss": 1.4991, + "step": 128440 + }, + { + "epoch": 0.8073380079743877, + "grad_norm": 5.96295690536499, + "learning_rate": 1.4622851059906291e-05, + "loss": 1.5457, + "step": 128450 + }, + { + "epoch": 0.8074008602910848, + "grad_norm": 5.46589469909668, + "learning_rate": 1.4622431958961637e-05, + "loss": 1.4834, + "step": 128460 + }, + { + "epoch": 0.8074637126077819, + "grad_norm": 11.722580909729004, + "learning_rate": 1.4622012858016984e-05, + "loss": 1.7034, + "step": 128470 + }, + { + "epoch": 0.807526564924479, + "grad_norm": 6.253749370574951, + "learning_rate": 1.462159375707233e-05, + "loss": 1.4491, + "step": 128480 + }, + { + "epoch": 0.8075894172411762, + "grad_norm": 5.999065399169922, + "learning_rate": 1.4621174656127676e-05, + "loss": 1.6275, + "step": 128490 + }, + { + "epoch": 0.8076522695578733, + "grad_norm": 6.462035179138184, + "learning_rate": 1.4620755555183021e-05, + "loss": 1.5508, + "step": 128500 + }, + { + "epoch": 0.8077151218745704, + "grad_norm": 6.563819408416748, + "learning_rate": 1.4620336454238369e-05, + "loss": 1.7511, + "step": 128510 + }, + { + "epoch": 0.8077779741912675, + "grad_norm": 5.843883037567139, + "learning_rate": 1.4619917353293716e-05, + "loss": 1.9093, + "step": 128520 + }, + { + "epoch": 0.8078408265079645, + "grad_norm": 6.107423305511475, + "learning_rate": 
1.4619498252349063e-05, + "loss": 1.6832, + "step": 128530 + }, + { + "epoch": 0.8079036788246616, + "grad_norm": 5.802535057067871, + "learning_rate": 1.461907915140441e-05, + "loss": 1.5708, + "step": 128540 + }, + { + "epoch": 0.8079665311413587, + "grad_norm": 7.047648906707764, + "learning_rate": 1.4618660050459753e-05, + "loss": 1.6832, + "step": 128550 + }, + { + "epoch": 0.8080293834580559, + "grad_norm": 6.248017311096191, + "learning_rate": 1.46182409495151e-05, + "loss": 1.3829, + "step": 128560 + }, + { + "epoch": 0.808092235774753, + "grad_norm": 5.463923454284668, + "learning_rate": 1.4617821848570448e-05, + "loss": 1.6062, + "step": 128570 + }, + { + "epoch": 0.8081550880914501, + "grad_norm": 6.9560136795043945, + "learning_rate": 1.4617402747625795e-05, + "loss": 1.7184, + "step": 128580 + }, + { + "epoch": 0.8082179404081472, + "grad_norm": 7.091958045959473, + "learning_rate": 1.461698364668114e-05, + "loss": 1.4344, + "step": 128590 + }, + { + "epoch": 0.8082807927248443, + "grad_norm": 6.31084680557251, + "learning_rate": 1.4616564545736487e-05, + "loss": 1.6064, + "step": 128600 + }, + { + "epoch": 0.8083436450415414, + "grad_norm": 6.201137542724609, + "learning_rate": 1.4616145444791834e-05, + "loss": 1.4612, + "step": 128610 + }, + { + "epoch": 0.8084064973582386, + "grad_norm": 3.948305606842041, + "learning_rate": 1.4615726343847181e-05, + "loss": 1.4763, + "step": 128620 + }, + { + "epoch": 0.8084693496749357, + "grad_norm": 6.799100875854492, + "learning_rate": 1.4615307242902528e-05, + "loss": 1.5969, + "step": 128630 + }, + { + "epoch": 0.8085322019916328, + "grad_norm": 5.397566795349121, + "learning_rate": 1.4614888141957872e-05, + "loss": 1.4547, + "step": 128640 + }, + { + "epoch": 0.8085950543083299, + "grad_norm": 6.227335453033447, + "learning_rate": 1.4614469041013219e-05, + "loss": 1.4481, + "step": 128650 + }, + { + "epoch": 0.808657906625027, + "grad_norm": 6.401673793792725, + "learning_rate": 1.4614049940068566e-05, + 
"loss": 1.5091, + "step": 128660 + }, + { + "epoch": 0.8087207589417241, + "grad_norm": 6.572157382965088, + "learning_rate": 1.4613630839123913e-05, + "loss": 1.7148, + "step": 128670 + }, + { + "epoch": 0.8087836112584212, + "grad_norm": 5.99587345123291, + "learning_rate": 1.4613211738179259e-05, + "loss": 1.5073, + "step": 128680 + }, + { + "epoch": 0.8088464635751184, + "grad_norm": 6.44499397277832, + "learning_rate": 1.4612792637234606e-05, + "loss": 1.6425, + "step": 128690 + }, + { + "epoch": 0.8089093158918155, + "grad_norm": 6.990484714508057, + "learning_rate": 1.4612373536289953e-05, + "loss": 1.5177, + "step": 128700 + } + ], + "logging_steps": 10, + "max_steps": 477312, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.892036676517192e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}