{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.16691208545898775, "eval_steps": 500, "global_step": 81500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010240005242882685, "grad_norm": 1.5689221620559692, "learning_rate": 1.47e-05, "loss": 10.437386474609376, "step": 50 }, { "epoch": 0.0002048001048576537, "grad_norm": 1.4314604997634888, "learning_rate": 2.97e-05, "loss": 8.872786865234374, "step": 100 }, { "epoch": 0.00030720015728648054, "grad_norm": 2.1900956630706787, "learning_rate": 4.4699999999999996e-05, "loss": 6.67523681640625, "step": 150 }, { "epoch": 0.0004096002097153074, "grad_norm": 0.7161903381347656, "learning_rate": 5.97e-05, "loss": 4.3745730590820315, "step": 200 }, { "epoch": 0.0005120002621441342, "grad_norm": 1.2864420413970947, "learning_rate": 7.47e-05, "loss": 1.9937155151367187, "step": 250 }, { "epoch": 0.0006144003145729611, "grad_norm": 1.5255779027938843, "learning_rate": 8.969999999999998e-05, "loss": 6.841383056640625, "step": 300 }, { "epoch": 0.0007168003670017879, "grad_norm": 1.0778907537460327, "learning_rate": 0.00010469999999999998, "loss": 6.193285522460937, "step": 350 }, { "epoch": 0.0008192004194306148, "grad_norm": 0.8099711537361145, "learning_rate": 0.0001197, "loss": 6.307914428710937, "step": 400 }, { "epoch": 0.0009216004718594416, "grad_norm": 1.3735090494155884, "learning_rate": 0.0001347, "loss": 5.728865966796875, "step": 450 }, { "epoch": 0.0010240005242882684, "grad_norm": 1.2599254846572876, "learning_rate": 0.00014969999999999998, "loss": 5.8577117919921875, "step": 500 }, { "epoch": 0.0011264005767170954, "grad_norm": 1.0690525770187378, "learning_rate": 0.0001647, "loss": 5.578800048828125, "step": 550 }, { "epoch": 0.0012288006291459222, "grad_norm": 0.9692347049713135, "learning_rate": 0.00017969999999999998, "loss": 5.012375183105469, "step": 600 }, { "epoch": 0.001331200681574749, "grad_norm": 1.241825819015503, "learning_rate": 0.0001947, "loss": 6.2367095947265625, "step": 650 }, { "epoch": 0.0014336007340035757, "grad_norm": 1.1092249155044556, "learning_rate": 0.00020969999999999997, "loss": 5.6980218505859375, "step": 700 }, { "epoch": 0.0015360007864324027, "grad_norm": 0.8965554237365723, "learning_rate": 0.0002247, "loss": 5.374319458007813, "step": 750 }, { "epoch": 0.0016384008388612295, "grad_norm": 1.4790899753570557, "learning_rate": 0.0002397, "loss": 4.950992126464843, "step": 800 }, { "epoch": 0.0017408008912900563, "grad_norm": 0.9521295428276062, "learning_rate": 0.00025469999999999996, "loss": 5.40803955078125, "step": 850 }, { "epoch": 0.0018432009437188831, "grad_norm": 0.836391806602478, "learning_rate": 0.0002697, "loss": 5.781373291015625, "step": 900 }, { "epoch": 0.0019456009961477101, "grad_norm": 0.9251846075057983, "learning_rate": 0.0002847, "loss": 5.027839660644531, "step": 950 }, { "epoch": 0.0020480010485765367, "grad_norm": 0.8701666593551636, "learning_rate": 0.00029969999999999997, "loss": 5.40314208984375, "step": 1000 }, { "epoch": 0.002150401101005364, "grad_norm": 1.102386474609375, "learning_rate": 0.0002999999925149585, "loss": 5.350908203125, "step": 1050 }, { "epoch": 0.0022528011534341907, "grad_norm": 1.2463473081588745, "learning_rate": 0.0002999999694456937, "loss": 5.666819458007812, "step": 1100 }, { "epoch": 0.0023552012058630175, "grad_norm": 1.5089119672775269, "learning_rate": 0.00029999993078909046, "loss": 5.699287109375, "step": 1150 }, { "epoch": 0.0024576012582918443, "grad_norm": 0.8943452835083008, "learning_rate": 0.0002999998765451527, "loss": 5.515684814453125, "step": 1200 }, { "epoch": 0.002560001310720671, "grad_norm": 0.9431995749473572, "learning_rate": 0.0002999998067138862, "loss": 5.668785400390625, "step": 1250 }, { "epoch": 0.002662401363149498, "grad_norm": 0.7088674902915955, "learning_rate": 0.00029999972129529813, "loss": 5.059076843261718, "step": 1300 }, { "epoch": 0.0027648014155783247, "grad_norm": 0.7139531970024109, "learning_rate": 0.00029999962028939744, "loss": 5.189839477539063, "step": 1350 }, { "epoch": 0.0028672014680071515, "grad_norm": 0.6557895541191101, "learning_rate": 0.0002999995036961946, "loss": 5.100037841796875, "step": 1400 }, { "epoch": 0.0029696015204359783, "grad_norm": 2.214290142059326, "learning_rate": 0.0002999993715157016, "loss": 3.933666076660156, "step": 1450 }, { "epoch": 0.0030720015728648055, "grad_norm": 1.1693249940872192, "learning_rate": 0.0002999992237479324, "loss": 5.641339721679688, "step": 1500 }, { "epoch": 0.0031744016252936323, "grad_norm": 0.812566876411438, "learning_rate": 0.0002999990603929022, "loss": 4.826256408691406, "step": 1550 }, { "epoch": 0.003276801677722459, "grad_norm": 0.8744778037071228, "learning_rate": 0.00029999888145062803, "loss": 5.060762329101562, "step": 1600 }, { "epoch": 0.003379201730151286, "grad_norm": 1.4869335889816284, "learning_rate": 0.0002999986869211285, "loss": 5.231287231445313, "step": 1650 }, { "epoch": 0.0034816017825801127, "grad_norm": 2.121548652648926, "learning_rate": 0.0002999984768044237, "loss": 5.097483520507812, "step": 1700 }, { "epoch": 0.0035840018350089395, "grad_norm": 0.8741556406021118, "learning_rate": 0.00029999825110053565, "loss": 4.697709045410156, "step": 1750 }, { "epoch": 0.0036864018874377662, "grad_norm": 0.6771953105926514, "learning_rate": 0.00029999800980948764, "loss": 5.405962524414062, "step": 1800 }, { "epoch": 0.003788801939866593, "grad_norm": 0.7090007066726685, "learning_rate": 0.00029999775293130485, "loss": 5.24799560546875, "step": 1850 }, { "epoch": 0.0038912019922954203, "grad_norm": 0.561838686466217, "learning_rate": 0.00029999748046601396, "loss": 5.034546813964844, "step": 1900 }, { "epoch": 0.003993602044724247, "grad_norm": 1.955099105834961, "learning_rate": 0.0002999971924136432, "loss": 4.816056823730468, "step": 1950 }, { "epoch": 0.004096002097153073, "grad_norm": 1.5861859321594238, "learning_rate": 0.00029999688877422264, "loss": 4.836883544921875, "step": 2000 }, { "epoch": 0.0041984021495819, "grad_norm": 0.599829375743866, "learning_rate": 0.00029999656954778374, "loss": 4.677350463867188, "step": 2050 }, { "epoch": 0.004300802202010728, "grad_norm": 0.7785560488700867, "learning_rate": 0.0002999962347343597, "loss": 4.665549621582032, "step": 2100 }, { "epoch": 0.004403202254439555, "grad_norm": 0.7040075659751892, "learning_rate": 0.00029999588433398533, "loss": 4.816753540039063, "step": 2150 }, { "epoch": 0.0045056023068683814, "grad_norm": 0.9000102877616882, "learning_rate": 0.00029999551834669695, "loss": 4.776250915527344, "step": 2200 }, { "epoch": 0.004608002359297208, "grad_norm": 0.8187811374664307, "learning_rate": 0.0002999951367725327, "loss": 5.544743041992188, "step": 2250 }, { "epoch": 0.004710402411726035, "grad_norm": 0.684819757938385, "learning_rate": 0.0002999947396115322, "loss": 5.165157470703125, "step": 2300 }, { "epoch": 0.004812802464154862, "grad_norm": 1.125178337097168, "learning_rate": 0.0002999943268637367, "loss": 4.768605651855469, "step": 2350 }, { "epoch": 0.004915202516583689, "grad_norm": 0.8499088287353516, "learning_rate": 0.0002999938985291891, "loss": 4.563653869628906, "step": 2400 }, { "epoch": 0.005017602569012515, "grad_norm": 0.8239416480064392, "learning_rate": 0.0002999934546079339, "loss": 4.3343331909179685, "step": 2450 }, { "epoch": 0.005120002621441342, "grad_norm": 0.9708461761474609, "learning_rate": 0.00029999299510001726, "loss": 4.572106018066406, "step": 2500 }, { "epoch": 0.005222402673870169, "grad_norm": 0.5595722794532776, "learning_rate": 0.0002999925200054869, "loss": 3.886677551269531, "step": 2550 }, { "epoch": 0.005324802726298996, "grad_norm": 0.843467116355896, "learning_rate": 0.0002999920293243922, "loss": 4.781981506347656, "step": 2600 }, { "epoch": 0.005427202778727823, "grad_norm": 0.7127471566200256, "learning_rate": 0.0002999915230567842, "loss": 4.583160400390625, "step": 2650 }, { "epoch": 0.005529602831156649, "grad_norm": 1.2107303142547607, "learning_rate": 0.00029999100120271544, "loss": 4.792764587402344, "step": 2700 }, { "epoch": 0.005632002883585476, "grad_norm": 0.46370163559913635, "learning_rate": 0.0002999904637622402, "loss": 4.452548522949218, "step": 2750 }, { "epoch": 0.005734402936014303, "grad_norm": 0.8558986186981201, "learning_rate": 0.00029998991073541424, "loss": 4.687911376953125, "step": 2800 }, { "epoch": 0.00583680298844313, "grad_norm": 0.716712236404419, "learning_rate": 0.0002999893421222951, "loss": 5.1007318115234375, "step": 2850 }, { "epoch": 0.0059392030408719565, "grad_norm": 0.6236938238143921, "learning_rate": 0.00029998875792294186, "loss": 4.0649325561523435, "step": 2900 }, { "epoch": 0.006041603093300784, "grad_norm": 0.7991392612457275, "learning_rate": 0.0002999881581374152, "loss": 5.119035339355468, "step": 2950 }, { "epoch": 0.006144003145729611, "grad_norm": 0.8357495665550232, "learning_rate": 0.00029998754276577757, "loss": 4.757432556152343, "step": 3000 }, { "epoch": 0.006246403198158438, "grad_norm": 0.6117859482765198, "learning_rate": 0.0002999869118080927, "loss": 4.448386840820312, "step": 3050 }, { "epoch": 0.0063488032505872646, "grad_norm": 0.49256569147109985, "learning_rate": 0.0002999862652644263, "loss": 3.11305419921875, "step": 3100 }, { "epoch": 0.006451203303016091, "grad_norm": 0.6232755184173584, "learning_rate": 0.00029998560313484557, "loss": 4.7346923828125, "step": 3150 }, { "epoch": 0.006553603355444918, "grad_norm": 0.9806835055351257, "learning_rate": 0.00029998492541941926, "loss": 5.011588745117187, "step": 3200 }, { "epoch": 0.006656003407873745, "grad_norm": 0.5504988431930542, "learning_rate": 0.00029998423211821776, "loss": 4.568263549804687, "step": 3250 }, { "epoch": 0.006758403460302572, "grad_norm": 1.2172794342041016, "learning_rate": 0.0002999835232313133, "loss": 4.617164306640625, "step": 3300 }, { "epoch": 0.0068608035127313985, "grad_norm": 0.8813052773475647, "learning_rate": 0.0002999827987587793, "loss": 4.053099975585938, "step": 3350 }, { "epoch": 0.006963203565160225, "grad_norm": 0.9132696986198425, "learning_rate": 0.0002999820587006912, "loss": 3.3842108154296877, "step": 3400 }, { "epoch": 0.007065603617589052, "grad_norm": 0.6898446679115295, "learning_rate": 0.0002999813030571258, "loss": 4.640269470214844, "step": 3450 }, { "epoch": 0.007168003670017879, "grad_norm": 0.8895163536071777, "learning_rate": 0.0002999805318281617, "loss": 4.337832641601563, "step": 3500 }, { "epoch": 0.007270403722446706, "grad_norm": 0.8650217056274414, "learning_rate": 0.000299979745013879, "loss": 4.312217102050782, "step": 3550 }, { "epoch": 0.0073728037748755325, "grad_norm": 0.8591002821922302, "learning_rate": 0.0002999789426143595, "loss": 4.517200622558594, "step": 3600 }, { "epoch": 0.007475203827304359, "grad_norm": 1.0993435382843018, "learning_rate": 0.0002999781246296866, "loss": 5.017222900390625, "step": 3650 }, { "epoch": 0.007577603879733186, "grad_norm": 0.826409101486206, "learning_rate": 0.00029997729105994523, "loss": 5.4449609375, "step": 3700 }, { "epoch": 0.007680003932162013, "grad_norm": 0.7336626052856445, "learning_rate": 0.0002999764419052221, "loss": 5.442882080078125, "step": 3750 }, { "epoch": 0.0077824039845908405, "grad_norm": 0.8554229140281677, "learning_rate": 0.00029997557716560536, "loss": 5.044765625, "step": 3800 }, { "epoch": 0.007884804037019667, "grad_norm": 1.2047715187072754, "learning_rate": 0.0002999746968411849, "loss": 5.347750244140625, "step": 3850 }, { "epoch": 0.007987204089448493, "grad_norm": 0.6852602362632751, "learning_rate": 0.00029997380093205227, "loss": 5.431246948242188, "step": 3900 }, { "epoch": 0.008089604141877321, "grad_norm": 0.599185585975647, "learning_rate": 0.00029997288943830043, "loss": 5.4587548828125, "step": 3950 }, { "epoch": 0.008192004194306147, "grad_norm": 0.6573649644851685, "learning_rate": 0.0002999719623600242, "loss": 5.388607177734375, "step": 4000 }, { "epoch": 0.008294404246734974, "grad_norm": 0.8899281024932861, "learning_rate": 0.00029997101969731995, "loss": 5.013424072265625, "step": 4050 }, { "epoch": 0.0083968042991638, "grad_norm": 0.7623964548110962, "learning_rate": 0.0002999700614502855, "loss": 5.455863037109375, "step": 4100 }, { "epoch": 0.008499204351592628, "grad_norm": 0.6434335112571716, "learning_rate": 0.0002999690876190205, "loss": 4.965211791992187, "step": 4150 }, { "epoch": 0.008601604404021456, "grad_norm": 1.0846576690673828, "learning_rate": 0.0002999680982036263, "loss": 5.367398071289062, "step": 4200 }, { "epoch": 0.008704004456450282, "grad_norm": 0.687623143196106, "learning_rate": 0.0002999670932042054, "loss": 5.260775146484375, "step": 4250 }, { "epoch": 0.00880640450887911, "grad_norm": 0.7438795566558838, "learning_rate": 0.0002999660726208625, "loss": 4.861600341796875, "step": 4300 }, { "epoch": 0.008908804561307935, "grad_norm": 0.653516948223114, "learning_rate": 0.0002999650364537035, "loss": 5.213981323242187, "step": 4350 }, { "epoch": 0.009011204613736763, "grad_norm": 0.6365879774093628, "learning_rate": 0.0002999639847028362, "loss": 5.282333984375, "step": 4400 }, { "epoch": 0.009113604666165589, "grad_norm": 1.073702335357666, "learning_rate": 0.00029996291736836977, "loss": 4.728897705078125, "step": 4450 }, { "epoch": 0.009216004718594416, "grad_norm": 0.5726307034492493, "learning_rate": 0.00029996183445041524, "loss": 4.985563354492188, "step": 4500 }, { "epoch": 0.009318404771023242, "grad_norm": 0.8428155779838562, "learning_rate": 0.00029996073594908503, "loss": 5.237740478515625, "step": 4550 }, { "epoch": 0.00942080482345207, "grad_norm": 0.7983867526054382, "learning_rate": 0.0002999596218644934, "loss": 5.2612847900390625, "step": 4600 }, { "epoch": 0.009523204875880896, "grad_norm": 1.4800513982772827, "learning_rate": 0.000299958492196756, "loss": 5.220035400390625, "step": 4650 }, { "epoch": 0.009625604928309724, "grad_norm": 0.7891004085540771, "learning_rate": 0.00029995734694599033, "loss": 4.930169677734375, "step": 4700 }, { "epoch": 0.00972800498073855, "grad_norm": 0.6847373247146606, "learning_rate": 0.0002999561861123153, "loss": 4.984630126953125, "step": 4750 }, { "epoch": 0.009830405033167377, "grad_norm": 0.6594445705413818, "learning_rate": 0.0002999550096958517, "loss": 5.030910034179687, "step": 4800 }, { "epoch": 0.009932805085596203, "grad_norm": 0.6435703635215759, "learning_rate": 0.0002999538176967216, "loss": 5.204117431640625, "step": 4850 }, { "epoch": 0.01003520513802503, "grad_norm": 0.43691107630729675, "learning_rate": 0.0002999526101150489, "loss": 4.9494412231445315, "step": 4900 }, { "epoch": 0.010137605190453857, "grad_norm": 0.839853823184967, "learning_rate": 0.00029995138695095914, "loss": 3.1014248657226564, "step": 4950 }, { "epoch": 0.010240005242882684, "grad_norm": 0.8040403723716736, "learning_rate": 0.00029995014820457947, "loss": 5.11622314453125, "step": 5000 }, { "epoch": 0.010342405295311512, "grad_norm": 0.6953795552253723, "learning_rate": 0.0002999488938760385, "loss": 5.122266235351563, "step": 5050 }, { "epoch": 0.010444805347740338, "grad_norm": 0.5960660576820374, "learning_rate": 0.00029994762396546665, "loss": 4.512597961425781, "step": 5100 }, { "epoch": 0.010547205400169166, "grad_norm": 0.7795936465263367, "learning_rate": 0.0002999463384729958, "loss": 4.1439907836914065, "step": 5150 }, { "epoch": 0.010649605452597992, "grad_norm": 0.5827996730804443, "learning_rate": 0.0002999450373987597, "loss": 5.13221435546875, "step": 5200 }, { "epoch": 0.01075200550502682, "grad_norm": 0.5559226870536804, "learning_rate": 0.0002999437207428934, "loss": 5.330996704101563, "step": 5250 }, { "epoch": 0.010854405557455645, "grad_norm": 0.7576444745063782, "learning_rate": 0.0002999423885055338, "loss": 5.0482110595703125, "step": 5300 }, { "epoch": 0.010956805609884473, "grad_norm": 0.6038886308670044, "learning_rate": 0.0002999410406868193, "loss": 5.026975708007813, "step": 5350 }, { "epoch": 0.011059205662313299, "grad_norm": 0.8441299200057983, "learning_rate": 0.00029993967728688997, "loss": 5.212452392578125, "step": 5400 }, { "epoch": 0.011161605714742126, "grad_norm": 0.6785016655921936, "learning_rate": 0.00029993829830588745, "loss": 5.052464599609375, "step": 5450 }, { "epoch": 0.011264005767170952, "grad_norm": 0.7248463034629822, "learning_rate": 0.0002999369037439551, "loss": 4.948311157226563, "step": 5500 }, { "epoch": 0.01136640581959978, "grad_norm": 1.8698147535324097, "learning_rate": 0.00029993549360123777, "loss": 4.748592529296875, "step": 5550 }, { "epoch": 0.011468805872028606, "grad_norm": 0.5474430918693542, "learning_rate": 0.0002999340678778821, "loss": 4.849425659179688, "step": 5600 }, { "epoch": 0.011571205924457434, "grad_norm": 0.6169009804725647, "learning_rate": 0.00029993262657403613, "loss": 4.795867919921875, "step": 5650 }, { "epoch": 0.01167360597688626, "grad_norm": 0.773813009262085, "learning_rate": 0.0002999311696898497, "loss": 4.561126098632813, "step": 5700 }, { "epoch": 0.011776006029315087, "grad_norm": 0.841324508190155, "learning_rate": 0.00029992969722547424, "loss": 4.801204223632812, "step": 5750 }, { "epoch": 0.011878406081743913, "grad_norm": 0.6325180530548096, "learning_rate": 0.0002999282091810627, "loss": 5.0141598510742185, "step": 5800 }, { "epoch": 0.01198080613417274, "grad_norm": 0.6073687672615051, "learning_rate": 0.00029992670555676964, "loss": 4.727720642089844, "step": 5850 }, { "epoch": 0.012083206186601568, "grad_norm": 0.7254152297973633, "learning_rate": 0.00029992518635275147, "loss": 5.180827026367187, "step": 5900 }, { "epoch": 0.012185606239030394, "grad_norm": 0.6669420599937439, "learning_rate": 0.000299923651569166, "loss": 5.232777099609375, "step": 5950 }, { "epoch": 0.012288006291459222, "grad_norm": 0.9034198522567749, "learning_rate": 0.0002999221012061726, "loss": 4.571735229492187, "step": 6000 }, { "epoch": 0.012390406343888048, "grad_norm": 0.9541974663734436, "learning_rate": 0.0002999205352639326, "loss": 4.678871459960938, "step": 6050 }, { "epoch": 0.012492806396316876, "grad_norm": 1.4738138914108276, "learning_rate": 0.0002999189537426085, "loss": 4.96472412109375, "step": 6100 }, { "epoch": 0.012595206448745701, "grad_norm": 0.7434485554695129, "learning_rate": 0.0002999173566423648, "loss": 5.090062255859375, "step": 6150 }, { "epoch": 0.012697606501174529, "grad_norm": 0.5921583771705627, "learning_rate": 0.0002999157439633674, "loss": 4.839577026367188, "step": 6200 }, { "epoch": 0.012800006553603355, "grad_norm": 0.5730924606323242, "learning_rate": 0.00029991411570578385, "loss": 4.44057373046875, "step": 6250 }, { "epoch": 0.012902406606032183, "grad_norm": 0.6314680576324463, "learning_rate": 0.0002999124718697834, "loss": 4.906407165527344, "step": 6300 }, { "epoch": 0.013004806658461009, "grad_norm": 0.5586856603622437, "learning_rate": 0.00029991081245553695, "loss": 4.8386752319335935, "step": 6350 }, { "epoch": 0.013107206710889836, "grad_norm": 0.4960859417915344, "learning_rate": 0.0002999091374632168, "loss": 4.7797067260742185, "step": 6400 }, { "epoch": 0.013209606763318662, "grad_norm": 0.7504858374595642, "learning_rate": 0.0002999074468929971, "loss": 4.906391906738281, "step": 6450 }, { "epoch": 0.01331200681574749, "grad_norm": 0.5791200995445251, "learning_rate": 0.0002999057407450534, "loss": 4.6073193359375, "step": 6500 }, { "epoch": 0.013414406868176316, "grad_norm": 1.04066002368927, "learning_rate": 0.00029990401901956314, "loss": 4.697982177734375, "step": 6550 }, { "epoch": 0.013516806920605143, "grad_norm": 0.5570167899131775, "learning_rate": 0.0002999022817167052, "loss": 5.063222351074219, "step": 6600 }, { "epoch": 0.01361920697303397, "grad_norm": 0.6061655879020691, "learning_rate": 0.00029990052883666004, "loss": 4.329053955078125, "step": 6650 }, { "epoch": 0.013721607025462797, "grad_norm": 0.6637709736824036, "learning_rate": 0.0002998987603796099, "loss": 4.776343688964844, "step": 6700 }, { "epoch": 0.013824007077891625, "grad_norm": 0.6519717574119568, "learning_rate": 0.0002998969763457385, "loss": 4.839088439941406, "step": 6750 }, { "epoch": 0.01392640713032045, "grad_norm": 0.643963098526001, "learning_rate": 0.00029989517673523127, "loss": 4.581628112792969, "step": 6800 }, { "epoch": 0.014028807182749278, "grad_norm": 1.4058446884155273, "learning_rate": 0.0002998933615482751, "loss": 4.007187194824219, "step": 6850 }, { "epoch": 0.014131207235178104, "grad_norm": 0.7021802067756653, "learning_rate": 0.00029989153078505886, "loss": 4.761097106933594, "step": 6900 }, { "epoch": 0.014233607287606932, "grad_norm": 0.6105393171310425, "learning_rate": 0.0002998896844457725, "loss": 5.0122119140625, "step": 6950 }, { "epoch": 0.014336007340035758, "grad_norm": 0.7652610540390015, "learning_rate": 0.00029988782253060806, "loss": 4.946090393066406, "step": 7000 }, { "epoch": 0.014438407392464585, "grad_norm": 0.7618656754493713, "learning_rate": 0.000299885945039759, "loss": 3.561051025390625, "step": 7050 }, { "epoch": 0.014540807444893411, "grad_norm": 0.6516929864883423, "learning_rate": 0.0002998840519734204, "loss": 4.529894409179687, "step": 7100 }, { "epoch": 0.014643207497322239, "grad_norm": 1.0100959539413452, "learning_rate": 0.000299882143331789, "loss": 4.72200927734375, "step": 7150 }, { "epoch": 0.014745607549751065, "grad_norm": 0.9135130047798157, "learning_rate": 0.0002998802191150631, "loss": 4.017086791992187, "step": 7200 }, { "epoch": 0.014848007602179893, "grad_norm": 1.0336369276046753, "learning_rate": 0.0002998782793234427, "loss": 4.969613952636719, "step": 7250 }, { "epoch": 0.014950407654608719, "grad_norm": 0.6827586889266968, "learning_rate": 0.0002998763239571293, "loss": 4.958232421875, "step": 7300 }, { "epoch": 0.015052807707037546, "grad_norm": 0.8095134496688843, "learning_rate": 0.00029987435301632624, "loss": 4.539352722167969, "step": 7350 }, { "epoch": 0.015155207759466372, "grad_norm": 0.811736524105072, "learning_rate": 0.0002998723665012382, "loss": 4.618602905273438, "step": 7400 }, { "epoch": 0.0152576078118952, "grad_norm": 0.6750462651252747, "learning_rate": 0.00029987036441207163, "loss": 4.390194702148437, "step": 7450 }, { "epoch": 0.015360007864324026, "grad_norm": 0.6136668920516968, "learning_rate": 0.0002998683467490346, "loss": 4.691050109863281, "step": 7500 }, { "epoch": 0.015462407916752853, "grad_norm": 0.608397364616394, "learning_rate": 0.0002998663135123368, "loss": 5.00837646484375, "step": 7550 }, { "epoch": 0.015564807969181681, "grad_norm": 0.6426307559013367, "learning_rate": 0.0002998642647021895, "loss": 4.924872741699219, "step": 7600 }, { "epoch": 0.015667208021610507, "grad_norm": 0.8153278827667236, "learning_rate": 0.00029986220031880557, "loss": 4.830538635253906, "step": 7650 }, { "epoch": 0.015769608074039335, "grad_norm": 0.6194471120834351, "learning_rate": 0.0002998601203623995, "loss": 4.807819213867187, "step": 7700 }, { "epoch": 0.015872008126468162, "grad_norm": 1.5707075595855713, "learning_rate": 0.00029985802483318755, "loss": 4.509772644042969, "step": 7750 }, { "epoch": 0.015974408178896986, "grad_norm": 0.8517248630523682, "learning_rate": 0.0002998559137313874, "loss": 4.2860891723632815, "step": 7800 }, { "epoch": 0.016076808231325814, "grad_norm": 1.0736734867095947, "learning_rate": 0.00029985378705721843, "loss": 4.5593634033203125, "step": 7850 }, { "epoch": 0.016179208283754642, "grad_norm": 0.6145778894424438, "learning_rate": 0.0002998516448109016, "loss": 4.50625, "step": 7900 }, { "epoch": 0.01628160833618347, "grad_norm": 0.7230775356292725, "learning_rate": 0.00029984948699265967, "loss": 4.884090270996094, "step": 7950 }, { "epoch": 0.016384008388612294, "grad_norm": 0.7744879722595215, "learning_rate": 0.0002998473136027167, "loss": 4.186481018066406, "step": 8000 }, { "epoch": 0.01648640844104112, "grad_norm": 0.7375713586807251, "learning_rate": 0.00029984512464129856, "loss": 4.879469299316407, "step": 8050 }, { "epoch": 0.01658880849346995, "grad_norm": 1.0072307586669922, "learning_rate": 0.0002998429201086329, "loss": 4.755104064941406, "step": 8100 }, { "epoch": 0.016691208545898777, "grad_norm": 0.9491130113601685, "learning_rate": 0.00029984070000494854, "loss": 4.182529907226563, "step": 8150 }, { "epoch": 0.0167936085983276, "grad_norm": 0.9159969687461853, "learning_rate": 0.00029983846433047633, "loss": 4.361718444824219, "step": 8200 }, { "epoch": 0.01689600865075643, "grad_norm": 0.9138163328170776, "learning_rate": 0.00029983621308544864, "loss": 4.748040466308594, "step": 8250 }, { "epoch": 0.016998408703185256, "grad_norm": 0.7999444603919983, "learning_rate": 0.0002998339462700993, "loss": 4.52157470703125, "step": 8300 }, { "epoch": 0.017100808755614084, "grad_norm": 0.732362687587738, "learning_rate": 0.0002998316638846639, "loss": 4.664584045410156, "step": 8350 }, { "epoch": 0.01720320880804291, "grad_norm": 0.9679093956947327, "learning_rate": 0.00029982936592937967, "loss": 4.6484066772460935, "step": 8400 }, { "epoch": 0.017305608860471736, "grad_norm": 0.7307636141777039, "learning_rate": 0.0002998270524044853, "loss": 4.694376220703125, "step": 8450 }, { "epoch": 0.017408008912900563, "grad_norm": 0.7069781422615051, "learning_rate": 0.00029982472331022126, "loss": 4.551060180664063, "step": 8500 }, { "epoch": 0.01751040896532939, "grad_norm": 0.764034628868103, "learning_rate": 0.00029982237864682965, "loss": 4.622559814453125, "step": 8550 }, { "epoch": 0.01761280901775822, "grad_norm": 0.7239750623703003, "learning_rate": 0.000299820018414554, "loss": 4.617013549804687, "step": 8600 }, { "epoch": 0.017715209070187043, "grad_norm": 0.6056758165359497, "learning_rate": 0.0002998176426136396, "loss": 4.456921997070313, "step": 8650 }, { "epoch": 0.01781760912261587, "grad_norm": 0.8634012341499329, "learning_rate": 0.0002998152512443334, "loss": 4.55794677734375, "step": 8700 }, { "epoch": 0.017920009175044698, "grad_norm": 0.7804837226867676, "learning_rate": 0.00029981284430688384, "loss": 4.680322570800781, "step": 8750 }, { "epoch": 0.018022409227473526, "grad_norm": 0.773954451084137, "learning_rate": 0.00029981042180154103, "loss": 4.5744256591796875, "step": 8800 }, { "epoch": 0.01812480927990235, "grad_norm": 0.691335916519165, "learning_rate": 0.0002998079837285568, "loss": 4.607868347167969, "step": 8850 }, { "epoch": 0.018227209332331178, "grad_norm": 0.4418846368789673, "learning_rate": 0.0002998055300881844, "loss": 4.455259094238281, "step": 8900 }, { "epoch": 0.018329609384760005, "grad_norm": 1.0125758647918701, "learning_rate": 0.00029980306088067877, "loss": 3.1990432739257812, "step": 8950 }, { "epoch": 0.018432009437188833, "grad_norm": 0.7495264410972595, "learning_rate": 0.00029980057610629664, "loss": 4.650667419433594, "step": 9000 }, { "epoch": 0.018534409489617657, "grad_norm": 0.8682289123535156, "learning_rate": 0.0002997980757652961, "loss": 3.851683349609375, "step": 9050 }, { "epoch": 0.018636809542046485, "grad_norm": 0.9349716305732727, "learning_rate": 0.000299795559857937, "loss": 4.859715576171875, "step": 9100 }, { "epoch": 0.018739209594475312, "grad_norm": 0.7786422967910767, "learning_rate": 0.0002997930283844809, "loss": 4.666428833007813, "step": 9150 }, { "epoch": 0.01884160964690414, "grad_norm": 0.7877052426338196, "learning_rate": 0.0002997904813451907, "loss": 4.6610784912109375, "step": 9200 }, { "epoch": 0.018944009699332968, "grad_norm": 0.9601690173149109, "learning_rate": 0.00029978791874033114, "loss": 4.808619384765625, "step": 9250 }, { "epoch": 0.019046409751761792, "grad_norm": 0.5345655083656311, "learning_rate": 0.0002997853405701684, "loss": 4.262407836914062, "step": 9300 }, { "epoch": 0.01914880980419062, "grad_norm": 0.8365965485572815, "learning_rate": 0.00029978274683497067, "loss": 3.8195550537109373, "step": 9350 }, { "epoch": 0.019251209856619447, "grad_norm": 0.8324418663978577, "learning_rate": 0.00029978013753500723, "loss": 4.371593933105469, "step": 9400 }, { "epoch": 0.019353609909048275, "grad_norm": 0.7757883071899414, "learning_rate": 0.00029977751267054934, "loss": 4.406093444824219, "step": 9450 }, { "epoch": 0.0194560099614771, "grad_norm": 0.8704003095626831, "learning_rate": 0.0002997748722418697, "loss": 4.736319885253907, "step": 9500 }, { "epoch": 0.019558410013905927, "grad_norm": 0.8212069869041443, "learning_rate": 0.0002997722162492427, "loss": 4.341388549804687, "step": 9550 }, { "epoch": 0.019660810066334754, "grad_norm": 0.5836915373802185, "learning_rate": 0.0002997695446929444, "loss": 4.658592529296875, "step": 9600 }, { "epoch": 0.019763210118763582, "grad_norm": 0.8792363405227661, "learning_rate": 0.0002997668575732524, "loss": 4.1852349853515625, "step": 9650 }, { "epoch": 0.019865610171192406, "grad_norm": 0.6817139387130737, "learning_rate": 0.00029976415489044585, "loss": 4.120821838378906, "step": 9700 }, { "epoch": 0.019968010223621234, "grad_norm": 0.9270561337471008, "learning_rate": 0.0002997614366448057, "loss": 4.595604553222656, "step": 9750 }, { "epoch": 0.02007041027605006, "grad_norm": 0.7752207517623901, "learning_rate": 0.0002997587028366144, "loss": 4.643276977539062, "step": 9800 }, { "epoch": 0.02017281032847889, "grad_norm": 0.6949714422225952, "learning_rate": 0.000299755953466156, "loss": 4.598296203613281, "step": 9850 }, { "epoch": 0.020275210380907713, "grad_norm": 0.6971185207366943, "learning_rate": 0.00029975318853371624, "loss": 3.976045837402344, "step": 9900 }, { "epoch": 0.02037761043333654, "grad_norm": 0.6620817184448242, "learning_rate": 0.00029975040803958237, "loss": 4.670194396972656, "step": 9950 }, { "epoch": 0.02048001048576537, "grad_norm": 0.7390024065971375, "learning_rate": 0.0002997476119840434, "loss": 4.440447998046875, "step": 10000 }, { "epoch": 0.020582410538194196, "grad_norm": 1.074389934539795, "learning_rate": 0.0002997448003673899, "loss": 4.406011352539062, "step": 10050 }, { "epoch": 0.020684810590623024, "grad_norm": 0.7580602765083313, "learning_rate": 0.000299741973189914, "loss": 4.489655456542969, "step": 10100 }, { "epoch": 0.02078721064305185, "grad_norm": 0.8966153860092163, "learning_rate": 0.0002997391304519094, "loss": 4.419082946777344, "step": 10150 }, { "epoch": 0.020889610695480676, "grad_norm": 0.8477383255958557, "learning_rate": 0.00029973627215367166, "loss": 4.569579467773438, "step": 10200 }, { "epoch": 0.020992010747909504, "grad_norm": 1.0875380039215088, "learning_rate": 0.00029973339829549776, "loss": 4.634755859375, "step": 10250 }, { "epoch": 0.02109441080033833, "grad_norm": 1.043662190437317, "learning_rate": 0.00029973050887768625, "loss": 4.522914123535156, "step": 10300 }, { "epoch": 0.021196810852767155, "grad_norm": 0.7864259481430054, "learning_rate": 0.0002997276039005375, "loss": 4.525141296386718, "step": 10350 }, { "epoch": 0.021299210905195983, "grad_norm": 0.7917724251747131, "learning_rate": 0.00029972468336435335, "loss": 4.140654602050781, "step": 10400 }, { "epoch": 0.02140161095762481, "grad_norm": 0.8878633975982666, "learning_rate": 0.0002997217472694372, "loss": 4.351778564453125, "step": 10450 }, { "epoch": 0.02150401101005364, "grad_norm": 0.8213766813278198, "learning_rate": 0.0002997187956160943, "loss": 4.426820068359375, "step": 10500 }, { "epoch": 0.021606411062482463, "grad_norm": 0.8385847210884094, "learning_rate": 0.0002997158284046313, "loss": 4.410148315429687, "step": 10550 }, { "epoch": 0.02170881111491129, "grad_norm": 1.029899001121521, "learning_rate": 0.0002997128456353565, "loss": 4.456363830566406, "step": 10600 }, { "epoch": 0.021811211167340118, "grad_norm": 0.8777541518211365, "learning_rate": 0.0002997098473085799, "loss": 4.56017578125, "step": 10650 }, { "epoch": 0.021913611219768946, "grad_norm": 0.7988455891609192, "learning_rate": 0.0002997068334246131, "loss": 4.490418701171875, "step": 10700 }, { "epoch": 0.02201601127219777, "grad_norm": 1.208889126777649, "learning_rate": 0.00029970380398376917, "loss": 4.553769836425781, "step": 10750 }, { "epoch": 0.022118411324626597, "grad_norm": 1.0305062532424927, "learning_rate": 0.0002997007589863631, "loss": 4.295799865722656, "step": 10800 }, { "epoch": 0.022220811377055425, "grad_norm": 0.8051795363426208, "learning_rate": 0.00029969769843271116, "loss": 4.561275329589844, "step": 10850 }, { "epoch": 0.022323211429484253, "grad_norm": 0.8639199733734131, "learning_rate": 0.00029969462232313154, "loss": 4.470157165527343, "step": 10900 }, { "epoch": 0.02242561148191308, "grad_norm": 0.7574597597122192, "learning_rate": 0.00029969153065794374, "loss": 4.476951599121094, "step": 10950 }, { "epoch": 0.022528011534341905, "grad_norm": 0.787169337272644, "learning_rate": 0.00029968842343746906, "loss": 4.459609375, "step": 11000 }, { "epoch": 0.022630411586770732, "grad_norm": 0.7591275572776794, "learning_rate": 0.0002996853006620305, "loss": 4.777853393554688, "step": 11050 }, { "epoch": 0.02273281163919956, "grad_norm": 0.9186727404594421, "learning_rate": 0.0002996821623319524, "loss": 4.658177185058594, "step": 11100 }, { "epoch": 0.022835211691628388, "grad_norm": 0.7670950293540955, "learning_rate": 0.0002996790084475611, "loss": 4.718930053710937, "step": 11150 }, { "epoch": 0.022937611744057212, "grad_norm": 0.9697529077529907, "learning_rate": 0.00029967583900918413, "loss": 4.4181521606445315, "step": 11200 }, { "epoch": 0.02304001179648604, "grad_norm": 1.807626724243164, "learning_rate": 0.00029967265401715083, "loss": 4.645519104003906, "step": 11250 }, { "epoch": 0.023142411848914867, "grad_norm": 0.8117107152938843, "learning_rate": 0.00029966945347179236, "loss": 3.835715637207031, "step": 11300 }, { "epoch": 0.023244811901343695, "grad_norm": 1.2005183696746826, "learning_rate": 0.00029966623737344124, "loss": 4.443558959960938, "step": 11350 }, { "epoch": 0.02334721195377252, "grad_norm": 0.8746537566184998, "learning_rate": 0.0002996630057224316, "loss": 4.395650024414063, "step": 11400 }, { "epoch": 0.023449612006201347, "grad_norm": 0.7293654680252075, "learning_rate": 0.00029965975851909934, "loss": 4.513606262207031, "step": 11450 }, { "epoch": 0.023552012058630174, "grad_norm": 0.7779085636138916, "learning_rate": 0.00029965649576378184, "loss": 4.524747009277344, "step": 11500 }, { "epoch": 0.023654412111059002, "grad_norm": 1.0146737098693848, "learning_rate": 0.00029965321745681816, "loss": 4.670032348632812, "step": 11550 }, { "epoch": 0.023756812163487826, "grad_norm": 0.9226559400558472, "learning_rate": 0.00029964992359854896, "loss": 4.319842529296875, "step": 11600 }, { "epoch": 0.023859212215916654, "grad_norm": 0.729659378528595, "learning_rate": 0.0002996466141893166, "loss": 4.3390591430664065, "step": 11650 }, { "epoch": 0.02396161226834548, "grad_norm": 0.8851988315582275, "learning_rate": 0.00029964328922946486, "loss": 4.193225708007812, "step": 11700 }, { "epoch": 0.02406401232077431, "grad_norm": 1.142880916595459, "learning_rate": 0.0002996399487193393, "loss": 4.7212896728515625, "step": 11750 }, { "epoch": 0.024166412373203137, "grad_norm": 0.6688424944877625, "learning_rate": 0.0002996365926592871, "loss": 4.5033807373046875, "step": 11800 }, { "epoch": 0.02426881242563196, "grad_norm": 0.89569491147995, "learning_rate": 0.00029963322104965693, "loss": 4.241100463867188, "step": 11850 }, { "epoch": 0.02437121247806079, "grad_norm": 0.8132964372634888, "learning_rate": 0.0002996298338907992, "loss": 4.217136535644531, "step": 11900 }, { "epoch": 0.024473612530489616, "grad_norm": 1.4552931785583496, "learning_rate": 0.00029962643118306597, "loss": 4.451352844238281, "step": 11950 }, { "epoch": 0.024576012582918444, "grad_norm": 0.7032333612442017, "learning_rate": 0.00029962301292681066, "loss": 3.709466857910156, "step": 12000 }, { "epoch": 0.024678412635347268, "grad_norm": 0.7736782431602478, "learning_rate": 0.0002996195791223886, "loss": 3.931116027832031, "step": 12050 }, { "epoch": 0.024780812687776096, "grad_norm": 1.0214853286743164, "learning_rate": 0.0002996161297701566, "loss": 4.091096496582031, "step": 12100 }, { "epoch": 0.024883212740204923, "grad_norm": 0.7319433093070984, "learning_rate": 0.00029961266487047307, "loss": 4.754253234863281, "step": 12150 }, { "epoch": 0.02498561279263375, "grad_norm": 0.7848948240280151, "learning_rate": 0.00029960918442369804, "loss": 4.210378723144531, "step": 12200 }, { "epoch": 0.025088012845062575, "grad_norm": 0.8420546650886536, "learning_rate": 0.00029960568843019327, "loss": 4.331927185058594, "step": 12250 }, { "epoch": 0.025190412897491403, "grad_norm": 0.7843689322471619, "learning_rate": 0.00029960217689032205, "loss": 4.491570129394531, "step": 12300 }, { "epoch": 0.02529281294992023, "grad_norm": 1.0013247728347778, "learning_rate": 0.0002995986498044491, "loss": 4.356235961914063, "step": 12350 }, { "epoch": 0.025395213002349058, "grad_norm": 0.8285472989082336, "learning_rate": 0.0002995951071729412, "loss": 4.19695556640625, "step": 12400 }, { "epoch": 0.025497613054777882, "grad_norm": 0.8935615420341492, "learning_rate": 0.0002995915489961663, "loss": 4.556292724609375, "step": 12450 }, { "epoch": 0.02560001310720671, "grad_norm": 1.1061961650848389, "learning_rate": 0.0002995879752744942, "loss": 4.260919799804688, "step": 12500 }, { "epoch": 0.025702413159635538, "grad_norm": 0.7796922922134399, "learning_rate": 0.00029958438600829633, "loss": 3.7681890869140626, "step": 12550 }, { "epoch": 0.025804813212064365, "grad_norm": 0.9937464594841003, "learning_rate": 0.0002995807811979456, "loss": 4.396112670898438, "step": 12600 }, { "epoch": 0.025907213264493193, "grad_norm": 0.9796547889709473, "learning_rate": 0.0002995771608438166, "loss": 4.378516540527344, "step": 12650 }, { "epoch": 0.026009613316922017, "grad_norm": 0.9051157236099243, "learning_rate": 0.00029957352494628563, "loss": 4.480902404785156, "step": 12700 }, { "epoch": 0.026112013369350845, "grad_norm": 0.706322968006134, "learning_rate": 0.0002995698735057304, "loss": 4.157791442871094, "step": 12750 }, { "epoch": 0.026214413421779673, "grad_norm": 1.033637285232544, "learning_rate": 0.0002995662065225304, "loss": 4.6359164428710935, "step": 12800 }, { "epoch": 0.0263168134742085, "grad_norm": 0.9319335222244263, "learning_rate": 0.00029956252399706673, "loss": 4.510284423828125, "step": 12850 }, { "epoch": 0.026419213526637324, "grad_norm": 0.887332022190094, "learning_rate": 0.000299558825929722, "loss": 4.224294738769531, "step": 12900 }, { "epoch": 0.026521613579066152, "grad_norm": 0.7545831203460693, "learning_rate": 0.0002995551123208805, "loss": 3.612664794921875, "step": 12950 }, { "epoch": 0.02662401363149498, "grad_norm": 1.4527435302734375, "learning_rate": 0.0002995513831709281, "loss": 3.0348556518554686, "step": 13000 }, { "epoch": 0.026726413683923807, "grad_norm": 0.826316237449646, "learning_rate": 0.00029954763848025244, "loss": 3.7530322265625, "step": 13050 }, { "epoch": 0.02682881373635263, "grad_norm": 0.7737396955490112, "learning_rate": 0.0002995438782492426, "loss": 4.3491796875, "step": 13100 }, { "epoch": 0.02693121378878146, "grad_norm": 0.7360561490058899, "learning_rate": 0.0002995401024782892, "loss": 4.23507568359375, "step": 13150 }, { "epoch": 0.027033613841210287, "grad_norm": 1.048795223236084, "learning_rate": 0.00029953631116778483, "loss": 4.128821716308594, "step": 13200 }, { "epoch": 0.027136013893639115, "grad_norm": 0.744465172290802, "learning_rate": 0.00029953250431812326, "loss": 4.229864501953125, "step": 13250 }, { "epoch": 0.02723841394606794, "grad_norm": 1.0225343704223633, "learning_rate": 0.0002995286819297002, "loss": 4.329259033203125, "step": 13300 }, { "epoch": 0.027340813998496766, "grad_norm": 0.8426514863967896, "learning_rate": 0.0002995248440029128, "loss": 4.405516662597656, "step": 13350 }, { "epoch": 0.027443214050925594, "grad_norm": 0.8175310492515564, "learning_rate": 0.00029952099053815996, "loss": 4.2612826538085935, "step": 13400 }, { "epoch": 0.027545614103354422, "grad_norm": 0.9133870601654053, "learning_rate": 0.000299517121535842, "loss": 4.32334228515625, "step": 13450 }, { "epoch": 0.02764801415578325, "grad_norm": 0.9261609315872192, "learning_rate": 0.00029951323699636107, "loss": 4.267542114257813, "step": 13500 }, { "epoch": 0.027750414208212074, "grad_norm": 0.964561402797699, "learning_rate": 0.00029950933692012076, "loss": 4.246123657226563, "step": 13550 }, { "epoch": 0.0278528142606409, "grad_norm": 1.1370861530303955, "learning_rate": 0.00029950542130752634, "loss": 4.350406188964843, "step": 13600 }, { "epoch": 0.02795521431306973, "grad_norm": 0.8274940848350525, "learning_rate": 0.00029950149015898483, "loss": 4.124059448242187, "step": 13650 }, { "epoch": 0.028057614365498557, "grad_norm": 1.0486522912979126, "learning_rate": 0.0002994975434749046, "loss": 4.241673278808594, "step": 13700 }, { "epoch": 0.02816001441792738, "grad_norm": 0.8022660613059998, "learning_rate": 0.0002994935812556958, "loss": 3.647921447753906, "step": 13750 }, { "epoch": 0.02826241447035621, "grad_norm": 1.1589747667312622, "learning_rate": 0.00029948960350177026, "loss": 4.2052005004882815, "step": 13800 }, { "epoch": 0.028364814522785036, "grad_norm": 0.6878979802131653, "learning_rate": 0.0002994856102135412, "loss": 4.0764639282226565, "step": 13850 }, { "epoch": 0.028467214575213864, "grad_norm": 1.299386739730835, "learning_rate": 0.0002994816013914236, "loss": 3.8260293579101563, "step": 13900 }, { "epoch": 0.028569614627642688, "grad_norm": 0.7897019982337952, "learning_rate": 0.0002994775770358342, "loss": 4.474502258300781, "step": 13950 }, { "epoch": 0.028672014680071516, "grad_norm": 1.031049132347107, "learning_rate": 0.000299473537147191, "loss": 4.189122924804687, "step": 14000 }, { "epoch": 0.028774414732500343, "grad_norm": 1.224804401397705, "learning_rate": 0.0002994694817259139, "loss": 4.223143615722656, "step": 14050 }, { "epoch": 0.02887681478492917, "grad_norm": 0.8684813380241394, "learning_rate": 0.00029946541077242433, "loss": 4.23610107421875, "step": 14100 }, { "epoch": 0.028979214837357995, "grad_norm": 0.6440140008926392, "learning_rate": 0.0002994613242871453, "loss": 3.841741638183594, "step": 14150 }, { "epoch": 0.029081614889786823, "grad_norm": 0.49674278497695923, "learning_rate": 0.0002994572222705014, "loss": 2.3330259704589844, "step": 14200 }, { "epoch": 0.02918401494221565, "grad_norm": 0.8202585577964783, "learning_rate": 0.00029945310472291906, "loss": 3.3214230346679687, "step": 14250 }, { "epoch": 0.029286414994644478, "grad_norm": 0.9601882100105286, "learning_rate": 0.00029944897164482597, "loss": 4.437399291992188, "step": 14300 }, { "epoch": 0.029388815047073306, "grad_norm": 0.8373337388038635, "learning_rate": 0.00029944482303665175, "loss": 4.476743469238281, "step": 14350 }, { "epoch": 0.02949121509950213, "grad_norm": 0.7051481008529663, "learning_rate": 0.0002994406588988274, "loss": 4.182169189453125, "step": 14400 }, { "epoch": 0.029593615151930958, "grad_norm": 1.0870895385742188, "learning_rate": 0.00029943647923178575, "loss": 4.3550872802734375, "step": 14450 }, { "epoch": 0.029696015204359785, "grad_norm": 0.768278181552887, "learning_rate": 0.00029943228403596107, "loss": 4.4534228515625, "step": 14500 }, { "epoch": 0.029798415256788613, "grad_norm": 0.8001137971878052, "learning_rate": 0.00029942807331178933, "loss": 4.255840148925781, "step": 14550 }, { "epoch": 0.029900815309217437, "grad_norm": 0.779834508895874, "learning_rate": 0.000299423847059708, "loss": 4.268035888671875, "step": 14600 }, { "epoch": 0.030003215361646265, "grad_norm": 0.7155870199203491, "learning_rate": 0.00029941960528015644, "loss": 3.9607696533203125, "step": 14650 }, { "epoch": 0.030105615414075092, "grad_norm": 0.8414117693901062, "learning_rate": 0.0002994153479735753, "loss": 3.9243670654296876, "step": 14700 }, { "epoch": 0.03020801546650392, "grad_norm": 1.1119954586029053, "learning_rate": 0.00029941107514040694, "loss": 4.47902099609375, "step": 14750 }, { "epoch": 0.030310415518932744, "grad_norm": 1.0905205011367798, "learning_rate": 0.00029940678678109546, "loss": 4.391621398925781, "step": 14800 }, { "epoch": 0.030412815571361572, "grad_norm": 0.9594521522521973, "learning_rate": 0.00029940248289608655, "loss": 4.156022644042968, "step": 14850 }, { "epoch": 0.0305152156237904, "grad_norm": 0.830136775970459, "learning_rate": 0.0002993981634858273, "loss": 4.088116760253906, "step": 14900 }, { "epoch": 0.030617615676219227, "grad_norm": 0.7149996161460876, "learning_rate": 0.00029939382855076664, "loss": 3.857545166015625, "step": 14950 }, { "epoch": 0.03072001572864805, "grad_norm": 0.8593119978904724, "learning_rate": 0.0002993894780913551, "loss": 4.10996826171875, "step": 15000 }, { "epoch": 0.03082241578107688, "grad_norm": 0.8296166658401489, "learning_rate": 0.0002993851121080446, "loss": 4.353337097167969, "step": 15050 }, { "epoch": 0.030924815833505707, "grad_norm": 0.7708966732025146, "learning_rate": 0.00029938073060128896, "loss": 4.261842651367187, "step": 15100 }, { "epoch": 0.031027215885934534, "grad_norm": 0.6590582132339478, "learning_rate": 0.00029937633357154345, "loss": 3.885545349121094, "step": 15150 }, { "epoch": 0.031129615938363362, "grad_norm": 1.0012860298156738, "learning_rate": 0.000299371921019265, "loss": 4.263138427734375, "step": 15200 }, { "epoch": 0.031232015990792186, "grad_norm": 0.9702419638633728, "learning_rate": 0.00029936749294491214, "loss": 4.242536926269532, "step": 15250 }, { "epoch": 0.031334416043221014, "grad_norm": 1.740096092224121, "learning_rate": 0.000299363049348945, "loss": 3.794849853515625, "step": 15300 }, { "epoch": 0.03143681609564984, "grad_norm": 0.8641059994697571, "learning_rate": 0.0002993585902318254, "loss": 3.7392898559570313, "step": 15350 }, { "epoch": 0.03153921614807867, "grad_norm": 0.7307964563369751, "learning_rate": 0.0002993541155940166, "loss": 4.284304809570313, "step": 15400 }, { "epoch": 0.0316416162005075, "grad_norm": 0.8395029902458191, "learning_rate": 0.0002993496254359837, "loss": 3.666776428222656, "step": 15450 }, { "epoch": 0.031744016252936325, "grad_norm": 0.88369220495224, "learning_rate": 0.00029934511975819323, "loss": 4.232069396972657, "step": 15500 }, { "epoch": 0.031846416305365145, "grad_norm": 0.900976836681366, "learning_rate": 0.00029934059856111337, "loss": 4.181927490234375, "step": 15550 }, { "epoch": 0.03194881635779397, "grad_norm": 0.8746826648712158, "learning_rate": 0.00029933606184521404, "loss": 4.177504577636719, "step": 15600 }, { "epoch": 0.0320512164102228, "grad_norm": 0.9220513105392456, "learning_rate": 0.0002993315096109666, "loss": 4.219546813964843, "step": 15650 }, { "epoch": 0.03215361646265163, "grad_norm": 0.9001684784889221, "learning_rate": 0.00029932694185884416, "loss": 4.161190490722657, "step": 15700 }, { "epoch": 0.032256016515080456, "grad_norm": 1.1615084409713745, "learning_rate": 0.0002993223585893213, "loss": 4.272937316894531, "step": 15750 }, { "epoch": 0.032358416567509284, "grad_norm": 0.9227635860443115, "learning_rate": 0.0002993177598028743, "loss": 4.0247500610351565, "step": 15800 }, { "epoch": 0.03246081661993811, "grad_norm": 0.9501990675926208, "learning_rate": 0.0002993131454999812, "loss": 3.6547119140625, "step": 15850 }, { "epoch": 0.03256321667236694, "grad_norm": 0.8894864320755005, "learning_rate": 0.0002993085156811213, "loss": 4.238618469238281, "step": 15900 }, { "epoch": 0.03266561672479577, "grad_norm": 1.1804680824279785, "learning_rate": 0.0002993038703467758, "loss": 4.274075317382812, "step": 15950 }, { "epoch": 0.03276801677722459, "grad_norm": 0.9597388505935669, "learning_rate": 0.00029929920949742743, "loss": 2.690977783203125, "step": 16000 }, { "epoch": 0.032870416829653415, "grad_norm": 0.8713410496711731, "learning_rate": 0.0002992945331335605, "loss": 4.216771850585937, "step": 16050 }, { "epoch": 0.03297281688208224, "grad_norm": 0.7275038361549377, "learning_rate": 0.000299289841255661, "loss": 3.7311639404296875, "step": 16100 }, { "epoch": 0.03307521693451107, "grad_norm": 0.987648069858551, "learning_rate": 0.0002992851338642164, "loss": 4.2468328857421875, "step": 16150 }, { "epoch": 0.0331776169869399, "grad_norm": 0.8776699900627136, "learning_rate": 0.00029928041095971593, "loss": 4.107083435058594, "step": 16200 }, { "epoch": 0.033280017039368726, "grad_norm": 1.0074553489685059, "learning_rate": 0.00029927567254265037, "loss": 4.185172119140625, "step": 16250 }, { "epoch": 0.03338241709179755, "grad_norm": 0.8109734058380127, "learning_rate": 0.00029927091861351216, "loss": 4.268891296386719, "step": 16300 }, { "epoch": 0.03348481714422638, "grad_norm": 1.1346583366394043, "learning_rate": 0.00029926614917279523, "loss": 4.282049865722656, "step": 16350 }, { "epoch": 0.0335872171966552, "grad_norm": 0.8583949208259583, "learning_rate": 0.0002992613642209952, "loss": 4.328241577148438, "step": 16400 }, { "epoch": 0.03368961724908403, "grad_norm": 0.8398747444152832, "learning_rate": 0.0002992565637586094, "loss": 4.186492004394531, "step": 16450 }, { "epoch": 0.03379201730151286, "grad_norm": 0.643873929977417, "learning_rate": 0.0002992517477861366, "loss": 3.162231140136719, "step": 16500 }, { "epoch": 0.033894417353941685, "grad_norm": 0.9688578248023987, "learning_rate": 0.00029924691630407724, "loss": 4.235280151367188, "step": 16550 }, { "epoch": 0.03399681740637051, "grad_norm": 0.8266287446022034, "learning_rate": 0.0002992420693129334, "loss": 4.479638977050781, "step": 16600 }, { "epoch": 0.03409921745879934, "grad_norm": 0.8200719356536865, "learning_rate": 0.0002992372068132088, "loss": 4.379118957519531, "step": 16650 }, { "epoch": 0.03420161751122817, "grad_norm": 0.9193712472915649, "learning_rate": 0.00029923232880540865, "loss": 4.209988708496094, "step": 16700 }, { "epoch": 0.034304017563656995, "grad_norm": 0.9132387638092041, "learning_rate": 0.0002992274352900399, "loss": 4.341851501464844, "step": 16750 }, { "epoch": 0.03440641761608582, "grad_norm": 1.0033169984817505, "learning_rate": 0.0002992225262676111, "loss": 4.356620483398437, "step": 16800 }, { "epoch": 0.034508817668514644, "grad_norm": 1.109008550643921, "learning_rate": 0.0002992176017386323, "loss": 4.189815368652344, "step": 16850 }, { "epoch": 0.03461121772094347, "grad_norm": 1.2428394556045532, "learning_rate": 0.00029921266170361533, "loss": 4.286259460449219, "step": 16900 }, { "epoch": 0.0347136177733723, "grad_norm": 0.9120133519172668, "learning_rate": 0.0002992077061630734, "loss": 4.392665405273437, "step": 16950 }, { "epoch": 0.03481601782580113, "grad_norm": 1.2237519025802612, "learning_rate": 0.0002992027351175216, "loss": 4.461217041015625, "step": 17000 }, { "epoch": 0.034918417878229954, "grad_norm": 0.9254854917526245, "learning_rate": 0.00029919774856747636, "loss": 4.2495333862304685, "step": 17050 }, { "epoch": 0.03502081793065878, "grad_norm": 1.204923391342163, "learning_rate": 0.000299192746513456, "loss": 4.237576293945312, "step": 17100 }, { "epoch": 0.03512321798308761, "grad_norm": 0.8846333026885986, "learning_rate": 0.0002991877289559803, "loss": 3.958520812988281, "step": 17150 }, { "epoch": 0.03522561803551644, "grad_norm": 0.8742989897727966, "learning_rate": 0.00029918269589557055, "loss": 4.097115173339843, "step": 17200 }, { "epoch": 0.03532801808794526, "grad_norm": 0.9790547490119934, "learning_rate": 0.0002991776473327499, "loss": 4.068385314941406, "step": 17250 }, { "epoch": 0.035430418140374086, "grad_norm": 0.8808755278587341, "learning_rate": 0.0002991725832680428, "loss": 4.071025390625, "step": 17300 }, { "epoch": 0.03553281819280291, "grad_norm": 0.9796196818351746, "learning_rate": 0.00029916750370197567, "loss": 3.7829425048828127, "step": 17350 }, { "epoch": 0.03563521824523174, "grad_norm": 0.9726704955101013, "learning_rate": 0.00029916240863507625, "loss": 4.105780334472656, "step": 17400 }, { "epoch": 0.03573761829766057, "grad_norm": 1.0631580352783203, "learning_rate": 0.000299157298067874, "loss": 4.146686401367187, "step": 17450 }, { "epoch": 0.035840018350089396, "grad_norm": 0.8494559526443481, "learning_rate": 0.0002991521720009001, "loss": 4.303363342285156, "step": 17500 }, { "epoch": 0.035942418402518224, "grad_norm": 1.3400248289108276, "learning_rate": 0.00029914703043468704, "loss": 4.124955749511718, "step": 17550 }, { "epoch": 0.03604481845494705, "grad_norm": 1.2535253763198853, "learning_rate": 0.00029914187336976925, "loss": 3.625634765625, "step": 17600 }, { "epoch": 0.03614721850737588, "grad_norm": 0.9625725746154785, "learning_rate": 0.0002991367008066826, "loss": 4.224259948730468, "step": 17650 }, { "epoch": 0.0362496185598047, "grad_norm": 0.9419931769371033, "learning_rate": 0.00029913151274596456, "loss": 4.3089794921875, "step": 17700 }, { "epoch": 0.03635201861223353, "grad_norm": 1.2326748371124268, "learning_rate": 0.0002991263091881543, "loss": 4.07185791015625, "step": 17750 }, { "epoch": 0.036454418664662355, "grad_norm": 0.9051257967948914, "learning_rate": 0.00029912109013379253, "loss": 4.346282958984375, "step": 17800 }, { "epoch": 0.03655681871709118, "grad_norm": 0.8675338625907898, "learning_rate": 0.0002991158555834216, "loss": 4.14196044921875, "step": 17850 }, { "epoch": 0.03665921876952001, "grad_norm": 1.7800242900848389, "learning_rate": 0.0002991106055375854, "loss": 4.262186279296875, "step": 17900 }, { "epoch": 0.03676161882194884, "grad_norm": 0.8730024099349976, "learning_rate": 0.0002991053399968296, "loss": 3.647480163574219, "step": 17950 }, { "epoch": 0.036864018874377666, "grad_norm": 0.8715499639511108, "learning_rate": 0.0002991000589617013, "loss": 3.8062033081054687, "step": 18000 }, { "epoch": 0.036966418926806494, "grad_norm": 1.1045186519622803, "learning_rate": 0.0002990947624327493, "loss": 3.142933349609375, "step": 18050 }, { "epoch": 0.037068818979235314, "grad_norm": 0.9436252117156982, "learning_rate": 0.000299089450410524, "loss": 3.2230126953125, "step": 18100 }, { "epoch": 0.03717121903166414, "grad_norm": 0.7957382798194885, "learning_rate": 0.00029908412289557737, "loss": 4.389481811523438, "step": 18150 }, { "epoch": 0.03727361908409297, "grad_norm": 1.0775970220565796, "learning_rate": 0.0002990787798884631, "loss": 3.8384576416015626, "step": 18200 }, { "epoch": 0.0373760191365218, "grad_norm": 0.9266685843467712, "learning_rate": 0.00029907342138973627, "loss": 4.209334106445312, "step": 18250 }, { "epoch": 0.037478419188950625, "grad_norm": 0.9169478416442871, "learning_rate": 0.00029906804739995385, "loss": 4.067582092285156, "step": 18300 }, { "epoch": 0.03758081924137945, "grad_norm": 0.8588764071464539, "learning_rate": 0.0002990626579196742, "loss": 4.140736694335938, "step": 18350 }, { "epoch": 0.03768321929380828, "grad_norm": 1.0396775007247925, "learning_rate": 0.0002990572529494574, "loss": 4.312765502929688, "step": 18400 }, { "epoch": 0.03778561934623711, "grad_norm": 1.0524662733078003, "learning_rate": 0.0002990518324898652, "loss": 3.2222711181640626, "step": 18450 }, { "epoch": 0.037888019398665936, "grad_norm": 0.8703554272651672, "learning_rate": 0.00029904639654146066, "loss": 4.180811462402343, "step": 18500 }, { "epoch": 0.037990419451094756, "grad_norm": 2.620311737060547, "learning_rate": 0.00029904094510480885, "loss": 4.130848388671875, "step": 18550 }, { "epoch": 0.038092819503523584, "grad_norm": 0.8157406449317932, "learning_rate": 0.0002990354781804762, "loss": 3.6872372436523437, "step": 18600 }, { "epoch": 0.03819521955595241, "grad_norm": 0.8512464165687561, "learning_rate": 0.0002990299957690308, "loss": 4.433642883300781, "step": 18650 }, { "epoch": 0.03829761960838124, "grad_norm": 0.9459244012832642, "learning_rate": 0.0002990244978710423, "loss": 4.312282104492187, "step": 18700 }, { "epoch": 0.03840001966081007, "grad_norm": 0.8191068768501282, "learning_rate": 0.0002990189844870821, "loss": 4.3835546875, "step": 18750 }, { "epoch": 0.038502419713238895, "grad_norm": 0.9797852039337158, "learning_rate": 0.0002990134556177231, "loss": 4.277929077148437, "step": 18800 }, { "epoch": 0.03860481976566772, "grad_norm": 0.957114040851593, "learning_rate": 0.00029900791126353984, "loss": 4.525142822265625, "step": 18850 }, { "epoch": 0.03870721981809655, "grad_norm": 0.9237158894538879, "learning_rate": 0.0002990023514251085, "loss": 3.7692413330078125, "step": 18900 }, { "epoch": 0.03880961987052537, "grad_norm": 1.055321455001831, "learning_rate": 0.0002989967761030067, "loss": 4.0058810424804685, "step": 18950 }, { "epoch": 0.0389120199229542, "grad_norm": 0.9850941896438599, "learning_rate": 0.000298991185297814, "loss": 3.8927227783203127, "step": 19000 }, { "epoch": 0.039014419975383026, "grad_norm": 0.8424584269523621, "learning_rate": 0.0002989855790101112, "loss": 4.3304986572265625, "step": 19050 }, { "epoch": 0.039116820027811854, "grad_norm": 0.8309029936790466, "learning_rate": 0.00029897995724048105, "loss": 4.19474609375, "step": 19100 }, { "epoch": 0.03921922008024068, "grad_norm": 0.8734010457992554, "learning_rate": 0.00029897431998950763, "loss": 4.056589965820312, "step": 19150 }, { "epoch": 0.03932162013266951, "grad_norm": 1.723552942276001, "learning_rate": 0.0002989686672577767, "loss": 4.061507568359375, "step": 19200 }, { "epoch": 0.03942402018509834, "grad_norm": 1.0202237367630005, "learning_rate": 0.0002989629990458757, "loss": 3.8971566772460937, "step": 19250 }, { "epoch": 0.039526420237527164, "grad_norm": 1.2921315431594849, "learning_rate": 0.00029895731535439367, "loss": 3.0908432006835938, "step": 19300 }, { "epoch": 0.03962882028995599, "grad_norm": 1.0007706880569458, "learning_rate": 0.00029895161618392126, "loss": 3.4613546752929687, "step": 19350 }, { "epoch": 0.03973122034238481, "grad_norm": 1.0438216924667358, "learning_rate": 0.00029894590153505066, "loss": 3.344393615722656, "step": 19400 }, { "epoch": 0.03983362039481364, "grad_norm": 1.0282576084136963, "learning_rate": 0.0002989401714083757, "loss": 3.807875671386719, "step": 19450 }, { "epoch": 0.03993602044724247, "grad_norm": 1.20839262008667, "learning_rate": 0.00029893442580449187, "loss": 4.143163452148437, "step": 19500 }, { "epoch": 0.040038420499671296, "grad_norm": 1.1626482009887695, "learning_rate": 0.0002989286647239962, "loss": 4.075806884765625, "step": 19550 }, { "epoch": 0.04014082055210012, "grad_norm": 0.7632113695144653, "learning_rate": 0.0002989228881674874, "loss": 4.186883239746094, "step": 19600 }, { "epoch": 0.04024322060452895, "grad_norm": 0.8571646213531494, "learning_rate": 0.00029891709613556565, "loss": 3.8722219848632813, "step": 19650 }, { "epoch": 0.04034562065695778, "grad_norm": 1.4133912324905396, "learning_rate": 0.0002989112886288329, "loss": 3.877001953125, "step": 19700 }, { "epoch": 0.040448020709386606, "grad_norm": 0.9766141176223755, "learning_rate": 0.0002989054656478927, "loss": 3.9540411376953126, "step": 19750 }, { "epoch": 0.04055042076181543, "grad_norm": 0.8429685235023499, "learning_rate": 0.00029889962719335003, "loss": 4.412438049316406, "step": 19800 }, { "epoch": 0.040652820814244255, "grad_norm": 0.7656176686286926, "learning_rate": 0.0002988937732658116, "loss": 4.269136657714844, "step": 19850 }, { "epoch": 0.04075522086667308, "grad_norm": 1.1075332164764404, "learning_rate": 0.0002988879038658859, "loss": 4.419913330078125, "step": 19900 }, { "epoch": 0.04085762091910191, "grad_norm": 0.8199209570884705, "learning_rate": 0.0002988820189941826, "loss": 4.36384765625, "step": 19950 }, { "epoch": 0.04096002097153074, "grad_norm": 0.8144904375076294, "learning_rate": 0.00029887611865131344, "loss": 4.030648803710937, "step": 20000 }, { "epoch": 0.041062421023959565, "grad_norm": 0.9372329711914062, "learning_rate": 0.00029887020283789147, "loss": 4.1174404907226565, "step": 20050 }, { "epoch": 0.04116482107638839, "grad_norm": 0.8546763062477112, "learning_rate": 0.0002988642715545314, "loss": 4.441152648925781, "step": 20100 }, { "epoch": 0.04126722112881722, "grad_norm": 1.333139181137085, "learning_rate": 0.00029885832480184963, "loss": 4.200628356933594, "step": 20150 }, { "epoch": 0.04136962118124605, "grad_norm": 1.320517659187317, "learning_rate": 0.0002988523625804641, "loss": 3.89320068359375, "step": 20200 }, { "epoch": 0.04147202123367487, "grad_norm": 0.9039347171783447, "learning_rate": 0.0002988463848909944, "loss": 3.9010406494140626, "step": 20250 }, { "epoch": 0.0415744212861037, "grad_norm": 0.9151229858398438, "learning_rate": 0.00029884039173406167, "loss": 3.6283367919921874, "step": 20300 }, { "epoch": 0.041676821338532524, "grad_norm": 0.8544915318489075, "learning_rate": 0.00029883438311028876, "loss": 4.021604919433594, "step": 20350 }, { "epoch": 0.04177922139096135, "grad_norm": 1.2115877866744995, "learning_rate": 0.0002988283590203, "loss": 4.037056579589843, "step": 20400 }, { "epoch": 0.04188162144339018, "grad_norm": 0.8434769511222839, "learning_rate": 0.0002988223194647214, "loss": 4.190481262207031, "step": 20450 }, { "epoch": 0.04198402149581901, "grad_norm": 1.0086390972137451, "learning_rate": 0.00029881626444418056, "loss": 3.7280892944335937, "step": 20500 }, { "epoch": 0.042086421548247835, "grad_norm": 1.0009269714355469, "learning_rate": 0.0002988101939593067, "loss": 4.065418090820312, "step": 20550 }, { "epoch": 0.04218882160067666, "grad_norm": 0.7844799160957336, "learning_rate": 0.0002988041080107307, "loss": 3.97632080078125, "step": 20600 }, { "epoch": 0.04229122165310548, "grad_norm": 0.9640885591506958, "learning_rate": 0.00029879800659908485, "loss": 4.065289916992188, "step": 20650 }, { "epoch": 0.04239362170553431, "grad_norm": 0.8006758093833923, "learning_rate": 0.0002987918897250033, "loss": 4.137116088867187, "step": 20700 }, { "epoch": 0.04249602175796314, "grad_norm": 0.624839186668396, "learning_rate": 0.00029878575738912156, "loss": 2.075597839355469, "step": 20750 }, { "epoch": 0.042598421810391966, "grad_norm": 0.8152270317077637, "learning_rate": 0.00029877960959207706, "loss": 3.2935858154296875, "step": 20800 }, { "epoch": 0.042700821862820794, "grad_norm": 0.9872801303863525, "learning_rate": 0.0002987734463345085, "loss": 3.3229608154296875, "step": 20850 }, { "epoch": 0.04280322191524962, "grad_norm": 0.6640042066574097, "learning_rate": 0.00029876726761705636, "loss": 2.9013262939453126, "step": 20900 }, { "epoch": 0.04290562196767845, "grad_norm": 0.6145225167274475, "learning_rate": 0.00029876107344036277, "loss": 2.4409584045410155, "step": 20950 }, { "epoch": 0.04300802202010728, "grad_norm": 1.0556402206420898, "learning_rate": 0.0002987548638050714, "loss": 2.4114979553222655, "step": 21000 }, { "epoch": 0.043110422072536105, "grad_norm": 0.9862767457962036, "learning_rate": 0.00029874863871182745, "loss": 3.802875671386719, "step": 21050 }, { "epoch": 0.043212822124964925, "grad_norm": 0.852150559425354, "learning_rate": 0.0002987423981612778, "loss": 3.66058349609375, "step": 21100 }, { "epoch": 0.04331522217739375, "grad_norm": 0.8836477398872375, "learning_rate": 0.0002987361421540711, "loss": 3.4694943237304687, "step": 21150 }, { "epoch": 0.04341762222982258, "grad_norm": 1.5402307510375977, "learning_rate": 0.00029872987069085727, "loss": 3.277726135253906, "step": 21200 }, { "epoch": 0.04352002228225141, "grad_norm": 0.9419423341751099, "learning_rate": 0.0002987235837722881, "loss": 3.5211444091796875, "step": 21250 }, { "epoch": 0.043622422334680236, "grad_norm": 0.7486373782157898, "learning_rate": 0.0002987172813990169, "loss": 3.471663818359375, "step": 21300 }, { "epoch": 0.043724822387109064, "grad_norm": 0.7535277605056763, "learning_rate": 0.0002987109635716985, "loss": 3.376907958984375, "step": 21350 }, { "epoch": 0.04382722243953789, "grad_norm": 0.8332289457321167, "learning_rate": 0.0002987046302909895, "loss": 3.9842266845703125, "step": 21400 }, { "epoch": 0.04392962249196672, "grad_norm": 1.322947382926941, "learning_rate": 0.000298698281557548, "loss": 3.1945089721679687, "step": 21450 }, { "epoch": 0.04403202254439554, "grad_norm": 1.0296247005462646, "learning_rate": 0.00029869191737203377, "loss": 3.6288201904296873, "step": 21500 }, { "epoch": 0.04413442259682437, "grad_norm": 0.9314439296722412, "learning_rate": 0.0002986855377351081, "loss": 3.4926687622070314, "step": 21550 }, { "epoch": 0.044236822649253195, "grad_norm": 0.7597600221633911, "learning_rate": 0.000298679142647434, "loss": 2.996235046386719, "step": 21600 }, { "epoch": 0.04433922270168202, "grad_norm": 1.4043519496917725, "learning_rate": 0.00029867273210967593, "loss": 3.252802429199219, "step": 21650 }, { "epoch": 0.04444162275411085, "grad_norm": 3.3350236415863037, "learning_rate": 0.00029866630612250013, "loss": 3.2056927490234375, "step": 21700 }, { "epoch": 0.04454402280653968, "grad_norm": 0.8740987777709961, "learning_rate": 0.0002986598646865743, "loss": 3.5895626831054686, "step": 21750 }, { "epoch": 0.044646422858968506, "grad_norm": 1.1191177368164062, "learning_rate": 0.00029865340780256777, "loss": 3.456165466308594, "step": 21800 }, { "epoch": 0.04474882291139733, "grad_norm": 0.8428330421447754, "learning_rate": 0.0002986469354711516, "loss": 3.3481961059570313, "step": 21850 }, { "epoch": 0.04485122296382616, "grad_norm": 0.9282798767089844, "learning_rate": 0.0002986404476929984, "loss": 3.2974124145507813, "step": 21900 }, { "epoch": 0.04495362301625498, "grad_norm": 1.1790461540222168, "learning_rate": 0.00029863394446878223, "loss": 2.619112854003906, "step": 21950 }, { "epoch": 0.04505602306868381, "grad_norm": 0.905838131904602, "learning_rate": 0.00029862742579917894, "loss": 3.3288262939453124, "step": 22000 }, { "epoch": 0.04515842312111264, "grad_norm": 0.7021234631538391, "learning_rate": 0.00029862089168486596, "loss": 3.40490234375, "step": 22050 }, { "epoch": 0.045260823173541465, "grad_norm": 0.8678475618362427, "learning_rate": 0.00029861434212652215, "loss": 3.6314691162109374, "step": 22100 }, { "epoch": 0.04536322322597029, "grad_norm": 0.9551572203636169, "learning_rate": 0.00029860777712482824, "loss": 3.654752197265625, "step": 22150 }, { "epoch": 0.04546562327839912, "grad_norm": 1.1007713079452515, "learning_rate": 0.00029860119668046636, "loss": 3.439637451171875, "step": 22200 }, { "epoch": 0.04556802333082795, "grad_norm": 0.8319056034088135, "learning_rate": 0.0002985946007941204, "loss": 3.5101995849609375, "step": 22250 }, { "epoch": 0.045670423383256775, "grad_norm": 1.040257215499878, "learning_rate": 0.0002985879894664757, "loss": 3.7279443359375, "step": 22300 }, { "epoch": 0.045772823435685596, "grad_norm": 1.222548246383667, "learning_rate": 0.00029858136269821935, "loss": 3.6467132568359375, "step": 22350 }, { "epoch": 0.045875223488114424, "grad_norm": 0.7653852701187134, "learning_rate": 0.00029857472049003993, "loss": 3.789747619628906, "step": 22400 }, { "epoch": 0.04597762354054325, "grad_norm": 1.0074176788330078, "learning_rate": 0.00029856806284262767, "loss": 3.3356439208984376, "step": 22450 }, { "epoch": 0.04608002359297208, "grad_norm": 0.9829652309417725, "learning_rate": 0.0002985613897566744, "loss": 2.86457763671875, "step": 22500 }, { "epoch": 0.04618242364540091, "grad_norm": 1.0552867650985718, "learning_rate": 0.0002985547012328736, "loss": 3.389576110839844, "step": 22550 }, { "epoch": 0.046284823697829734, "grad_norm": 0.7977453470230103, "learning_rate": 0.00029854799727192024, "loss": 3.094827880859375, "step": 22600 }, { "epoch": 0.04638722375025856, "grad_norm": 1.0439661741256714, "learning_rate": 0.00029854127787451104, "loss": 3.353898620605469, "step": 22650 }, { "epoch": 0.04648962380268739, "grad_norm": 0.8338518738746643, "learning_rate": 0.0002985345430413442, "loss": 3.2231854248046874, "step": 22700 }, { "epoch": 0.04659202385511622, "grad_norm": 1.1333472728729248, "learning_rate": 0.0002985277927731196, "loss": 3.30358642578125, "step": 22750 }, { "epoch": 0.04669442390754504, "grad_norm": 0.8333401679992676, "learning_rate": 0.0002985210270705387, "loss": 3.2313726806640624, "step": 22800 }, { "epoch": 0.046796823959973866, "grad_norm": 0.926623523235321, "learning_rate": 0.0002985142459343045, "loss": 3.3423468017578126, "step": 22850 }, { "epoch": 0.04689922401240269, "grad_norm": 0.7728790640830994, "learning_rate": 0.00029850744936512177, "loss": 3.470130615234375, "step": 22900 }, { "epoch": 0.04700162406483152, "grad_norm": 1.0513544082641602, "learning_rate": 0.0002985006373636967, "loss": 4.155077514648437, "step": 22950 }, { "epoch": 0.04710402411726035, "grad_norm": 0.8886310458183289, "learning_rate": 0.00029849380993073716, "loss": 4.144877319335937, "step": 23000 }, { "epoch": 0.047206424169689176, "grad_norm": 0.615044116973877, "learning_rate": 0.0002984869670669527, "loss": 4.217498779296875, "step": 23050 }, { "epoch": 0.047308824222118004, "grad_norm": 1.0154633522033691, "learning_rate": 0.00029848010877305437, "loss": 3.5084097290039065, "step": 23100 }, { "epoch": 0.04741122427454683, "grad_norm": 1.1519191265106201, "learning_rate": 0.0002984732350497548, "loss": 4.138232727050781, "step": 23150 }, { "epoch": 0.04751362432697565, "grad_norm": 1.1761195659637451, "learning_rate": 0.0002984663458977683, "loss": 4.233868713378906, "step": 23200 }, { "epoch": 0.04761602437940448, "grad_norm": 1.0882890224456787, "learning_rate": 0.00029845944131781085, "loss": 3.8094412231445314, "step": 23250 }, { "epoch": 0.04771842443183331, "grad_norm": 1.145857810974121, "learning_rate": 0.0002984525213105998, "loss": 4.4981906127929685, "step": 23300 }, { "epoch": 0.047820824484262135, "grad_norm": 1.0446664094924927, "learning_rate": 0.0002984455858768544, "loss": 3.7824630737304688, "step": 23350 }, { "epoch": 0.04792322453669096, "grad_norm": 0.9234415292739868, "learning_rate": 0.0002984386350172952, "loss": 4.244895629882812, "step": 23400 }, { "epoch": 0.04802562458911979, "grad_norm": 0.8664620518684387, "learning_rate": 0.0002984316687326446, "loss": 4.05336181640625, "step": 23450 }, { "epoch": 0.04812802464154862, "grad_norm": 1.1607353687286377, "learning_rate": 0.0002984246870236265, "loss": 3.920790710449219, "step": 23500 }, { "epoch": 0.048230424693977446, "grad_norm": 1.0881608724594116, "learning_rate": 0.00029841768989096633, "loss": 4.012793273925781, "step": 23550 }, { "epoch": 0.048332824746406274, "grad_norm": 1.136512041091919, "learning_rate": 0.0002984106773353913, "loss": 3.7952926635742186, "step": 23600 }, { "epoch": 0.048435224798835094, "grad_norm": 0.9657559990882874, "learning_rate": 0.0002984036493576301, "loss": 3.48884033203125, "step": 23650 }, { "epoch": 0.04853762485126392, "grad_norm": 0.8505204319953918, "learning_rate": 0.000298396605958413, "loss": 3.842665710449219, "step": 23700 }, { "epoch": 0.04864002490369275, "grad_norm": 0.9779611825942993, "learning_rate": 0.00029838954713847193, "loss": 3.847880859375, "step": 23750 }, { "epoch": 0.04874242495612158, "grad_norm": 1.0220547914505005, "learning_rate": 0.0002983824728985404, "loss": 4.149264831542968, "step": 23800 }, { "epoch": 0.048844825008550405, "grad_norm": 1.3035789728164673, "learning_rate": 0.00029837538323935364, "loss": 4.045937194824218, "step": 23850 }, { "epoch": 0.04894722506097923, "grad_norm": 1.0806480646133423, "learning_rate": 0.00029836827816164826, "loss": 3.93858154296875, "step": 23900 }, { "epoch": 0.04904962511340806, "grad_norm": 1.0183125734329224, "learning_rate": 0.0002983611576661626, "loss": 3.665546875, "step": 23950 }, { "epoch": 0.04915202516583689, "grad_norm": 1.1539430618286133, "learning_rate": 0.0002983540217536367, "loss": 4.074727783203125, "step": 24000 }, { "epoch": 0.04925442521826571, "grad_norm": 1.0822535753250122, "learning_rate": 0.00029834687042481193, "loss": 4.032168579101563, "step": 24050 }, { "epoch": 0.049356825270694536, "grad_norm": 1.0588322877883911, "learning_rate": 0.00029833970368043153, "loss": 4.178402404785157, "step": 24100 }, { "epoch": 0.049459225323123364, "grad_norm": 0.7627548575401306, "learning_rate": 0.0002983325215212402, "loss": 4.084798889160156, "step": 24150 }, { "epoch": 0.04956162537555219, "grad_norm": 1.185702919960022, "learning_rate": 0.0002983253239479843, "loss": 4.136662292480469, "step": 24200 }, { "epoch": 0.04966402542798102, "grad_norm": 2.4309804439544678, "learning_rate": 0.0002983181109614118, "loss": 4.230069885253906, "step": 24250 }, { "epoch": 0.04976642548040985, "grad_norm": 1.0039188861846924, "learning_rate": 0.00029831088256227216, "loss": 3.9972125244140626, "step": 24300 }, { "epoch": 0.049868825532838675, "grad_norm": 0.9414103627204895, "learning_rate": 0.0002983036387513166, "loss": 4.060273742675781, "step": 24350 }, { "epoch": 0.0499712255852675, "grad_norm": 1.0714952945709229, "learning_rate": 0.0002982963795292978, "loss": 3.6833465576171873, "step": 24400 }, { "epoch": 0.05007362563769633, "grad_norm": 0.924064576625824, "learning_rate": 0.00029828910489697016, "loss": 3.9215875244140626, "step": 24450 }, { "epoch": 0.05017602569012515, "grad_norm": 0.9032275080680847, "learning_rate": 0.00029828181485508956, "loss": 4.0937020874023435, "step": 24500 }, { "epoch": 0.05027842574255398, "grad_norm": 0.9629778861999512, "learning_rate": 0.00029827450940441363, "loss": 3.5827789306640625, "step": 24550 }, { "epoch": 0.050380825794982806, "grad_norm": 1.0797669887542725, "learning_rate": 0.00029826718854570147, "loss": 3.6074313354492187, "step": 24600 }, { "epoch": 0.050483225847411634, "grad_norm": 1.1837302446365356, "learning_rate": 0.00029825985227971386, "loss": 3.8778558349609376, "step": 24650 }, { "epoch": 0.05058562589984046, "grad_norm": 1.0532505512237549, "learning_rate": 0.0002982525006072131, "loss": 4.007304382324219, "step": 24700 }, { "epoch": 0.05068802595226929, "grad_norm": 1.024993896484375, "learning_rate": 0.00029824513352896327, "loss": 4.1383056640625, "step": 24750 }, { "epoch": 0.050790426004698117, "grad_norm": 2.709007978439331, "learning_rate": 0.00029823775104572976, "loss": 3.71488525390625, "step": 24800 }, { "epoch": 0.050892826057126944, "grad_norm": 0.9420567750930786, "learning_rate": 0.0002982303531582799, "loss": 4.161868591308593, "step": 24850 }, { "epoch": 0.050995226109555765, "grad_norm": 1.638623595237732, "learning_rate": 0.0002982229398673822, "loss": 4.007568969726562, "step": 24900 }, { "epoch": 0.05109762616198459, "grad_norm": 0.7433112859725952, "learning_rate": 0.0002982155111738073, "loss": 3.716796875, "step": 24950 }, { "epoch": 0.05120002621441342, "grad_norm": 1.1634193658828735, "learning_rate": 0.00029820806707832694, "loss": 4.099712524414063, "step": 25000 }, { "epoch": 0.05130242626684225, "grad_norm": 1.0174721479415894, "learning_rate": 0.0002982006075817148, "loss": 3.70357666015625, "step": 25050 }, { "epoch": 0.051404826319271076, "grad_norm": 1.041905164718628, "learning_rate": 0.00029819313268474593, "loss": 3.85610107421875, "step": 25100 }, { "epoch": 0.0515072263716999, "grad_norm": 1.108231782913208, "learning_rate": 0.00029818564238819723, "loss": 4.048504333496094, "step": 25150 }, { "epoch": 0.05160962642412873, "grad_norm": 0.8780749440193176, "learning_rate": 0.00029817813669284695, "loss": 4.2607119750976565, "step": 25200 }, { "epoch": 0.05171202647655756, "grad_norm": 1.0939981937408447, "learning_rate": 0.0002981706155994751, "loss": 4.242766723632813, "step": 25250 }, { "epoch": 0.051814426528986386, "grad_norm": 0.9443891644477844, "learning_rate": 0.00029816307910886323, "loss": 4.077508850097656, "step": 25300 }, { "epoch": 0.05191682658141521, "grad_norm": 0.8710380792617798, "learning_rate": 0.00029815552722179447, "loss": 3.954695739746094, "step": 25350 }, { "epoch": 0.052019226633844035, "grad_norm": 0.9465594291687012, "learning_rate": 0.0002981479599390536, "loss": 3.9642620849609376, "step": 25400 }, { "epoch": 0.05212162668627286, "grad_norm": 1.2072516679763794, "learning_rate": 0.00029814037726142703, "loss": 3.5950994873046875, "step": 25450 }, { "epoch": 0.05222402673870169, "grad_norm": 0.9787052869796753, "learning_rate": 0.0002981327791897026, "loss": 3.669163818359375, "step": 25500 }, { "epoch": 0.05232642679113052, "grad_norm": 0.9823593497276306, "learning_rate": 0.00029812516572467, "loss": 3.70659423828125, "step": 25550 }, { "epoch": 0.052428826843559345, "grad_norm": 0.9548662304878235, "learning_rate": 0.00029811753686712024, "loss": 4.188983459472656, "step": 25600 }, { "epoch": 0.05253122689598817, "grad_norm": 0.8237021565437317, "learning_rate": 0.0002981098926178462, "loss": 4.097180786132813, "step": 25650 }, { "epoch": 0.052633626948417, "grad_norm": 0.8100720047950745, "learning_rate": 0.00029810223297764224, "loss": 4.057103271484375, "step": 25700 }, { "epoch": 0.05273602700084582, "grad_norm": 0.9498805403709412, "learning_rate": 0.00029809455794730424, "loss": 3.9076028442382813, "step": 25750 }, { "epoch": 0.05283842705327465, "grad_norm": 0.9514391422271729, "learning_rate": 0.00029808686752762984, "loss": 3.881569519042969, "step": 25800 }, { "epoch": 0.05294082710570348, "grad_norm": 0.5591891407966614, "learning_rate": 0.0002980791617194181, "loss": 4.816184692382812, "step": 25850 }, { "epoch": 0.053043227158132304, "grad_norm": 0.8840929269790649, "learning_rate": 0.0002980714405234698, "loss": 3.9826123046875, "step": 25900 }, { "epoch": 0.05314562721056113, "grad_norm": 0.6732226610183716, "learning_rate": 0.00029806370394058735, "loss": 3.7573004150390625, "step": 25950 }, { "epoch": 0.05324802726298996, "grad_norm": 1.1279404163360596, "learning_rate": 0.0002980559519715747, "loss": 3.7439083862304687, "step": 26000 }, { "epoch": 0.05335042731541879, "grad_norm": 1.28814697265625, "learning_rate": 0.0002980481846172372, "loss": 3.40891357421875, "step": 26050 }, { "epoch": 0.053452827367847615, "grad_norm": 0.8305365443229675, "learning_rate": 0.0002980404018783823, "loss": 3.9074551391601564, "step": 26100 }, { "epoch": 0.05355522742027644, "grad_norm": 1.059561848640442, "learning_rate": 0.0002980326037558186, "loss": 3.3790802001953124, "step": 26150 }, { "epoch": 0.05365762747270526, "grad_norm": 0.7863622903823853, "learning_rate": 0.00029802479025035645, "loss": 3.8910751342773438, "step": 26200 }, { "epoch": 0.05376002752513409, "grad_norm": 0.8412345051765442, "learning_rate": 0.0002980169613628078, "loss": 3.905106201171875, "step": 26250 }, { "epoch": 0.05386242757756292, "grad_norm": 0.6786169409751892, "learning_rate": 0.0002980091170939862, "loss": 3.6419586181640624, "step": 26300 }, { "epoch": 0.053964827629991746, "grad_norm": 0.8411727547645569, "learning_rate": 0.00029800125744470677, "loss": 3.3573968505859373, "step": 26350 }, { "epoch": 0.054067227682420574, "grad_norm": 0.9979608654975891, "learning_rate": 0.0002979933824157863, "loss": 3.6130526733398436, "step": 26400 }, { "epoch": 0.0541696277348494, "grad_norm": 0.8738940358161926, "learning_rate": 0.00029798549200804305, "loss": 3.2773031616210937, "step": 26450 }, { "epoch": 0.05427202778727823, "grad_norm": 0.8625099062919617, "learning_rate": 0.0002979775862222971, "loss": 3.92064453125, "step": 26500 }, { "epoch": 0.05437442783970706, "grad_norm": 1.1380776166915894, "learning_rate": 0.00029796966505936975, "loss": 3.9016488647460936, "step": 26550 }, { "epoch": 0.05447682789213588, "grad_norm": 0.8728241324424744, "learning_rate": 0.0002979617285200844, "loss": 4.155015258789063, "step": 26600 }, { "epoch": 0.054579227944564705, "grad_norm": 1.174974799156189, "learning_rate": 0.0002979537766052656, "loss": 3.755271301269531, "step": 26650 }, { "epoch": 0.05468162799699353, "grad_norm": 1.0797170400619507, "learning_rate": 0.00029794580931573973, "loss": 3.6002767944335936, "step": 26700 }, { "epoch": 0.05478402804942236, "grad_norm": 0.8095331192016602, "learning_rate": 0.0002979378266523347, "loss": 3.9049578857421876, "step": 26750 }, { "epoch": 0.05488642810185119, "grad_norm": 0.8785421252250671, "learning_rate": 0.00029792982861588007, "loss": 3.594248046875, "step": 26800 }, { "epoch": 0.054988828154280016, "grad_norm": 0.8992822766304016, "learning_rate": 0.0002979218152072069, "loss": 4.156261901855469, "step": 26850 }, { "epoch": 0.055091228206708844, "grad_norm": 1.633196234703064, "learning_rate": 0.000297913786427148, "loss": 3.608190612792969, "step": 26900 }, { "epoch": 0.05519362825913767, "grad_norm": 1.1997803449630737, "learning_rate": 0.0002979057422765376, "loss": 3.6971340942382813, "step": 26950 }, { "epoch": 0.0552960283115665, "grad_norm": 0.987196147441864, "learning_rate": 0.00029789768275621163, "loss": 3.6062017822265626, "step": 27000 }, { "epoch": 0.05539842836399532, "grad_norm": 1.0470249652862549, "learning_rate": 0.00029788960786700767, "loss": 3.6216055297851564, "step": 27050 }, { "epoch": 0.05550082841642415, "grad_norm": 1.3368786573410034, "learning_rate": 0.00029788151760976473, "loss": 3.4363177490234373, "step": 27100 }, { "epoch": 0.055603228468852975, "grad_norm": 1.0057690143585205, "learning_rate": 0.0002978734119853236, "loss": 2.9398748779296877, "step": 27150 }, { "epoch": 0.0557056285212818, "grad_norm": 1.0253512859344482, "learning_rate": 0.0002978652909945265, "loss": 3.5486212158203125, "step": 27200 }, { "epoch": 0.05580802857371063, "grad_norm": 0.9567630887031555, "learning_rate": 0.0002978571546382174, "loss": 3.531204833984375, "step": 27250 }, { "epoch": 0.05591042862613946, "grad_norm": 0.7189958691596985, "learning_rate": 0.00029784900291724174, "loss": 4.003550415039062, "step": 27300 }, { "epoch": 0.056012828678568286, "grad_norm": 0.7804083228111267, "learning_rate": 0.0002978408358324466, "loss": 3.952115173339844, "step": 27350 }, { "epoch": 0.05611522873099711, "grad_norm": 0.7131394743919373, "learning_rate": 0.00029783265338468077, "loss": 3.712818298339844, "step": 27400 }, { "epoch": 0.056217628783425934, "grad_norm": 0.9421349167823792, "learning_rate": 0.0002978244555747944, "loss": 3.955911865234375, "step": 27450 }, { "epoch": 0.05632002883585476, "grad_norm": 1.1702853441238403, "learning_rate": 0.0002978162424036395, "loss": 3.715908203125, "step": 27500 }, { "epoch": 0.05642242888828359, "grad_norm": 1.0307793617248535, "learning_rate": 0.0002978080138720694, "loss": 4.063231506347656, "step": 27550 }, { "epoch": 0.05652482894071242, "grad_norm": 1.0633025169372559, "learning_rate": 0.00029779976998093926, "loss": 3.9883132934570313, "step": 27600 }, { "epoch": 0.056627228993141245, "grad_norm": 1.6195343732833862, "learning_rate": 0.0002977915107311058, "loss": 4.001260681152344, "step": 27650 }, { "epoch": 0.05672962904557007, "grad_norm": 0.9477188587188721, "learning_rate": 0.00029778323612342716, "loss": 3.9925576782226564, "step": 27700 }, { "epoch": 0.0568320290979989, "grad_norm": 0.7277911305427551, "learning_rate": 0.00029777494615876337, "loss": 3.8298355102539063, "step": 27750 }, { "epoch": 0.05693442915042773, "grad_norm": 0.8074896931648254, "learning_rate": 0.0002977666408379757, "loss": 3.53470947265625, "step": 27800 }, { "epoch": 0.057036829202856555, "grad_norm": 1.870801568031311, "learning_rate": 0.0002977583201619273, "loss": 4.093720703125, "step": 27850 }, { "epoch": 0.057139229255285376, "grad_norm": 0.9061904549598694, "learning_rate": 0.00029774998413148283, "loss": 3.6751202392578124, "step": 27900 }, { "epoch": 0.057241629307714204, "grad_norm": 0.766776978969574, "learning_rate": 0.0002977416327475085, "loss": 3.7472940063476563, "step": 27950 }, { "epoch": 0.05734402936014303, "grad_norm": 0.9437297582626343, "learning_rate": 0.0002977332660108722, "loss": 3.1673342895507814, "step": 28000 }, { "epoch": 0.05744642941257186, "grad_norm": 0.9875741004943848, "learning_rate": 0.00029772488392244324, "loss": 3.69399658203125, "step": 28050 }, { "epoch": 0.057548829465000687, "grad_norm": 1.2089347839355469, "learning_rate": 0.00029771648648309275, "loss": 3.5663076782226564, "step": 28100 }, { "epoch": 0.057651229517429514, "grad_norm": 1.0613031387329102, "learning_rate": 0.00029770807369369334, "loss": 3.696695556640625, "step": 28150 }, { "epoch": 0.05775362956985834, "grad_norm": 1.1133229732513428, "learning_rate": 0.00029769964555511925, "loss": 3.527508544921875, "step": 28200 }, { "epoch": 0.05785602962228717, "grad_norm": 1.0089772939682007, "learning_rate": 0.0002976912020682463, "loss": 3.744898376464844, "step": 28250 }, { "epoch": 0.05795842967471599, "grad_norm": 0.9647061824798584, "learning_rate": 0.00029768274323395183, "loss": 3.6294049072265624, "step": 28300 }, { "epoch": 0.05806082972714482, "grad_norm": 1.680829644203186, "learning_rate": 0.00029767426905311485, "loss": 3.6455474853515626, "step": 28350 }, { "epoch": 0.058163229779573646, "grad_norm": 0.9101169109344482, "learning_rate": 0.00029766577952661607, "loss": 3.9211056518554686, "step": 28400 }, { "epoch": 0.05826562983200247, "grad_norm": 1.0310935974121094, "learning_rate": 0.00029765727465533764, "loss": 3.7476397705078126, "step": 28450 }, { "epoch": 0.0583680298844313, "grad_norm": 1.042888879776001, "learning_rate": 0.00029764875444016325, "loss": 4.1402108764648435, "step": 28500 }, { "epoch": 0.05847042993686013, "grad_norm": 1.0709255933761597, "learning_rate": 0.00029764021888197835, "loss": 3.9610775756835936, "step": 28550 }, { "epoch": 0.058572829989288956, "grad_norm": 1.027099370956421, "learning_rate": 0.00029763166798166995, "loss": 3.751552734375, "step": 28600 }, { "epoch": 0.058675230041717784, "grad_norm": 0.7349804639816284, "learning_rate": 0.0002976231017401266, "loss": 3.742770080566406, "step": 28650 }, { "epoch": 0.05877763009414661, "grad_norm": 1.0283441543579102, "learning_rate": 0.0002976145201582384, "loss": 3.7890921020507813, "step": 28700 }, { "epoch": 0.05888003014657543, "grad_norm": 0.8082360029220581, "learning_rate": 0.00029760592323689725, "loss": 4.149041748046875, "step": 28750 }, { "epoch": 0.05898243019900426, "grad_norm": 0.9537481665611267, "learning_rate": 0.00029759731097699635, "loss": 4.166469421386719, "step": 28800 }, { "epoch": 0.05908483025143309, "grad_norm": 1.1642649173736572, "learning_rate": 0.0002975886833794308, "loss": 4.074107360839844, "step": 28850 }, { "epoch": 0.059187230303861915, "grad_norm": 1.0695040225982666, "learning_rate": 0.00029758004044509707, "loss": 4.009411926269531, "step": 28900 }, { "epoch": 0.05928963035629074, "grad_norm": 0.933382511138916, "learning_rate": 0.00029757138217489324, "loss": 3.857533264160156, "step": 28950 }, { "epoch": 0.05939203040871957, "grad_norm": 1.0519219636917114, "learning_rate": 0.0002975627085697191, "loss": 3.5922341918945313, "step": 29000 }, { "epoch": 0.0594944304611484, "grad_norm": 0.879135251045227, "learning_rate": 0.00029755401963047596, "loss": 4.271012268066406, "step": 29050 }, { "epoch": 0.059596830513577226, "grad_norm": 1.0314289331436157, "learning_rate": 0.0002975453153580667, "loss": 3.891732177734375, "step": 29100 }, { "epoch": 0.05969923056600605, "grad_norm": 0.9761302471160889, "learning_rate": 0.000297536595753396, "loss": 3.860959167480469, "step": 29150 }, { "epoch": 0.059801630618434874, "grad_norm": 0.866371750831604, "learning_rate": 0.0002975278608173697, "loss": 3.8342303466796874, "step": 29200 }, { "epoch": 0.0599040306708637, "grad_norm": 0.9015768766403198, "learning_rate": 0.0002975191105508957, "loss": 3.8901824951171875, "step": 29250 }, { "epoch": 0.06000643072329253, "grad_norm": 0.9253438711166382, "learning_rate": 0.0002975103449548832, "loss": 3.8019094848632813, "step": 29300 }, { "epoch": 0.06010883077572136, "grad_norm": 0.7289124727249146, "learning_rate": 0.0002975015640302431, "loss": 3.34075439453125, "step": 29350 }, { "epoch": 0.060211230828150185, "grad_norm": 0.713688313961029, "learning_rate": 0.0002974927677778879, "loss": 3.235279235839844, "step": 29400 }, { "epoch": 0.06031363088057901, "grad_norm": 0.6275246143341064, "learning_rate": 0.0002974839561987316, "loss": 3.8884927368164064, "step": 29450 }, { "epoch": 0.06041603093300784, "grad_norm": 1.1090385913848877, "learning_rate": 0.0002974751292936899, "loss": 3.3796435546875, "step": 29500 }, { "epoch": 0.06051843098543667, "grad_norm": 0.8206045031547546, "learning_rate": 0.0002974662870636801, "loss": 3.9724603271484376, "step": 29550 }, { "epoch": 0.06062083103786549, "grad_norm": 1.3841317892074585, "learning_rate": 0.00029745742950962095, "loss": 3.951322021484375, "step": 29600 }, { "epoch": 0.060723231090294316, "grad_norm": 0.9978547692298889, "learning_rate": 0.000297448556632433, "loss": 4.5418917846679685, "step": 29650 }, { "epoch": 0.060825631142723144, "grad_norm": 0.9191545248031616, "learning_rate": 0.0002974396684330382, "loss": 3.5654345703125, "step": 29700 }, { "epoch": 0.06092803119515197, "grad_norm": 1.4994065761566162, "learning_rate": 0.0002974307649123602, "loss": 3.7218731689453124, "step": 29750 }, { "epoch": 0.0610304312475808, "grad_norm": 0.6516634821891785, "learning_rate": 0.0002974218460713242, "loss": 3.561522216796875, "step": 29800 }, { "epoch": 0.06113283130000963, "grad_norm": 1.1022820472717285, "learning_rate": 0.000297412911910857, "loss": 4.207413024902344, "step": 29850 }, { "epoch": 0.061235231352438454, "grad_norm": 0.861346960067749, "learning_rate": 0.000297403962431887, "loss": 3.959631042480469, "step": 29900 }, { "epoch": 0.06133763140486728, "grad_norm": 0.8098173141479492, "learning_rate": 0.0002973949976353442, "loss": 4.688843383789062, "step": 29950 }, { "epoch": 0.0614400314572961, "grad_norm": 0.8004640936851501, "learning_rate": 0.0002973860175221603, "loss": 4.384559631347656, "step": 30000 }, { "epoch": 0.06154243150972493, "grad_norm": 1.4548406600952148, "learning_rate": 0.0002973770220932683, "loss": 4.232876281738282, "step": 30050 }, { "epoch": 0.06164483156215376, "grad_norm": 1.1136951446533203, "learning_rate": 0.00029736801134960296, "loss": 4.017593994140625, "step": 30100 }, { "epoch": 0.061747231614582586, "grad_norm": 0.9526700377464294, "learning_rate": 0.00029735898529210074, "loss": 3.694122619628906, "step": 30150 }, { "epoch": 0.061849631667011414, "grad_norm": 0.9094407558441162, "learning_rate": 0.0002973499439216996, "loss": 2.5258544921875, "step": 30200 }, { "epoch": 0.06195203171944024, "grad_norm": 0.9788475632667542, "learning_rate": 0.000297340887239339, "loss": 3.6749945068359375, "step": 30250 }, { "epoch": 0.06205443177186907, "grad_norm": 0.9837728142738342, "learning_rate": 0.00029733181524596006, "loss": 3.9548965454101563, "step": 30300 }, { "epoch": 0.062156831824297896, "grad_norm": 0.9949678778648376, "learning_rate": 0.00029732272794250563, "loss": 3.194211730957031, "step": 30350 }, { "epoch": 0.062259231876726724, "grad_norm": 1.091620683670044, "learning_rate": 0.00029731362532991985, "loss": 3.8439263916015625, "step": 30400 }, { "epoch": 0.062361631929155545, "grad_norm": 0.759272575378418, "learning_rate": 0.0002973045074091488, "loss": 3.965645751953125, "step": 30450 }, { "epoch": 0.06246403198158437, "grad_norm": 0.9479434490203857, "learning_rate": 0.0002972953741811398, "loss": 3.6418606567382814, "step": 30500 }, { "epoch": 0.0625664320340132, "grad_norm": 0.8087990880012512, "learning_rate": 0.00029728622564684204, "loss": 3.7622882080078126, "step": 30550 }, { "epoch": 0.06266883208644203, "grad_norm": 1.2932571172714233, "learning_rate": 0.0002972770618072062, "loss": 4.1614468383789065, "step": 30600 }, { "epoch": 0.06277123213887086, "grad_norm": 0.6852632761001587, "learning_rate": 0.00029726788266318455, "loss": 3.5135552978515623, "step": 30650 }, { "epoch": 0.06287363219129968, "grad_norm": 0.9849332571029663, "learning_rate": 0.0002972586882157309, "loss": 3.3184869384765623, "step": 30700 }, { "epoch": 0.06297603224372851, "grad_norm": 1.1004332304000854, "learning_rate": 0.00029724947846580064, "loss": 3.4316140747070314, "step": 30750 }, { "epoch": 0.06307843229615734, "grad_norm": 0.9240966439247131, "learning_rate": 0.00029724025341435097, "loss": 4.058392333984375, "step": 30800 }, { "epoch": 0.06318083234858617, "grad_norm": 0.8939677476882935, "learning_rate": 0.0002972310130623404, "loss": 4.048366088867187, "step": 30850 }, { "epoch": 0.063283232401015, "grad_norm": 0.8218761086463928, "learning_rate": 0.00029722175741072915, "loss": 4.063833618164063, "step": 30900 }, { "epoch": 0.06338563245344382, "grad_norm": 0.9675712585449219, "learning_rate": 0.0002972124864604791, "loss": 3.7749728393554687, "step": 30950 }, { "epoch": 0.06348803250587265, "grad_norm": 1.2063570022583008, "learning_rate": 0.0002972032002125536, "loss": 3.5751220703125, "step": 31000 }, { "epoch": 0.06359043255830148, "grad_norm": 1.0709924697875977, "learning_rate": 0.00029719389866791755, "loss": 3.7293637084960936, "step": 31050 }, { "epoch": 0.06369283261073029, "grad_norm": 0.8866503834724426, "learning_rate": 0.0002971845818275377, "loss": 3.907535400390625, "step": 31100 }, { "epoch": 0.06379523266315912, "grad_norm": 1.0057551860809326, "learning_rate": 0.00029717524969238206, "loss": 3.222738037109375, "step": 31150 }, { "epoch": 0.06389763271558795, "grad_norm": 0.9129172563552856, "learning_rate": 0.0002971659022634205, "loss": 3.4403155517578123, "step": 31200 }, { "epoch": 0.06400003276801677, "grad_norm": 0.9336997866630554, "learning_rate": 0.0002971565395416243, "loss": 3.30571044921875, "step": 31250 }, { "epoch": 0.0641024328204456, "grad_norm": 1.1164926290512085, "learning_rate": 0.0002971471615279664, "loss": 3.8188116455078127, "step": 31300 }, { "epoch": 0.06420483287287443, "grad_norm": 0.8115789890289307, "learning_rate": 0.0002971377682234213, "loss": 3.9151617431640626, "step": 31350 }, { "epoch": 0.06430723292530326, "grad_norm": 0.9240061044692993, "learning_rate": 0.00029712835962896514, "loss": 3.709864196777344, "step": 31400 }, { "epoch": 0.06440963297773208, "grad_norm": 1.6785798072814941, "learning_rate": 0.0002971189357455756, "loss": 3.689013671875, "step": 31450 }, { "epoch": 0.06451203303016091, "grad_norm": 0.7833497524261475, "learning_rate": 0.0002971094965742321, "loss": 3.3715243530273438, "step": 31500 }, { "epoch": 0.06461443308258974, "grad_norm": 0.8799951076507568, "learning_rate": 0.0002971000421159153, "loss": 4.09000244140625, "step": 31550 }, { "epoch": 0.06471683313501857, "grad_norm": 0.7977895736694336, "learning_rate": 0.0002970905723716078, "loss": 4.248508911132813, "step": 31600 }, { "epoch": 0.0648192331874474, "grad_norm": 0.8709924221038818, "learning_rate": 0.00029708108734229365, "loss": 3.489057922363281, "step": 31650 }, { "epoch": 0.06492163323987622, "grad_norm": 0.8895650506019592, "learning_rate": 0.00029707158702895847, "loss": 3.898555908203125, "step": 31700 }, { "epoch": 0.06502403329230505, "grad_norm": 0.8814746737480164, "learning_rate": 0.00029706207143258945, "loss": 3.7208917236328123, "step": 31750 }, { "epoch": 0.06512643334473388, "grad_norm": 0.9977162480354309, "learning_rate": 0.0002970525405541755, "loss": 4.208245849609375, "step": 31800 }, { "epoch": 0.0652288333971627, "grad_norm": 0.7882950901985168, "learning_rate": 0.0002970429943947069, "loss": 3.7341409301757813, "step": 31850 }, { "epoch": 0.06533123344959153, "grad_norm": 0.9084259867668152, "learning_rate": 0.00029703343295517577, "loss": 3.782439880371094, "step": 31900 }, { "epoch": 0.06543363350202035, "grad_norm": 1.0745272636413574, "learning_rate": 0.0002970238562365756, "loss": 3.530187072753906, "step": 31950 }, { "epoch": 0.06553603355444917, "grad_norm": 0.7873273491859436, "learning_rate": 0.0002970142642399017, "loss": 3.5862966918945314, "step": 32000 }, { "epoch": 0.065638433606878, "grad_norm": 0.9771028757095337, "learning_rate": 0.0002970046569661506, "loss": 3.8827175903320312, "step": 32050 }, { "epoch": 0.06574083365930683, "grad_norm": 0.8443105816841125, "learning_rate": 0.00029699503441632085, "loss": 3.28268310546875, "step": 32100 }, { "epoch": 0.06584323371173566, "grad_norm": 0.7213400602340698, "learning_rate": 0.0002969853965914123, "loss": 3.13387939453125, "step": 32150 }, { "epoch": 0.06594563376416449, "grad_norm": 1.1795644760131836, "learning_rate": 0.0002969757434924265, "loss": 3.658702087402344, "step": 32200 }, { "epoch": 0.06604803381659331, "grad_norm": 0.7857722640037537, "learning_rate": 0.0002969660751203665, "loss": 3.4446502685546876, "step": 32250 }, { "epoch": 0.06615043386902214, "grad_norm": 1.0390616655349731, "learning_rate": 0.00029695639147623703, "loss": 3.644783630371094, "step": 32300 }, { "epoch": 0.06625283392145097, "grad_norm": 0.7487825155258179, "learning_rate": 0.00029694669256104446, "loss": 3.63455810546875, "step": 32350 }, { "epoch": 0.0663552339738798, "grad_norm": 0.8825246691703796, "learning_rate": 0.0002969369783757965, "loss": 3.3496524047851564, "step": 32400 }, { "epoch": 0.06645763402630862, "grad_norm": 1.1626224517822266, "learning_rate": 0.00029692724892150266, "loss": 3.726259460449219, "step": 32450 }, { "epoch": 0.06656003407873745, "grad_norm": 0.74493008852005, "learning_rate": 0.00029691750419917406, "loss": 3.7289053344726564, "step": 32500 }, { "epoch": 0.06666243413116628, "grad_norm": 0.6749517917633057, "learning_rate": 0.00029690774420982317, "loss": 3.5053274536132815, "step": 32550 }, { "epoch": 0.0667648341835951, "grad_norm": 1.099471926689148, "learning_rate": 0.0002968979689544644, "loss": 3.514427490234375, "step": 32600 }, { "epoch": 0.06686723423602393, "grad_norm": 0.9038723111152649, "learning_rate": 0.00029688817843411344, "loss": 3.616097106933594, "step": 32650 }, { "epoch": 0.06696963428845276, "grad_norm": 0.7338837385177612, "learning_rate": 0.0002968783726497877, "loss": 3.4425479125976564, "step": 32700 }, { "epoch": 0.06707203434088159, "grad_norm": 1.254689335823059, "learning_rate": 0.0002968685516025061, "loss": 3.3777651977539063, "step": 32750 }, { "epoch": 0.0671744343933104, "grad_norm": 0.8535405397415161, "learning_rate": 0.00029685871529328933, "loss": 4.319814758300781, "step": 32800 }, { "epoch": 0.06727683444573923, "grad_norm": 0.9299177527427673, "learning_rate": 0.00029684886372315935, "loss": 3.78345458984375, "step": 32850 }, { "epoch": 0.06737923449816806, "grad_norm": 1.0497288703918457, "learning_rate": 0.0002968389968931401, "loss": 3.619969787597656, "step": 32900 }, { "epoch": 0.06748163455059689, "grad_norm": 0.9285115599632263, "learning_rate": 0.00029682911480425673, "loss": 3.488844909667969, "step": 32950 }, { "epoch": 0.06758403460302571, "grad_norm": 1.2114810943603516, "learning_rate": 0.0002968192174575362, "loss": 3.8050308227539062, "step": 33000 }, { "epoch": 0.06768643465545454, "grad_norm": 0.9714403748512268, "learning_rate": 0.0002968093048540071, "loss": 3.402801208496094, "step": 33050 }, { "epoch": 0.06778883470788337, "grad_norm": 1.049149990081787, "learning_rate": 0.00029679937699469934, "loss": 3.4410101318359376, "step": 33100 }, { "epoch": 0.0678912347603122, "grad_norm": 0.8005252480506897, "learning_rate": 0.0002967894338806446, "loss": 3.9667138671875, "step": 33150 }, { "epoch": 0.06799363481274102, "grad_norm": 1.0901520252227783, "learning_rate": 0.00029677947551287625, "loss": 3.6659295654296873, "step": 33200 }, { "epoch": 0.06809603486516985, "grad_norm": 0.9532211422920227, "learning_rate": 0.000296769501892429, "loss": 3.9132586669921876, "step": 33250 }, { "epoch": 0.06819843491759868, "grad_norm": 1.3878906965255737, "learning_rate": 0.0002967595130203394, "loss": 3.927642822265625, "step": 33300 }, { "epoch": 0.06830083497002751, "grad_norm": 1.046176552772522, "learning_rate": 0.00029674950889764523, "loss": 3.9409329223632814, "step": 33350 }, { "epoch": 0.06840323502245634, "grad_norm": 0.8497132062911987, "learning_rate": 0.0002967394895253863, "loss": 3.60568359375, "step": 33400 }, { "epoch": 0.06850563507488516, "grad_norm": 1.1313576698303223, "learning_rate": 0.00029672945490460365, "loss": 4.024774780273438, "step": 33450 }, { "epoch": 0.06860803512731399, "grad_norm": 1.0038946866989136, "learning_rate": 0.00029671940503634006, "loss": 3.707646484375, "step": 33500 }, { "epoch": 0.06871043517974282, "grad_norm": 1.1383546590805054, "learning_rate": 0.0002967093399216399, "loss": 3.2730068969726562, "step": 33550 }, { "epoch": 0.06881283523217165, "grad_norm": 0.9146387577056885, "learning_rate": 0.00029669925956154905, "loss": 3.9269442749023438, "step": 33600 }, { "epoch": 0.06891523528460046, "grad_norm": 0.6965939402580261, "learning_rate": 0.000296689163957115, "loss": 2.7985528564453124, "step": 33650 }, { "epoch": 0.06901763533702929, "grad_norm": 0.8769970536231995, "learning_rate": 0.00029667905310938695, "loss": 4.186055908203125, "step": 33700 }, { "epoch": 0.06912003538945811, "grad_norm": 0.8398081660270691, "learning_rate": 0.0002966689270194154, "loss": 3.677633056640625, "step": 33750 }, { "epoch": 0.06922243544188694, "grad_norm": 0.7318697571754456, "learning_rate": 0.00029665878568825284, "loss": 4.001636352539062, "step": 33800 }, { "epoch": 0.06932483549431577, "grad_norm": 1.0592197179794312, "learning_rate": 0.00029664862911695286, "loss": 3.9292962646484373, "step": 33850 }, { "epoch": 0.0694272355467446, "grad_norm": 1.3345533609390259, "learning_rate": 0.0002966384573065711, "loss": 3.7566705322265626, "step": 33900 }, { "epoch": 0.06952963559917343, "grad_norm": 0.9815147519111633, "learning_rate": 0.00029662827025816443, "loss": 3.8881317138671876, "step": 33950 }, { "epoch": 0.06963203565160225, "grad_norm": 0.996683657169342, "learning_rate": 0.00029661806797279147, "loss": 3.9453826904296876, "step": 34000 }, { "epoch": 0.06973443570403108, "grad_norm": 1.0983341932296753, "learning_rate": 0.0002966078504515125, "loss": 4.025393371582031, "step": 34050 }, { "epoch": 0.06983683575645991, "grad_norm": 1.2514588832855225, "learning_rate": 0.0002965976176953891, "loss": 4.020445556640625, "step": 34100 }, { "epoch": 0.06993923580888874, "grad_norm": 0.7997650504112244, "learning_rate": 0.00029658736970548477, "loss": 3.7041055297851564, "step": 34150 }, { "epoch": 0.07004163586131756, "grad_norm": 0.7876397371292114, "learning_rate": 0.00029657710648286437, "loss": 3.2046856689453125, "step": 34200 }, { "epoch": 0.07014403591374639, "grad_norm": 0.9293930530548096, "learning_rate": 0.00029656682802859443, "loss": 3.7819009399414063, "step": 34250 }, { "epoch": 0.07024643596617522, "grad_norm": 0.6517935395240784, "learning_rate": 0.000296556534343743, "loss": 2.6689993286132814, "step": 34300 }, { "epoch": 0.07034883601860405, "grad_norm": 1.03813898563385, "learning_rate": 0.00029654622542937977, "loss": 2.5518731689453125, "step": 34350 }, { "epoch": 0.07045123607103287, "grad_norm": 0.7847388386726379, "learning_rate": 0.00029653590128657603, "loss": 3.8658258056640626, "step": 34400 }, { "epoch": 0.0705536361234617, "grad_norm": 0.9255051612854004, "learning_rate": 0.0002965255619164046, "loss": 3.4440853881835936, "step": 34450 }, { "epoch": 0.07065603617589052, "grad_norm": 0.8334102630615234, "learning_rate": 0.00029651520731993993, "loss": 3.837626647949219, "step": 34500 }, { "epoch": 0.07075843622831934, "grad_norm": 1.0661958456039429, "learning_rate": 0.000296504837498258, "loss": 4.052589111328125, "step": 34550 }, { "epoch": 0.07086083628074817, "grad_norm": 0.8307774662971497, "learning_rate": 0.0002964944524524363, "loss": 4.152563781738281, "step": 34600 }, { "epoch": 0.070963236333177, "grad_norm": 1.1851427555084229, "learning_rate": 0.00029648405218355415, "loss": 3.877910461425781, "step": 34650 }, { "epoch": 0.07106563638560583, "grad_norm": 1.024609088897705, "learning_rate": 0.0002964736366926923, "loss": 2.9125543212890626, "step": 34700 }, { "epoch": 0.07116803643803465, "grad_norm": 1.091864824295044, "learning_rate": 0.00029646320598093295, "loss": 3.8221173095703125, "step": 34750 }, { "epoch": 0.07127043649046348, "grad_norm": 0.9245619177818298, "learning_rate": 0.0002964527600493601, "loss": 2.984726867675781, "step": 34800 }, { "epoch": 0.07137283654289231, "grad_norm": 0.9920394420623779, "learning_rate": 0.0002964422988990592, "loss": 3.581501159667969, "step": 34850 }, { "epoch": 0.07147523659532114, "grad_norm": 0.7046719789505005, "learning_rate": 0.0002964318225311174, "loss": 2.9280935668945314, "step": 34900 }, { "epoch": 0.07157763664774996, "grad_norm": 0.7766258120536804, "learning_rate": 0.0002964213309466233, "loss": 2.3795321655273436, "step": 34950 }, { "epoch": 0.07168003670017879, "grad_norm": 1.3687994480133057, "learning_rate": 0.0002964108241466672, "loss": 3.43721923828125, "step": 35000 }, { "epoch": 0.07178243675260762, "grad_norm": 1.0140045881271362, "learning_rate": 0.00029640030213234084, "loss": 3.19546875, "step": 35050 }, { "epoch": 0.07188483680503645, "grad_norm": 0.8950518369674683, "learning_rate": 0.0002963897649047376, "loss": 3.794825134277344, "step": 35100 }, { "epoch": 0.07198723685746528, "grad_norm": 1.7291576862335205, "learning_rate": 0.0002963792124649526, "loss": 4.249531555175781, "step": 35150 }, { "epoch": 0.0720896369098941, "grad_norm": 1.7866417169570923, "learning_rate": 0.0002963686448140823, "loss": 3.9559259033203125, "step": 35200 }, { "epoch": 0.07219203696232293, "grad_norm": 0.9784793257713318, "learning_rate": 0.0002963580619532249, "loss": 3.6990866088867187, "step": 35250 }, { "epoch": 0.07229443701475176, "grad_norm": 1.6409136056900024, "learning_rate": 0.00029634746388348005, "loss": 3.6978335571289063, "step": 35300 }, { "epoch": 0.07239683706718057, "grad_norm": 1.109778642654419, "learning_rate": 0.00029633685060594914, "loss": 3.7638284301757814, "step": 35350 }, { "epoch": 0.0724992371196094, "grad_norm": 1.3247849941253662, "learning_rate": 0.000296326222121735, "loss": 4.101665954589844, "step": 35400 }, { "epoch": 0.07260163717203823, "grad_norm": 1.0803288221359253, "learning_rate": 0.0002963155784319421, "loss": 3.9325439453125, "step": 35450 }, { "epoch": 0.07270403722446706, "grad_norm": 1.2640902996063232, "learning_rate": 0.00029630491953767647, "loss": 3.4765811157226563, "step": 35500 }, { "epoch": 0.07280643727689588, "grad_norm": 1.0323841571807861, "learning_rate": 0.0002962942454400458, "loss": 3.8790185546875, "step": 35550 }, { "epoch": 0.07290883732932471, "grad_norm": 0.9365559816360474, "learning_rate": 0.0002962835561401592, "loss": 3.8441122436523436, "step": 35600 }, { "epoch": 0.07301123738175354, "grad_norm": 0.9189864993095398, "learning_rate": 0.00029627285163912753, "loss": 3.819436340332031, "step": 35650 }, { "epoch": 0.07311363743418237, "grad_norm": 1.2897831201553345, "learning_rate": 0.00029626213193806317, "loss": 3.544706115722656, "step": 35700 }, { "epoch": 0.0732160374866112, "grad_norm": 0.86373370885849, "learning_rate": 0.00029625139703807996, "loss": 3.7399945068359375, "step": 35750 }, { "epoch": 0.07331843753904002, "grad_norm": 1.0938329696655273, "learning_rate": 0.00029624064694029357, "loss": 3.89250244140625, "step": 35800 }, { "epoch": 0.07342083759146885, "grad_norm": 0.9408879280090332, "learning_rate": 0.000296229881645821, "loss": 3.056258239746094, "step": 35850 }, { "epoch": 0.07352323764389768, "grad_norm": 1.1271533966064453, "learning_rate": 0.0002962191011557809, "loss": 3.544586181640625, "step": 35900 }, { "epoch": 0.0736256376963265, "grad_norm": 1.011702537536621, "learning_rate": 0.0002962083054712936, "loss": 3.683125305175781, "step": 35950 }, { "epoch": 0.07372803774875533, "grad_norm": 0.8757224678993225, "learning_rate": 0.000296197494593481, "loss": 3.3673382568359376, "step": 36000 }, { "epoch": 0.07383043780118416, "grad_norm": 0.6535724997520447, "learning_rate": 0.00029618666852346644, "loss": 4.935340881347656, "step": 36050 }, { "epoch": 0.07393283785361299, "grad_norm": 0.6584002375602722, "learning_rate": 0.0002961758272623749, "loss": 4.499714660644531, "step": 36100 }, { "epoch": 0.07403523790604181, "grad_norm": 0.8999959230422974, "learning_rate": 0.000296164970811333, "loss": 4.462132568359375, "step": 36150 }, { "epoch": 0.07413763795847063, "grad_norm": 1.0016320943832397, "learning_rate": 0.00029615409917146886, "loss": 3.6168402099609374, "step": 36200 }, { "epoch": 0.07424003801089946, "grad_norm": 0.9233262538909912, "learning_rate": 0.0002961432123439122, "loss": 3.7079287719726564, "step": 36250 }, { "epoch": 0.07434243806332828, "grad_norm": 1.2862437963485718, "learning_rate": 0.0002961323103297944, "loss": 3.554483642578125, "step": 36300 }, { "epoch": 0.07444483811575711, "grad_norm": 1.0531319379806519, "learning_rate": 0.0002961213931302483, "loss": 3.9057122802734376, "step": 36350 }, { "epoch": 0.07454723816818594, "grad_norm": 1.0585157871246338, "learning_rate": 0.00029611046074640835, "loss": 4.065590209960938, "step": 36400 }, { "epoch": 0.07464963822061477, "grad_norm": 0.9923078417778015, "learning_rate": 0.00029609951317941067, "loss": 3.753091125488281, "step": 36450 }, { "epoch": 0.0747520382730436, "grad_norm": 1.4187533855438232, "learning_rate": 0.0002960885504303928, "loss": 3.81512939453125, "step": 36500 }, { "epoch": 0.07485443832547242, "grad_norm": 0.9602039456367493, "learning_rate": 0.000296077572500494, "loss": 3.3714874267578123, "step": 36550 }, { "epoch": 0.07495683837790125, "grad_norm": 0.9583538770675659, "learning_rate": 0.000296066579390855, "loss": 4.172694702148437, "step": 36600 }, { "epoch": 0.07505923843033008, "grad_norm": 0.9498001933097839, "learning_rate": 0.0002960555711026182, "loss": 3.799460144042969, "step": 36650 }, { "epoch": 0.0751616384827589, "grad_norm": 1.037429928779602, "learning_rate": 0.00029604454763692753, "loss": 3.3060308837890626, "step": 36700 }, { "epoch": 0.07526403853518773, "grad_norm": 0.9222440123558044, "learning_rate": 0.0002960335089949284, "loss": 3.724703063964844, "step": 36750 }, { "epoch": 0.07536643858761656, "grad_norm": 0.891686201095581, "learning_rate": 0.0002960224551777681, "loss": 3.8415121459960937, "step": 36800 }, { "epoch": 0.07546883864004539, "grad_norm": 1.1739381551742554, "learning_rate": 0.0002960113861865951, "loss": 3.6421640014648435, "step": 36850 }, { "epoch": 0.07557123869247422, "grad_norm": 1.118273138999939, "learning_rate": 0.0002960003020225598, "loss": 4.042873229980469, "step": 36900 }, { "epoch": 0.07567363874490304, "grad_norm": 1.1903246641159058, "learning_rate": 0.00029598920268681387, "loss": 3.7228439331054686, "step": 36950 }, { "epoch": 0.07577603879733187, "grad_norm": 0.8074274063110352, "learning_rate": 0.00029597808818051076, "loss": 3.74279296875, "step": 37000 }, { "epoch": 0.07587843884976068, "grad_norm": 0.7993521690368652, "learning_rate": 0.00029596695850480547, "loss": 3.5909658813476564, "step": 37050 }, { "epoch": 0.07598083890218951, "grad_norm": 0.9763518571853638, "learning_rate": 0.0002959558136608545, "loss": 3.4760845947265624, "step": 37100 }, { "epoch": 0.07608323895461834, "grad_norm": 0.9700019359588623, "learning_rate": 0.000295944653649816, "loss": 3.5362197875976564, "step": 37150 }, { "epoch": 0.07618563900704717, "grad_norm": 0.9611456990242004, "learning_rate": 0.0002959334784728497, "loss": 3.392528381347656, "step": 37200 }, { "epoch": 0.076288039059476, "grad_norm": 1.0106040239334106, "learning_rate": 0.0002959222881311168, "loss": 3.9602230834960936, "step": 37250 }, { "epoch": 0.07639043911190482, "grad_norm": 0.9530378580093384, "learning_rate": 0.00029591108262578023, "loss": 3.755385437011719, "step": 37300 }, { "epoch": 0.07649283916433365, "grad_norm": 1.2167539596557617, "learning_rate": 0.0002958998619580044, "loss": 3.5471917724609376, "step": 37350 }, { "epoch": 0.07659523921676248, "grad_norm": 0.6693082451820374, "learning_rate": 0.0002958886261289553, "loss": 2.953871154785156, "step": 37400 }, { "epoch": 0.0766976392691913, "grad_norm": 0.8044131398200989, "learning_rate": 0.0002958773751398004, "loss": 3.6775543212890627, "step": 37450 }, { "epoch": 0.07680003932162013, "grad_norm": 0.9389724731445312, "learning_rate": 0.00029586610899170904, "loss": 3.951288757324219, "step": 37500 }, { "epoch": 0.07690243937404896, "grad_norm": 0.9143916964530945, "learning_rate": 0.0002958548276858519, "loss": 3.773663330078125, "step": 37550 }, { "epoch": 0.07700483942647779, "grad_norm": 1.3429774045944214, "learning_rate": 0.0002958435312234012, "loss": 3.516551818847656, "step": 37600 }, { "epoch": 0.07710723947890662, "grad_norm": 0.9084817171096802, "learning_rate": 0.00029583221960553086, "loss": 3.5966671752929686, "step": 37650 }, { "epoch": 0.07720963953133544, "grad_norm": 0.9403077363967896, "learning_rate": 0.0002958208928334164, "loss": 3.7960610961914063, "step": 37700 }, { "epoch": 0.07731203958376427, "grad_norm": 0.9307132363319397, "learning_rate": 0.0002958095509082347, "loss": 3.526631164550781, "step": 37750 }, { "epoch": 0.0774144396361931, "grad_norm": 1.0403499603271484, "learning_rate": 0.0002957981938311645, "loss": 3.657856140136719, "step": 37800 }, { "epoch": 0.07751683968862193, "grad_norm": 0.8535223007202148, "learning_rate": 0.00029578682160338594, "loss": 3.4064453125, "step": 37850 }, { "epoch": 0.07761923974105074, "grad_norm": 0.6557066440582275, "learning_rate": 0.00029577543422608073, "loss": 3.3173226928710937, "step": 37900 }, { "epoch": 0.07772163979347957, "grad_norm": 1.1363869905471802, "learning_rate": 0.0002957640317004323, "loss": 3.3008172607421873, "step": 37950 }, { "epoch": 0.0778240398459084, "grad_norm": 0.9868215322494507, "learning_rate": 0.0002957526140276254, "loss": 3.6425216674804686, "step": 38000 }, { "epoch": 0.07792643989833722, "grad_norm": 0.6904926896095276, "learning_rate": 0.00029574118120884657, "loss": 3.5323916625976564, "step": 38050 }, { "epoch": 0.07802883995076605, "grad_norm": 1.4620712995529175, "learning_rate": 0.00029572973324528394, "loss": 3.751639709472656, "step": 38100 }, { "epoch": 0.07813124000319488, "grad_norm": 0.9893333315849304, "learning_rate": 0.000295718270138127, "loss": 3.4107330322265623, "step": 38150 }, { "epoch": 0.07823364005562371, "grad_norm": 0.8329883217811584, "learning_rate": 0.00029570679188856705, "loss": 3.1873550415039062, "step": 38200 }, { "epoch": 0.07833604010805253, "grad_norm": 1.5774136781692505, "learning_rate": 0.00029569529849779685, "loss": 3.432158203125, "step": 38250 }, { "epoch": 0.07843844016048136, "grad_norm": 0.8458206057548523, "learning_rate": 0.0002956837899670107, "loss": 3.302817077636719, "step": 38300 }, { "epoch": 0.07854084021291019, "grad_norm": 1.3035576343536377, "learning_rate": 0.00029567226629740445, "loss": 3.1465521240234375, "step": 38350 }, { "epoch": 0.07864324026533902, "grad_norm": 0.9802455902099609, "learning_rate": 0.00029566072749017574, "loss": 3.3001138305664064, "step": 38400 }, { "epoch": 0.07874564031776785, "grad_norm": 0.9335483312606812, "learning_rate": 0.00029564917354652355, "loss": 3.2266500854492186, "step": 38450 }, { "epoch": 0.07884804037019667, "grad_norm": 1.0493320226669312, "learning_rate": 0.0002956376044676485, "loss": 3.0587277221679687, "step": 38500 }, { "epoch": 0.0789504404226255, "grad_norm": 1.096993088722229, "learning_rate": 0.00029562602025475285, "loss": 4.07334716796875, "step": 38550 }, { "epoch": 0.07905284047505433, "grad_norm": 0.7419833540916443, "learning_rate": 0.0002956144209090403, "loss": 3.633370056152344, "step": 38600 }, { "epoch": 0.07915524052748316, "grad_norm": 1.1980725526809692, "learning_rate": 0.00029560280643171633, "loss": 3.7685275268554688, "step": 38650 }, { "epoch": 0.07925764057991198, "grad_norm": 2.977545738220215, "learning_rate": 0.00029559117682398774, "loss": 3.9755072021484374, "step": 38700 }, { "epoch": 0.0793600406323408, "grad_norm": 1.0279217958450317, "learning_rate": 0.0002955795320870631, "loss": 3.4383935546875, "step": 38750 }, { "epoch": 0.07946244068476963, "grad_norm": 0.9934809803962708, "learning_rate": 0.00029556787222215247, "loss": 3.663726501464844, "step": 38800 }, { "epoch": 0.07956484073719845, "grad_norm": 1.145448088645935, "learning_rate": 0.00029555619723046746, "loss": 3.51242431640625, "step": 38850 }, { "epoch": 0.07966724078962728, "grad_norm": 0.6591020226478577, "learning_rate": 0.00029554450711322133, "loss": 3.402906494140625, "step": 38900 }, { "epoch": 0.07976964084205611, "grad_norm": 0.9392556548118591, "learning_rate": 0.00029553280187162876, "loss": 3.1334713745117186, "step": 38950 }, { "epoch": 0.07987204089448494, "grad_norm": 1.0616618394851685, "learning_rate": 0.0002955210815069063, "loss": 3.934781494140625, "step": 39000 }, { "epoch": 0.07997444094691376, "grad_norm": 0.7064653038978577, "learning_rate": 0.0002955093460202717, "loss": 3.5139471435546876, "step": 39050 }, { "epoch": 0.08007684099934259, "grad_norm": 0.8327042460441589, "learning_rate": 0.0002954975954129445, "loss": 3.4110308837890626, "step": 39100 }, { "epoch": 0.08017924105177142, "grad_norm": 1.7272720336914062, "learning_rate": 0.0002954858296861459, "loss": 3.403736877441406, "step": 39150 }, { "epoch": 0.08028164110420025, "grad_norm": 0.7907924056053162, "learning_rate": 0.00029547404884109837, "loss": 2.493211212158203, "step": 39200 }, { "epoch": 0.08038404115662907, "grad_norm": 0.8305758237838745, "learning_rate": 0.00029546225287902623, "loss": 3.276422119140625, "step": 39250 }, { "epoch": 0.0804864412090579, "grad_norm": 1.02776300907135, "learning_rate": 0.0002954504418011552, "loss": 3.471695861816406, "step": 39300 }, { "epoch": 0.08058884126148673, "grad_norm": 1.1195554733276367, "learning_rate": 0.0002954386156087127, "loss": 3.8007080078125, "step": 39350 }, { "epoch": 0.08069124131391556, "grad_norm": 1.0131675004959106, "learning_rate": 0.00029542677430292755, "loss": 3.80172119140625, "step": 39400 }, { "epoch": 0.08079364136634438, "grad_norm": 1.0135658979415894, "learning_rate": 0.0002954149178850304, "loss": 3.3349169921875, "step": 39450 }, { "epoch": 0.08089604141877321, "grad_norm": 0.9203832149505615, "learning_rate": 0.00029540304635625316, "loss": 3.532286376953125, "step": 39500 }, { "epoch": 0.08099844147120204, "grad_norm": 0.9541352987289429, "learning_rate": 0.0002953911597178296, "loss": 3.9118218994140626, "step": 39550 }, { "epoch": 0.08110084152363085, "grad_norm": 0.8768864870071411, "learning_rate": 0.0002953792579709948, "loss": 2.9628286743164063, "step": 39600 }, { "epoch": 0.08120324157605968, "grad_norm": 0.9336755275726318, "learning_rate": 0.00029536734111698567, "loss": 3.8077597045898437, "step": 39650 }, { "epoch": 0.08130564162848851, "grad_norm": 0.7618170380592346, "learning_rate": 0.00029535540915704046, "loss": 3.6045367431640627, "step": 39700 }, { "epoch": 0.08140804168091734, "grad_norm": 0.7901123762130737, "learning_rate": 0.0002953434620923991, "loss": 3.8127349853515624, "step": 39750 }, { "epoch": 0.08151044173334616, "grad_norm": 0.90858393907547, "learning_rate": 0.0002953314999243032, "loss": 3.246180419921875, "step": 39800 }, { "epoch": 0.08161284178577499, "grad_norm": 1.1294665336608887, "learning_rate": 0.00029531952265399565, "loss": 3.8714260864257812, "step": 39850 }, { "epoch": 0.08171524183820382, "grad_norm": 0.8707161545753479, "learning_rate": 0.0002953075302827211, "loss": 3.1378076171875, "step": 39900 }, { "epoch": 0.08181764189063265, "grad_norm": 0.9953368902206421, "learning_rate": 0.0002952955228117258, "loss": 3.7785629272460937, "step": 39950 }, { "epoch": 0.08192004194306148, "grad_norm": 1.116952657699585, "learning_rate": 0.00029528350024225753, "loss": 3.962169494628906, "step": 40000 }, { "epoch": 0.0820224419954903, "grad_norm": 1.140546202659607, "learning_rate": 0.0002952714625755656, "loss": 4.067502136230469, "step": 40050 }, { "epoch": 0.08212484204791913, "grad_norm": 0.8150861859321594, "learning_rate": 0.0002952594098129008, "loss": 3.7707586669921875, "step": 40100 }, { "epoch": 0.08222724210034796, "grad_norm": 1.0610780715942383, "learning_rate": 0.00029524734195551577, "loss": 3.650087890625, "step": 40150 }, { "epoch": 0.08232964215277679, "grad_norm": 1.0550084114074707, "learning_rate": 0.00029523525900466453, "loss": 3.259920959472656, "step": 40200 }, { "epoch": 0.08243204220520561, "grad_norm": 0.9846071004867554, "learning_rate": 0.00029522316096160256, "loss": 3.823460998535156, "step": 40250 }, { "epoch": 0.08253444225763444, "grad_norm": 0.932036280632019, "learning_rate": 0.00029521104782758714, "loss": 3.7148446655273437, "step": 40300 }, { "epoch": 0.08263684231006327, "grad_norm": 0.7645614743232727, "learning_rate": 0.00029519891960387703, "loss": 2.9853546142578127, "step": 40350 }, { "epoch": 0.0827392423624921, "grad_norm": 0.9208267331123352, "learning_rate": 0.00029518677629173246, "loss": 3.2360791015625, "step": 40400 }, { "epoch": 0.08284164241492091, "grad_norm": 0.6297294497489929, "learning_rate": 0.0002951746178924153, "loss": 3.471868896484375, "step": 40450 }, { "epoch": 0.08294404246734974, "grad_norm": 0.7016891241073608, "learning_rate": 0.0002951624444071891, "loss": 1.757879180908203, "step": 40500 }, { "epoch": 0.08304644251977857, "grad_norm": 0.9499281048774719, "learning_rate": 0.00029515025583731877, "loss": 3.28075439453125, "step": 40550 }, { "epoch": 0.0831488425722074, "grad_norm": 1.3156087398529053, "learning_rate": 0.00029513805218407105, "loss": 3.5208966064453127, "step": 40600 }, { "epoch": 0.08325124262463622, "grad_norm": 0.9431557059288025, "learning_rate": 0.00029512583344871383, "loss": 3.794385986328125, "step": 40650 }, { "epoch": 0.08335364267706505, "grad_norm": 0.9936553835868835, "learning_rate": 0.0002951135996325171, "loss": 3.725905456542969, "step": 40700 }, { "epoch": 0.08345604272949388, "grad_norm": 0.9810524582862854, "learning_rate": 0.00029510135073675196, "loss": 3.7150784301757813, "step": 40750 }, { "epoch": 0.0835584427819227, "grad_norm": 0.9625670313835144, "learning_rate": 0.0002950890867626914, "loss": 3.2056814575195314, "step": 40800 }, { "epoch": 0.08366084283435153, "grad_norm": 0.8976542949676514, "learning_rate": 0.0002950768077116097, "loss": 3.6317849731445313, "step": 40850 }, { "epoch": 0.08376324288678036, "grad_norm": 0.835045576095581, "learning_rate": 0.00029506451358478293, "loss": 3.50963623046875, "step": 40900 }, { "epoch": 0.08386564293920919, "grad_norm": 0.9474772810935974, "learning_rate": 0.0002950522043834886, "loss": 3.357117919921875, "step": 40950 }, { "epoch": 0.08396804299163801, "grad_norm": 0.9703244566917419, "learning_rate": 0.0002950398801090059, "loss": 3.6155599975585937, "step": 41000 }, { "epoch": 0.08407044304406684, "grad_norm": 0.9425392746925354, "learning_rate": 0.0002950275407626154, "loss": 3.6346676635742186, "step": 41050 }, { "epoch": 0.08417284309649567, "grad_norm": 0.8194516897201538, "learning_rate": 0.00029501518634559947, "loss": 3.578563232421875, "step": 41100 }, { "epoch": 0.0842752431489245, "grad_norm": 0.9572771191596985, "learning_rate": 0.00029500281685924186, "loss": 3.8818609619140627, "step": 41150 }, { "epoch": 0.08437764320135333, "grad_norm": 0.9095619320869446, "learning_rate": 0.0002949904323048279, "loss": 4.035207214355469, "step": 41200 }, { "epoch": 0.08448004325378215, "grad_norm": 1.0001695156097412, "learning_rate": 0.0002949780326836447, "loss": 3.6542062377929687, "step": 41250 }, { "epoch": 0.08458244330621097, "grad_norm": 0.7178096175193787, "learning_rate": 0.00029496561799698064, "loss": 4.071335754394531, "step": 41300 }, { "epoch": 0.0846848433586398, "grad_norm": 1.246907114982605, "learning_rate": 0.0002949531882461258, "loss": 3.1734967041015625, "step": 41350 }, { "epoch": 0.08478724341106862, "grad_norm": 0.930445671081543, "learning_rate": 0.0002949407434323719, "loss": 3.6632540893554686, "step": 41400 }, { "epoch": 0.08488964346349745, "grad_norm": 1.0094538927078247, "learning_rate": 0.0002949282835570121, "loss": 3.3916510009765624, "step": 41450 }, { "epoch": 0.08499204351592628, "grad_norm": 0.8062929511070251, "learning_rate": 0.0002949158086213412, "loss": 3.894981689453125, "step": 41500 }, { "epoch": 0.0850944435683551, "grad_norm": 0.9255414009094238, "learning_rate": 0.0002949033186266555, "loss": 2.9597015380859375, "step": 41550 }, { "epoch": 0.08519684362078393, "grad_norm": 1.0212881565093994, "learning_rate": 0.00029489081357425296, "loss": 3.641199951171875, "step": 41600 }, { "epoch": 0.08529924367321276, "grad_norm": 1.0403817892074585, "learning_rate": 0.000294878293465433, "loss": 3.671487121582031, "step": 41650 }, { "epoch": 0.08540164372564159, "grad_norm": 1.1286342144012451, "learning_rate": 0.0002948657583014967, "loss": 3.9286517333984374, "step": 41700 }, { "epoch": 0.08550404377807042, "grad_norm": 0.8692294359207153, "learning_rate": 0.00029485320808374666, "loss": 3.6199725341796873, "step": 41750 }, { "epoch": 0.08560644383049924, "grad_norm": 0.9791249632835388, "learning_rate": 0.000294840642813487, "loss": 4.0375192260742185, "step": 41800 }, { "epoch": 0.08570884388292807, "grad_norm": 0.9460155367851257, "learning_rate": 0.0002948280624920234, "loss": 3.7891888427734375, "step": 41850 }, { "epoch": 0.0858112439353569, "grad_norm": 0.9373695254325867, "learning_rate": 0.0002948154671206633, "loss": 3.796607666015625, "step": 41900 }, { "epoch": 0.08591364398778573, "grad_norm": 0.9017443656921387, "learning_rate": 0.0002948028567007155, "loss": 3.572817077636719, "step": 41950 }, { "epoch": 0.08601604404021455, "grad_norm": 1.0376108884811401, "learning_rate": 0.0002947902312334904, "loss": 3.006630859375, "step": 42000 }, { "epoch": 0.08611844409264338, "grad_norm": 1.2481657266616821, "learning_rate": 0.00029477759072029985, "loss": 3.8225225830078124, "step": 42050 }, { "epoch": 0.08622084414507221, "grad_norm": 0.9803751707077026, "learning_rate": 0.00029476493516245766, "loss": 3.7674041748046876, "step": 42100 }, { "epoch": 0.08632324419750102, "grad_norm": 0.6287221908569336, "learning_rate": 0.00029475226456127877, "loss": 3.707611083984375, "step": 42150 }, { "epoch": 0.08642564424992985, "grad_norm": 0.8981334567070007, "learning_rate": 0.00029473957891807984, "loss": 3.517123107910156, "step": 42200 }, { "epoch": 0.08652804430235868, "grad_norm": 0.8311030864715576, "learning_rate": 0.0002947268782341792, "loss": 3.3404605102539064, "step": 42250 }, { "epoch": 0.0866304443547875, "grad_norm": 0.9770768284797668, "learning_rate": 0.00029471416251089657, "loss": 3.50871337890625, "step": 42300 }, { "epoch": 0.08673284440721633, "grad_norm": 0.9695789813995361, "learning_rate": 0.0002947014317495534, "loss": 2.9593045043945314, "step": 42350 }, { "epoch": 0.08683524445964516, "grad_norm": 1.279931902885437, "learning_rate": 0.0002946886859514726, "loss": 3.4032522583007814, "step": 42400 }, { "epoch": 0.08693764451207399, "grad_norm": 0.846924364566803, "learning_rate": 0.00029467592511797853, "loss": 3.4247207641601562, "step": 42450 }, { "epoch": 0.08704004456450282, "grad_norm": 0.9505274891853333, "learning_rate": 0.0002946631492503974, "loss": 3.43554931640625, "step": 42500 }, { "epoch": 0.08714244461693164, "grad_norm": 0.8762588500976562, "learning_rate": 0.00029465035835005664, "loss": 3.82656982421875, "step": 42550 }, { "epoch": 0.08724484466936047, "grad_norm": 1.11127769947052, "learning_rate": 0.0002946375524182856, "loss": 3.80216796875, "step": 42600 }, { "epoch": 0.0873472447217893, "grad_norm": 0.989086925983429, "learning_rate": 0.00029462473145641497, "loss": 3.4192626953125, "step": 42650 }, { "epoch": 0.08744964477421813, "grad_norm": 1.940375804901123, "learning_rate": 0.000294611895465777, "loss": 3.6263909912109376, "step": 42700 }, { "epoch": 0.08755204482664695, "grad_norm": 0.9889708161354065, "learning_rate": 0.0002945990444477056, "loss": 3.809459533691406, "step": 42750 }, { "epoch": 0.08765444487907578, "grad_norm": 0.7751711010932922, "learning_rate": 0.0002945861784035362, "loss": 3.572983703613281, "step": 42800 }, { "epoch": 0.08775684493150461, "grad_norm": 0.9419236779212952, "learning_rate": 0.0002945732973346057, "loss": 2.524838714599609, "step": 42850 }, { "epoch": 0.08785924498393344, "grad_norm": 0.8961177468299866, "learning_rate": 0.0002945604012422527, "loss": 3.410054016113281, "step": 42900 }, { "epoch": 0.08796164503636227, "grad_norm": 1.059244155883789, "learning_rate": 0.00029454749012781733, "loss": 3.40218994140625, "step": 42950 }, { "epoch": 0.08806404508879108, "grad_norm": 1.054032325744629, "learning_rate": 0.0002945345639926412, "loss": 3.8609942626953124, "step": 43000 }, { "epoch": 0.0881664451412199, "grad_norm": 1.5438601970672607, "learning_rate": 0.00029452162283806764, "loss": 3.6072705078125, "step": 43050 }, { "epoch": 0.08826884519364873, "grad_norm": 1.3585573434829712, "learning_rate": 0.0002945086666654413, "loss": 2.9131259155273437, "step": 43100 }, { "epoch": 0.08837124524607756, "grad_norm": 0.9829738736152649, "learning_rate": 0.0002944956954761086, "loss": 3.900540771484375, "step": 43150 }, { "epoch": 0.08847364529850639, "grad_norm": 0.8328433036804199, "learning_rate": 0.00029448270927141747, "loss": 2.7704718017578127, "step": 43200 }, { "epoch": 0.08857604535093522, "grad_norm": 0.9175614714622498, "learning_rate": 0.0002944697080527173, "loss": 3.7308444213867187, "step": 43250 }, { "epoch": 0.08867844540336405, "grad_norm": 0.991267204284668, "learning_rate": 0.0002944566918213592, "loss": 3.726257019042969, "step": 43300 }, { "epoch": 0.08878084545579287, "grad_norm": 0.8164985775947571, "learning_rate": 0.0002944436605786958, "loss": 3.6927761840820312, "step": 43350 }, { "epoch": 0.0888832455082217, "grad_norm": 0.6943197846412659, "learning_rate": 0.00029443061432608104, "loss": 3.4184146118164063, "step": 43400 }, { "epoch": 0.08898564556065053, "grad_norm": 0.7790623307228088, "learning_rate": 0.00029441755306487086, "loss": 2.976038818359375, "step": 43450 }, { "epoch": 0.08908804561307936, "grad_norm": 1.6038352251052856, "learning_rate": 0.00029440447679642245, "loss": 3.82299560546875, "step": 43500 }, { "epoch": 0.08919044566550818, "grad_norm": 0.8337633013725281, "learning_rate": 0.0002943913855220946, "loss": 3.1569622802734374, "step": 43550 }, { "epoch": 0.08929284571793701, "grad_norm": 0.7999888062477112, "learning_rate": 0.0002943782792432477, "loss": 1.872430877685547, "step": 43600 }, { "epoch": 0.08939524577036584, "grad_norm": 1.0902249813079834, "learning_rate": 0.00029436515796124374, "loss": 3.7972311401367187, "step": 43650 }, { "epoch": 0.08949764582279467, "grad_norm": 1.0413930416107178, "learning_rate": 0.0002943520216774462, "loss": 3.2716705322265627, "step": 43700 }, { "epoch": 0.0896000458752235, "grad_norm": 1.2011021375656128, "learning_rate": 0.00029433887039322017, "loss": 3.626478271484375, "step": 43750 }, { "epoch": 0.08970244592765232, "grad_norm": 0.9314827919006348, "learning_rate": 0.00029432570410993226, "loss": 2.8823446655273437, "step": 43800 }, { "epoch": 0.08980484598008114, "grad_norm": 0.7399948835372925, "learning_rate": 0.0002943125228289507, "loss": 4.1010568237304685, "step": 43850 }, { "epoch": 0.08990724603250996, "grad_norm": 0.8250963687896729, "learning_rate": 0.0002942993265516451, "loss": 3.1346914672851565, "step": 43900 }, { "epoch": 0.09000964608493879, "grad_norm": 0.9391615390777588, "learning_rate": 0.00029428611527938683, "loss": 3.304781188964844, "step": 43950 }, { "epoch": 0.09011204613736762, "grad_norm": 1.3079005479812622, "learning_rate": 0.0002942728890135488, "loss": 3.67544189453125, "step": 44000 }, { "epoch": 0.09021444618979645, "grad_norm": 0.8453630208969116, "learning_rate": 0.0002942596477555054, "loss": 3.5969857788085937, "step": 44050 }, { "epoch": 0.09031684624222527, "grad_norm": 0.8319234251976013, "learning_rate": 0.0002942463915066326, "loss": 3.4878445434570313, "step": 44100 }, { "epoch": 0.0904192462946541, "grad_norm": 0.8988983035087585, "learning_rate": 0.0002942331202683078, "loss": 3.648201599121094, "step": 44150 }, { "epoch": 0.09052164634708293, "grad_norm": 0.9342271089553833, "learning_rate": 0.00029421983404191027, "loss": 3.274960632324219, "step": 44200 }, { "epoch": 0.09062404639951176, "grad_norm": 0.6616138219833374, "learning_rate": 0.0002942065328288206, "loss": 3.499600830078125, "step": 44250 }, { "epoch": 0.09072644645194058, "grad_norm": 0.5895013213157654, "learning_rate": 0.00029419321663042106, "loss": 3.312397155761719, "step": 44300 }, { "epoch": 0.09082884650436941, "grad_norm": 0.9072443246841431, "learning_rate": 0.0002941798854480952, "loss": 3.4259283447265627, "step": 44350 }, { "epoch": 0.09093124655679824, "grad_norm": 0.9704260230064392, "learning_rate": 0.00029416653928322854, "loss": 3.4393576049804686, "step": 44400 }, { "epoch": 0.09103364660922707, "grad_norm": 1.2480257749557495, "learning_rate": 0.0002941531781372079, "loss": 3.769176025390625, "step": 44450 }, { "epoch": 0.0911360466616559, "grad_norm": 0.7178895473480225, "learning_rate": 0.0002941398020114217, "loss": 3.466965637207031, "step": 44500 }, { "epoch": 0.09123844671408472, "grad_norm": 0.870692789554596, "learning_rate": 0.0002941264109072599, "loss": 3.033865661621094, "step": 44550 }, { "epoch": 0.09134084676651355, "grad_norm": 0.8359827399253845, "learning_rate": 0.0002941130048261141, "loss": 2.9268179321289063, "step": 44600 }, { "epoch": 0.09144324681894238, "grad_norm": 0.8175760507583618, "learning_rate": 0.0002940995837693774, "loss": 3.4120257568359373, "step": 44650 }, { "epoch": 0.09154564687137119, "grad_norm": 0.9331879615783691, "learning_rate": 0.00029408614773844435, "loss": 3.2513809204101562, "step": 44700 }, { "epoch": 0.09164804692380002, "grad_norm": 1.0784333944320679, "learning_rate": 0.0002940726967347113, "loss": 3.6439849853515627, "step": 44750 }, { "epoch": 0.09175044697622885, "grad_norm": 1.1995110511779785, "learning_rate": 0.000294059230759576, "loss": 3.4360501098632814, "step": 44800 }, { "epoch": 0.09185284702865767, "grad_norm": 1.0588244199752808, "learning_rate": 0.0002940457498144377, "loss": 3.7745849609375, "step": 44850 }, { "epoch": 0.0919552470810865, "grad_norm": 0.8767450451850891, "learning_rate": 0.0002940322539006973, "loss": 3.844217529296875, "step": 44900 }, { "epoch": 0.09205764713351533, "grad_norm": 1.0808109045028687, "learning_rate": 0.00029401874301975727, "loss": 3.6274505615234376, "step": 44950 }, { "epoch": 0.09216004718594416, "grad_norm": 0.7221155762672424, "learning_rate": 0.00029400521717302166, "loss": 3.7380535888671873, "step": 45000 }, { "epoch": 0.09226244723837299, "grad_norm": 0.847489595413208, "learning_rate": 0.0002939916763618958, "loss": 3.911776428222656, "step": 45050 }, { "epoch": 0.09236484729080181, "grad_norm": 0.9442451596260071, "learning_rate": 0.00029397812058778707, "loss": 3.614713439941406, "step": 45100 }, { "epoch": 0.09246724734323064, "grad_norm": 0.8224995136260986, "learning_rate": 0.0002939645498521039, "loss": 2.9758087158203126, "step": 45150 }, { "epoch": 0.09256964739565947, "grad_norm": 0.5964682698249817, "learning_rate": 0.0002939509641562567, "loss": 1.742069854736328, "step": 45200 }, { "epoch": 0.0926720474480883, "grad_norm": 0.5689894556999207, "learning_rate": 0.000293937363501657, "loss": 2.5378482055664064, "step": 45250 }, { "epoch": 0.09277444750051712, "grad_norm": 1.1072094440460205, "learning_rate": 0.00029392374788971833, "loss": 3.6468490600585937, "step": 45300 }, { "epoch": 0.09287684755294595, "grad_norm": 0.8903659582138062, "learning_rate": 0.0002939101173218555, "loss": 3.3196234130859374, "step": 45350 }, { "epoch": 0.09297924760537478, "grad_norm": 0.7800249457359314, "learning_rate": 0.0002938964717994849, "loss": 3.895177001953125, "step": 45400 }, { "epoch": 0.0930816476578036, "grad_norm": 0.8999978303909302, "learning_rate": 0.00029388281132402454, "loss": 3.9075274658203125, "step": 45450 }, { "epoch": 0.09318404771023243, "grad_norm": 0.8825941681861877, "learning_rate": 0.00029386913589689393, "loss": 3.787184753417969, "step": 45500 }, { "epoch": 0.09328644776266125, "grad_norm": 0.8387885689735413, "learning_rate": 0.0002938554455195142, "loss": 3.0614547729492188, "step": 45550 }, { "epoch": 0.09338884781509008, "grad_norm": 0.8926045894622803, "learning_rate": 0.000293841740193308, "loss": 3.60509765625, "step": 45600 }, { "epoch": 0.0934912478675189, "grad_norm": 0.8237414956092834, "learning_rate": 0.00029382801991969945, "loss": 3.7173092651367186, "step": 45650 }, { "epoch": 0.09359364791994773, "grad_norm": 0.9613683223724365, "learning_rate": 0.0002938142847001144, "loss": 3.6430303955078127, "step": 45700 }, { "epoch": 0.09369604797237656, "grad_norm": 0.6676517724990845, "learning_rate": 0.0002938005345359801, "loss": 3.367287292480469, "step": 45750 }, { "epoch": 0.09379844802480539, "grad_norm": 1.0207992792129517, "learning_rate": 0.0002937867694287254, "loss": 3.720477294921875, "step": 45800 }, { "epoch": 0.09390084807723421, "grad_norm": 1.102271318435669, "learning_rate": 0.00029377298937978077, "loss": 3.620904846191406, "step": 45850 }, { "epoch": 0.09400324812966304, "grad_norm": 0.7163046598434448, "learning_rate": 0.0002937591943905781, "loss": 2.606007080078125, "step": 45900 }, { "epoch": 0.09410564818209187, "grad_norm": 0.4335630238056183, "learning_rate": 0.0002937453844625509, "loss": 1.4070957946777343, "step": 45950 }, { "epoch": 0.0942080482345207, "grad_norm": 0.6361094117164612, "learning_rate": 0.0002937315595971343, "loss": 3.254652099609375, "step": 46000 }, { "epoch": 0.09431044828694952, "grad_norm": 0.8255375027656555, "learning_rate": 0.0002937177197957649, "loss": 3.422788391113281, "step": 46050 }, { "epoch": 0.09441284833937835, "grad_norm": 0.5633572340011597, "learning_rate": 0.0002937038650598809, "loss": 3.6044976806640623, "step": 46100 }, { "epoch": 0.09451524839180718, "grad_norm": 1.2230876684188843, "learning_rate": 0.0002936899953909219, "loss": 3.22015869140625, "step": 46150 }, { "epoch": 0.09461764844423601, "grad_norm": 0.8899142742156982, "learning_rate": 0.0002936761107903293, "loss": 4.0206103515625, "step": 46200 }, { "epoch": 0.09472004849666484, "grad_norm": 0.9843432307243347, "learning_rate": 0.00029366221125954586, "loss": 3.70310791015625, "step": 46250 }, { "epoch": 0.09482244854909366, "grad_norm": 0.9883196353912354, "learning_rate": 0.000293648296800016, "loss": 3.84428466796875, "step": 46300 }, { "epoch": 0.09492484860152249, "grad_norm": 0.8828408718109131, "learning_rate": 0.0002936343674131856, "loss": 3.496847839355469, "step": 46350 }, { "epoch": 0.0950272486539513, "grad_norm": 0.6513479351997375, "learning_rate": 0.0002936204231005023, "loss": 3.3328936767578123, "step": 46400 }, { "epoch": 0.09512964870638013, "grad_norm": 0.9128335118293762, "learning_rate": 0.0002936064638634149, "loss": 2.726371154785156, "step": 46450 }, { "epoch": 0.09523204875880896, "grad_norm": 0.9786936044692993, "learning_rate": 0.00029359248970337406, "loss": 3.190602111816406, "step": 46500 }, { "epoch": 0.09533444881123779, "grad_norm": 0.8290608525276184, "learning_rate": 0.00029357850062183203, "loss": 3.8881295776367186, "step": 46550 }, { "epoch": 0.09543684886366662, "grad_norm": 0.9058592319488525, "learning_rate": 0.0002935644966202424, "loss": 3.583518371582031, "step": 46600 }, { "epoch": 0.09553924891609544, "grad_norm": 1.0789927244186401, "learning_rate": 0.00029355047770006034, "loss": 3.5978643798828127, "step": 46650 }, { "epoch": 0.09564164896852427, "grad_norm": 0.9313961863517761, "learning_rate": 0.00029353644386274273, "loss": 3.6306307983398436, "step": 46700 }, { "epoch": 0.0957440490209531, "grad_norm": 0.9315224289894104, "learning_rate": 0.00029352239510974787, "loss": 3.5369802856445314, "step": 46750 }, { "epoch": 0.09584644907338193, "grad_norm": 0.8773080110549927, "learning_rate": 0.0002935083314425357, "loss": 3.766584777832031, "step": 46800 }, { "epoch": 0.09594884912581075, "grad_norm": 0.8457773923873901, "learning_rate": 0.00029349425286256763, "loss": 3.876020812988281, "step": 46850 }, { "epoch": 0.09605124917823958, "grad_norm": 0.9530948400497437, "learning_rate": 0.00029348015937130656, "loss": 3.862485046386719, "step": 46900 }, { "epoch": 0.09615364923066841, "grad_norm": 1.1303527355194092, "learning_rate": 0.0002934660509702171, "loss": 2.8852374267578127, "step": 46950 }, { "epoch": 0.09625604928309724, "grad_norm": 1.0009031295776367, "learning_rate": 0.0002934519276607653, "loss": 3.7352252197265625, "step": 47000 }, { "epoch": 0.09635844933552606, "grad_norm": 0.5930376052856445, "learning_rate": 0.00029343778944441887, "loss": 2.9531982421875, "step": 47050 }, { "epoch": 0.09646084938795489, "grad_norm": 0.8898753523826599, "learning_rate": 0.0002934236363226469, "loss": 3.4945404052734377, "step": 47100 }, { "epoch": 0.09656324944038372, "grad_norm": 1.1294547319412231, "learning_rate": 0.00029340946829692013, "loss": 3.6753500366210936, "step": 47150 }, { "epoch": 0.09666564949281255, "grad_norm": 1.0108319520950317, "learning_rate": 0.00029339528536871087, "loss": 3.531564025878906, "step": 47200 }, { "epoch": 0.09676804954524136, "grad_norm": 1.2252336740493774, "learning_rate": 0.00029338108753949296, "loss": 3.4618963623046874, "step": 47250 }, { "epoch": 0.09687044959767019, "grad_norm": 0.975235641002655, "learning_rate": 0.0002933668748107418, "loss": 3.814194641113281, "step": 47300 }, { "epoch": 0.09697284965009902, "grad_norm": 0.9449312090873718, "learning_rate": 0.00029335264718393424, "loss": 3.584350891113281, "step": 47350 }, { "epoch": 0.09707524970252784, "grad_norm": 0.677931010723114, "learning_rate": 0.00029333840466054875, "loss": 3.3688113403320314, "step": 47400 }, { "epoch": 0.09717764975495667, "grad_norm": 0.8441532254219055, "learning_rate": 0.0002933241472420654, "loss": 3.5264968872070312, "step": 47450 }, { "epoch": 0.0972800498073855, "grad_norm": 1.312727451324463, "learning_rate": 0.0002933098749299657, "loss": 3.4481561279296873, "step": 47500 }, { "epoch": 0.09738244985981433, "grad_norm": 1.3327820301055908, "learning_rate": 0.0002932955877257329, "loss": 3.394440002441406, "step": 47550 }, { "epoch": 0.09748484991224315, "grad_norm": 0.9248800277709961, "learning_rate": 0.00029328128563085154, "loss": 3.8233456420898437, "step": 47600 }, { "epoch": 0.09758724996467198, "grad_norm": 0.8401134014129639, "learning_rate": 0.00029326696864680787, "loss": 3.518874206542969, "step": 47650 }, { "epoch": 0.09768965001710081, "grad_norm": 1.0612273216247559, "learning_rate": 0.0002932526367750896, "loss": 3.5591195678710936, "step": 47700 }, { "epoch": 0.09779205006952964, "grad_norm": 0.7437798380851746, "learning_rate": 0.00029323829001718613, "loss": 3.0408529663085937, "step": 47750 }, { "epoch": 0.09789445012195847, "grad_norm": 0.8572849631309509, "learning_rate": 0.0002932239283745882, "loss": 3.37136474609375, "step": 47800 }, { "epoch": 0.09799685017438729, "grad_norm": 1.062315583229065, "learning_rate": 0.0002932095518487883, "loss": 3.3033380126953125, "step": 47850 }, { "epoch": 0.09809925022681612, "grad_norm": 1.2052414417266846, "learning_rate": 0.0002931951604412804, "loss": 2.8181661987304687, "step": 47900 }, { "epoch": 0.09820165027924495, "grad_norm": 1.1409345865249634, "learning_rate": 0.00029318075415355984, "loss": 3.9486019897460936, "step": 47950 }, { "epoch": 0.09830405033167378, "grad_norm": 1.0399415493011475, "learning_rate": 0.0002931663329871238, "loss": 3.3418069458007813, "step": 48000 }, { "epoch": 0.0984064503841026, "grad_norm": 0.919865608215332, "learning_rate": 0.0002931518969434708, "loss": 3.688287658691406, "step": 48050 }, { "epoch": 0.09850885043653142, "grad_norm": 0.994552493095398, "learning_rate": 0.000293137446024101, "loss": 3.589768371582031, "step": 48100 }, { "epoch": 0.09861125048896024, "grad_norm": 0.9309687614440918, "learning_rate": 0.00029312298023051605, "loss": 3.7281314086914064, "step": 48150 }, { "epoch": 0.09871365054138907, "grad_norm": 1.0229836702346802, "learning_rate": 0.0002931084995642192, "loss": 3.7916598510742188, "step": 48200 }, { "epoch": 0.0988160505938179, "grad_norm": 0.7249611616134644, "learning_rate": 0.0002930940040267152, "loss": 3.300664367675781, "step": 48250 }, { "epoch": 0.09891845064624673, "grad_norm": 1.0371336936950684, "learning_rate": 0.0002930794936195104, "loss": 3.6068963623046875, "step": 48300 }, { "epoch": 0.09902085069867556, "grad_norm": 1.082552194595337, "learning_rate": 0.0002930649683441126, "loss": 3.424382629394531, "step": 48350 }, { "epoch": 0.09912325075110438, "grad_norm": 1.1194738149642944, "learning_rate": 0.0002930504282020312, "loss": 3.5506494140625, "step": 48400 }, { "epoch": 0.09922565080353321, "grad_norm": 0.8479589223861694, "learning_rate": 0.00029303587319477715, "loss": 3.7008261108398437, "step": 48450 }, { "epoch": 0.09932805085596204, "grad_norm": 1.6099497079849243, "learning_rate": 0.00029302130332386307, "loss": 3.1615875244140623, "step": 48500 }, { "epoch": 0.09943045090839087, "grad_norm": 0.9683935046195984, "learning_rate": 0.00029300671859080275, "loss": 3.681039123535156, "step": 48550 }, { "epoch": 0.0995328509608197, "grad_norm": 1.2820624113082886, "learning_rate": 0.000292992118997112, "loss": 3.5511508178710938, "step": 48600 }, { "epoch": 0.09963525101324852, "grad_norm": 0.9168037176132202, "learning_rate": 0.00029297750454430785, "loss": 3.657781677246094, "step": 48650 }, { "epoch": 0.09973765106567735, "grad_norm": 4.044058322906494, "learning_rate": 0.000292962875233909, "loss": 3.3354425048828125, "step": 48700 }, { "epoch": 0.09984005111810618, "grad_norm": 1.3989704847335815, "learning_rate": 0.00029294823106743565, "loss": 3.1698623657226563, "step": 48750 }, { "epoch": 0.099942451170535, "grad_norm": 1.0335566997528076, "learning_rate": 0.00029293357204640953, "loss": 2.8218838500976564, "step": 48800 }, { "epoch": 0.10004485122296383, "grad_norm": 0.9593238234519958, "learning_rate": 0.00029291889817235396, "loss": 3.712968444824219, "step": 48850 }, { "epoch": 0.10014725127539266, "grad_norm": 0.8883773684501648, "learning_rate": 0.0002929042094467938, "loss": 3.7107012939453123, "step": 48900 }, { "epoch": 0.10024965132782147, "grad_norm": 0.817356526851654, "learning_rate": 0.00029288950587125543, "loss": 3.442810363769531, "step": 48950 }, { "epoch": 0.1003520513802503, "grad_norm": 0.7103580236434937, "learning_rate": 0.0002928747874472667, "loss": 3.518086242675781, "step": 49000 }, { "epoch": 0.10045445143267913, "grad_norm": 0.9503324031829834, "learning_rate": 0.0002928600541763573, "loss": 3.3786196899414063, "step": 49050 }, { "epoch": 0.10055685148510796, "grad_norm": 1.2780108451843262, "learning_rate": 0.000292845306060058, "loss": 3.57686767578125, "step": 49100 }, { "epoch": 0.10065925153753678, "grad_norm": 0.7724966406822205, "learning_rate": 0.0002928305430999015, "loss": 3.4349874877929687, "step": 49150 }, { "epoch": 0.10076165158996561, "grad_norm": 0.73604816198349, "learning_rate": 0.0002928157652974219, "loss": 2.7600396728515624, "step": 49200 }, { "epoch": 0.10086405164239444, "grad_norm": 1.0744940042495728, "learning_rate": 0.00029280097265415477, "loss": 3.0249954223632813, "step": 49250 }, { "epoch": 0.10096645169482327, "grad_norm": 1.065955400466919, "learning_rate": 0.0002927861651716373, "loss": 3.8623785400390624, "step": 49300 }, { "epoch": 0.1010688517472521, "grad_norm": 0.9593607783317566, "learning_rate": 0.00029277134285140833, "loss": 3.2714468383789064, "step": 49350 }, { "epoch": 0.10117125179968092, "grad_norm": 0.875238835811615, "learning_rate": 0.00029275650569500803, "loss": 3.738236999511719, "step": 49400 }, { "epoch": 0.10127365185210975, "grad_norm": 0.6924005150794983, "learning_rate": 0.00029274165370397827, "loss": 3.841283874511719, "step": 49450 }, { "epoch": 0.10137605190453858, "grad_norm": 0.8927252292633057, "learning_rate": 0.00029272678687986236, "loss": 3.7357077026367187, "step": 49500 }, { "epoch": 0.1014784519569674, "grad_norm": 0.9974352121353149, "learning_rate": 0.0002927119052242052, "loss": 3.653075866699219, "step": 49550 }, { "epoch": 0.10158085200939623, "grad_norm": 0.9928423762321472, "learning_rate": 0.00029269700873855325, "loss": 3.923564147949219, "step": 49600 }, { "epoch": 0.10168325206182506, "grad_norm": 0.727768063545227, "learning_rate": 0.0002926820974244544, "loss": 3.398980712890625, "step": 49650 }, { "epoch": 0.10178565211425389, "grad_norm": 0.5926229357719421, "learning_rate": 0.00029266717128345837, "loss": 2.8432931518554687, "step": 49700 }, { "epoch": 0.10188805216668272, "grad_norm": 0.7883087992668152, "learning_rate": 0.000292652230317116, "loss": 3.0143255615234374, "step": 49750 }, { "epoch": 0.10199045221911153, "grad_norm": 0.7131246328353882, "learning_rate": 0.00029263727452698, "loss": 2.733319091796875, "step": 49800 }, { "epoch": 0.10209285227154036, "grad_norm": 0.6186954379081726, "learning_rate": 0.0002926223039146045, "loss": 2.7473519897460936, "step": 49850 }, { "epoch": 0.10219525232396919, "grad_norm": 1.1395238637924194, "learning_rate": 0.0002926073184815452, "loss": 3.2758560180664062, "step": 49900 }, { "epoch": 0.10229765237639801, "grad_norm": 0.9928449988365173, "learning_rate": 0.0002925923182293592, "loss": 4.000916442871094, "step": 49950 }, { "epoch": 0.10240005242882684, "grad_norm": 2.3027689456939697, "learning_rate": 0.00029257730315960547, "loss": 3.35286865234375, "step": 50000 }, { "epoch": 0.10250245248125567, "grad_norm": 0.7039443254470825, "learning_rate": 0.0002925622732738441, "loss": 3.695789794921875, "step": 50050 }, { "epoch": 0.1026048525336845, "grad_norm": 0.8762661218643188, "learning_rate": 0.00029254722857363706, "loss": 3.2338931274414064, "step": 50100 }, { "epoch": 0.10270725258611332, "grad_norm": 1.2041362524032593, "learning_rate": 0.00029253216906054765, "loss": 3.807637023925781, "step": 50150 }, { "epoch": 0.10280965263854215, "grad_norm": 0.7898900508880615, "learning_rate": 0.0002925170947361409, "loss": 3.60753662109375, "step": 50200 }, { "epoch": 0.10291205269097098, "grad_norm": 1.3529953956604004, "learning_rate": 0.00029250200560198316, "loss": 3.5016552734375, "step": 50250 }, { "epoch": 0.1030144527433998, "grad_norm": 1.0065248012542725, "learning_rate": 0.00029248690165964246, "loss": 3.634730224609375, "step": 50300 }, { "epoch": 0.10311685279582863, "grad_norm": 0.8603098392486572, "learning_rate": 0.00029247178291068836, "loss": 3.8783328247070314, "step": 50350 }, { "epoch": 0.10321925284825746, "grad_norm": 0.9008740186691284, "learning_rate": 0.00029245664935669186, "loss": 3.682059631347656, "step": 50400 }, { "epoch": 0.10332165290068629, "grad_norm": 1.109923243522644, "learning_rate": 0.00029244150099922567, "loss": 3.8022805786132814, "step": 50450 }, { "epoch": 0.10342405295311512, "grad_norm": 0.9621108770370483, "learning_rate": 0.0002924263378398639, "loss": 4.338629455566406, "step": 50500 }, { "epoch": 0.10352645300554394, "grad_norm": 0.8833173513412476, "learning_rate": 0.00029241115988018224, "loss": 3.44856689453125, "step": 50550 }, { "epoch": 0.10362885305797277, "grad_norm": 0.8892256617546082, "learning_rate": 0.0002923959671217579, "loss": 3.9488174438476564, "step": 50600 }, { "epoch": 0.10373125311040159, "grad_norm": 0.8669362664222717, "learning_rate": 0.00029238075956616963, "loss": 3.4380224609375, "step": 50650 }, { "epoch": 0.10383365316283041, "grad_norm": 0.8463394045829773, "learning_rate": 0.0002923655372149978, "loss": 3.5007855224609377, "step": 50700 }, { "epoch": 0.10393605321525924, "grad_norm": 1.0633169412612915, "learning_rate": 0.00029235030006982416, "loss": 3.4543692016601564, "step": 50750 }, { "epoch": 0.10403845326768807, "grad_norm": 1.0498319864273071, "learning_rate": 0.0002923350481322322, "loss": 3.207664794921875, "step": 50800 }, { "epoch": 0.1041408533201169, "grad_norm": 1.1241377592086792, "learning_rate": 0.00029231978140380676, "loss": 3.3383258056640623, "step": 50850 }, { "epoch": 0.10424325337254572, "grad_norm": 0.990468442440033, "learning_rate": 0.0002923044998861343, "loss": 3.237965393066406, "step": 50900 }, { "epoch": 0.10434565342497455, "grad_norm": 1.204306721687317, "learning_rate": 0.0002922892035808027, "loss": 3.101645812988281, "step": 50950 }, { "epoch": 0.10444805347740338, "grad_norm": 0.9017521739006042, "learning_rate": 0.00029227389248940173, "loss": 2.567582702636719, "step": 51000 }, { "epoch": 0.10455045352983221, "grad_norm": 1.0600897073745728, "learning_rate": 0.00029225856661352226, "loss": 3.3536370849609374, "step": 51050 }, { "epoch": 0.10465285358226104, "grad_norm": 0.9761303663253784, "learning_rate": 0.00029224322595475694, "loss": 3.4230682373046877, "step": 51100 }, { "epoch": 0.10475525363468986, "grad_norm": 0.7504201531410217, "learning_rate": 0.0002922278705147, "loss": 3.14410888671875, "step": 51150 }, { "epoch": 0.10485765368711869, "grad_norm": 1.2177993059158325, "learning_rate": 0.00029221250029494694, "loss": 3.2004080200195313, "step": 51200 }, { "epoch": 0.10496005373954752, "grad_norm": 0.744257926940918, "learning_rate": 0.000292197115297095, "loss": 3.388042297363281, "step": 51250 }, { "epoch": 0.10506245379197635, "grad_norm": 0.8854607939720154, "learning_rate": 0.000292181715522743, "loss": 3.436142578125, "step": 51300 }, { "epoch": 0.10516485384440517, "grad_norm": 1.030616283416748, "learning_rate": 0.00029216630097349125, "loss": 3.2875115966796873, "step": 51350 }, { "epoch": 0.105267253896834, "grad_norm": 0.8778656721115112, "learning_rate": 0.00029215087165094145, "loss": 3.6679806518554687, "step": 51400 }, { "epoch": 0.10536965394926283, "grad_norm": 0.6675243377685547, "learning_rate": 0.000292135427556697, "loss": 2.4680940246582033, "step": 51450 }, { "epoch": 0.10547205400169164, "grad_norm": 0.7153152227401733, "learning_rate": 0.0002921199686923628, "loss": 2.0736355590820312, "step": 51500 }, { "epoch": 0.10557445405412047, "grad_norm": 0.9837950468063354, "learning_rate": 0.0002921044950595452, "loss": 3.5398410034179686, "step": 51550 }, { "epoch": 0.1056768541065493, "grad_norm": 0.9022551774978638, "learning_rate": 0.00029208900665985213, "loss": 3.0752154541015626, "step": 51600 }, { "epoch": 0.10577925415897813, "grad_norm": 0.802068293094635, "learning_rate": 0.0002920735034948932, "loss": 3.4963739013671873, "step": 51650 }, { "epoch": 0.10588165421140695, "grad_norm": 0.8395520448684692, "learning_rate": 0.00029205798556627944, "loss": 3.05790771484375, "step": 51700 }, { "epoch": 0.10598405426383578, "grad_norm": 0.8915894627571106, "learning_rate": 0.0002920424528756233, "loss": 3.4001889038085937, "step": 51750 }, { "epoch": 0.10608645431626461, "grad_norm": 0.6118106842041016, "learning_rate": 0.00029202690542453886, "loss": 2.7612185668945313, "step": 51800 }, { "epoch": 0.10618885436869344, "grad_norm": 0.8468356132507324, "learning_rate": 0.00029201134321464177, "loss": 3.045502014160156, "step": 51850 }, { "epoch": 0.10629125442112226, "grad_norm": 0.837311327457428, "learning_rate": 0.00029199576624754927, "loss": 2.9287734985351563, "step": 51900 }, { "epoch": 0.10639365447355109, "grad_norm": 0.8536468744277954, "learning_rate": 0.00029198017452487996, "loss": 3.28405517578125, "step": 51950 }, { "epoch": 0.10649605452597992, "grad_norm": 0.5843203067779541, "learning_rate": 0.0002919645680482541, "loss": 3.158900146484375, "step": 52000 }, { "epoch": 0.10659845457840875, "grad_norm": 0.8552372455596924, "learning_rate": 0.0002919489468192934, "loss": 2.7685336303710937, "step": 52050 }, { "epoch": 0.10670085463083757, "grad_norm": 0.9539399743080139, "learning_rate": 0.00029193331083962127, "loss": 3.3179580688476564, "step": 52100 }, { "epoch": 0.1068032546832664, "grad_norm": 0.9066966772079468, "learning_rate": 0.00029191766011086234, "loss": 3.2776177978515624, "step": 52150 }, { "epoch": 0.10690565473569523, "grad_norm": 1.242811918258667, "learning_rate": 0.0002919019946346431, "loss": 3.0137930297851563, "step": 52200 }, { "epoch": 0.10700805478812406, "grad_norm": 0.7365036606788635, "learning_rate": 0.0002918863144125915, "loss": 3.2038583374023437, "step": 52250 }, { "epoch": 0.10711045484055289, "grad_norm": 1.0112019777297974, "learning_rate": 0.00029187061944633674, "loss": 3.0209481811523435, "step": 52300 }, { "epoch": 0.1072128548929817, "grad_norm": 0.9452877640724182, "learning_rate": 0.00029185490973751, "loss": 3.137978210449219, "step": 52350 }, { "epoch": 0.10731525494541053, "grad_norm": 0.6544405817985535, "learning_rate": 0.0002918391852877436, "loss": 3.417147521972656, "step": 52400 }, { "epoch": 0.10741765499783935, "grad_norm": 0.5501437783241272, "learning_rate": 0.0002918234460986717, "loss": 2.9117431640625, "step": 52450 }, { "epoch": 0.10752005505026818, "grad_norm": 0.8474003076553345, "learning_rate": 0.0002918076921719297, "loss": 3.441578063964844, "step": 52500 }, { "epoch": 0.10762245510269701, "grad_norm": 1.1429380178451538, "learning_rate": 0.0002917919235091548, "loss": 3.6524429321289062, "step": 52550 }, { "epoch": 0.10772485515512584, "grad_norm": 0.6069464683532715, "learning_rate": 0.0002917761401119855, "loss": 3.335174865722656, "step": 52600 }, { "epoch": 0.10782725520755466, "grad_norm": 0.880016028881073, "learning_rate": 0.00029176034198206204, "loss": 3.1002215576171874, "step": 52650 }, { "epoch": 0.10792965525998349, "grad_norm": 0.8357744812965393, "learning_rate": 0.000291744529121026, "loss": 3.146689147949219, "step": 52700 }, { "epoch": 0.10803205531241232, "grad_norm": 1.0419034957885742, "learning_rate": 0.0002917287015305207, "loss": 2.675668029785156, "step": 52750 }, { "epoch": 0.10813445536484115, "grad_norm": 1.0537099838256836, "learning_rate": 0.0002917128592121908, "loss": 3.45556640625, "step": 52800 }, { "epoch": 0.10823685541726998, "grad_norm": 0.8088594675064087, "learning_rate": 0.0002916970021676825, "loss": 3.272125244140625, "step": 52850 }, { "epoch": 0.1083392554696988, "grad_norm": 0.9163171648979187, "learning_rate": 0.0002916811303986437, "loss": 3.303933410644531, "step": 52900 }, { "epoch": 0.10844165552212763, "grad_norm": 1.4584760665893555, "learning_rate": 0.00029166524390672374, "loss": 3.420548095703125, "step": 52950 }, { "epoch": 0.10854405557455646, "grad_norm": 0.9255147576332092, "learning_rate": 0.0002916493426935734, "loss": 3.422330017089844, "step": 53000 }, { "epoch": 0.10864645562698529, "grad_norm": 0.7951823472976685, "learning_rate": 0.0002916334267608451, "loss": 3.435027770996094, "step": 53050 }, { "epoch": 0.10874885567941411, "grad_norm": 0.8804548978805542, "learning_rate": 0.00029161749611019273, "loss": 3.147249755859375, "step": 53100 }, { "epoch": 0.10885125573184294, "grad_norm": 0.6578207015991211, "learning_rate": 0.00029160155074327174, "loss": 3.2707638549804687, "step": 53150 }, { "epoch": 0.10895365578427176, "grad_norm": 0.9386240243911743, "learning_rate": 0.0002915855906617391, "loss": 3.227253723144531, "step": 53200 }, { "epoch": 0.10905605583670058, "grad_norm": 1.0355428457260132, "learning_rate": 0.00029156961586725334, "loss": 3.180726318359375, "step": 53250 }, { "epoch": 0.10915845588912941, "grad_norm": 0.7636610269546509, "learning_rate": 0.0002915536263614745, "loss": 3.335189208984375, "step": 53300 }, { "epoch": 0.10926085594155824, "grad_norm": 0.8508041501045227, "learning_rate": 0.0002915376221460641, "loss": 3.4625811767578125, "step": 53350 }, { "epoch": 0.10936325599398707, "grad_norm": 0.8359129428863525, "learning_rate": 0.0002915216032226852, "loss": 3.2652053833007812, "step": 53400 }, { "epoch": 0.1094656560464159, "grad_norm": 0.7385704517364502, "learning_rate": 0.0002915055695930025, "loss": 3.322744140625, "step": 53450 }, { "epoch": 0.10956805609884472, "grad_norm": 0.8769361972808838, "learning_rate": 0.0002914895212586821, "loss": 2.853853454589844, "step": 53500 }, { "epoch": 0.10967045615127355, "grad_norm": 0.8474721908569336, "learning_rate": 0.00029147345822139165, "loss": 3.6555825805664064, "step": 53550 }, { "epoch": 0.10977285620370238, "grad_norm": 1.0334205627441406, "learning_rate": 0.0002914573804828004, "loss": 3.35653564453125, "step": 53600 }, { "epoch": 0.1098752562561312, "grad_norm": 0.9623045325279236, "learning_rate": 0.000291441288044579, "loss": 3.910505676269531, "step": 53650 }, { "epoch": 0.10997765630856003, "grad_norm": 1.3744142055511475, "learning_rate": 0.0002914251809083998, "loss": 3.520959777832031, "step": 53700 }, { "epoch": 0.11008005636098886, "grad_norm": 0.7532607316970825, "learning_rate": 0.00029140905907593654, "loss": 3.816366882324219, "step": 53750 }, { "epoch": 0.11018245641341769, "grad_norm": 0.8342536091804504, "learning_rate": 0.00029139292254886447, "loss": 3.52778076171875, "step": 53800 }, { "epoch": 0.11028485646584651, "grad_norm": 0.928033173084259, "learning_rate": 0.0002913767713288606, "loss": 3.7655780029296877, "step": 53850 }, { "epoch": 0.11038725651827534, "grad_norm": 0.7441654801368713, "learning_rate": 0.00029136060541760304, "loss": 3.655460205078125, "step": 53900 }, { "epoch": 0.11048965657070417, "grad_norm": 0.9803574085235596, "learning_rate": 0.0002913444248167719, "loss": 3.4816769409179686, "step": 53950 }, { "epoch": 0.110592056623133, "grad_norm": 0.9637227654457092, "learning_rate": 0.00029132822952804846, "loss": 3.7213009643554686, "step": 54000 }, { "epoch": 0.11069445667556181, "grad_norm": 1.0016183853149414, "learning_rate": 0.0002913120195531158, "loss": 3.2758993530273437, "step": 54050 }, { "epoch": 0.11079685672799064, "grad_norm": 0.8481519818305969, "learning_rate": 0.0002912957948936583, "loss": 4.208623962402344, "step": 54100 }, { "epoch": 0.11089925678041947, "grad_norm": 0.9075637459754944, "learning_rate": 0.00029127955555136194, "loss": 3.764527587890625, "step": 54150 }, { "epoch": 0.1110016568328483, "grad_norm": 0.648526132106781, "learning_rate": 0.0002912633015279143, "loss": 3.3832241821289064, "step": 54200 }, { "epoch": 0.11110405688527712, "grad_norm": 0.9115505218505859, "learning_rate": 0.0002912470328250044, "loss": 3.866526794433594, "step": 54250 }, { "epoch": 0.11120645693770595, "grad_norm": 1.0018901824951172, "learning_rate": 0.00029123074944432275, "loss": 3.3686892700195314, "step": 54300 }, { "epoch": 0.11130885699013478, "grad_norm": 0.9239192605018616, "learning_rate": 0.0002912144513875615, "loss": 3.808809509277344, "step": 54350 }, { "epoch": 0.1114112570425636, "grad_norm": 0.7452714443206787, "learning_rate": 0.0002911981386564143, "loss": 3.228931884765625, "step": 54400 }, { "epoch": 0.11151365709499243, "grad_norm": 1.3459135293960571, "learning_rate": 0.0002911818112525763, "loss": 3.20574951171875, "step": 54450 }, { "epoch": 0.11161605714742126, "grad_norm": 1.0211737155914307, "learning_rate": 0.0002911654691777441, "loss": 2.2418772888183596, "step": 54500 }, { "epoch": 0.11171845719985009, "grad_norm": 1.1944788694381714, "learning_rate": 0.00029114911243361595, "loss": 3.102964172363281, "step": 54550 }, { "epoch": 0.11182085725227892, "grad_norm": 0.8922857642173767, "learning_rate": 0.0002911327410218916, "loss": 3.3322735595703126, "step": 54600 }, { "epoch": 0.11192325730470774, "grad_norm": 0.8530144095420837, "learning_rate": 0.0002911163549442722, "loss": 3.8322744750976563, "step": 54650 }, { "epoch": 0.11202565735713657, "grad_norm": 0.9170437455177307, "learning_rate": 0.00029109995420246066, "loss": 4.006968994140625, "step": 54700 }, { "epoch": 0.1121280574095654, "grad_norm": 0.8506373763084412, "learning_rate": 0.0002910835387981612, "loss": 3.5530404663085937, "step": 54750 }, { "epoch": 0.11223045746199423, "grad_norm": 1.0387393236160278, "learning_rate": 0.00029106710873307956, "loss": 3.33231201171875, "step": 54800 }, { "epoch": 0.11233285751442305, "grad_norm": 0.5560868382453918, "learning_rate": 0.00029105066400892315, "loss": 2.382226867675781, "step": 54850 }, { "epoch": 0.11243525756685187, "grad_norm": 0.9407156705856323, "learning_rate": 0.00029103420462740087, "loss": 3.685501708984375, "step": 54900 }, { "epoch": 0.1125376576192807, "grad_norm": 0.6936938166618347, "learning_rate": 0.000291017730590223, "loss": 3.4359500122070314, "step": 54950 }, { "epoch": 0.11264005767170952, "grad_norm": 0.9318833947181702, "learning_rate": 0.0002910012418991016, "loss": 3.430309143066406, "step": 55000 }, { "epoch": 0.11274245772413835, "grad_norm": 1.0374469757080078, "learning_rate": 0.00029098473855574997, "loss": 3.467359619140625, "step": 55050 }, { "epoch": 0.11284485777656718, "grad_norm": 1.1647804975509644, "learning_rate": 0.0002909682205618831, "loss": 3.3597537231445314, "step": 55100 }, { "epoch": 0.112947257828996, "grad_norm": 1.8363399505615234, "learning_rate": 0.00029095168791921753, "loss": 3.4801220703125, "step": 55150 }, { "epoch": 0.11304965788142483, "grad_norm": 0.8499084711074829, "learning_rate": 0.0002909351406294712, "loss": 3.76212158203125, "step": 55200 }, { "epoch": 0.11315205793385366, "grad_norm": 0.6988071203231812, "learning_rate": 0.0002909185786943636, "loss": 3.5692535400390626, "step": 55250 }, { "epoch": 0.11325445798628249, "grad_norm": 0.6894858479499817, "learning_rate": 0.0002909020021156159, "loss": 3.47705810546875, "step": 55300 }, { "epoch": 0.11335685803871132, "grad_norm": 1.0136258602142334, "learning_rate": 0.00029088541089495056, "loss": 3.2412789916992186, "step": 55350 }, { "epoch": 0.11345925809114014, "grad_norm": 0.6407994031906128, "learning_rate": 0.00029086880503409164, "loss": 3.150567626953125, "step": 55400 }, { "epoch": 0.11356165814356897, "grad_norm": 0.8704736828804016, "learning_rate": 0.00029085218453476483, "loss": 3.5778497314453124, "step": 55450 }, { "epoch": 0.1136640581959978, "grad_norm": 0.9866794943809509, "learning_rate": 0.00029083554939869725, "loss": 3.52720703125, "step": 55500 }, { "epoch": 0.11376645824842663, "grad_norm": 0.4646882712841034, "learning_rate": 0.0002908188996276175, "loss": 2.8131982421875, "step": 55550 }, { "epoch": 0.11386885830085546, "grad_norm": 0.7451179623603821, "learning_rate": 0.00029080223522325575, "loss": 3.3548162841796874, "step": 55600 }, { "epoch": 0.11397125835328428, "grad_norm": 1.027496337890625, "learning_rate": 0.0002907855561873438, "loss": 3.047060852050781, "step": 55650 }, { "epoch": 0.11407365840571311, "grad_norm": 1.0272102355957031, "learning_rate": 0.0002907688625216147, "loss": 3.21407958984375, "step": 55700 }, { "epoch": 0.11417605845814192, "grad_norm": 0.8392390012741089, "learning_rate": 0.0002907521542278033, "loss": 3.5421328735351563, "step": 55750 }, { "epoch": 0.11427845851057075, "grad_norm": 0.8752363324165344, "learning_rate": 0.0002907354313076458, "loss": 3.536468811035156, "step": 55800 }, { "epoch": 0.11438085856299958, "grad_norm": 0.6718413233757019, "learning_rate": 0.00029071869376288, "loss": 3.5975299072265625, "step": 55850 }, { "epoch": 0.11448325861542841, "grad_norm": 0.8909393548965454, "learning_rate": 0.0002907019415952452, "loss": 3.8420440673828127, "step": 55900 }, { "epoch": 0.11458565866785723, "grad_norm": 0.7395539879798889, "learning_rate": 0.00029068517480648217, "loss": 3.4465701293945314, "step": 55950 }, { "epoch": 0.11468805872028606, "grad_norm": 0.7831642627716064, "learning_rate": 0.00029066839339833333, "loss": 3.2164300537109374, "step": 56000 }, { "epoch": 0.11479045877271489, "grad_norm": 0.8047283291816711, "learning_rate": 0.0002906515973725424, "loss": 3.697811279296875, "step": 56050 }, { "epoch": 0.11489285882514372, "grad_norm": 0.7210569977760315, "learning_rate": 0.00029063478673085484, "loss": 3.0727462768554688, "step": 56100 }, { "epoch": 0.11499525887757255, "grad_norm": 0.9832913875579834, "learning_rate": 0.0002906179614750175, "loss": 3.4165048217773437, "step": 56150 }, { "epoch": 0.11509765893000137, "grad_norm": 0.9115371108055115, "learning_rate": 0.0002906011216067788, "loss": 3.485976257324219, "step": 56200 }, { "epoch": 0.1152000589824302, "grad_norm": 0.9409294724464417, "learning_rate": 0.0002905842671278887, "loss": 2.942160339355469, "step": 56250 }, { "epoch": 0.11530245903485903, "grad_norm": 1.1528805494308472, "learning_rate": 0.0002905673980400986, "loss": 3.5174395751953127, "step": 56300 }, { "epoch": 0.11540485908728786, "grad_norm": 0.740906834602356, "learning_rate": 0.0002905505143451614, "loss": 3.086756286621094, "step": 56350 }, { "epoch": 0.11550725913971668, "grad_norm": 0.888832688331604, "learning_rate": 0.00029053361604483173, "loss": 3.832029113769531, "step": 56400 }, { "epoch": 0.11560965919214551, "grad_norm": 0.889111340045929, "learning_rate": 0.00029051670314086546, "loss": 3.207186584472656, "step": 56450 }, { "epoch": 0.11571205924457434, "grad_norm": 0.7387615442276001, "learning_rate": 0.0002904997756350202, "loss": 3.152142333984375, "step": 56500 }, { "epoch": 0.11581445929700317, "grad_norm": 0.8200859427452087, "learning_rate": 0.00029048283352905486, "loss": 3.671814270019531, "step": 56550 }, { "epoch": 0.11591685934943198, "grad_norm": 1.1036324501037598, "learning_rate": 0.0002904658768247301, "loss": 3.4229196166992186, "step": 56600 }, { "epoch": 0.11601925940186081, "grad_norm": 0.8785697221755981, "learning_rate": 0.00029044890552380796, "loss": 3.5630813598632813, "step": 56650 }, { "epoch": 0.11612165945428964, "grad_norm": 1.0525559186935425, "learning_rate": 0.000290431919628052, "loss": 3.124271240234375, "step": 56700 }, { "epoch": 0.11622405950671846, "grad_norm": 0.9268920421600342, "learning_rate": 0.00029041491913922736, "loss": 3.26138916015625, "step": 56750 }, { "epoch": 0.11632645955914729, "grad_norm": 0.8036125898361206, "learning_rate": 0.0002903979040591006, "loss": 3.1208505249023437, "step": 56800 }, { "epoch": 0.11642885961157612, "grad_norm": 0.871330976486206, "learning_rate": 0.0002903808743894399, "loss": 3.6094674682617187, "step": 56850 }, { "epoch": 0.11653125966400495, "grad_norm": 0.7573062181472778, "learning_rate": 0.00029036383013201486, "loss": 3.4403109741210938, "step": 56900 }, { "epoch": 0.11663365971643377, "grad_norm": 0.8866212964057922, "learning_rate": 0.0002903467712885967, "loss": 3.023941955566406, "step": 56950 }, { "epoch": 0.1167360597688626, "grad_norm": 1.240868330001831, "learning_rate": 0.00029032969786095807, "loss": 3.81320556640625, "step": 57000 }, { "epoch": 0.11683845982129143, "grad_norm": 0.9338199496269226, "learning_rate": 0.0002903126098508732, "loss": 3.1648443603515624, "step": 57050 }, { "epoch": 0.11694085987372026, "grad_norm": 0.8921442031860352, "learning_rate": 0.0002902955072601177, "loss": 3.7754312133789063, "step": 57100 }, { "epoch": 0.11704325992614908, "grad_norm": 0.7555287480354309, "learning_rate": 0.00029027839009046887, "loss": 3.6020452880859377, "step": 57150 }, { "epoch": 0.11714565997857791, "grad_norm": 0.8668673038482666, "learning_rate": 0.00029026125834370547, "loss": 3.1613735961914062, "step": 57200 }, { "epoch": 0.11724806003100674, "grad_norm": 0.8572468757629395, "learning_rate": 0.00029024411202160775, "loss": 3.5449398803710936, "step": 57250 }, { "epoch": 0.11735046008343557, "grad_norm": 1.0183916091918945, "learning_rate": 0.0002902269511259575, "loss": 3.4537921142578125, "step": 57300 }, { "epoch": 0.1174528601358644, "grad_norm": 0.7662498354911804, "learning_rate": 0.00029020977565853793, "loss": 3.6329010009765623, "step": 57350 }, { "epoch": 0.11755526018829322, "grad_norm": 0.7248380780220032, "learning_rate": 0.0002901925856211339, "loss": 3.476121826171875, "step": 57400 }, { "epoch": 0.11765766024072204, "grad_norm": 0.5883516073226929, "learning_rate": 0.0002901753810155316, "loss": 3.1229867553710937, "step": 57450 }, { "epoch": 0.11776006029315086, "grad_norm": 1.3006634712219238, "learning_rate": 0.00029015816184351905, "loss": 3.42736572265625, "step": 57500 }, { "epoch": 0.11786246034557969, "grad_norm": 0.9047501683235168, "learning_rate": 0.0002901409281068855, "loss": 3.190472412109375, "step": 57550 }, { "epoch": 0.11796486039800852, "grad_norm": 0.8864745497703552, "learning_rate": 0.00029012367980742177, "loss": 3.5744329833984376, "step": 57600 }, { "epoch": 0.11806726045043735, "grad_norm": 0.9945940971374512, "learning_rate": 0.0002901064169469203, "loss": 3.613050537109375, "step": 57650 }, { "epoch": 0.11816966050286618, "grad_norm": 0.9474062919616699, "learning_rate": 0.00029008913952717486, "loss": 3.755731201171875, "step": 57700 }, { "epoch": 0.118272060555295, "grad_norm": 0.9160423874855042, "learning_rate": 0.000290071847549981, "loss": 3.7074078369140624, "step": 57750 }, { "epoch": 0.11837446060772383, "grad_norm": 0.7401031851768494, "learning_rate": 0.0002900545410171355, "loss": 3.9821441650390623, "step": 57800 }, { "epoch": 0.11847686066015266, "grad_norm": 0.9346218705177307, "learning_rate": 0.00029003721993043686, "loss": 3.6518328857421873, "step": 57850 }, { "epoch": 0.11857926071258149, "grad_norm": 0.8762102723121643, "learning_rate": 0.0002900198842916849, "loss": 3.3181643676757813, "step": 57900 }, { "epoch": 0.11868166076501031, "grad_norm": 0.8260309100151062, "learning_rate": 0.00029000253410268117, "loss": 3.745126953125, "step": 57950 }, { "epoch": 0.11878406081743914, "grad_norm": 0.728134274482727, "learning_rate": 0.00028998516936522864, "loss": 3.5524822998046877, "step": 58000 }, { "epoch": 0.11888646086986797, "grad_norm": 1.0089305639266968, "learning_rate": 0.0002899677900811316, "loss": 3.7944915771484373, "step": 58050 }, { "epoch": 0.1189888609222968, "grad_norm": 0.8292215466499329, "learning_rate": 0.0002899503962521963, "loss": 3.89322021484375, "step": 58100 }, { "epoch": 0.11909126097472562, "grad_norm": 0.8529530167579651, "learning_rate": 0.00028993298788023005, "loss": 2.9111569213867186, "step": 58150 }, { "epoch": 0.11919366102715445, "grad_norm": 0.8902004361152649, "learning_rate": 0.00028991556496704186, "loss": 3.6739492797851563, "step": 58200 }, { "epoch": 0.11929606107958328, "grad_norm": 0.9264180660247803, "learning_rate": 0.0002898981275144423, "loss": 4.041621704101562, "step": 58250 }, { "epoch": 0.1193984611320121, "grad_norm": 0.8773983716964722, "learning_rate": 0.0002898806755242433, "loss": 3.3137640380859374, "step": 58300 }, { "epoch": 0.11950086118444092, "grad_norm": 0.8258083462715149, "learning_rate": 0.00028986320899825855, "loss": 3.7007760620117187, "step": 58350 }, { "epoch": 0.11960326123686975, "grad_norm": 0.4484880566596985, "learning_rate": 0.00028984572793830295, "loss": 2.6619467163085937, "step": 58400 }, { "epoch": 0.11970566128929858, "grad_norm": 0.7841198444366455, "learning_rate": 0.0002898282323461931, "loss": 3.9329864501953127, "step": 58450 }, { "epoch": 0.1198080613417274, "grad_norm": 1.0872740745544434, "learning_rate": 0.0002898107222237471, "loss": 3.7529037475585936, "step": 58500 }, { "epoch": 0.11991046139415623, "grad_norm": 1.1370536088943481, "learning_rate": 0.0002897931975727845, "loss": 3.598294372558594, "step": 58550 }, { "epoch": 0.12001286144658506, "grad_norm": 0.5730462670326233, "learning_rate": 0.0002897756583951264, "loss": 3.4985086059570314, "step": 58600 }, { "epoch": 0.12011526149901389, "grad_norm": 0.8920771479606628, "learning_rate": 0.00028975810469259535, "loss": 3.025179443359375, "step": 58650 }, { "epoch": 0.12021766155144271, "grad_norm": 0.877116858959198, "learning_rate": 0.0002897405364670155, "loss": 3.4373843383789064, "step": 58700 }, { "epoch": 0.12032006160387154, "grad_norm": 0.9569665193557739, "learning_rate": 0.0002897229537202124, "loss": 4.03067626953125, "step": 58750 }, { "epoch": 0.12042246165630037, "grad_norm": 0.9027877449989319, "learning_rate": 0.00028970535645401324, "loss": 3.0247479248046876, "step": 58800 }, { "epoch": 0.1205248617087292, "grad_norm": 0.7448411583900452, "learning_rate": 0.0002896877446702467, "loss": 3.632384948730469, "step": 58850 }, { "epoch": 0.12062726176115803, "grad_norm": 0.8100590705871582, "learning_rate": 0.0002896701183707428, "loss": 3.263778076171875, "step": 58900 }, { "epoch": 0.12072966181358685, "grad_norm": 1.191540241241455, "learning_rate": 0.0002896524775573332, "loss": 3.6475961303710935, "step": 58950 }, { "epoch": 0.12083206186601568, "grad_norm": 0.7784574031829834, "learning_rate": 0.00028963482223185106, "loss": 3.554160461425781, "step": 59000 }, { "epoch": 0.12093446191844451, "grad_norm": 0.5998643040657043, "learning_rate": 0.0002896171523961312, "loss": 3.2943960571289064, "step": 59050 }, { "epoch": 0.12103686197087334, "grad_norm": 0.8640596270561218, "learning_rate": 0.0002895994680520096, "loss": 2.6187591552734375, "step": 59100 }, { "epoch": 0.12113926202330215, "grad_norm": 0.6376426219940186, "learning_rate": 0.00028958176920132396, "loss": 3.336057434082031, "step": 59150 }, { "epoch": 0.12124166207573098, "grad_norm": 1.1029490232467651, "learning_rate": 0.0002895640558459136, "loss": 3.192468566894531, "step": 59200 }, { "epoch": 0.1213440621281598, "grad_norm": 0.9253978729248047, "learning_rate": 0.00028954632798761906, "loss": 3.82802490234375, "step": 59250 }, { "epoch": 0.12144646218058863, "grad_norm": 1.0808192491531372, "learning_rate": 0.0002895285856282826, "loss": 3.537474365234375, "step": 59300 }, { "epoch": 0.12154886223301746, "grad_norm": 0.7610458731651306, "learning_rate": 0.000289510828769748, "loss": 3.7149560546875, "step": 59350 }, { "epoch": 0.12165126228544629, "grad_norm": 1.0239511728286743, "learning_rate": 0.0002894930574138604, "loss": 3.168520202636719, "step": 59400 }, { "epoch": 0.12175366233787512, "grad_norm": 1.482177495956421, "learning_rate": 0.0002894752715624665, "loss": 3.8551751708984376, "step": 59450 }, { "epoch": 0.12185606239030394, "grad_norm": 0.8012579083442688, "learning_rate": 0.00028945747121741455, "loss": 3.244693603515625, "step": 59500 }, { "epoch": 0.12195846244273277, "grad_norm": 0.6927148699760437, "learning_rate": 0.0002894396563805543, "loss": 3.722396240234375, "step": 59550 }, { "epoch": 0.1220608624951616, "grad_norm": 0.7614629864692688, "learning_rate": 0.00028942182705373707, "loss": 3.183421630859375, "step": 59600 }, { "epoch": 0.12216326254759043, "grad_norm": 0.8808593153953552, "learning_rate": 0.0002894039832388154, "loss": 3.444737854003906, "step": 59650 }, { "epoch": 0.12226566260001925, "grad_norm": 0.9153810143470764, "learning_rate": 0.0002893861249376437, "loss": 3.483736267089844, "step": 59700 }, { "epoch": 0.12236806265244808, "grad_norm": 0.8851150870323181, "learning_rate": 0.0002893682521520777, "loss": 3.7175869750976562, "step": 59750 }, { "epoch": 0.12247046270487691, "grad_norm": 0.7266696095466614, "learning_rate": 0.00028935036488397466, "loss": 3.335245361328125, "step": 59800 }, { "epoch": 0.12257286275730574, "grad_norm": 0.9137750864028931, "learning_rate": 0.0002893324631351933, "loss": 2.9954302978515623, "step": 59850 }, { "epoch": 0.12267526280973456, "grad_norm": 0.8360620141029358, "learning_rate": 0.00028931454690759396, "loss": 3.0706732177734377, "step": 59900 }, { "epoch": 0.12277766286216339, "grad_norm": 0.8443347811698914, "learning_rate": 0.00028929661620303833, "loss": 3.848203430175781, "step": 59950 }, { "epoch": 0.1228800629145922, "grad_norm": 0.9306533932685852, "learning_rate": 0.0002892786710233898, "loss": 2.975295104980469, "step": 60000 }, { "epoch": 0.12298246296702103, "grad_norm": 0.9441879391670227, "learning_rate": 0.00028926071137051307, "loss": 3.9100912475585936, "step": 60050 }, { "epoch": 0.12308486301944986, "grad_norm": 0.7004597187042236, "learning_rate": 0.00028924273724627444, "loss": 3.670945739746094, "step": 60100 }, { "epoch": 0.12318726307187869, "grad_norm": 0.7978895306587219, "learning_rate": 0.00028922474865254174, "loss": 3.39288818359375, "step": 60150 }, { "epoch": 0.12328966312430752, "grad_norm": 0.8944730758666992, "learning_rate": 0.0002892067455911842, "loss": 3.4790631103515626, "step": 60200 }, { "epoch": 0.12339206317673634, "grad_norm": 0.9302740097045898, "learning_rate": 0.0002891887280640727, "loss": 3.564194641113281, "step": 60250 }, { "epoch": 0.12349446322916517, "grad_norm": 0.7751696109771729, "learning_rate": 0.0002891706960730795, "loss": 3.4631011962890623, "step": 60300 }, { "epoch": 0.123596863281594, "grad_norm": 0.998839795589447, "learning_rate": 0.00028915264962007836, "loss": 3.490992126464844, "step": 60350 }, { "epoch": 0.12369926333402283, "grad_norm": 1.2390878200531006, "learning_rate": 0.0002891345887069447, "loss": 3.6483099365234377, "step": 60400 }, { "epoch": 0.12380166338645165, "grad_norm": 0.8795660138130188, "learning_rate": 0.0002891165133355553, "loss": 3.6523648071289063, "step": 60450 }, { "epoch": 0.12390406343888048, "grad_norm": 0.8491701483726501, "learning_rate": 0.00028909842350778836, "loss": 3.5479266357421877, "step": 60500 }, { "epoch": 0.12400646349130931, "grad_norm": 0.7775550484657288, "learning_rate": 0.00028908031922552377, "loss": 3.1797994995117187, "step": 60550 }, { "epoch": 0.12410886354373814, "grad_norm": 0.7711923718452454, "learning_rate": 0.0002890622004906429, "loss": 3.398070068359375, "step": 60600 }, { "epoch": 0.12421126359616697, "grad_norm": 0.490875244140625, "learning_rate": 0.0002890440673050285, "loss": 3.022109069824219, "step": 60650 }, { "epoch": 0.12431366364859579, "grad_norm": 0.7348693609237671, "learning_rate": 0.0002890259196705649, "loss": 3.46414794921875, "step": 60700 }, { "epoch": 0.12441606370102462, "grad_norm": 0.9327791929244995, "learning_rate": 0.000289007757589138, "loss": 3.681882629394531, "step": 60750 }, { "epoch": 0.12451846375345345, "grad_norm": 0.8426567912101746, "learning_rate": 0.000288989581062635, "loss": 4.021507568359375, "step": 60800 }, { "epoch": 0.12462086380588226, "grad_norm": 0.9796308875083923, "learning_rate": 0.0002889713900929448, "loss": 3.8382940673828125, "step": 60850 }, { "epoch": 0.12472326385831109, "grad_norm": 0.7347166538238525, "learning_rate": 0.0002889531846819577, "loss": 3.32147216796875, "step": 60900 }, { "epoch": 0.12482566391073992, "grad_norm": 0.770237147808075, "learning_rate": 0.0002889349648315655, "loss": 3.648823547363281, "step": 60950 }, { "epoch": 0.12492806396316875, "grad_norm": 0.6420400738716125, "learning_rate": 0.00028891673054366165, "loss": 3.17007568359375, "step": 61000 }, { "epoch": 0.1250304640155976, "grad_norm": 0.7027015089988708, "learning_rate": 0.00028889848182014086, "loss": 3.246382141113281, "step": 61050 }, { "epoch": 0.1251328640680264, "grad_norm": 0.868607759475708, "learning_rate": 0.0002888802186628995, "loss": 3.6044903564453126, "step": 61100 }, { "epoch": 0.12523526412045524, "grad_norm": 0.8410335183143616, "learning_rate": 0.00028886194107383535, "loss": 3.066201171875, "step": 61150 }, { "epoch": 0.12533766417288406, "grad_norm": 1.0808706283569336, "learning_rate": 0.00028884364905484784, "loss": 3.1906118774414063, "step": 61200 }, { "epoch": 0.12544006422531287, "grad_norm": 0.872553825378418, "learning_rate": 0.00028882534260783765, "loss": 3.3807113647460936, "step": 61250 }, { "epoch": 0.1255424642777417, "grad_norm": 0.9935702681541443, "learning_rate": 0.0002888070217347072, "loss": 3.3980447387695314, "step": 61300 }, { "epoch": 0.12564486433017052, "grad_norm": 0.8990649580955505, "learning_rate": 0.0002887886864373603, "loss": 3.4861651611328126, "step": 61350 }, { "epoch": 0.12574726438259937, "grad_norm": 0.8892736434936523, "learning_rate": 0.0002887703367177023, "loss": 3.9071136474609376, "step": 61400 }, { "epoch": 0.12584966443502818, "grad_norm": 0.7861908078193665, "learning_rate": 0.00028875197257763997, "loss": 3.886827392578125, "step": 61450 }, { "epoch": 0.12595206448745702, "grad_norm": 0.8096019625663757, "learning_rate": 0.0002887335940190817, "loss": 2.9763027954101564, "step": 61500 }, { "epoch": 0.12605446453988584, "grad_norm": 0.8015087246894836, "learning_rate": 0.00028871520104393724, "loss": 3.5265399169921876, "step": 61550 }, { "epoch": 0.12615686459231468, "grad_norm": 1.0955448150634766, "learning_rate": 0.00028869679365411786, "loss": 3.746468811035156, "step": 61600 }, { "epoch": 0.1262592646447435, "grad_norm": 0.9293431043624878, "learning_rate": 0.00028867837185153654, "loss": 3.725838317871094, "step": 61650 }, { "epoch": 0.12636166469717233, "grad_norm": 0.881248950958252, "learning_rate": 0.0002886599356381075, "loss": 3.873548583984375, "step": 61700 }, { "epoch": 0.12646406474960115, "grad_norm": 0.7995479702949524, "learning_rate": 0.00028864148501574655, "loss": 3.55103515625, "step": 61750 }, { "epoch": 0.12656646480203, "grad_norm": 0.7834081053733826, "learning_rate": 0.00028862301998637096, "loss": 3.5016546630859375, "step": 61800 }, { "epoch": 0.1266688648544588, "grad_norm": 0.8396415710449219, "learning_rate": 0.00028860454055189955, "loss": 3.15347900390625, "step": 61850 }, { "epoch": 0.12677126490688764, "grad_norm": 0.7540357112884521, "learning_rate": 0.00028858604671425266, "loss": 3.5248077392578123, "step": 61900 }, { "epoch": 0.12687366495931646, "grad_norm": 1.1297228336334229, "learning_rate": 0.00028856753847535213, "loss": 3.4668838500976564, "step": 61950 }, { "epoch": 0.1269760650117453, "grad_norm": 0.7924526929855347, "learning_rate": 0.0002885490158371212, "loss": 3.7679620361328126, "step": 62000 }, { "epoch": 0.1270784650641741, "grad_norm": 0.8227761387825012, "learning_rate": 0.0002885304788014846, "loss": 3.809046325683594, "step": 62050 }, { "epoch": 0.12718086511660295, "grad_norm": 0.8400523662567139, "learning_rate": 0.0002885119273703687, "loss": 3.74009765625, "step": 62100 }, { "epoch": 0.12728326516903177, "grad_norm": 1.1692306995391846, "learning_rate": 0.0002884933615457012, "loss": 4.000062866210937, "step": 62150 }, { "epoch": 0.12738566522146058, "grad_norm": 0.7943342328071594, "learning_rate": 0.00028847478132941153, "loss": 3.8031546020507814, "step": 62200 }, { "epoch": 0.12748806527388942, "grad_norm": 0.9809468984603882, "learning_rate": 0.0002884561867234303, "loss": 3.805234680175781, "step": 62250 }, { "epoch": 0.12759046532631824, "grad_norm": 0.9183539748191833, "learning_rate": 0.00028843757772968994, "loss": 4.105808715820313, "step": 62300 }, { "epoch": 0.12769286537874708, "grad_norm": 0.9354544281959534, "learning_rate": 0.0002884189543501241, "loss": 3.7343814086914064, "step": 62350 }, { "epoch": 0.1277952654311759, "grad_norm": 0.8216899633407593, "learning_rate": 0.00028840031658666803, "loss": 3.678810729980469, "step": 62400 }, { "epoch": 0.12789766548360473, "grad_norm": 0.8827342987060547, "learning_rate": 0.00028838166444125857, "loss": 3.634096374511719, "step": 62450 }, { "epoch": 0.12800006553603355, "grad_norm": 0.9468240141868591, "learning_rate": 0.00028836299791583386, "loss": 3.0597830200195313, "step": 62500 }, { "epoch": 0.1281024655884624, "grad_norm": 0.9269732236862183, "learning_rate": 0.00028834431701233376, "loss": 3.667522277832031, "step": 62550 }, { "epoch": 0.1282048656408912, "grad_norm": 0.7396625280380249, "learning_rate": 0.0002883256217326994, "loss": 3.823531494140625, "step": 62600 }, { "epoch": 0.12830726569332004, "grad_norm": 0.653838574886322, "learning_rate": 0.0002883069120788737, "loss": 3.5563314819335936, "step": 62650 }, { "epoch": 0.12840966574574886, "grad_norm": 0.8573964834213257, "learning_rate": 0.0002882881880528006, "loss": 3.9133944702148438, "step": 62700 }, { "epoch": 0.1285120657981777, "grad_norm": 0.8567407727241516, "learning_rate": 0.00028826944965642604, "loss": 3.63771484375, "step": 62750 }, { "epoch": 0.1286144658506065, "grad_norm": 0.8221452832221985, "learning_rate": 0.00028825069689169706, "loss": 3.7375106811523438, "step": 62800 }, { "epoch": 0.12871686590303535, "grad_norm": 0.8458483815193176, "learning_rate": 0.0002882319297605626, "loss": 3.4764666748046875, "step": 62850 }, { "epoch": 0.12881926595546417, "grad_norm": 0.5829837322235107, "learning_rate": 0.0002882131482649727, "loss": 3.4318243408203126, "step": 62900 }, { "epoch": 0.128921666007893, "grad_norm": 0.6864559054374695, "learning_rate": 0.000288194352406879, "loss": 2.9922601318359376, "step": 62950 }, { "epoch": 0.12902406606032182, "grad_norm": 0.7200921177864075, "learning_rate": 0.0002881755421882348, "loss": 3.436331481933594, "step": 63000 }, { "epoch": 0.12912646611275064, "grad_norm": 0.8018766045570374, "learning_rate": 0.00028815671761099474, "loss": 3.8753070068359374, "step": 63050 }, { "epoch": 0.12922886616517948, "grad_norm": 0.7417867183685303, "learning_rate": 0.00028813787867711495, "loss": 3.4831881713867188, "step": 63100 }, { "epoch": 0.1293312662176083, "grad_norm": 0.8872492909431458, "learning_rate": 0.0002881190253885531, "loss": 3.471279296875, "step": 63150 }, { "epoch": 0.12943366627003713, "grad_norm": 0.9026205539703369, "learning_rate": 0.00028810015774726844, "loss": 3.930486755371094, "step": 63200 }, { "epoch": 0.12953606632246595, "grad_norm": 0.8319406509399414, "learning_rate": 0.0002880812757552215, "loss": 3.876917419433594, "step": 63250 }, { "epoch": 0.1296384663748948, "grad_norm": 0.7153857946395874, "learning_rate": 0.00028806237941437444, "loss": 3.4448760986328124, "step": 63300 }, { "epoch": 0.1297408664273236, "grad_norm": 0.7869312763214111, "learning_rate": 0.00028804346872669085, "loss": 3.9350848388671875, "step": 63350 }, { "epoch": 0.12984326647975244, "grad_norm": 0.7719307541847229, "learning_rate": 0.00028802454369413594, "loss": 3.8888482666015625, "step": 63400 }, { "epoch": 0.12994566653218126, "grad_norm": 1.150686502456665, "learning_rate": 0.00028800560431867624, "loss": 2.990634765625, "step": 63450 }, { "epoch": 0.1300480665846101, "grad_norm": 0.7204848527908325, "learning_rate": 0.00028798665060227984, "loss": 2.155850067138672, "step": 63500 }, { "epoch": 0.13015046663703891, "grad_norm": 1.2251051664352417, "learning_rate": 0.0002879676825469164, "loss": 3.9703302001953125, "step": 63550 }, { "epoch": 0.13025286668946776, "grad_norm": 0.7243852615356445, "learning_rate": 0.00028794870015455695, "loss": 3.6895037841796876, "step": 63600 }, { "epoch": 0.13035526674189657, "grad_norm": 0.8284658193588257, "learning_rate": 0.00028792970342717407, "loss": 3.7690008544921874, "step": 63650 }, { "epoch": 0.1304576667943254, "grad_norm": 0.05757651478052139, "learning_rate": 0.0002879106923667418, "loss": 1.9905595397949218, "step": 63700 }, { "epoch": 0.13056006684675422, "grad_norm": 0.7437557578086853, "learning_rate": 0.0002878916669752357, "loss": 1.6622731018066406, "step": 63750 }, { "epoch": 0.13066246689918307, "grad_norm": 0.8517412543296814, "learning_rate": 0.0002878726272546328, "loss": 4.094966430664062, "step": 63800 }, { "epoch": 0.13076486695161188, "grad_norm": 0.8423788547515869, "learning_rate": 0.00028785357320691154, "loss": 4.379864196777344, "step": 63850 }, { "epoch": 0.1308672670040407, "grad_norm": 0.853164792060852, "learning_rate": 0.0002878345048340521, "loss": 3.3010690307617185, "step": 63900 }, { "epoch": 0.13096966705646954, "grad_norm": 0.7724624872207642, "learning_rate": 0.00028781542213803587, "loss": 3.83298095703125, "step": 63950 }, { "epoch": 0.13107206710889835, "grad_norm": 0.599814236164093, "learning_rate": 0.0002877963251208459, "loss": 3.6117398071289064, "step": 64000 }, { "epoch": 0.1311744671613272, "grad_norm": 0.87944096326828, "learning_rate": 0.00028777721378446655, "loss": 3.77650390625, "step": 64050 }, { "epoch": 0.131276867213756, "grad_norm": 0.8216091990470886, "learning_rate": 0.000287758088130884, "loss": 3.966680603027344, "step": 64100 }, { "epoch": 0.13137926726618485, "grad_norm": 0.9879843592643738, "learning_rate": 0.00028773894816208547, "loss": 3.368244323730469, "step": 64150 }, { "epoch": 0.13148166731861366, "grad_norm": 0.7889556288719177, "learning_rate": 0.00028771979388006, "loss": 3.514817199707031, "step": 64200 }, { "epoch": 0.1315840673710425, "grad_norm": 0.7712281942367554, "learning_rate": 0.00028770062528679814, "loss": 3.8969122314453126, "step": 64250 }, { "epoch": 0.13168646742347132, "grad_norm": 0.6825302243232727, "learning_rate": 0.0002876814423842916, "loss": 2.702755126953125, "step": 64300 }, { "epoch": 0.13178886747590016, "grad_norm": 0.7740472555160522, "learning_rate": 0.0002876622451745339, "loss": 3.027957763671875, "step": 64350 }, { "epoch": 0.13189126752832897, "grad_norm": 0.7272697687149048, "learning_rate": 0.00028764303365951986, "loss": 3.3588211059570314, "step": 64400 }, { "epoch": 0.1319936675807578, "grad_norm": 0.8405432105064392, "learning_rate": 0.00028762380784124597, "loss": 3.6030569458007813, "step": 64450 }, { "epoch": 0.13209606763318663, "grad_norm": 0.8467888236045837, "learning_rate": 0.00028760456772171004, "loss": 3.6072647094726564, "step": 64500 }, { "epoch": 0.13219846768561547, "grad_norm": 0.7771287560462952, "learning_rate": 0.0002875853133029113, "loss": 3.8847897338867186, "step": 64550 }, { "epoch": 0.13230086773804428, "grad_norm": 0.9590752720832825, "learning_rate": 0.0002875660445868507, "loss": 3.6549798583984376, "step": 64600 }, { "epoch": 0.13240326779047312, "grad_norm": 0.7539810538291931, "learning_rate": 0.0002875467615755306, "loss": 3.3866226196289064, "step": 64650 }, { "epoch": 0.13250566784290194, "grad_norm": 0.8308656215667725, "learning_rate": 0.0002875274642709548, "loss": 3.721044006347656, "step": 64700 }, { "epoch": 0.13260806789533075, "grad_norm": 0.8056835532188416, "learning_rate": 0.00028750815267512847, "loss": 3.9817669677734373, "step": 64750 }, { "epoch": 0.1327104679477596, "grad_norm": 1.1581485271453857, "learning_rate": 0.0002874888267900585, "loss": 3.785094909667969, "step": 64800 }, { "epoch": 0.1328128680001884, "grad_norm": 0.6927155256271362, "learning_rate": 0.0002874694866177531, "loss": 3.867703857421875, "step": 64850 }, { "epoch": 0.13291526805261725, "grad_norm": 1.1712969541549683, "learning_rate": 0.00028745013216022197, "loss": 3.79897705078125, "step": 64900 }, { "epoch": 0.13301766810504606, "grad_norm": 0.7104830741882324, "learning_rate": 0.0002874307634194765, "loss": 3.595622863769531, "step": 64950 }, { "epoch": 0.1331200681574749, "grad_norm": 0.8754029273986816, "learning_rate": 0.00028741138039752923, "loss": 3.6854147338867187, "step": 65000 }, { "epoch": 0.13322246820990372, "grad_norm": 0.8316354751586914, "learning_rate": 0.00028739198309639445, "loss": 3.955341491699219, "step": 65050 }, { "epoch": 0.13332486826233256, "grad_norm": 0.7100203037261963, "learning_rate": 0.00028737257151808783, "loss": 3.53195556640625, "step": 65100 }, { "epoch": 0.13342726831476137, "grad_norm": 0.7703724503517151, "learning_rate": 0.00028735314566462653, "loss": 3.5027481079101563, "step": 65150 }, { "epoch": 0.1335296683671902, "grad_norm": 0.6825149059295654, "learning_rate": 0.00028733370553802917, "loss": 2.5823513793945314, "step": 65200 }, { "epoch": 0.13363206841961903, "grad_norm": 0.7070282101631165, "learning_rate": 0.00028731425114031595, "loss": 3.5302462768554688, "step": 65250 }, { "epoch": 0.13373446847204787, "grad_norm": 0.8907217383384705, "learning_rate": 0.0002872947824735084, "loss": 3.7343438720703124, "step": 65300 }, { "epoch": 0.13383686852447668, "grad_norm": 0.6310061812400818, "learning_rate": 0.00028727529953962973, "loss": 3.046968688964844, "step": 65350 }, { "epoch": 0.13393926857690552, "grad_norm": 0.830430269241333, "learning_rate": 0.00028725580234070444, "loss": 3.7792376708984374, "step": 65400 }, { "epoch": 0.13404166862933434, "grad_norm": 0.7595807313919067, "learning_rate": 0.0002872362908787586, "loss": 4.01267578125, "step": 65450 }, { "epoch": 0.13414406868176318, "grad_norm": 0.939785897731781, "learning_rate": 0.00028721676515581975, "loss": 3.7015313720703125, "step": 65500 }, { "epoch": 0.134246468734192, "grad_norm": 0.7830142378807068, "learning_rate": 0.00028719722517391694, "loss": 3.7573947143554687, "step": 65550 }, { "epoch": 0.1343488687866208, "grad_norm": 0.8249261379241943, "learning_rate": 0.00028717767093508066, "loss": 2.924357604980469, "step": 65600 }, { "epoch": 0.13445126883904965, "grad_norm": 0.8103399276733398, "learning_rate": 0.00028715810244134293, "loss": 3.1508941650390625, "step": 65650 }, { "epoch": 0.13455366889147846, "grad_norm": 1.0751904249191284, "learning_rate": 0.0002871385196947372, "loss": 3.2744952392578126, "step": 65700 }, { "epoch": 0.1346560689439073, "grad_norm": 0.8905739188194275, "learning_rate": 0.0002871189226972984, "loss": 3.7018252563476564, "step": 65750 }, { "epoch": 0.13475846899633612, "grad_norm": 0.9014281630516052, "learning_rate": 0.00028709931145106304, "loss": 3.712538757324219, "step": 65800 }, { "epoch": 0.13486086904876496, "grad_norm": 0.9917147159576416, "learning_rate": 0.000287079685958069, "loss": 3.321624755859375, "step": 65850 }, { "epoch": 0.13496326910119377, "grad_norm": 0.9449427127838135, "learning_rate": 0.0002870600462203556, "loss": 3.589186096191406, "step": 65900 }, { "epoch": 0.13506566915362261, "grad_norm": 0.8208171725273132, "learning_rate": 0.00028704039223996383, "loss": 3.7818731689453124, "step": 65950 }, { "epoch": 0.13516806920605143, "grad_norm": 0.8270769119262695, "learning_rate": 0.0002870207240189359, "loss": 3.731416015625, "step": 66000 }, { "epoch": 0.13527046925848027, "grad_norm": 1.045253038406372, "learning_rate": 0.0002870010415593159, "loss": 3.7312091064453123, "step": 66050 }, { "epoch": 0.13537286931090908, "grad_norm": 0.7662860155105591, "learning_rate": 0.00028698134486314884, "loss": 3.7503961181640624, "step": 66100 }, { "epoch": 0.13547526936333792, "grad_norm": 0.7599702477455139, "learning_rate": 0.0002869616339324817, "loss": 2.9531689453125, "step": 66150 }, { "epoch": 0.13557766941576674, "grad_norm": 0.9016150236129761, "learning_rate": 0.00028694190876936274, "loss": 3.9108657836914062, "step": 66200 }, { "epoch": 0.13568006946819558, "grad_norm": 0.9253189563751221, "learning_rate": 0.00028692216937584164, "loss": 3.645496520996094, "step": 66250 }, { "epoch": 0.1357824695206244, "grad_norm": 0.9780471324920654, "learning_rate": 0.0002869024157539697, "loss": 3.7777984619140623, "step": 66300 }, { "epoch": 0.13588486957305324, "grad_norm": 0.7383785843849182, "learning_rate": 0.00028688264790579956, "loss": 3.588190002441406, "step": 66350 }, { "epoch": 0.13598726962548205, "grad_norm": 0.8228618502616882, "learning_rate": 0.00028686286583338554, "loss": 3.2836099243164063, "step": 66400 }, { "epoch": 0.13608966967791086, "grad_norm": 0.989874541759491, "learning_rate": 0.00028684306953878316, "loss": 3.2741690063476563, "step": 66450 }, { "epoch": 0.1361920697303397, "grad_norm": 0.6227463483810425, "learning_rate": 0.00028682325902404957, "loss": 3.5655419921875, "step": 66500 }, { "epoch": 0.13629446978276852, "grad_norm": 0.9205330014228821, "learning_rate": 0.00028680343429124356, "loss": 3.891072998046875, "step": 66550 }, { "epoch": 0.13639686983519736, "grad_norm": 0.9149171113967896, "learning_rate": 0.000286783595342425, "loss": 3.59095458984375, "step": 66600 }, { "epoch": 0.13649926988762617, "grad_norm": 0.9638737440109253, "learning_rate": 0.00028676374217965567, "loss": 3.623572998046875, "step": 66650 }, { "epoch": 0.13660166994005501, "grad_norm": 1.3770073652267456, "learning_rate": 0.0002867438748049985, "loss": 3.716294250488281, "step": 66700 }, { "epoch": 0.13670406999248383, "grad_norm": 0.7525309324264526, "learning_rate": 0.000286723993220518, "loss": 4.217930603027344, "step": 66750 }, { "epoch": 0.13680647004491267, "grad_norm": 0.8076726198196411, "learning_rate": 0.0002867040974282803, "loss": 3.8803009033203124, "step": 66800 }, { "epoch": 0.13690887009734148, "grad_norm": 0.6948472261428833, "learning_rate": 0.00028668418743035275, "loss": 3.5436331176757814, "step": 66850 }, { "epoch": 0.13701127014977033, "grad_norm": 0.8509873151779175, "learning_rate": 0.00028666426322880443, "loss": 3.499276428222656, "step": 66900 }, { "epoch": 0.13711367020219914, "grad_norm": 0.734075665473938, "learning_rate": 0.0002866443248257057, "loss": 3.3526876831054686, "step": 66950 }, { "epoch": 0.13721607025462798, "grad_norm": 0.8169065713882446, "learning_rate": 0.0002866243722231285, "loss": 3.3126312255859376, "step": 67000 }, { "epoch": 0.1373184703070568, "grad_norm": 0.8438522219657898, "learning_rate": 0.0002866044054231462, "loss": 3.438792419433594, "step": 67050 }, { "epoch": 0.13742087035948564, "grad_norm": 0.8047662973403931, "learning_rate": 0.00028658442442783364, "loss": 3.4803237915039062, "step": 67100 }, { "epoch": 0.13752327041191445, "grad_norm": 0.7526935338973999, "learning_rate": 0.00028656442923926723, "loss": 3.5479522705078126, "step": 67150 }, { "epoch": 0.1376256704643433, "grad_norm": 0.8287502527236938, "learning_rate": 0.0002865444198595247, "loss": 3.7390045166015624, "step": 67200 }, { "epoch": 0.1377280705167721, "grad_norm": 0.6148055791854858, "learning_rate": 0.00028652439629068535, "loss": 3.7372897338867186, "step": 67250 }, { "epoch": 0.13783047056920092, "grad_norm": 0.8581375479698181, "learning_rate": 0.00028650435853483006, "loss": 2.9981643676757814, "step": 67300 }, { "epoch": 0.13793287062162976, "grad_norm": 1.0106624364852905, "learning_rate": 0.0002864843065940409, "loss": 3.4451068115234373, "step": 67350 }, { "epoch": 0.13803527067405857, "grad_norm": 0.938605785369873, "learning_rate": 0.0002864642404704017, "loss": 3.6765966796875, "step": 67400 }, { "epoch": 0.13813767072648742, "grad_norm": 1.1929186582565308, "learning_rate": 0.0002864441601659975, "loss": 3.147588195800781, "step": 67450 }, { "epoch": 0.13824007077891623, "grad_norm": 0.5836741328239441, "learning_rate": 0.00028642406568291513, "loss": 2.8205252075195313, "step": 67500 }, { "epoch": 0.13834247083134507, "grad_norm": 0.8532480001449585, "learning_rate": 0.0002864039570232426, "loss": 3.003614501953125, "step": 67550 }, { "epoch": 0.13844487088377389, "grad_norm": 0.7523052096366882, "learning_rate": 0.0002863838341890696, "loss": 3.6742901611328125, "step": 67600 }, { "epoch": 0.13854727093620273, "grad_norm": 0.8439714908599854, "learning_rate": 0.0002863636971824872, "loss": 3.3492770385742188, "step": 67650 }, { "epoch": 0.13864967098863154, "grad_norm": 0.8030802607536316, "learning_rate": 0.00028634354600558785, "loss": 3.6775198364257813, "step": 67700 }, { "epoch": 0.13875207104106038, "grad_norm": 0.8020223379135132, "learning_rate": 0.00028632338066046566, "loss": 3.622167663574219, "step": 67750 }, { "epoch": 0.1388544710934892, "grad_norm": 0.7629789710044861, "learning_rate": 0.00028630320114921606, "loss": 3.1032611083984376, "step": 67800 }, { "epoch": 0.13895687114591804, "grad_norm": 0.8953397274017334, "learning_rate": 0.0002862830074739361, "loss": 3.3124514770507814, "step": 67850 }, { "epoch": 0.13905927119834685, "grad_norm": 0.7486206293106079, "learning_rate": 0.00028626279963672415, "loss": 3.154571838378906, "step": 67900 }, { "epoch": 0.1391616712507757, "grad_norm": 0.8250375986099243, "learning_rate": 0.00028624257763968015, "loss": 3.6296453857421875, "step": 67950 }, { "epoch": 0.1392640713032045, "grad_norm": 1.0587407350540161, "learning_rate": 0.00028622234148490544, "loss": 3.5324700927734374, "step": 68000 }, { "epoch": 0.13936647135563335, "grad_norm": 0.7875683903694153, "learning_rate": 0.00028620209117450295, "loss": 3.170576477050781, "step": 68050 }, { "epoch": 0.13946887140806216, "grad_norm": 1.1913716793060303, "learning_rate": 0.00028618182671057694, "loss": 3.6836483764648436, "step": 68100 }, { "epoch": 0.13957127146049098, "grad_norm": 0.8803089261054993, "learning_rate": 0.00028616154809523326, "loss": 3.468567199707031, "step": 68150 }, { "epoch": 0.13967367151291982, "grad_norm": 0.6812267303466797, "learning_rate": 0.00028614125533057906, "loss": 3.56980712890625, "step": 68200 }, { "epoch": 0.13977607156534863, "grad_norm": 0.6622804999351501, "learning_rate": 0.0002861209484187232, "loss": 3.2988763427734376, "step": 68250 }, { "epoch": 0.13987847161777747, "grad_norm": 0.8914295434951782, "learning_rate": 0.0002861006273617758, "loss": 3.1821719360351564, "step": 68300 }, { "epoch": 0.13998087167020629, "grad_norm": 0.9383370876312256, "learning_rate": 0.00028608029216184867, "loss": 3.6463201904296874, "step": 68350 }, { "epoch": 0.14008327172263513, "grad_norm": 0.795408308506012, "learning_rate": 0.0002860599428210548, "loss": 3.4441323852539063, "step": 68400 }, { "epoch": 0.14018567177506394, "grad_norm": 0.9368188381195068, "learning_rate": 0.0002860395793415088, "loss": 3.4534127807617185, "step": 68450 }, { "epoch": 0.14028807182749278, "grad_norm": 0.9888190627098083, "learning_rate": 0.0002860192017253269, "loss": 3.839812927246094, "step": 68500 }, { "epoch": 0.1403904718799216, "grad_norm": 1.1791257858276367, "learning_rate": 0.0002859988099746266, "loss": 3.30308837890625, "step": 68550 }, { "epoch": 0.14049287193235044, "grad_norm": 0.8144651651382446, "learning_rate": 0.00028597840409152683, "loss": 3.5844757080078127, "step": 68600 }, { "epoch": 0.14059527198477925, "grad_norm": 0.8788326382637024, "learning_rate": 0.00028595798407814817, "loss": 3.5440103149414064, "step": 68650 }, { "epoch": 0.1406976720372081, "grad_norm": 0.754426121711731, "learning_rate": 0.00028593754993661247, "loss": 3.38293701171875, "step": 68700 }, { "epoch": 0.1408000720896369, "grad_norm": 0.8822509050369263, "learning_rate": 0.0002859171016690433, "loss": 3.699421691894531, "step": 68750 }, { "epoch": 0.14090247214206575, "grad_norm": 0.6882439255714417, "learning_rate": 0.00028589663927756546, "loss": 3.33095947265625, "step": 68800 }, { "epoch": 0.14100487219449456, "grad_norm": 0.8108435273170471, "learning_rate": 0.00028587616276430536, "loss": 3.5015853881835937, "step": 68850 }, { "epoch": 0.1411072722469234, "grad_norm": 0.6340552568435669, "learning_rate": 0.00028585567213139075, "loss": 3.374276123046875, "step": 68900 }, { "epoch": 0.14120967229935222, "grad_norm": 0.6358705163002014, "learning_rate": 0.0002858351673809511, "loss": 3.372686462402344, "step": 68950 }, { "epoch": 0.14131207235178103, "grad_norm": 0.6987962126731873, "learning_rate": 0.000285814648515117, "loss": 3.635752868652344, "step": 69000 }, { "epoch": 0.14141447240420987, "grad_norm": 0.9426242113113403, "learning_rate": 0.0002857941155360207, "loss": 3.790219421386719, "step": 69050 }, { "epoch": 0.1415168724566387, "grad_norm": 0.5323778986930847, "learning_rate": 0.000285773568445796, "loss": 3.5456610107421875, "step": 69100 }, { "epoch": 0.14161927250906753, "grad_norm": 0.7765032052993774, "learning_rate": 0.000285753007246578, "loss": 2.8608853149414064, "step": 69150 }, { "epoch": 0.14172167256149634, "grad_norm": 1.0102488994598389, "learning_rate": 0.0002857324319405033, "loss": 3.7360980224609377, "step": 69200 }, { "epoch": 0.14182407261392518, "grad_norm": 0.6676150560379028, "learning_rate": 0.00028571184252971, "loss": 3.7574533081054686, "step": 69250 }, { "epoch": 0.141926472666354, "grad_norm": 0.8389192223548889, "learning_rate": 0.00028569123901633773, "loss": 3.7205816650390626, "step": 69300 }, { "epoch": 0.14202887271878284, "grad_norm": 0.9630427956581116, "learning_rate": 0.0002856706214025275, "loss": 3.1625067138671876, "step": 69350 }, { "epoch": 0.14213127277121165, "grad_norm": 0.8320639729499817, "learning_rate": 0.0002856499896904217, "loss": 2.9422607421875, "step": 69400 }, { "epoch": 0.1422336728236405, "grad_norm": 0.9393151998519897, "learning_rate": 0.0002856293438821644, "loss": 3.5568783569335936, "step": 69450 }, { "epoch": 0.1423360728760693, "grad_norm": 0.8972524404525757, "learning_rate": 0.000285608683979901, "loss": 3.5590420532226563, "step": 69500 }, { "epoch": 0.14243847292849815, "grad_norm": 0.5622543096542358, "learning_rate": 0.00028558800998577835, "loss": 2.8899127197265626, "step": 69550 }, { "epoch": 0.14254087298092696, "grad_norm": 0.8466945886611938, "learning_rate": 0.00028556732190194485, "loss": 3.2979135131835937, "step": 69600 }, { "epoch": 0.1426432730333558, "grad_norm": 1.3375204801559448, "learning_rate": 0.00028554661973055026, "loss": 3.5246792602539063, "step": 69650 }, { "epoch": 0.14274567308578462, "grad_norm": 0.7531492114067078, "learning_rate": 0.00028552590347374586, "loss": 3.3118746948242186, "step": 69700 }, { "epoch": 0.14284807313821346, "grad_norm": 0.8651145100593567, "learning_rate": 0.00028550517313368444, "loss": 3.485458984375, "step": 69750 }, { "epoch": 0.14295047319064227, "grad_norm": 0.625991940498352, "learning_rate": 0.0002854844287125202, "loss": 3.164065246582031, "step": 69800 }, { "epoch": 0.1430528732430711, "grad_norm": 2.080458402633667, "learning_rate": 0.0002854636702124088, "loss": 3.0976217651367186, "step": 69850 }, { "epoch": 0.14315527329549993, "grad_norm": 0.7514007687568665, "learning_rate": 0.00028544289763550733, "loss": 3.78799072265625, "step": 69900 }, { "epoch": 0.14325767334792874, "grad_norm": 0.5652868151664734, "learning_rate": 0.00028542211098397447, "loss": 2.5083651733398438, "step": 69950 }, { "epoch": 0.14336007340035758, "grad_norm": 0.7237803339958191, "learning_rate": 0.0002854013102599702, "loss": 3.534099426269531, "step": 70000 }, { "epoch": 0.1434624734527864, "grad_norm": 0.8753382563591003, "learning_rate": 0.00028538049546565603, "loss": 4.047043762207031, "step": 70050 }, { "epoch": 0.14356487350521524, "grad_norm": 0.8999593257904053, "learning_rate": 0.000285359666603195, "loss": 3.6831619262695314, "step": 70100 }, { "epoch": 0.14366727355764405, "grad_norm": 0.7087032794952393, "learning_rate": 0.00028533882367475156, "loss": 2.866451416015625, "step": 70150 }, { "epoch": 0.1437696736100729, "grad_norm": 0.6140325665473938, "learning_rate": 0.0002853179666824916, "loss": 1.8367611694335937, "step": 70200 }, { "epoch": 0.1438720736625017, "grad_norm": 0.7460519671440125, "learning_rate": 0.0002852970956285824, "loss": 2.957001037597656, "step": 70250 }, { "epoch": 0.14397447371493055, "grad_norm": 1.0516009330749512, "learning_rate": 0.0002852762105151929, "loss": 2.5348553466796875, "step": 70300 }, { "epoch": 0.14407687376735936, "grad_norm": 0.5429277420043945, "learning_rate": 0.0002852553113444934, "loss": 3.53834228515625, "step": 70350 }, { "epoch": 0.1441792738197882, "grad_norm": 0.8015134334564209, "learning_rate": 0.0002852343981186556, "loss": 3.64453857421875, "step": 70400 }, { "epoch": 0.14428167387221702, "grad_norm": 0.7445142269134521, "learning_rate": 0.00028521347083985266, "loss": 3.6188226318359376, "step": 70450 }, { "epoch": 0.14438407392464586, "grad_norm": 0.9419240355491638, "learning_rate": 0.00028519252951025935, "loss": 2.8771868896484376, "step": 70500 }, { "epoch": 0.14448647397707468, "grad_norm": 0.8755508065223694, "learning_rate": 0.0002851715741320517, "loss": 3.6672409057617186, "step": 70550 }, { "epoch": 0.14458887402950352, "grad_norm": 0.6970762014389038, "learning_rate": 0.00028515060470740743, "loss": 3.7528070068359374, "step": 70600 }, { "epoch": 0.14469127408193233, "grad_norm": 0.5237036943435669, "learning_rate": 0.0002851296212385055, "loss": 3.318054504394531, "step": 70650 }, { "epoch": 0.14479367413436114, "grad_norm": 0.7218162417411804, "learning_rate": 0.0002851086237275264, "loss": 1.9354142761230468, "step": 70700 }, { "epoch": 0.14489607418678999, "grad_norm": 0.46474677324295044, "learning_rate": 0.00028508761217665215, "loss": 1.882958221435547, "step": 70750 }, { "epoch": 0.1449984742392188, "grad_norm": 0.665745198726654, "learning_rate": 0.0002850665865880662, "loss": 1.9264730834960937, "step": 70800 }, { "epoch": 0.14510087429164764, "grad_norm": 0.9114018082618713, "learning_rate": 0.00028504554696395334, "loss": 2.677998046875, "step": 70850 }, { "epoch": 0.14520327434407646, "grad_norm": 0.729942798614502, "learning_rate": 0.0002850244933065, "loss": 3.4295562744140624, "step": 70900 }, { "epoch": 0.1453056743965053, "grad_norm": 1.0335681438446045, "learning_rate": 0.000285003425617894, "loss": 3.524678039550781, "step": 70950 }, { "epoch": 0.1454080744489341, "grad_norm": 1.325173258781433, "learning_rate": 0.00028498234390032453, "loss": 3.4061398315429687, "step": 71000 }, { "epoch": 0.14551047450136295, "grad_norm": 0.7562994956970215, "learning_rate": 0.00028496124815598233, "loss": 3.4216473388671873, "step": 71050 }, { "epoch": 0.14561287455379177, "grad_norm": 0.8231451511383057, "learning_rate": 0.00028494013838705964, "loss": 3.4331399536132814, "step": 71100 }, { "epoch": 0.1457152746062206, "grad_norm": 0.9096212387084961, "learning_rate": 0.00028491901459575, "loss": 3.2372637939453126, "step": 71150 }, { "epoch": 0.14581767465864942, "grad_norm": 0.8419906497001648, "learning_rate": 0.00028489787678424855, "loss": 3.490650329589844, "step": 71200 }, { "epoch": 0.14592007471107826, "grad_norm": 0.9181749224662781, "learning_rate": 0.00028487672495475187, "loss": 3.988592224121094, "step": 71250 }, { "epoch": 0.14602247476350708, "grad_norm": 0.7534500360488892, "learning_rate": 0.0002848555591094579, "loss": 3.256888427734375, "step": 71300 }, { "epoch": 0.14612487481593592, "grad_norm": 1.1033469438552856, "learning_rate": 0.00028483437925056615, "loss": 3.3334320068359373, "step": 71350 }, { "epoch": 0.14622727486836473, "grad_norm": 0.9356803894042969, "learning_rate": 0.0002848131853802775, "loss": 2.933785705566406, "step": 71400 }, { "epoch": 0.14632967492079357, "grad_norm": 0.8622822165489197, "learning_rate": 0.00028479197750079434, "loss": 3.4190252685546874, "step": 71450 }, { "epoch": 0.1464320749732224, "grad_norm": 0.7642265558242798, "learning_rate": 0.0002847707556143205, "loss": 3.637124328613281, "step": 71500 }, { "epoch": 0.1465344750256512, "grad_norm": 0.8362610340118408, "learning_rate": 0.0002847495197230613, "loss": 3.78397705078125, "step": 71550 }, { "epoch": 0.14663687507808004, "grad_norm": 0.7937034964561462, "learning_rate": 0.0002847282698292234, "loss": 3.7184579467773435, "step": 71600 }, { "epoch": 0.14673927513050886, "grad_norm": 0.8799037933349609, "learning_rate": 0.0002847070059350151, "loss": 3.535165100097656, "step": 71650 }, { "epoch": 0.1468416751829377, "grad_norm": 0.7818918824195862, "learning_rate": 0.000284685728042646, "loss": 3.6417041015625, "step": 71700 }, { "epoch": 0.1469440752353665, "grad_norm": 0.84147709608078, "learning_rate": 0.00028466443615432713, "loss": 3.487315673828125, "step": 71750 }, { "epoch": 0.14704647528779535, "grad_norm": 0.6987602710723877, "learning_rate": 0.00028464313027227117, "loss": 3.3947982788085938, "step": 71800 }, { "epoch": 0.14714887534022417, "grad_norm": 0.7040350437164307, "learning_rate": 0.0002846218103986921, "loss": 3.48228271484375, "step": 71850 }, { "epoch": 0.147251275392653, "grad_norm": 0.753700852394104, "learning_rate": 0.0002846004765358053, "loss": 2.8886663818359377, "step": 71900 }, { "epoch": 0.14735367544508182, "grad_norm": 0.6964828968048096, "learning_rate": 0.0002845791286858278, "loss": 3.3485205078125, "step": 71950 }, { "epoch": 0.14745607549751066, "grad_norm": 0.7957376837730408, "learning_rate": 0.00028455776685097796, "loss": 4.061175842285156, "step": 72000 }, { "epoch": 0.14755847554993948, "grad_norm": 0.8718839883804321, "learning_rate": 0.00028453639103347557, "loss": 3.8466424560546875, "step": 72050 }, { "epoch": 0.14766087560236832, "grad_norm": 0.6680410504341125, "learning_rate": 0.00028451500123554194, "loss": 3.4465017700195313, "step": 72100 }, { "epoch": 0.14776327565479713, "grad_norm": 0.7632986903190613, "learning_rate": 0.0002844935974593998, "loss": 3.496392822265625, "step": 72150 }, { "epoch": 0.14786567570722597, "grad_norm": 0.8088258504867554, "learning_rate": 0.0002844721797072733, "loss": 3.6753024291992187, "step": 72200 }, { "epoch": 0.1479680757596548, "grad_norm": 0.7746132016181946, "learning_rate": 0.0002844507479813881, "loss": 3.4665756225585938, "step": 72250 }, { "epoch": 0.14807047581208363, "grad_norm": 0.9574618339538574, "learning_rate": 0.00028442930228397134, "loss": 3.6266830444335936, "step": 72300 }, { "epoch": 0.14817287586451244, "grad_norm": 3.6261954307556152, "learning_rate": 0.0002844078426172515, "loss": 3.3307794189453124, "step": 72350 }, { "epoch": 0.14827527591694126, "grad_norm": 1.3732120990753174, "learning_rate": 0.00028438636898345856, "loss": 3.371138000488281, "step": 72400 }, { "epoch": 0.1483776759693701, "grad_norm": 0.8364174962043762, "learning_rate": 0.00028436488138482407, "loss": 3.6167337036132814, "step": 72450 }, { "epoch": 0.1484800760217989, "grad_norm": 0.7466509938240051, "learning_rate": 0.0002843433798235808, "loss": 3.7814892578125, "step": 72500 }, { "epoch": 0.14858247607422775, "grad_norm": 0.8802339434623718, "learning_rate": 0.00028432186430196315, "loss": 3.364360656738281, "step": 72550 }, { "epoch": 0.14868487612665657, "grad_norm": 0.6437531113624573, "learning_rate": 0.00028430033482220693, "loss": 3.211015625, "step": 72600 }, { "epoch": 0.1487872761790854, "grad_norm": 0.7954172492027283, "learning_rate": 0.0002842787913865494, "loss": 3.1693716430664063, "step": 72650 }, { "epoch": 0.14888967623151422, "grad_norm": 0.735313892364502, "learning_rate": 0.0002842572339972292, "loss": 3.4131680297851563, "step": 72700 }, { "epoch": 0.14899207628394306, "grad_norm": 1.2493815422058105, "learning_rate": 0.0002842356626564866, "loss": 2.6316799926757812, "step": 72750 }, { "epoch": 0.14909447633637188, "grad_norm": 0.506629228591919, "learning_rate": 0.00028421407736656305, "loss": 2.7435052490234373, "step": 72800 }, { "epoch": 0.14919687638880072, "grad_norm": 0.4948784112930298, "learning_rate": 0.0002841924781297017, "loss": 1.9055368041992187, "step": 72850 }, { "epoch": 0.14929927644122953, "grad_norm": 0.9006844162940979, "learning_rate": 0.000284170864948147, "loss": 2.6189675903320313, "step": 72900 }, { "epoch": 0.14940167649365838, "grad_norm": 0.6539232134819031, "learning_rate": 0.00028414923782414496, "loss": 3.5976483154296877, "step": 72950 }, { "epoch": 0.1495040765460872, "grad_norm": 0.6356167793273926, "learning_rate": 0.0002841275967599429, "loss": 3.402171936035156, "step": 73000 }, { "epoch": 0.14960647659851603, "grad_norm": 1.5707745552062988, "learning_rate": 0.00028410594175778964, "loss": 3.7100360107421877, "step": 73050 }, { "epoch": 0.14970887665094484, "grad_norm": 0.5473312139511108, "learning_rate": 0.0002840842728199356, "loss": 2.539022674560547, "step": 73100 }, { "epoch": 0.14981127670337369, "grad_norm": 0.8895722031593323, "learning_rate": 0.00028406258994863245, "loss": 2.5107452392578127, "step": 73150 }, { "epoch": 0.1499136767558025, "grad_norm": 0.5692305564880371, "learning_rate": 0.00028404089314613333, "loss": 1.6453628540039062, "step": 73200 }, { "epoch": 0.1500160768082313, "grad_norm": 0.6427550315856934, "learning_rate": 0.00028401918241469294, "loss": 1.628760986328125, "step": 73250 }, { "epoch": 0.15011847686066015, "grad_norm": 0.705071747303009, "learning_rate": 0.0002839974577565674, "loss": 3.1914212036132814, "step": 73300 }, { "epoch": 0.15022087691308897, "grad_norm": 0.7019383907318115, "learning_rate": 0.0002839757191740141, "loss": 3.628504333496094, "step": 73350 }, { "epoch": 0.1503232769655178, "grad_norm": 0.8977711200714111, "learning_rate": 0.0002839539666692921, "loss": 3.583207092285156, "step": 73400 }, { "epoch": 0.15042567701794662, "grad_norm": 0.7372389435768127, "learning_rate": 0.00028393220024466187, "loss": 2.969400634765625, "step": 73450 }, { "epoch": 0.15052807707037547, "grad_norm": 0.7931883931159973, "learning_rate": 0.0002839104199023853, "loss": 3.232490234375, "step": 73500 }, { "epoch": 0.15063047712280428, "grad_norm": 0.6523383259773254, "learning_rate": 0.0002838886256447256, "loss": 3.325892028808594, "step": 73550 }, { "epoch": 0.15073287717523312, "grad_norm": 0.6729732155799866, "learning_rate": 0.00028386681747394755, "loss": 3.335216064453125, "step": 73600 }, { "epoch": 0.15083527722766193, "grad_norm": 0.818371057510376, "learning_rate": 0.0002838449953923174, "loss": 3.518477783203125, "step": 73650 }, { "epoch": 0.15093767728009078, "grad_norm": 0.7028401494026184, "learning_rate": 0.00028382315940210284, "loss": 3.509742431640625, "step": 73700 }, { "epoch": 0.1510400773325196, "grad_norm": 1.2517348527908325, "learning_rate": 0.0002838013095055729, "loss": 3.626214599609375, "step": 73750 }, { "epoch": 0.15114247738494843, "grad_norm": 0.7776418328285217, "learning_rate": 0.00028377944570499814, "loss": 3.3807473754882813, "step": 73800 }, { "epoch": 0.15124487743737725, "grad_norm": 1.246304988861084, "learning_rate": 0.0002837575680026506, "loss": 3.671220703125, "step": 73850 }, { "epoch": 0.1513472774898061, "grad_norm": 0.8468489050865173, "learning_rate": 0.00028373567640080366, "loss": 3.7939553833007813, "step": 73900 }, { "epoch": 0.1514496775422349, "grad_norm": 0.9071077108383179, "learning_rate": 0.0002837137709017322, "loss": 4.0460302734375, "step": 73950 }, { "epoch": 0.15155207759466374, "grad_norm": 0.8705784678459167, "learning_rate": 0.00028369185150771257, "loss": 3.0467730712890626, "step": 74000 }, { "epoch": 0.15165447764709256, "grad_norm": 1.1212836503982544, "learning_rate": 0.00028366991822102256, "loss": 3.2574063110351563, "step": 74050 }, { "epoch": 0.15175687769952137, "grad_norm": 0.6991548538208008, "learning_rate": 0.0002836479710439413, "loss": 3.221210632324219, "step": 74100 }, { "epoch": 0.1518592777519502, "grad_norm": 0.7652693390846252, "learning_rate": 0.00028362600997874953, "loss": 3.4262896728515626, "step": 74150 }, { "epoch": 0.15196167780437903, "grad_norm": 0.5328712463378906, "learning_rate": 0.00028360403502772927, "loss": 2.2504594421386717, "step": 74200 }, { "epoch": 0.15206407785680787, "grad_norm": 0.623674750328064, "learning_rate": 0.00028358204619316414, "loss": 1.6738412475585938, "step": 74250 }, { "epoch": 0.15216647790923668, "grad_norm": 0.7511982321739197, "learning_rate": 0.0002835600434773391, "loss": 3.5196023559570313, "step": 74300 }, { "epoch": 0.15226887796166552, "grad_norm": 0.7045626640319824, "learning_rate": 0.0002835380268825405, "loss": 3.4125076293945313, "step": 74350 }, { "epoch": 0.15237127801409434, "grad_norm": 0.688127875328064, "learning_rate": 0.00028351599641105634, "loss": 3.532620544433594, "step": 74400 }, { "epoch": 0.15247367806652318, "grad_norm": 0.7123726606369019, "learning_rate": 0.0002834939520651758, "loss": 3.450240478515625, "step": 74450 }, { "epoch": 0.152576078118952, "grad_norm": 0.6914170980453491, "learning_rate": 0.0002834718938471897, "loss": 3.7383859252929685, "step": 74500 }, { "epoch": 0.15267847817138083, "grad_norm": 1.8187841176986694, "learning_rate": 0.0002834498217593902, "loss": 3.352877197265625, "step": 74550 }, { "epoch": 0.15278087822380965, "grad_norm": 0.5876966118812561, "learning_rate": 0.00028342773580407104, "loss": 3.5138931274414062, "step": 74600 }, { "epoch": 0.1528832782762385, "grad_norm": 0.6575474143028259, "learning_rate": 0.00028340563598352716, "loss": 2.800203857421875, "step": 74650 }, { "epoch": 0.1529856783286673, "grad_norm": 0.9087119102478027, "learning_rate": 0.0002833835223000551, "loss": 3.22402587890625, "step": 74700 }, { "epoch": 0.15308807838109614, "grad_norm": 0.5269556641578674, "learning_rate": 0.0002833613947559529, "loss": 3.43998291015625, "step": 74750 }, { "epoch": 0.15319047843352496, "grad_norm": 0.7771069407463074, "learning_rate": 0.0002833392533535198, "loss": 3.7308123779296873, "step": 74800 }, { "epoch": 0.1532928784859538, "grad_norm": 0.9969513416290283, "learning_rate": 0.00028331709809505687, "loss": 3.7192803955078126, "step": 74850 }, { "epoch": 0.1533952785383826, "grad_norm": 0.705575704574585, "learning_rate": 0.00028329492898286623, "loss": 3.504131164550781, "step": 74900 }, { "epoch": 0.15349767859081143, "grad_norm": 0.5487853288650513, "learning_rate": 0.0002832727460192516, "loss": 2.8572216796875, "step": 74950 }, { "epoch": 0.15360007864324027, "grad_norm": 0.7012720108032227, "learning_rate": 0.00028325054920651813, "loss": 2.238103485107422, "step": 75000 }, { "epoch": 0.15370247869566908, "grad_norm": 0.7673011422157288, "learning_rate": 0.00028322833854697247, "loss": 3.7670169067382813, "step": 75050 }, { "epoch": 0.15380487874809792, "grad_norm": 0.7762806415557861, "learning_rate": 0.00028320611404292266, "loss": 3.65732177734375, "step": 75100 }, { "epoch": 0.15390727880052674, "grad_norm": 0.8351573348045349, "learning_rate": 0.0002831838756966781, "loss": 3.3902908325195313, "step": 75150 }, { "epoch": 0.15400967885295558, "grad_norm": 0.9058986306190491, "learning_rate": 0.00028316162351054976, "loss": 3.2655902099609375, "step": 75200 }, { "epoch": 0.1541120789053844, "grad_norm": 0.7718729376792908, "learning_rate": 0.0002831393574868499, "loss": 2.826809997558594, "step": 75250 }, { "epoch": 0.15421447895781323, "grad_norm": 0.8767629861831665, "learning_rate": 0.00028311707762789255, "loss": 3.345711975097656, "step": 75300 }, { "epoch": 0.15431687901024205, "grad_norm": 0.7267951369285583, "learning_rate": 0.00028309478393599263, "loss": 3.440138244628906, "step": 75350 }, { "epoch": 0.1544192790626709, "grad_norm": 0.8214264512062073, "learning_rate": 0.000283072476413467, "loss": 3.795940246582031, "step": 75400 }, { "epoch": 0.1545216791150997, "grad_norm": 0.7978084087371826, "learning_rate": 0.0002830501550626337, "loss": 3.481332092285156, "step": 75450 }, { "epoch": 0.15462407916752854, "grad_norm": 0.6108945608139038, "learning_rate": 0.0002830278198858122, "loss": 3.0552932739257814, "step": 75500 }, { "epoch": 0.15472647921995736, "grad_norm": 0.7742500901222229, "learning_rate": 0.0002830054708853236, "loss": 3.446549072265625, "step": 75550 }, { "epoch": 0.1548288792723862, "grad_norm": 0.8176902532577515, "learning_rate": 0.0002829831080634903, "loss": 3.7375308227539064, "step": 75600 }, { "epoch": 0.154931279324815, "grad_norm": 0.7569313645362854, "learning_rate": 0.00028296073142263596, "loss": 3.7137493896484375, "step": 75650 }, { "epoch": 0.15503367937724385, "grad_norm": 0.4838169515132904, "learning_rate": 0.00028293834096508613, "loss": 3.6915240478515625, "step": 75700 }, { "epoch": 0.15513607942967267, "grad_norm": 0.669226348400116, "learning_rate": 0.0002829159366931673, "loss": 3.220513916015625, "step": 75750 }, { "epoch": 0.15523847948210148, "grad_norm": 0.8119651079177856, "learning_rate": 0.0002828935186092078, "loss": 3.58036865234375, "step": 75800 }, { "epoch": 0.15534087953453032, "grad_norm": 0.5975949168205261, "learning_rate": 0.00028287108671553706, "loss": 3.3334951782226563, "step": 75850 }, { "epoch": 0.15544327958695914, "grad_norm": 0.8672727942466736, "learning_rate": 0.0002828486410144862, "loss": 3.343040771484375, "step": 75900 }, { "epoch": 0.15554567963938798, "grad_norm": 0.7106810212135315, "learning_rate": 0.0002828261815083877, "loss": 3.3205633544921875, "step": 75950 }, { "epoch": 0.1556480796918168, "grad_norm": 0.6641435623168945, "learning_rate": 0.0002828037081995754, "loss": 3.215283203125, "step": 76000 }, { "epoch": 0.15575047974424563, "grad_norm": 0.8929319977760315, "learning_rate": 0.0002827812210903846, "loss": 2.9793209838867187, "step": 76050 }, { "epoch": 0.15585287979667445, "grad_norm": 0.9121986627578735, "learning_rate": 0.0002827587201831522, "loss": 3.9058187866210936, "step": 76100 }, { "epoch": 0.1559552798491033, "grad_norm": 0.7762316465377808, "learning_rate": 0.00028273620548021624, "loss": 3.6440216064453126, "step": 76150 }, { "epoch": 0.1560576799015321, "grad_norm": 0.9442132711410522, "learning_rate": 0.0002827136769839164, "loss": 3.2715243530273437, "step": 76200 }, { "epoch": 0.15616007995396095, "grad_norm": 0.8481264710426331, "learning_rate": 0.00028269113469659373, "loss": 3.4502252197265624, "step": 76250 }, { "epoch": 0.15626248000638976, "grad_norm": 0.48445141315460205, "learning_rate": 0.00028266857862059076, "loss": 2.9071063232421874, "step": 76300 }, { "epoch": 0.1563648800588186, "grad_norm": 0.7879681587219238, "learning_rate": 0.00028264600875825145, "loss": 3.08685546875, "step": 76350 }, { "epoch": 0.15646728011124741, "grad_norm": 0.6723935604095459, "learning_rate": 0.00028262342511192106, "loss": 3.23456298828125, "step": 76400 }, { "epoch": 0.15656968016367626, "grad_norm": 0.8366503119468689, "learning_rate": 0.0002826008276839465, "loss": 3.12440185546875, "step": 76450 }, { "epoch": 0.15667208021610507, "grad_norm": 0.6648255586624146, "learning_rate": 0.00028257821647667585, "loss": 3.6282342529296874, "step": 76500 }, { "epoch": 0.1567744802685339, "grad_norm": 0.233867809176445, "learning_rate": 0.00028255559149245894, "loss": 2.4692172241210937, "step": 76550 }, { "epoch": 0.15687688032096272, "grad_norm": 0.9552132487297058, "learning_rate": 0.00028253295273364675, "loss": 2.3723199462890623, "step": 76600 }, { "epoch": 0.15697928037339154, "grad_norm": 0.9119518995285034, "learning_rate": 0.00028251030020259177, "loss": 3.5183111572265626, "step": 76650 }, { "epoch": 0.15708168042582038, "grad_norm": 0.7538524866104126, "learning_rate": 0.00028248763390164807, "loss": 3.69493896484375, "step": 76700 }, { "epoch": 0.1571840804782492, "grad_norm": 0.9746021628379822, "learning_rate": 0.00028246495383317093, "loss": 2.4459327697753905, "step": 76750 }, { "epoch": 0.15728648053067804, "grad_norm": 1.0355682373046875, "learning_rate": 0.0002824422599995172, "loss": 3.1521530151367188, "step": 76800 }, { "epoch": 0.15738888058310685, "grad_norm": 1.0927115678787231, "learning_rate": 0.00028241955240304513, "loss": 3.066300048828125, "step": 76850 }, { "epoch": 0.1574912806355357, "grad_norm": 0.9498497247695923, "learning_rate": 0.00028239683104611433, "loss": 3.6030181884765624, "step": 76900 }, { "epoch": 0.1575936806879645, "grad_norm": 0.6711775660514832, "learning_rate": 0.00028237409593108605, "loss": 2.9892807006835938, "step": 76950 }, { "epoch": 0.15769608074039335, "grad_norm": 0.8505134582519531, "learning_rate": 0.00028235134706032267, "loss": 3.558472900390625, "step": 77000 }, { "epoch": 0.15779848079282216, "grad_norm": 0.7711685299873352, "learning_rate": 0.0002823285844361883, "loss": 3.2466412353515626, "step": 77050 }, { "epoch": 0.157900880845251, "grad_norm": 0.8435817360877991, "learning_rate": 0.00028230580806104814, "loss": 3.2454754638671877, "step": 77100 }, { "epoch": 0.15800328089767982, "grad_norm": 0.7799451947212219, "learning_rate": 0.00028228301793726916, "loss": 3.4074356079101564, "step": 77150 }, { "epoch": 0.15810568095010866, "grad_norm": 0.7728955149650574, "learning_rate": 0.0002822602140672196, "loss": 3.511580505371094, "step": 77200 }, { "epoch": 0.15820808100253747, "grad_norm": 0.7063544988632202, "learning_rate": 0.0002822373964532691, "loss": 3.2738442993164063, "step": 77250 }, { "epoch": 0.1583104810549663, "grad_norm": 0.7023544907569885, "learning_rate": 0.00028221456509778875, "loss": 3.7345950317382814, "step": 77300 }, { "epoch": 0.15841288110739513, "grad_norm": 0.9115304946899414, "learning_rate": 0.0002821917200031511, "loss": 3.140256042480469, "step": 77350 }, { "epoch": 0.15851528115982397, "grad_norm": 0.7438466548919678, "learning_rate": 0.00028216886117173013, "loss": 3.7709716796875, "step": 77400 }, { "epoch": 0.15861768121225278, "grad_norm": 0.7658351063728333, "learning_rate": 0.0002821459886059013, "loss": 2.747354736328125, "step": 77450 }, { "epoch": 0.1587200812646816, "grad_norm": 0.6408258676528931, "learning_rate": 0.0002821231023080412, "loss": 3.999375915527344, "step": 77500 }, { "epoch": 0.15882248131711044, "grad_norm": 0.772813081741333, "learning_rate": 0.0002821002022805283, "loss": 3.42555419921875, "step": 77550 }, { "epoch": 0.15892488136953925, "grad_norm": 0.950205385684967, "learning_rate": 0.0002820772885257422, "loss": 3.6866400146484377, "step": 77600 }, { "epoch": 0.1590272814219681, "grad_norm": 0.5234514474868774, "learning_rate": 0.000282054361046064, "loss": 2.6636093139648436, "step": 77650 }, { "epoch": 0.1591296814743969, "grad_norm": 0.7165791988372803, "learning_rate": 0.0002820314198438761, "loss": 3.308489990234375, "step": 77700 }, { "epoch": 0.15923208152682575, "grad_norm": 0.7389218211174011, "learning_rate": 0.00028200846492156266, "loss": 3.701646728515625, "step": 77750 }, { "epoch": 0.15933448157925456, "grad_norm": 0.6821298599243164, "learning_rate": 0.0002819854962815089, "loss": 3.6112545776367186, "step": 77800 }, { "epoch": 0.1594368816316834, "grad_norm": 0.6218832731246948, "learning_rate": 0.00028196251392610173, "loss": 3.4739862060546876, "step": 77850 }, { "epoch": 0.15953928168411222, "grad_norm": 0.7211093902587891, "learning_rate": 0.00028193951785772923, "loss": 3.646156921386719, "step": 77900 }, { "epoch": 0.15964168173654106, "grad_norm": 0.9792724847793579, "learning_rate": 0.00028191650807878125, "loss": 2.4151596069335937, "step": 77950 }, { "epoch": 0.15974408178896987, "grad_norm": 0.6804146766662598, "learning_rate": 0.0002818934845916487, "loss": 2.791448059082031, "step": 78000 }, { "epoch": 0.1598464818413987, "grad_norm": 0.7963101863861084, "learning_rate": 0.0002818704473987241, "loss": 2.887415771484375, "step": 78050 }, { "epoch": 0.15994888189382753, "grad_norm": 1.2759873867034912, "learning_rate": 0.00028184739650240144, "loss": 3.3000274658203126, "step": 78100 }, { "epoch": 0.16005128194625637, "grad_norm": 0.9473127126693726, "learning_rate": 0.0002818243319050761, "loss": 3.280038146972656, "step": 78150 }, { "epoch": 0.16015368199868518, "grad_norm": 1.025072693824768, "learning_rate": 0.0002818012536091447, "loss": 3.278867492675781, "step": 78200 }, { "epoch": 0.16025608205111402, "grad_norm": 0.6705909967422485, "learning_rate": 0.00028177816161700553, "loss": 3.659829406738281, "step": 78250 }, { "epoch": 0.16035848210354284, "grad_norm": 0.5486139059066772, "learning_rate": 0.00028175505593105825, "loss": 2.785064697265625, "step": 78300 }, { "epoch": 0.16046088215597165, "grad_norm": 0.7707447409629822, "learning_rate": 0.0002817319365537038, "loss": 3.8123992919921874, "step": 78350 }, { "epoch": 0.1605632822084005, "grad_norm": 0.8866334557533264, "learning_rate": 0.0002817088034873448, "loss": 3.45692626953125, "step": 78400 }, { "epoch": 0.1606656822608293, "grad_norm": 0.6880883574485779, "learning_rate": 0.0002816856567343849, "loss": 3.526729736328125, "step": 78450 }, { "epoch": 0.16076808231325815, "grad_norm": 0.6768396496772766, "learning_rate": 0.00028166249629722956, "loss": 3.4408908081054688, "step": 78500 }, { "epoch": 0.16087048236568696, "grad_norm": 0.9719237685203552, "learning_rate": 0.0002816393221782856, "loss": 2.7149722290039064, "step": 78550 }, { "epoch": 0.1609728824181158, "grad_norm": 0.9855000376701355, "learning_rate": 0.000281616134379961, "loss": 3.5189361572265625, "step": 78600 }, { "epoch": 0.16107528247054462, "grad_norm": 0.8920623660087585, "learning_rate": 0.0002815929329046654, "loss": 3.5056884765625, "step": 78650 }, { "epoch": 0.16117768252297346, "grad_norm": 1.0610677003860474, "learning_rate": 0.0002815697177548098, "loss": 3.097916259765625, "step": 78700 }, { "epoch": 0.16128008257540227, "grad_norm": 0.8963422775268555, "learning_rate": 0.0002815464889328066, "loss": 3.181166076660156, "step": 78750 }, { "epoch": 0.16138248262783111, "grad_norm": 0.7184786200523376, "learning_rate": 0.00028152324644106964, "loss": 3.4025540161132812, "step": 78800 }, { "epoch": 0.16148488268025993, "grad_norm": 0.8972223401069641, "learning_rate": 0.00028149999028201426, "loss": 3.6795730590820312, "step": 78850 }, { "epoch": 0.16158728273268877, "grad_norm": 0.7884389758110046, "learning_rate": 0.000281476720458057, "loss": 3.221437683105469, "step": 78900 }, { "epoch": 0.16168968278511758, "grad_norm": 0.5186575651168823, "learning_rate": 0.00028145343697161604, "loss": 3.423157043457031, "step": 78950 }, { "epoch": 0.16179208283754642, "grad_norm": 0.6838876605033875, "learning_rate": 0.0002814301398251109, "loss": 2.6037109375, "step": 79000 }, { "epoch": 0.16189448288997524, "grad_norm": 0.7351782321929932, "learning_rate": 0.0002814068290209625, "loss": 3.258736877441406, "step": 79050 }, { "epoch": 0.16199688294240408, "grad_norm": 0.8734768033027649, "learning_rate": 0.00028138350456159315, "loss": 3.6974835205078125, "step": 79100 }, { "epoch": 0.1620992829948329, "grad_norm": 0.7983621954917908, "learning_rate": 0.00028136016644942665, "loss": 3.60543701171875, "step": 79150 }, { "epoch": 0.1622016830472617, "grad_norm": 0.6577921509742737, "learning_rate": 0.0002813368146868883, "loss": 3.336464538574219, "step": 79200 }, { "epoch": 0.16230408309969055, "grad_norm": 0.6782569289207458, "learning_rate": 0.0002813134492764046, "loss": 3.735032043457031, "step": 79250 }, { "epoch": 0.16240648315211936, "grad_norm": 0.8127371072769165, "learning_rate": 0.0002812900702204036, "loss": 3.584259033203125, "step": 79300 }, { "epoch": 0.1625088832045482, "grad_norm": 0.8227265477180481, "learning_rate": 0.00028126667752131473, "loss": 2.95750244140625, "step": 79350 }, { "epoch": 0.16261128325697702, "grad_norm": 0.8162257671356201, "learning_rate": 0.00028124327118156893, "loss": 3.5253372192382812, "step": 79400 }, { "epoch": 0.16271368330940586, "grad_norm": 0.7429577708244324, "learning_rate": 0.0002812198512035984, "loss": 3.1853790283203125, "step": 79450 }, { "epoch": 0.16281608336183467, "grad_norm": 0.7135078310966492, "learning_rate": 0.00028119641758983695, "loss": 3.3338772583007814, "step": 79500 }, { "epoch": 0.16291848341426352, "grad_norm": 0.8013560175895691, "learning_rate": 0.00028117297034271953, "loss": 3.761092224121094, "step": 79550 }, { "epoch": 0.16302088346669233, "grad_norm": 0.678429126739502, "learning_rate": 0.0002811495094646828, "loss": 3.4881576538085937, "step": 79600 }, { "epoch": 0.16312328351912117, "grad_norm": 0.9312568306922913, "learning_rate": 0.0002811260349581647, "loss": 3.203521728515625, "step": 79650 }, { "epoch": 0.16322568357154998, "grad_norm": 0.8246340751647949, "learning_rate": 0.0002811025468256046, "loss": 3.5164459228515623, "step": 79700 }, { "epoch": 0.16332808362397883, "grad_norm": 1.7668465375900269, "learning_rate": 0.00028107904506944324, "loss": 2.763003234863281, "step": 79750 }, { "epoch": 0.16343048367640764, "grad_norm": 0.4166733920574188, "learning_rate": 0.00028105552969212284, "loss": 2.9914471435546877, "step": 79800 }, { "epoch": 0.16353288372883648, "grad_norm": 0.6907941699028015, "learning_rate": 0.0002810320006960871, "loss": 3.1851446533203127, "step": 79850 }, { "epoch": 0.1636352837812653, "grad_norm": 0.6729636192321777, "learning_rate": 0.00028100845808378083, "loss": 3.1257308959960937, "step": 79900 }, { "epoch": 0.16373768383369414, "grad_norm": 0.7356630563735962, "learning_rate": 0.0002809849018576507, "loss": 3.1083673095703124, "step": 79950 }, { "epoch": 0.16384008388612295, "grad_norm": 0.9909185767173767, "learning_rate": 0.00028096133202014443, "loss": 3.5541717529296877, "step": 80000 }, { "epoch": 0.16394248393855176, "grad_norm": 0.4952726662158966, "learning_rate": 0.00028093774857371146, "loss": 2.9160995483398438, "step": 80050 }, { "epoch": 0.1640448839909806, "grad_norm": 0.6858778595924377, "learning_rate": 0.00028091415152080225, "loss": 3.611160888671875, "step": 80100 }, { "epoch": 0.16414728404340942, "grad_norm": 0.6998670101165771, "learning_rate": 0.0002808905408638691, "loss": 3.506941833496094, "step": 80150 }, { "epoch": 0.16424968409583826, "grad_norm": 0.8609181642532349, "learning_rate": 0.0002808669166053654, "loss": 3.5502630615234376, "step": 80200 }, { "epoch": 0.16435208414826707, "grad_norm": 0.7192445993423462, "learning_rate": 0.00028084327874774615, "loss": 3.56413330078125, "step": 80250 }, { "epoch": 0.16445448420069592, "grad_norm": 0.7595858573913574, "learning_rate": 0.0002808196272934676, "loss": 3.339110107421875, "step": 80300 }, { "epoch": 0.16455688425312473, "grad_norm": 0.7111859321594238, "learning_rate": 0.0002807959622449877, "loss": 3.051424560546875, "step": 80350 }, { "epoch": 0.16465928430555357, "grad_norm": 0.6211318373680115, "learning_rate": 0.00028077228360476537, "loss": 3.5583587646484376, "step": 80400 }, { "epoch": 0.16476168435798239, "grad_norm": 0.783703625202179, "learning_rate": 0.00028074859137526136, "loss": 2.740151062011719, "step": 80450 }, { "epoch": 0.16486408441041123, "grad_norm": 0.30799928307533264, "learning_rate": 0.0002807248855589376, "loss": 2.932861633300781, "step": 80500 }, { "epoch": 0.16496648446284004, "grad_norm": 0.9074276089668274, "learning_rate": 0.0002807011661582575, "loss": 2.6673263549804687, "step": 80550 }, { "epoch": 0.16506888451526888, "grad_norm": 0.680355429649353, "learning_rate": 0.00028067743317568587, "loss": 3.8975335693359376, "step": 80600 }, { "epoch": 0.1651712845676977, "grad_norm": 1.1440931558609009, "learning_rate": 0.000280653686613689, "loss": 3.4063519287109374, "step": 80650 }, { "epoch": 0.16527368462012654, "grad_norm": 0.7195169925689697, "learning_rate": 0.00028062992647473445, "loss": 3.4463735961914064, "step": 80700 }, { "epoch": 0.16537608467255535, "grad_norm": 2.4933488368988037, "learning_rate": 0.0002806061527612913, "loss": 2.4636448669433593, "step": 80750 }, { "epoch": 0.1654784847249842, "grad_norm": 0.8146184086799622, "learning_rate": 0.00028058236547582997, "loss": 3.016216125488281, "step": 80800 }, { "epoch": 0.165580884777413, "grad_norm": 0.7645831108093262, "learning_rate": 0.0002805585646208224, "loss": 3.7815518188476562, "step": 80850 }, { "epoch": 0.16568328482984182, "grad_norm": 0.5321808457374573, "learning_rate": 0.00028053475019874187, "loss": 3.1232025146484377, "step": 80900 }, { "epoch": 0.16578568488227066, "grad_norm": 0.5005676746368408, "learning_rate": 0.000280510922212063, "loss": 2.5462303161621094, "step": 80950 }, { "epoch": 0.16588808493469948, "grad_norm": 0.5022799372673035, "learning_rate": 0.00028048708066326193, "loss": 2.064752502441406, "step": 81000 }, { "epoch": 0.16599048498712832, "grad_norm": 0.40124621987342834, "learning_rate": 0.0002804632255548162, "loss": 3.146656799316406, "step": 81050 }, { "epoch": 0.16609288503955713, "grad_norm": 0.45695436000823975, "learning_rate": 0.00028043935688920466, "loss": 1.9398663330078125, "step": 81100 }, { "epoch": 0.16619528509198597, "grad_norm": 0.9878008365631104, "learning_rate": 0.0002804154746689077, "loss": 3.3690643310546875, "step": 81150 }, { "epoch": 0.1662976851444148, "grad_norm": 0.9054487943649292, "learning_rate": 0.000280391578896407, "loss": 3.499583740234375, "step": 81200 }, { "epoch": 0.16640008519684363, "grad_norm": 0.7078624367713928, "learning_rate": 0.00028036766957418576, "loss": 3.645855712890625, "step": 81250 }, { "epoch": 0.16650248524927244, "grad_norm": 0.8132025003433228, "learning_rate": 0.0002803437467047285, "loss": 2.701116027832031, "step": 81300 }, { "epoch": 0.16660488530170128, "grad_norm": 0.6939849257469177, "learning_rate": 0.00028031981029052116, "loss": 3.399428405761719, "step": 81350 }, { "epoch": 0.1667072853541301, "grad_norm": 0.7499716877937317, "learning_rate": 0.00028029586033405114, "loss": 3.7939776611328124, "step": 81400 }, { "epoch": 0.16680968540655894, "grad_norm": 0.6567860245704651, "learning_rate": 0.00028027189683780716, "loss": 3.5192059326171874, "step": 81450 }, { "epoch": 0.16691208545898775, "grad_norm": 0.5446729063987732, "learning_rate": 0.0002802479198042795, "loss": 4.063154602050782, "step": 81500 } ], "logging_steps": 50, "max_steps": 488281, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.56691253788672e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }