{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 7106, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00028145229383619476, "grad_norm": 1.9831818342208862, "learning_rate": 2.8129395218002816e-07, "loss": 2.1254, "step": 1 }, { "epoch": 0.0014072614691809737, "grad_norm": 1.082022786140442, "learning_rate": 1.4064697609001406e-06, "loss": 1.2669, "step": 5 }, { "epoch": 0.0028145229383619475, "grad_norm": 1.065866231918335, "learning_rate": 2.8129395218002813e-06, "loss": 1.3653, "step": 10 }, { "epoch": 0.004221784407542921, "grad_norm": 2.5712316036224365, "learning_rate": 4.219409282700422e-06, "loss": 1.53, "step": 15 }, { "epoch": 0.005629045876723895, "grad_norm": 2.248335838317871, "learning_rate": 5.6258790436005626e-06, "loss": 1.4474, "step": 20 }, { "epoch": 0.007036307345904869, "grad_norm": 1.0500571727752686, "learning_rate": 7.032348804500703e-06, "loss": 1.3372, "step": 25 }, { "epoch": 0.008443568815085843, "grad_norm": 1.867329716682434, "learning_rate": 8.438818565400844e-06, "loss": 1.2333, "step": 30 }, { "epoch": 0.009850830284266816, "grad_norm": 3.1149637699127197, "learning_rate": 9.845288326300985e-06, "loss": 1.2918, "step": 35 }, { "epoch": 0.01125809175344779, "grad_norm": 1.9895963668823242, "learning_rate": 1.1251758087201125e-05, "loss": 1.3152, "step": 40 }, { "epoch": 0.012665353222628765, "grad_norm": 2.0947887897491455, "learning_rate": 1.2658227848101267e-05, "loss": 1.3231, "step": 45 }, { "epoch": 0.014072614691809739, "grad_norm": 1.4856278896331787, "learning_rate": 1.4064697609001406e-05, "loss": 1.2376, "step": 50 }, { "epoch": 0.015479876160990712, "grad_norm": 1.2920206785202026, "learning_rate": 1.547116736990155e-05, "loss": 1.1083, "step": 55 }, { "epoch": 0.016887137630171686, "grad_norm": 1.3694531917572021, "learning_rate": 1.6877637130801688e-05, "loss": 0.9554, "step": 60 }, { "epoch": 0.01829439909935266, "grad_norm": 1.335752248764038, "learning_rate": 1.828410689170183e-05, "loss": 0.8074, "step": 65 }, { "epoch": 0.019701660568533633, "grad_norm": 0.8360928297042847, "learning_rate": 1.969057665260197e-05, "loss": 0.9003, "step": 70 }, { "epoch": 0.021108922037714608, "grad_norm": 1.4033712148666382, "learning_rate": 2.1097046413502112e-05, "loss": 1.0069, "step": 75 }, { "epoch": 0.02251618350689558, "grad_norm": 3.524489164352417, "learning_rate": 2.250351617440225e-05, "loss": 0.8655, "step": 80 }, { "epoch": 0.023923444976076555, "grad_norm": 5.065525054931641, "learning_rate": 2.3909985935302392e-05, "loss": 0.8884, "step": 85 }, { "epoch": 0.02533070644525753, "grad_norm": 1.1002086400985718, "learning_rate": 2.5316455696202533e-05, "loss": 0.7538, "step": 90 }, { "epoch": 0.026737967914438502, "grad_norm": 1.5529321432113647, "learning_rate": 2.672292545710267e-05, "loss": 0.9944, "step": 95 }, { "epoch": 0.028145229383619477, "grad_norm": 2.5230774879455566, "learning_rate": 2.8129395218002813e-05, "loss": 0.742, "step": 100 }, { "epoch": 0.02955249085280045, "grad_norm": 1.8407468795776367, "learning_rate": 2.9535864978902954e-05, "loss": 0.6106, "step": 105 }, { "epoch": 0.030959752321981424, "grad_norm": 1.8544448614120483, "learning_rate": 3.09423347398031e-05, "loss": 0.7784, "step": 110 }, { "epoch": 0.0323670137911624, "grad_norm": 1.5393428802490234, "learning_rate": 3.234880450070324e-05, "loss": 0.6225, "step": 115 }, { "epoch": 0.03377427526034337, "grad_norm": 0.9650129675865173, "learning_rate": 3.3755274261603375e-05, "loss": 0.466, "step": 120 }, { "epoch": 0.03518153672952434, "grad_norm": 1.2199194431304932, "learning_rate": 3.516174402250352e-05, "loss": 0.7885, "step": 125 }, { "epoch": 0.03658879819870532, "grad_norm": 3.1491034030914307, "learning_rate": 3.656821378340366e-05, "loss": 0.642, "step": 130 }, { "epoch": 0.037996059667886294, "grad_norm": 1.015199899673462, "learning_rate": 3.79746835443038e-05, "loss": 1.0289, "step": 135 }, { "epoch": 0.039403321137067265, "grad_norm": 1.211543321609497, "learning_rate": 3.938115330520394e-05, "loss": 0.8134, "step": 140 }, { "epoch": 0.040810582606248244, "grad_norm": 1.6816538572311401, "learning_rate": 4.078762306610408e-05, "loss": 0.8157, "step": 145 }, { "epoch": 0.042217844075429216, "grad_norm": 1.9145057201385498, "learning_rate": 4.2194092827004224e-05, "loss": 0.6091, "step": 150 }, { "epoch": 0.04362510554461019, "grad_norm": 1.168205976486206, "learning_rate": 4.3600562587904366e-05, "loss": 0.5557, "step": 155 }, { "epoch": 0.04503236701379116, "grad_norm": 0.8458957076072693, "learning_rate": 4.50070323488045e-05, "loss": 0.4024, "step": 160 }, { "epoch": 0.04643962848297214, "grad_norm": 1.442372441291809, "learning_rate": 4.641350210970464e-05, "loss": 0.8014, "step": 165 }, { "epoch": 0.04784688995215311, "grad_norm": 1.6391854286193848, "learning_rate": 4.7819971870604783e-05, "loss": 1.012, "step": 170 }, { "epoch": 0.04925415142133408, "grad_norm": 1.334926724433899, "learning_rate": 4.9226441631504925e-05, "loss": 0.6832, "step": 175 }, { "epoch": 0.05066141289051506, "grad_norm": 1.0498499870300293, "learning_rate": 5.0632911392405066e-05, "loss": 0.6351, "step": 180 }, { "epoch": 0.05206867435969603, "grad_norm": 2.0023510456085205, "learning_rate": 5.203938115330521e-05, "loss": 0.6497, "step": 185 }, { "epoch": 0.053475935828877004, "grad_norm": 1.9690536260604858, "learning_rate": 5.344585091420534e-05, "loss": 0.7836, "step": 190 }, { "epoch": 0.054883197298057976, "grad_norm": 1.4102208614349365, "learning_rate": 5.4852320675105484e-05, "loss": 0.5955, "step": 195 }, { "epoch": 0.056290458767238954, "grad_norm": 0.9214100241661072, "learning_rate": 5.6258790436005626e-05, "loss": 0.7519, "step": 200 }, { "epoch": 0.057697720236419926, "grad_norm": 1.3210060596466064, "learning_rate": 5.766526019690577e-05, "loss": 0.5468, "step": 205 }, { "epoch": 0.0591049817056009, "grad_norm": 1.723496437072754, "learning_rate": 5.907172995780591e-05, "loss": 0.4599, "step": 210 }, { "epoch": 0.06051224317478188, "grad_norm": 1.1883797645568848, "learning_rate": 6.047819971870605e-05, "loss": 0.7824, "step": 215 }, { "epoch": 0.06191950464396285, "grad_norm": 1.0189827680587769, "learning_rate": 6.18846694796062e-05, "loss": 0.5021, "step": 220 }, { "epoch": 0.06332676611314382, "grad_norm": 1.1384845972061157, "learning_rate": 6.329113924050633e-05, "loss": 0.7703, "step": 225 }, { "epoch": 0.0647340275823248, "grad_norm": 2.097339391708374, "learning_rate": 6.469760900140648e-05, "loss": 1.0998, "step": 230 }, { "epoch": 0.06614128905150576, "grad_norm": 2.55668044090271, "learning_rate": 6.610407876230662e-05, "loss": 0.5333, "step": 235 }, { "epoch": 0.06754855052068674, "grad_norm": 1.1277037858963013, "learning_rate": 6.751054852320675e-05, "loss": 0.6361, "step": 240 }, { "epoch": 0.06895581198986772, "grad_norm": 2.0660481452941895, "learning_rate": 6.89170182841069e-05, "loss": 0.7486, "step": 245 }, { "epoch": 0.07036307345904869, "grad_norm": 2.1117303371429443, "learning_rate": 7.032348804500703e-05, "loss": 0.7103, "step": 250 }, { "epoch": 0.07177033492822966, "grad_norm": 1.1796034574508667, "learning_rate": 7.172995780590718e-05, "loss": 0.6379, "step": 255 }, { "epoch": 0.07317759639741064, "grad_norm": 1.470502257347107, "learning_rate": 7.313642756680732e-05, "loss": 0.4737, "step": 260 }, { "epoch": 0.07458485786659161, "grad_norm": 1.443248987197876, "learning_rate": 7.454289732770746e-05, "loss": 0.812, "step": 265 }, { "epoch": 0.07599211933577259, "grad_norm": 3.0095481872558594, "learning_rate": 7.59493670886076e-05, "loss": 0.4456, "step": 270 }, { "epoch": 0.07739938080495357, "grad_norm": 1.157353401184082, "learning_rate": 7.735583684950773e-05, "loss": 0.524, "step": 275 }, { "epoch": 0.07880664227413453, "grad_norm": 1.1761438846588135, "learning_rate": 7.876230661040788e-05, "loss": 0.7222, "step": 280 }, { "epoch": 0.08021390374331551, "grad_norm": 0.64066082239151, "learning_rate": 8.016877637130802e-05, "loss": 0.5886, "step": 285 }, { "epoch": 0.08162116521249649, "grad_norm": 0.9376239776611328, "learning_rate": 8.157524613220817e-05, "loss": 0.6901, "step": 290 }, { "epoch": 0.08302842668167745, "grad_norm": 0.9339331388473511, "learning_rate": 8.29817158931083e-05, "loss": 0.389, "step": 295 }, { "epoch": 0.08443568815085843, "grad_norm": 1.1914637088775635, "learning_rate": 8.438818565400845e-05, "loss": 0.535, "step": 300 }, { "epoch": 0.0858429496200394, "grad_norm": 1.1882398128509521, "learning_rate": 8.579465541490858e-05, "loss": 0.3909, "step": 305 }, { "epoch": 0.08725021108922038, "grad_norm": 1.5186290740966797, "learning_rate": 8.720112517580873e-05, "loss": 0.6317, "step": 310 }, { "epoch": 0.08865747255840135, "grad_norm": 1.509944200515747, "learning_rate": 8.860759493670887e-05, "loss": 0.4739, "step": 315 }, { "epoch": 0.09006473402758232, "grad_norm": 1.4957388639450073, "learning_rate": 9.0014064697609e-05, "loss": 0.6078, "step": 320 }, { "epoch": 0.0914719954967633, "grad_norm": 1.8821747303009033, "learning_rate": 9.142053445850915e-05, "loss": 0.8152, "step": 325 }, { "epoch": 0.09287925696594428, "grad_norm": 0.9399609565734863, "learning_rate": 9.282700421940928e-05, "loss": 0.6356, "step": 330 }, { "epoch": 0.09428651843512524, "grad_norm": 1.4053034782409668, "learning_rate": 9.423347398030943e-05, "loss": 0.7405, "step": 335 }, { "epoch": 0.09569377990430622, "grad_norm": 0.9742883443832397, "learning_rate": 9.563994374120957e-05, "loss": 0.7251, "step": 340 }, { "epoch": 0.0971010413734872, "grad_norm": 3.047891616821289, "learning_rate": 9.704641350210972e-05, "loss": 0.7387, "step": 345 }, { "epoch": 0.09850830284266816, "grad_norm": 0.8324292898178101, "learning_rate": 9.845288326300985e-05, "loss": 0.584, "step": 350 }, { "epoch": 0.09991556431184914, "grad_norm": 1.0198436975479126, "learning_rate": 9.985935302391e-05, "loss": 0.4691, "step": 355 }, { "epoch": 0.10132282578103012, "grad_norm": 3.0640432834625244, "learning_rate": 0.00010126582278481013, "loss": 0.4508, "step": 360 }, { "epoch": 0.10273008725021109, "grad_norm": 0.9727720022201538, "learning_rate": 0.00010267229254571027, "loss": 0.4544, "step": 365 }, { "epoch": 0.10413734871939206, "grad_norm": 1.4771376848220825, "learning_rate": 0.00010407876230661042, "loss": 0.5085, "step": 370 }, { "epoch": 0.10554461018857304, "grad_norm": 1.5016095638275146, "learning_rate": 0.00010548523206751055, "loss": 0.5482, "step": 375 }, { "epoch": 0.10695187165775401, "grad_norm": 1.5180020332336426, "learning_rate": 0.00010689170182841069, "loss": 0.7243, "step": 380 }, { "epoch": 0.10835913312693499, "grad_norm": 1.8111554384231567, "learning_rate": 0.00010829817158931083, "loss": 0.5539, "step": 385 }, { "epoch": 0.10976639459611595, "grad_norm": 1.488231897354126, "learning_rate": 0.00010970464135021097, "loss": 0.4533, "step": 390 }, { "epoch": 0.11117365606529693, "grad_norm": 1.7389737367630005, "learning_rate": 0.00011111111111111112, "loss": 0.6554, "step": 395 }, { "epoch": 0.11258091753447791, "grad_norm": 0.9282882213592529, "learning_rate": 0.00011251758087201125, "loss": 0.5665, "step": 400 }, { "epoch": 0.11398817900365887, "grad_norm": 1.2808202505111694, "learning_rate": 0.0001139240506329114, "loss": 0.8137, "step": 405 }, { "epoch": 0.11539544047283985, "grad_norm": 1.520807147026062, "learning_rate": 0.00011533052039381153, "loss": 0.7432, "step": 410 }, { "epoch": 0.11680270194202083, "grad_norm": 1.4392223358154297, "learning_rate": 0.0001167369901547117, "loss": 0.478, "step": 415 }, { "epoch": 0.1182099634112018, "grad_norm": 0.8880683779716492, "learning_rate": 0.00011814345991561182, "loss": 0.4246, "step": 420 }, { "epoch": 0.11961722488038277, "grad_norm": 0.832594633102417, "learning_rate": 0.00011954992967651195, "loss": 0.5505, "step": 425 }, { "epoch": 0.12102448634956375, "grad_norm": 0.4944693148136139, "learning_rate": 0.0001209563994374121, "loss": 0.4342, "step": 430 }, { "epoch": 0.12243174781874472, "grad_norm": 0.8733665943145752, "learning_rate": 0.00012236286919831225, "loss": 0.5839, "step": 435 }, { "epoch": 0.1238390092879257, "grad_norm": 1.1832093000411987, "learning_rate": 0.0001237693389592124, "loss": 0.6976, "step": 440 }, { "epoch": 0.12524627075710668, "grad_norm": 1.0406477451324463, "learning_rate": 0.00012517580872011252, "loss": 0.6353, "step": 445 }, { "epoch": 0.12665353222628764, "grad_norm": 0.788364827632904, "learning_rate": 0.00012658227848101267, "loss": 0.3272, "step": 450 }, { "epoch": 0.1280607936954686, "grad_norm": 1.2941433191299438, "learning_rate": 0.00012798874824191281, "loss": 0.7372, "step": 455 }, { "epoch": 0.1294680551646496, "grad_norm": 0.9147971272468567, "learning_rate": 0.00012939521800281296, "loss": 0.5474, "step": 460 }, { "epoch": 0.13087531663383056, "grad_norm": 1.0644923448562622, "learning_rate": 0.00013080168776371308, "loss": 0.6286, "step": 465 }, { "epoch": 0.13228257810301153, "grad_norm": 0.8214511275291443, "learning_rate": 0.00013220815752461323, "loss": 0.3655, "step": 470 }, { "epoch": 0.13368983957219252, "grad_norm": 0.7348743677139282, "learning_rate": 0.00013361462728551338, "loss": 0.5278, "step": 475 }, { "epoch": 0.13509710104137349, "grad_norm": 1.0437523126602173, "learning_rate": 0.0001350210970464135, "loss": 0.4665, "step": 480 }, { "epoch": 0.13650436251055445, "grad_norm": 1.6613603830337524, "learning_rate": 0.00013642756680731365, "loss": 0.7575, "step": 485 }, { "epoch": 0.13791162397973544, "grad_norm": 1.0844550132751465, "learning_rate": 0.0001378340365682138, "loss": 0.6744, "step": 490 }, { "epoch": 0.1393188854489164, "grad_norm": 1.3651305437088013, "learning_rate": 0.00013924050632911395, "loss": 0.8377, "step": 495 }, { "epoch": 0.14072614691809737, "grad_norm": 1.256631851196289, "learning_rate": 0.00014064697609001407, "loss": 0.6523, "step": 500 }, { "epoch": 0.14213340838727836, "grad_norm": 1.7894726991653442, "learning_rate": 0.0001420534458509142, "loss": 0.5191, "step": 505 }, { "epoch": 0.14354066985645933, "grad_norm": 0.8206887245178223, "learning_rate": 0.00014345991561181436, "loss": 0.378, "step": 510 }, { "epoch": 0.1449479313256403, "grad_norm": 1.6677026748657227, "learning_rate": 0.00014486638537271449, "loss": 0.326, "step": 515 }, { "epoch": 0.1463551927948213, "grad_norm": 1.4679995775222778, "learning_rate": 0.00014627285513361463, "loss": 0.6619, "step": 520 }, { "epoch": 0.14776245426400225, "grad_norm": 0.829093337059021, "learning_rate": 0.00014767932489451478, "loss": 0.7372, "step": 525 }, { "epoch": 0.14916971573318322, "grad_norm": 1.6188422441482544, "learning_rate": 0.00014908579465541493, "loss": 0.5666, "step": 530 }, { "epoch": 0.1505769772023642, "grad_norm": 1.319091558456421, "learning_rate": 0.00015049226441631505, "loss": 0.8461, "step": 535 }, { "epoch": 0.15198423867154517, "grad_norm": 1.7154995203018188, "learning_rate": 0.0001518987341772152, "loss": 0.6463, "step": 540 }, { "epoch": 0.15339150014072614, "grad_norm": 1.4643100500106812, "learning_rate": 0.00015330520393811535, "loss": 0.6149, "step": 545 }, { "epoch": 0.15479876160990713, "grad_norm": 1.554081916809082, "learning_rate": 0.00015471167369901547, "loss": 0.7509, "step": 550 }, { "epoch": 0.1562060230790881, "grad_norm": 1.040045976638794, "learning_rate": 0.00015611814345991562, "loss": 0.6607, "step": 555 }, { "epoch": 0.15761328454826906, "grad_norm": 1.9093159437179565, "learning_rate": 0.00015752461322081577, "loss": 0.6108, "step": 560 }, { "epoch": 0.15902054601745005, "grad_norm": 0.8650393486022949, "learning_rate": 0.0001589310829817159, "loss": 0.629, "step": 565 }, { "epoch": 0.16042780748663102, "grad_norm": 1.011257529258728, "learning_rate": 0.00016033755274261603, "loss": 0.2586, "step": 570 }, { "epoch": 0.16183506895581198, "grad_norm": 0.8653711676597595, "learning_rate": 0.00016174402250351618, "loss": 0.6063, "step": 575 }, { "epoch": 0.16324233042499298, "grad_norm": 1.7408281564712524, "learning_rate": 0.00016315049226441633, "loss": 0.4728, "step": 580 }, { "epoch": 0.16464959189417394, "grad_norm": 0.7200327515602112, "learning_rate": 0.00016455696202531648, "loss": 0.6803, "step": 585 }, { "epoch": 0.1660568533633549, "grad_norm": 2.032118320465088, "learning_rate": 0.0001659634317862166, "loss": 0.6615, "step": 590 }, { "epoch": 0.1674641148325359, "grad_norm": 1.1240061521530151, "learning_rate": 0.00016736990154711675, "loss": 0.4675, "step": 595 }, { "epoch": 0.16887137630171686, "grad_norm": 0.8609156012535095, "learning_rate": 0.0001687763713080169, "loss": 0.6737, "step": 600 }, { "epoch": 0.17027863777089783, "grad_norm": 1.4271563291549683, "learning_rate": 0.00017018284106891702, "loss": 0.6479, "step": 605 }, { "epoch": 0.1716858992400788, "grad_norm": 0.8409131765365601, "learning_rate": 0.00017158931082981717, "loss": 0.5877, "step": 610 }, { "epoch": 0.17309316070925979, "grad_norm": 1.002172827720642, "learning_rate": 0.00017299578059071731, "loss": 0.5572, "step": 615 }, { "epoch": 0.17450042217844075, "grad_norm": 0.7729489207267761, "learning_rate": 0.00017440225035161746, "loss": 0.64, "step": 620 }, { "epoch": 0.17590768364762172, "grad_norm": 1.3359206914901733, "learning_rate": 0.00017580872011251758, "loss": 0.5652, "step": 625 }, { "epoch": 0.1773149451168027, "grad_norm": 2.492105722427368, "learning_rate": 0.00017721518987341773, "loss": 0.584, "step": 630 }, { "epoch": 0.17872220658598367, "grad_norm": 1.271020770072937, "learning_rate": 0.00017862165963431788, "loss": 0.4011, "step": 635 }, { "epoch": 0.18012946805516464, "grad_norm": 0.8744266629219055, "learning_rate": 0.000180028129395218, "loss": 0.616, "step": 640 }, { "epoch": 0.18153672952434563, "grad_norm": 1.2818926572799683, "learning_rate": 0.00018143459915611815, "loss": 0.463, "step": 645 }, { "epoch": 0.1829439909935266, "grad_norm": 1.3106176853179932, "learning_rate": 0.0001828410689170183, "loss": 0.4851, "step": 650 }, { "epoch": 0.18435125246270756, "grad_norm": 1.068864345550537, "learning_rate": 0.00018424753867791845, "loss": 0.6297, "step": 655 }, { "epoch": 0.18575851393188855, "grad_norm": 1.879895567893982, "learning_rate": 0.00018565400843881857, "loss": 0.6638, "step": 660 }, { "epoch": 0.18716577540106952, "grad_norm": 1.4671173095703125, "learning_rate": 0.00018706047819971872, "loss": 0.7588, "step": 665 }, { "epoch": 0.18857303687025048, "grad_norm": 1.5851764678955078, "learning_rate": 0.00018846694796061886, "loss": 0.7505, "step": 670 }, { "epoch": 0.18998029833943147, "grad_norm": 0.7149075269699097, "learning_rate": 0.00018987341772151899, "loss": 0.3806, "step": 675 }, { "epoch": 0.19138755980861244, "grad_norm": 1.049310326576233, "learning_rate": 0.00019127988748241913, "loss": 0.5908, "step": 680 }, { "epoch": 0.1927948212777934, "grad_norm": 0.950442373752594, "learning_rate": 0.00019268635724331928, "loss": 0.6755, "step": 685 }, { "epoch": 0.1942020827469744, "grad_norm": 0.9287855625152588, "learning_rate": 0.00019409282700421943, "loss": 0.5703, "step": 690 }, { "epoch": 0.19560934421615536, "grad_norm": 0.7228776216506958, "learning_rate": 0.00019549929676511955, "loss": 0.5971, "step": 695 }, { "epoch": 0.19701660568533633, "grad_norm": 1.04582941532135, "learning_rate": 0.0001969057665260197, "loss": 0.9477, "step": 700 }, { "epoch": 0.19842386715451732, "grad_norm": 1.6367225646972656, "learning_rate": 0.00019831223628691985, "loss": 0.5875, "step": 705 }, { "epoch": 0.19983112862369828, "grad_norm": 0.724415123462677, "learning_rate": 0.00019971870604782, "loss": 0.5531, "step": 710 }, { "epoch": 0.20123839009287925, "grad_norm": 1.1167938709259033, "learning_rate": 0.00019999980693280142, "loss": 0.4568, "step": 715 }, { "epoch": 0.20264565156206024, "grad_norm": 3.7291440963745117, "learning_rate": 0.00019999902259858484, "loss": 0.4796, "step": 720 }, { "epoch": 0.2040529130312412, "grad_norm": 1.0626037120819092, "learning_rate": 0.00019999763493537887, "loss": 0.5454, "step": 725 }, { "epoch": 0.20546017450042217, "grad_norm": 1.1673458814620972, "learning_rate": 0.00019999564395155577, "loss": 0.6261, "step": 730 }, { "epoch": 0.20686743596960316, "grad_norm": 1.1592299938201904, "learning_rate": 0.00019999304965912784, "loss": 0.6726, "step": 735 }, { "epoch": 0.20827469743878413, "grad_norm": 1.117803692817688, "learning_rate": 0.00019998985207374736, "loss": 0.8504, "step": 740 }, { "epoch": 0.2096819589079651, "grad_norm": 0.8449244499206543, "learning_rate": 0.00019998605121470645, "loss": 0.4394, "step": 745 }, { "epoch": 0.2110892203771461, "grad_norm": 0.9696683883666992, "learning_rate": 0.00019998164710493705, "loss": 0.3861, "step": 750 }, { "epoch": 0.21249648184632705, "grad_norm": 1.5206379890441895, "learning_rate": 0.00019997663977101068, "loss": 0.6289, "step": 755 }, { "epoch": 0.21390374331550802, "grad_norm": 1.5071372985839844, "learning_rate": 0.00019997102924313836, "loss": 0.8584, "step": 760 }, { "epoch": 0.215311004784689, "grad_norm": 0.9600889086723328, "learning_rate": 0.00019996481555517028, "loss": 0.3949, "step": 765 }, { "epoch": 0.21671826625386997, "grad_norm": 0.8249372839927673, "learning_rate": 0.00019995799874459585, "loss": 0.559, "step": 770 }, { "epoch": 0.21812552772305094, "grad_norm": 1.2509324550628662, "learning_rate": 0.00019995057885254333, "loss": 0.5327, "step": 775 }, { "epoch": 0.2195327891922319, "grad_norm": 0.8242643475532532, "learning_rate": 0.00019994255592377936, "loss": 0.4605, "step": 780 }, { "epoch": 0.2209400506614129, "grad_norm": 0.7586041688919067, "learning_rate": 0.00019993393000670916, "loss": 0.4722, "step": 785 }, { "epoch": 0.22234731213059386, "grad_norm": 1.2805287837982178, "learning_rate": 0.00019992470115337592, "loss": 0.2861, "step": 790 }, { "epoch": 0.22375457359977483, "grad_norm": 1.6375665664672852, "learning_rate": 0.00019991486941946048, "loss": 0.5846, "step": 795 }, { "epoch": 0.22516183506895582, "grad_norm": 0.8348977565765381, "learning_rate": 0.00019990443486428118, "loss": 0.4657, "step": 800 }, { "epoch": 0.22656909653813678, "grad_norm": 1.1735246181488037, "learning_rate": 0.0001998933975507933, "loss": 0.6255, "step": 805 }, { "epoch": 0.22797635800731775, "grad_norm": 1.1627134084701538, "learning_rate": 0.00019988175754558874, "loss": 0.7479, "step": 810 }, { "epoch": 0.22938361947649874, "grad_norm": 1.916646957397461, "learning_rate": 0.00019986951491889578, "loss": 0.4814, "step": 815 }, { "epoch": 0.2307908809456797, "grad_norm": 1.5607751607894897, "learning_rate": 0.00019985666974457847, "loss": 0.5807, "step": 820 }, { "epoch": 0.23219814241486067, "grad_norm": 0.759840726852417, "learning_rate": 0.0001998432221001362, "loss": 0.5299, "step": 825 }, { "epoch": 0.23360540388404166, "grad_norm": 0.8141459226608276, "learning_rate": 0.0001998291720667033, "loss": 0.584, "step": 830 }, { "epoch": 0.23501266535322263, "grad_norm": 1.113457441329956, "learning_rate": 0.00019981451972904854, "loss": 0.5733, "step": 835 }, { "epoch": 0.2364199268224036, "grad_norm": 1.1313204765319824, "learning_rate": 0.00019979926517557458, "loss": 0.6995, "step": 840 }, { "epoch": 0.23782718829158458, "grad_norm": 0.8379271626472473, "learning_rate": 0.00019978340849831743, "loss": 0.3914, "step": 845 }, { "epoch": 0.23923444976076555, "grad_norm": 0.8467435240745544, "learning_rate": 0.00019976694979294596, "loss": 0.6813, "step": 850 }, { "epoch": 0.24064171122994651, "grad_norm": 1.30973219871521, "learning_rate": 0.00019974988915876134, "loss": 0.4174, "step": 855 }, { "epoch": 0.2420489726991275, "grad_norm": 0.9715356230735779, "learning_rate": 0.0001997322266986963, "loss": 0.4208, "step": 860 }, { "epoch": 0.24345623416830847, "grad_norm": 1.0101361274719238, "learning_rate": 0.0001997139625193146, "loss": 0.602, "step": 865 }, { "epoch": 0.24486349563748944, "grad_norm": 0.9341487288475037, "learning_rate": 0.0001996950967308104, "loss": 0.3989, "step": 870 }, { "epoch": 0.24627075710667043, "grad_norm": 1.2196135520935059, "learning_rate": 0.00019967562944700763, "loss": 0.4883, "step": 875 }, { "epoch": 0.2476780185758514, "grad_norm": 1.2374253273010254, "learning_rate": 0.00019965556078535917, "loss": 0.7397, "step": 880 }, { "epoch": 0.24908528004503236, "grad_norm": 0.561997652053833, "learning_rate": 0.00019963489086694626, "loss": 0.7548, "step": 885 }, { "epoch": 0.25049254151421335, "grad_norm": 0.8023036122322083, "learning_rate": 0.00019961361981647775, "loss": 0.4486, "step": 890 }, { "epoch": 0.2518998029833943, "grad_norm": 0.9484225511550903, "learning_rate": 0.00019959174776228928, "loss": 0.4158, "step": 895 }, { "epoch": 0.2533070644525753, "grad_norm": 1.119430661201477, "learning_rate": 0.0001995692748363426, "loss": 0.7553, "step": 900 }, { "epoch": 0.25471432592175625, "grad_norm": 1.4776628017425537, "learning_rate": 0.0001995462011742247, "loss": 0.2808, "step": 905 }, { "epoch": 0.2561215873909372, "grad_norm": 1.370290756225586, "learning_rate": 0.00019952252691514706, "loss": 0.4522, "step": 910 }, { "epoch": 0.25752884886011823, "grad_norm": 1.1513909101486206, "learning_rate": 0.00019949825220194468, "loss": 0.5382, "step": 915 }, { "epoch": 0.2589361103292992, "grad_norm": 1.0892587900161743, "learning_rate": 0.00019947337718107547, "loss": 0.5407, "step": 920 }, { "epoch": 0.26034337179848016, "grad_norm": 1.1014186143875122, "learning_rate": 0.00019944790200261903, "loss": 0.5723, "step": 925 }, { "epoch": 0.2617506332676611, "grad_norm": 1.4293971061706543, "learning_rate": 0.000199421826820276, "loss": 0.7333, "step": 930 }, { "epoch": 0.2631578947368421, "grad_norm": 0.5284586548805237, "learning_rate": 0.00019939515179136713, "loss": 0.6351, "step": 935 }, { "epoch": 0.26456515620602306, "grad_norm": 0.7904505133628845, "learning_rate": 0.0001993678770768321, "loss": 0.6792, "step": 940 }, { "epoch": 0.2659724176752041, "grad_norm": 0.5654340982437134, "learning_rate": 0.0001993400028412288, "loss": 0.4223, "step": 945 }, { "epoch": 0.26737967914438504, "grad_norm": 0.9616327285766602, "learning_rate": 0.00019931152925273225, "loss": 0.4585, "step": 950 }, { "epoch": 0.268786940613566, "grad_norm": 1.3930063247680664, "learning_rate": 0.00019928245648313347, "loss": 0.7828, "step": 955 }, { "epoch": 0.27019420208274697, "grad_norm": 1.6367273330688477, "learning_rate": 0.00019925278470783866, "loss": 0.6883, "step": 960 }, { "epoch": 0.27160146355192794, "grad_norm": 0.9764294028282166, "learning_rate": 0.00019922251410586802, "loss": 0.4474, "step": 965 }, { "epoch": 0.2730087250211089, "grad_norm": 0.7450019121170044, "learning_rate": 0.00019919164485985463, "loss": 0.436, "step": 970 }, { "epoch": 0.2744159864902899, "grad_norm": 0.774627149105072, "learning_rate": 0.0001991601771560434, "loss": 0.3708, "step": 975 }, { "epoch": 0.2758232479594709, "grad_norm": 1.1829273700714111, "learning_rate": 0.00019912811118429, "loss": 0.4453, "step": 980 }, { "epoch": 0.27723050942865185, "grad_norm": 1.0340484380722046, "learning_rate": 0.0001990954471380596, "loss": 0.3123, "step": 985 }, { "epoch": 0.2786377708978328, "grad_norm": 0.6128121018409729, "learning_rate": 0.00019906218521442576, "loss": 0.3459, "step": 990 }, { "epoch": 0.2800450323670138, "grad_norm": 0.8443979024887085, "learning_rate": 0.00019902832561406934, "loss": 0.7583, "step": 995 }, { "epoch": 0.28145229383619474, "grad_norm": 1.4136847257614136, "learning_rate": 0.00019899386854127705, "loss": 0.6206, "step": 1000 }, { "epoch": 0.28285955530537576, "grad_norm": 0.7922631502151489, "learning_rate": 0.00019895881420394052, "loss": 0.5676, "step": 1005 }, { "epoch": 0.28426681677455673, "grad_norm": 1.7876763343811035, "learning_rate": 0.0001989231628135547, "loss": 0.5216, "step": 1010 }, { "epoch": 0.2856740782437377, "grad_norm": 1.3975410461425781, "learning_rate": 0.00019888691458521692, "loss": 0.5053, "step": 1015 }, { "epoch": 0.28708133971291866, "grad_norm": 1.0760260820388794, "learning_rate": 0.00019885006973762535, "loss": 0.3415, "step": 1020 }, { "epoch": 0.2884886011820996, "grad_norm": 1.2067842483520508, "learning_rate": 0.00019881262849307785, "loss": 0.4352, "step": 1025 }, { "epoch": 0.2898958626512806, "grad_norm": 0.8484588265419006, "learning_rate": 0.0001987745910774705, "loss": 0.6558, "step": 1030 }, { "epoch": 0.29130312412046155, "grad_norm": 1.2854669094085693, "learning_rate": 0.00019873595772029628, "loss": 0.5144, "step": 1035 }, { "epoch": 0.2927103855896426, "grad_norm": 0.8903659582138062, "learning_rate": 0.00019869672865464373, "loss": 0.7212, "step": 1040 }, { "epoch": 0.29411764705882354, "grad_norm": 1.1273301839828491, "learning_rate": 0.00019865690411719546, "loss": 0.5763, "step": 1045 }, { "epoch": 0.2955249085280045, "grad_norm": 1.6163692474365234, "learning_rate": 0.00019861648434822687, "loss": 0.8076, "step": 1050 }, { "epoch": 0.29693216999718547, "grad_norm": 1.0796860456466675, "learning_rate": 0.00019857546959160444, "loss": 0.8208, "step": 1055 }, { "epoch": 0.29833943146636643, "grad_norm": 0.8399056792259216, "learning_rate": 0.00019853386009478454, "loss": 0.5939, "step": 1060 }, { "epoch": 0.2997466929355474, "grad_norm": 1.2428550720214844, "learning_rate": 0.0001984916561088118, "loss": 0.2594, "step": 1065 }, { "epoch": 0.3011539544047284, "grad_norm": 2.2983717918395996, "learning_rate": 0.00019844885788831756, "loss": 0.7697, "step": 1070 }, { "epoch": 0.3025612158739094, "grad_norm": 1.0774344205856323, "learning_rate": 0.0001984054656915184, "loss": 0.6441, "step": 1075 }, { "epoch": 0.30396847734309035, "grad_norm": 0.6637004613876343, "learning_rate": 0.00019836147978021467, "loss": 0.4219, "step": 1080 }, { "epoch": 0.3053757388122713, "grad_norm": 0.9496357440948486, "learning_rate": 0.00019831690041978862, "loss": 0.6518, "step": 1085 }, { "epoch": 0.3067830002814523, "grad_norm": 1.3843315839767456, "learning_rate": 0.00019827172787920315, "loss": 0.6269, "step": 1090 }, { "epoch": 0.30819026175063324, "grad_norm": 0.9899942278862, "learning_rate": 0.0001982259624309999, "loss": 0.5791, "step": 1095 }, { "epoch": 0.30959752321981426, "grad_norm": 0.8998156785964966, "learning_rate": 0.00019817960435129778, "loss": 0.7362, "step": 1100 }, { "epoch": 0.31100478468899523, "grad_norm": 0.615544319152832, "learning_rate": 0.00019813265391979137, "loss": 0.457, "step": 1105 }, { "epoch": 0.3124120461581762, "grad_norm": 1.026685118675232, "learning_rate": 0.00019808511141974886, "loss": 0.5494, "step": 1110 }, { "epoch": 0.31381930762735716, "grad_norm": 1.0256643295288086, "learning_rate": 0.00019803697713801084, "loss": 0.3588, "step": 1115 }, { "epoch": 0.3152265690965381, "grad_norm": 0.8720577359199524, "learning_rate": 0.00019798825136498814, "loss": 0.5563, "step": 1120 }, { "epoch": 0.3166338305657191, "grad_norm": 0.8864659667015076, "learning_rate": 0.00019793893439466043, "loss": 0.3091, "step": 1125 }, { "epoch": 0.3180410920349001, "grad_norm": 1.0853145122528076, "learning_rate": 0.00019788902652457412, "loss": 0.6204, "step": 1130 }, { "epoch": 0.3194483535040811, "grad_norm": 1.6496775150299072, "learning_rate": 0.0001978385280558409, "loss": 0.4948, "step": 1135 }, { "epoch": 0.32085561497326204, "grad_norm": 1.668879508972168, "learning_rate": 0.00019778743929313555, "loss": 0.7545, "step": 1140 }, { "epoch": 0.322262876442443, "grad_norm": 0.7751437425613403, "learning_rate": 0.00019773576054469446, "loss": 0.4416, "step": 1145 }, { "epoch": 0.32367013791162397, "grad_norm": 1.3606644868850708, "learning_rate": 0.0001976834921223135, "loss": 0.4837, "step": 1150 }, { "epoch": 0.32507739938080493, "grad_norm": 0.5276009440422058, "learning_rate": 0.0001976306343413463, "loss": 0.2264, "step": 1155 }, { "epoch": 0.32648466084998595, "grad_norm": 1.034682035446167, "learning_rate": 0.00019757718752070239, "loss": 0.5388, "step": 1160 }, { "epoch": 0.3278919223191669, "grad_norm": 0.9205548763275146, "learning_rate": 0.00019752315198284497, "loss": 0.7432, "step": 1165 }, { "epoch": 0.3292991837883479, "grad_norm": 0.2892135977745056, "learning_rate": 0.00019746852805378932, "loss": 0.2681, "step": 1170 }, { "epoch": 0.33070644525752885, "grad_norm": 1.4844127893447876, "learning_rate": 0.0001974133160631007, "loss": 0.4837, "step": 1175 }, { "epoch": 0.3321137067267098, "grad_norm": 0.7771471738815308, "learning_rate": 0.00019735751634389226, "loss": 0.7133, "step": 1180 }, { "epoch": 0.3335209681958908, "grad_norm": 1.23273766040802, "learning_rate": 0.00019730112923282321, "loss": 0.789, "step": 1185 }, { "epoch": 0.3349282296650718, "grad_norm": 1.751483678817749, "learning_rate": 0.0001972441550700966, "loss": 0.7569, "step": 1190 }, { "epoch": 0.33633549113425276, "grad_norm": 0.31647899746894836, "learning_rate": 0.00019718659419945756, "loss": 0.4276, "step": 1195 }, { "epoch": 0.3377427526034337, "grad_norm": 1.3560551404953003, "learning_rate": 0.00019712844696819076, "loss": 0.4853, "step": 1200 }, { "epoch": 0.3391500140726147, "grad_norm": 1.571906328201294, "learning_rate": 0.00019706971372711882, "loss": 0.3889, "step": 1205 }, { "epoch": 0.34055727554179566, "grad_norm": 1.2469801902770996, "learning_rate": 0.00019701039483059981, "loss": 0.5063, "step": 1210 }, { "epoch": 0.3419645370109766, "grad_norm": 0.660874605178833, "learning_rate": 0.00019695049063652543, "loss": 0.4789, "step": 1215 }, { "epoch": 0.3433717984801576, "grad_norm": 0.9069953560829163, "learning_rate": 0.00019689000150631845, "loss": 0.393, "step": 1220 }, { "epoch": 0.3447790599493386, "grad_norm": 1.9359229803085327, "learning_rate": 0.000196828927804931, "loss": 0.4297, "step": 1225 }, { "epoch": 0.34618632141851957, "grad_norm": 1.063952088356018, "learning_rate": 0.00019676726990084195, "loss": 0.5455, "step": 1230 }, { "epoch": 0.34759358288770054, "grad_norm": 1.7802363634109497, "learning_rate": 0.000196705028166055, "loss": 0.5684, "step": 1235 }, { "epoch": 0.3490008443568815, "grad_norm": 1.1787841320037842, "learning_rate": 0.00019664220297609624, "loss": 0.6942, "step": 1240 }, { "epoch": 0.35040810582606247, "grad_norm": 1.146467924118042, "learning_rate": 0.00019657879471001195, "loss": 0.6188, "step": 1245 }, { "epoch": 0.35181536729524343, "grad_norm": 1.322690486907959, "learning_rate": 0.0001965148037503663, "loss": 0.5142, "step": 1250 }, { "epoch": 0.35322262876442445, "grad_norm": 0.8079725503921509, "learning_rate": 0.0001964502304832391, "loss": 0.4729, "step": 1255 }, { "epoch": 0.3546298902336054, "grad_norm": 1.8152616024017334, "learning_rate": 0.0001963850752982234, "loss": 0.7246, "step": 1260 }, { "epoch": 0.3560371517027864, "grad_norm": 1.4570809602737427, "learning_rate": 0.00019631933858842317, "loss": 0.8113, "step": 1265 }, { "epoch": 0.35744441317196735, "grad_norm": 1.1229805946350098, "learning_rate": 0.00019625302075045088, "loss": 0.5401, "step": 1270 }, { "epoch": 0.3588516746411483, "grad_norm": 0.693499743938446, "learning_rate": 0.00019618612218442517, "loss": 0.3536, "step": 1275 }, { "epoch": 0.3602589361103293, "grad_norm": 1.592119574546814, "learning_rate": 0.00019611864329396853, "loss": 0.5994, "step": 1280 }, { "epoch": 0.3616661975795103, "grad_norm": 1.087098479270935, "learning_rate": 0.00019605058448620452, "loss": 0.5211, "step": 1285 }, { "epoch": 0.36307345904869126, "grad_norm": 1.002854585647583, "learning_rate": 0.0001959819461717557, "loss": 0.6473, "step": 1290 }, { "epoch": 0.3644807205178722, "grad_norm": 1.2526451349258423, "learning_rate": 0.00019591272876474106, "loss": 0.4721, "step": 1295 }, { "epoch": 0.3658879819870532, "grad_norm": 0.9391024112701416, "learning_rate": 0.00019584293268277324, "loss": 0.5849, "step": 1300 }, { "epoch": 0.36729524345623416, "grad_norm": 1.1725986003875732, "learning_rate": 0.00019577255834695643, "loss": 0.4718, "step": 1305 }, { "epoch": 0.3687025049254151, "grad_norm": 1.1449577808380127, "learning_rate": 0.00019570160618188353, "loss": 0.5429, "step": 1310 }, { "epoch": 0.37010976639459614, "grad_norm": 1.8632793426513672, "learning_rate": 0.00019563007661563367, "loss": 0.5791, "step": 1315 }, { "epoch": 0.3715170278637771, "grad_norm": 0.6620994210243225, "learning_rate": 0.00019555797007976975, "loss": 0.4016, "step": 1320 }, { "epoch": 0.37292428933295807, "grad_norm": 1.7540533542633057, "learning_rate": 0.00019548528700933559, "loss": 0.5039, "step": 1325 }, { "epoch": 0.37433155080213903, "grad_norm": 0.9329980611801147, "learning_rate": 0.00019541202784285352, "loss": 0.403, "step": 1330 }, { "epoch": 0.37573881227132, "grad_norm": 0.4586445689201355, "learning_rate": 0.00019533819302232168, "loss": 0.3944, "step": 1335 }, { "epoch": 0.37714607374050096, "grad_norm": 1.575636863708496, "learning_rate": 0.00019526378299321127, "loss": 0.5372, "step": 1340 }, { "epoch": 0.378553335209682, "grad_norm": 1.2038066387176514, "learning_rate": 0.00019518879820446398, "loss": 0.4409, "step": 1345 }, { "epoch": 0.37996059667886295, "grad_norm": 0.9737414121627808, "learning_rate": 0.0001951132391084892, "loss": 0.7155, "step": 1350 }, { "epoch": 0.3813678581480439, "grad_norm": 1.0166410207748413, "learning_rate": 0.00019503710616116128, "loss": 0.6772, "step": 1355 }, { "epoch": 0.3827751196172249, "grad_norm": 1.1660302877426147, "learning_rate": 0.0001949603998218169, "loss": 0.7076, "step": 1360 }, { "epoch": 0.38418238108640584, "grad_norm": 0.576275110244751, "learning_rate": 0.0001948831205532521, "loss": 0.5392, "step": 1365 }, { "epoch": 0.3855896425555868, "grad_norm": 1.453596830368042, "learning_rate": 0.00019480526882171976, "loss": 0.7963, "step": 1370 }, { "epoch": 0.38699690402476783, "grad_norm": 0.7829164862632751, "learning_rate": 0.00019472684509692646, "loss": 0.3505, "step": 1375 }, { "epoch": 0.3884041654939488, "grad_norm": 0.9208312630653381, "learning_rate": 0.0001946478498520299, "loss": 0.5539, "step": 1380 }, { "epoch": 0.38981142696312976, "grad_norm": 1.0814006328582764, "learning_rate": 0.00019456828356363598, "loss": 0.3839, "step": 1385 }, { "epoch": 0.3912186884323107, "grad_norm": 1.592490553855896, "learning_rate": 0.00019448814671179585, "loss": 0.6688, "step": 1390 }, { "epoch": 0.3926259499014917, "grad_norm": 0.880333662033081, "learning_rate": 0.00019440743978000312, "loss": 0.6542, "step": 1395 }, { "epoch": 0.39403321137067265, "grad_norm": 0.516769528388977, "learning_rate": 0.00019432616325519084, "loss": 0.4571, "step": 1400 }, { "epoch": 0.3954404728398536, "grad_norm": 1.1296850442886353, "learning_rate": 0.00019424431762772866, "loss": 0.5596, "step": 1405 }, { "epoch": 0.39684773430903464, "grad_norm": 0.8967404365539551, "learning_rate": 0.00019416190339141976, "loss": 0.4144, "step": 1410 }, { "epoch": 0.3982549957782156, "grad_norm": 1.983446478843689, "learning_rate": 0.00019407892104349804, "loss": 0.2378, "step": 1415 }, { "epoch": 0.39966225724739657, "grad_norm": 0.868871808052063, "learning_rate": 0.00019399537108462494, "loss": 0.8016, "step": 1420 }, { "epoch": 0.40106951871657753, "grad_norm": 1.9956140518188477, "learning_rate": 0.00019391125401888644, "loss": 0.5541, "step": 1425 }, { "epoch": 0.4024767801857585, "grad_norm": 1.437330961227417, "learning_rate": 0.00019382657035379026, "loss": 0.299, "step": 1430 }, { "epoch": 0.40388404165493946, "grad_norm": 1.0055358409881592, "learning_rate": 0.00019374132060026242, "loss": 0.5419, "step": 1435 }, { "epoch": 0.4052913031241205, "grad_norm": 1.3034961223602295, "learning_rate": 0.00019365550527264443, "loss": 0.7488, "step": 1440 }, { "epoch": 0.40669856459330145, "grad_norm": 1.9104148149490356, "learning_rate": 0.0001935691248886901, "loss": 0.4039, "step": 1445 }, { "epoch": 0.4081058260624824, "grad_norm": 1.3824232816696167, "learning_rate": 0.00019348217996956245, "loss": 0.5864, "step": 1450 }, { "epoch": 0.4095130875316634, "grad_norm": 0.18742340803146362, "learning_rate": 0.00019339467103983044, "loss": 0.3931, "step": 1455 }, { "epoch": 0.41092034900084434, "grad_norm": 1.0197157859802246, "learning_rate": 0.00019330659862746603, "loss": 0.4888, "step": 1460 }, { "epoch": 0.4123276104700253, "grad_norm": 1.248344898223877, "learning_rate": 0.00019321796326384082, "loss": 0.4607, "step": 1465 }, { "epoch": 0.41373487193920633, "grad_norm": 0.8360584378242493, "learning_rate": 0.00019312876548372286, "loss": 0.5113, "step": 1470 }, { "epoch": 0.4151421334083873, "grad_norm": 1.7348827123641968, "learning_rate": 0.00019303900582527344, "loss": 0.511, "step": 1475 }, { "epoch": 0.41654939487756826, "grad_norm": 1.2273963689804077, "learning_rate": 0.00019294868483004396, "loss": 0.3603, "step": 1480 }, { "epoch": 0.4179566563467492, "grad_norm": 1.0628288984298706, "learning_rate": 0.00019285780304297245, "loss": 0.5377, "step": 1485 }, { "epoch": 0.4193639178159302, "grad_norm": 1.1135960817337036, "learning_rate": 0.00019276636101238045, "loss": 0.3928, "step": 1490 }, { "epoch": 0.42077117928511115, "grad_norm": 0.8842063546180725, "learning_rate": 0.00019267435928996962, "loss": 0.4252, "step": 1495 }, { "epoch": 0.4221784407542922, "grad_norm": 0.56885826587677, "learning_rate": 0.00019258179843081847, "loss": 0.5456, "step": 1500 }, { "epoch": 0.42358570222347314, "grad_norm": 0.5579463243484497, "learning_rate": 0.00019248867899337896, "loss": 0.3585, "step": 1505 }, { "epoch": 0.4249929636926541, "grad_norm": 1.1640398502349854, "learning_rate": 0.00019239500153947305, "loss": 0.5048, "step": 1510 }, { "epoch": 0.42640022516183507, "grad_norm": 0.8812012672424316, "learning_rate": 0.00019230076663428962, "loss": 0.4503, "step": 1515 }, { "epoch": 0.42780748663101603, "grad_norm": 1.1245768070220947, "learning_rate": 0.0001922059748463807, "loss": 0.364, "step": 1520 }, { "epoch": 0.429214748100197, "grad_norm": 1.0180691480636597, "learning_rate": 0.00019211062674765817, "loss": 0.4229, "step": 1525 }, { "epoch": 0.430622009569378, "grad_norm": 1.3053510189056396, "learning_rate": 0.0001920147229133904, "loss": 0.4794, "step": 1530 }, { "epoch": 0.432029271038559, "grad_norm": 0.8506336808204651, "learning_rate": 0.00019191826392219867, "loss": 0.5524, "step": 1535 }, { "epoch": 0.43343653250773995, "grad_norm": 1.0151127576828003, "learning_rate": 0.00019182125035605376, "loss": 0.5024, "step": 1540 }, { "epoch": 0.4348437939769209, "grad_norm": 1.094344973564148, "learning_rate": 0.00019172368280027233, "loss": 0.5535, "step": 1545 }, { "epoch": 0.4362510554461019, "grad_norm": 1.0190297365188599, "learning_rate": 0.00019162556184351348, "loss": 0.393, "step": 1550 }, { "epoch": 0.43765831691528284, "grad_norm": 1.502398133277893, "learning_rate": 0.00019152688807777516, "loss": 0.4018, "step": 1555 }, { "epoch": 0.4390655783844638, "grad_norm": 0.8518544435501099, "learning_rate": 0.00019142766209839064, "loss": 0.5682, "step": 1560 }, { "epoch": 0.4404728398536448, "grad_norm": 0.42057764530181885, "learning_rate": 0.0001913278845040249, "loss": 0.2624, "step": 1565 }, { "epoch": 0.4418801013228258, "grad_norm": 0.8204036951065063, "learning_rate": 0.00019122755589667093, "loss": 0.6987, "step": 1570 }, { "epoch": 0.44328736279200676, "grad_norm": 1.2145869731903076, "learning_rate": 0.00019112667688164626, "loss": 0.575, "step": 1575 }, { "epoch": 0.4446946242611877, "grad_norm": 1.5361616611480713, "learning_rate": 0.0001910252480675891, "loss": 0.466, "step": 1580 }, { "epoch": 0.4461018857303687, "grad_norm": 1.8853634595870972, "learning_rate": 0.00019092327006645497, "loss": 0.4938, "step": 1585 }, { "epoch": 0.44750914719954965, "grad_norm": 1.2990604639053345, "learning_rate": 0.00019082074349351268, "loss": 0.5759, "step": 1590 }, { "epoch": 0.44891640866873067, "grad_norm": 1.3845807313919067, "learning_rate": 0.0001907176689673408, "loss": 0.6341, "step": 1595 }, { "epoch": 0.45032367013791164, "grad_norm": 0.8449406027793884, "learning_rate": 0.0001906140471098239, "loss": 0.546, "step": 1600 }, { "epoch": 0.4517309316070926, "grad_norm": 1.2000244855880737, "learning_rate": 0.00019050987854614886, "loss": 0.5149, "step": 1605 }, { "epoch": 0.45313819307627357, "grad_norm": 0.8644974827766418, "learning_rate": 0.0001904051639048009, "loss": 0.5419, "step": 1610 }, { "epoch": 0.45454545454545453, "grad_norm": 0.4699718654155731, "learning_rate": 0.00019029990381756002, "loss": 0.3501, "step": 1615 }, { "epoch": 0.4559527160146355, "grad_norm": 0.6143896579742432, "learning_rate": 0.00019019409891949703, "loss": 0.4732, "step": 1620 }, { "epoch": 0.4573599774838165, "grad_norm": 1.4060841798782349, "learning_rate": 0.0001900877498489698, "loss": 0.6648, "step": 1625 }, { "epoch": 0.4587672389529975, "grad_norm": 1.3622968196868896, "learning_rate": 0.00018998085724761935, "loss": 0.3465, "step": 1630 }, { "epoch": 0.46017450042217845, "grad_norm": 0.6618224382400513, "learning_rate": 0.00018987342176036607, "loss": 0.5135, "step": 1635 }, { "epoch": 0.4615817618913594, "grad_norm": 1.253423810005188, "learning_rate": 0.0001897654440354057, "loss": 0.5411, "step": 1640 }, { "epoch": 0.4629890233605404, "grad_norm": 1.0359442234039307, "learning_rate": 0.00018965692472420554, "loss": 0.5266, "step": 1645 }, { "epoch": 0.46439628482972134, "grad_norm": 1.4265358448028564, "learning_rate": 0.00018954786448150047, "loss": 0.481, "step": 1650 }, { "epoch": 0.46580354629890236, "grad_norm": 0.6981240510940552, "learning_rate": 0.00018943826396528897, "loss": 0.287, "step": 1655 }, { "epoch": 0.4672108077680833, "grad_norm": 0.8274213671684265, "learning_rate": 0.00018932812383682917, "loss": 0.4081, "step": 1660 }, { "epoch": 0.4686180692372643, "grad_norm": 0.7835836410522461, "learning_rate": 0.0001892174447606349, "loss": 0.344, "step": 1665 }, { "epoch": 0.47002533070644525, "grad_norm": 1.9255175590515137, "learning_rate": 0.00018910622740447167, "loss": 0.6834, "step": 1670 }, { "epoch": 0.4714325921756262, "grad_norm": 1.7480101585388184, "learning_rate": 0.00018899447243935256, "loss": 0.4431, "step": 1675 }, { "epoch": 0.4728398536448072, "grad_norm": 0.7691779136657715, "learning_rate": 0.00018888218053953425, "loss": 0.5831, "step": 1680 }, { "epoch": 0.4742471151139882, "grad_norm": 0.6671115756034851, "learning_rate": 0.00018876935238251296, "loss": 0.3096, "step": 1685 }, { "epoch": 0.47565437658316917, "grad_norm": 0.7756052613258362, "learning_rate": 0.00018865598864902035, "loss": 0.4505, "step": 1690 }, { "epoch": 0.47706163805235013, "grad_norm": 0.7612590193748474, "learning_rate": 0.00018854209002301932, "loss": 0.5595, "step": 1695 }, { "epoch": 0.4784688995215311, "grad_norm": 0.9925332069396973, "learning_rate": 0.00018842765719170006, "loss": 0.3256, "step": 1700 }, { "epoch": 0.47987616099071206, "grad_norm": 1.4211307764053345, "learning_rate": 0.00018831269084547574, "loss": 0.3897, "step": 1705 }, { "epoch": 0.48128342245989303, "grad_norm": 0.8699591159820557, "learning_rate": 0.00018819719167797842, "loss": 0.348, "step": 1710 }, { "epoch": 0.48269068392907405, "grad_norm": 1.1962676048278809, "learning_rate": 0.00018808116038605493, "loss": 0.6022, "step": 1715 }, { "epoch": 0.484097945398255, "grad_norm": 1.0962321758270264, "learning_rate": 0.00018796459766976247, "loss": 0.4853, "step": 1720 }, { "epoch": 0.485505206867436, "grad_norm": 1.8502682447433472, "learning_rate": 0.00018784750423236462, "loss": 0.5438, "step": 1725 }, { "epoch": 0.48691246833661694, "grad_norm": 0.8780159950256348, "learning_rate": 0.0001877298807803269, "loss": 0.4728, "step": 1730 }, { "epoch": 0.4883197298057979, "grad_norm": 1.3143213987350464, "learning_rate": 0.00018761172802331263, "loss": 0.648, "step": 1735 }, { "epoch": 0.4897269912749789, "grad_norm": 1.3124626874923706, "learning_rate": 0.00018749304667417863, "loss": 0.568, "step": 1740 }, { "epoch": 0.49113425274415984, "grad_norm": 1.2247035503387451, "learning_rate": 0.0001873738374489709, "loss": 0.3325, "step": 1745 }, { "epoch": 0.49254151421334086, "grad_norm": 0.8056420683860779, "learning_rate": 0.00018725410106692025, "loss": 0.5355, "step": 1750 }, { "epoch": 0.4939487756825218, "grad_norm": 1.782456636428833, "learning_rate": 0.00018713383825043806, "loss": 0.3927, "step": 1755 }, { "epoch": 0.4953560371517028, "grad_norm": 0.9671362638473511, "learning_rate": 0.00018701304972511187, "loss": 0.4428, "step": 1760 }, { "epoch": 0.49676329862088375, "grad_norm": 0.8646135330200195, "learning_rate": 0.00018689173621970096, "loss": 0.396, "step": 1765 }, { "epoch": 0.4981705600900647, "grad_norm": 1.406186580657959, "learning_rate": 0.00018676989846613205, "loss": 0.4296, "step": 1770 }, { "epoch": 0.4995778215592457, "grad_norm": 1.2148306369781494, "learning_rate": 0.00018664753719949478, "loss": 0.3217, "step": 1775 }, { "epoch": 0.5009850830284267, "grad_norm": 2.317777395248413, "learning_rate": 0.00018652465315803745, "loss": 0.5039, "step": 1780 }, { "epoch": 0.5023923444976076, "grad_norm": 2.461662530899048, "learning_rate": 0.00018640124708316225, "loss": 0.5716, "step": 1785 }, { "epoch": 0.5037996059667886, "grad_norm": 1.3684732913970947, "learning_rate": 0.0001862773197194211, "loss": 0.3489, "step": 1790 }, { "epoch": 0.5052068674359697, "grad_norm": 0.7968658208847046, "learning_rate": 0.00018615287181451108, "loss": 0.4202, "step": 1795 }, { "epoch": 0.5066141289051506, "grad_norm": 1.1133559942245483, "learning_rate": 0.00018602790411926975, "loss": 0.4799, "step": 1800 }, { "epoch": 0.5080213903743316, "grad_norm": 1.4438867568969727, "learning_rate": 0.0001859024173876709, "loss": 0.5841, "step": 1805 }, { "epoch": 0.5094286518435125, "grad_norm": 0.5369459986686707, "learning_rate": 0.0001857764123768196, "loss": 0.4793, "step": 1810 }, { "epoch": 0.5108359133126935, "grad_norm": 0.7949886918067932, "learning_rate": 0.0001856498898469482, "loss": 0.4041, "step": 1815 }, { "epoch": 0.5122431747818744, "grad_norm": 0.5967936515808105, "learning_rate": 0.00018552285056141124, "loss": 0.3951, "step": 1820 }, { "epoch": 0.5136504362510554, "grad_norm": 0.32833540439605713, "learning_rate": 0.00018539529528668094, "loss": 0.2362, "step": 1825 }, { "epoch": 0.5150576977202365, "grad_norm": 0.7846612334251404, "learning_rate": 0.00018526722479234286, "loss": 0.4279, "step": 1830 }, { "epoch": 0.5164649591894174, "grad_norm": 1.5786385536193848, "learning_rate": 0.00018513863985109095, "loss": 0.429, "step": 1835 }, { "epoch": 0.5178722206585984, "grad_norm": 1.2571947574615479, "learning_rate": 0.00018500954123872303, "loss": 0.6325, "step": 1840 }, { "epoch": 0.5192794821277793, "grad_norm": 0.807839035987854, "learning_rate": 0.00018487992973413605, "loss": 0.3732, "step": 1845 }, { "epoch": 0.5206867435969603, "grad_norm": 0.9321346282958984, "learning_rate": 0.00018474980611932144, "loss": 0.5329, "step": 1850 }, { "epoch": 0.5220940050661413, "grad_norm": 1.1516450643539429, "learning_rate": 0.0001846191711793604, "loss": 0.553, "step": 1855 }, { "epoch": 0.5235012665353223, "grad_norm": 1.2552000284194946, "learning_rate": 0.000184488025702419, "loss": 0.5088, "step": 1860 }, { "epoch": 0.5249085280045033, "grad_norm": 0.7412288188934326, "learning_rate": 0.00018435637047974375, "loss": 0.623, "step": 1865 }, { "epoch": 0.5263157894736842, "grad_norm": 0.7325606942176819, "learning_rate": 0.0001842242063056565, "loss": 0.4663, "step": 1870 }, { "epoch": 0.5277230509428652, "grad_norm": 0.7041971683502197, "learning_rate": 0.0001840915339775498, "loss": 0.3317, "step": 1875 }, { "epoch": 0.5291303124120461, "grad_norm": 0.8097009062767029, "learning_rate": 0.00018395835429588215, "loss": 0.5374, "step": 1880 }, { "epoch": 0.5305375738812271, "grad_norm": 0.5471770763397217, "learning_rate": 0.000183824668064173, "loss": 0.6708, "step": 1885 }, { "epoch": 0.5319448353504082, "grad_norm": 0.9955052137374878, "learning_rate": 0.00018369047608899798, "loss": 0.3958, "step": 1890 }, { "epoch": 0.5333520968195891, "grad_norm": 0.980060875415802, "learning_rate": 0.00018355577917998414, "loss": 0.5356, "step": 1895 }, { "epoch": 0.5347593582887701, "grad_norm": 0.8592010736465454, "learning_rate": 0.00018342057814980494, "loss": 0.5253, "step": 1900 }, { "epoch": 0.536166619757951, "grad_norm": 0.8325905799865723, "learning_rate": 0.00018328487381417532, "loss": 0.5743, "step": 1905 }, { "epoch": 0.537573881227132, "grad_norm": 1.0972857475280762, "learning_rate": 0.00018314866699184687, "loss": 0.6613, "step": 1910 }, { "epoch": 0.5389811426963129, "grad_norm": 0.9051984548568726, "learning_rate": 0.00018301195850460293, "loss": 0.5146, "step": 1915 }, { "epoch": 0.5403884041654939, "grad_norm": 0.8490184545516968, "learning_rate": 0.00018287474917725343, "loss": 0.6052, "step": 1920 }, { "epoch": 0.541795665634675, "grad_norm": 0.9744853377342224, "learning_rate": 0.00018273703983763017, "loss": 0.556, "step": 1925 }, { "epoch": 0.5432029271038559, "grad_norm": 0.9393332600593567, "learning_rate": 0.0001825988313165816, "loss": 0.6805, "step": 1930 }, { "epoch": 0.5446101885730369, "grad_norm": 0.786738932132721, "learning_rate": 0.0001824601244479679, "loss": 0.5313, "step": 1935 }, { "epoch": 0.5460174500422178, "grad_norm": 1.7297477722167969, "learning_rate": 0.00018232092006865606, "loss": 0.6627, "step": 1940 }, { "epoch": 0.5474247115113988, "grad_norm": 0.8226016759872437, "learning_rate": 0.00018218121901851468, "loss": 0.4177, "step": 1945 }, { "epoch": 0.5488319729805798, "grad_norm": 1.1636661291122437, "learning_rate": 0.0001820410221404089, "loss": 0.5303, "step": 1950 }, { "epoch": 0.5502392344497608, "grad_norm": 1.3004634380340576, "learning_rate": 0.00018190033028019534, "loss": 0.5114, "step": 1955 }, { "epoch": 0.5516464959189418, "grad_norm": 1.512581706047058, "learning_rate": 0.00018175914428671716, "loss": 0.5918, "step": 1960 }, { "epoch": 0.5530537573881227, "grad_norm": 0.7482631206512451, "learning_rate": 0.0001816174650117987, "loss": 0.6304, "step": 1965 }, { "epoch": 0.5544610188573037, "grad_norm": 1.3120630979537964, "learning_rate": 0.00018147529331024044, "loss": 0.5008, "step": 1970 }, { "epoch": 0.5558682803264846, "grad_norm": 0.9526933431625366, "learning_rate": 0.00018133263003981384, "loss": 0.6951, "step": 1975 }, { "epoch": 0.5572755417956656, "grad_norm": 0.8142489194869995, "learning_rate": 0.0001811894760612562, "loss": 0.478, "step": 1980 }, { "epoch": 0.5586828032648467, "grad_norm": 1.5639302730560303, "learning_rate": 0.0001810458322382654, "loss": 0.6378, "step": 1985 }, { "epoch": 0.5600900647340276, "grad_norm": 0.6878836154937744, "learning_rate": 0.00018090169943749476, "loss": 0.6067, "step": 1990 }, { "epoch": 0.5614973262032086, "grad_norm": 1.1296664476394653, "learning_rate": 0.0001807570785285477, "loss": 0.6044, "step": 1995 }, { "epoch": 0.5629045876723895, "grad_norm": 0.837823748588562, "learning_rate": 0.00018061197038397268, "loss": 0.4684, "step": 2000 }, { "epoch": 0.5643118491415705, "grad_norm": 1.2144043445587158, "learning_rate": 0.0001804663758792577, "loss": 0.3649, "step": 2005 }, { "epoch": 0.5657191106107515, "grad_norm": 0.8372750878334045, "learning_rate": 0.00018032029589282525, "loss": 0.4253, "step": 2010 }, { "epoch": 0.5671263720799324, "grad_norm": 0.8684276342391968, "learning_rate": 0.00018017373130602683, "loss": 0.3992, "step": 2015 }, { "epoch": 0.5685336335491135, "grad_norm": 0.9675285816192627, "learning_rate": 0.0001800266830031377, "loss": 0.5995, "step": 2020 }, { "epoch": 0.5699408950182944, "grad_norm": 0.9824860692024231, "learning_rate": 0.00017987915187135157, "loss": 0.2531, "step": 2025 }, { "epoch": 0.5713481564874754, "grad_norm": 2.90608549118042, "learning_rate": 0.0001797311388007753, "loss": 0.6474, "step": 2030 }, { "epoch": 0.5727554179566563, "grad_norm": 0.922585666179657, "learning_rate": 0.00017958264468442332, "loss": 0.4685, "step": 2035 }, { "epoch": 0.5741626794258373, "grad_norm": 1.4679278135299683, "learning_rate": 0.00017943367041821243, "loss": 0.4786, "step": 2040 }, { "epoch": 0.5755699408950183, "grad_norm": 0.8750627040863037, "learning_rate": 0.00017928421690095636, "loss": 0.317, "step": 2045 }, { "epoch": 0.5769772023641992, "grad_norm": 1.1974796056747437, "learning_rate": 0.00017913428503436035, "loss": 0.496, "step": 2050 }, { "epoch": 0.5783844638333803, "grad_norm": 0.8931379914283752, "learning_rate": 0.00017898387572301563, "loss": 0.6886, "step": 2055 }, { "epoch": 0.5797917253025612, "grad_norm": 1.0573607683181763, "learning_rate": 0.00017883298987439404, "loss": 0.5887, "step": 2060 }, { "epoch": 0.5811989867717422, "grad_norm": 1.1087405681610107, "learning_rate": 0.00017868162839884254, "loss": 0.5817, "step": 2065 }, { "epoch": 0.5826062482409231, "grad_norm": 0.5602430701255798, "learning_rate": 0.00017852979220957775, "loss": 0.4194, "step": 2070 }, { "epoch": 0.5840135097101041, "grad_norm": 0.9328368306159973, "learning_rate": 0.00017837748222268037, "loss": 0.3816, "step": 2075 }, { "epoch": 0.5854207711792851, "grad_norm": 1.4052832126617432, "learning_rate": 0.00017822469935708965, "loss": 0.7981, "step": 2080 }, { "epoch": 0.5868280326484661, "grad_norm": 1.0276223421096802, "learning_rate": 0.00017807144453459793, "loss": 0.4105, "step": 2085 }, { "epoch": 0.5882352941176471, "grad_norm": 1.257156491279602, "learning_rate": 0.00017791771867984503, "loss": 0.5565, "step": 2090 }, { "epoch": 0.589642555586828, "grad_norm": 1.0978988409042358, "learning_rate": 0.00017776352272031264, "loss": 0.5929, "step": 2095 }, { "epoch": 0.591049817056009, "grad_norm": 0.8809897303581238, "learning_rate": 0.0001776088575863188, "loss": 0.3527, "step": 2100 }, { "epoch": 0.59245707852519, "grad_norm": 0.6997563242912292, "learning_rate": 0.00017745372421101223, "loss": 0.5211, "step": 2105 }, { "epoch": 0.5938643399943709, "grad_norm": 0.9955636262893677, "learning_rate": 0.00017729812353036668, "loss": 0.5267, "step": 2110 }, { "epoch": 0.595271601463552, "grad_norm": 0.8788183927536011, "learning_rate": 0.00017714205648317535, "loss": 0.5372, "step": 2115 }, { "epoch": 0.5966788629327329, "grad_norm": 1.0072330236434937, "learning_rate": 0.00017698552401104517, "loss": 0.5234, "step": 2120 }, { "epoch": 0.5980861244019139, "grad_norm": 1.6254470348358154, "learning_rate": 0.00017682852705839115, "loss": 0.4621, "step": 2125 }, { "epoch": 0.5994933858710948, "grad_norm": 1.0389853715896606, "learning_rate": 0.00017667106657243072, "loss": 0.5439, "step": 2130 }, { "epoch": 0.6009006473402758, "grad_norm": 0.9769371151924133, "learning_rate": 0.00017651314350317787, "loss": 0.6171, "step": 2135 }, { "epoch": 0.6023079088094568, "grad_norm": 1.7502343654632568, "learning_rate": 0.0001763547588034376, "loss": 0.612, "step": 2140 }, { "epoch": 0.6037151702786377, "grad_norm": 1.1023430824279785, "learning_rate": 0.00017619591342880005, "loss": 0.4228, "step": 2145 }, { "epoch": 0.6051224317478188, "grad_norm": 2.0511550903320312, "learning_rate": 0.00017603660833763476, "loss": 0.3462, "step": 2150 }, { "epoch": 0.6065296932169997, "grad_norm": 0.7986024022102356, "learning_rate": 0.00017587684449108497, "loss": 0.4616, "step": 2155 }, { "epoch": 0.6079369546861807, "grad_norm": 0.7450430989265442, "learning_rate": 0.00017571662285306166, "loss": 0.5481, "step": 2160 }, { "epoch": 0.6093442161553617, "grad_norm": 1.1748677492141724, "learning_rate": 0.00017555594439023787, "loss": 0.5419, "step": 2165 }, { "epoch": 0.6107514776245426, "grad_norm": 0.7183251976966858, "learning_rate": 0.0001753948100720429, "loss": 0.4122, "step": 2170 }, { "epoch": 0.6121587390937236, "grad_norm": 0.7296462655067444, "learning_rate": 0.00017523322087065614, "loss": 0.3651, "step": 2175 }, { "epoch": 0.6135660005629046, "grad_norm": 0.5904517769813538, "learning_rate": 0.00017507117776100178, "loss": 0.3728, "step": 2180 }, { "epoch": 0.6149732620320856, "grad_norm": 1.5718715190887451, "learning_rate": 0.00017490868172074232, "loss": 0.4729, "step": 2185 }, { "epoch": 0.6163805235012665, "grad_norm": 1.053885579109192, "learning_rate": 0.00017474573373027315, "loss": 0.4341, "step": 2190 }, { "epoch": 0.6177877849704475, "grad_norm": 0.723726212978363, "learning_rate": 0.00017458233477271628, "loss": 0.4755, "step": 2195 }, { "epoch": 0.6191950464396285, "grad_norm": 1.133907437324524, "learning_rate": 0.00017441848583391463, "loss": 0.7399, "step": 2200 }, { "epoch": 0.6206023079088094, "grad_norm": 0.5922422409057617, "learning_rate": 0.00017425418790242606, "loss": 0.4381, "step": 2205 }, { "epoch": 0.6220095693779905, "grad_norm": 0.534817636013031, "learning_rate": 0.0001740894419695172, "loss": 0.4668, "step": 2210 }, { "epoch": 0.6234168308471714, "grad_norm": 0.5950006246566772, "learning_rate": 0.00017392424902915786, "loss": 0.3497, "step": 2215 }, { "epoch": 0.6248240923163524, "grad_norm": 3.878748655319214, "learning_rate": 0.00017375861007801465, "loss": 0.2247, "step": 2220 }, { "epoch": 0.6262313537855334, "grad_norm": 1.3402066230773926, "learning_rate": 0.00017359252611544505, "loss": 0.3214, "step": 2225 }, { "epoch": 0.6276386152547143, "grad_norm": 1.3445652723312378, "learning_rate": 0.0001734259981434917, "loss": 0.4757, "step": 2230 }, { "epoch": 0.6290458767238953, "grad_norm": 0.801052987575531, "learning_rate": 0.00017325902716687578, "loss": 0.542, "step": 2235 }, { "epoch": 0.6304531381930762, "grad_norm": 0.6313127279281616, "learning_rate": 0.0001730916141929916, "loss": 0.6026, "step": 2240 }, { "epoch": 0.6318603996622573, "grad_norm": 0.7048347592353821, "learning_rate": 0.00017292376023189996, "loss": 0.4769, "step": 2245 }, { "epoch": 0.6332676611314382, "grad_norm": 1.3377580642700195, "learning_rate": 0.00017275546629632235, "loss": 0.3727, "step": 2250 }, { "epoch": 0.6346749226006192, "grad_norm": 1.3854931592941284, "learning_rate": 0.00017258673340163485, "loss": 0.4537, "step": 2255 }, { "epoch": 0.6360821840698002, "grad_norm": 1.5850138664245605, "learning_rate": 0.00017241756256586183, "loss": 0.5933, "step": 2260 }, { "epoch": 0.6374894455389811, "grad_norm": 1.3591883182525635, "learning_rate": 0.00017224795480967, "loss": 0.3786, "step": 2265 }, { "epoch": 0.6388967070081621, "grad_norm": 0.685483992099762, "learning_rate": 0.00017207791115636206, "loss": 0.3562, "step": 2270 }, { "epoch": 0.640303968477343, "grad_norm": 1.1758111715316772, "learning_rate": 0.00017190743263187076, "loss": 0.3506, "step": 2275 }, { "epoch": 0.6417112299465241, "grad_norm": 0.9146699905395508, "learning_rate": 0.00017173652026475247, "loss": 0.4753, "step": 2280 }, { "epoch": 0.643118491415705, "grad_norm": 0.6895302534103394, "learning_rate": 0.00017156517508618116, "loss": 0.2637, "step": 2285 }, { "epoch": 0.644525752884886, "grad_norm": 1.011983036994934, "learning_rate": 0.00017139339812994204, "loss": 0.551, "step": 2290 }, { "epoch": 0.645933014354067, "grad_norm": 1.5470740795135498, "learning_rate": 0.0001712211904324254, "loss": 0.6397, "step": 2295 }, { "epoch": 0.6473402758232479, "grad_norm": 0.8334661722183228, "learning_rate": 0.0001710485530326204, "loss": 0.3297, "step": 2300 }, { "epoch": 0.648747537292429, "grad_norm": 1.3184936046600342, "learning_rate": 0.00017087548697210868, "loss": 0.2933, "step": 2305 }, { "epoch": 0.6501547987616099, "grad_norm": 0.6180691719055176, "learning_rate": 0.00017070199329505815, "loss": 0.316, "step": 2310 }, { "epoch": 0.6515620602307909, "grad_norm": 1.5314627885818481, "learning_rate": 0.00017052807304821673, "loss": 0.4908, "step": 2315 }, { "epoch": 0.6529693216999719, "grad_norm": 0.2867351472377777, "learning_rate": 0.0001703537272809059, "loss": 0.4078, "step": 2320 }, { "epoch": 0.6543765831691528, "grad_norm": 1.513857126235962, "learning_rate": 0.00017017895704501447, "loss": 0.5121, "step": 2325 }, { "epoch": 0.6557838446383338, "grad_norm": 0.7989262938499451, "learning_rate": 0.00017000376339499233, "loss": 0.4578, "step": 2330 }, { "epoch": 0.6571911061075147, "grad_norm": 1.8081159591674805, "learning_rate": 0.00016982814738784386, "loss": 0.3809, "step": 2335 }, { "epoch": 0.6585983675766958, "grad_norm": 1.2163859605789185, "learning_rate": 0.0001696521100831216, "loss": 0.3293, "step": 2340 }, { "epoch": 0.6600056290458767, "grad_norm": 1.5051732063293457, "learning_rate": 0.00016947565254292016, "loss": 0.33, "step": 2345 }, { "epoch": 0.6614128905150577, "grad_norm": 0.6793294548988342, "learning_rate": 0.00016929877583186936, "loss": 0.5292, "step": 2350 }, { "epoch": 0.6628201519842387, "grad_norm": 1.8864996433258057, "learning_rate": 0.00016912148101712814, "loss": 0.1853, "step": 2355 }, { "epoch": 0.6642274134534196, "grad_norm": 1.2697969675064087, "learning_rate": 0.00016894376916837795, "loss": 0.4886, "step": 2360 }, { "epoch": 0.6656346749226006, "grad_norm": 1.4264556169509888, "learning_rate": 0.00016876564135781638, "loss": 0.5061, "step": 2365 }, { "epoch": 0.6670419363917816, "grad_norm": 0.5291624665260315, "learning_rate": 0.00016858709866015065, "loss": 0.4241, "step": 2370 }, { "epoch": 0.6684491978609626, "grad_norm": 1.5842996835708618, "learning_rate": 0.00016840814215259112, "loss": 0.4321, "step": 2375 }, { "epoch": 0.6698564593301436, "grad_norm": 0.7339175939559937, "learning_rate": 0.0001682287729148449, "loss": 0.4975, "step": 2380 }, { "epoch": 0.6712637207993245, "grad_norm": 0.6193541884422302, "learning_rate": 0.00016804899202910907, "loss": 0.1977, "step": 2385 }, { "epoch": 0.6726709822685055, "grad_norm": 1.8930505514144897, "learning_rate": 0.00016786880058006453, "loss": 0.6117, "step": 2390 }, { "epoch": 0.6740782437376864, "grad_norm": 1.268921971321106, "learning_rate": 0.0001676881996548691, "loss": 0.5449, "step": 2395 }, { "epoch": 0.6754855052068675, "grad_norm": 1.5368669033050537, "learning_rate": 0.00016750719034315121, "loss": 0.4734, "step": 2400 }, { "epoch": 0.6768927666760484, "grad_norm": 0.8705158233642578, "learning_rate": 0.00016732577373700314, "loss": 0.4644, "step": 2405 }, { "epoch": 0.6783000281452294, "grad_norm": 0.3128531873226166, "learning_rate": 0.00016714395093097458, "loss": 0.4438, "step": 2410 }, { "epoch": 0.6797072896144104, "grad_norm": 1.795952558517456, "learning_rate": 0.00016696172302206597, "loss": 0.463, "step": 2415 }, { "epoch": 0.6811145510835913, "grad_norm": 0.8031005263328552, "learning_rate": 0.00016677909110972183, "loss": 0.727, "step": 2420 }, { "epoch": 0.6825218125527723, "grad_norm": 1.083425760269165, "learning_rate": 0.00016659605629582418, "loss": 0.6498, "step": 2425 }, { "epoch": 0.6839290740219532, "grad_norm": 0.9262056350708008, "learning_rate": 0.00016641261968468598, "loss": 0.3122, "step": 2430 }, { "epoch": 0.6853363354911343, "grad_norm": 0.27757611870765686, "learning_rate": 0.00016622878238304424, "loss": 0.3477, "step": 2435 }, { "epoch": 0.6867435969603152, "grad_norm": 0.6037611365318298, "learning_rate": 0.00016604454550005356, "loss": 0.2896, "step": 2440 }, { "epoch": 0.6881508584294962, "grad_norm": 0.7902546525001526, "learning_rate": 0.00016585991014727932, "loss": 0.6687, "step": 2445 }, { "epoch": 0.6895581198986772, "grad_norm": 0.8998187184333801, "learning_rate": 0.000165674877438691, "loss": 0.5168, "step": 2450 }, { "epoch": 0.6909653813678581, "grad_norm": 0.9715900421142578, "learning_rate": 0.0001654894484906555, "loss": 0.6263, "step": 2455 }, { "epoch": 0.6923726428370391, "grad_norm": 1.390411138534546, "learning_rate": 0.00016530362442193037, "loss": 0.4905, "step": 2460 }, { "epoch": 0.69377990430622, "grad_norm": 0.8985224366188049, "learning_rate": 0.00016511740635365705, "loss": 0.5525, "step": 2465 }, { "epoch": 0.6951871657754011, "grad_norm": 0.8099625110626221, "learning_rate": 0.00016493079540935406, "loss": 0.3906, "step": 2470 }, { "epoch": 0.6965944272445821, "grad_norm": 1.9844683408737183, "learning_rate": 0.00016474379271491033, "loss": 0.5456, "step": 2475 }, { "epoch": 0.698001688713763, "grad_norm": 1.053562045097351, "learning_rate": 0.00016455639939857842, "loss": 0.2934, "step": 2480 }, { "epoch": 0.699408950182944, "grad_norm": 1.4200698137283325, "learning_rate": 0.00016436861659096752, "loss": 0.6771, "step": 2485 }, { "epoch": 0.7008162116521249, "grad_norm": 0.7813885807991028, "learning_rate": 0.00016418044542503685, "loss": 0.357, "step": 2490 }, { "epoch": 0.702223473121306, "grad_norm": 1.131839632987976, "learning_rate": 0.00016399188703608867, "loss": 0.528, "step": 2495 }, { "epoch": 0.7036307345904869, "grad_norm": 0.7668808698654175, "learning_rate": 0.00016380294256176155, "loss": 0.4434, "step": 2500 }, { "epoch": 0.7050379960596679, "grad_norm": 2.0037477016448975, "learning_rate": 0.00016361361314202343, "loss": 0.5884, "step": 2505 }, { "epoch": 0.7064452575288489, "grad_norm": 0.726494550704956, "learning_rate": 0.0001634238999191647, "loss": 0.4555, "step": 2510 }, { "epoch": 0.7078525189980298, "grad_norm": 0.5868455171585083, "learning_rate": 0.0001632338040377915, "loss": 0.4513, "step": 2515 }, { "epoch": 0.7092597804672108, "grad_norm": 0.8666847348213196, "learning_rate": 0.00016304332664481848, "loss": 0.7028, "step": 2520 }, { "epoch": 0.7106670419363917, "grad_norm": 1.0513399839401245, "learning_rate": 0.00016285246888946234, "loss": 0.3972, "step": 2525 }, { "epoch": 0.7120743034055728, "grad_norm": 0.765617847442627, "learning_rate": 0.0001626612319232344, "loss": 0.4364, "step": 2530 }, { "epoch": 0.7134815648747538, "grad_norm": 0.7804258465766907, "learning_rate": 0.00016246961689993404, "loss": 0.6756, "step": 2535 }, { "epoch": 0.7148888263439347, "grad_norm": 1.0644882917404175, "learning_rate": 0.00016227762497564153, "loss": 0.4398, "step": 2540 }, { "epoch": 0.7162960878131157, "grad_norm": 1.0868752002716064, "learning_rate": 0.0001620852573087111, "loss": 0.4097, "step": 2545 }, { "epoch": 0.7177033492822966, "grad_norm": 0.877193033695221, "learning_rate": 0.00016189251505976403, "loss": 0.4445, "step": 2550 }, { "epoch": 0.7191106107514776, "grad_norm": 1.735767126083374, "learning_rate": 0.00016169939939168155, "loss": 0.4002, "step": 2555 }, { "epoch": 0.7205178722206586, "grad_norm": 0.679560124874115, "learning_rate": 0.00016150591146959787, "loss": 0.4376, "step": 2560 }, { "epoch": 0.7219251336898396, "grad_norm": 0.7569028735160828, "learning_rate": 0.00016131205246089304, "loss": 0.5988, "step": 2565 }, { "epoch": 0.7233323951590206, "grad_norm": 0.7681282758712769, "learning_rate": 0.00016111782353518624, "loss": 0.6736, "step": 2570 }, { "epoch": 0.7247396566282015, "grad_norm": 0.9109302759170532, "learning_rate": 0.0001609232258643282, "loss": 0.4269, "step": 2575 }, { "epoch": 0.7261469180973825, "grad_norm": 1.033499002456665, "learning_rate": 0.00016072826062239458, "loss": 0.4186, "step": 2580 }, { "epoch": 0.7275541795665634, "grad_norm": 0.765438437461853, "learning_rate": 0.00016053292898567876, "loss": 0.4688, "step": 2585 }, { "epoch": 0.7289614410357445, "grad_norm": 1.352359414100647, "learning_rate": 0.00016033723213268464, "loss": 0.4242, "step": 2590 }, { "epoch": 0.7303687025049254, "grad_norm": 0.9118134379386902, "learning_rate": 0.00016014117124411954, "loss": 0.4915, "step": 2595 }, { "epoch": 0.7317759639741064, "grad_norm": 1.1372839212417603, "learning_rate": 0.00015994474750288725, "loss": 0.3128, "step": 2600 }, { "epoch": 0.7331832254432874, "grad_norm": 0.23089000582695007, "learning_rate": 0.00015974796209408071, "loss": 0.4923, "step": 2605 }, { "epoch": 0.7345904869124683, "grad_norm": 1.543110728263855, "learning_rate": 0.00015955081620497497, "loss": 0.5901, "step": 2610 }, { "epoch": 0.7359977483816493, "grad_norm": 1.474463939666748, "learning_rate": 0.00015935331102501994, "loss": 0.5367, "step": 2615 }, { "epoch": 0.7374050098508302, "grad_norm": 0.7584693431854248, "learning_rate": 0.00015915544774583324, "loss": 0.6098, "step": 2620 }, { "epoch": 0.7388122713200113, "grad_norm": 0.6778565645217896, "learning_rate": 0.0001589572275611931, "loss": 0.4514, "step": 2625 }, { "epoch": 0.7402195327891923, "grad_norm": 0.7713000178337097, "learning_rate": 0.00015875865166703105, "loss": 0.2646, "step": 2630 }, { "epoch": 0.7416267942583732, "grad_norm": 1.2152999639511108, "learning_rate": 0.0001585597212614247, "loss": 0.5909, "step": 2635 }, { "epoch": 0.7430340557275542, "grad_norm": 1.4983125925064087, "learning_rate": 0.00015836043754459064, "loss": 0.4621, "step": 2640 }, { "epoch": 0.7444413171967351, "grad_norm": 1.0301270484924316, "learning_rate": 0.000158160801718877, "loss": 0.2372, "step": 2645 }, { "epoch": 0.7458485786659161, "grad_norm": 1.2305338382720947, "learning_rate": 0.0001579608149887564, "loss": 0.3397, "step": 2650 }, { "epoch": 0.747255840135097, "grad_norm": 1.1948976516723633, "learning_rate": 0.00015776047856081853, "loss": 0.3388, "step": 2655 }, { "epoch": 0.7486631016042781, "grad_norm": 1.539473295211792, "learning_rate": 0.00015755979364376295, "loss": 0.239, "step": 2660 }, { "epoch": 0.7500703630734591, "grad_norm": 2.136974811553955, "learning_rate": 0.0001573587614483918, "loss": 0.5409, "step": 2665 }, { "epoch": 0.75147762454264, "grad_norm": 1.2603963613510132, "learning_rate": 0.0001571573831876024, "loss": 0.3763, "step": 2670 }, { "epoch": 0.752884886011821, "grad_norm": 0.9054425954818726, "learning_rate": 0.00015695566007638013, "loss": 0.4531, "step": 2675 }, { "epoch": 0.7542921474810019, "grad_norm": 0.6948245763778687, "learning_rate": 0.0001567535933317908, "loss": 0.3894, "step": 2680 }, { "epoch": 0.755699408950183, "grad_norm": 1.3231799602508545, "learning_rate": 0.00015655118417297366, "loss": 0.4352, "step": 2685 }, { "epoch": 0.757106670419364, "grad_norm": 0.8093194365501404, "learning_rate": 0.00015634843382113372, "loss": 0.5505, "step": 2690 }, { "epoch": 0.7585139318885449, "grad_norm": 0.7088418006896973, "learning_rate": 0.0001561453434995346, "loss": 0.4232, "step": 2695 }, { "epoch": 0.7599211933577259, "grad_norm": 0.48376569151878357, "learning_rate": 0.00015594191443349105, "loss": 0.5123, "step": 2700 }, { "epoch": 0.7613284548269068, "grad_norm": 1.2853504419326782, "learning_rate": 0.00015573814785036164, "loss": 0.3733, "step": 2705 }, { "epoch": 0.7627357162960878, "grad_norm": 0.7034462690353394, "learning_rate": 0.00015553404497954117, "loss": 0.4144, "step": 2710 }, { "epoch": 0.7641429777652687, "grad_norm": 1.340484380722046, "learning_rate": 0.00015532960705245356, "loss": 0.4388, "step": 2715 }, { "epoch": 0.7655502392344498, "grad_norm": 0.7512633204460144, "learning_rate": 0.00015512483530254412, "loss": 0.4672, "step": 2720 }, { "epoch": 0.7669575007036308, "grad_norm": 2.1453585624694824, "learning_rate": 0.00015491973096527217, "loss": 0.8132, "step": 2725 }, { "epoch": 0.7683647621728117, "grad_norm": 1.0686702728271484, "learning_rate": 0.00015471429527810383, "loss": 0.3679, "step": 2730 }, { "epoch": 0.7697720236419927, "grad_norm": 1.7490125894546509, "learning_rate": 0.00015450852948050426, "loss": 0.3288, "step": 2735 }, { "epoch": 0.7711792851111736, "grad_norm": 1.7581394910812378, "learning_rate": 0.00015430243481393024, "loss": 0.6833, "step": 2740 }, { "epoch": 0.7725865465803546, "grad_norm": 1.5255379676818848, "learning_rate": 0.00015409601252182285, "loss": 0.4711, "step": 2745 }, { "epoch": 0.7739938080495357, "grad_norm": 1.7117855548858643, "learning_rate": 0.00015388926384959976, "loss": 0.6609, "step": 2750 }, { "epoch": 0.7754010695187166, "grad_norm": 0.5109424591064453, "learning_rate": 0.00015368219004464786, "loss": 0.3426, "step": 2755 }, { "epoch": 0.7768083309878976, "grad_norm": 1.3394129276275635, "learning_rate": 0.0001534747923563156, "loss": 0.4882, "step": 2760 }, { "epoch": 0.7782155924570785, "grad_norm": 1.1809154748916626, "learning_rate": 0.00015326707203590568, "loss": 0.262, "step": 2765 }, { "epoch": 0.7796228539262595, "grad_norm": 0.6428471207618713, "learning_rate": 0.0001530590303366672, "loss": 0.3657, "step": 2770 }, { "epoch": 0.7810301153954404, "grad_norm": 0.5726737976074219, "learning_rate": 0.0001528506685137883, "loss": 0.4514, "step": 2775 }, { "epoch": 0.7824373768646214, "grad_norm": 0.589094877243042, "learning_rate": 0.00015264198782438858, "loss": 0.5539, "step": 2780 }, { "epoch": 0.7838446383338025, "grad_norm": 0.7207341194152832, "learning_rate": 0.00015243298952751145, "loss": 0.3529, "step": 2785 }, { "epoch": 0.7852518998029834, "grad_norm": 1.0593701601028442, "learning_rate": 0.0001522236748841165, "loss": 0.317, "step": 2790 }, { "epoch": 0.7866591612721644, "grad_norm": 1.1395798921585083, "learning_rate": 0.000152014045157072, "loss": 0.5062, "step": 2795 }, { "epoch": 0.7880664227413453, "grad_norm": 1.3966251611709595, "learning_rate": 0.00015180410161114724, "loss": 0.4887, "step": 2800 }, { "epoch": 0.7894736842105263, "grad_norm": 0.7492479681968689, "learning_rate": 0.00015159384551300493, "loss": 0.3919, "step": 2805 }, { "epoch": 0.7908809456797072, "grad_norm": 1.2680071592330933, "learning_rate": 0.00015138327813119337, "loss": 0.3053, "step": 2810 }, { "epoch": 0.7922882071488883, "grad_norm": 1.4319703578948975, "learning_rate": 0.00015117240073613908, "loss": 0.3683, "step": 2815 }, { "epoch": 0.7936954686180693, "grad_norm": 1.0931735038757324, "learning_rate": 0.00015096121460013895, "loss": 0.5054, "step": 2820 }, { "epoch": 0.7951027300872502, "grad_norm": 0.627133309841156, "learning_rate": 0.00015074972099735266, "loss": 0.4424, "step": 2825 }, { "epoch": 0.7965099915564312, "grad_norm": 0.90239417552948, "learning_rate": 0.00015053792120379476, "loss": 0.5346, "step": 2830 }, { "epoch": 0.7979172530256121, "grad_norm": 1.3932188749313354, "learning_rate": 0.0001503258164973274, "loss": 0.5265, "step": 2835 }, { "epoch": 0.7993245144947931, "grad_norm": 1.2821606397628784, "learning_rate": 0.0001501134081576523, "loss": 0.3778, "step": 2840 }, { "epoch": 0.8007317759639742, "grad_norm": 0.8399055600166321, "learning_rate": 0.00014990069746630299, "loss": 0.5459, "step": 2845 }, { "epoch": 0.8021390374331551, "grad_norm": 2.0415430068969727, "learning_rate": 0.00014968768570663735, "loss": 0.534, "step": 2850 }, { "epoch": 0.8035462989023361, "grad_norm": 1.1202126741409302, "learning_rate": 0.00014947437416382956, "loss": 0.3913, "step": 2855 }, { "epoch": 0.804953560371517, "grad_norm": 1.3579108715057373, "learning_rate": 0.00014926076412486263, "loss": 0.3769, "step": 2860 }, { "epoch": 0.806360821840698, "grad_norm": 1.1060523986816406, "learning_rate": 0.00014904685687852043, "loss": 0.4045, "step": 2865 }, { "epoch": 0.8077680833098789, "grad_norm": 1.785001277923584, "learning_rate": 0.00014883265371538, "loss": 0.4895, "step": 2870 }, { "epoch": 0.80917534477906, "grad_norm": 0.7138920426368713, "learning_rate": 0.00014861815592780378, "loss": 0.2431, "step": 2875 }, { "epoch": 0.810582606248241, "grad_norm": 1.0932033061981201, "learning_rate": 0.00014840336480993172, "loss": 0.4196, "step": 2880 }, { "epoch": 0.8119898677174219, "grad_norm": 1.47943115234375, "learning_rate": 0.00014818828165767355, "loss": 0.4288, "step": 2885 }, { "epoch": 0.8133971291866029, "grad_norm": 1.5669611692428589, "learning_rate": 0.00014797290776870101, "loss": 0.7103, "step": 2890 }, { "epoch": 0.8148043906557838, "grad_norm": 1.002616047859192, "learning_rate": 0.0001477572444424399, "loss": 0.2174, "step": 2895 }, { "epoch": 0.8162116521249648, "grad_norm": 1.2607040405273438, "learning_rate": 0.00014754129298006228, "loss": 0.3312, "step": 2900 }, { "epoch": 0.8176189135941458, "grad_norm": 1.2113310098648071, "learning_rate": 0.00014732505468447867, "loss": 0.309, "step": 2905 }, { "epoch": 0.8190261750633268, "grad_norm": 0.6215373277664185, "learning_rate": 0.00014710853086033013, "loss": 0.3802, "step": 2910 }, { "epoch": 0.8204334365325078, "grad_norm": 0.9997283220291138, "learning_rate": 0.00014689172281398042, "loss": 0.5467, "step": 2915 }, { "epoch": 0.8218406980016887, "grad_norm": 0.7299907803535461, "learning_rate": 0.0001466746318535082, "loss": 0.4039, "step": 2920 }, { "epoch": 0.8232479594708697, "grad_norm": 0.8940709829330444, "learning_rate": 0.00014645725928869892, "loss": 0.282, "step": 2925 }, { "epoch": 0.8246552209400506, "grad_norm": 1.1947124004364014, "learning_rate": 0.00014623960643103705, "loss": 0.4364, "step": 2930 }, { "epoch": 0.8260624824092316, "grad_norm": 0.6835992932319641, "learning_rate": 0.00014602167459369826, "loss": 0.4539, "step": 2935 }, { "epoch": 0.8274697438784127, "grad_norm": 0.7021106481552124, "learning_rate": 0.00014580346509154136, "loss": 0.2876, "step": 2940 }, { "epoch": 0.8288770053475936, "grad_norm": 1.7289482355117798, "learning_rate": 0.00014558497924110038, "loss": 0.4377, "step": 2945 }, { "epoch": 0.8302842668167746, "grad_norm": 1.0549077987670898, "learning_rate": 0.00014536621836057665, "loss": 0.5667, "step": 2950 }, { "epoch": 0.8316915282859555, "grad_norm": 0.5255772471427917, "learning_rate": 0.000145147183769831, "loss": 0.4976, "step": 2955 }, { "epoch": 0.8330987897551365, "grad_norm": 2.376354694366455, "learning_rate": 0.00014492787679037537, "loss": 0.8001, "step": 2960 }, { "epoch": 0.8345060512243174, "grad_norm": 0.8916311264038086, "learning_rate": 0.0001447082987453654, "loss": 0.4217, "step": 2965 }, { "epoch": 0.8359133126934984, "grad_norm": 0.5236600637435913, "learning_rate": 0.00014448845095959192, "loss": 0.4531, "step": 2970 }, { "epoch": 0.8373205741626795, "grad_norm": 1.5615344047546387, "learning_rate": 0.00014426833475947345, "loss": 0.3796, "step": 2975 }, { "epoch": 0.8387278356318604, "grad_norm": 0.6851219534873962, "learning_rate": 0.00014404795147304774, "loss": 0.3966, "step": 2980 }, { "epoch": 0.8401350971010414, "grad_norm": 1.6611498594284058, "learning_rate": 0.00014382730242996404, "loss": 0.6284, "step": 2985 }, { "epoch": 0.8415423585702223, "grad_norm": 2.139336109161377, "learning_rate": 0.00014360638896147501, "loss": 0.4697, "step": 2990 }, { "epoch": 0.8429496200394033, "grad_norm": 1.0581591129302979, "learning_rate": 0.00014338521240042873, "loss": 0.5119, "step": 2995 }, { "epoch": 0.8443568815085843, "grad_norm": 0.885945200920105, "learning_rate": 0.00014316377408126046, "loss": 0.4225, "step": 3000 }, { "epoch": 0.8457641429777653, "grad_norm": 2.1063387393951416, "learning_rate": 0.00014294207533998486, "loss": 0.4308, "step": 3005 }, { "epoch": 0.8471714044469463, "grad_norm": 0.6381533741950989, "learning_rate": 0.00014272011751418782, "loss": 0.4063, "step": 3010 }, { "epoch": 0.8485786659161272, "grad_norm": 0.740987241268158, "learning_rate": 0.00014249790194301832, "loss": 0.2807, "step": 3015 }, { "epoch": 0.8499859273853082, "grad_norm": 0.8399060964584351, "learning_rate": 0.0001422754299671804, "loss": 0.3904, "step": 3020 }, { "epoch": 0.8513931888544891, "grad_norm": 1.4542044401168823, "learning_rate": 0.00014205270292892512, "loss": 0.5098, "step": 3025 }, { "epoch": 0.8528004503236701, "grad_norm": 0.8759632706642151, "learning_rate": 0.00014182972217204238, "loss": 0.438, "step": 3030 }, { "epoch": 0.8542077117928512, "grad_norm": 1.2544376850128174, "learning_rate": 0.00014160648904185295, "loss": 0.3654, "step": 3035 }, { "epoch": 0.8556149732620321, "grad_norm": 0.9191109538078308, "learning_rate": 0.00014138300488520007, "loss": 0.4855, "step": 3040 }, { "epoch": 0.8570222347312131, "grad_norm": 1.2452969551086426, "learning_rate": 0.00014115927105044172, "loss": 0.1865, "step": 3045 }, { "epoch": 0.858429496200394, "grad_norm": 1.0692249536514282, "learning_rate": 0.00014093528888744212, "loss": 0.3869, "step": 3050 }, { "epoch": 0.859836757669575, "grad_norm": 0.9611905217170715, "learning_rate": 0.00014071105974756382, "loss": 0.4429, "step": 3055 }, { "epoch": 0.861244019138756, "grad_norm": 1.419103741645813, "learning_rate": 0.00014048658498365946, "loss": 0.3828, "step": 3060 }, { "epoch": 0.8626512806079369, "grad_norm": 0.70958012342453, "learning_rate": 0.00014026186595006356, "loss": 0.4098, "step": 3065 }, { "epoch": 0.864058542077118, "grad_norm": 0.7273248434066772, "learning_rate": 0.0001400369040025845, "loss": 0.3795, "step": 3070 }, { "epoch": 0.8654658035462989, "grad_norm": 1.2816479206085205, "learning_rate": 0.00013981170049849614, "loss": 0.3648, "step": 3075 }, { "epoch": 0.8668730650154799, "grad_norm": 1.0046167373657227, "learning_rate": 0.00013958625679652982, "loss": 0.3949, "step": 3080 }, { "epoch": 0.8682803264846608, "grad_norm": 0.45679983496665955, "learning_rate": 0.000139360574256866, "loss": 0.3828, "step": 3085 }, { "epoch": 0.8696875879538418, "grad_norm": 0.7042393684387207, "learning_rate": 0.00013913465424112627, "loss": 0.3163, "step": 3090 }, { "epoch": 0.8710948494230228, "grad_norm": 0.7769744992256165, "learning_rate": 0.00013890849811236478, "loss": 0.275, "step": 3095 }, { "epoch": 0.8725021108922038, "grad_norm": 0.5500330328941345, "learning_rate": 0.0001386821072350604, "loss": 0.36, "step": 3100 }, { "epoch": 0.8739093723613848, "grad_norm": 1.508569359779358, "learning_rate": 0.00013845548297510834, "loss": 0.3744, "step": 3105 }, { "epoch": 0.8753166338305657, "grad_norm": 1.6323150396347046, "learning_rate": 0.0001382286266998117, "loss": 0.5385, "step": 3110 }, { "epoch": 0.8767238952997467, "grad_norm": 1.0691790580749512, "learning_rate": 0.00013800153977787364, "loss": 0.4918, "step": 3115 }, { "epoch": 0.8781311567689276, "grad_norm": 0.8545736074447632, "learning_rate": 0.0001377742235793887, "loss": 0.327, "step": 3120 }, { "epoch": 0.8795384182381086, "grad_norm": 1.2977032661437988, "learning_rate": 0.00013754667947583486, "loss": 0.3627, "step": 3125 }, { "epoch": 0.8809456797072897, "grad_norm": 0.8414074778556824, "learning_rate": 0.00013731890884006507, "loss": 0.4126, "step": 3130 }, { "epoch": 0.8823529411764706, "grad_norm": 1.2440998554229736, "learning_rate": 0.00013709091304629903, "loss": 0.5402, "step": 3135 }, { "epoch": 0.8837602026456516, "grad_norm": 1.1474038362503052, "learning_rate": 0.00013686269347011487, "loss": 0.4402, "step": 3140 }, { "epoch": 0.8851674641148325, "grad_norm": 1.9769107103347778, "learning_rate": 0.00013663425148844097, "loss": 0.5528, "step": 3145 }, { "epoch": 0.8865747255840135, "grad_norm": 1.071049451828003, "learning_rate": 0.00013640558847954746, "loss": 0.3496, "step": 3150 }, { "epoch": 0.8879819870531945, "grad_norm": 1.002313494682312, "learning_rate": 0.00013617670582303804, "loss": 0.4351, "step": 3155 }, { "epoch": 0.8893892485223754, "grad_norm": 0.8908954858779907, "learning_rate": 0.00013594760489984167, "loss": 0.3371, "step": 3160 }, { "epoch": 0.8907965099915565, "grad_norm": 0.9060853123664856, "learning_rate": 0.00013571828709220413, "loss": 0.2489, "step": 3165 }, { "epoch": 0.8922037714607374, "grad_norm": 0.7479000687599182, "learning_rate": 0.00013548875378367972, "loss": 0.2874, "step": 3170 }, { "epoch": 0.8936110329299184, "grad_norm": 0.9289246201515198, "learning_rate": 0.00013525900635912299, "loss": 0.466, "step": 3175 }, { "epoch": 0.8950182943990993, "grad_norm": 1.428377628326416, "learning_rate": 0.0001350290462046803, "loss": 0.5203, "step": 3180 }, { "epoch": 0.8964255558682803, "grad_norm": 0.7524283528327942, "learning_rate": 0.00013479887470778149, "loss": 0.365, "step": 3185 }, { "epoch": 0.8978328173374613, "grad_norm": 1.021815299987793, "learning_rate": 0.0001345684932571315, "loss": 0.5084, "step": 3190 }, { "epoch": 0.8992400788066423, "grad_norm": 0.7522305846214294, "learning_rate": 0.00013433790324270199, "loss": 0.2659, "step": 3195 }, { "epoch": 0.9006473402758233, "grad_norm": 1.3865163326263428, "learning_rate": 0.00013410710605572294, "loss": 0.2533, "step": 3200 }, { "epoch": 0.9020546017450042, "grad_norm": 1.8485382795333862, "learning_rate": 0.00013387610308867437, "loss": 0.3675, "step": 3205 }, { "epoch": 0.9034618632141852, "grad_norm": 1.203482985496521, "learning_rate": 0.0001336448957352777, "loss": 0.3284, "step": 3210 }, { "epoch": 0.9048691246833662, "grad_norm": 0.9714936017990112, "learning_rate": 0.00013341348539048752, "loss": 0.2657, "step": 3215 }, { "epoch": 0.9062763861525471, "grad_norm": 1.062326192855835, "learning_rate": 0.00013318187345048328, "loss": 0.3837, "step": 3220 }, { "epoch": 0.9076836476217282, "grad_norm": 1.3822613954544067, "learning_rate": 0.00013295006131266055, "loss": 0.3584, "step": 3225 }, { "epoch": 0.9090909090909091, "grad_norm": 1.2804548740386963, "learning_rate": 0.0001327180503756228, "loss": 0.4558, "step": 3230 }, { "epoch": 0.9104981705600901, "grad_norm": 0.6253718137741089, "learning_rate": 0.00013248584203917298, "loss": 0.2871, "step": 3235 }, { "epoch": 0.911905432029271, "grad_norm": 0.8237050175666809, "learning_rate": 0.00013225343770430502, "loss": 0.4014, "step": 3240 }, { "epoch": 0.913312693498452, "grad_norm": 0.9199953675270081, "learning_rate": 0.00013202083877319538, "loss": 0.597, "step": 3245 }, { "epoch": 0.914719954967633, "grad_norm": 1.0530214309692383, "learning_rate": 0.00013178804664919444, "loss": 0.5745, "step": 3250 }, { "epoch": 0.9161272164368139, "grad_norm": 1.0369855165481567, "learning_rate": 0.00013155506273681837, "loss": 0.2493, "step": 3255 }, { "epoch": 0.917534477905995, "grad_norm": 0.37017834186553955, "learning_rate": 0.00013132188844174042, "loss": 0.5125, "step": 3260 }, { "epoch": 0.9189417393751759, "grad_norm": 0.5272582769393921, "learning_rate": 0.0001310885251707824, "loss": 0.2099, "step": 3265 }, { "epoch": 0.9203490008443569, "grad_norm": 1.3228068351745605, "learning_rate": 0.00013085497433190635, "loss": 0.3625, "step": 3270 }, { "epoch": 0.9217562623135379, "grad_norm": 1.2980788946151733, "learning_rate": 0.000130621237334206, "loss": 0.3258, "step": 3275 }, { "epoch": 0.9231635237827188, "grad_norm": 0.7955147624015808, "learning_rate": 0.00013038731558789816, "loss": 0.331, "step": 3280 }, { "epoch": 0.9245707852518998, "grad_norm": 0.33198082447052, "learning_rate": 0.00013015321050431435, "loss": 0.2828, "step": 3285 }, { "epoch": 0.9259780467210807, "grad_norm": 1.193824052810669, "learning_rate": 0.0001299189234958922, "loss": 0.5299, "step": 3290 }, { "epoch": 0.9273853081902618, "grad_norm": 0.6841180324554443, "learning_rate": 0.00012968445597616695, "loss": 0.2236, "step": 3295 }, { "epoch": 0.9287925696594427, "grad_norm": 1.009793758392334, "learning_rate": 0.00012944980935976295, "loss": 0.4583, "step": 3300 }, { "epoch": 0.9301998311286237, "grad_norm": 1.1918591260910034, "learning_rate": 0.00012921498506238512, "loss": 0.4523, "step": 3305 }, { "epoch": 0.9316070925978047, "grad_norm": 0.7123336791992188, "learning_rate": 0.00012897998450081037, "loss": 0.3185, "step": 3310 }, { "epoch": 0.9330143540669856, "grad_norm": 0.6820237040519714, "learning_rate": 0.00012874480909287904, "loss": 0.4963, "step": 3315 }, { "epoch": 0.9344216155361666, "grad_norm": 0.6030889749526978, "learning_rate": 0.00012850946025748643, "loss": 0.3238, "step": 3320 }, { "epoch": 0.9358288770053476, "grad_norm": 0.3159545958042145, "learning_rate": 0.00012827393941457416, "loss": 0.1804, "step": 3325 }, { "epoch": 0.9372361384745286, "grad_norm": 0.500643789768219, "learning_rate": 0.00012803824798512166, "loss": 0.4421, "step": 3330 }, { "epoch": 0.9386433999437095, "grad_norm": 1.0271189212799072, "learning_rate": 0.00012780238739113755, "loss": 0.4825, "step": 3335 }, { "epoch": 0.9400506614128905, "grad_norm": 1.3835067749023438, "learning_rate": 0.000127566359055651, "loss": 0.5109, "step": 3340 }, { "epoch": 0.9414579228820715, "grad_norm": 0.6945546269416809, "learning_rate": 0.00012733016440270344, "loss": 0.3438, "step": 3345 }, { "epoch": 0.9428651843512524, "grad_norm": 0.5347813367843628, "learning_rate": 0.0001270938048573395, "loss": 0.2245, "step": 3350 }, { "epoch": 0.9442724458204335, "grad_norm": 0.5110495090484619, "learning_rate": 0.00012685728184559878, "loss": 0.3236, "step": 3355 }, { "epoch": 0.9456797072896144, "grad_norm": 1.1028776168823242, "learning_rate": 0.00012662059679450715, "loss": 0.3656, "step": 3360 }, { "epoch": 0.9470869687587954, "grad_norm": 1.0305935144424438, "learning_rate": 0.0001263837511320681, "loss": 0.2271, "step": 3365 }, { "epoch": 0.9484942302279764, "grad_norm": 1.1044567823410034, "learning_rate": 0.0001261467462872541, "loss": 0.3901, "step": 3370 }, { "epoch": 0.9499014916971573, "grad_norm": 1.0489617586135864, "learning_rate": 0.00012590958368999817, "loss": 0.3906, "step": 3375 }, { "epoch": 0.9513087531663383, "grad_norm": 0.9781221747398376, "learning_rate": 0.0001256722647711849, "loss": 0.3616, "step": 3380 }, { "epoch": 0.9527160146355192, "grad_norm": 1.1387841701507568, "learning_rate": 0.0001254347909626421, "loss": 0.2382, "step": 3385 }, { "epoch": 0.9541232761047003, "grad_norm": 1.3473316431045532, "learning_rate": 0.00012519716369713214, "loss": 0.446, "step": 3390 }, { "epoch": 0.9555305375738812, "grad_norm": 1.1464128494262695, "learning_rate": 0.00012495938440834327, "loss": 0.341, "step": 3395 }, { "epoch": 0.9569377990430622, "grad_norm": 0.9990252256393433, "learning_rate": 0.0001247214545308808, "loss": 0.4666, "step": 3400 }, { "epoch": 0.9583450605122432, "grad_norm": 1.9256302118301392, "learning_rate": 0.0001244833755002587, "loss": 0.4555, "step": 3405 }, { "epoch": 0.9597523219814241, "grad_norm": 0.8169670104980469, "learning_rate": 0.00012424514875289088, "loss": 0.6558, "step": 3410 }, { "epoch": 0.9611595834506051, "grad_norm": 1.60161554813385, "learning_rate": 0.0001240067757260824, "loss": 0.4544, "step": 3415 }, { "epoch": 0.9625668449197861, "grad_norm": 0.7437291741371155, "learning_rate": 0.0001237682578580208, "loss": 0.3022, "step": 3420 }, { "epoch": 0.9639741063889671, "grad_norm": 0.9030975699424744, "learning_rate": 0.00012352959658776767, "loss": 0.4267, "step": 3425 }, { "epoch": 0.9653813678581481, "grad_norm": 1.0298916101455688, "learning_rate": 0.00012329079335524973, "loss": 0.5084, "step": 3430 }, { "epoch": 0.966788629327329, "grad_norm": 1.4346392154693604, "learning_rate": 0.0001230518496012502, "loss": 0.5032, "step": 3435 }, { "epoch": 0.96819589079651, "grad_norm": 1.988788366317749, "learning_rate": 0.00012281276676739996, "loss": 0.5206, "step": 3440 }, { "epoch": 0.9696031522656909, "grad_norm": 0.627189040184021, "learning_rate": 0.00012257354629616933, "loss": 0.3927, "step": 3445 }, { "epoch": 0.971010413734872, "grad_norm": 1.1982104778289795, "learning_rate": 0.0001223341896308588, "loss": 0.4134, "step": 3450 }, { "epoch": 0.9724176752040529, "grad_norm": 1.1405185461044312, "learning_rate": 0.00012209469821559062, "loss": 0.314, "step": 3455 }, { "epoch": 0.9738249366732339, "grad_norm": 1.0637789964675903, "learning_rate": 0.00012185507349530006, "loss": 0.4855, "step": 3460 }, { "epoch": 0.9752321981424149, "grad_norm": 1.1884607076644897, "learning_rate": 0.00012161531691572665, "loss": 0.4043, "step": 3465 }, { "epoch": 0.9766394596115958, "grad_norm": 0.7082695960998535, "learning_rate": 0.00012137542992340552, "loss": 0.3864, "step": 3470 }, { "epoch": 0.9780467210807768, "grad_norm": 1.400940179824829, "learning_rate": 0.0001211354139656585, "loss": 0.3179, "step": 3475 }, { "epoch": 0.9794539825499577, "grad_norm": 1.0918678045272827, "learning_rate": 0.00012089527049058566, "loss": 0.3724, "step": 3480 }, { "epoch": 0.9808612440191388, "grad_norm": 0.8317002654075623, "learning_rate": 0.00012065500094705635, "loss": 0.4669, "step": 3485 }, { "epoch": 0.9822685054883197, "grad_norm": 2.4732000827789307, "learning_rate": 0.00012041460678470057, "loss": 0.536, "step": 3490 }, { "epoch": 0.9836757669575007, "grad_norm": 0.4239155650138855, "learning_rate": 0.00012017408945390009, "loss": 0.4178, "step": 3495 }, { "epoch": 0.9850830284266817, "grad_norm": 1.0096583366394043, "learning_rate": 0.00011993345040577995, "loss": 0.5533, "step": 3500 }, { "epoch": 0.9864902898958626, "grad_norm": 1.6637718677520752, "learning_rate": 0.00011969269109219945, "loss": 0.1999, "step": 3505 }, { "epoch": 0.9878975513650436, "grad_norm": 1.4339228868484497, "learning_rate": 0.0001194518129657435, "loss": 0.2913, "step": 3510 }, { "epoch": 0.9893048128342246, "grad_norm": 0.9473050236701965, "learning_rate": 0.00011921081747971392, "loss": 0.4202, "step": 3515 }, { "epoch": 0.9907120743034056, "grad_norm": 1.5468287467956543, "learning_rate": 0.00011896970608812053, "loss": 0.2755, "step": 3520 }, { "epoch": 0.9921193357725866, "grad_norm": 1.0197608470916748, "learning_rate": 0.00011872848024567245, "loss": 0.399, "step": 3525 }, { "epoch": 0.9935265972417675, "grad_norm": 1.9030907154083252, "learning_rate": 0.00011848714140776936, "loss": 0.3538, "step": 3530 }, { "epoch": 0.9949338587109485, "grad_norm": 1.1370608806610107, "learning_rate": 0.00011824569103049264, "loss": 0.6243, "step": 3535 }, { "epoch": 0.9963411201801294, "grad_norm": 0.7336493134498596, "learning_rate": 0.0001180041305705967, "loss": 0.287, "step": 3540 }, { "epoch": 0.9977483816493105, "grad_norm": 0.8091352581977844, "learning_rate": 0.0001177624614855, "loss": 0.4314, "step": 3545 }, { "epoch": 0.9991556431184914, "grad_norm": 0.8396396636962891, "learning_rate": 0.0001175206852332765, "loss": 0.243, "step": 3550 }, { "epoch": 1.0005629045876725, "grad_norm": 0.4893011152744293, "learning_rate": 0.00011727880327264667, "loss": 0.4008, "step": 3555 }, { "epoch": 1.0019701660568534, "grad_norm": 0.5934264659881592, "learning_rate": 0.00011703681706296871, "loss": 0.197, "step": 3560 }, { "epoch": 1.0033774275260343, "grad_norm": 0.9697572588920593, "learning_rate": 0.00011679472806422991, "loss": 0.2565, "step": 3565 }, { "epoch": 1.0047846889952152, "grad_norm": 0.6383791565895081, "learning_rate": 0.00011655253773703763, "loss": 0.1732, "step": 3570 }, { "epoch": 1.0061919504643964, "grad_norm": 2.7294044494628906, "learning_rate": 0.00011631024754261057, "loss": 0.344, "step": 3575 }, { "epoch": 1.0075992119335773, "grad_norm": 0.7987744212150574, "learning_rate": 0.00011606785894277002, "loss": 0.2462, "step": 3580 }, { "epoch": 1.0090064734027582, "grad_norm": 1.0963287353515625, "learning_rate": 0.00011582537339993102, "loss": 0.2017, "step": 3585 }, { "epoch": 1.0104137348719393, "grad_norm": 0.2937074303627014, "learning_rate": 0.00011558279237709337, "loss": 0.2587, "step": 3590 }, { "epoch": 1.0118209963411202, "grad_norm": 1.1680563688278198, "learning_rate": 0.00011534011733783303, "loss": 0.3315, "step": 3595 }, { "epoch": 1.0132282578103011, "grad_norm": 0.8227936029434204, "learning_rate": 0.00011509734974629316, "loss": 0.1936, "step": 3600 }, { "epoch": 1.014635519279482, "grad_norm": 1.266236424446106, "learning_rate": 0.0001148544910671754, "loss": 0.283, "step": 3605 }, { "epoch": 1.0160427807486632, "grad_norm": 0.4134606122970581, "learning_rate": 0.0001146115427657308, "loss": 0.1711, "step": 3610 }, { "epoch": 1.017450042217844, "grad_norm": 0.5949440598487854, "learning_rate": 0.00011436850630775127, "loss": 0.2659, "step": 3615 }, { "epoch": 1.018857303687025, "grad_norm": 1.2255134582519531, "learning_rate": 0.00011412538315956051, "loss": 0.331, "step": 3620 }, { "epoch": 1.0202645651562061, "grad_norm": 0.7793748378753662, "learning_rate": 0.00011388217478800536, "loss": 0.3107, "step": 3625 }, { "epoch": 1.021671826625387, "grad_norm": 1.5764113664627075, "learning_rate": 0.00011363888266044668, "loss": 0.2801, "step": 3630 }, { "epoch": 1.023079088094568, "grad_norm": 0.7818349599838257, "learning_rate": 0.0001133955082447508, "loss": 0.4592, "step": 3635 }, { "epoch": 1.0244863495637488, "grad_norm": 0.8325141072273254, "learning_rate": 0.00011315205300928047, "loss": 0.2221, "step": 3640 }, { "epoch": 1.02589361103293, "grad_norm": 0.8759342432022095, "learning_rate": 0.0001129085184228861, "loss": 0.2282, "step": 3645 }, { "epoch": 1.0273008725021109, "grad_norm": 0.8269652724266052, "learning_rate": 0.00011266490595489672, "loss": 0.288, "step": 3650 }, { "epoch": 1.0287081339712918, "grad_norm": 0.9182637929916382, "learning_rate": 0.0001124212170751114, "loss": 0.2124, "step": 3655 }, { "epoch": 1.030115395440473, "grad_norm": 0.7247250080108643, "learning_rate": 0.00011217745325379017, "loss": 0.2818, "step": 3660 }, { "epoch": 1.0315226569096538, "grad_norm": 1.1736894845962524, "learning_rate": 0.00011193361596164517, "loss": 0.2349, "step": 3665 }, { "epoch": 1.0329299183788347, "grad_norm": 0.3809513747692108, "learning_rate": 0.00011168970666983184, "loss": 0.158, "step": 3670 }, { "epoch": 1.0343371798480159, "grad_norm": 1.4163240194320679, "learning_rate": 0.0001114457268499401, "loss": 0.3035, "step": 3675 }, { "epoch": 1.0357444413171968, "grad_norm": 1.8142826557159424, "learning_rate": 0.00011120167797398527, "loss": 0.3572, "step": 3680 }, { "epoch": 1.0371517027863777, "grad_norm": 0.9238508343696594, "learning_rate": 0.00011095756151439934, "loss": 0.2104, "step": 3685 }, { "epoch": 1.0385589642555586, "grad_norm": 1.3922544717788696, "learning_rate": 0.0001107133789440221, "loss": 0.3846, "step": 3690 }, { "epoch": 1.0399662257247397, "grad_norm": 0.5761235952377319, "learning_rate": 0.00011046913173609217, "loss": 0.1728, "step": 3695 }, { "epoch": 1.0413734871939206, "grad_norm": 1.3399313688278198, "learning_rate": 0.0001102248213642382, "loss": 0.2158, "step": 3700 }, { "epoch": 1.0427807486631016, "grad_norm": 0.5189816355705261, "learning_rate": 0.00010998044930246985, "loss": 0.2724, "step": 3705 }, { "epoch": 1.0441880101322827, "grad_norm": 1.0454604625701904, "learning_rate": 0.00010973601702516903, "loss": 0.3016, "step": 3710 }, { "epoch": 1.0455952716014636, "grad_norm": 0.9476893544197083, "learning_rate": 0.00010949152600708096, "loss": 0.161, "step": 3715 }, { "epoch": 1.0470025330706445, "grad_norm": 1.1760029792785645, "learning_rate": 0.00010924697772330525, "loss": 0.3402, "step": 3720 }, { "epoch": 1.0484097945398254, "grad_norm": 0.7986089587211609, "learning_rate": 0.000109002373649287, "loss": 0.3381, "step": 3725 }, { "epoch": 1.0498170560090065, "grad_norm": 0.46115541458129883, "learning_rate": 0.00010875771526080791, "loss": 0.2121, "step": 3730 }, { "epoch": 1.0512243174781875, "grad_norm": 0.8159217238426208, "learning_rate": 0.00010851300403397741, "loss": 0.1618, "step": 3735 }, { "epoch": 1.0526315789473684, "grad_norm": 0.9532806277275085, "learning_rate": 0.00010826824144522369, "loss": 0.2001, "step": 3740 }, { "epoch": 1.0540388404165495, "grad_norm": 0.987647294998169, "learning_rate": 0.00010802342897128484, "loss": 0.1255, "step": 3745 }, { "epoch": 1.0554461018857304, "grad_norm": 0.5456539988517761, "learning_rate": 0.00010777856808919993, "loss": 0.1738, "step": 3750 }, { "epoch": 1.0568533633549113, "grad_norm": 1.2354178428649902, "learning_rate": 0.00010753366027630005, "loss": 0.1968, "step": 3755 }, { "epoch": 1.0582606248240922, "grad_norm": 1.5054504871368408, "learning_rate": 0.00010728870701019952, "loss": 0.3881, "step": 3760 }, { "epoch": 1.0596678862932734, "grad_norm": 0.33300110697746277, "learning_rate": 0.00010704370976878683, "loss": 0.3455, "step": 3765 }, { "epoch": 1.0610751477624543, "grad_norm": 0.28057172894477844, "learning_rate": 0.00010679867003021582, "loss": 0.3676, "step": 3770 }, { "epoch": 1.0624824092316352, "grad_norm": 0.78326416015625, "learning_rate": 0.0001065535892728967, "loss": 0.2051, "step": 3775 }, { "epoch": 1.0638896707008163, "grad_norm": 0.30371785163879395, "learning_rate": 0.00010630846897548719, "loss": 0.2172, "step": 3780 }, { "epoch": 1.0652969321699972, "grad_norm": 0.951871931552887, "learning_rate": 0.00010606331061688352, "loss": 0.2731, "step": 3785 }, { "epoch": 1.0667041936391781, "grad_norm": 0.9194802641868591, "learning_rate": 0.00010581811567621165, "loss": 0.437, "step": 3790 }, { "epoch": 1.068111455108359, "grad_norm": 1.3185656070709229, "learning_rate": 0.00010557288563281819, "loss": 0.1762, "step": 3795 }, { "epoch": 1.0695187165775402, "grad_norm": 0.6637858152389526, "learning_rate": 0.00010532762196626151, "loss": 0.3499, "step": 3800 }, { "epoch": 1.070925978046721, "grad_norm": 0.5646357536315918, "learning_rate": 0.00010508232615630291, "loss": 0.1794, "step": 3805 }, { "epoch": 1.072333239515902, "grad_norm": 0.7347474694252014, "learning_rate": 0.00010483699968289754, "loss": 0.2088, "step": 3810 }, { "epoch": 1.0737405009850831, "grad_norm": 0.7603871822357178, "learning_rate": 0.00010459164402618567, "loss": 0.2723, "step": 3815 }, { "epoch": 1.075147762454264, "grad_norm": 1.574090838432312, "learning_rate": 0.0001043462606664835, "loss": 0.3175, "step": 3820 }, { "epoch": 1.076555023923445, "grad_norm": 1.8480275869369507, "learning_rate": 0.00010410085108427448, "loss": 0.3903, "step": 3825 }, { "epoch": 1.0779622853926258, "grad_norm": 3.3462395668029785, "learning_rate": 0.00010385541676020026, "loss": 0.2867, "step": 3830 }, { "epoch": 1.079369546861807, "grad_norm": 1.0282424688339233, "learning_rate": 0.00010360995917505167, "loss": 0.3542, "step": 3835 }, { "epoch": 1.0807768083309879, "grad_norm": 1.081586241722107, "learning_rate": 0.00010336447980976, "loss": 0.1933, "step": 3840 }, { "epoch": 1.0821840698001688, "grad_norm": 0.7061908841133118, "learning_rate": 0.00010311898014538788, "loss": 0.3673, "step": 3845 }, { "epoch": 1.08359133126935, "grad_norm": 1.0589807033538818, "learning_rate": 0.00010287346166312048, "loss": 0.2017, "step": 3850 }, { "epoch": 1.0849985927385308, "grad_norm": 0.7850357890129089, "learning_rate": 0.0001026279258442564, "loss": 0.3781, "step": 3855 }, { "epoch": 1.0864058542077117, "grad_norm": 0.8800612688064575, "learning_rate": 0.00010238237417019889, "loss": 0.2454, "step": 3860 }, { "epoch": 1.0878131156768927, "grad_norm": 0.8004993796348572, "learning_rate": 0.00010213680812244693, "loss": 0.3253, "step": 3865 }, { "epoch": 1.0892203771460738, "grad_norm": 1.0395301580429077, "learning_rate": 0.00010189122918258611, "loss": 0.3023, "step": 3870 }, { "epoch": 1.0906276386152547, "grad_norm": 0.7087461352348328, "learning_rate": 0.00010164563883227982, "loss": 0.258, "step": 3875 }, { "epoch": 1.0920349000844356, "grad_norm": 1.0742789506912231, "learning_rate": 0.00010140003855326034, "loss": 0.1768, "step": 3880 }, { "epoch": 1.0934421615536167, "grad_norm": 1.7721843719482422, "learning_rate": 0.00010115442982731988, "loss": 0.2673, "step": 3885 }, { "epoch": 1.0948494230227976, "grad_norm": 0.5749943256378174, "learning_rate": 0.00010090881413630154, "loss": 0.2943, "step": 3890 }, { "epoch": 1.0962566844919786, "grad_norm": 1.210871696472168, "learning_rate": 0.00010066319296209043, "loss": 0.2569, "step": 3895 }, { "epoch": 1.0976639459611597, "grad_norm": 0.7546014189720154, "learning_rate": 0.00010041756778660483, "loss": 0.1277, "step": 3900 }, { "epoch": 1.0990712074303406, "grad_norm": 0.45546409487724304, "learning_rate": 0.0001001719400917871, "loss": 0.2447, "step": 3905 }, { "epoch": 1.1004784688995215, "grad_norm": 0.9810652136802673, "learning_rate": 9.992631135959484e-05, "loss": 0.1891, "step": 3910 }, { "epoch": 1.1018857303687024, "grad_norm": 0.26853448152542114, "learning_rate": 9.96806830719918e-05, "loss": 0.2793, "step": 3915 }, { "epoch": 1.1032929918378835, "grad_norm": 0.815556526184082, "learning_rate": 9.943505671093923e-05, "loss": 0.1589, "step": 3920 }, { "epoch": 1.1047002533070645, "grad_norm": 1.1649208068847656, "learning_rate": 9.918943375838658e-05, "loss": 0.1692, "step": 3925 }, { "epoch": 1.1061075147762454, "grad_norm": 1.3160449266433716, "learning_rate": 9.894381569626286e-05, "loss": 0.1748, "step": 3930 }, { "epoch": 1.1075147762454265, "grad_norm": 0.7906925082206726, "learning_rate": 9.869820400646752e-05, "loss": 0.2706, "step": 3935 }, { "epoch": 1.1089220377146074, "grad_norm": 1.7690831422805786, "learning_rate": 9.845260017086152e-05, "loss": 0.4101, "step": 3940 }, { "epoch": 1.1103292991837883, "grad_norm": 0.7361578941345215, "learning_rate": 9.820700567125855e-05, "loss": 0.2352, "step": 3945 }, { "epoch": 1.1117365606529692, "grad_norm": 0.7984316945075989, "learning_rate": 9.79614219894159e-05, "loss": 0.2466, "step": 3950 }, { "epoch": 1.1131438221221504, "grad_norm": 1.6478660106658936, "learning_rate": 9.771585060702551e-05, "loss": 0.2434, "step": 3955 }, { "epoch": 1.1145510835913313, "grad_norm": 0.8288646936416626, "learning_rate": 9.747029300570528e-05, "loss": 0.1954, "step": 3960 }, { "epoch": 1.1159583450605122, "grad_norm": 1.0649809837341309, "learning_rate": 9.722475066698992e-05, "loss": 0.1995, "step": 3965 }, { "epoch": 1.1173656065296933, "grad_norm": 1.0399101972579956, "learning_rate": 9.697922507232194e-05, "loss": 0.2972, "step": 3970 }, { "epoch": 1.1187728679988742, "grad_norm": 0.9969576001167297, "learning_rate": 9.673371770304291e-05, "loss": 0.2133, "step": 3975 }, { "epoch": 1.1201801294680551, "grad_norm": 0.7914555072784424, "learning_rate": 9.648823004038452e-05, "loss": 0.2006, "step": 3980 }, { "epoch": 1.1215873909372363, "grad_norm": 0.8462080359458923, "learning_rate": 9.62427635654594e-05, "loss": 0.1759, "step": 3985 }, { "epoch": 1.1229946524064172, "grad_norm": 1.5257298946380615, "learning_rate": 9.599731975925248e-05, "loss": 0.2961, "step": 3990 }, { "epoch": 1.124401913875598, "grad_norm": 0.918910562992096, "learning_rate": 9.575190010261179e-05, "loss": 0.2468, "step": 3995 }, { "epoch": 1.125809175344779, "grad_norm": 0.9318897128105164, "learning_rate": 9.550650607623982e-05, "loss": 0.2609, "step": 4000 }, { "epoch": 1.12721643681396, "grad_norm": 0.49596425890922546, "learning_rate": 9.526113916068431e-05, "loss": 0.2369, "step": 4005 }, { "epoch": 1.128623698283141, "grad_norm": 0.6530629396438599, "learning_rate": 9.501580083632946e-05, "loss": 0.1354, "step": 4010 }, { "epoch": 1.130030959752322, "grad_norm": 0.39932572841644287, "learning_rate": 9.477049258338694e-05, "loss": 0.2277, "step": 4015 }, { "epoch": 1.131438221221503, "grad_norm": 0.8406773805618286, "learning_rate": 9.452521588188711e-05, "loss": 0.1472, "step": 4020 }, { "epoch": 1.132845482690684, "grad_norm": 0.7629873752593994, "learning_rate": 9.427997221166978e-05, "loss": 0.2421, "step": 4025 }, { "epoch": 1.1342527441598649, "grad_norm": 1.1697338819503784, "learning_rate": 9.40347630523756e-05, "loss": 0.2181, "step": 4030 }, { "epoch": 1.1356600056290458, "grad_norm": 0.924167811870575, "learning_rate": 9.378958988343702e-05, "loss": 0.3934, "step": 4035 }, { "epoch": 1.137067267098227, "grad_norm": 0.8078356385231018, "learning_rate": 9.354445418406924e-05, "loss": 0.1403, "step": 4040 }, { "epoch": 1.1384745285674078, "grad_norm": 0.520318329334259, "learning_rate": 9.329935743326144e-05, "loss": 0.2916, "step": 4045 }, { "epoch": 1.1398817900365887, "grad_norm": 0.45882686972618103, "learning_rate": 9.305430110976793e-05, "loss": 0.1297, "step": 4050 }, { "epoch": 1.1412890515057699, "grad_norm": 0.5139206051826477, "learning_rate": 9.280928669209887e-05, "loss": 0.2342, "step": 4055 }, { "epoch": 1.1426963129749508, "grad_norm": 0.9370526671409607, "learning_rate": 9.256431565851181e-05, "loss": 0.1581, "step": 4060 }, { "epoch": 1.1441035744441317, "grad_norm": 1.525415301322937, "learning_rate": 9.23193894870024e-05, "loss": 0.255, "step": 4065 }, { "epoch": 1.1455108359133126, "grad_norm": 1.745328426361084, "learning_rate": 9.207450965529571e-05, "loss": 0.1585, "step": 4070 }, { "epoch": 1.1469180973824937, "grad_norm": 0.5603808760643005, "learning_rate": 9.18296776408372e-05, "loss": 0.2085, "step": 4075 }, { "epoch": 1.1483253588516746, "grad_norm": 0.24650625884532928, "learning_rate": 9.158489492078381e-05, "loss": 0.2441, "step": 4080 }, { "epoch": 1.1497326203208555, "grad_norm": 1.2769076824188232, "learning_rate": 9.134016297199506e-05, "loss": 0.1923, "step": 4085 }, { "epoch": 1.1511398817900367, "grad_norm": 0.6759532690048218, "learning_rate": 9.109548327102424e-05, "loss": 0.1818, "step": 4090 }, { "epoch": 1.1525471432592176, "grad_norm": 1.7534480094909668, "learning_rate": 9.085085729410928e-05, "loss": 0.2677, "step": 4095 }, { "epoch": 1.1539544047283985, "grad_norm": 1.578730583190918, "learning_rate": 9.060628651716409e-05, "loss": 0.3868, "step": 4100 }, { "epoch": 1.1553616661975794, "grad_norm": 1.5693743228912354, "learning_rate": 9.036177241576949e-05, "loss": 0.4238, "step": 4105 }, { "epoch": 1.1567689276667605, "grad_norm": 0.7190649509429932, "learning_rate": 9.011731646516429e-05, "loss": 0.2943, "step": 4110 }, { "epoch": 1.1581761891359414, "grad_norm": 1.3021358251571655, "learning_rate": 8.987292014023658e-05, "loss": 0.282, "step": 4115 }, { "epoch": 1.1595834506051224, "grad_norm": 0.7299554944038391, "learning_rate": 8.962858491551467e-05, "loss": 0.2086, "step": 4120 }, { "epoch": 1.1609907120743035, "grad_norm": 0.8138667345046997, "learning_rate": 8.938431226515813e-05, "loss": 0.3847, "step": 4125 }, { "epoch": 1.1623979735434844, "grad_norm": 1.6948626041412354, "learning_rate": 8.914010366294917e-05, "loss": 0.2519, "step": 4130 }, { "epoch": 1.1638052350126653, "grad_norm": 0.4518921971321106, "learning_rate": 8.889596058228339e-05, "loss": 0.1481, "step": 4135 }, { "epoch": 1.1652124964818462, "grad_norm": 0.9538673162460327, "learning_rate": 8.865188449616124e-05, "loss": 0.2342, "step": 4140 }, { "epoch": 1.1666197579510273, "grad_norm": 1.5478556156158447, "learning_rate": 8.84078768771789e-05, "loss": 0.2741, "step": 4145 }, { "epoch": 1.1680270194202083, "grad_norm": 0.8891351222991943, "learning_rate": 8.816393919751937e-05, "loss": 0.2279, "step": 4150 }, { "epoch": 1.1694342808893892, "grad_norm": 1.0661555528640747, "learning_rate": 8.792007292894387e-05, "loss": 0.2588, "step": 4155 }, { "epoch": 1.1708415423585703, "grad_norm": 1.0529447793960571, "learning_rate": 8.767627954278267e-05, "loss": 0.3593, "step": 4160 }, { "epoch": 1.1722488038277512, "grad_norm": 1.0678569078445435, "learning_rate": 8.743256050992623e-05, "loss": 0.1596, "step": 4165 }, { "epoch": 1.1736560652969321, "grad_norm": 0.7005488276481628, "learning_rate": 8.71889173008166e-05, "loss": 0.2517, "step": 4170 }, { "epoch": 1.175063326766113, "grad_norm": 0.4683868885040283, "learning_rate": 8.69453513854382e-05, "loss": 0.1622, "step": 4175 }, { "epoch": 1.1764705882352942, "grad_norm": 0.8689951300621033, "learning_rate": 8.67018642333092e-05, "loss": 0.1776, "step": 4180 }, { "epoch": 1.177877849704475, "grad_norm": 0.7526000738143921, "learning_rate": 8.645845731347248e-05, "loss": 0.1588, "step": 4185 }, { "epoch": 1.179285111173656, "grad_norm": 1.2025400400161743, "learning_rate": 8.621513209448701e-05, "loss": 0.197, "step": 4190 }, { "epoch": 1.180692372642837, "grad_norm": 1.2456661462783813, "learning_rate": 8.597189004441863e-05, "loss": 0.2185, "step": 4195 }, { "epoch": 1.182099634112018, "grad_norm": 0.26599639654159546, "learning_rate": 8.572873263083152e-05, "loss": 0.1736, "step": 4200 }, { "epoch": 1.183506895581199, "grad_norm": 0.6946321725845337, "learning_rate": 8.548566132077916e-05, "loss": 0.2439, "step": 4205 }, { "epoch": 1.18491415705038, "grad_norm": 0.8973987102508545, "learning_rate": 8.524267758079557e-05, "loss": 0.2171, "step": 4210 }, { "epoch": 1.186321418519561, "grad_norm": 0.653135359287262, "learning_rate": 8.499978287688648e-05, "loss": 0.1822, "step": 4215 }, { "epoch": 1.1877286799887419, "grad_norm": 1.1294854879379272, "learning_rate": 8.475697867452028e-05, "loss": 0.3998, "step": 4220 }, { "epoch": 1.189135941457923, "grad_norm": 0.7260348200798035, "learning_rate": 8.451426643861946e-05, "loss": 0.3177, "step": 4225 }, { "epoch": 1.190543202927104, "grad_norm": 0.9421544075012207, "learning_rate": 8.427164763355169e-05, "loss": 0.3644, "step": 4230 }, { "epoch": 1.1919504643962848, "grad_norm": 1.8454887866973877, "learning_rate": 8.402912372312076e-05, "loss": 0.2601, "step": 4235 }, { "epoch": 1.1933577258654657, "grad_norm": 0.7556844353675842, "learning_rate": 8.378669617055806e-05, "loss": 0.1539, "step": 4240 }, { "epoch": 1.1947649873346469, "grad_norm": 1.1138182878494263, "learning_rate": 8.354436643851365e-05, "loss": 0.2221, "step": 4245 }, { "epoch": 1.1961722488038278, "grad_norm": 1.7039527893066406, "learning_rate": 8.330213598904726e-05, "loss": 0.3543, "step": 4250 }, { "epoch": 1.1975795102730087, "grad_norm": 1.6566787958145142, "learning_rate": 8.306000628361972e-05, "loss": 0.1975, "step": 4255 }, { "epoch": 1.1989867717421898, "grad_norm": 1.0765029191970825, "learning_rate": 8.281797878308406e-05, "loss": 0.1358, "step": 4260 }, { "epoch": 1.2003940332113707, "grad_norm": 0.7748456001281738, "learning_rate": 8.257605494767654e-05, "loss": 0.1821, "step": 4265 }, { "epoch": 1.2018012946805516, "grad_norm": 0.32174113392829895, "learning_rate": 8.233423623700816e-05, "loss": 0.1391, "step": 4270 }, { "epoch": 1.2032085561497325, "grad_norm": 0.5359024405479431, "learning_rate": 8.209252411005548e-05, "loss": 0.1476, "step": 4275 }, { "epoch": 1.2046158176189137, "grad_norm": 0.9815373420715332, "learning_rate": 8.185092002515209e-05, "loss": 0.3173, "step": 4280 }, { "epoch": 1.2060230790880946, "grad_norm": 0.6186626553535461, "learning_rate": 8.16094254399798e-05, "loss": 0.3268, "step": 4285 }, { "epoch": 1.2074303405572755, "grad_norm": 1.598221778869629, "learning_rate": 8.136804181155961e-05, "loss": 0.2788, "step": 4290 }, { "epoch": 1.2088376020264566, "grad_norm": 0.409020334482193, "learning_rate": 8.112677059624316e-05, "loss": 0.2455, "step": 4295 }, { "epoch": 1.2102448634956375, "grad_norm": 1.0623451471328735, "learning_rate": 8.088561324970396e-05, "loss": 0.2883, "step": 4300 }, { "epoch": 1.2116521249648184, "grad_norm": 0.9107158780097961, "learning_rate": 8.064457122692828e-05, "loss": 0.191, "step": 4305 }, { "epoch": 1.2130593864339994, "grad_norm": 1.021278738975525, "learning_rate": 8.040364598220682e-05, "loss": 0.2287, "step": 4310 }, { "epoch": 1.2144666479031805, "grad_norm": 1.0348402261734009, "learning_rate": 8.016283896912563e-05, "loss": 0.1455, "step": 4315 }, { "epoch": 1.2158739093723614, "grad_norm": 1.06684410572052, "learning_rate": 7.992215164055737e-05, "loss": 0.1786, "step": 4320 }, { "epoch": 1.2172811708415423, "grad_norm": 0.45586028695106506, "learning_rate": 7.968158544865272e-05, "loss": 0.2625, "step": 4325 }, { "epoch": 1.2186884323107234, "grad_norm": 1.0333331823349, "learning_rate": 7.944114184483144e-05, "loss": 0.1766, "step": 4330 }, { "epoch": 1.2200956937799043, "grad_norm": 1.477582335472107, "learning_rate": 7.920082227977361e-05, "loss": 0.2547, "step": 4335 }, { "epoch": 1.2215029552490853, "grad_norm": 0.732683539390564, "learning_rate": 7.89606282034111e-05, "loss": 0.1894, "step": 4340 }, { "epoch": 1.2229102167182662, "grad_norm": 1.199336290359497, "learning_rate": 7.872056106491846e-05, "loss": 0.3359, "step": 4345 }, { "epoch": 1.2243174781874473, "grad_norm": 2.6119384765625, "learning_rate": 7.848062231270458e-05, "loss": 0.3301, "step": 4350 }, { "epoch": 1.2257247396566282, "grad_norm": 1.0260940790176392, "learning_rate": 7.824081339440364e-05, "loss": 0.1735, "step": 4355 }, { "epoch": 1.2271320011258091, "grad_norm": 0.7368533611297607, "learning_rate": 7.800113575686643e-05, "loss": 0.1741, "step": 4360 }, { "epoch": 1.2285392625949902, "grad_norm": 0.8837445378303528, "learning_rate": 7.776159084615183e-05, "loss": 0.2789, "step": 4365 }, { "epoch": 1.2299465240641712, "grad_norm": 1.0234431028366089, "learning_rate": 7.752218010751786e-05, "loss": 0.1811, "step": 4370 }, { "epoch": 1.231353785533352, "grad_norm": 1.1849218606948853, "learning_rate": 7.728290498541297e-05, "loss": 0.2951, "step": 4375 }, { "epoch": 1.232761047002533, "grad_norm": 1.1420046091079712, "learning_rate": 7.704376692346748e-05, "loss": 0.2964, "step": 4380 }, { "epoch": 1.234168308471714, "grad_norm": 0.44826436042785645, "learning_rate": 7.680476736448477e-05, "loss": 0.165, "step": 4385 }, { "epoch": 1.235575569940895, "grad_norm": 0.6397153735160828, "learning_rate": 7.656590775043249e-05, "loss": 0.138, "step": 4390 }, { "epoch": 1.236982831410076, "grad_norm": 1.1096476316452026, "learning_rate": 7.632718952243404e-05, "loss": 0.2673, "step": 4395 }, { "epoch": 1.238390092879257, "grad_norm": 0.7769279479980469, "learning_rate": 7.608861412075987e-05, "loss": 0.1631, "step": 4400 }, { "epoch": 1.239797354348438, "grad_norm": 0.8061667084693909, "learning_rate": 7.585018298481849e-05, "loss": 0.1851, "step": 4405 }, { "epoch": 1.2412046158176189, "grad_norm": 1.618454098701477, "learning_rate": 7.561189755314817e-05, "loss": 0.2377, "step": 4410 }, { "epoch": 1.2426118772867998, "grad_norm": 1.1752551794052124, "learning_rate": 7.537375926340802e-05, "loss": 0.1806, "step": 4415 }, { "epoch": 1.244019138755981, "grad_norm": 0.29463231563568115, "learning_rate": 7.513576955236944e-05, "loss": 0.1611, "step": 4420 }, { "epoch": 1.2454264002251618, "grad_norm": 0.7407804131507874, "learning_rate": 7.489792985590743e-05, "loss": 0.3176, "step": 4425 }, { "epoch": 1.2468336616943427, "grad_norm": 0.8456223011016846, "learning_rate": 7.466024160899173e-05, "loss": 0.2742, "step": 4430 }, { "epoch": 1.2482409231635239, "grad_norm": 1.3502225875854492, "learning_rate": 7.442270624567856e-05, "loss": 0.2477, "step": 4435 }, { "epoch": 1.2496481846327048, "grad_norm": 1.0241039991378784, "learning_rate": 7.418532519910162e-05, "loss": 0.2415, "step": 4440 }, { "epoch": 1.2510554461018857, "grad_norm": 0.570637047290802, "learning_rate": 7.394809990146356e-05, "loss": 0.2094, "step": 4445 }, { "epoch": 1.2524627075710666, "grad_norm": 0.4012211859226227, "learning_rate": 7.371103178402731e-05, "loss": 0.2591, "step": 4450 }, { "epoch": 1.2538699690402477, "grad_norm": 1.1546359062194824, "learning_rate": 7.347412227710766e-05, "loss": 0.2837, "step": 4455 }, { "epoch": 1.2552772305094286, "grad_norm": 0.8672778606414795, "learning_rate": 7.32373728100622e-05, "loss": 0.298, "step": 4460 }, { "epoch": 1.2566844919786098, "grad_norm": 0.4911658465862274, "learning_rate": 7.300078481128306e-05, "loss": 0.1921, "step": 4465 }, { "epoch": 1.2580917534477907, "grad_norm": 1.1717147827148438, "learning_rate": 7.276435970818824e-05, "loss": 0.1687, "step": 4470 }, { "epoch": 1.2594990149169716, "grad_norm": 0.5286734104156494, "learning_rate": 7.252809892721282e-05, "loss": 0.2104, "step": 4475 }, { "epoch": 1.2609062763861525, "grad_norm": 2.43472957611084, "learning_rate": 7.229200389380056e-05, "loss": 0.2763, "step": 4480 }, { "epoch": 1.2623135378553334, "grad_norm": 0.9692918062210083, "learning_rate": 7.205607603239508e-05, "loss": 0.1913, "step": 4485 }, { "epoch": 1.2637207993245145, "grad_norm": 0.8969650268554688, "learning_rate": 7.182031676643153e-05, "loss": 0.4249, "step": 4490 }, { "epoch": 1.2651280607936954, "grad_norm": 0.7135694026947021, "learning_rate": 7.158472751832783e-05, "loss": 0.1957, "step": 4495 }, { "epoch": 1.2665353222628766, "grad_norm": 2.911539077758789, "learning_rate": 7.134930970947607e-05, "loss": 0.3644, "step": 4500 }, { "epoch": 1.2679425837320575, "grad_norm": 1.8338284492492676, "learning_rate": 7.111406476023398e-05, "loss": 0.2941, "step": 4505 }, { "epoch": 1.2693498452012384, "grad_norm": 0.736365020275116, "learning_rate": 7.087899408991651e-05, "loss": 0.2541, "step": 4510 }, { "epoch": 1.2707571066704193, "grad_norm": 1.269327163696289, "learning_rate": 7.06440991167869e-05, "loss": 0.2847, "step": 4515 }, { "epoch": 1.2721643681396002, "grad_norm": 0.6774185299873352, "learning_rate": 7.040938125804858e-05, "loss": 0.2047, "step": 4520 }, { "epoch": 1.2735716296087813, "grad_norm": 1.0028345584869385, "learning_rate": 7.017484192983623e-05, "loss": 0.2327, "step": 4525 }, { "epoch": 1.2749788910779623, "grad_norm": 0.9345621466636658, "learning_rate": 6.99404825472074e-05, "loss": 0.2574, "step": 4530 }, { "epoch": 1.2763861525471434, "grad_norm": 1.2837140560150146, "learning_rate": 6.970630452413407e-05, "loss": 0.298, "step": 4535 }, { "epoch": 1.2777934140163243, "grad_norm": 0.5337740182876587, "learning_rate": 6.947230927349396e-05, "loss": 0.1538, "step": 4540 }, { "epoch": 1.2792006754855052, "grad_norm": 0.5805062651634216, "learning_rate": 6.923849820706194e-05, "loss": 0.1483, "step": 4545 }, { "epoch": 1.280607936954686, "grad_norm": 0.8201838135719299, "learning_rate": 6.900487273550187e-05, "loss": 0.163, "step": 4550 }, { "epoch": 1.282015198423867, "grad_norm": 0.5184070467948914, "learning_rate": 6.877143426835764e-05, "loss": 0.2611, "step": 4555 }, { "epoch": 1.2834224598930482, "grad_norm": 1.0877232551574707, "learning_rate": 6.853818421404496e-05, "loss": 0.3085, "step": 4560 }, { "epoch": 1.284829721362229, "grad_norm": 1.616977572441101, "learning_rate": 6.830512397984288e-05, "loss": 0.3108, "step": 4565 }, { "epoch": 1.2862369828314102, "grad_norm": 0.6340872049331665, "learning_rate": 6.807225497188496e-05, "loss": 0.177, "step": 4570 }, { "epoch": 1.287644244300591, "grad_norm": 0.8518214821815491, "learning_rate": 6.783957859515127e-05, "loss": 0.1805, "step": 4575 }, { "epoch": 1.289051505769772, "grad_norm": 1.280093789100647, "learning_rate": 6.760709625345953e-05, "loss": 0.2854, "step": 4580 }, { "epoch": 1.290458767238953, "grad_norm": 0.7486845850944519, "learning_rate": 6.737480934945677e-05, "loss": 0.1399, "step": 4585 }, { "epoch": 1.291866028708134, "grad_norm": 1.3590744733810425, "learning_rate": 6.714271928461097e-05, "loss": 0.1735, "step": 4590 }, { "epoch": 1.293273290177315, "grad_norm": 0.6231881380081177, "learning_rate": 6.691082745920247e-05, "loss": 0.2083, "step": 4595 }, { "epoch": 1.2946805516464959, "grad_norm": 1.0750889778137207, "learning_rate": 6.667913527231549e-05, "loss": 0.2304, "step": 4600 }, { "epoch": 1.296087813115677, "grad_norm": 1.3983303308486938, "learning_rate": 6.644764412182986e-05, "loss": 0.3285, "step": 4605 }, { "epoch": 1.297495074584858, "grad_norm": 0.5835619568824768, "learning_rate": 6.621635540441249e-05, "loss": 0.2651, "step": 4610 }, { "epoch": 1.2989023360540388, "grad_norm": 0.7869633436203003, "learning_rate": 6.598527051550882e-05, "loss": 0.2144, "step": 4615 }, { "epoch": 1.3003095975232197, "grad_norm": 0.4034360945224762, "learning_rate": 6.575439084933468e-05, "loss": 0.1919, "step": 4620 }, { "epoch": 1.3017168589924009, "grad_norm": 1.0225868225097656, "learning_rate": 6.552371779886756e-05, "loss": 0.2942, "step": 4625 }, { "epoch": 1.3031241204615818, "grad_norm": 1.8515701293945312, "learning_rate": 6.52932527558385e-05, "loss": 0.2579, "step": 4630 }, { "epoch": 1.3045313819307627, "grad_norm": 1.13215172290802, "learning_rate": 6.506299711072353e-05, "loss": 0.189, "step": 4635 }, { "epoch": 1.3059386433999438, "grad_norm": 1.1587252616882324, "learning_rate": 6.483295225273521e-05, "loss": 0.2055, "step": 4640 }, { "epoch": 1.3073459048691247, "grad_norm": 1.6920759677886963, "learning_rate": 6.460311956981444e-05, "loss": 0.3108, "step": 4645 }, { "epoch": 1.3087531663383056, "grad_norm": 0.5736072659492493, "learning_rate": 6.437350044862207e-05, "loss": 0.2675, "step": 4650 }, { "epoch": 1.3101604278074865, "grad_norm": 0.9719104170799255, "learning_rate": 6.414409627453025e-05, "loss": 0.1933, "step": 4655 }, { "epoch": 1.3115676892766677, "grad_norm": 0.8271322250366211, "learning_rate": 6.391490843161442e-05, "loss": 0.0908, "step": 4660 }, { "epoch": 1.3129749507458486, "grad_norm": 1.2622920274734497, "learning_rate": 6.368593830264485e-05, "loss": 0.1837, "step": 4665 }, { "epoch": 1.3143822122150295, "grad_norm": 1.0141448974609375, "learning_rate": 6.345718726907815e-05, "loss": 0.1396, "step": 4670 }, { "epoch": 1.3157894736842106, "grad_norm": 0.5923504829406738, "learning_rate": 6.322865671104909e-05, "loss": 0.1631, "step": 4675 }, { "epoch": 1.3171967351533915, "grad_norm": 1.8866256475448608, "learning_rate": 6.300034800736233e-05, "loss": 0.1407, "step": 4680 }, { "epoch": 1.3186039966225724, "grad_norm": 0.8495520353317261, "learning_rate": 6.277226253548385e-05, "loss": 0.2345, "step": 4685 }, { "epoch": 1.3200112580917533, "grad_norm": 0.8851481080055237, "learning_rate": 6.254440167153295e-05, "loss": 0.2431, "step": 4690 }, { "epoch": 1.3214185195609345, "grad_norm": 0.5228270292282104, "learning_rate": 6.231676679027364e-05, "loss": 0.1606, "step": 4695 }, { "epoch": 1.3228257810301154, "grad_norm": 1.2752258777618408, "learning_rate": 6.208935926510659e-05, "loss": 0.2588, "step": 4700 }, { "epoch": 1.3242330424992963, "grad_norm": 1.6664029359817505, "learning_rate": 6.186218046806078e-05, "loss": 0.2418, "step": 4705 }, { "epoch": 1.3256403039684774, "grad_norm": 0.7116133570671082, "learning_rate": 6.16352317697851e-05, "loss": 0.1839, "step": 4710 }, { "epoch": 1.3270475654376583, "grad_norm": 1.6506725549697876, "learning_rate": 6.140851453954021e-05, "loss": 0.2076, "step": 4715 }, { "epoch": 1.3284548269068392, "grad_norm": 1.0681225061416626, "learning_rate": 6.118203014519034e-05, "loss": 0.2491, "step": 4720 }, { "epoch": 1.3298620883760202, "grad_norm": 0.969599723815918, "learning_rate": 6.095577995319476e-05, "loss": 0.273, "step": 4725 }, { "epoch": 1.3312693498452013, "grad_norm": 1.4593223333358765, "learning_rate": 6.072976532859982e-05, "loss": 0.358, "step": 4730 }, { "epoch": 1.3326766113143822, "grad_norm": 0.29552891850471497, "learning_rate": 6.0503987635030656e-05, "loss": 0.2655, "step": 4735 }, { "epoch": 1.334083872783563, "grad_norm": 2.189373731613159, "learning_rate": 6.0278448234682784e-05, "loss": 0.2624, "step": 4740 }, { "epoch": 1.3354911342527442, "grad_norm": 0.28230440616607666, "learning_rate": 6.005314848831415e-05, "loss": 0.1886, "step": 4745 }, { "epoch": 1.3368983957219251, "grad_norm": 0.5569413304328918, "learning_rate": 5.9828089755236714e-05, "loss": 0.231, "step": 4750 }, { "epoch": 1.338305657191106, "grad_norm": 0.8192738890647888, "learning_rate": 5.960327339330828e-05, "loss": 0.23, "step": 4755 }, { "epoch": 1.339712918660287, "grad_norm": 1.0859158039093018, "learning_rate": 5.9378700758924466e-05, "loss": 0.3275, "step": 4760 }, { "epoch": 1.341120180129468, "grad_norm": 0.8077869415283203, "learning_rate": 5.915437320701025e-05, "loss": 0.0847, "step": 4765 }, { "epoch": 1.342527441598649, "grad_norm": 1.8826837539672852, "learning_rate": 5.8930292091012015e-05, "loss": 0.2158, "step": 4770 }, { "epoch": 1.3439347030678301, "grad_norm": 0.6470653414726257, "learning_rate": 5.870645876288938e-05, "loss": 0.3325, "step": 4775 }, { "epoch": 1.345341964537011, "grad_norm": 0.7090429067611694, "learning_rate": 5.848287457310681e-05, "loss": 0.2083, "step": 4780 }, { "epoch": 1.346749226006192, "grad_norm": 0.1886598914861679, "learning_rate": 5.825954087062579e-05, "loss": 0.2118, "step": 4785 }, { "epoch": 1.3481564874753729, "grad_norm": 0.5092473030090332, "learning_rate": 5.8036459002896473e-05, "loss": 0.253, "step": 4790 }, { "epoch": 1.3495637489445538, "grad_norm": 0.9652419686317444, "learning_rate": 5.78136303158495e-05, "loss": 0.1499, "step": 4795 }, { "epoch": 1.350971010413735, "grad_norm": 0.6111290454864502, "learning_rate": 5.759105615388814e-05, "loss": 0.1805, "step": 4800 }, { "epoch": 1.3523782718829158, "grad_norm": 2.2469632625579834, "learning_rate": 5.736873785987997e-05, "loss": 0.3536, "step": 4805 }, { "epoch": 1.353785533352097, "grad_norm": 0.9734948873519897, "learning_rate": 5.714667677514882e-05, "loss": 0.2784, "step": 4810 }, { "epoch": 1.3551927948212779, "grad_norm": 1.076882243156433, "learning_rate": 5.692487423946662e-05, "loss": 0.1953, "step": 4815 }, { "epoch": 1.3566000562904588, "grad_norm": 0.7746699452400208, "learning_rate": 5.6703331591045524e-05, "loss": 0.2175, "step": 4820 }, { "epoch": 1.3580073177596397, "grad_norm": 0.7650654315948486, "learning_rate": 5.6482050166529546e-05, "loss": 0.1676, "step": 4825 }, { "epoch": 1.3594145792288206, "grad_norm": 0.6610764861106873, "learning_rate": 5.62610313009868e-05, "loss": 0.1721, "step": 4830 }, { "epoch": 1.3608218406980017, "grad_norm": 0.8137916326522827, "learning_rate": 5.604027632790112e-05, "loss": 0.1374, "step": 4835 }, { "epoch": 1.3622291021671826, "grad_norm": 0.6320801377296448, "learning_rate": 5.581978657916431e-05, "loss": 0.209, "step": 4840 }, { "epoch": 1.3636363636363638, "grad_norm": 1.4471935033798218, "learning_rate": 5.5599563385067996e-05, "loss": 0.1163, "step": 4845 }, { "epoch": 1.3650436251055447, "grad_norm": 0.9794873595237732, "learning_rate": 5.537960807429547e-05, "loss": 0.2077, "step": 4850 }, { "epoch": 1.3664508865747256, "grad_norm": 1.3119271993637085, "learning_rate": 5.5159921973913866e-05, "loss": 0.2667, "step": 4855 }, { "epoch": 1.3678581480439065, "grad_norm": 1.156152367591858, "learning_rate": 5.49405064093661e-05, "loss": 0.1734, "step": 4860 }, { "epoch": 1.3692654095130874, "grad_norm": 0.06259223818778992, "learning_rate": 5.472136270446275e-05, "loss": 0.2067, "step": 4865 }, { "epoch": 1.3706726709822685, "grad_norm": 0.6296875476837158, "learning_rate": 5.4502492181374284e-05, "loss": 0.229, "step": 4870 }, { "epoch": 1.3720799324514494, "grad_norm": 1.3139517307281494, "learning_rate": 5.428389616062298e-05, "loss": 0.286, "step": 4875 }, { "epoch": 1.3734871939206306, "grad_norm": 0.5777654051780701, "learning_rate": 5.40655759610748e-05, "loss": 0.2024, "step": 4880 }, { "epoch": 1.3748944553898115, "grad_norm": 0.5422516465187073, "learning_rate": 5.384753289993173e-05, "loss": 0.2453, "step": 4885 }, { "epoch": 1.3763017168589924, "grad_norm": 1.2088871002197266, "learning_rate": 5.3629768292723614e-05, "loss": 0.1644, "step": 4890 }, { "epoch": 1.3777089783281733, "grad_norm": 0.6206454634666443, "learning_rate": 5.341228345330025e-05, "loss": 0.3293, "step": 4895 }, { "epoch": 1.3791162397973544, "grad_norm": 1.0353143215179443, "learning_rate": 5.3195079693823624e-05, "loss": 0.2197, "step": 4900 }, { "epoch": 1.3805235012665353, "grad_norm": 1.076452612876892, "learning_rate": 5.297815832475971e-05, "loss": 0.1435, "step": 4905 }, { "epoch": 1.3819307627357162, "grad_norm": 0.7797285914421082, "learning_rate": 5.2761520654870846e-05, "loss": 0.1499, "step": 4910 }, { "epoch": 1.3833380242048974, "grad_norm": 3.2293171882629395, "learning_rate": 5.25451679912077e-05, "loss": 0.4037, "step": 4915 }, { "epoch": 1.3847452856740783, "grad_norm": 0.7513951659202576, "learning_rate": 5.232910163910132e-05, "loss": 0.136, "step": 4920 }, { "epoch": 1.3861525471432592, "grad_norm": 0.43260759115219116, "learning_rate": 5.211332290215543e-05, "loss": 0.2419, "step": 4925 }, { "epoch": 1.38755980861244, "grad_norm": 0.7441173791885376, "learning_rate": 5.189783308223841e-05, "loss": 0.1678, "step": 4930 }, { "epoch": 1.3889670700816212, "grad_norm": 0.4429182708263397, "learning_rate": 5.1682633479475484e-05, "loss": 0.1767, "step": 4935 }, { "epoch": 1.3903743315508021, "grad_norm": 1.6440355777740479, "learning_rate": 5.146772539224094e-05, "loss": 0.2831, "step": 4940 }, { "epoch": 1.391781593019983, "grad_norm": 1.1421854496002197, "learning_rate": 5.1253110117150314e-05, "loss": 0.157, "step": 4945 }, { "epoch": 1.3931888544891642, "grad_norm": 1.013460397720337, "learning_rate": 5.1038788949052344e-05, "loss": 0.3537, "step": 4950 }, { "epoch": 1.394596115958345, "grad_norm": 1.2984402179718018, "learning_rate": 5.082476318102144e-05, "loss": 0.2869, "step": 4955 }, { "epoch": 1.396003377427526, "grad_norm": 0.8296849727630615, "learning_rate": 5.061103410434978e-05, "loss": 0.2029, "step": 4960 }, { "epoch": 1.397410638896707, "grad_norm": 1.1972373723983765, "learning_rate": 5.0397603008539374e-05, "loss": 0.182, "step": 4965 }, { "epoch": 1.398817900365888, "grad_norm": 1.5300724506378174, "learning_rate": 5.0184471181294515e-05, "loss": 0.1537, "step": 4970 }, { "epoch": 1.400225161835069, "grad_norm": 0.9540086984634399, "learning_rate": 4.997163990851381e-05, "loss": 0.1679, "step": 4975 }, { "epoch": 1.4016324233042499, "grad_norm": 0.15063901245594025, "learning_rate": 4.975911047428263e-05, "loss": 0.1512, "step": 4980 }, { "epoch": 1.403039684773431, "grad_norm": 1.925596833229065, "learning_rate": 4.954688416086524e-05, "loss": 0.2077, "step": 4985 }, { "epoch": 1.404446946242612, "grad_norm": 1.4239457845687866, "learning_rate": 4.9334962248696934e-05, "loss": 0.2464, "step": 4990 }, { "epoch": 1.4058542077117928, "grad_norm": 0.3618084490299225, "learning_rate": 4.912334601637658e-05, "loss": 0.1579, "step": 4995 }, { "epoch": 1.4072614691809737, "grad_norm": 0.8101370334625244, "learning_rate": 4.8912036740658776e-05, "loss": 0.2682, "step": 5000 }, { "epoch": 1.4086687306501549, "grad_norm": 0.7149579524993896, "learning_rate": 4.8701035696446064e-05, "loss": 0.3497, "step": 5005 }, { "epoch": 1.4100759921193358, "grad_norm": 1.0598907470703125, "learning_rate": 4.849034415678131e-05, "loss": 0.2342, "step": 5010 }, { "epoch": 1.4114832535885167, "grad_norm": 1.2105034589767456, "learning_rate": 4.8279963392840156e-05, "loss": 0.2693, "step": 5015 }, { "epoch": 1.4128905150576978, "grad_norm": 0.6534488201141357, "learning_rate": 4.8069894673923064e-05, "loss": 0.2475, "step": 5020 }, { "epoch": 1.4142977765268787, "grad_norm": 1.4907587766647339, "learning_rate": 4.7860139267447956e-05, "loss": 0.2958, "step": 5025 }, { "epoch": 1.4157050379960596, "grad_norm": 1.1340523958206177, "learning_rate": 4.765069843894239e-05, "loss": 0.1087, "step": 5030 }, { "epoch": 1.4171122994652405, "grad_norm": 0.6139047145843506, "learning_rate": 4.744157345203588e-05, "loss": 0.1827, "step": 5035 }, { "epoch": 1.4185195609344217, "grad_norm": 1.5109590291976929, "learning_rate": 4.723276556845252e-05, "loss": 0.1851, "step": 5040 }, { "epoch": 1.4199268224036026, "grad_norm": 0.593103289604187, "learning_rate": 4.702427604800307e-05, "loss": 0.2019, "step": 5045 }, { "epoch": 1.4213340838727835, "grad_norm": 1.3064155578613281, "learning_rate": 4.681610614857749e-05, "loss": 0.1086, "step": 5050 }, { "epoch": 1.4227413453419646, "grad_norm": 1.4465229511260986, "learning_rate": 4.66082571261375e-05, "loss": 0.099, "step": 5055 }, { "epoch": 1.4241486068111455, "grad_norm": 1.0164941549301147, "learning_rate": 4.6400730234708676e-05, "loss": 0.2006, "step": 5060 }, { "epoch": 1.4255558682803264, "grad_norm": 1.600894808769226, "learning_rate": 4.61935267263732e-05, "loss": 0.2938, "step": 5065 }, { "epoch": 1.4269631297495073, "grad_norm": 0.8022120594978333, "learning_rate": 4.598664785126217e-05, "loss": 0.2981, "step": 5070 }, { "epoch": 1.4283703912186885, "grad_norm": 0.6564612984657288, "learning_rate": 4.578009485754791e-05, "loss": 0.1266, "step": 5075 }, { "epoch": 1.4297776526878694, "grad_norm": 0.7073236107826233, "learning_rate": 4.557386899143678e-05, "loss": 0.2229, "step": 5080 }, { "epoch": 1.4311849141570505, "grad_norm": 0.9632103443145752, "learning_rate": 4.536797149716133e-05, "loss": 0.1511, "step": 5085 }, { "epoch": 1.4325921756262314, "grad_norm": 1.1304622888565063, "learning_rate": 4.5162403616972945e-05, "loss": 0.2341, "step": 5090 }, { "epoch": 1.4339994370954123, "grad_norm": 1.135055422782898, "learning_rate": 4.4957166591134405e-05, "loss": 0.3898, "step": 5095 }, { "epoch": 1.4354066985645932, "grad_norm": 0.6786003112792969, "learning_rate": 4.475226165791231e-05, "loss": 0.2129, "step": 5100 }, { "epoch": 1.4368139600337742, "grad_norm": 1.3296654224395752, "learning_rate": 4.454769005356955e-05, "loss": 0.3128, "step": 5105 }, { "epoch": 1.4382212215029553, "grad_norm": 0.7507737278938293, "learning_rate": 4.434345301235802e-05, "loss": 0.1069, "step": 5110 }, { "epoch": 1.4396284829721362, "grad_norm": 1.4222168922424316, "learning_rate": 4.4139551766511e-05, "loss": 0.1529, "step": 5115 }, { "epoch": 1.4410357444413173, "grad_norm": 0.21092858910560608, "learning_rate": 4.39359875462359e-05, "loss": 0.2159, "step": 5120 }, { "epoch": 1.4424430059104982, "grad_norm": 1.0862993001937866, "learning_rate": 4.373276157970665e-05, "loss": 0.1262, "step": 5125 }, { "epoch": 1.4438502673796791, "grad_norm": 1.6479579210281372, "learning_rate": 4.352987509305635e-05, "loss": 0.2165, "step": 5130 }, { "epoch": 1.44525752884886, "grad_norm": 0.11600520461797714, "learning_rate": 4.3327329310370016e-05, "loss": 0.1696, "step": 5135 }, { "epoch": 1.446664790318041, "grad_norm": 0.9424710869789124, "learning_rate": 4.312512545367702e-05, "loss": 0.3328, "step": 5140 }, { "epoch": 1.448072051787222, "grad_norm": 0.6428975462913513, "learning_rate": 4.292326474294372e-05, "loss": 0.1069, "step": 5145 }, { "epoch": 1.449479313256403, "grad_norm": 0.8455730676651001, "learning_rate": 4.272174839606628e-05, "loss": 0.3006, "step": 5150 }, { "epoch": 1.4508865747255841, "grad_norm": 0.6467002034187317, "learning_rate": 4.252057762886305e-05, "loss": 0.1345, "step": 5155 }, { "epoch": 1.452293836194765, "grad_norm": 0.7402626276016235, "learning_rate": 4.2319753655067505e-05, "loss": 0.1928, "step": 5160 }, { "epoch": 1.453701097663946, "grad_norm": 1.142514705657959, "learning_rate": 4.211927768632068e-05, "loss": 0.3225, "step": 5165 }, { "epoch": 1.4551083591331269, "grad_norm": 0.9843090772628784, "learning_rate": 4.191915093216411e-05, "loss": 0.1223, "step": 5170 }, { "epoch": 1.4565156206023078, "grad_norm": 0.9305518865585327, "learning_rate": 4.171937460003223e-05, "loss": 0.1518, "step": 5175 }, { "epoch": 1.457922882071489, "grad_norm": 0.9245863556861877, "learning_rate": 4.1519949895245435e-05, "loss": 0.161, "step": 5180 }, { "epoch": 1.4593301435406698, "grad_norm": 0.5494176149368286, "learning_rate": 4.1320878021002466e-05, "loss": 0.1645, "step": 5185 }, { "epoch": 1.460737405009851, "grad_norm": 0.454455703496933, "learning_rate": 4.112216017837346e-05, "loss": 0.1784, "step": 5190 }, { "epoch": 1.4621446664790319, "grad_norm": 0.8797675967216492, "learning_rate": 4.092379756629244e-05, "loss": 0.1915, "step": 5195 }, { "epoch": 1.4635519279482128, "grad_norm": 0.5059092044830322, "learning_rate": 4.072579138155024e-05, "loss": 0.1533, "step": 5200 }, { "epoch": 1.4649591894173937, "grad_norm": 1.5164445638656616, "learning_rate": 4.052814281878725e-05, "loss": 0.3054, "step": 5205 }, { "epoch": 1.4663664508865748, "grad_norm": 0.8489431738853455, "learning_rate": 4.033085307048626e-05, "loss": 0.1573, "step": 5210 }, { "epoch": 1.4677737123557557, "grad_norm": 0.8418503999710083, "learning_rate": 4.0133923326965073e-05, "loss": 0.2269, "step": 5215 }, { "epoch": 1.4691809738249366, "grad_norm": 0.4309021830558777, "learning_rate": 3.9937354776369565e-05, "loss": 0.1621, "step": 5220 }, { "epoch": 1.4705882352941178, "grad_norm": 1.8004333972930908, "learning_rate": 3.974114860466641e-05, "loss": 0.1821, "step": 5225 }, { "epoch": 1.4719954967632987, "grad_norm": 0.5034974217414856, "learning_rate": 3.954530599563586e-05, "loss": 0.1586, "step": 5230 }, { "epoch": 1.4734027582324796, "grad_norm": 1.8636256456375122, "learning_rate": 3.934982813086466e-05, "loss": 0.1778, "step": 5235 }, { "epoch": 1.4748100197016605, "grad_norm": 0.7782198190689087, "learning_rate": 3.915471618973905e-05, "loss": 0.2362, "step": 5240 }, { "epoch": 1.4762172811708416, "grad_norm": 0.5170087218284607, "learning_rate": 3.895997134943735e-05, "loss": 0.1389, "step": 5245 }, { "epoch": 1.4776245426400225, "grad_norm": 0.6563436388969421, "learning_rate": 3.876559478492319e-05, "loss": 0.1972, "step": 5250 }, { "epoch": 1.4790318041092034, "grad_norm": 0.6524726748466492, "learning_rate": 3.857158766893814e-05, "loss": 0.2123, "step": 5255 }, { "epoch": 1.4804390655783846, "grad_norm": 0.8341132402420044, "learning_rate": 3.837795117199483e-05, "loss": 0.2374, "step": 5260 }, { "epoch": 1.4818463270475655, "grad_norm": 0.37632039189338684, "learning_rate": 3.818468646236984e-05, "loss": 0.114, "step": 5265 }, { "epoch": 1.4832535885167464, "grad_norm": 2.116046190261841, "learning_rate": 3.799179470609656e-05, "loss": 0.3048, "step": 5270 }, { "epoch": 1.4846608499859273, "grad_norm": 2.3138134479522705, "learning_rate": 3.7799277066958205e-05, "loss": 0.1414, "step": 5275 }, { "epoch": 1.4860681114551084, "grad_norm": 1.4033293724060059, "learning_rate": 3.760713470648093e-05, "loss": 0.1972, "step": 5280 }, { "epoch": 1.4874753729242893, "grad_norm": 0.9336678981781006, "learning_rate": 3.741536878392654e-05, "loss": 0.1519, "step": 5285 }, { "epoch": 1.4888826343934702, "grad_norm": 1.4050379991531372, "learning_rate": 3.7223980456285813e-05, "loss": 0.1493, "step": 5290 }, { "epoch": 1.4902898958626514, "grad_norm": 0.4991312623023987, "learning_rate": 3.70329708782713e-05, "loss": 0.157, "step": 5295 }, { "epoch": 1.4916971573318323, "grad_norm": 1.6823819875717163, "learning_rate": 3.6842341202310374e-05, "loss": 0.2532, "step": 5300 }, { "epoch": 1.4931044188010132, "grad_norm": 0.81031733751297, "learning_rate": 3.665209257853843e-05, "loss": 0.3201, "step": 5305 }, { "epoch": 1.494511680270194, "grad_norm": 1.287041425704956, "learning_rate": 3.646222615479177e-05, "loss": 0.1398, "step": 5310 }, { "epoch": 1.4959189417393752, "grad_norm": 0.4528125822544098, "learning_rate": 3.62727430766007e-05, "loss": 0.2131, "step": 5315 }, { "epoch": 1.4973262032085561, "grad_norm": 1.0578283071517944, "learning_rate": 3.608364448718283e-05, "loss": 0.1415, "step": 5320 }, { "epoch": 1.498733464677737, "grad_norm": 0.4122551679611206, "learning_rate": 3.589493152743585e-05, "loss": 0.0914, "step": 5325 }, { "epoch": 1.5001407261469182, "grad_norm": 0.6634222269058228, "learning_rate": 3.570660533593091e-05, "loss": 0.1269, "step": 5330 }, { "epoch": 1.501547987616099, "grad_norm": 0.27888017892837524, "learning_rate": 3.551866704890564e-05, "loss": 0.1288, "step": 5335 }, { "epoch": 1.50295524908528, "grad_norm": 1.0966591835021973, "learning_rate": 3.533111780025725e-05, "loss": 0.1822, "step": 5340 }, { "epoch": 1.504362510554461, "grad_norm": 1.1912025213241577, "learning_rate": 3.514395872153584e-05, "loss": 0.2205, "step": 5345 }, { "epoch": 1.505769772023642, "grad_norm": 0.34254777431488037, "learning_rate": 3.49571909419374e-05, "loss": 0.1333, "step": 5350 }, { "epoch": 1.507177033492823, "grad_norm": 0.7154930233955383, "learning_rate": 3.4770815588297054e-05, "loss": 0.1758, "step": 5355 }, { "epoch": 1.508584294962004, "grad_norm": 0.7776800394058228, "learning_rate": 3.4584833785082385e-05, "loss": 0.1721, "step": 5360 }, { "epoch": 1.509991556431185, "grad_norm": 1.0347821712493896, "learning_rate": 3.43992466543865e-05, "loss": 0.1735, "step": 5365 }, { "epoch": 1.511398817900366, "grad_norm": 0.773311972618103, "learning_rate": 3.4214055315921245e-05, "loss": 0.1798, "step": 5370 }, { "epoch": 1.5128060793695468, "grad_norm": 0.15166114270687103, "learning_rate": 3.402926088701062e-05, "loss": 0.2025, "step": 5375 }, { "epoch": 1.5142133408387277, "grad_norm": 0.4494927227497101, "learning_rate": 3.38448644825839e-05, "loss": 0.1211, "step": 5380 }, { "epoch": 1.5156206023079088, "grad_norm": 1.2481530904769897, "learning_rate": 3.36608672151689e-05, "loss": 0.1325, "step": 5385 }, { "epoch": 1.5170278637770898, "grad_norm": 0.7955223321914673, "learning_rate": 3.347727019488531e-05, "loss": 0.1334, "step": 5390 }, { "epoch": 1.518435125246271, "grad_norm": 1.1012686491012573, "learning_rate": 3.329407452943799e-05, "loss": 0.1978, "step": 5395 }, { "epoch": 1.5198423867154518, "grad_norm": 2.147088050842285, "learning_rate": 3.311128132411031e-05, "loss": 0.1742, "step": 5400 }, { "epoch": 1.5212496481846327, "grad_norm": 1.0812978744506836, "learning_rate": 3.292889168175751e-05, "loss": 0.1237, "step": 5405 }, { "epoch": 1.5226569096538136, "grad_norm": 0.8602486848831177, "learning_rate": 3.274690670279984e-05, "loss": 0.1628, "step": 5410 }, { "epoch": 1.5240641711229945, "grad_norm": 0.4767683446407318, "learning_rate": 3.25653274852162e-05, "loss": 0.0893, "step": 5415 }, { "epoch": 1.5254714325921757, "grad_norm": 1.434166431427002, "learning_rate": 3.238415512453741e-05, "loss": 0.3905, "step": 5420 }, { "epoch": 1.5268786940613566, "grad_norm": 3.7128000259399414, "learning_rate": 3.220339071383948e-05, "loss": 0.336, "step": 5425 }, { "epoch": 1.5282859555305377, "grad_norm": 0.9743013381958008, "learning_rate": 3.202303534373712e-05, "loss": 0.17, "step": 5430 }, { "epoch": 1.5296932169997186, "grad_norm": 0.4060254991054535, "learning_rate": 3.184309010237728e-05, "loss": 0.1817, "step": 5435 }, { "epoch": 1.5311004784688995, "grad_norm": 1.3302080631256104, "learning_rate": 3.16635560754323e-05, "loss": 0.2442, "step": 5440 }, { "epoch": 1.5325077399380804, "grad_norm": 1.5643320083618164, "learning_rate": 3.148443434609367e-05, "loss": 0.3225, "step": 5445 }, { "epoch": 1.5339150014072613, "grad_norm": 1.2559304237365723, "learning_rate": 3.1305725995065205e-05, "loss": 0.1861, "step": 5450 }, { "epoch": 1.5353222628764425, "grad_norm": 1.1454960107803345, "learning_rate": 3.112743210055677e-05, "loss": 0.1262, "step": 5455 }, { "epoch": 1.5367295243456234, "grad_norm": 0.46115657687187195, "learning_rate": 3.0949553738277634e-05, "loss": 0.1827, "step": 5460 }, { "epoch": 1.5381367858148045, "grad_norm": 1.2840021848678589, "learning_rate": 3.077209198143002e-05, "loss": 0.1399, "step": 5465 }, { "epoch": 1.5395440472839854, "grad_norm": 1.189970850944519, "learning_rate": 3.0595047900702564e-05, "loss": 0.2078, "step": 5470 }, { "epoch": 1.5409513087531663, "grad_norm": 0.5335509181022644, "learning_rate": 3.041842256426404e-05, "loss": 0.1423, "step": 5475 }, { "epoch": 1.5423585702223472, "grad_norm": 0.8606838583946228, "learning_rate": 3.024221703775665e-05, "loss": 0.1468, "step": 5480 }, { "epoch": 1.5437658316915281, "grad_norm": 1.3679966926574707, "learning_rate": 3.0066432384289844e-05, "loss": 0.1247, "step": 5485 }, { "epoch": 1.5451730931607093, "grad_norm": 1.2723866701126099, "learning_rate": 2.989106966443379e-05, "loss": 0.1482, "step": 5490 }, { "epoch": 1.5465803546298902, "grad_norm": 0.8712704181671143, "learning_rate": 2.97161299362129e-05, "loss": 0.2848, "step": 5495 }, { "epoch": 1.5479876160990713, "grad_norm": 0.6967242360115051, "learning_rate": 2.9541614255099625e-05, "loss": 0.1604, "step": 5500 }, { "epoch": 1.5493948775682522, "grad_norm": 1.0415253639221191, "learning_rate": 2.9367523674007947e-05, "loss": 0.1876, "step": 5505 }, { "epoch": 1.5508021390374331, "grad_norm": 0.5861086845397949, "learning_rate": 2.9193859243287036e-05, "loss": 0.1835, "step": 5510 }, { "epoch": 1.552209400506614, "grad_norm": 1.444682002067566, "learning_rate": 2.902062201071505e-05, "loss": 0.1588, "step": 5515 }, { "epoch": 1.553616661975795, "grad_norm": 1.0231586694717407, "learning_rate": 2.8847813021492574e-05, "loss": 0.3833, "step": 5520 }, { "epoch": 1.555023923444976, "grad_norm": 1.2998064756393433, "learning_rate": 2.8675433318236567e-05, "loss": 0.1849, "step": 5525 }, { "epoch": 1.556431184914157, "grad_norm": 0.8349362015724182, "learning_rate": 2.8503483940973952e-05, "loss": 0.1391, "step": 5530 }, { "epoch": 1.5578384463833381, "grad_norm": 0.9555754661560059, "learning_rate": 2.8331965927135274e-05, "loss": 0.2073, "step": 5535 }, { "epoch": 1.559245707852519, "grad_norm": 1.703472375869751, "learning_rate": 2.8160880311548522e-05, "loss": 0.2548, "step": 5540 }, { "epoch": 1.5606529693217, "grad_norm": 0.39019107818603516, "learning_rate": 2.799022812643295e-05, "loss": 0.1277, "step": 5545 }, { "epoch": 1.5620602307908809, "grad_norm": 1.0451160669326782, "learning_rate": 2.782001040139267e-05, "loss": 0.3046, "step": 5550 }, { "epoch": 1.5634674922600618, "grad_norm": 0.8136467337608337, "learning_rate": 2.765022816341063e-05, "loss": 0.197, "step": 5555 }, { "epoch": 1.564874753729243, "grad_norm": 0.6249985098838806, "learning_rate": 2.7480882436842335e-05, "loss": 0.1592, "step": 5560 }, { "epoch": 1.566282015198424, "grad_norm": 0.5969499945640564, "learning_rate": 2.7311974243409565e-05, "loss": 0.2353, "step": 5565 }, { "epoch": 1.567689276667605, "grad_norm": 0.5542153716087341, "learning_rate": 2.7143504602194448e-05, "loss": 0.1407, "step": 5570 }, { "epoch": 1.5690965381367858, "grad_norm": 0.40066176652908325, "learning_rate": 2.697547452963307e-05, "loss": 0.1318, "step": 5575 }, { "epoch": 1.5705037996059668, "grad_norm": 0.4262009859085083, "learning_rate": 2.680788503950944e-05, "loss": 0.171, "step": 5580 }, { "epoch": 1.5719110610751477, "grad_norm": 0.7851074934005737, "learning_rate": 2.664073714294948e-05, "loss": 0.2443, "step": 5585 }, { "epoch": 1.5733183225443286, "grad_norm": 0.39711621403694153, "learning_rate": 2.6474031848414704e-05, "loss": 0.2419, "step": 5590 }, { "epoch": 1.5747255840135097, "grad_norm": 0.4387623369693756, "learning_rate": 2.6307770161696354e-05, "loss": 0.0821, "step": 5595 }, { "epoch": 1.5761328454826908, "grad_norm": 0.9057246446609497, "learning_rate": 2.6141953085909198e-05, "loss": 0.2652, "step": 5600 }, { "epoch": 1.5775401069518717, "grad_norm": 0.7787453532218933, "learning_rate": 2.597658162148544e-05, "loss": 0.2335, "step": 5605 }, { "epoch": 1.5789473684210527, "grad_norm": 1.116365909576416, "learning_rate": 2.5811656766168902e-05, "loss": 0.2092, "step": 5610 }, { "epoch": 1.5803546298902336, "grad_norm": 0.741118848323822, "learning_rate": 2.5647179515008724e-05, "loss": 0.18, "step": 5615 }, { "epoch": 1.5817618913594145, "grad_norm": 0.9240850806236267, "learning_rate": 2.548315086035351e-05, "loss": 0.2047, "step": 5620 }, { "epoch": 1.5831691528285956, "grad_norm": 1.0324885845184326, "learning_rate": 2.5319571791845408e-05, "loss": 0.1117, "step": 5625 }, { "epoch": 1.5845764142977765, "grad_norm": 1.108396053314209, "learning_rate": 2.5156443296414013e-05, "loss": 0.1582, "step": 5630 }, { "epoch": 1.5859836757669576, "grad_norm": 1.0466639995574951, "learning_rate": 2.4993766358270388e-05, "loss": 0.2145, "step": 5635 }, { "epoch": 1.5873909372361386, "grad_norm": 1.1003303527832031, "learning_rate": 2.4831541958901293e-05, "loss": 0.1401, "step": 5640 }, { "epoch": 1.5887981987053195, "grad_norm": 0.7945972084999084, "learning_rate": 2.4669771077063152e-05, "loss": 0.101, "step": 5645 }, { "epoch": 1.5902054601745004, "grad_norm": 1.6851614713668823, "learning_rate": 2.4508454688776105e-05, "loss": 0.2356, "step": 5650 }, { "epoch": 1.5916127216436813, "grad_norm": 0.708411693572998, "learning_rate": 2.434759376731819e-05, "loss": 0.2346, "step": 5655 }, { "epoch": 1.5930199831128624, "grad_norm": 0.9913239479064941, "learning_rate": 2.4187189283219446e-05, "loss": 0.1195, "step": 5660 }, { "epoch": 1.5944272445820433, "grad_norm": 1.0097897052764893, "learning_rate": 2.4027242204256108e-05, "loss": 0.1723, "step": 5665 }, { "epoch": 1.5958345060512245, "grad_norm": 0.8258925080299377, "learning_rate": 2.3867753495444723e-05, "loss": 0.1539, "step": 5670 }, { "epoch": 1.5972417675204054, "grad_norm": 0.5283498764038086, "learning_rate": 2.3708724119036262e-05, "loss": 0.1165, "step": 5675 }, { "epoch": 1.5986490289895863, "grad_norm": 1.170369267463684, "learning_rate": 2.355015503451048e-05, "loss": 0.1951, "step": 5680 }, { "epoch": 1.6000562904587672, "grad_norm": 0.8622944355010986, "learning_rate": 2.339204719856998e-05, "loss": 0.153, "step": 5685 }, { "epoch": 1.601463551927948, "grad_norm": 0.6249514818191528, "learning_rate": 2.323440156513448e-05, "loss": 0.0686, "step": 5690 }, { "epoch": 1.6028708133971292, "grad_norm": 0.2732272446155548, "learning_rate": 2.3077219085335054e-05, "loss": 0.1054, "step": 5695 }, { "epoch": 1.6042780748663101, "grad_norm": 1.5117753744125366, "learning_rate": 2.2920500707508496e-05, "loss": 0.1682, "step": 5700 }, { "epoch": 1.6056853363354913, "grad_norm": 1.9940603971481323, "learning_rate": 2.2764247377191405e-05, "loss": 0.2375, "step": 5705 }, { "epoch": 1.6070925978046722, "grad_norm": 1.0817060470581055, "learning_rate": 2.2608460037114642e-05, "loss": 0.2294, "step": 5710 }, { "epoch": 1.608499859273853, "grad_norm": 0.4378751814365387, "learning_rate": 2.2453139627197618e-05, "loss": 0.1674, "step": 5715 }, { "epoch": 1.609907120743034, "grad_norm": 0.5405195951461792, "learning_rate": 2.22982870845425e-05, "loss": 0.3422, "step": 5720 }, { "epoch": 1.611314382212215, "grad_norm": 1.4159220457077026, "learning_rate": 2.214390334342875e-05, "loss": 0.2116, "step": 5725 }, { "epoch": 1.612721643681396, "grad_norm": 1.1930686235427856, "learning_rate": 2.1989989335307304e-05, "loss": 0.0965, "step": 5730 }, { "epoch": 1.614128905150577, "grad_norm": 1.2334959506988525, "learning_rate": 2.1836545988795054e-05, "loss": 0.1547, "step": 5735 }, { "epoch": 1.615536166619758, "grad_norm": 0.7615369558334351, "learning_rate": 2.168357422966928e-05, "loss": 0.2468, "step": 5740 }, { "epoch": 1.616943428088939, "grad_norm": 0.7710257172584534, "learning_rate": 2.153107498086193e-05, "loss": 0.1674, "step": 5745 }, { "epoch": 1.61835068955812, "grad_norm": 0.464054673910141, "learning_rate": 2.137904916245419e-05, "loss": 0.2004, "step": 5750 }, { "epoch": 1.6197579510273008, "grad_norm": 0.3523075580596924, "learning_rate": 2.1227497691670894e-05, "loss": 0.2314, "step": 5755 }, { "epoch": 1.6211652124964817, "grad_norm": 0.8045745491981506, "learning_rate": 2.1076421482874877e-05, "loss": 0.1431, "step": 5760 }, { "epoch": 1.6225724739656628, "grad_norm": 0.7054654955863953, "learning_rate": 2.0925821447561665e-05, "loss": 0.1056, "step": 5765 }, { "epoch": 1.6239797354348438, "grad_norm": 1.5930366516113281, "learning_rate": 2.077569849435379e-05, "loss": 0.2394, "step": 5770 }, { "epoch": 1.6253869969040249, "grad_norm": 0.678402304649353, "learning_rate": 2.062605352899537e-05, "loss": 0.1482, "step": 5775 }, { "epoch": 1.6267942583732058, "grad_norm": 1.009436845779419, "learning_rate": 2.0476887454346716e-05, "loss": 0.2381, "step": 5780 }, { "epoch": 1.6282015198423867, "grad_norm": 0.5717734098434448, "learning_rate": 2.0328201170378813e-05, "loss": 0.1877, "step": 5785 }, { "epoch": 1.6296087813115676, "grad_norm": 1.0021076202392578, "learning_rate": 2.0179995574167842e-05, "loss": 0.1836, "step": 5790 }, { "epoch": 1.6310160427807485, "grad_norm": 0.5409684777259827, "learning_rate": 2.0032271559889915e-05, "loss": 0.21, "step": 5795 }, { "epoch": 1.6324233042499297, "grad_norm": 1.6268481016159058, "learning_rate": 1.9885030018815487e-05, "loss": 0.1786, "step": 5800 }, { "epoch": 1.6338305657191106, "grad_norm": 1.0220392942428589, "learning_rate": 1.9738271839304213e-05, "loss": 0.2016, "step": 5805 }, { "epoch": 1.6352378271882917, "grad_norm": 0.8178629875183105, "learning_rate": 1.959199790679934e-05, "loss": 0.1491, "step": 5810 }, { "epoch": 1.6366450886574726, "grad_norm": 2.1935439109802246, "learning_rate": 1.944620910382252e-05, "loss": 0.1966, "step": 5815 }, { "epoch": 1.6380523501266535, "grad_norm": 1.1369730234146118, "learning_rate": 1.930090630996849e-05, "loss": 0.2084, "step": 5820 }, { "epoch": 1.6394596115958344, "grad_norm": 0.8570969104766846, "learning_rate": 1.915609040189972e-05, "loss": 0.1779, "step": 5825 }, { "epoch": 1.6408668730650153, "grad_norm": 0.8881973624229431, "learning_rate": 1.901176225334105e-05, "loss": 0.2334, "step": 5830 }, { "epoch": 1.6422741345341965, "grad_norm": 1.057015299797058, "learning_rate": 1.886792273507457e-05, "loss": 0.2208, "step": 5835 }, { "epoch": 1.6436813960033776, "grad_norm": 0.40783455967903137, "learning_rate": 1.8724572714934307e-05, "loss": 0.0648, "step": 5840 }, { "epoch": 1.6450886574725585, "grad_norm": 0.8724305629730225, "learning_rate": 1.8581713057800933e-05, "loss": 0.2695, "step": 5845 }, { "epoch": 1.6464959189417394, "grad_norm": 1.3229783773422241, "learning_rate": 1.8439344625596534e-05, "loss": 0.1555, "step": 5850 }, { "epoch": 1.6479031804109203, "grad_norm": 0.7381983399391174, "learning_rate": 1.8297468277279618e-05, "loss": 0.177, "step": 5855 }, { "epoch": 1.6493104418801012, "grad_norm": 0.4356767535209656, "learning_rate": 1.8156084868839617e-05, "loss": 0.094, "step": 5860 }, { "epoch": 1.6507177033492821, "grad_norm": 2.0452256202697754, "learning_rate": 1.8015195253292016e-05, "loss": 0.3872, "step": 5865 }, { "epoch": 1.6521249648184633, "grad_norm": 0.7345725297927856, "learning_rate": 1.7874800280672953e-05, "loss": 0.3794, "step": 5870 }, { "epoch": 1.6535322262876444, "grad_norm": 0.5564286112785339, "learning_rate": 1.773490079803436e-05, "loss": 0.194, "step": 5875 }, { "epoch": 1.6549394877568253, "grad_norm": 1.4534375667572021, "learning_rate": 1.7595497649438565e-05, "loss": 0.2468, "step": 5880 }, { "epoch": 1.6563467492260062, "grad_norm": 1.159037709236145, "learning_rate": 1.745659167595337e-05, "loss": 0.2072, "step": 5885 }, { "epoch": 1.6577540106951871, "grad_norm": 0.9856454133987427, "learning_rate": 1.7318183715647017e-05, "loss": 0.2057, "step": 5890 }, { "epoch": 1.659161272164368, "grad_norm": 0.9816296696662903, "learning_rate": 1.7180274603583035e-05, "loss": 0.0591, "step": 5895 }, { "epoch": 1.660568533633549, "grad_norm": 0.6953201293945312, "learning_rate": 1.7042865171815158e-05, "loss": 0.1549, "step": 5900 }, { "epoch": 1.66197579510273, "grad_norm": 0.9859986901283264, "learning_rate": 1.6905956249382448e-05, "loss": 0.1446, "step": 5905 }, { "epoch": 1.6633830565719112, "grad_norm": 2.2135300636291504, "learning_rate": 1.6769548662304224e-05, "loss": 0.2074, "step": 5910 }, { "epoch": 1.6647903180410921, "grad_norm": 0.7724807858467102, "learning_rate": 1.6633643233575014e-05, "loss": 0.1867, "step": 5915 }, { "epoch": 1.666197579510273, "grad_norm": 0.6000497341156006, "learning_rate": 1.6498240783159656e-05, "loss": 0.3259, "step": 5920 }, { "epoch": 1.667604840979454, "grad_norm": 1.0605989694595337, "learning_rate": 1.6363342127988435e-05, "loss": 0.2042, "step": 5925 }, { "epoch": 1.6690121024486348, "grad_norm": 0.4106568396091461, "learning_rate": 1.6228948081951943e-05, "loss": 0.1073, "step": 5930 }, { "epoch": 1.670419363917816, "grad_norm": 0.9518342614173889, "learning_rate": 1.6095059455896387e-05, "loss": 0.1523, "step": 5935 }, { "epoch": 1.671826625386997, "grad_norm": 0.7186952829360962, "learning_rate": 1.596167705761852e-05, "loss": 0.1052, "step": 5940 }, { "epoch": 1.673233886856178, "grad_norm": 0.5331084728240967, "learning_rate": 1.5828801691860895e-05, "loss": 0.1007, "step": 5945 }, { "epoch": 1.674641148325359, "grad_norm": 0.530546247959137, "learning_rate": 1.5696434160306983e-05, "loss": 0.0948, "step": 5950 }, { "epoch": 1.6760484097945398, "grad_norm": 0.9805326461791992, "learning_rate": 1.5564575261576254e-05, "loss": 0.2097, "step": 5955 }, { "epoch": 1.6774556712637207, "grad_norm": 0.8919891715049744, "learning_rate": 1.5433225791219407e-05, "loss": 0.1409, "step": 5960 }, { "epoch": 1.6788629327329017, "grad_norm": 0.8015194535255432, "learning_rate": 1.5302386541713687e-05, "loss": 0.126, "step": 5965 }, { "epoch": 1.6802701942020828, "grad_norm": 0.47212257981300354, "learning_rate": 1.5172058302457881e-05, "loss": 0.1573, "step": 5970 }, { "epoch": 1.6816774556712637, "grad_norm": 0.6983383297920227, "learning_rate": 1.5042241859767735e-05, "loss": 0.1209, "step": 5975 }, { "epoch": 1.6830847171404448, "grad_norm": 1.2159236669540405, "learning_rate": 1.4912937996871168e-05, "loss": 0.1802, "step": 5980 }, { "epoch": 1.6844919786096257, "grad_norm": 0.764870822429657, "learning_rate": 1.4784147493903455e-05, "loss": 0.2714, "step": 5985 }, { "epoch": 1.6858992400788066, "grad_norm": 0.9790758490562439, "learning_rate": 1.4655871127902655e-05, "loss": 0.2561, "step": 5990 }, { "epoch": 1.6873065015479876, "grad_norm": 2.1390011310577393, "learning_rate": 1.4528109672804835e-05, "loss": 0.23, "step": 5995 }, { "epoch": 1.6887137630171685, "grad_norm": 0.39941343665122986, "learning_rate": 1.4400863899439387e-05, "loss": 0.2019, "step": 6000 }, { "epoch": 1.6901210244863496, "grad_norm": 0.6225385069847107, "learning_rate": 1.42741345755245e-05, "loss": 0.1884, "step": 6005 }, { "epoch": 1.6915282859555305, "grad_norm": 0.7307006120681763, "learning_rate": 1.4147922465662367e-05, "loss": 0.1126, "step": 6010 }, { "epoch": 1.6929355474247116, "grad_norm": 1.095548152923584, "learning_rate": 1.4022228331334675e-05, "loss": 0.1279, "step": 6015 }, { "epoch": 1.6943428088938925, "grad_norm": 0.45030713081359863, "learning_rate": 1.3897052930898035e-05, "loss": 0.1378, "step": 6020 }, { "epoch": 1.6957500703630735, "grad_norm": 1.7270435094833374, "learning_rate": 1.3772397019579242e-05, "loss": 0.2399, "step": 6025 }, { "epoch": 1.6971573318322544, "grad_norm": 1.0650115013122559, "learning_rate": 1.3648261349470948e-05, "loss": 0.1895, "step": 6030 }, { "epoch": 1.6985645933014353, "grad_norm": 1.0545300245285034, "learning_rate": 1.352464666952694e-05, "loss": 0.1122, "step": 6035 }, { "epoch": 1.6999718547706164, "grad_norm": 1.0150022506713867, "learning_rate": 1.3401553725557681e-05, "loss": 0.1585, "step": 6040 }, { "epoch": 1.7013791162397973, "grad_norm": 0.5082919001579285, "learning_rate": 1.3278983260225875e-05, "loss": 0.2291, "step": 6045 }, { "epoch": 1.7027863777089784, "grad_norm": 0.9131124019622803, "learning_rate": 1.3156936013041898e-05, "loss": 0.1303, "step": 6050 }, { "epoch": 1.7041936391781594, "grad_norm": 0.6868187189102173, "learning_rate": 1.3035412720359353e-05, "loss": 0.1357, "step": 6055 }, { "epoch": 1.7056009006473403, "grad_norm": 0.8841606378555298, "learning_rate": 1.2914414115370666e-05, "loss": 0.1271, "step": 6060 }, { "epoch": 1.7070081621165212, "grad_norm": 0.7348530888557434, "learning_rate": 1.2793940928102654e-05, "loss": 0.1773, "step": 6065 }, { "epoch": 1.708415423585702, "grad_norm": 0.7667552828788757, "learning_rate": 1.2673993885412073e-05, "loss": 0.2278, "step": 6070 }, { "epoch": 1.7098226850548832, "grad_norm": 1.5741273164749146, "learning_rate": 1.2554573710981276e-05, "loss": 0.1607, "step": 6075 }, { "epoch": 1.7112299465240641, "grad_norm": 1.1054571866989136, "learning_rate": 1.2435681125313803e-05, "loss": 0.1732, "step": 6080 }, { "epoch": 1.7126372079932453, "grad_norm": 1.193298101425171, "learning_rate": 1.2317316845730131e-05, "loss": 0.2668, "step": 6085 }, { "epoch": 1.7140444694624262, "grad_norm": 0.5256794691085815, "learning_rate": 1.2199481586363281e-05, "loss": 0.1741, "step": 6090 }, { "epoch": 1.715451730931607, "grad_norm": 1.2280601263046265, "learning_rate": 1.2082176058154426e-05, "loss": 0.1479, "step": 6095 }, { "epoch": 1.716858992400788, "grad_norm": 1.0573979616165161, "learning_rate": 1.196540096884876e-05, "loss": 0.1264, "step": 6100 }, { "epoch": 1.718266253869969, "grad_norm": 1.5370665788650513, "learning_rate": 1.1849157022991163e-05, "loss": 0.2142, "step": 6105 }, { "epoch": 1.71967351533915, "grad_norm": 0.7827951312065125, "learning_rate": 1.1733444921921899e-05, "loss": 0.2057, "step": 6110 }, { "epoch": 1.721080776808331, "grad_norm": 1.3667113780975342, "learning_rate": 1.1618265363772407e-05, "loss": 0.2746, "step": 6115 }, { "epoch": 1.722488038277512, "grad_norm": 1.506797432899475, "learning_rate": 1.15036190434612e-05, "loss": 0.1855, "step": 6120 }, { "epoch": 1.723895299746693, "grad_norm": 0.9613803029060364, "learning_rate": 1.1389506652689474e-05, "loss": 0.1031, "step": 6125 }, { "epoch": 1.7253025612158739, "grad_norm": 1.2002402544021606, "learning_rate": 1.1275928879937114e-05, "loss": 0.1781, "step": 6130 }, { "epoch": 1.7267098226850548, "grad_norm": 0.5957798361778259, "learning_rate": 1.1162886410458462e-05, "loss": 0.1176, "step": 6135 }, { "epoch": 1.7281170841542357, "grad_norm": 0.9620370268821716, "learning_rate": 1.1050379926278132e-05, "loss": 0.1515, "step": 6140 }, { "epoch": 1.7295243456234168, "grad_norm": 0.9195571541786194, "learning_rate": 1.0938410106187046e-05, "loss": 0.1121, "step": 6145 }, { "epoch": 1.730931607092598, "grad_norm": 0.4538973867893219, "learning_rate": 1.0826977625738155e-05, "loss": 0.1129, "step": 6150 }, { "epoch": 1.7323388685617789, "grad_norm": 1.3514046669006348, "learning_rate": 1.0716083157242484e-05, "loss": 0.1743, "step": 6155 }, { "epoch": 1.7337461300309598, "grad_norm": 0.8769412636756897, "learning_rate": 1.0605727369765072e-05, "loss": 0.1615, "step": 6160 }, { "epoch": 1.7351533915001407, "grad_norm": 1.3082162141799927, "learning_rate": 1.0495910929120866e-05, "loss": 0.1344, "step": 6165 }, { "epoch": 1.7365606529693216, "grad_norm": 0.8667125105857849, "learning_rate": 1.0386634497870751e-05, "loss": 0.2135, "step": 6170 }, { "epoch": 1.7379679144385025, "grad_norm": 0.7873309850692749, "learning_rate": 1.0277898735317614e-05, "loss": 0.1445, "step": 6175 }, { "epoch": 1.7393751759076836, "grad_norm": 1.0749235153198242, "learning_rate": 1.016970429750218e-05, "loss": 0.1792, "step": 6180 }, { "epoch": 1.7407824373768648, "grad_norm": 0.7576783299446106, "learning_rate": 1.0062051837199282e-05, "loss": 0.1597, "step": 6185 }, { "epoch": 1.7421896988460457, "grad_norm": 0.7447710037231445, "learning_rate": 9.954942003913758e-06, "loss": 0.1363, "step": 6190 }, { "epoch": 1.7435969603152266, "grad_norm": 0.756251335144043, "learning_rate": 9.848375443876578e-06, "loss": 0.1474, "step": 6195 }, { "epoch": 1.7450042217844075, "grad_norm": 0.45274704694747925, "learning_rate": 9.742352800040988e-06, "loss": 0.065, "step": 6200 }, { "epoch": 1.7464114832535884, "grad_norm": 1.0789294242858887, "learning_rate": 9.636874712078603e-06, "loss": 0.2623, "step": 6205 }, { "epoch": 1.7478187447227693, "grad_norm": 1.4076869487762451, "learning_rate": 9.531941816375501e-06, "loss": 0.2516, "step": 6210 }, { "epoch": 1.7492260061919505, "grad_norm": 2.701754331588745, "learning_rate": 9.427554746028478e-06, "loss": 0.2951, "step": 6215 }, { "epoch": 1.7506332676611316, "grad_norm": 0.36146071553230286, "learning_rate": 9.3237141308411e-06, "loss": 0.0842, "step": 6220 }, { "epoch": 1.7520405291303125, "grad_norm": 1.120956540107727, "learning_rate": 9.22042059732008e-06, "loss": 0.2894, "step": 6225 }, { "epoch": 1.7534477905994934, "grad_norm": 0.5138603448867798, "learning_rate": 9.117674768671313e-06, "loss": 0.0713, "step": 6230 }, { "epoch": 1.7548550520686743, "grad_norm": 0.8469157814979553, "learning_rate": 9.015477264796202e-06, "loss": 0.2038, "step": 6235 }, { "epoch": 1.7562623135378552, "grad_norm": 1.5071958303451538, "learning_rate": 8.913828702287974e-06, "loss": 0.3285, "step": 6240 }, { "epoch": 1.7576695750070364, "grad_norm": 1.6233199834823608, "learning_rate": 8.812729694427879e-06, "loss": 0.1192, "step": 6245 }, { "epoch": 1.7590768364762173, "grad_norm": 0.884638786315918, "learning_rate": 8.712180851181462e-06, "loss": 0.1612, "step": 6250 }, { "epoch": 1.7604840979453984, "grad_norm": 1.5049396753311157, "learning_rate": 8.612182779195021e-06, "loss": 0.1233, "step": 6255 }, { "epoch": 1.7618913594145793, "grad_norm": 1.0843751430511475, "learning_rate": 8.512736081791772e-06, "loss": 0.2496, "step": 6260 }, { "epoch": 1.7632986208837602, "grad_norm": 0.9301806688308716, "learning_rate": 8.413841358968332e-06, "loss": 0.2379, "step": 6265 }, { "epoch": 1.7647058823529411, "grad_norm": 1.611035943031311, "learning_rate": 8.315499207391075e-06, "loss": 0.1856, "step": 6270 }, { "epoch": 1.766113143822122, "grad_norm": 1.3043655157089233, "learning_rate": 8.217710220392526e-06, "loss": 0.1456, "step": 6275 }, { "epoch": 1.7675204052913032, "grad_norm": 1.800098180770874, "learning_rate": 8.12047498796773e-06, "loss": 0.2416, "step": 6280 }, { "epoch": 1.768927666760484, "grad_norm": 0.7097885608673096, "learning_rate": 8.023794096770808e-06, "loss": 0.141, "step": 6285 }, { "epoch": 1.7703349282296652, "grad_norm": 1.1929750442504883, "learning_rate": 7.927668130111243e-06, "loss": 0.3433, "step": 6290 }, { "epoch": 1.7717421896988461, "grad_norm": 1.647980809211731, "learning_rate": 7.832097667950588e-06, "loss": 0.2052, "step": 6295 }, { "epoch": 1.773149451168027, "grad_norm": 0.43591317534446716, "learning_rate": 7.737083286898749e-06, "loss": 0.2104, "step": 6300 }, { "epoch": 1.774556712637208, "grad_norm": 1.241782546043396, "learning_rate": 7.642625560210637e-06, "loss": 0.1109, "step": 6305 }, { "epoch": 1.7759639741063888, "grad_norm": 0.9579405784606934, "learning_rate": 7.548725057782658e-06, "loss": 0.1786, "step": 6310 }, { "epoch": 1.77737123557557, "grad_norm": 0.7312494516372681, "learning_rate": 7.455382346149342e-06, "loss": 0.1228, "step": 6315 }, { "epoch": 1.7787784970447509, "grad_norm": 0.7087497711181641, "learning_rate": 7.36259798847978e-06, "loss": 0.1424, "step": 6320 }, { "epoch": 1.780185758513932, "grad_norm": 1.6807194948196411, "learning_rate": 7.2703725445744105e-06, "loss": 0.1199, "step": 6325 }, { "epoch": 1.781593019983113, "grad_norm": 1.101808786392212, "learning_rate": 7.178706570861515e-06, "loss": 0.0979, "step": 6330 }, { "epoch": 1.7830002814522938, "grad_norm": 1.7121551036834717, "learning_rate": 7.087600620393864e-06, "loss": 0.101, "step": 6335 }, { "epoch": 1.7844075429214747, "grad_norm": 0.6395900845527649, "learning_rate": 6.997055242845441e-06, "loss": 0.2197, "step": 6340 }, { "epoch": 1.7858148043906557, "grad_norm": 0.9732767343521118, "learning_rate": 6.907070984508124e-06, "loss": 0.1321, "step": 6345 }, { "epoch": 1.7872220658598368, "grad_norm": 1.2426737546920776, "learning_rate": 6.8176483882883e-06, "loss": 0.2246, "step": 6350 }, { "epoch": 1.7886293273290177, "grad_norm": 1.6869935989379883, "learning_rate": 6.728787993703733e-06, "loss": 0.2733, "step": 6355 }, { "epoch": 1.7900365887981988, "grad_norm": 0.49518850445747375, "learning_rate": 6.640490336880134e-06, "loss": 0.1142, "step": 6360 }, { "epoch": 1.7914438502673797, "grad_norm": 0.7494794726371765, "learning_rate": 6.552755950548095e-06, "loss": 0.2115, "step": 6365 }, { "epoch": 1.7928511117365606, "grad_norm": 0.7595309019088745, "learning_rate": 6.465585364039795e-06, "loss": 0.1135, "step": 6370 }, { "epoch": 1.7942583732057416, "grad_norm": 0.7823693752288818, "learning_rate": 6.378979103285765e-06, "loss": 0.1422, "step": 6375 }, { "epoch": 1.7956656346749225, "grad_norm": 1.9872539043426514, "learning_rate": 6.292937690811795e-06, "loss": 0.22, "step": 6380 }, { "epoch": 1.7970728961441036, "grad_norm": 0.46582117676734924, "learning_rate": 6.207461645735746e-06, "loss": 0.1519, "step": 6385 }, { "epoch": 1.7984801576132845, "grad_norm": 0.40433597564697266, "learning_rate": 6.122551483764416e-06, "loss": 0.2422, "step": 6390 }, { "epoch": 1.7998874190824656, "grad_norm": 1.4909939765930176, "learning_rate": 6.038207717190436e-06, "loss": 0.1638, "step": 6395 }, { "epoch": 1.8012946805516465, "grad_norm": 0.7252668738365173, "learning_rate": 5.954430854889182e-06, "loss": 0.1053, "step": 6400 }, { "epoch": 1.8027019420208275, "grad_norm": 1.4477570056915283, "learning_rate": 5.871221402315674e-06, "loss": 0.1934, "step": 6405 }, { "epoch": 1.8041092034900084, "grad_norm": 0.43066859245300293, "learning_rate": 5.788579861501597e-06, "loss": 0.114, "step": 6410 }, { "epoch": 1.8055164649591893, "grad_norm": 1.1360474824905396, "learning_rate": 5.706506731052175e-06, "loss": 0.1447, "step": 6415 }, { "epoch": 1.8069237264283704, "grad_norm": 0.6951930522918701, "learning_rate": 5.625002506143218e-06, "loss": 0.1401, "step": 6420 }, { "epoch": 1.8083309878975513, "grad_norm": 1.213666319847107, "learning_rate": 5.544067678518194e-06, "loss": 0.1737, "step": 6425 }, { "epoch": 1.8097382493667324, "grad_norm": 0.9512806534767151, "learning_rate": 5.46370273648511e-06, "loss": 0.1517, "step": 6430 }, { "epoch": 1.8111455108359134, "grad_norm": 1.4045182466506958, "learning_rate": 5.3839081649137205e-06, "loss": 0.1899, "step": 6435 }, { "epoch": 1.8125527723050943, "grad_norm": 0.579311192035675, "learning_rate": 5.304684445232522e-06, "loss": 0.1442, "step": 6440 }, { "epoch": 1.8139600337742752, "grad_norm": 1.6119418144226074, "learning_rate": 5.2260320554258225e-06, "loss": 0.1473, "step": 6445 }, { "epoch": 1.815367295243456, "grad_norm": 1.2963722944259644, "learning_rate": 5.147951470030976e-06, "loss": 0.227, "step": 6450 }, { "epoch": 1.8167745567126372, "grad_norm": 1.3112095594406128, "learning_rate": 5.070443160135352e-06, "loss": 0.116, "step": 6455 }, { "epoch": 1.8181818181818183, "grad_norm": 0.49451136589050293, "learning_rate": 4.993507593373625e-06, "loss": 0.2077, "step": 6460 }, { "epoch": 1.8195890796509993, "grad_norm": 1.0468064546585083, "learning_rate": 4.917145233924924e-06, "loss": 0.246, "step": 6465 }, { "epoch": 1.8209963411201802, "grad_norm": 0.5947392582893372, "learning_rate": 4.841356542510022e-06, "loss": 0.1534, "step": 6470 }, { "epoch": 1.822403602589361, "grad_norm": 0.3909468352794647, "learning_rate": 4.766141976388494e-06, "loss": 0.1792, "step": 6475 }, { "epoch": 1.823810864058542, "grad_norm": 0.911483645439148, "learning_rate": 4.691501989356084e-06, "loss": 0.2147, "step": 6480 }, { "epoch": 1.825218125527723, "grad_norm": 0.5338053703308105, "learning_rate": 4.617437031741867e-06, "loss": 0.0811, "step": 6485 }, { "epoch": 1.826625386996904, "grad_norm": 0.5877882242202759, "learning_rate": 4.54394755040558e-06, "loss": 0.1473, "step": 6490 }, { "epoch": 1.8280326484660852, "grad_norm": 0.21510696411132812, "learning_rate": 4.471033988734885e-06, "loss": 0.2545, "step": 6495 }, { "epoch": 1.829439909935266, "grad_norm": 1.325976014137268, "learning_rate": 4.398696786642731e-06, "loss": 0.1934, "step": 6500 }, { "epoch": 1.830847171404447, "grad_norm": 0.5961741805076599, "learning_rate": 4.326936380564705e-06, "loss": 0.1732, "step": 6505 }, { "epoch": 1.8322544328736279, "grad_norm": 1.4790091514587402, "learning_rate": 4.255753203456392e-06, "loss": 0.1699, "step": 6510 }, { "epoch": 1.8336616943428088, "grad_norm": 0.5095391869544983, "learning_rate": 4.185147684790691e-06, "loss": 0.1335, "step": 6515 }, { "epoch": 1.83506895581199, "grad_norm": 0.5565084218978882, "learning_rate": 4.115120250555349e-06, "loss": 0.1748, "step": 6520 }, { "epoch": 1.8364762172811708, "grad_norm": 1.2198169231414795, "learning_rate": 4.045671323250333e-06, "loss": 0.2197, "step": 6525 }, { "epoch": 1.837883478750352, "grad_norm": 0.4299394488334656, "learning_rate": 3.976801321885215e-06, "loss": 0.1229, "step": 6530 }, { "epoch": 1.8392907402195329, "grad_norm": 0.8082312345504761, "learning_rate": 3.908510661976739e-06, "loss": 0.2784, "step": 6535 }, { "epoch": 1.8406980016887138, "grad_norm": 0.7714455723762512, "learning_rate": 3.840799755546298e-06, "loss": 0.1128, "step": 6540 }, { "epoch": 1.8421052631578947, "grad_norm": 1.8380225896835327, "learning_rate": 3.773669011117398e-06, "loss": 0.2196, "step": 6545 }, { "epoch": 1.8435125246270756, "grad_norm": 1.4072784185409546, "learning_rate": 3.707118833713241e-06, "loss": 0.1164, "step": 6550 }, { "epoch": 1.8449197860962567, "grad_norm": 2.7376558780670166, "learning_rate": 3.6411496248542897e-06, "loss": 0.1715, "step": 6555 }, { "epoch": 1.8463270475654376, "grad_norm": 1.3996756076812744, "learning_rate": 3.5757617825557533e-06, "loss": 0.1792, "step": 6560 }, { "epoch": 1.8477343090346188, "grad_norm": 1.6355584859848022, "learning_rate": 3.5109557013253357e-06, "loss": 0.1213, "step": 6565 }, { "epoch": 1.8491415705037997, "grad_norm": 0.6846399903297424, "learning_rate": 3.446731772160716e-06, "loss": 0.1431, "step": 6570 }, { "epoch": 1.8505488319729806, "grad_norm": 1.0300202369689941, "learning_rate": 3.3830903825472493e-06, "loss": 0.1996, "step": 6575 }, { "epoch": 1.8519560934421615, "grad_norm": 0.8449344038963318, "learning_rate": 3.3200319164556683e-06, "loss": 0.1457, "step": 6580 }, { "epoch": 1.8533633549113424, "grad_norm": 0.8704646825790405, "learning_rate": 3.2575567543396746e-06, "loss": 0.1493, "step": 6585 }, { "epoch": 1.8547706163805235, "grad_norm": 1.0447726249694824, "learning_rate": 3.195665273133719e-06, "loss": 0.2999, "step": 6590 }, { "epoch": 1.8561778778497044, "grad_norm": 0.6128522157669067, "learning_rate": 3.134357846250735e-06, "loss": 0.0989, "step": 6595 }, { "epoch": 1.8575851393188856, "grad_norm": 0.7889478802680969, "learning_rate": 3.073634843579776e-06, "loss": 0.1107, "step": 6600 }, { "epoch": 1.8589924007880665, "grad_norm": 1.114986777305603, "learning_rate": 3.0134966314839144e-06, "loss": 0.0739, "step": 6605 }, { "epoch": 1.8603996622572474, "grad_norm": 0.4977349638938904, "learning_rate": 2.953943572797968e-06, "loss": 0.0591, "step": 6610 }, { "epoch": 1.8618069237264283, "grad_norm": 0.6706826686859131, "learning_rate": 2.8949760268263017e-06, "loss": 0.1383, "step": 6615 }, { "epoch": 1.8632141851956092, "grad_norm": 0.6721628308296204, "learning_rate": 2.8365943493406934e-06, "loss": 0.1539, "step": 6620 }, { "epoch": 1.8646214466647903, "grad_norm": 0.6661956310272217, "learning_rate": 2.7787988925782048e-06, "loss": 0.1833, "step": 6625 }, { "epoch": 1.8660287081339713, "grad_norm": 1.3089790344238281, "learning_rate": 2.7215900052389497e-06, "loss": 0.1368, "step": 6630 }, { "epoch": 1.8674359696031524, "grad_norm": 1.6742780208587646, "learning_rate": 2.6649680324841166e-06, "loss": 0.2486, "step": 6635 }, { "epoch": 1.8688432310723333, "grad_norm": 0.8076462745666504, "learning_rate": 2.608933315933837e-06, "loss": 0.115, "step": 6640 }, { "epoch": 1.8702504925415142, "grad_norm": 1.4497947692871094, "learning_rate": 2.5534861936650665e-06, "loss": 0.1807, "step": 6645 }, { "epoch": 1.8716577540106951, "grad_norm": 0.8782854676246643, "learning_rate": 2.4986270002096747e-06, "loss": 0.1052, "step": 6650 }, { "epoch": 1.873065015479876, "grad_norm": 0.6687735915184021, "learning_rate": 2.4443560665523e-06, "loss": 0.2023, "step": 6655 }, { "epoch": 1.8744722769490572, "grad_norm": 0.698962390422821, "learning_rate": 2.3906737201284002e-06, "loss": 0.1023, "step": 6660 }, { "epoch": 1.875879538418238, "grad_norm": 1.2811174392700195, "learning_rate": 2.3375802848223385e-06, "loss": 0.1374, "step": 6665 }, { "epoch": 1.8772867998874192, "grad_norm": 0.8447235226631165, "learning_rate": 2.285076080965287e-06, "loss": 0.1569, "step": 6670 }, { "epoch": 1.8786940613566, "grad_norm": 0.6996911764144897, "learning_rate": 2.233161425333474e-06, "loss": 0.1395, "step": 6675 }, { "epoch": 1.880101322825781, "grad_norm": 1.388584852218628, "learning_rate": 2.1818366311460946e-06, "loss": 0.1692, "step": 6680 }, { "epoch": 1.881508584294962, "grad_norm": 0.5281504988670349, "learning_rate": 2.1311020080635346e-06, "loss": 0.1218, "step": 6685 }, { "epoch": 1.8829158457641428, "grad_norm": 0.8310534954071045, "learning_rate": 2.080957862185484e-06, "loss": 0.2253, "step": 6690 }, { "epoch": 1.884323107233324, "grad_norm": 0.5924013257026672, "learning_rate": 2.031404496049072e-06, "loss": 0.0862, "step": 6695 }, { "epoch": 1.8857303687025049, "grad_norm": 0.445305198431015, "learning_rate": 1.982442208627033e-06, "loss": 0.2208, "step": 6700 }, { "epoch": 1.887137630171686, "grad_norm": 0.66776442527771, "learning_rate": 1.9340712953259565e-06, "loss": 0.159, "step": 6705 }, { "epoch": 1.888544891640867, "grad_norm": 0.8003804683685303, "learning_rate": 1.886292047984395e-06, "loss": 0.1276, "step": 6710 }, { "epoch": 1.8899521531100478, "grad_norm": 1.1968119144439697, "learning_rate": 1.839104754871257e-06, "loss": 0.1147, "step": 6715 }, { "epoch": 1.8913594145792287, "grad_norm": 2.06772518157959, "learning_rate": 1.7925097006839198e-06, "loss": 0.1263, "step": 6720 }, { "epoch": 1.8927666760484096, "grad_norm": 0.8591898083686829, "learning_rate": 1.746507166546596e-06, "loss": 0.1612, "step": 6725 }, { "epoch": 1.8941739375175908, "grad_norm": 1.3790104389190674, "learning_rate": 1.7010974300086358e-06, "loss": 0.1714, "step": 6730 }, { "epoch": 1.8955811989867717, "grad_norm": 0.6857600808143616, "learning_rate": 1.656280765042828e-06, "loss": 0.1331, "step": 6735 }, { "epoch": 1.8969884604559528, "grad_norm": 0.9561905860900879, "learning_rate": 1.612057442043724e-06, "loss": 0.13, "step": 6740 }, { "epoch": 1.8983957219251337, "grad_norm": 1.3840196132659912, "learning_rate": 1.5684277278260718e-06, "loss": 0.2562, "step": 6745 }, { "epoch": 1.8998029833943146, "grad_norm": 0.6963467001914978, "learning_rate": 1.525391885623173e-06, "loss": 0.1882, "step": 6750 }, { "epoch": 1.9012102448634955, "grad_norm": 0.9500248432159424, "learning_rate": 1.4829501750852626e-06, "loss": 0.131, "step": 6755 }, { "epoch": 1.9026175063326765, "grad_norm": 0.8108523488044739, "learning_rate": 1.4411028522779757e-06, "loss": 0.1891, "step": 6760 }, { "epoch": 1.9040247678018576, "grad_norm": 0.6868911981582642, "learning_rate": 1.3998501696808274e-06, "loss": 0.1761, "step": 6765 }, { "epoch": 1.9054320292710387, "grad_norm": 1.8471946716308594, "learning_rate": 1.3591923761856363e-06, "loss": 0.2683, "step": 6770 }, { "epoch": 1.9068392907402196, "grad_norm": 0.5496200919151306, "learning_rate": 1.3191297170950578e-06, "loss": 0.1627, "step": 6775 }, { "epoch": 1.9082465522094005, "grad_norm": 0.7432734370231628, "learning_rate": 1.2796624341210873e-06, "loss": 0.1406, "step": 6780 }, { "epoch": 1.9096538136785814, "grad_norm": 0.773916482925415, "learning_rate": 1.2407907653836038e-06, "loss": 0.1308, "step": 6785 }, { "epoch": 1.9110610751477624, "grad_norm": 1.0941839218139648, "learning_rate": 1.2025149454089723e-06, "loss": 0.1269, "step": 6790 }, { "epoch": 1.9124683366169433, "grad_norm": 0.5930225253105164, "learning_rate": 1.1648352051285448e-06, "loss": 0.1393, "step": 6795 }, { "epoch": 1.9138755980861244, "grad_norm": 0.38355159759521484, "learning_rate": 1.127751771877339e-06, "loss": 0.128, "step": 6800 }, { "epoch": 1.9152828595553055, "grad_norm": 0.8687125444412231, "learning_rate": 1.0912648693926497e-06, "loss": 0.128, "step": 6805 }, { "epoch": 1.9166901210244864, "grad_norm": 0.9181435704231262, "learning_rate": 1.055374717812696e-06, "loss": 0.2078, "step": 6810 }, { "epoch": 1.9180973824936673, "grad_norm": 1.5709048509597778, "learning_rate": 1.0200815336752657e-06, "loss": 0.1745, "step": 6815 }, { "epoch": 1.9195046439628483, "grad_norm": 0.8740848302841187, "learning_rate": 9.853855299164717e-07, "loss": 0.1209, "step": 6820 }, { "epoch": 1.9209119054320292, "grad_norm": 0.46822214126586914, "learning_rate": 9.512869158693982e-07, "loss": 0.1031, "step": 6825 }, { "epoch": 1.9223191669012103, "grad_norm": 0.6493380665779114, "learning_rate": 9.177858972628794e-07, "loss": 0.1665, "step": 6830 }, { "epoch": 1.9237264283703912, "grad_norm": 0.628223180770874, "learning_rate": 8.848826762202556e-07, "loss": 0.1375, "step": 6835 }, { "epoch": 1.9251336898395723, "grad_norm": 0.8677277565002441, "learning_rate": 8.525774512581297e-07, "loss": 0.1193, "step": 6840 }, { "epoch": 1.9265409513087532, "grad_norm": 0.34191542863845825, "learning_rate": 8.208704172851911e-07, "loss": 0.1605, "step": 6845 }, { "epoch": 1.9279482127779342, "grad_norm": 0.3965689539909363, "learning_rate": 7.897617656010381e-07, "loss": 0.2008, "step": 6850 }, { "epoch": 1.929355474247115, "grad_norm": 1.651140809059143, "learning_rate": 7.592516838950348e-07, "loss": 0.259, "step": 6855 }, { "epoch": 1.930762735716296, "grad_norm": 1.2457526922225952, "learning_rate": 7.293403562451229e-07, "loss": 0.1243, "step": 6860 }, { "epoch": 1.932169997185477, "grad_norm": 0.42919033765792847, "learning_rate": 7.000279631168005e-07, "loss": 0.0686, "step": 6865 }, { "epoch": 1.933577258654658, "grad_norm": 1.004384160041809, "learning_rate": 6.713146813619564e-07, "loss": 0.1132, "step": 6870 }, { "epoch": 1.9349845201238391, "grad_norm": 0.7319831252098083, "learning_rate": 6.432006842178262e-07, "loss": 0.0594, "step": 6875 }, { "epoch": 1.93639178159302, "grad_norm": 0.9444944262504578, "learning_rate": 6.156861413059601e-07, "loss": 0.1181, "step": 6880 }, { "epoch": 1.937799043062201, "grad_norm": 1.6310319900512695, "learning_rate": 5.887712186312011e-07, "loss": 0.2333, "step": 6885 }, { "epoch": 1.9392063045313819, "grad_norm": 0.7760756015777588, "learning_rate": 5.624560785806754e-07, "loss": 0.1101, "step": 6890 }, { "epoch": 1.9406135660005628, "grad_norm": 1.4316829442977905, "learning_rate": 5.367408799227925e-07, "loss": 0.1512, "step": 6895 }, { "epoch": 1.942020827469744, "grad_norm": 0.6632144451141357, "learning_rate": 5.116257778063238e-07, "loss": 0.176, "step": 6900 }, { "epoch": 1.9434280889389248, "grad_norm": 0.4353666603565216, "learning_rate": 4.871109237594373e-07, "loss": 0.1293, "step": 6905 }, { "epoch": 1.944835350408106, "grad_norm": 2.0593976974487305, "learning_rate": 4.631964656888088e-07, "loss": 0.4206, "step": 6910 }, { "epoch": 1.9462426118772869, "grad_norm": 0.8553899526596069, "learning_rate": 4.3988254787868945e-07, "loss": 0.2033, "step": 6915 }, { "epoch": 1.9476498733464678, "grad_norm": 2.4069225788116455, "learning_rate": 4.171693109900954e-07, "loss": 0.1747, "step": 6920 }, { "epoch": 1.9490571348156487, "grad_norm": 1.0317012071609497, "learning_rate": 3.950568920598974e-07, "loss": 0.1857, "step": 6925 }, { "epoch": 1.9504643962848296, "grad_norm": 0.16559715569019318, "learning_rate": 3.735454245000436e-07, "loss": 0.1506, "step": 6930 }, { "epoch": 1.9518716577540107, "grad_norm": 1.008353590965271, "learning_rate": 3.526350380967047e-07, "loss": 0.1661, "step": 6935 }, { "epoch": 1.9532789192231916, "grad_norm": 0.8605316877365112, "learning_rate": 3.323258590095635e-07, "loss": 0.1547, "step": 6940 }, { "epoch": 1.9546861806923728, "grad_norm": 0.8140857815742493, "learning_rate": 3.126180097709597e-07, "loss": 0.204, "step": 6945 }, { "epoch": 1.9560934421615537, "grad_norm": 0.250213086605072, "learning_rate": 2.9351160928522416e-07, "loss": 0.1531, "step": 6950 }, { "epoch": 1.9575007036307346, "grad_norm": 2.0146706104278564, "learning_rate": 2.7500677282795704e-07, "loss": 0.135, "step": 6955 }, { "epoch": 1.9589079650999155, "grad_norm": 0.43031638860702515, "learning_rate": 2.57103612045273e-07, "loss": 0.1118, "step": 6960 }, { "epoch": 1.9603152265690964, "grad_norm": 1.1351455450057983, "learning_rate": 2.3980223495319034e-07, "loss": 0.1474, "step": 6965 }, { "epoch": 1.9617224880382775, "grad_norm": 0.6760854721069336, "learning_rate": 2.231027459369539e-07, "loss": 0.1577, "step": 6970 }, { "epoch": 1.9631297495074584, "grad_norm": 0.6344230771064758, "learning_rate": 2.0700524575041347e-07, "loss": 0.0911, "step": 6975 }, { "epoch": 1.9645370109766396, "grad_norm": 0.8816024661064148, "learning_rate": 1.915098315153907e-07, "loss": 0.1711, "step": 6980 }, { "epoch": 1.9659442724458205, "grad_norm": 1.2508419752120972, "learning_rate": 1.766165967211464e-07, "loss": 0.2165, "step": 6985 }, { "epoch": 1.9673515339150014, "grad_norm": 0.9682034254074097, "learning_rate": 1.6232563122373645e-07, "loss": 0.1176, "step": 6990 }, { "epoch": 1.9687587953841823, "grad_norm": 0.5194812417030334, "learning_rate": 1.4863702124554567e-07, "loss": 0.1792, "step": 6995 }, { "epoch": 1.9701660568533632, "grad_norm": 0.7501698136329651, "learning_rate": 1.3555084937475483e-07, "loss": 0.1375, "step": 7000 }, { "epoch": 1.9715733183225443, "grad_norm": 0.8848897218704224, "learning_rate": 1.2306719456478544e-07, "loss": 0.1218, "step": 7005 }, { "epoch": 1.9729805797917253, "grad_norm": 0.5296036601066589, "learning_rate": 1.1118613213388918e-07, "loss": 0.0949, "step": 7010 }, { "epoch": 1.9743878412609064, "grad_norm": 0.5823400616645813, "learning_rate": 9.990773376464812e-08, "loss": 0.1266, "step": 7015 }, { "epoch": 1.9757951027300873, "grad_norm": 1.2051528692245483, "learning_rate": 8.923206750359736e-08, "loss": 0.1841, "step": 7020 }, { "epoch": 1.9772023641992682, "grad_norm": 2.1660141944885254, "learning_rate": 7.915919776073644e-08, "loss": 0.1758, "step": 7025 }, { "epoch": 1.9786096256684491, "grad_norm": 0.9142996072769165, "learning_rate": 6.968918530920742e-08, "loss": 0.2226, "step": 7030 }, { "epoch": 1.98001688713763, "grad_norm": 2.0500295162200928, "learning_rate": 6.082208728490635e-08, "loss": 0.1638, "step": 7035 }, { "epoch": 1.9814241486068112, "grad_norm": 0.7084165811538696, "learning_rate": 5.255795718611678e-08, "loss": 0.1535, "step": 7040 }, { "epoch": 1.9828314100759923, "grad_norm": 0.5557725429534912, "learning_rate": 4.489684487322121e-08, "loss": 0.1053, "step": 7045 }, { "epoch": 1.9842386715451732, "grad_norm": 0.3313843905925751, "learning_rate": 3.783879656840128e-08, "loss": 0.1593, "step": 7050 }, { "epoch": 1.985645933014354, "grad_norm": 2.084636688232422, "learning_rate": 3.1383854855304705e-08, "loss": 0.1938, "step": 7055 }, { "epoch": 1.987053194483535, "grad_norm": 0.47041577100753784, "learning_rate": 2.553205867884545e-08, "loss": 0.0875, "step": 7060 }, { "epoch": 1.988460455952716, "grad_norm": 0.6036000847816467, "learning_rate": 2.0283443344959464e-08, "loss": 0.064, "step": 7065 }, { "epoch": 1.9898677174218968, "grad_norm": 0.40105298161506653, "learning_rate": 1.5638040520382646e-08, "loss": 0.1467, "step": 7070 }, { "epoch": 1.991274978891078, "grad_norm": 0.8283329606056213, "learning_rate": 1.1595878232428803e-08, "loss": 0.1675, "step": 7075 }, { "epoch": 1.992682240360259, "grad_norm": 0.612358570098877, "learning_rate": 8.15698086888972e-09, "loss": 0.1813, "step": 7080 }, { "epoch": 1.99408950182944, "grad_norm": 0.3482489287853241, "learning_rate": 5.321369177835323e-09, "loss": 0.1543, "step": 7085 }, { "epoch": 1.995496763298621, "grad_norm": 0.9294025301933289, "learning_rate": 3.089060267480459e-09, "loss": 0.1197, "step": 7090 }, { "epoch": 1.9969040247678018, "grad_norm": 1.7287979125976562, "learning_rate": 1.4600676061404805e-09, "loss": 0.1638, "step": 7095 }, { "epoch": 1.9983112862369827, "grad_norm": 0.451955109834671, "learning_rate": 4.344010220980188e-10, "loss": 0.2378, "step": 7100 }, { "epoch": 1.9997185477061636, "grad_norm": 0.541246771812439, "learning_rate": 1.20667035474753e-11, "loss": 0.1537, "step": 7105 }, { "epoch": 2.0, "step": 7106, "total_flos": 1.54790643235396e+18, "train_loss": 0.3593486731773929, "train_runtime": 16225.5696, "train_samples_per_second": 3.503, "train_steps_per_second": 0.438 } ], "logging_steps": 5, "max_steps": 7106, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.54790643235396e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }