{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006715916722632639, "grad_norm": 5.630690574645996, "learning_rate": 9e-07, "loss": 0.7133, "step": 10 }, { "epoch": 0.013431833445265278, "grad_norm": 4.237407207489014, "learning_rate": 1.9e-06, "loss": 0.6472, "step": 20 }, { "epoch": 0.020147750167897917, "grad_norm": 6.263025760650635, "learning_rate": 2.9e-06, "loss": 0.5851, "step": 30 }, { "epoch": 0.026863666890530557, "grad_norm": 10.21045207977295, "learning_rate": 3.9e-06, "loss": 0.5366, "step": 40 }, { "epoch": 0.0335795836131632, "grad_norm": 3.691526174545288, "learning_rate": 4.9000000000000005e-06, "loss": 0.4325, "step": 50 }, { "epoch": 0.040295500335795834, "grad_norm": 5.201944351196289, "learning_rate": 5.9e-06, "loss": 0.354, "step": 60 }, { "epoch": 0.04701141705842848, "grad_norm": 15.92503547668457, "learning_rate": 6.900000000000001e-06, "loss": 0.2003, "step": 70 }, { "epoch": 0.053727333781061114, "grad_norm": 7.618583679199219, "learning_rate": 7.9e-06, "loss": 0.2309, "step": 80 }, { "epoch": 0.06044325050369376, "grad_norm": 4.151483535766602, "learning_rate": 8.9e-06, "loss": 0.1384, "step": 90 }, { "epoch": 0.0671591672263264, "grad_norm": 9.921517372131348, "learning_rate": 9.900000000000002e-06, "loss": 0.1763, "step": 100 }, { "epoch": 0.07387508394895903, "grad_norm": 2.020857810974121, "learning_rate": 1.09e-05, "loss": 0.1451, "step": 110 }, { "epoch": 0.08059100067159167, "grad_norm": 1.8496060371398926, "learning_rate": 1.19e-05, "loss": 0.1004, "step": 120 }, { "epoch": 0.08730691739422432, "grad_norm": 9.236139297485352, "learning_rate": 1.29e-05, "loss": 0.1879, "step": 130 }, { "epoch": 0.09402283411685695, "grad_norm": 10.885250091552734, "learning_rate": 1.3900000000000002e-05, "loss": 0.1709, "step": 140 }, { "epoch": 0.10073875083948959, "grad_norm": 18.184890747070312, "learning_rate": 1.49e-05, "loss": 0.1324, "step": 150 }, { "epoch": 0.10745466756212223, "grad_norm": 6.792922019958496, "learning_rate": 1.59e-05, "loss": 0.1485, "step": 160 }, { "epoch": 0.11417058428475486, "grad_norm": 2.8304738998413086, "learning_rate": 1.69e-05, "loss": 0.0533, "step": 170 }, { "epoch": 0.12088650100738751, "grad_norm": 0.03287811204791069, "learning_rate": 1.79e-05, "loss": 0.1616, "step": 180 }, { "epoch": 0.12760241773002015, "grad_norm": 4.168348789215088, "learning_rate": 1.8900000000000002e-05, "loss": 0.0667, "step": 190 }, { "epoch": 0.1343183344526528, "grad_norm": 0.9931334257125854, "learning_rate": 1.9900000000000003e-05, "loss": 0.1678, "step": 200 }, { "epoch": 0.14103425117528542, "grad_norm": 14.898422241210938, "learning_rate": 2.09e-05, "loss": 0.1274, "step": 210 }, { "epoch": 0.14775016789791806, "grad_norm": 2.7615175247192383, "learning_rate": 2.19e-05, "loss": 0.1324, "step": 220 }, { "epoch": 0.1544660846205507, "grad_norm": 0.19374199211597443, "learning_rate": 2.29e-05, "loss": 0.2972, "step": 230 }, { "epoch": 0.16118200134318333, "grad_norm": 6.9305419921875, "learning_rate": 2.39e-05, "loss": 0.0863, "step": 240 }, { "epoch": 0.16789791806581597, "grad_norm": 8.841792106628418, "learning_rate": 2.4900000000000002e-05, "loss": 0.1672, "step": 250 }, { "epoch": 0.17461383478844864, "grad_norm": 0.10736802220344543, "learning_rate": 2.5900000000000003e-05, "loss": 0.1535, "step": 260 }, { "epoch": 0.18132975151108127, "grad_norm": 9.984793663024902, "learning_rate": 2.6900000000000003e-05, "loss": 0.0592, "step": 270 }, { "epoch": 0.1880456682337139, "grad_norm": 17.420595169067383, "learning_rate": 2.7900000000000004e-05, "loss": 0.2605, "step": 280 }, { "epoch": 0.19476158495634655, "grad_norm": 2.0211360454559326, "learning_rate": 2.8899999999999998e-05, "loss": 0.0581, "step": 290 }, { "epoch": 0.20147750167897918, "grad_norm": 4.574538707733154, "learning_rate": 2.9900000000000002e-05, "loss": 0.1315, "step": 300 }, { "epoch": 0.20819341840161182, "grad_norm": 0.37417200207710266, "learning_rate": 3.09e-05, "loss": 0.0863, "step": 310 }, { "epoch": 0.21490933512424445, "grad_norm": 0.08471789956092834, "learning_rate": 3.19e-05, "loss": 0.0509, "step": 320 }, { "epoch": 0.2216252518468771, "grad_norm": 2.391404151916504, "learning_rate": 3.29e-05, "loss": 0.0897, "step": 330 }, { "epoch": 0.22834116856950973, "grad_norm": 0.2534968852996826, "learning_rate": 3.3900000000000004e-05, "loss": 0.1236, "step": 340 }, { "epoch": 0.23505708529214236, "grad_norm": 0.9776629209518433, "learning_rate": 3.49e-05, "loss": 0.1281, "step": 350 }, { "epoch": 0.24177300201477503, "grad_norm": 2.469677209854126, "learning_rate": 3.59e-05, "loss": 0.2372, "step": 360 }, { "epoch": 0.24848891873740767, "grad_norm": 0.3327536880970001, "learning_rate": 3.69e-05, "loss": 0.083, "step": 370 }, { "epoch": 0.2552048354600403, "grad_norm": 10.457483291625977, "learning_rate": 3.79e-05, "loss": 0.1516, "step": 380 }, { "epoch": 0.2619207521826729, "grad_norm": 0.4218825697898865, "learning_rate": 3.8900000000000004e-05, "loss": 0.137, "step": 390 }, { "epoch": 0.2686366689053056, "grad_norm": 4.134459495544434, "learning_rate": 3.99e-05, "loss": 0.1811, "step": 400 }, { "epoch": 0.27535258562793824, "grad_norm": 0.6121009588241577, "learning_rate": 4.09e-05, "loss": 0.0851, "step": 410 }, { "epoch": 0.28206850235057085, "grad_norm": 0.04361363872885704, "learning_rate": 4.19e-05, "loss": 0.0387, "step": 420 }, { "epoch": 0.2887844190732035, "grad_norm": 109.2441635131836, "learning_rate": 4.29e-05, "loss": 0.1153, "step": 430 }, { "epoch": 0.2955003357958361, "grad_norm": 0.012662663124501705, "learning_rate": 4.39e-05, "loss": 0.251, "step": 440 }, { "epoch": 0.3022162525184688, "grad_norm": 0.8022059202194214, "learning_rate": 4.49e-05, "loss": 0.184, "step": 450 }, { "epoch": 0.3089321692411014, "grad_norm": 18.51458740234375, "learning_rate": 4.5900000000000004e-05, "loss": 0.1241, "step": 460 }, { "epoch": 0.31564808596373406, "grad_norm": 2.7725183963775635, "learning_rate": 4.69e-05, "loss": 0.1641, "step": 470 }, { "epoch": 0.32236400268636667, "grad_norm": 0.11448180675506592, "learning_rate": 4.79e-05, "loss": 0.0613, "step": 480 }, { "epoch": 0.32907991940899933, "grad_norm": 0.0008866311982274055, "learning_rate": 4.89e-05, "loss": 0.0064, "step": 490 }, { "epoch": 0.33579583613163194, "grad_norm": 3.4407784938812256, "learning_rate": 4.99e-05, "loss": 0.1556, "step": 500 }, { "epoch": 0.3425117528542646, "grad_norm": 12.447320938110352, "learning_rate": 4.988656415427275e-05, "loss": 0.2918, "step": 510 }, { "epoch": 0.34922766957689727, "grad_norm": 3.2200286388397217, "learning_rate": 4.976052432568692e-05, "loss": 0.155, "step": 520 }, { "epoch": 0.3559435862995299, "grad_norm": 0.31001582741737366, "learning_rate": 4.963448449710109e-05, "loss": 0.0628, "step": 530 }, { "epoch": 0.36265950302216254, "grad_norm": 1.2337498664855957, "learning_rate": 4.9508444668515254e-05, "loss": 0.1069, "step": 540 }, { "epoch": 0.36937541974479515, "grad_norm": 0.11717221140861511, "learning_rate": 4.938240483992942e-05, "loss": 0.0971, "step": 550 }, { "epoch": 0.3760913364674278, "grad_norm": 1.2564656734466553, "learning_rate": 4.925636501134358e-05, "loss": 0.0889, "step": 560 }, { "epoch": 0.3828072531900604, "grad_norm": 1.1871436834335327, "learning_rate": 4.9130325182757754e-05, "loss": 0.2583, "step": 570 }, { "epoch": 0.3895231699126931, "grad_norm": 0.31010547280311584, "learning_rate": 4.900428535417192e-05, "loss": 0.0479, "step": 580 }, { "epoch": 0.3962390866353257, "grad_norm": 0.8911293745040894, "learning_rate": 4.8878245525586084e-05, "loss": 0.1356, "step": 590 }, { "epoch": 0.40295500335795836, "grad_norm": 2.5432729721069336, "learning_rate": 4.8752205697000255e-05, "loss": 0.0841, "step": 600 }, { "epoch": 0.40967092008059103, "grad_norm": 0.2870176136493683, "learning_rate": 4.862616586841442e-05, "loss": 0.0695, "step": 610 }, { "epoch": 0.41638683680322364, "grad_norm": 1.6922942399978638, "learning_rate": 4.8500126039828585e-05, "loss": 0.0721, "step": 620 }, { "epoch": 0.4231027535258563, "grad_norm": 0.4180830717086792, "learning_rate": 4.8374086211242756e-05, "loss": 0.0194, "step": 630 }, { "epoch": 0.4298186702484889, "grad_norm": 4.236703872680664, "learning_rate": 4.824804638265692e-05, "loss": 0.1223, "step": 640 }, { "epoch": 0.4365345869711216, "grad_norm": 2.4450433254241943, "learning_rate": 4.8122006554071086e-05, "loss": 0.0755, "step": 650 }, { "epoch": 0.4432505036937542, "grad_norm": 1.0442028045654297, "learning_rate": 4.799596672548525e-05, "loss": 0.0912, "step": 660 }, { "epoch": 0.44996642041638685, "grad_norm": 5.720908164978027, "learning_rate": 4.786992689689943e-05, "loss": 0.0321, "step": 670 }, { "epoch": 0.45668233713901946, "grad_norm": 0.7653585076332092, "learning_rate": 4.774388706831359e-05, "loss": 0.0998, "step": 680 }, { "epoch": 0.4633982538616521, "grad_norm": 0.3201594352722168, "learning_rate": 4.761784723972776e-05, "loss": 0.0781, "step": 690 }, { "epoch": 0.47011417058428473, "grad_norm": 0.001702402252703905, "learning_rate": 4.749180741114192e-05, "loss": 0.013, "step": 700 }, { "epoch": 0.4768300873069174, "grad_norm": 14.604610443115234, "learning_rate": 4.7365767582556094e-05, "loss": 0.083, "step": 710 }, { "epoch": 0.48354600402955006, "grad_norm": 5.529962539672852, "learning_rate": 4.723972775397026e-05, "loss": 0.182, "step": 720 }, { "epoch": 0.49026192075218267, "grad_norm": 1.8030890226364136, "learning_rate": 4.7113687925384424e-05, "loss": 0.125, "step": 730 }, { "epoch": 0.49697783747481533, "grad_norm": 6.454009532928467, "learning_rate": 4.6987648096798595e-05, "loss": 0.0709, "step": 740 }, { "epoch": 0.503693754197448, "grad_norm": 5.522768020629883, "learning_rate": 4.686160826821276e-05, "loss": 0.1086, "step": 750 }, { "epoch": 0.5104096709200806, "grad_norm": 0.5437310338020325, "learning_rate": 4.6735568439626924e-05, "loss": 0.0835, "step": 760 }, { "epoch": 0.5171255876427132, "grad_norm": 1.4233680963516235, "learning_rate": 4.660952861104109e-05, "loss": 0.15, "step": 770 }, { "epoch": 0.5238415043653458, "grad_norm": 2.103144884109497, "learning_rate": 4.648348878245526e-05, "loss": 0.0615, "step": 780 }, { "epoch": 0.5305574210879785, "grad_norm": 4.456714630126953, "learning_rate": 4.6357448953869425e-05, "loss": 0.117, "step": 790 }, { "epoch": 0.5372733378106112, "grad_norm": 2.5074684619903564, "learning_rate": 4.623140912528359e-05, "loss": 0.1606, "step": 800 }, { "epoch": 0.5439892545332438, "grad_norm": 0.8483042120933533, "learning_rate": 4.610536929669776e-05, "loss": 0.0747, "step": 810 }, { "epoch": 0.5507051712558765, "grad_norm": 0.05906325578689575, "learning_rate": 4.5979329468111926e-05, "loss": 0.1355, "step": 820 }, { "epoch": 0.5574210879785091, "grad_norm": 0.8080152273178101, "learning_rate": 4.585328963952609e-05, "loss": 0.1001, "step": 830 }, { "epoch": 0.5641370047011417, "grad_norm": 0.1343095898628235, "learning_rate": 4.572724981094026e-05, "loss": 0.0376, "step": 840 }, { "epoch": 0.5708529214237743, "grad_norm": 1.397542119026184, "learning_rate": 4.560120998235443e-05, "loss": 0.1431, "step": 850 }, { "epoch": 0.577568838146407, "grad_norm": 0.5649687051773071, "learning_rate": 4.547517015376859e-05, "loss": 0.0864, "step": 860 }, { "epoch": 0.5842847548690396, "grad_norm": 1.3697500228881836, "learning_rate": 4.5349130325182757e-05, "loss": 0.0727, "step": 870 }, { "epoch": 0.5910006715916722, "grad_norm": 0.849629819393158, "learning_rate": 4.522309049659693e-05, "loss": 0.073, "step": 880 }, { "epoch": 0.5977165883143049, "grad_norm": 1.3777800798416138, "learning_rate": 4.509705066801109e-05, "loss": 0.1439, "step": 890 }, { "epoch": 0.6044325050369376, "grad_norm": 3.064931869506836, "learning_rate": 4.497101083942526e-05, "loss": 0.0923, "step": 900 }, { "epoch": 0.6111484217595702, "grad_norm": 0.47612619400024414, "learning_rate": 4.484497101083943e-05, "loss": 0.0279, "step": 910 }, { "epoch": 0.6178643384822028, "grad_norm": 0.2581987679004669, "learning_rate": 4.4718931182253594e-05, "loss": 0.0275, "step": 920 }, { "epoch": 0.6245802552048355, "grad_norm": 3.8201942443847656, "learning_rate": 4.459289135366776e-05, "loss": 0.0821, "step": 930 }, { "epoch": 0.6312961719274681, "grad_norm": 5.899015426635742, "learning_rate": 4.446685152508192e-05, "loss": 0.1238, "step": 940 }, { "epoch": 0.6380120886501007, "grad_norm": 0.38817426562309265, "learning_rate": 4.4340811696496094e-05, "loss": 0.1314, "step": 950 }, { "epoch": 0.6447280053727333, "grad_norm": 0.4328228235244751, "learning_rate": 4.421477186791026e-05, "loss": 0.0884, "step": 960 }, { "epoch": 0.6514439220953661, "grad_norm": 0.645894467830658, "learning_rate": 4.4088732039324424e-05, "loss": 0.0464, "step": 970 }, { "epoch": 0.6581598388179987, "grad_norm": 0.25149455666542053, "learning_rate": 4.3962692210738595e-05, "loss": 0.1324, "step": 980 }, { "epoch": 0.6648757555406313, "grad_norm": 3.0216193199157715, "learning_rate": 4.383665238215276e-05, "loss": 0.1317, "step": 990 }, { "epoch": 0.6715916722632639, "grad_norm": 0.4481722116470337, "learning_rate": 4.371061255356693e-05, "loss": 0.0304, "step": 1000 }, { "epoch": 0.6783075889858966, "grad_norm": 0.4348456859588623, "learning_rate": 4.3584572724981096e-05, "loss": 0.1263, "step": 1010 }, { "epoch": 0.6850235057085292, "grad_norm": 0.7903152108192444, "learning_rate": 4.345853289639527e-05, "loss": 0.0465, "step": 1020 }, { "epoch": 0.6917394224311618, "grad_norm": 0.6616799831390381, "learning_rate": 4.333249306780943e-05, "loss": 0.0438, "step": 1030 }, { "epoch": 0.6984553391537945, "grad_norm": 0.45970794558525085, "learning_rate": 4.32064532392236e-05, "loss": 0.0662, "step": 1040 }, { "epoch": 0.7051712558764271, "grad_norm": 1.471104621887207, "learning_rate": 4.308041341063777e-05, "loss": 0.0858, "step": 1050 }, { "epoch": 0.7118871725990598, "grad_norm": 8.536913871765137, "learning_rate": 4.295437358205193e-05, "loss": 0.0661, "step": 1060 }, { "epoch": 0.7186030893216924, "grad_norm": 0.17855380475521088, "learning_rate": 4.28283337534661e-05, "loss": 0.0604, "step": 1070 }, { "epoch": 0.7253190060443251, "grad_norm": 1.2315560579299927, "learning_rate": 4.270229392488026e-05, "loss": 0.0738, "step": 1080 }, { "epoch": 0.7320349227669577, "grad_norm": 2.4337143898010254, "learning_rate": 4.2576254096294434e-05, "loss": 0.111, "step": 1090 }, { "epoch": 0.7387508394895903, "grad_norm": 0.5771811604499817, "learning_rate": 4.24502142677086e-05, "loss": 0.0873, "step": 1100 }, { "epoch": 0.7454667562122229, "grad_norm": 0.29616767168045044, "learning_rate": 4.2324174439122764e-05, "loss": 0.1899, "step": 1110 }, { "epoch": 0.7521826729348556, "grad_norm": 0.005016247741878033, "learning_rate": 4.2198134610536935e-05, "loss": 0.0512, "step": 1120 }, { "epoch": 0.7588985896574882, "grad_norm": 0.6726338267326355, "learning_rate": 4.20720947819511e-05, "loss": 0.0749, "step": 1130 }, { "epoch": 0.7656145063801209, "grad_norm": 15.482983589172363, "learning_rate": 4.1946054953365264e-05, "loss": 0.1208, "step": 1140 }, { "epoch": 0.7723304231027536, "grad_norm": 0.13030461966991425, "learning_rate": 4.182001512477943e-05, "loss": 0.0716, "step": 1150 }, { "epoch": 0.7790463398253862, "grad_norm": 0.00033535558031871915, "learning_rate": 4.16939752961936e-05, "loss": 0.0575, "step": 1160 }, { "epoch": 0.7857622565480188, "grad_norm": 9.646067337598652e-05, "learning_rate": 4.1567935467607765e-05, "loss": 0.0491, "step": 1170 }, { "epoch": 0.7924781732706514, "grad_norm": 0.6216703653335571, "learning_rate": 4.144189563902193e-05, "loss": 0.1496, "step": 1180 }, { "epoch": 0.7991940899932841, "grad_norm": 0.1437370628118515, "learning_rate": 4.13158558104361e-05, "loss": 0.0467, "step": 1190 }, { "epoch": 0.8059100067159167, "grad_norm": 0.5845384001731873, "learning_rate": 4.1189815981850266e-05, "loss": 0.0423, "step": 1200 }, { "epoch": 0.8126259234385493, "grad_norm": 0.006311117671430111, "learning_rate": 4.106377615326443e-05, "loss": 0.0434, "step": 1210 }, { "epoch": 0.8193418401611821, "grad_norm": 0.28811392188072205, "learning_rate": 4.09377363246786e-05, "loss": 0.0401, "step": 1220 }, { "epoch": 0.8260577568838147, "grad_norm": 0.00534482067450881, "learning_rate": 4.081169649609277e-05, "loss": 0.0974, "step": 1230 }, { "epoch": 0.8327736736064473, "grad_norm": 0.3741544187068939, "learning_rate": 4.068565666750693e-05, "loss": 0.0457, "step": 1240 }, { "epoch": 0.8394895903290799, "grad_norm": 0.9101037383079529, "learning_rate": 4.0559616838921097e-05, "loss": 0.0547, "step": 1250 }, { "epoch": 0.8462055070517126, "grad_norm": 0.2954216003417969, "learning_rate": 4.043357701033527e-05, "loss": 0.0656, "step": 1260 }, { "epoch": 0.8529214237743452, "grad_norm": 1.8236368894577026, "learning_rate": 4.030753718174943e-05, "loss": 0.0607, "step": 1270 }, { "epoch": 0.8596373404969778, "grad_norm": 0.17372089624404907, "learning_rate": 4.01814973531636e-05, "loss": 0.0894, "step": 1280 }, { "epoch": 0.8663532572196104, "grad_norm": 0.0005210727686062455, "learning_rate": 4.005545752457777e-05, "loss": 0.0761, "step": 1290 }, { "epoch": 0.8730691739422431, "grad_norm": 0.29330241680145264, "learning_rate": 3.9929417695991934e-05, "loss": 0.0626, "step": 1300 }, { "epoch": 0.8797850906648758, "grad_norm": 0.7308351397514343, "learning_rate": 3.98033778674061e-05, "loss": 0.0754, "step": 1310 }, { "epoch": 0.8865010073875084, "grad_norm": 0.0010205712169408798, "learning_rate": 3.967733803882027e-05, "loss": 0.0991, "step": 1320 }, { "epoch": 0.8932169241101411, "grad_norm": 0.26492127776145935, "learning_rate": 3.955129821023444e-05, "loss": 0.0371, "step": 1330 }, { "epoch": 0.8999328408327737, "grad_norm": 0.3958211839199066, "learning_rate": 3.9425258381648606e-05, "loss": 0.1793, "step": 1340 }, { "epoch": 0.9066487575554063, "grad_norm": 0.006723370868712664, "learning_rate": 3.929921855306277e-05, "loss": 0.0385, "step": 1350 }, { "epoch": 0.9133646742780389, "grad_norm": 16.94601821899414, "learning_rate": 3.917317872447694e-05, "loss": 0.0664, "step": 1360 }, { "epoch": 0.9200805910006716, "grad_norm": 0.3626093864440918, "learning_rate": 3.904713889589111e-05, "loss": 0.0424, "step": 1370 }, { "epoch": 0.9267965077233042, "grad_norm": 0.347401887178421, "learning_rate": 3.892109906730527e-05, "loss": 0.0558, "step": 1380 }, { "epoch": 0.9335124244459369, "grad_norm": 0.5849874019622803, "learning_rate": 3.8795059238719436e-05, "loss": 0.0543, "step": 1390 }, { "epoch": 0.9402283411685695, "grad_norm": 0.0007323077879846096, "learning_rate": 3.866901941013361e-05, "loss": 0.065, "step": 1400 }, { "epoch": 0.9469442578912022, "grad_norm": 0.29866859316825867, "learning_rate": 3.854297958154777e-05, "loss": 0.083, "step": 1410 }, { "epoch": 0.9536601746138348, "grad_norm": 1.046910047531128, "learning_rate": 3.841693975296194e-05, "loss": 0.0795, "step": 1420 }, { "epoch": 0.9603760913364674, "grad_norm": 0.28778284788131714, "learning_rate": 3.829089992437611e-05, "loss": 0.121, "step": 1430 }, { "epoch": 0.9670920080591001, "grad_norm": 0.6510307192802429, "learning_rate": 3.816486009579027e-05, "loss": 0.0976, "step": 1440 }, { "epoch": 0.9738079247817327, "grad_norm": 0.7349569797515869, "learning_rate": 3.803882026720444e-05, "loss": 0.027, "step": 1450 }, { "epoch": 0.9805238415043653, "grad_norm": 0.41969776153564453, "learning_rate": 3.79127804386186e-05, "loss": 0.0609, "step": 1460 }, { "epoch": 0.9872397582269979, "grad_norm": 0.765001118183136, "learning_rate": 3.7786740610032774e-05, "loss": 0.0823, "step": 1470 }, { "epoch": 0.9939556749496307, "grad_norm": 0.5332460999488831, "learning_rate": 3.766070078144694e-05, "loss": 0.063, "step": 1480 }, { "epoch": 1.0, "eval_loss": 0.05031801387667656, "eval_runtime": 16.4028, "eval_samples_per_second": 362.988, "eval_steps_per_second": 5.731, "step": 1489 }, { "epoch": 1.0006715916722633, "grad_norm": 1.0223584175109863, "learning_rate": 3.7534660952861104e-05, "loss": 0.0612, "step": 1490 }, { "epoch": 1.007387508394896, "grad_norm": 0.2597563862800598, "learning_rate": 3.7408621124275275e-05, "loss": 0.0986, "step": 1500 }, { "epoch": 1.0141034251175285, "grad_norm": 0.4414527714252472, "learning_rate": 3.728258129568944e-05, "loss": 0.0544, "step": 1510 }, { "epoch": 1.0208193418401612, "grad_norm": 0.5101744532585144, "learning_rate": 3.7156541467103605e-05, "loss": 0.0549, "step": 1520 }, { "epoch": 1.027535258562794, "grad_norm": 0.695880651473999, "learning_rate": 3.7030501638517776e-05, "loss": 0.0429, "step": 1530 }, { "epoch": 1.0342511752854264, "grad_norm": 0.0017918674275279045, "learning_rate": 3.690446180993194e-05, "loss": 0.057, "step": 1540 }, { "epoch": 1.0409670920080591, "grad_norm": 5.918661117553711, "learning_rate": 3.6778421981346105e-05, "loss": 0.0428, "step": 1550 }, { "epoch": 1.0476830087306916, "grad_norm": 0.017390791326761246, "learning_rate": 3.665238215276027e-05, "loss": 0.107, "step": 1560 }, { "epoch": 1.0543989254533244, "grad_norm": 0.022303210571408272, "learning_rate": 3.652634232417444e-05, "loss": 0.0683, "step": 1570 }, { "epoch": 1.061114842175957, "grad_norm": 0.003013026202097535, "learning_rate": 3.6400302495588606e-05, "loss": 0.0675, "step": 1580 }, { "epoch": 1.0678307588985896, "grad_norm": 0.0006217322661541402, "learning_rate": 3.627426266700277e-05, "loss": 0.0552, "step": 1590 }, { "epoch": 1.0745466756212223, "grad_norm": 0.2726440131664276, "learning_rate": 3.614822283841694e-05, "loss": 0.0427, "step": 1600 }, { "epoch": 1.081262592343855, "grad_norm": 0.29946306347846985, "learning_rate": 3.602218300983111e-05, "loss": 0.0866, "step": 1610 }, { "epoch": 1.0879785090664875, "grad_norm": 0.0426226370036602, "learning_rate": 3.589614318124527e-05, "loss": 0.0447, "step": 1620 }, { "epoch": 1.0946944257891202, "grad_norm": 0.6733192801475525, "learning_rate": 3.5770103352659437e-05, "loss": 0.1095, "step": 1630 }, { "epoch": 1.1014103425117527, "grad_norm": 0.2110043466091156, "learning_rate": 3.564406352407361e-05, "loss": 0.0213, "step": 1640 }, { "epoch": 1.1081262592343855, "grad_norm": 0.0002463633718434721, "learning_rate": 3.551802369548777e-05, "loss": 0.0531, "step": 1650 }, { "epoch": 1.1148421759570182, "grad_norm": 0.660849392414093, "learning_rate": 3.5391983866901944e-05, "loss": 0.0565, "step": 1660 }, { "epoch": 1.1215580926796507, "grad_norm": 0.0015531358076259494, "learning_rate": 3.526594403831611e-05, "loss": 0.0355, "step": 1670 }, { "epoch": 1.1282740094022834, "grad_norm": 0.5349295735359192, "learning_rate": 3.513990420973028e-05, "loss": 0.0461, "step": 1680 }, { "epoch": 1.1349899261249161, "grad_norm": 0.0036038027610629797, "learning_rate": 3.5013864381144445e-05, "loss": 0.0672, "step": 1690 }, { "epoch": 1.1417058428475486, "grad_norm": 0.0011060705874115229, "learning_rate": 3.488782455255861e-05, "loss": 0.0501, "step": 1700 }, { "epoch": 1.1484217595701813, "grad_norm": 3.7792551517486572, "learning_rate": 3.476178472397278e-05, "loss": 0.0436, "step": 1710 }, { "epoch": 1.155137676292814, "grad_norm": 8.058547019958496, "learning_rate": 3.4635744895386946e-05, "loss": 0.0752, "step": 1720 }, { "epoch": 1.1618535930154466, "grad_norm": 0.0012638597982004285, "learning_rate": 3.450970506680111e-05, "loss": 0.0591, "step": 1730 }, { "epoch": 1.1685695097380793, "grad_norm": 0.0024126614443957806, "learning_rate": 3.438366523821528e-05, "loss": 0.0399, "step": 1740 }, { "epoch": 1.1752854264607118, "grad_norm": 3.4204914569854736, "learning_rate": 3.425762540962945e-05, "loss": 0.0456, "step": 1750 }, { "epoch": 1.1820013431833445, "grad_norm": 0.6280553340911865, "learning_rate": 3.413158558104361e-05, "loss": 0.0623, "step": 1760 }, { "epoch": 1.1887172599059772, "grad_norm": 0.0001932340528583154, "learning_rate": 3.4005545752457776e-05, "loss": 0.015, "step": 1770 }, { "epoch": 1.19543317662861, "grad_norm": 3.976154088973999, "learning_rate": 3.387950592387195e-05, "loss": 0.0464, "step": 1780 }, { "epoch": 1.2021490933512424, "grad_norm": 0.632788360118866, "learning_rate": 3.375346609528611e-05, "loss": 0.1215, "step": 1790 }, { "epoch": 1.2088650100738751, "grad_norm": 0.24529287219047546, "learning_rate": 3.362742626670028e-05, "loss": 0.0582, "step": 1800 }, { "epoch": 1.2155809267965076, "grad_norm": 0.42675378918647766, "learning_rate": 3.350138643811445e-05, "loss": 0.0812, "step": 1810 }, { "epoch": 1.2222968435191404, "grad_norm": 0.0022362687159329653, "learning_rate": 3.337534660952861e-05, "loss": 0.047, "step": 1820 }, { "epoch": 1.229012760241773, "grad_norm": 0.45256197452545166, "learning_rate": 3.324930678094278e-05, "loss": 0.0664, "step": 1830 }, { "epoch": 1.2357286769644056, "grad_norm": 0.8793488144874573, "learning_rate": 3.312326695235694e-05, "loss": 0.0452, "step": 1840 }, { "epoch": 1.2424445936870383, "grad_norm": 0.0005373602034524083, "learning_rate": 3.2997227123771114e-05, "loss": 0.0289, "step": 1850 }, { "epoch": 1.2491605104096708, "grad_norm": 1.7183722257614136, "learning_rate": 3.287118729518528e-05, "loss": 0.0455, "step": 1860 }, { "epoch": 1.2558764271323035, "grad_norm": 0.005168843548744917, "learning_rate": 3.2745147466599444e-05, "loss": 0.0554, "step": 1870 }, { "epoch": 1.2625923438549362, "grad_norm": 0.0013626981526613235, "learning_rate": 3.2619107638013615e-05, "loss": 0.0344, "step": 1880 }, { "epoch": 1.269308260577569, "grad_norm": 0.42641666531562805, "learning_rate": 3.249306780942778e-05, "loss": 0.0683, "step": 1890 }, { "epoch": 1.2760241773002015, "grad_norm": 0.005407288204878569, "learning_rate": 3.2367027980841945e-05, "loss": 0.0167, "step": 1900 }, { "epoch": 1.2827400940228342, "grad_norm": 0.00033558416180312634, "learning_rate": 3.2240988152256116e-05, "loss": 0.0603, "step": 1910 }, { "epoch": 1.2894560107454667, "grad_norm": 0.10696588456630707, "learning_rate": 3.211494832367028e-05, "loss": 0.0787, "step": 1920 }, { "epoch": 1.2961719274680994, "grad_norm": 0.7402423620223999, "learning_rate": 3.1988908495084445e-05, "loss": 0.1398, "step": 1930 }, { "epoch": 1.3028878441907321, "grad_norm": 0.8248059749603271, "learning_rate": 3.186286866649861e-05, "loss": 0.0925, "step": 1940 }, { "epoch": 1.3096037609133646, "grad_norm": 0.0007926832186058164, "learning_rate": 3.173682883791278e-05, "loss": 0.0942, "step": 1950 }, { "epoch": 1.3163196776359973, "grad_norm": 0.47744426131248474, "learning_rate": 3.1610789009326946e-05, "loss": 0.0348, "step": 1960 }, { "epoch": 1.3230355943586298, "grad_norm": 0.2431195080280304, "learning_rate": 3.148474918074111e-05, "loss": 0.0533, "step": 1970 }, { "epoch": 1.3297515110812625, "grad_norm": 3.9710357189178467, "learning_rate": 3.135870935215528e-05, "loss": 0.0338, "step": 1980 }, { "epoch": 1.3364674278038953, "grad_norm": 0.6269270181655884, "learning_rate": 3.1232669523569454e-05, "loss": 0.0741, "step": 1990 }, { "epoch": 1.343183344526528, "grad_norm": 0.004900393076241016, "learning_rate": 3.110662969498362e-05, "loss": 0.0309, "step": 2000 }, { "epoch": 1.3498992612491605, "grad_norm": 1.1476103067398071, "learning_rate": 3.098058986639778e-05, "loss": 0.047, "step": 2010 }, { "epoch": 1.3566151779717932, "grad_norm": 0.30575576424598694, "learning_rate": 3.0854550037811955e-05, "loss": 0.0541, "step": 2020 }, { "epoch": 1.3633310946944257, "grad_norm": 0.0030384438578039408, "learning_rate": 3.072851020922612e-05, "loss": 0.0721, "step": 2030 }, { "epoch": 1.3700470114170584, "grad_norm": 0.3193557560443878, "learning_rate": 3.0602470380640284e-05, "loss": 0.0818, "step": 2040 }, { "epoch": 1.3767629281396911, "grad_norm": 0.09504958987236023, "learning_rate": 3.0476430552054452e-05, "loss": 0.0238, "step": 2050 }, { "epoch": 1.3834788448623236, "grad_norm": 0.40825656056404114, "learning_rate": 3.035039072346862e-05, "loss": 0.0558, "step": 2060 }, { "epoch": 1.3901947615849564, "grad_norm": 0.08500930666923523, "learning_rate": 3.0224350894882785e-05, "loss": 0.0854, "step": 2070 }, { "epoch": 1.3969106783075889, "grad_norm": 0.9645543098449707, "learning_rate": 3.0098311066296953e-05, "loss": 0.0704, "step": 2080 }, { "epoch": 1.4036265950302216, "grad_norm": 0.5297892093658447, "learning_rate": 2.9972271237711118e-05, "loss": 0.0528, "step": 2090 }, { "epoch": 1.4103425117528543, "grad_norm": 0.002040713559836149, "learning_rate": 2.9846231409125286e-05, "loss": 0.064, "step": 2100 }, { "epoch": 1.417058428475487, "grad_norm": 0.0018752739997580647, "learning_rate": 2.9720191580539454e-05, "loss": 0.0442, "step": 2110 }, { "epoch": 1.4237743451981195, "grad_norm": 0.012829877436161041, "learning_rate": 2.959415175195362e-05, "loss": 0.0457, "step": 2120 }, { "epoch": 1.4304902619207522, "grad_norm": 0.7869836688041687, "learning_rate": 2.9468111923367787e-05, "loss": 0.0562, "step": 2130 }, { "epoch": 1.4372061786433847, "grad_norm": 0.3505788743495941, "learning_rate": 2.934207209478195e-05, "loss": 0.0734, "step": 2140 }, { "epoch": 1.4439220953660175, "grad_norm": 0.35704436898231506, "learning_rate": 2.921603226619612e-05, "loss": 0.055, "step": 2150 }, { "epoch": 1.4506380120886502, "grad_norm": 0.45379963517189026, "learning_rate": 2.9089992437610288e-05, "loss": 0.0495, "step": 2160 }, { "epoch": 1.4573539288112827, "grad_norm": 0.7454732656478882, "learning_rate": 2.8963952609024453e-05, "loss": 0.085, "step": 2170 }, { "epoch": 1.4640698455339154, "grad_norm": 0.04314453899860382, "learning_rate": 2.883791278043862e-05, "loss": 0.0336, "step": 2180 }, { "epoch": 1.470785762256548, "grad_norm": 0.306495726108551, "learning_rate": 2.8711872951852785e-05, "loss": 0.0458, "step": 2190 }, { "epoch": 1.4775016789791806, "grad_norm": 0.3177057206630707, "learning_rate": 2.8585833123266953e-05, "loss": 0.09, "step": 2200 }, { "epoch": 1.4842175957018133, "grad_norm": 0.4207700192928314, "learning_rate": 2.8459793294681118e-05, "loss": 0.0506, "step": 2210 }, { "epoch": 1.490933512424446, "grad_norm": 0.43287593126296997, "learning_rate": 2.8333753466095286e-05, "loss": 0.042, "step": 2220 }, { "epoch": 1.4976494291470785, "grad_norm": 0.6717241406440735, "learning_rate": 2.8207713637509454e-05, "loss": 0.0523, "step": 2230 }, { "epoch": 1.5043653458697113, "grad_norm": 0.00016471787239424884, "learning_rate": 2.808167380892362e-05, "loss": 0.0308, "step": 2240 }, { "epoch": 1.5110812625923438, "grad_norm": 0.32060420513153076, "learning_rate": 2.7955633980337787e-05, "loss": 0.0945, "step": 2250 }, { "epoch": 1.5177971793149765, "grad_norm": 0.399581640958786, "learning_rate": 2.7829594151751952e-05, "loss": 0.0374, "step": 2260 }, { "epoch": 1.5245130960376092, "grad_norm": 0.6255159378051758, "learning_rate": 2.770355432316612e-05, "loss": 0.049, "step": 2270 }, { "epoch": 1.531229012760242, "grad_norm": 0.005870606750249863, "learning_rate": 2.7577514494580288e-05, "loss": 0.0183, "step": 2280 }, { "epoch": 1.5379449294828744, "grad_norm": 9.138748282566667e-05, "learning_rate": 2.7451474665994453e-05, "loss": 0.0429, "step": 2290 }, { "epoch": 1.544660846205507, "grad_norm": 0.780626654624939, "learning_rate": 2.732543483740862e-05, "loss": 0.0528, "step": 2300 }, { "epoch": 1.5513767629281396, "grad_norm": 0.28625041246414185, "learning_rate": 2.7199395008822785e-05, "loss": 0.0555, "step": 2310 }, { "epoch": 1.5580926796507724, "grad_norm": 0.6077707409858704, "learning_rate": 2.707335518023696e-05, "loss": 0.0651, "step": 2320 }, { "epoch": 1.564808596373405, "grad_norm": 0.005245072301477194, "learning_rate": 2.6947315351651125e-05, "loss": 0.0269, "step": 2330 }, { "epoch": 1.5715245130960376, "grad_norm": 0.3782106935977936, "learning_rate": 2.6821275523065293e-05, "loss": 0.0622, "step": 2340 }, { "epoch": 1.5782404298186703, "grad_norm": 1.065748929977417, "learning_rate": 2.6695235694479458e-05, "loss": 0.0855, "step": 2350 }, { "epoch": 1.5849563465413028, "grad_norm": 0.4613993763923645, "learning_rate": 2.6569195865893626e-05, "loss": 0.0365, "step": 2360 }, { "epoch": 1.5916722632639355, "grad_norm": 0.3238326907157898, "learning_rate": 2.6443156037307794e-05, "loss": 0.0357, "step": 2370 }, { "epoch": 1.5983881799865682, "grad_norm": 0.0309292059391737, "learning_rate": 2.631711620872196e-05, "loss": 0.228, "step": 2380 }, { "epoch": 1.605104096709201, "grad_norm": 13.997108459472656, "learning_rate": 2.6191076380136127e-05, "loss": 0.0675, "step": 2390 }, { "epoch": 1.6118200134318335, "grad_norm": 0.4026775658130646, "learning_rate": 2.606503655155029e-05, "loss": 0.1093, "step": 2400 }, { "epoch": 1.618535930154466, "grad_norm": 0.29253387451171875, "learning_rate": 2.593899672296446e-05, "loss": 0.0617, "step": 2410 }, { "epoch": 1.6252518468770987, "grad_norm": 0.017194174230098724, "learning_rate": 2.5812956894378624e-05, "loss": 0.0371, "step": 2420 }, { "epoch": 1.6319677635997314, "grad_norm": 0.2723110020160675, "learning_rate": 2.5686917065792792e-05, "loss": 0.0377, "step": 2430 }, { "epoch": 1.6386836803223641, "grad_norm": 1.9029347896575928, "learning_rate": 2.556087723720696e-05, "loss": 0.062, "step": 2440 }, { "epoch": 1.6453995970449966, "grad_norm": 0.6120497584342957, "learning_rate": 2.5434837408621125e-05, "loss": 0.0396, "step": 2450 }, { "epoch": 1.6521155137676293, "grad_norm": 0.012274952605366707, "learning_rate": 2.5308797580035293e-05, "loss": 0.1047, "step": 2460 }, { "epoch": 1.6588314304902618, "grad_norm": 0.0005646342178806663, "learning_rate": 2.5182757751449458e-05, "loss": 0.0295, "step": 2470 }, { "epoch": 1.6655473472128945, "grad_norm": 0.2949657738208771, "learning_rate": 2.5056717922863626e-05, "loss": 0.0687, "step": 2480 }, { "epoch": 1.6722632639355273, "grad_norm": 0.0009046673658303916, "learning_rate": 2.4930678094277794e-05, "loss": 0.0363, "step": 2490 }, { "epoch": 1.67897918065816, "grad_norm": 0.666671633720398, "learning_rate": 2.480463826569196e-05, "loss": 0.0456, "step": 2500 }, { "epoch": 1.6856950973807925, "grad_norm": 0.40098175406455994, "learning_rate": 2.4678598437106127e-05, "loss": 0.046, "step": 2510 }, { "epoch": 1.692411014103425, "grad_norm": 0.43292441964149475, "learning_rate": 2.455255860852029e-05, "loss": 0.0506, "step": 2520 }, { "epoch": 1.6991269308260577, "grad_norm": 0.9216532707214355, "learning_rate": 2.442651877993446e-05, "loss": 0.0481, "step": 2530 }, { "epoch": 1.7058428475486904, "grad_norm": 46.244651794433594, "learning_rate": 2.4300478951348628e-05, "loss": 0.0886, "step": 2540 }, { "epoch": 1.7125587642713231, "grad_norm": 0.6050248742103577, "learning_rate": 2.4174439122762793e-05, "loss": 0.0557, "step": 2550 }, { "epoch": 1.7192746809939556, "grad_norm": 1.492427110671997, "learning_rate": 2.404839929417696e-05, "loss": 0.0499, "step": 2560 }, { "epoch": 1.7259905977165884, "grad_norm": 2.4528143405914307, "learning_rate": 2.392235946559113e-05, "loss": 0.0938, "step": 2570 }, { "epoch": 1.7327065144392209, "grad_norm": 0.029096094891428947, "learning_rate": 2.3796319637005297e-05, "loss": 0.0313, "step": 2580 }, { "epoch": 1.7394224311618536, "grad_norm": 0.3839016556739807, "learning_rate": 2.367027980841946e-05, "loss": 0.0503, "step": 2590 }, { "epoch": 1.7461383478844863, "grad_norm": 1.5034056901931763, "learning_rate": 2.354423997983363e-05, "loss": 0.0269, "step": 2600 }, { "epoch": 1.752854264607119, "grad_norm": 0.0012208319967612624, "learning_rate": 2.3418200151247798e-05, "loss": 0.0609, "step": 2610 }, { "epoch": 1.7595701813297515, "grad_norm": 0.9314192533493042, "learning_rate": 2.3292160322661962e-05, "loss": 0.0475, "step": 2620 }, { "epoch": 1.766286098052384, "grad_norm": 0.16898050904273987, "learning_rate": 2.316612049407613e-05, "loss": 0.078, "step": 2630 }, { "epoch": 1.7730020147750167, "grad_norm": 0.6302085518836975, "learning_rate": 2.3040080665490295e-05, "loss": 0.173, "step": 2640 }, { "epoch": 1.7797179314976495, "grad_norm": 0.01881493628025055, "learning_rate": 2.2914040836904463e-05, "loss": 0.0291, "step": 2650 }, { "epoch": 1.7864338482202822, "grad_norm": 0.1634562611579895, "learning_rate": 2.2788001008318628e-05, "loss": 0.0672, "step": 2660 }, { "epoch": 1.7931497649429147, "grad_norm": 0.2767401933670044, "learning_rate": 2.2661961179732796e-05, "loss": 0.052, "step": 2670 }, { "epoch": 1.7998656816655474, "grad_norm": 0.5256266593933105, "learning_rate": 2.2535921351146964e-05, "loss": 0.0783, "step": 2680 }, { "epoch": 1.8065815983881799, "grad_norm": 0.006557038053870201, "learning_rate": 2.240988152256113e-05, "loss": 0.0281, "step": 2690 }, { "epoch": 1.8132975151108126, "grad_norm": 0.5430310368537903, "learning_rate": 2.2283841693975297e-05, "loss": 0.0274, "step": 2700 }, { "epoch": 1.8200134318334453, "grad_norm": 0.0013151871971786022, "learning_rate": 2.215780186538946e-05, "loss": 0.0453, "step": 2710 }, { "epoch": 1.826729348556078, "grad_norm": 3.120368480682373, "learning_rate": 2.203176203680363e-05, "loss": 0.06, "step": 2720 }, { "epoch": 1.8334452652787105, "grad_norm": 0.46205762028694153, "learning_rate": 2.1905722208217798e-05, "loss": 0.1034, "step": 2730 }, { "epoch": 1.840161182001343, "grad_norm": 0.6960582137107849, "learning_rate": 2.1779682379631966e-05, "loss": 0.0706, "step": 2740 }, { "epoch": 1.8468770987239758, "grad_norm": 0.552689254283905, "learning_rate": 2.1653642551046134e-05, "loss": 0.0281, "step": 2750 }, { "epoch": 1.8535930154466085, "grad_norm": 0.37117066979408264, "learning_rate": 2.15276027224603e-05, "loss": 0.0655, "step": 2760 }, { "epoch": 1.8603089321692412, "grad_norm": 0.3859846293926239, "learning_rate": 2.1401562893874467e-05, "loss": 0.0769, "step": 2770 }, { "epoch": 1.8670248488918737, "grad_norm": 0.47629514336586, "learning_rate": 2.127552306528863e-05, "loss": 0.0398, "step": 2780 }, { "epoch": 1.8737407656145064, "grad_norm": 0.5712143182754517, "learning_rate": 2.11494832367028e-05, "loss": 0.0549, "step": 2790 }, { "epoch": 1.880456682337139, "grad_norm": 1.7003332376480103, "learning_rate": 2.1023443408116968e-05, "loss": 0.0624, "step": 2800 }, { "epoch": 1.8871725990597716, "grad_norm": 0.46252167224884033, "learning_rate": 2.0897403579531132e-05, "loss": 0.0584, "step": 2810 }, { "epoch": 1.8938885157824044, "grad_norm": 4.596211910247803, "learning_rate": 2.07713637509453e-05, "loss": 0.1001, "step": 2820 }, { "epoch": 1.900604432505037, "grad_norm": 0.0004219062684569508, "learning_rate": 2.0645323922359465e-05, "loss": 0.0698, "step": 2830 }, { "epoch": 1.9073203492276696, "grad_norm": 0.5564634799957275, "learning_rate": 2.0519284093773633e-05, "loss": 0.0587, "step": 2840 }, { "epoch": 1.914036265950302, "grad_norm": 0.3310578763484955, "learning_rate": 2.03932442651878e-05, "loss": 0.0921, "step": 2850 }, { "epoch": 1.9207521826729348, "grad_norm": 0.4154551327228546, "learning_rate": 2.0267204436601966e-05, "loss": 0.0653, "step": 2860 }, { "epoch": 1.9274680993955675, "grad_norm": 0.30634987354278564, "learning_rate": 2.0141164608016134e-05, "loss": 0.0439, "step": 2870 }, { "epoch": 1.9341840161182002, "grad_norm": 0.353935569524765, "learning_rate": 2.00151247794303e-05, "loss": 0.0453, "step": 2880 }, { "epoch": 1.9408999328408327, "grad_norm": 0.009207702241837978, "learning_rate": 1.9889084950844467e-05, "loss": 0.019, "step": 2890 }, { "epoch": 1.9476158495634655, "grad_norm": 0.5521604418754578, "learning_rate": 1.9763045122258635e-05, "loss": 0.1271, "step": 2900 }, { "epoch": 1.954331766286098, "grad_norm": 4.1649088859558105, "learning_rate": 1.9637005293672803e-05, "loss": 0.0635, "step": 2910 }, { "epoch": 1.9610476830087307, "grad_norm": 0.36337295174598694, "learning_rate": 1.9510965465086968e-05, "loss": 0.0704, "step": 2920 }, { "epoch": 1.9677635997313634, "grad_norm": 0.1291392743587494, "learning_rate": 1.9384925636501136e-05, "loss": 0.046, "step": 2930 }, { "epoch": 1.974479516453996, "grad_norm": 0.011192429810762405, "learning_rate": 1.9258885807915304e-05, "loss": 0.0733, "step": 2940 }, { "epoch": 1.9811954331766286, "grad_norm": 0.048028428107500076, "learning_rate": 1.913284597932947e-05, "loss": 0.0419, "step": 2950 }, { "epoch": 1.987911349899261, "grad_norm": 0.3492966890335083, "learning_rate": 1.9006806150743637e-05, "loss": 0.0585, "step": 2960 }, { "epoch": 1.9946272666218938, "grad_norm": 0.00014002685202285647, "learning_rate": 1.88807663221578e-05, "loss": 0.0612, "step": 2970 }, { "epoch": 2.0, "eval_loss": 0.05646410211920738, "eval_runtime": 13.5636, "eval_samples_per_second": 438.969, "eval_steps_per_second": 6.93, "step": 2978 }, { "epoch": 2.0013431833445265, "grad_norm": 0.3761807680130005, "learning_rate": 1.875472649357197e-05, "loss": 0.0616, "step": 2980 }, { "epoch": 2.0080591000671593, "grad_norm": 0.015631867572665215, "learning_rate": 1.8628686664986138e-05, "loss": 0.0211, "step": 2990 }, { "epoch": 2.014775016789792, "grad_norm": 0.02187258005142212, "learning_rate": 1.8502646836400302e-05, "loss": 0.0486, "step": 3000 }, { "epoch": 2.0214909335124243, "grad_norm": 0.28977006673812866, "learning_rate": 1.837660700781447e-05, "loss": 0.0663, "step": 3010 }, { "epoch": 2.028206850235057, "grad_norm": 0.006137068383395672, "learning_rate": 1.8250567179228635e-05, "loss": 0.034, "step": 3020 }, { "epoch": 2.0349227669576897, "grad_norm": 0.022373627871274948, "learning_rate": 1.8124527350642803e-05, "loss": 0.0503, "step": 3030 }, { "epoch": 2.0416386836803224, "grad_norm": 0.017447108402848244, "learning_rate": 1.799848752205697e-05, "loss": 0.0307, "step": 3040 }, { "epoch": 2.048354600402955, "grad_norm": 0.09448742121458054, "learning_rate": 1.7872447693471136e-05, "loss": 0.0401, "step": 3050 }, { "epoch": 2.055070517125588, "grad_norm": 0.5452361702919006, "learning_rate": 1.7746407864885304e-05, "loss": 0.0496, "step": 3060 }, { "epoch": 2.06178643384822, "grad_norm": 0.5851243138313293, "learning_rate": 1.7620368036299472e-05, "loss": 0.0307, "step": 3070 }, { "epoch": 2.068502350570853, "grad_norm": 0.1558593511581421, "learning_rate": 1.749432820771364e-05, "loss": 0.0437, "step": 3080 }, { "epoch": 2.0752182672934856, "grad_norm": 0.4523046612739563, "learning_rate": 1.7368288379127805e-05, "loss": 0.0318, "step": 3090 }, { "epoch": 2.0819341840161183, "grad_norm": 0.0008133721421472728, "learning_rate": 1.7242248550541973e-05, "loss": 0.0968, "step": 3100 }, { "epoch": 2.088650100738751, "grad_norm": 0.28596845269203186, "learning_rate": 1.7116208721956138e-05, "loss": 0.0308, "step": 3110 }, { "epoch": 2.0953660174613833, "grad_norm": 0.0006714498158544302, "learning_rate": 1.6990168893370306e-05, "loss": 0.0451, "step": 3120 }, { "epoch": 2.102081934184016, "grad_norm": 0.3977266550064087, "learning_rate": 1.6864129064784474e-05, "loss": 0.0308, "step": 3130 }, { "epoch": 2.1087978509066487, "grad_norm": 1.4212539196014404, "learning_rate": 1.673808923619864e-05, "loss": 0.0518, "step": 3140 }, { "epoch": 2.1155137676292814, "grad_norm": 11.565996170043945, "learning_rate": 1.6612049407612807e-05, "loss": 0.0819, "step": 3150 }, { "epoch": 2.122229684351914, "grad_norm": 2.776099920272827, "learning_rate": 1.648600957902697e-05, "loss": 0.055, "step": 3160 }, { "epoch": 2.1289456010745464, "grad_norm": 0.0052488865330815315, "learning_rate": 1.635996975044114e-05, "loss": 0.0485, "step": 3170 }, { "epoch": 2.135661517797179, "grad_norm": 0.7042301893234253, "learning_rate": 1.6233929921855308e-05, "loss": 0.0454, "step": 3180 }, { "epoch": 2.142377434519812, "grad_norm": 0.3006635308265686, "learning_rate": 1.6107890093269472e-05, "loss": 0.0377, "step": 3190 }, { "epoch": 2.1490933512424446, "grad_norm": 1.699796199798584, "learning_rate": 1.598185026468364e-05, "loss": 0.0731, "step": 3200 }, { "epoch": 2.1558092679650773, "grad_norm": 0.20682166516780853, "learning_rate": 1.5855810436097805e-05, "loss": 0.0413, "step": 3210 }, { "epoch": 2.16252518468771, "grad_norm": 0.0004898414481431246, "learning_rate": 1.5729770607511973e-05, "loss": 0.1173, "step": 3220 }, { "epoch": 2.1692411014103423, "grad_norm": 0.0038841627538204193, "learning_rate": 1.560373077892614e-05, "loss": 0.0435, "step": 3230 }, { "epoch": 2.175957018132975, "grad_norm": 0.014045453630387783, "learning_rate": 1.547769095034031e-05, "loss": 0.0457, "step": 3240 }, { "epoch": 2.1826729348556078, "grad_norm": 0.010443809442222118, "learning_rate": 1.5351651121754478e-05, "loss": 0.0456, "step": 3250 }, { "epoch": 2.1893888515782405, "grad_norm": 0.0003762775450013578, "learning_rate": 1.5225611293168642e-05, "loss": 0.0493, "step": 3260 }, { "epoch": 2.196104768300873, "grad_norm": 0.3689648509025574, "learning_rate": 1.509957146458281e-05, "loss": 0.0494, "step": 3270 }, { "epoch": 2.2028206850235055, "grad_norm": 5.371860243030824e-05, "learning_rate": 1.4973531635996977e-05, "loss": 0.0415, "step": 3280 }, { "epoch": 2.209536601746138, "grad_norm": 0.3309116065502167, "learning_rate": 1.4847491807411143e-05, "loss": 0.0428, "step": 3290 }, { "epoch": 2.216252518468771, "grad_norm": 0.4156160056591034, "learning_rate": 1.472145197882531e-05, "loss": 0.0647, "step": 3300 }, { "epoch": 2.2229684351914036, "grad_norm": 0.44069477915763855, "learning_rate": 1.4595412150239476e-05, "loss": 0.0266, "step": 3310 }, { "epoch": 2.2296843519140364, "grad_norm": 4.7501349449157715, "learning_rate": 1.4469372321653642e-05, "loss": 0.0934, "step": 3320 }, { "epoch": 2.236400268636669, "grad_norm": 4.7144985728664324e-05, "learning_rate": 1.434333249306781e-05, "loss": 0.0488, "step": 3330 }, { "epoch": 2.2431161853593014, "grad_norm": 0.0005517873796634376, "learning_rate": 1.4217292664481977e-05, "loss": 0.1232, "step": 3340 }, { "epoch": 2.249832102081934, "grad_norm": 0.046205148100852966, "learning_rate": 1.4091252835896143e-05, "loss": 0.1431, "step": 3350 }, { "epoch": 2.256548018804567, "grad_norm": 0.06340274959802628, "learning_rate": 1.396521300731031e-05, "loss": 0.0771, "step": 3360 }, { "epoch": 2.2632639355271995, "grad_norm": 0.38464516401290894, "learning_rate": 1.3839173178724476e-05, "loss": 0.0383, "step": 3370 }, { "epoch": 2.2699798522498322, "grad_norm": 1.4843387603759766, "learning_rate": 1.3713133350138642e-05, "loss": 0.0477, "step": 3380 }, { "epoch": 2.2766957689724645, "grad_norm": 0.006335641257464886, "learning_rate": 1.358709352155281e-05, "loss": 0.0583, "step": 3390 }, { "epoch": 2.2834116856950972, "grad_norm": 0.009423257783055305, "learning_rate": 1.3461053692966979e-05, "loss": 0.0493, "step": 3400 }, { "epoch": 2.29012760241773, "grad_norm": 0.7405962347984314, "learning_rate": 1.3335013864381147e-05, "loss": 0.0444, "step": 3410 }, { "epoch": 2.2968435191403627, "grad_norm": 0.13406887650489807, "learning_rate": 1.3208974035795313e-05, "loss": 0.0185, "step": 3420 }, { "epoch": 2.3035594358629954, "grad_norm": 1.9850287437438965, "learning_rate": 1.308293420720948e-05, "loss": 0.0305, "step": 3430 }, { "epoch": 2.310275352585628, "grad_norm": 0.35775044560432434, "learning_rate": 1.2956894378623646e-05, "loss": 0.0604, "step": 3440 }, { "epoch": 2.316991269308261, "grad_norm": 0.46501487493515015, "learning_rate": 1.2830854550037812e-05, "loss": 0.0569, "step": 3450 }, { "epoch": 2.323707186030893, "grad_norm": 0.0007875201408751309, "learning_rate": 1.270481472145198e-05, "loss": 0.0305, "step": 3460 }, { "epoch": 2.330423102753526, "grad_norm": 0.0034116168972104788, "learning_rate": 1.2578774892866147e-05, "loss": 0.0443, "step": 3470 }, { "epoch": 2.3371390194761585, "grad_norm": 0.4020250141620636, "learning_rate": 1.2452735064280313e-05, "loss": 0.0364, "step": 3480 }, { "epoch": 2.3438549361987913, "grad_norm": 0.6396084427833557, "learning_rate": 1.232669523569448e-05, "loss": 0.0335, "step": 3490 }, { "epoch": 2.3505708529214235, "grad_norm": 0.002272074343636632, "learning_rate": 1.2200655407108646e-05, "loss": 0.0348, "step": 3500 }, { "epoch": 2.3572867696440563, "grad_norm": 0.36031338572502136, "learning_rate": 1.2074615578522812e-05, "loss": 0.0211, "step": 3510 }, { "epoch": 2.364002686366689, "grad_norm": 1.6808172464370728, "learning_rate": 1.194857574993698e-05, "loss": 0.0838, "step": 3520 }, { "epoch": 2.3707186030893217, "grad_norm": 0.00011936118971789256, "learning_rate": 1.1822535921351149e-05, "loss": 0.0281, "step": 3530 }, { "epoch": 2.3774345198119544, "grad_norm": 0.0034045290667563677, "learning_rate": 1.1696496092765315e-05, "loss": 0.0384, "step": 3540 }, { "epoch": 2.384150436534587, "grad_norm": 3.1306815799325705e-05, "learning_rate": 1.1570456264179481e-05, "loss": 0.026, "step": 3550 }, { "epoch": 2.39086635325722, "grad_norm": 0.03722585365176201, "learning_rate": 1.1444416435593648e-05, "loss": 0.055, "step": 3560 }, { "epoch": 2.397582269979852, "grad_norm": 0.00011495844955788925, "learning_rate": 1.1318376607007814e-05, "loss": 0.0821, "step": 3570 }, { "epoch": 2.404298186702485, "grad_norm": 0.21863265335559845, "learning_rate": 1.1192336778421982e-05, "loss": 0.0207, "step": 3580 }, { "epoch": 2.4110141034251176, "grad_norm": 0.3169233798980713, "learning_rate": 1.1066296949836149e-05, "loss": 0.0398, "step": 3590 }, { "epoch": 2.4177300201477503, "grad_norm": 0.39215508103370667, "learning_rate": 1.0940257121250315e-05, "loss": 0.0436, "step": 3600 }, { "epoch": 2.4244459368703826, "grad_norm": 0.9112158417701721, "learning_rate": 1.0814217292664483e-05, "loss": 0.059, "step": 3610 }, { "epoch": 2.4311618535930153, "grad_norm": 0.2396714836359024, "learning_rate": 1.068817746407865e-05, "loss": 0.1052, "step": 3620 }, { "epoch": 2.437877770315648, "grad_norm": 0.48746079206466675, "learning_rate": 1.0562137635492816e-05, "loss": 0.1301, "step": 3630 }, { "epoch": 2.4445936870382807, "grad_norm": 0.2512687146663666, "learning_rate": 1.0436097806906982e-05, "loss": 0.0395, "step": 3640 }, { "epoch": 2.4513096037609134, "grad_norm": 0.0035686984192579985, "learning_rate": 1.031005797832115e-05, "loss": 0.0353, "step": 3650 }, { "epoch": 2.458025520483546, "grad_norm": 0.5211601257324219, "learning_rate": 1.0184018149735317e-05, "loss": 0.0351, "step": 3660 }, { "epoch": 2.464741437206179, "grad_norm": 0.0005140244611538947, "learning_rate": 1.0057978321149483e-05, "loss": 0.024, "step": 3670 }, { "epoch": 2.471457353928811, "grad_norm": 0.6637148261070251, "learning_rate": 9.93193849256365e-06, "loss": 0.0451, "step": 3680 }, { "epoch": 2.478173270651444, "grad_norm": 1.191238522529602, "learning_rate": 9.805898663977818e-06, "loss": 0.0392, "step": 3690 }, { "epoch": 2.4848891873740766, "grad_norm": 0.48035892844200134, "learning_rate": 9.679858835391984e-06, "loss": 0.071, "step": 3700 }, { "epoch": 2.4916051040967093, "grad_norm": 0.0007876435411162674, "learning_rate": 9.553819006806152e-06, "loss": 0.0841, "step": 3710 }, { "epoch": 2.4983210208193416, "grad_norm": 7.469072341918945, "learning_rate": 9.427779178220319e-06, "loss": 0.0629, "step": 3720 }, { "epoch": 2.5050369375419743, "grad_norm": 0.004849706310778856, "learning_rate": 9.301739349634485e-06, "loss": 0.0444, "step": 3730 }, { "epoch": 2.511752854264607, "grad_norm": 0.26340028643608093, "learning_rate": 9.175699521048651e-06, "loss": 0.0256, "step": 3740 }, { "epoch": 2.5184687709872398, "grad_norm": 0.5283430218696594, "learning_rate": 9.049659692462818e-06, "loss": 0.0177, "step": 3750 }, { "epoch": 2.5251846877098725, "grad_norm": 2.114301605615765e-05, "learning_rate": 8.923619863876984e-06, "loss": 0.0215, "step": 3760 }, { "epoch": 2.531900604432505, "grad_norm": 0.432597279548645, "learning_rate": 8.797580035291152e-06, "loss": 0.0784, "step": 3770 }, { "epoch": 2.538616521155138, "grad_norm": 0.00014320577611215413, "learning_rate": 8.67154020670532e-06, "loss": 0.0582, "step": 3780 }, { "epoch": 2.54533243787777, "grad_norm": 0.00046885263873264194, "learning_rate": 8.545500378119487e-06, "loss": 0.0253, "step": 3790 }, { "epoch": 2.552048354600403, "grad_norm": 0.0021923587191849947, "learning_rate": 8.419460549533653e-06, "loss": 0.0598, "step": 3800 }, { "epoch": 2.5587642713230356, "grad_norm": 0.3422527015209198, "learning_rate": 8.29342072094782e-06, "loss": 0.0442, "step": 3810 }, { "epoch": 2.5654801880456684, "grad_norm": 0.006465827114880085, "learning_rate": 8.167380892361986e-06, "loss": 0.0194, "step": 3820 }, { "epoch": 2.5721961047683006, "grad_norm": 9.287328430218622e-05, "learning_rate": 8.041341063776154e-06, "loss": 0.0813, "step": 3830 }, { "epoch": 2.5789120214909333, "grad_norm": 1.295121669769287, "learning_rate": 7.91530123519032e-06, "loss": 0.0261, "step": 3840 }, { "epoch": 2.585627938213566, "grad_norm": 1.8356350660324097, "learning_rate": 7.789261406604489e-06, "loss": 0.0862, "step": 3850 }, { "epoch": 2.592343854936199, "grad_norm": 0.40983912348747253, "learning_rate": 7.663221578018655e-06, "loss": 0.0302, "step": 3860 }, { "epoch": 2.5990597716588315, "grad_norm": 0.001640897593460977, "learning_rate": 7.537181749432821e-06, "loss": 0.0264, "step": 3870 }, { "epoch": 2.6057756883814642, "grad_norm": 2.8107948310207576e-05, "learning_rate": 7.4111419208469886e-06, "loss": 0.0615, "step": 3880 }, { "epoch": 2.612491605104097, "grad_norm": 3.114527862635441e-05, "learning_rate": 7.285102092261155e-06, "loss": 0.0594, "step": 3890 }, { "epoch": 2.6192075218267292, "grad_norm": 0.10608917474746704, "learning_rate": 7.159062263675321e-06, "loss": 0.0329, "step": 3900 }, { "epoch": 2.625923438549362, "grad_norm": 0.0014250510139390826, "learning_rate": 7.033022435089489e-06, "loss": 0.0758, "step": 3910 }, { "epoch": 2.6326393552719947, "grad_norm": 0.3084910213947296, "learning_rate": 6.906982606503655e-06, "loss": 0.0241, "step": 3920 }, { "epoch": 2.6393552719946274, "grad_norm": 2.416365168755874e-05, "learning_rate": 6.780942777917821e-06, "loss": 0.0247, "step": 3930 }, { "epoch": 2.6460711887172597, "grad_norm": 0.45002567768096924, "learning_rate": 6.6549029493319895e-06, "loss": 0.0558, "step": 3940 }, { "epoch": 2.6527871054398924, "grad_norm": 2.132117986679077, "learning_rate": 6.528863120746157e-06, "loss": 0.0902, "step": 3950 }, { "epoch": 2.659503022162525, "grad_norm": 0.6513732075691223, "learning_rate": 6.402823292160323e-06, "loss": 0.0566, "step": 3960 }, { "epoch": 2.666218938885158, "grad_norm": 0.0017292428528890014, "learning_rate": 6.2767834635744895e-06, "loss": 0.0324, "step": 3970 }, { "epoch": 2.6729348556077905, "grad_norm": 0.3677961826324463, "learning_rate": 6.150743634988657e-06, "loss": 0.0529, "step": 3980 }, { "epoch": 2.6796507723304233, "grad_norm": 0.43927228450775146, "learning_rate": 6.024703806402823e-06, "loss": 0.0385, "step": 3990 }, { "epoch": 2.686366689053056, "grad_norm": 0.47322943806648254, "learning_rate": 5.89866397781699e-06, "loss": 0.0735, "step": 4000 }, { "epoch": 2.6930826057756883, "grad_norm": 0.00010313421807950363, "learning_rate": 5.772624149231158e-06, "loss": 0.062, "step": 4010 }, { "epoch": 2.699798522498321, "grad_norm": 2.2094976902008057, "learning_rate": 5.646584320645324e-06, "loss": 0.0495, "step": 4020 }, { "epoch": 2.7065144392209537, "grad_norm": 5.995333776809275e-05, "learning_rate": 5.5205444920594905e-06, "loss": 0.0336, "step": 4030 }, { "epoch": 2.7132303559435864, "grad_norm": 0.1308954805135727, "learning_rate": 5.3945046634736586e-06, "loss": 0.0457, "step": 4040 }, { "epoch": 2.7199462726662187, "grad_norm": 0.10739518702030182, "learning_rate": 5.268464834887825e-06, "loss": 0.0205, "step": 4050 }, { "epoch": 2.7266621893888514, "grad_norm": 0.45175987482070923, "learning_rate": 5.142425006301991e-06, "loss": 0.0293, "step": 4060 }, { "epoch": 2.733378106111484, "grad_norm": 0.4527464210987091, "learning_rate": 5.016385177716159e-06, "loss": 0.049, "step": 4070 }, { "epoch": 2.740094022834117, "grad_norm": 0.9216799736022949, "learning_rate": 4.890345349130326e-06, "loss": 0.0472, "step": 4080 }, { "epoch": 2.7468099395567496, "grad_norm": 0.4577857553958893, "learning_rate": 4.764305520544492e-06, "loss": 0.0317, "step": 4090 }, { "epoch": 2.7535258562793823, "grad_norm": 0.2905367910861969, "learning_rate": 4.6382656919586595e-06, "loss": 0.0304, "step": 4100 }, { "epoch": 2.760241773002015, "grad_norm": 2.5396127700805664, "learning_rate": 4.512225863372826e-06, "loss": 0.0583, "step": 4110 }, { "epoch": 2.7669576897246473, "grad_norm": 0.0009265750413760543, "learning_rate": 4.386186034786993e-06, "loss": 0.0323, "step": 4120 }, { "epoch": 2.77367360644728, "grad_norm": 0.8636244535446167, "learning_rate": 4.2601462062011595e-06, "loss": 0.074, "step": 4130 }, { "epoch": 2.7803895231699127, "grad_norm": 0.0012430851347744465, "learning_rate": 4.134106377615327e-06, "loss": 0.03, "step": 4140 }, { "epoch": 2.7871054398925454, "grad_norm": 0.567550778388977, "learning_rate": 4.008066549029493e-06, "loss": 0.0888, "step": 4150 }, { "epoch": 2.7938213566151777, "grad_norm": 0.0012551085092127323, "learning_rate": 3.88202672044366e-06, "loss": 0.078, "step": 4160 }, { "epoch": 2.8005372733378104, "grad_norm": 0.6136677265167236, "learning_rate": 3.7559868918578272e-06, "loss": 0.0619, "step": 4170 }, { "epoch": 2.807253190060443, "grad_norm": 0.34767717123031616, "learning_rate": 3.629947063271994e-06, "loss": 0.0572, "step": 4180 }, { "epoch": 2.813969106783076, "grad_norm": 0.26787152886390686, "learning_rate": 3.503907234686161e-06, "loss": 0.0449, "step": 4190 }, { "epoch": 2.8206850235057086, "grad_norm": 0.8297312259674072, "learning_rate": 3.377867406100328e-06, "loss": 0.0545, "step": 4200 }, { "epoch": 2.8274009402283413, "grad_norm": 8.094152144622058e-05, "learning_rate": 3.251827577514495e-06, "loss": 0.0491, "step": 4210 }, { "epoch": 2.834116856950974, "grad_norm": 2.410463571548462, "learning_rate": 3.1257877489286617e-06, "loss": 0.0564, "step": 4220 }, { "epoch": 2.8408327736736063, "grad_norm": 0.9952247142791748, "learning_rate": 2.999747920342828e-06, "loss": 0.0714, "step": 4230 }, { "epoch": 2.847548690396239, "grad_norm": 0.4516734480857849, "learning_rate": 2.8737080917569954e-06, "loss": 0.047, "step": 4240 }, { "epoch": 2.8542646071188718, "grad_norm": 0.3798590898513794, "learning_rate": 2.747668263171162e-06, "loss": 0.0654, "step": 4250 }, { "epoch": 2.8609805238415045, "grad_norm": 1.9686492681503296, "learning_rate": 2.621628434585329e-06, "loss": 0.0626, "step": 4260 }, { "epoch": 2.8676964405641368, "grad_norm": 0.4292641580104828, "learning_rate": 2.495588605999496e-06, "loss": 0.0485, "step": 4270 }, { "epoch": 2.8744123572867695, "grad_norm": 5.976020111120306e-05, "learning_rate": 2.3695487774136627e-06, "loss": 0.0338, "step": 4280 }, { "epoch": 2.881128274009402, "grad_norm": 0.027319636195898056, "learning_rate": 2.2435089488278295e-06, "loss": 0.0465, "step": 4290 }, { "epoch": 2.887844190732035, "grad_norm": 0.37028828263282776, "learning_rate": 2.1174691202419967e-06, "loss": 0.0311, "step": 4300 }, { "epoch": 2.8945601074546676, "grad_norm": 0.44889381527900696, "learning_rate": 1.991429291656163e-06, "loss": 0.0414, "step": 4310 }, { "epoch": 2.9012760241773004, "grad_norm": 0.5002055168151855, "learning_rate": 1.8653894630703304e-06, "loss": 0.07, "step": 4320 }, { "epoch": 2.907991940899933, "grad_norm": 0.003884148085489869, "learning_rate": 1.739349634484497e-06, "loss": 0.0356, "step": 4330 }, { "epoch": 2.9147078576225653, "grad_norm": 0.35928064584732056, "learning_rate": 1.613309805898664e-06, "loss": 0.0508, "step": 4340 }, { "epoch": 2.921423774345198, "grad_norm": 1.2552955150604248, "learning_rate": 1.487269977312831e-06, "loss": 0.0427, "step": 4350 }, { "epoch": 2.928139691067831, "grad_norm": 0.007359993644058704, "learning_rate": 1.3612301487269979e-06, "loss": 0.0253, "step": 4360 }, { "epoch": 2.9348556077904635, "grad_norm": 1.1389840841293335, "learning_rate": 1.2351903201411647e-06, "loss": 0.0632, "step": 4370 }, { "epoch": 2.941571524513096, "grad_norm": 0.8267861008644104, "learning_rate": 1.1091504915553315e-06, "loss": 0.0396, "step": 4380 }, { "epoch": 2.9482874412357285, "grad_norm": 3.601860284805298, "learning_rate": 9.831106629694983e-07, "loss": 0.0484, "step": 4390 }, { "epoch": 2.955003357958361, "grad_norm": 0.8058955073356628, "learning_rate": 8.570708343836652e-07, "loss": 0.0346, "step": 4400 }, { "epoch": 2.961719274680994, "grad_norm": 0.0007326535996980965, "learning_rate": 7.310310057978322e-07, "loss": 0.0317, "step": 4410 }, { "epoch": 2.9684351914036267, "grad_norm": 0.9190682768821716, "learning_rate": 6.04991177211999e-07, "loss": 0.0232, "step": 4420 }, { "epoch": 2.9751511081262594, "grad_norm": 0.0039444975554943085, "learning_rate": 4.789513486261659e-07, "loss": 0.0685, "step": 4430 }, { "epoch": 2.981867024848892, "grad_norm": 0.9105575680732727, "learning_rate": 3.5291152004033275e-07, "loss": 0.0481, "step": 4440 }, { "epoch": 2.9885829415715244, "grad_norm": 0.0033778748475015163, "learning_rate": 2.2687169145449963e-07, "loss": 0.0524, "step": 4450 }, { "epoch": 2.995298858294157, "grad_norm": 0.5608463287353516, "learning_rate": 1.008318628686665e-07, "loss": 0.0465, "step": 4460 } ], "logging_steps": 10, "max_steps": 4467, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1521654161384448.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }