{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991537376586743, "eval_steps": 500, "global_step": 2658, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011283497884344146, "grad_norm": 1.3125232159346416, "learning_rate": 5e-06, "loss": 0.7474, "step": 10 }, { "epoch": 0.022566995768688293, "grad_norm": 0.9167355748966123, "learning_rate": 5e-06, "loss": 0.6794, "step": 20 }, { "epoch": 0.03385049365303244, "grad_norm": 0.8469930147086754, "learning_rate": 5e-06, "loss": 0.6697, "step": 30 }, { "epoch": 0.045133991537376586, "grad_norm": 0.8935332426600987, "learning_rate": 5e-06, "loss": 0.6592, "step": 40 }, { "epoch": 0.056417489421720736, "grad_norm": 0.7558955578947845, "learning_rate": 5e-06, "loss": 0.6527, "step": 50 }, { "epoch": 0.06770098730606489, "grad_norm": 0.8864690987221431, "learning_rate": 5e-06, "loss": 0.6546, "step": 60 }, { "epoch": 0.07898448519040903, "grad_norm": 0.7695810970926475, "learning_rate": 5e-06, "loss": 0.632, "step": 70 }, { "epoch": 0.09026798307475317, "grad_norm": 0.7714248271685794, "learning_rate": 5e-06, "loss": 0.636, "step": 80 }, { "epoch": 0.10155148095909731, "grad_norm": 0.8886760851072192, "learning_rate": 5e-06, "loss": 0.6225, "step": 90 }, { "epoch": 0.11283497884344147, "grad_norm": 0.8974890384086033, "learning_rate": 5e-06, "loss": 0.6362, "step": 100 }, { "epoch": 0.12411847672778561, "grad_norm": 0.8819677575312233, "learning_rate": 5e-06, "loss": 0.6325, "step": 110 }, { "epoch": 0.13540197461212977, "grad_norm": 0.9251296507200821, "learning_rate": 5e-06, "loss": 0.6271, "step": 120 }, { "epoch": 0.1466854724964739, "grad_norm": 0.7702797791218077, "learning_rate": 5e-06, "loss": 0.6273, "step": 130 }, { "epoch": 0.15796897038081806, "grad_norm": 0.8095360026454328, "learning_rate": 5e-06, "loss": 0.6252, "step": 140 }, { "epoch": 0.1692524682651622, "grad_norm": 0.8204620911060526, "learning_rate": 5e-06, "loss": 0.6287, "step": 150 }, { "epoch": 0.18053596614950634, "grad_norm": 0.8182785549342615, "learning_rate": 5e-06, "loss": 0.6284, "step": 160 }, { "epoch": 0.1918194640338505, "grad_norm": 0.7445113899986885, "learning_rate": 5e-06, "loss": 0.6264, "step": 170 }, { "epoch": 0.20310296191819463, "grad_norm": 0.7873061135697372, "learning_rate": 5e-06, "loss": 0.6253, "step": 180 }, { "epoch": 0.2143864598025388, "grad_norm": 0.7956486177510658, "learning_rate": 5e-06, "loss": 0.6264, "step": 190 }, { "epoch": 0.22566995768688294, "grad_norm": 0.7746629467371798, "learning_rate": 5e-06, "loss": 0.6238, "step": 200 }, { "epoch": 0.23695345557122707, "grad_norm": 0.7345245715769874, "learning_rate": 5e-06, "loss": 0.619, "step": 210 }, { "epoch": 0.24823695345557123, "grad_norm": 0.8349914408696147, "learning_rate": 5e-06, "loss": 0.6145, "step": 220 }, { "epoch": 0.25952045133991536, "grad_norm": 0.7331223816734604, "learning_rate": 5e-06, "loss": 0.6179, "step": 230 }, { "epoch": 0.27080394922425954, "grad_norm": 0.7535227854265615, "learning_rate": 5e-06, "loss": 0.615, "step": 240 }, { "epoch": 0.2820874471086037, "grad_norm": 0.9217132891288368, "learning_rate": 5e-06, "loss": 0.614, "step": 250 }, { "epoch": 0.2933709449929478, "grad_norm": 0.7626220728005123, "learning_rate": 5e-06, "loss": 0.6207, "step": 260 }, { "epoch": 0.304654442877292, "grad_norm": 0.7567534233021369, "learning_rate": 5e-06, "loss": 0.6138, "step": 270 }, { "epoch": 0.3159379407616361, "grad_norm": 0.7740686670455319, "learning_rate": 5e-06, "loss": 0.6107, "step": 280 }, { "epoch": 0.32722143864598024, "grad_norm": 0.7577709244521811, "learning_rate": 5e-06, "loss": 0.614, "step": 290 }, { "epoch": 0.3385049365303244, "grad_norm": 0.7294703403964556, "learning_rate": 5e-06, "loss": 0.6113, "step": 300 }, { "epoch": 0.34978843441466856, "grad_norm": 0.7503750502849057, "learning_rate": 5e-06, "loss": 0.6122, "step": 310 }, { "epoch": 0.3610719322990127, "grad_norm": 0.7103206649111901, "learning_rate": 5e-06, "loss": 0.6167, "step": 320 }, { "epoch": 0.3723554301833568, "grad_norm": 0.7626842348184225, "learning_rate": 5e-06, "loss": 0.606, "step": 330 }, { "epoch": 0.383638928067701, "grad_norm": 0.7621571567885804, "learning_rate": 5e-06, "loss": 0.6155, "step": 340 }, { "epoch": 0.39492242595204513, "grad_norm": 0.7549313486626996, "learning_rate": 5e-06, "loss": 0.6172, "step": 350 }, { "epoch": 0.40620592383638926, "grad_norm": 0.782988336813104, "learning_rate": 5e-06, "loss": 0.5985, "step": 360 }, { "epoch": 0.41748942172073344, "grad_norm": 0.7863471438503445, "learning_rate": 5e-06, "loss": 0.6154, "step": 370 }, { "epoch": 0.4287729196050776, "grad_norm": 0.7640237197989878, "learning_rate": 5e-06, "loss": 0.5897, "step": 380 }, { "epoch": 0.4400564174894217, "grad_norm": 0.8024119449518029, "learning_rate": 5e-06, "loss": 0.6157, "step": 390 }, { "epoch": 0.4513399153737659, "grad_norm": 0.720556179289527, "learning_rate": 5e-06, "loss": 0.603, "step": 400 }, { "epoch": 0.46262341325811, "grad_norm": 0.76294955883197, "learning_rate": 5e-06, "loss": 0.6061, "step": 410 }, { "epoch": 0.47390691114245415, "grad_norm": 0.8156507807690674, "learning_rate": 5e-06, "loss": 0.6051, "step": 420 }, { "epoch": 0.48519040902679833, "grad_norm": 0.7521066252629773, "learning_rate": 5e-06, "loss": 0.5976, "step": 430 }, { "epoch": 0.49647390691114246, "grad_norm": 0.7133025964230046, "learning_rate": 5e-06, "loss": 0.5968, "step": 440 }, { "epoch": 0.5077574047954866, "grad_norm": 0.7925716834523774, "learning_rate": 5e-06, "loss": 0.6096, "step": 450 }, { "epoch": 0.5190409026798307, "grad_norm": 0.7100847121605542, "learning_rate": 5e-06, "loss": 0.6032, "step": 460 }, { "epoch": 0.5303244005641748, "grad_norm": 0.6929396371646737, "learning_rate": 5e-06, "loss": 0.6002, "step": 470 }, { "epoch": 0.5416078984485191, "grad_norm": 0.6977506618129865, "learning_rate": 5e-06, "loss": 0.6104, "step": 480 }, { "epoch": 0.5528913963328632, "grad_norm": 0.7147611855456376, "learning_rate": 5e-06, "loss": 0.5912, "step": 490 }, { "epoch": 0.5641748942172073, "grad_norm": 0.7042203040407636, "learning_rate": 5e-06, "loss": 0.6079, "step": 500 }, { "epoch": 0.5754583921015515, "grad_norm": 0.7309189788815705, "learning_rate": 5e-06, "loss": 0.6036, "step": 510 }, { "epoch": 0.5867418899858956, "grad_norm": 0.7248169421227385, "learning_rate": 5e-06, "loss": 0.6003, "step": 520 }, { "epoch": 0.5980253878702397, "grad_norm": 0.7572409467891201, "learning_rate": 5e-06, "loss": 0.5916, "step": 530 }, { "epoch": 0.609308885754584, "grad_norm": 0.733437644851395, "learning_rate": 5e-06, "loss": 0.6036, "step": 540 }, { "epoch": 0.6205923836389281, "grad_norm": 0.7131751072814058, "learning_rate": 5e-06, "loss": 0.6034, "step": 550 }, { "epoch": 0.6318758815232722, "grad_norm": 0.7360178195388309, "learning_rate": 5e-06, "loss": 0.605, "step": 560 }, { "epoch": 0.6431593794076164, "grad_norm": 1.1495804042684488, "learning_rate": 5e-06, "loss": 0.5974, "step": 570 }, { "epoch": 0.6544428772919605, "grad_norm": 0.6817754751813853, "learning_rate": 5e-06, "loss": 0.5977, "step": 580 }, { "epoch": 0.6657263751763046, "grad_norm": 0.7044965353014948, "learning_rate": 5e-06, "loss": 0.607, "step": 590 }, { "epoch": 0.6770098730606487, "grad_norm": 0.7308565798791333, "learning_rate": 5e-06, "loss": 0.6015, "step": 600 }, { "epoch": 0.688293370944993, "grad_norm": 0.758452633023999, "learning_rate": 5e-06, "loss": 0.603, "step": 610 }, { "epoch": 0.6995768688293371, "grad_norm": 0.7778955204205954, "learning_rate": 5e-06, "loss": 0.5924, "step": 620 }, { "epoch": 0.7108603667136812, "grad_norm": 0.6757727892125893, "learning_rate": 5e-06, "loss": 0.5999, "step": 630 }, { "epoch": 0.7221438645980254, "grad_norm": 0.7690880111005869, "learning_rate": 5e-06, "loss": 0.6008, "step": 640 }, { "epoch": 0.7334273624823695, "grad_norm": 0.780886462765786, "learning_rate": 5e-06, "loss": 0.5954, "step": 650 }, { "epoch": 0.7447108603667136, "grad_norm": 0.7683226572424213, "learning_rate": 5e-06, "loss": 0.5975, "step": 660 }, { "epoch": 0.7559943582510579, "grad_norm": 0.6840538894373641, "learning_rate": 5e-06, "loss": 0.6023, "step": 670 }, { "epoch": 0.767277856135402, "grad_norm": 0.6771027958757195, "learning_rate": 5e-06, "loss": 0.59, "step": 680 }, { "epoch": 0.7785613540197461, "grad_norm": 0.6681056791937571, "learning_rate": 5e-06, "loss": 0.5931, "step": 690 }, { "epoch": 0.7898448519040903, "grad_norm": 0.7086908628376102, "learning_rate": 5e-06, "loss": 0.6005, "step": 700 }, { "epoch": 0.8011283497884344, "grad_norm": 0.6592220118037463, "learning_rate": 5e-06, "loss": 0.5888, "step": 710 }, { "epoch": 0.8124118476727785, "grad_norm": 0.7384276235727564, "learning_rate": 5e-06, "loss": 0.6029, "step": 720 }, { "epoch": 0.8236953455571228, "grad_norm": 0.7153557202506454, "learning_rate": 5e-06, "loss": 0.6004, "step": 730 }, { "epoch": 0.8349788434414669, "grad_norm": 0.7412356079152558, "learning_rate": 5e-06, "loss": 0.6062, "step": 740 }, { "epoch": 0.846262341325811, "grad_norm": 0.8186167983419891, "learning_rate": 5e-06, "loss": 0.6012, "step": 750 }, { "epoch": 0.8575458392101551, "grad_norm": 0.6956555496968945, "learning_rate": 5e-06, "loss": 0.5986, "step": 760 }, { "epoch": 0.8688293370944993, "grad_norm": 0.6972125762170825, "learning_rate": 5e-06, "loss": 0.598, "step": 770 }, { "epoch": 0.8801128349788434, "grad_norm": 0.6554398016756771, "learning_rate": 5e-06, "loss": 0.5899, "step": 780 }, { "epoch": 0.8913963328631875, "grad_norm": 0.6816513719411913, "learning_rate": 5e-06, "loss": 0.5959, "step": 790 }, { "epoch": 0.9026798307475318, "grad_norm": 0.6802192397813065, "learning_rate": 5e-06, "loss": 0.5966, "step": 800 }, { "epoch": 0.9139633286318759, "grad_norm": 0.7297229988637639, "learning_rate": 5e-06, "loss": 0.5977, "step": 810 }, { "epoch": 0.92524682651622, "grad_norm": 0.6910581824985036, "learning_rate": 5e-06, "loss": 0.5903, "step": 820 }, { "epoch": 0.9365303244005642, "grad_norm": 0.7038633597805812, "learning_rate": 5e-06, "loss": 0.5926, "step": 830 }, { "epoch": 0.9478138222849083, "grad_norm": 0.7006043004126095, "learning_rate": 5e-06, "loss": 0.5968, "step": 840 }, { "epoch": 0.9590973201692524, "grad_norm": 0.7196243633360835, "learning_rate": 5e-06, "loss": 0.5946, "step": 850 }, { "epoch": 0.9703808180535967, "grad_norm": 1.0955172142857887, "learning_rate": 5e-06, "loss": 0.5837, "step": 860 }, { "epoch": 0.9816643159379408, "grad_norm": 0.6592667550168434, "learning_rate": 5e-06, "loss": 0.5998, "step": 870 }, { "epoch": 0.9929478138222849, "grad_norm": 0.7854190040474681, "learning_rate": 5e-06, "loss": 0.5875, "step": 880 }, { "epoch": 0.9997179125528914, "eval_loss": 0.5916627049446106, "eval_runtime": 700.0009, "eval_samples_per_second": 17.059, "eval_steps_per_second": 0.534, "step": 886 }, { "epoch": 1.004231311706629, "grad_norm": 1.3251196950195474, "learning_rate": 5e-06, "loss": 0.6038, "step": 890 }, { "epoch": 1.0155148095909732, "grad_norm": 0.833678301438908, "learning_rate": 5e-06, "loss": 0.5204, "step": 900 }, { "epoch": 1.0267983074753173, "grad_norm": 0.7372319799588725, "learning_rate": 5e-06, "loss": 0.5098, "step": 910 }, { "epoch": 1.0380818053596614, "grad_norm": 0.710656408718745, "learning_rate": 5e-06, "loss": 0.5147, "step": 920 }, { "epoch": 1.0493653032440056, "grad_norm": 0.7035392178179493, "learning_rate": 5e-06, "loss": 0.532, "step": 930 }, { "epoch": 1.0606488011283497, "grad_norm": 0.7419000790657035, "learning_rate": 5e-06, "loss": 0.5232, "step": 940 }, { "epoch": 1.071932299012694, "grad_norm": 0.7507810642090355, "learning_rate": 5e-06, "loss": 0.5182, "step": 950 }, { "epoch": 1.0832157968970382, "grad_norm": 0.7365569865429179, "learning_rate": 5e-06, "loss": 0.5156, "step": 960 }, { "epoch": 1.0944992947813823, "grad_norm": 0.6971352350986588, "learning_rate": 5e-06, "loss": 0.5255, "step": 970 }, { "epoch": 1.1057827926657264, "grad_norm": 0.769651560789747, "learning_rate": 5e-06, "loss": 0.5275, "step": 980 }, { "epoch": 1.1170662905500706, "grad_norm": 0.6835338919436766, "learning_rate": 5e-06, "loss": 0.5266, "step": 990 }, { "epoch": 1.1283497884344147, "grad_norm": 0.7080901866255427, "learning_rate": 5e-06, "loss": 0.5245, "step": 1000 }, { "epoch": 1.1396332863187588, "grad_norm": 0.6973746558937527, "learning_rate": 5e-06, "loss": 0.5272, "step": 1010 }, { "epoch": 1.150916784203103, "grad_norm": 0.7328115120631007, "learning_rate": 5e-06, "loss": 0.5269, "step": 1020 }, { "epoch": 1.162200282087447, "grad_norm": 0.8161495623001298, "learning_rate": 5e-06, "loss": 0.5226, "step": 1030 }, { "epoch": 1.1734837799717912, "grad_norm": 0.8517029792820613, "learning_rate": 5e-06, "loss": 0.5175, "step": 1040 }, { "epoch": 1.1847672778561353, "grad_norm": 0.8438046028102798, "learning_rate": 5e-06, "loss": 0.5241, "step": 1050 }, { "epoch": 1.1960507757404795, "grad_norm": 0.7099751086024491, "learning_rate": 5e-06, "loss": 0.5291, "step": 1060 }, { "epoch": 1.2073342736248236, "grad_norm": 0.7580945476663854, "learning_rate": 5e-06, "loss": 0.5249, "step": 1070 }, { "epoch": 1.2186177715091677, "grad_norm": 0.7667303387873934, "learning_rate": 5e-06, "loss": 0.5235, "step": 1080 }, { "epoch": 1.229901269393512, "grad_norm": 0.8114242325537707, "learning_rate": 5e-06, "loss": 0.5283, "step": 1090 }, { "epoch": 1.2411847672778562, "grad_norm": 0.7131407385877792, "learning_rate": 5e-06, "loss": 0.5288, "step": 1100 }, { "epoch": 1.2524682651622003, "grad_norm": 0.685086959113009, "learning_rate": 5e-06, "loss": 0.5195, "step": 1110 }, { "epoch": 1.2637517630465445, "grad_norm": 0.7053119373842179, "learning_rate": 5e-06, "loss": 0.542, "step": 1120 }, { "epoch": 1.2750352609308886, "grad_norm": 0.7644874615248024, "learning_rate": 5e-06, "loss": 0.5287, "step": 1130 }, { "epoch": 1.2863187588152327, "grad_norm": 0.6883885545058486, "learning_rate": 5e-06, "loss": 0.5267, "step": 1140 }, { "epoch": 1.2976022566995769, "grad_norm": 0.7306504262086587, "learning_rate": 5e-06, "loss": 0.5289, "step": 1150 }, { "epoch": 1.308885754583921, "grad_norm": 0.9640801871844217, "learning_rate": 5e-06, "loss": 0.5267, "step": 1160 }, { "epoch": 1.320169252468265, "grad_norm": 0.661755370313437, "learning_rate": 5e-06, "loss": 0.5305, "step": 1170 }, { "epoch": 1.3314527503526092, "grad_norm": 0.6875142102900156, "learning_rate": 5e-06, "loss": 0.5262, "step": 1180 }, { "epoch": 1.3427362482369536, "grad_norm": 0.7205563505642903, "learning_rate": 5e-06, "loss": 0.5391, "step": 1190 }, { "epoch": 1.3540197461212977, "grad_norm": 0.6599447213858599, "learning_rate": 5e-06, "loss": 0.5272, "step": 1200 }, { "epoch": 1.3653032440056418, "grad_norm": 0.6674411183277559, "learning_rate": 5e-06, "loss": 0.5294, "step": 1210 }, { "epoch": 1.376586741889986, "grad_norm": 0.6760191535339451, "learning_rate": 5e-06, "loss": 0.5237, "step": 1220 }, { "epoch": 1.38787023977433, "grad_norm": 0.699499504851807, "learning_rate": 5e-06, "loss": 0.5239, "step": 1230 }, { "epoch": 1.3991537376586742, "grad_norm": 0.6949474979507084, "learning_rate": 5e-06, "loss": 0.5371, "step": 1240 }, { "epoch": 1.4104372355430184, "grad_norm": 0.6871033689397662, "learning_rate": 5e-06, "loss": 0.5346, "step": 1250 }, { "epoch": 1.4217207334273625, "grad_norm": 0.6742402949788701, "learning_rate": 5e-06, "loss": 0.5239, "step": 1260 }, { "epoch": 1.4330042313117066, "grad_norm": 0.6751463140934231, "learning_rate": 5e-06, "loss": 0.5266, "step": 1270 }, { "epoch": 1.4442877291960508, "grad_norm": 0.7142787617460878, "learning_rate": 5e-06, "loss": 0.5315, "step": 1280 }, { "epoch": 1.4555712270803949, "grad_norm": 0.7226332853252287, "learning_rate": 5e-06, "loss": 0.5275, "step": 1290 }, { "epoch": 1.466854724964739, "grad_norm": 0.7145272993925487, "learning_rate": 5e-06, "loss": 0.5322, "step": 1300 }, { "epoch": 1.4781382228490831, "grad_norm": 0.6768426299144826, "learning_rate": 5e-06, "loss": 0.538, "step": 1310 }, { "epoch": 1.4894217207334273, "grad_norm": 0.7123747155692624, "learning_rate": 5e-06, "loss": 0.5289, "step": 1320 }, { "epoch": 1.5007052186177714, "grad_norm": 0.6786058643003756, "learning_rate": 5e-06, "loss": 0.5253, "step": 1330 }, { "epoch": 1.5119887165021155, "grad_norm": 0.7110831777325413, "learning_rate": 5e-06, "loss": 0.5324, "step": 1340 }, { "epoch": 1.5232722143864597, "grad_norm": 0.6695015578694526, "learning_rate": 5e-06, "loss": 0.5214, "step": 1350 }, { "epoch": 1.5345557122708038, "grad_norm": 0.7459448089167867, "learning_rate": 5e-06, "loss": 0.5252, "step": 1360 }, { "epoch": 1.5458392101551481, "grad_norm": 0.6753928189856611, "learning_rate": 5e-06, "loss": 0.5251, "step": 1370 }, { "epoch": 1.5571227080394923, "grad_norm": 0.6996235594913326, "learning_rate": 5e-06, "loss": 0.5248, "step": 1380 }, { "epoch": 1.5684062059238364, "grad_norm": 0.7079009762601143, "learning_rate": 5e-06, "loss": 0.5323, "step": 1390 }, { "epoch": 1.5796897038081805, "grad_norm": 0.6924093724518422, "learning_rate": 5e-06, "loss": 0.5255, "step": 1400 }, { "epoch": 1.5909732016925247, "grad_norm": 0.6761094320283147, "learning_rate": 5e-06, "loss": 0.5339, "step": 1410 }, { "epoch": 1.6022566995768688, "grad_norm": 0.6809745203094293, "learning_rate": 5e-06, "loss": 0.5298, "step": 1420 }, { "epoch": 1.6135401974612131, "grad_norm": 0.7106432021864519, "learning_rate": 5e-06, "loss": 0.5478, "step": 1430 }, { "epoch": 1.6248236953455573, "grad_norm": 0.7187158548818587, "learning_rate": 5e-06, "loss": 0.531, "step": 1440 }, { "epoch": 1.6361071932299014, "grad_norm": 0.6596453734369295, "learning_rate": 5e-06, "loss": 0.5225, "step": 1450 }, { "epoch": 1.6473906911142455, "grad_norm": 0.7093693747872939, "learning_rate": 5e-06, "loss": 0.5257, "step": 1460 }, { "epoch": 1.6586741889985896, "grad_norm": 0.6944427657225931, "learning_rate": 5e-06, "loss": 0.5412, "step": 1470 }, { "epoch": 1.6699576868829338, "grad_norm": 0.6955533636685318, "learning_rate": 5e-06, "loss": 0.5367, "step": 1480 }, { "epoch": 1.681241184767278, "grad_norm": 0.6624317919452952, "learning_rate": 5e-06, "loss": 0.5293, "step": 1490 }, { "epoch": 1.692524682651622, "grad_norm": 0.6778095946034755, "learning_rate": 5e-06, "loss": 0.5323, "step": 1500 }, { "epoch": 1.7038081805359662, "grad_norm": 0.7341427766426889, "learning_rate": 5e-06, "loss": 0.5263, "step": 1510 }, { "epoch": 1.7150916784203103, "grad_norm": 0.7432696239071007, "learning_rate": 5e-06, "loss": 0.5334, "step": 1520 }, { "epoch": 1.7263751763046544, "grad_norm": 0.6491261448207155, "learning_rate": 5e-06, "loss": 0.5305, "step": 1530 }, { "epoch": 1.7376586741889986, "grad_norm": 0.7486777638151452, "learning_rate": 5e-06, "loss": 0.5304, "step": 1540 }, { "epoch": 1.7489421720733427, "grad_norm": 0.7118918086770966, "learning_rate": 5e-06, "loss": 0.5216, "step": 1550 }, { "epoch": 1.7602256699576868, "grad_norm": 0.7038340173438765, "learning_rate": 5e-06, "loss": 0.5322, "step": 1560 }, { "epoch": 1.771509167842031, "grad_norm": 0.6539818459517368, "learning_rate": 5e-06, "loss": 0.5321, "step": 1570 }, { "epoch": 1.782792665726375, "grad_norm": 0.658011536602541, "learning_rate": 5e-06, "loss": 0.5288, "step": 1580 }, { "epoch": 1.7940761636107192, "grad_norm": 0.6572168627274099, "learning_rate": 5e-06, "loss": 0.5219, "step": 1590 }, { "epoch": 1.8053596614950633, "grad_norm": 0.6648318139285605, "learning_rate": 5e-06, "loss": 0.5292, "step": 1600 }, { "epoch": 1.8166431593794075, "grad_norm": 0.6932570407391014, "learning_rate": 5e-06, "loss": 0.5381, "step": 1610 }, { "epoch": 1.8279266572637518, "grad_norm": 0.6710478896264537, "learning_rate": 5e-06, "loss": 0.5344, "step": 1620 }, { "epoch": 1.839210155148096, "grad_norm": 0.6710698360135149, "learning_rate": 5e-06, "loss": 0.5201, "step": 1630 }, { "epoch": 1.85049365303244, "grad_norm": 0.6664339220436462, "learning_rate": 5e-06, "loss": 0.5272, "step": 1640 }, { "epoch": 1.8617771509167842, "grad_norm": 0.679324908548248, "learning_rate": 5e-06, "loss": 0.5381, "step": 1650 }, { "epoch": 1.8730606488011283, "grad_norm": 0.7203049853389839, "learning_rate": 5e-06, "loss": 0.5325, "step": 1660 }, { "epoch": 1.8843441466854725, "grad_norm": 0.671958846252959, "learning_rate": 5e-06, "loss": 0.5308, "step": 1670 }, { "epoch": 1.8956276445698168, "grad_norm": 0.6756200949168255, "learning_rate": 5e-06, "loss": 0.5401, "step": 1680 }, { "epoch": 1.906911142454161, "grad_norm": 0.6795733118816987, "learning_rate": 5e-06, "loss": 0.5209, "step": 1690 }, { "epoch": 1.918194640338505, "grad_norm": 0.6875411881069398, "learning_rate": 5e-06, "loss": 0.5274, "step": 1700 }, { "epoch": 1.9294781382228492, "grad_norm": 0.6539417007788165, "learning_rate": 5e-06, "loss": 0.5293, "step": 1710 }, { "epoch": 1.9407616361071933, "grad_norm": 0.7131644844536464, "learning_rate": 5e-06, "loss": 0.5288, "step": 1720 }, { "epoch": 1.9520451339915375, "grad_norm": 0.6498878256598866, "learning_rate": 5e-06, "loss": 0.5289, "step": 1730 }, { "epoch": 1.9633286318758816, "grad_norm": 0.6843667341548544, "learning_rate": 5e-06, "loss": 0.5418, "step": 1740 }, { "epoch": 1.9746121297602257, "grad_norm": 0.6650168678603656, "learning_rate": 5e-06, "loss": 0.5402, "step": 1750 }, { "epoch": 1.9858956276445698, "grad_norm": 0.6825312514425923, "learning_rate": 5e-06, "loss": 0.5268, "step": 1760 }, { "epoch": 1.997179125528914, "grad_norm": 0.6765862874969008, "learning_rate": 5e-06, "loss": 0.5215, "step": 1770 }, { "epoch": 1.9994358251057829, "eval_loss": 0.5913873910903931, "eval_runtime": 698.2566, "eval_samples_per_second": 17.101, "eval_steps_per_second": 0.536, "step": 1772 }, { "epoch": 2.008462623413258, "grad_norm": 0.9398660673562257, "learning_rate": 5e-06, "loss": 0.5208, "step": 1780 }, { "epoch": 2.0197461212976022, "grad_norm": 0.7762940970041788, "learning_rate": 5e-06, "loss": 0.4524, "step": 1790 }, { "epoch": 2.0310296191819464, "grad_norm": 0.8552156108609141, "learning_rate": 5e-06, "loss": 0.4428, "step": 1800 }, { "epoch": 2.0423131170662905, "grad_norm": 0.7759834543073888, "learning_rate": 5e-06, "loss": 0.4497, "step": 1810 }, { "epoch": 2.0535966149506346, "grad_norm": 0.813460157749594, "learning_rate": 5e-06, "loss": 0.4501, "step": 1820 }, { "epoch": 2.0648801128349787, "grad_norm": 0.7555915322851285, "learning_rate": 5e-06, "loss": 0.4366, "step": 1830 }, { "epoch": 2.076163610719323, "grad_norm": 0.778910984127173, "learning_rate": 5e-06, "loss": 0.45, "step": 1840 }, { "epoch": 2.087447108603667, "grad_norm": 0.7823483784772997, "learning_rate": 5e-06, "loss": 0.4487, "step": 1850 }, { "epoch": 2.098730606488011, "grad_norm": 0.7451819548464573, "learning_rate": 5e-06, "loss": 0.4524, "step": 1860 }, { "epoch": 2.1100141043723553, "grad_norm": 0.7366797047615511, "learning_rate": 5e-06, "loss": 0.4554, "step": 1870 }, { "epoch": 2.1212976022566994, "grad_norm": 0.7739116377396993, "learning_rate": 5e-06, "loss": 0.4545, "step": 1880 }, { "epoch": 2.1325811001410435, "grad_norm": 0.7393809355678413, "learning_rate": 5e-06, "loss": 0.4576, "step": 1890 }, { "epoch": 2.143864598025388, "grad_norm": 0.7783700395003256, "learning_rate": 5e-06, "loss": 0.456, "step": 1900 }, { "epoch": 2.155148095909732, "grad_norm": 0.7655026275655048, "learning_rate": 5e-06, "loss": 0.4555, "step": 1910 }, { "epoch": 2.1664315937940763, "grad_norm": 0.7308671453948609, "learning_rate": 5e-06, "loss": 0.4516, "step": 1920 }, { "epoch": 2.1777150916784205, "grad_norm": 0.7548898417368491, "learning_rate": 5e-06, "loss": 0.4554, "step": 1930 }, { "epoch": 2.1889985895627646, "grad_norm": 0.7422727245259925, "learning_rate": 5e-06, "loss": 0.4501, "step": 1940 }, { "epoch": 2.2002820874471087, "grad_norm": 0.787975192132044, "learning_rate": 5e-06, "loss": 0.4439, "step": 1950 }, { "epoch": 2.211565585331453, "grad_norm": 0.7365878952628802, "learning_rate": 5e-06, "loss": 0.4514, "step": 1960 }, { "epoch": 2.222849083215797, "grad_norm": 0.7259152335914627, "learning_rate": 5e-06, "loss": 0.4443, "step": 1970 }, { "epoch": 2.234132581100141, "grad_norm": 0.7305442934832262, "learning_rate": 5e-06, "loss": 0.4508, "step": 1980 }, { "epoch": 2.2454160789844853, "grad_norm": 0.752378808107532, "learning_rate": 5e-06, "loss": 0.4533, "step": 1990 }, { "epoch": 2.2566995768688294, "grad_norm": 0.7662288679828292, "learning_rate": 5e-06, "loss": 0.4591, "step": 2000 }, { "epoch": 2.2679830747531735, "grad_norm": 0.7571986629865566, "learning_rate": 5e-06, "loss": 0.4583, "step": 2010 }, { "epoch": 2.2792665726375176, "grad_norm": 0.7853445680318998, "learning_rate": 5e-06, "loss": 0.4665, "step": 2020 }, { "epoch": 2.2905500705218618, "grad_norm": 0.8056666478419078, "learning_rate": 5e-06, "loss": 0.4626, "step": 2030 }, { "epoch": 2.301833568406206, "grad_norm": 0.7705827370447227, "learning_rate": 5e-06, "loss": 0.4529, "step": 2040 }, { "epoch": 2.31311706629055, "grad_norm": 0.952209351698993, "learning_rate": 5e-06, "loss": 0.4533, "step": 2050 }, { "epoch": 2.324400564174894, "grad_norm": 0.7437396447699182, "learning_rate": 5e-06, "loss": 0.4694, "step": 2060 }, { "epoch": 2.3356840620592383, "grad_norm": 0.7507184464222211, "learning_rate": 5e-06, "loss": 0.4571, "step": 2070 }, { "epoch": 2.3469675599435824, "grad_norm": 0.7538037629123275, "learning_rate": 5e-06, "loss": 0.4637, "step": 2080 }, { "epoch": 2.3582510578279265, "grad_norm": 0.7207349348209307, "learning_rate": 5e-06, "loss": 0.454, "step": 2090 }, { "epoch": 2.3695345557122707, "grad_norm": 0.7732963363098568, "learning_rate": 5e-06, "loss": 0.4621, "step": 2100 }, { "epoch": 2.380818053596615, "grad_norm": 0.7733305143075813, "learning_rate": 5e-06, "loss": 0.4616, "step": 2110 }, { "epoch": 2.392101551480959, "grad_norm": 0.7317816197062187, "learning_rate": 5e-06, "loss": 0.4544, "step": 2120 }, { "epoch": 2.403385049365303, "grad_norm": 0.7387742831088012, "learning_rate": 5e-06, "loss": 0.4612, "step": 2130 }, { "epoch": 2.414668547249647, "grad_norm": 0.7855904874369565, "learning_rate": 5e-06, "loss": 0.4567, "step": 2140 }, { "epoch": 2.4259520451339913, "grad_norm": 0.7237436303435315, "learning_rate": 5e-06, "loss": 0.4624, "step": 2150 }, { "epoch": 2.4372355430183354, "grad_norm": 0.8184805263780661, "learning_rate": 5e-06, "loss": 0.4621, "step": 2160 }, { "epoch": 2.44851904090268, "grad_norm": 0.760755214994802, "learning_rate": 5e-06, "loss": 0.4602, "step": 2170 }, { "epoch": 2.459802538787024, "grad_norm": 0.7634415569046652, "learning_rate": 5e-06, "loss": 0.461, "step": 2180 }, { "epoch": 2.4710860366713683, "grad_norm": 0.725712122867678, "learning_rate": 5e-06, "loss": 0.4631, "step": 2190 }, { "epoch": 2.4823695345557124, "grad_norm": 0.7540090450075305, "learning_rate": 5e-06, "loss": 0.4638, "step": 2200 }, { "epoch": 2.4936530324400565, "grad_norm": 0.7217092577620574, "learning_rate": 5e-06, "loss": 0.4702, "step": 2210 }, { "epoch": 2.5049365303244007, "grad_norm": 0.7316510601952371, "learning_rate": 5e-06, "loss": 0.4636, "step": 2220 }, { "epoch": 2.516220028208745, "grad_norm": 0.7769888826201891, "learning_rate": 5e-06, "loss": 0.457, "step": 2230 }, { "epoch": 2.527503526093089, "grad_norm": 0.6872130661688746, "learning_rate": 5e-06, "loss": 0.4603, "step": 2240 }, { "epoch": 2.538787023977433, "grad_norm": 0.7809947622038785, "learning_rate": 5e-06, "loss": 0.4598, "step": 2250 }, { "epoch": 2.550070521861777, "grad_norm": 0.7128936960294685, "learning_rate": 5e-06, "loss": 0.4665, "step": 2260 }, { "epoch": 2.5613540197461213, "grad_norm": 0.7343811921788322, "learning_rate": 5e-06, "loss": 0.4612, "step": 2270 }, { "epoch": 2.5726375176304654, "grad_norm": 0.7390667339205597, "learning_rate": 5e-06, "loss": 0.4607, "step": 2280 }, { "epoch": 2.5839210155148096, "grad_norm": 0.726032460592344, "learning_rate": 5e-06, "loss": 0.4549, "step": 2290 }, { "epoch": 2.5952045133991537, "grad_norm": 0.7524597049646161, "learning_rate": 5e-06, "loss": 0.4694, "step": 2300 }, { "epoch": 2.606488011283498, "grad_norm": 0.7858250860151663, "learning_rate": 5e-06, "loss": 0.4663, "step": 2310 }, { "epoch": 2.617771509167842, "grad_norm": 0.7293613916851106, "learning_rate": 5e-06, "loss": 0.4664, "step": 2320 }, { "epoch": 2.629055007052186, "grad_norm": 0.7290633000536495, "learning_rate": 5e-06, "loss": 0.4543, "step": 2330 }, { "epoch": 2.64033850493653, "grad_norm": 0.7091768581049495, "learning_rate": 5e-06, "loss": 0.4559, "step": 2340 }, { "epoch": 2.6516220028208743, "grad_norm": 0.7087926245008993, "learning_rate": 5e-06, "loss": 0.4635, "step": 2350 }, { "epoch": 2.6629055007052185, "grad_norm": 0.7590073933828212, "learning_rate": 5e-06, "loss": 0.4686, "step": 2360 }, { "epoch": 2.6741889985895626, "grad_norm": 0.7267770663460916, "learning_rate": 5e-06, "loss": 0.46, "step": 2370 }, { "epoch": 2.685472496473907, "grad_norm": 0.7248345474556908, "learning_rate": 5e-06, "loss": 0.4727, "step": 2380 }, { "epoch": 2.6967559943582513, "grad_norm": 0.7498626285953574, "learning_rate": 5e-06, "loss": 0.4667, "step": 2390 }, { "epoch": 2.7080394922425954, "grad_norm": 0.7438411024370735, "learning_rate": 5e-06, "loss": 0.465, "step": 2400 }, { "epoch": 2.7193229901269396, "grad_norm": 0.7551882708372384, "learning_rate": 5e-06, "loss": 0.4659, "step": 2410 }, { "epoch": 2.7306064880112837, "grad_norm": 0.7575954402507921, "learning_rate": 5e-06, "loss": 0.4679, "step": 2420 }, { "epoch": 2.741889985895628, "grad_norm": 0.7276043657180276, "learning_rate": 5e-06, "loss": 0.4628, "step": 2430 }, { "epoch": 2.753173483779972, "grad_norm": 0.7154984668294021, "learning_rate": 5e-06, "loss": 0.4654, "step": 2440 }, { "epoch": 2.764456981664316, "grad_norm": 0.7203876412478339, "learning_rate": 5e-06, "loss": 0.4697, "step": 2450 }, { "epoch": 2.77574047954866, "grad_norm": 0.7653450966118911, "learning_rate": 5e-06, "loss": 0.4705, "step": 2460 }, { "epoch": 2.7870239774330043, "grad_norm": 0.7371256159182112, "learning_rate": 5e-06, "loss": 0.4641, "step": 2470 }, { "epoch": 2.7983074753173485, "grad_norm": 0.7334765675316313, "learning_rate": 5e-06, "loss": 0.4726, "step": 2480 }, { "epoch": 2.8095909732016926, "grad_norm": 0.7255504755388352, "learning_rate": 5e-06, "loss": 0.4734, "step": 2490 }, { "epoch": 2.8208744710860367, "grad_norm": 0.721440148744813, "learning_rate": 5e-06, "loss": 0.4644, "step": 2500 }, { "epoch": 2.832157968970381, "grad_norm": 0.748754622356668, "learning_rate": 5e-06, "loss": 0.4794, "step": 2510 }, { "epoch": 2.843441466854725, "grad_norm": 0.7463238131630598, "learning_rate": 5e-06, "loss": 0.4628, "step": 2520 }, { "epoch": 2.854724964739069, "grad_norm": 0.7220764644154523, "learning_rate": 5e-06, "loss": 0.4715, "step": 2530 }, { "epoch": 2.8660084626234132, "grad_norm": 0.7116624476538259, "learning_rate": 5e-06, "loss": 0.4633, "step": 2540 }, { "epoch": 2.8772919605077574, "grad_norm": 0.7567720887800601, "learning_rate": 5e-06, "loss": 0.4689, "step": 2550 }, { "epoch": 2.8885754583921015, "grad_norm": 0.7311117433105612, "learning_rate": 5e-06, "loss": 0.4659, "step": 2560 }, { "epoch": 2.8998589562764456, "grad_norm": 0.6857771399213093, "learning_rate": 5e-06, "loss": 0.4668, "step": 2570 }, { "epoch": 2.9111424541607898, "grad_norm": 0.7603898848747829, "learning_rate": 5e-06, "loss": 0.4723, "step": 2580 }, { "epoch": 2.922425952045134, "grad_norm": 0.7346850752363808, "learning_rate": 5e-06, "loss": 0.4631, "step": 2590 }, { "epoch": 2.933709449929478, "grad_norm": 0.7524612932802797, "learning_rate": 5e-06, "loss": 0.4686, "step": 2600 }, { "epoch": 2.944992947813822, "grad_norm": 0.7353041598333038, "learning_rate": 5e-06, "loss": 0.4641, "step": 2610 }, { "epoch": 2.9562764456981663, "grad_norm": 0.7285422773911787, "learning_rate": 5e-06, "loss": 0.4748, "step": 2620 }, { "epoch": 2.9675599435825104, "grad_norm": 0.7448074753267844, "learning_rate": 5e-06, "loss": 0.472, "step": 2630 }, { "epoch": 2.9788434414668545, "grad_norm": 0.705662537995191, "learning_rate": 5e-06, "loss": 0.4627, "step": 2640 }, { "epoch": 2.9901269393511987, "grad_norm": 0.7412402310259647, "learning_rate": 5e-06, "loss": 0.4672, "step": 2650 }, { "epoch": 2.9991537376586743, "eval_loss": 0.6162799000740051, "eval_runtime": 701.2713, "eval_samples_per_second": 17.028, "eval_steps_per_second": 0.533, "step": 2658 }, { "epoch": 2.9991537376586743, "step": 2658, "total_flos": 5064195066298368.0, "train_loss": 0.5339989464833739, "train_runtime": 122516.7944, "train_samples_per_second": 5.555, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 2658, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5064195066298368.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }