| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9991537376586743, | |
| "eval_steps": 500, | |
| "global_step": 2658, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011283497884344146, | |
| "grad_norm": 1.3125232159346416, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7474, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.022566995768688293, | |
| "grad_norm": 0.9167355748966123, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6794, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03385049365303244, | |
| "grad_norm": 0.8469930147086754, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6697, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.045133991537376586, | |
| "grad_norm": 0.8935332426600987, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6592, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.056417489421720736, | |
| "grad_norm": 0.7558955578947845, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6527, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06770098730606489, | |
| "grad_norm": 0.8864690987221431, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6546, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07898448519040903, | |
| "grad_norm": 0.7695810970926475, | |
| "learning_rate": 5e-06, | |
| "loss": 0.632, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.09026798307475317, | |
| "grad_norm": 0.7714248271685794, | |
| "learning_rate": 5e-06, | |
| "loss": 0.636, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.10155148095909731, | |
| "grad_norm": 0.8886760851072192, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6225, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.11283497884344147, | |
| "grad_norm": 0.8974890384086033, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6362, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.12411847672778561, | |
| "grad_norm": 0.8819677575312233, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6325, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.13540197461212977, | |
| "grad_norm": 0.9251296507200821, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6271, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1466854724964739, | |
| "grad_norm": 0.7702797791218077, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6273, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.15796897038081806, | |
| "grad_norm": 0.8095360026454328, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6252, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.1692524682651622, | |
| "grad_norm": 0.8204620911060526, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6287, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.18053596614950634, | |
| "grad_norm": 0.8182785549342615, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6284, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1918194640338505, | |
| "grad_norm": 0.7445113899986885, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6264, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.20310296191819463, | |
| "grad_norm": 0.7873061135697372, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6253, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2143864598025388, | |
| "grad_norm": 0.7956486177510658, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6264, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.22566995768688294, | |
| "grad_norm": 0.7746629467371798, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6238, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.23695345557122707, | |
| "grad_norm": 0.7345245715769874, | |
| "learning_rate": 5e-06, | |
| "loss": 0.619, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.24823695345557123, | |
| "grad_norm": 0.8349914408696147, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6145, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.25952045133991536, | |
| "grad_norm": 0.7331223816734604, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6179, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.27080394922425954, | |
| "grad_norm": 0.7535227854265615, | |
| "learning_rate": 5e-06, | |
| "loss": 0.615, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2820874471086037, | |
| "grad_norm": 0.9217132891288368, | |
| "learning_rate": 5e-06, | |
| "loss": 0.614, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2933709449929478, | |
| "grad_norm": 0.7626220728005123, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6207, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.304654442877292, | |
| "grad_norm": 0.7567534233021369, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6138, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3159379407616361, | |
| "grad_norm": 0.7740686670455319, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6107, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.32722143864598024, | |
| "grad_norm": 0.7577709244521811, | |
| "learning_rate": 5e-06, | |
| "loss": 0.614, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3385049365303244, | |
| "grad_norm": 0.7294703403964556, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6113, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.34978843441466856, | |
| "grad_norm": 0.7503750502849057, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6122, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3610719322990127, | |
| "grad_norm": 0.7103206649111901, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6167, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3723554301833568, | |
| "grad_norm": 0.7626842348184225, | |
| "learning_rate": 5e-06, | |
| "loss": 0.606, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.383638928067701, | |
| "grad_norm": 0.7621571567885804, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6155, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.39492242595204513, | |
| "grad_norm": 0.7549313486626996, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6172, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.40620592383638926, | |
| "grad_norm": 0.782988336813104, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5985, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.41748942172073344, | |
| "grad_norm": 0.7863471438503445, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6154, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.4287729196050776, | |
| "grad_norm": 0.7640237197989878, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5897, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.4400564174894217, | |
| "grad_norm": 0.8024119449518029, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6157, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4513399153737659, | |
| "grad_norm": 0.720556179289527, | |
| "learning_rate": 5e-06, | |
| "loss": 0.603, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.46262341325811, | |
| "grad_norm": 0.76294955883197, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6061, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.47390691114245415, | |
| "grad_norm": 0.8156507807690674, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6051, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.48519040902679833, | |
| "grad_norm": 0.7521066252629773, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5976, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.49647390691114246, | |
| "grad_norm": 0.7133025964230046, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5968, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5077574047954866, | |
| "grad_norm": 0.7925716834523774, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6096, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5190409026798307, | |
| "grad_norm": 0.7100847121605542, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6032, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5303244005641748, | |
| "grad_norm": 0.6929396371646737, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6002, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5416078984485191, | |
| "grad_norm": 0.6977506618129865, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6104, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5528913963328632, | |
| "grad_norm": 0.7147611855456376, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5912, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5641748942172073, | |
| "grad_norm": 0.7042203040407636, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6079, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5754583921015515, | |
| "grad_norm": 0.7309189788815705, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6036, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.5867418899858956, | |
| "grad_norm": 0.7248169421227385, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6003, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.5980253878702397, | |
| "grad_norm": 0.7572409467891201, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5916, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.609308885754584, | |
| "grad_norm": 0.733437644851395, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6036, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6205923836389281, | |
| "grad_norm": 0.7131751072814058, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6034, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6318758815232722, | |
| "grad_norm": 0.7360178195388309, | |
| "learning_rate": 5e-06, | |
| "loss": 0.605, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6431593794076164, | |
| "grad_norm": 1.1495804042684488, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5974, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6544428772919605, | |
| "grad_norm": 0.6817754751813853, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5977, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.6657263751763046, | |
| "grad_norm": 0.7044965353014948, | |
| "learning_rate": 5e-06, | |
| "loss": 0.607, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.6770098730606487, | |
| "grad_norm": 0.7308565798791333, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6015, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.688293370944993, | |
| "grad_norm": 0.758452633023999, | |
| "learning_rate": 5e-06, | |
| "loss": 0.603, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.6995768688293371, | |
| "grad_norm": 0.7778955204205954, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5924, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7108603667136812, | |
| "grad_norm": 0.6757727892125893, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5999, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.7221438645980254, | |
| "grad_norm": 0.7690880111005869, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6008, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.7334273624823695, | |
| "grad_norm": 0.780886462765786, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5954, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.7447108603667136, | |
| "grad_norm": 0.7683226572424213, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5975, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.7559943582510579, | |
| "grad_norm": 0.6840538894373641, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6023, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.767277856135402, | |
| "grad_norm": 0.6771027958757195, | |
| "learning_rate": 5e-06, | |
| "loss": 0.59, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.7785613540197461, | |
| "grad_norm": 0.6681056791937571, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5931, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.7898448519040903, | |
| "grad_norm": 0.7086908628376102, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6005, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8011283497884344, | |
| "grad_norm": 0.6592220118037463, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5888, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.8124118476727785, | |
| "grad_norm": 0.7384276235727564, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6029, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.8236953455571228, | |
| "grad_norm": 0.7153557202506454, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6004, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.8349788434414669, | |
| "grad_norm": 0.7412356079152558, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6062, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.846262341325811, | |
| "grad_norm": 0.8186167983419891, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6012, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.8575458392101551, | |
| "grad_norm": 0.6956555496968945, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5986, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.8688293370944993, | |
| "grad_norm": 0.6972125762170825, | |
| "learning_rate": 5e-06, | |
| "loss": 0.598, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.8801128349788434, | |
| "grad_norm": 0.6554398016756771, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5899, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.8913963328631875, | |
| "grad_norm": 0.6816513719411913, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5959, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9026798307475318, | |
| "grad_norm": 0.6802192397813065, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5966, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.9139633286318759, | |
| "grad_norm": 0.7297229988637639, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5977, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.92524682651622, | |
| "grad_norm": 0.6910581824985036, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5903, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.9365303244005642, | |
| "grad_norm": 0.7038633597805812, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5926, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.9478138222849083, | |
| "grad_norm": 0.7006043004126095, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5968, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.9590973201692524, | |
| "grad_norm": 0.7196243633360835, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5946, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.9703808180535967, | |
| "grad_norm": 1.0955172142857887, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5837, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.9816643159379408, | |
| "grad_norm": 0.6592667550168434, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5998, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.9929478138222849, | |
| "grad_norm": 0.7854190040474681, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5875, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.9997179125528914, | |
| "eval_loss": 0.5916627049446106, | |
| "eval_runtime": 700.0009, | |
| "eval_samples_per_second": 17.059, | |
| "eval_steps_per_second": 0.534, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 1.004231311706629, | |
| "grad_norm": 1.3251196950195474, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6038, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.0155148095909732, | |
| "grad_norm": 0.833678301438908, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5204, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.0267983074753173, | |
| "grad_norm": 0.7372319799588725, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5098, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.0380818053596614, | |
| "grad_norm": 0.710656408718745, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5147, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.0493653032440056, | |
| "grad_norm": 0.7035392178179493, | |
| "learning_rate": 5e-06, | |
| "loss": 0.532, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.0606488011283497, | |
| "grad_norm": 0.7419000790657035, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5232, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.071932299012694, | |
| "grad_norm": 0.7507810642090355, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5182, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.0832157968970382, | |
| "grad_norm": 0.7365569865429179, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5156, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.0944992947813823, | |
| "grad_norm": 0.6971352350986588, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5255, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.1057827926657264, | |
| "grad_norm": 0.769651560789747, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5275, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.1170662905500706, | |
| "grad_norm": 0.6835338919436766, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5266, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.1283497884344147, | |
| "grad_norm": 0.7080901866255427, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5245, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.1396332863187588, | |
| "grad_norm": 0.6973746558937527, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5272, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.150916784203103, | |
| "grad_norm": 0.7328115120631007, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5269, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.162200282087447, | |
| "grad_norm": 0.8161495623001298, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5226, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.1734837799717912, | |
| "grad_norm": 0.8517029792820613, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5175, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.1847672778561353, | |
| "grad_norm": 0.8438046028102798, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5241, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.1960507757404795, | |
| "grad_norm": 0.7099751086024491, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5291, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.2073342736248236, | |
| "grad_norm": 0.7580945476663854, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5249, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.2186177715091677, | |
| "grad_norm": 0.7667303387873934, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5235, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.229901269393512, | |
| "grad_norm": 0.8114242325537707, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5283, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.2411847672778562, | |
| "grad_norm": 0.7131407385877792, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5288, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.2524682651622003, | |
| "grad_norm": 0.685086959113009, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5195, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.2637517630465445, | |
| "grad_norm": 0.7053119373842179, | |
| "learning_rate": 5e-06, | |
| "loss": 0.542, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.2750352609308886, | |
| "grad_norm": 0.7644874615248024, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5287, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.2863187588152327, | |
| "grad_norm": 0.6883885545058486, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5267, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.2976022566995769, | |
| "grad_norm": 0.7306504262086587, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5289, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.308885754583921, | |
| "grad_norm": 0.9640801871844217, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5267, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.320169252468265, | |
| "grad_norm": 0.661755370313437, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5305, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.3314527503526092, | |
| "grad_norm": 0.6875142102900156, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5262, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.3427362482369536, | |
| "grad_norm": 0.7205563505642903, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5391, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.3540197461212977, | |
| "grad_norm": 0.6599447213858599, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5272, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.3653032440056418, | |
| "grad_norm": 0.6674411183277559, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5294, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.376586741889986, | |
| "grad_norm": 0.6760191535339451, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5237, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.38787023977433, | |
| "grad_norm": 0.699499504851807, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5239, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.3991537376586742, | |
| "grad_norm": 0.6949474979507084, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5371, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.4104372355430184, | |
| "grad_norm": 0.6871033689397662, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5346, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.4217207334273625, | |
| "grad_norm": 0.6742402949788701, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5239, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.4330042313117066, | |
| "grad_norm": 0.6751463140934231, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5266, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.4442877291960508, | |
| "grad_norm": 0.7142787617460878, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5315, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.4555712270803949, | |
| "grad_norm": 0.7226332853252287, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5275, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.466854724964739, | |
| "grad_norm": 0.7145272993925487, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5322, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.4781382228490831, | |
| "grad_norm": 0.6768426299144826, | |
| "learning_rate": 5e-06, | |
| "loss": 0.538, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.4894217207334273, | |
| "grad_norm": 0.7123747155692624, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5289, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.5007052186177714, | |
| "grad_norm": 0.6786058643003756, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5253, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.5119887165021155, | |
| "grad_norm": 0.7110831777325413, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5324, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.5232722143864597, | |
| "grad_norm": 0.6695015578694526, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5214, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.5345557122708038, | |
| "grad_norm": 0.7459448089167867, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5252, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.5458392101551481, | |
| "grad_norm": 0.6753928189856611, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5251, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.5571227080394923, | |
| "grad_norm": 0.6996235594913326, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5248, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.5684062059238364, | |
| "grad_norm": 0.7079009762601143, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5323, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.5796897038081805, | |
| "grad_norm": 0.6924093724518422, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5255, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.5909732016925247, | |
| "grad_norm": 0.6761094320283147, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5339, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.6022566995768688, | |
| "grad_norm": 0.6809745203094293, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5298, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.6135401974612131, | |
| "grad_norm": 0.7106432021864519, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5478, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.6248236953455573, | |
| "grad_norm": 0.7187158548818587, | |
| "learning_rate": 5e-06, | |
| "loss": 0.531, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.6361071932299014, | |
| "grad_norm": 0.6596453734369295, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5225, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.6473906911142455, | |
| "grad_norm": 0.7093693747872939, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5257, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.6586741889985896, | |
| "grad_norm": 0.6944427657225931, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5412, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.6699576868829338, | |
| "grad_norm": 0.6955533636685318, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5367, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.681241184767278, | |
| "grad_norm": 0.6624317919452952, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5293, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.692524682651622, | |
| "grad_norm": 0.6778095946034755, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5323, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.7038081805359662, | |
| "grad_norm": 0.7341427766426889, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5263, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.7150916784203103, | |
| "grad_norm": 0.7432696239071007, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5334, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.7263751763046544, | |
| "grad_norm": 0.6491261448207155, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5305, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.7376586741889986, | |
| "grad_norm": 0.7486777638151452, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5304, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.7489421720733427, | |
| "grad_norm": 0.7118918086770966, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5216, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.7602256699576868, | |
| "grad_norm": 0.7038340173438765, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5322, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.771509167842031, | |
| "grad_norm": 0.6539818459517368, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5321, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.782792665726375, | |
| "grad_norm": 0.658011536602541, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5288, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.7940761636107192, | |
| "grad_norm": 0.6572168627274099, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5219, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.8053596614950633, | |
| "grad_norm": 0.6648318139285605, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5292, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.8166431593794075, | |
| "grad_norm": 0.6932570407391014, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5381, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.8279266572637518, | |
| "grad_norm": 0.6710478896264537, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5344, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.839210155148096, | |
| "grad_norm": 0.6710698360135149, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5201, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.85049365303244, | |
| "grad_norm": 0.6664339220436462, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5272, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.8617771509167842, | |
| "grad_norm": 0.679324908548248, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5381, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.8730606488011283, | |
| "grad_norm": 0.7203049853389839, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5325, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.8843441466854725, | |
| "grad_norm": 0.671958846252959, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5308, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.8956276445698168, | |
| "grad_norm": 0.6756200949168255, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5401, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.906911142454161, | |
| "grad_norm": 0.6795733118816987, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5209, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.918194640338505, | |
| "grad_norm": 0.6875411881069398, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5274, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.9294781382228492, | |
| "grad_norm": 0.6539417007788165, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5293, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.9407616361071933, | |
| "grad_norm": 0.7131644844536464, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5288, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.9520451339915375, | |
| "grad_norm": 0.6498878256598866, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5289, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.9633286318758816, | |
| "grad_norm": 0.6843667341548544, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5418, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.9746121297602257, | |
| "grad_norm": 0.6650168678603656, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5402, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.9858956276445698, | |
| "grad_norm": 0.6825312514425923, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5268, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.997179125528914, | |
| "grad_norm": 0.6765862874969008, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5215, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.9994358251057829, | |
| "eval_loss": 0.5913873910903931, | |
| "eval_runtime": 698.2566, | |
| "eval_samples_per_second": 17.101, | |
| "eval_steps_per_second": 0.536, | |
| "step": 1772 | |
| }, | |
| { | |
| "epoch": 2.008462623413258, | |
| "grad_norm": 0.9398660673562257, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5208, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.0197461212976022, | |
| "grad_norm": 0.7762940970041788, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4524, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.0310296191819464, | |
| "grad_norm": 0.8552156108609141, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4428, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.0423131170662905, | |
| "grad_norm": 0.7759834543073888, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4497, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.0535966149506346, | |
| "grad_norm": 0.813460157749594, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4501, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.0648801128349787, | |
| "grad_norm": 0.7555915322851285, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4366, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.076163610719323, | |
| "grad_norm": 0.778910984127173, | |
| "learning_rate": 5e-06, | |
| "loss": 0.45, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.087447108603667, | |
| "grad_norm": 0.7823483784772997, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4487, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.098730606488011, | |
| "grad_norm": 0.7451819548464573, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4524, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.1100141043723553, | |
| "grad_norm": 0.7366797047615511, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4554, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.1212976022566994, | |
| "grad_norm": 0.7739116377396993, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4545, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.1325811001410435, | |
| "grad_norm": 0.7393809355678413, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4576, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.143864598025388, | |
| "grad_norm": 0.7783700395003256, | |
| "learning_rate": 5e-06, | |
| "loss": 0.456, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.155148095909732, | |
| "grad_norm": 0.7655026275655048, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4555, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.1664315937940763, | |
| "grad_norm": 0.7308671453948609, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4516, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.1777150916784205, | |
| "grad_norm": 0.7548898417368491, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4554, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.1889985895627646, | |
| "grad_norm": 0.7422727245259925, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4501, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.2002820874471087, | |
| "grad_norm": 0.787975192132044, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4439, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.211565585331453, | |
| "grad_norm": 0.7365878952628802, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4514, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.222849083215797, | |
| "grad_norm": 0.7259152335914627, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4443, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.234132581100141, | |
| "grad_norm": 0.7305442934832262, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4508, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.2454160789844853, | |
| "grad_norm": 0.752378808107532, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4533, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.2566995768688294, | |
| "grad_norm": 0.7662288679828292, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4591, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.2679830747531735, | |
| "grad_norm": 0.7571986629865566, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4583, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.2792665726375176, | |
| "grad_norm": 0.7853445680318998, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4665, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.2905500705218618, | |
| "grad_norm": 0.8056666478419078, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4626, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.301833568406206, | |
| "grad_norm": 0.7705827370447227, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4529, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.31311706629055, | |
| "grad_norm": 0.952209351698993, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4533, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.324400564174894, | |
| "grad_norm": 0.7437396447699182, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4694, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.3356840620592383, | |
| "grad_norm": 0.7507184464222211, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4571, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.3469675599435824, | |
| "grad_norm": 0.7538037629123275, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4637, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.3582510578279265, | |
| "grad_norm": 0.7207349348209307, | |
| "learning_rate": 5e-06, | |
| "loss": 0.454, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.3695345557122707, | |
| "grad_norm": 0.7732963363098568, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4621, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.380818053596615, | |
| "grad_norm": 0.7733305143075813, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4616, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.392101551480959, | |
| "grad_norm": 0.7317816197062187, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4544, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.403385049365303, | |
| "grad_norm": 0.7387742831088012, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4612, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.414668547249647, | |
| "grad_norm": 0.7855904874369565, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4567, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.4259520451339913, | |
| "grad_norm": 0.7237436303435315, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4624, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.4372355430183354, | |
| "grad_norm": 0.8184805263780661, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4621, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.44851904090268, | |
| "grad_norm": 0.760755214994802, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4602, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.459802538787024, | |
| "grad_norm": 0.7634415569046652, | |
| "learning_rate": 5e-06, | |
| "loss": 0.461, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.4710860366713683, | |
| "grad_norm": 0.725712122867678, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4631, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.4823695345557124, | |
| "grad_norm": 0.7540090450075305, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4638, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.4936530324400565, | |
| "grad_norm": 0.7217092577620574, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4702, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.5049365303244007, | |
| "grad_norm": 0.7316510601952371, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4636, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.516220028208745, | |
| "grad_norm": 0.7769888826201891, | |
| "learning_rate": 5e-06, | |
| "loss": 0.457, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.527503526093089, | |
| "grad_norm": 0.6872130661688746, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4603, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.538787023977433, | |
| "grad_norm": 0.7809947622038785, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4598, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.550070521861777, | |
| "grad_norm": 0.7128936960294685, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4665, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.5613540197461213, | |
| "grad_norm": 0.7343811921788322, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4612, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.5726375176304654, | |
| "grad_norm": 0.7390667339205597, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4607, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.5839210155148096, | |
| "grad_norm": 0.726032460592344, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4549, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.5952045133991537, | |
| "grad_norm": 0.7524597049646161, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4694, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.606488011283498, | |
| "grad_norm": 0.7858250860151663, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4663, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.617771509167842, | |
| "grad_norm": 0.7293613916851106, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4664, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.629055007052186, | |
| "grad_norm": 0.7290633000536495, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4543, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.64033850493653, | |
| "grad_norm": 0.7091768581049495, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4559, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.6516220028208743, | |
| "grad_norm": 0.7087926245008993, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4635, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.6629055007052185, | |
| "grad_norm": 0.7590073933828212, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4686, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.6741889985895626, | |
| "grad_norm": 0.7267770663460916, | |
| "learning_rate": 5e-06, | |
| "loss": 0.46, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.685472496473907, | |
| "grad_norm": 0.7248345474556908, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4727, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.6967559943582513, | |
| "grad_norm": 0.7498626285953574, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4667, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.7080394922425954, | |
| "grad_norm": 0.7438411024370735, | |
| "learning_rate": 5e-06, | |
| "loss": 0.465, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.7193229901269396, | |
| "grad_norm": 0.7551882708372384, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4659, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.7306064880112837, | |
| "grad_norm": 0.7575954402507921, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4679, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.741889985895628, | |
| "grad_norm": 0.7276043657180276, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4628, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.753173483779972, | |
| "grad_norm": 0.7154984668294021, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4654, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.764456981664316, | |
| "grad_norm": 0.7203876412478339, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4697, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.77574047954866, | |
| "grad_norm": 0.7653450966118911, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4705, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.7870239774330043, | |
| "grad_norm": 0.7371256159182112, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4641, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.7983074753173485, | |
| "grad_norm": 0.7334765675316313, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4726, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.8095909732016926, | |
| "grad_norm": 0.7255504755388352, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4734, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.8208744710860367, | |
| "grad_norm": 0.721440148744813, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4644, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.832157968970381, | |
| "grad_norm": 0.748754622356668, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4794, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.843441466854725, | |
| "grad_norm": 0.7463238131630598, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4628, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.854724964739069, | |
| "grad_norm": 0.7220764644154523, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4715, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.8660084626234132, | |
| "grad_norm": 0.7116624476538259, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4633, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.8772919605077574, | |
| "grad_norm": 0.7567720887800601, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4689, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.8885754583921015, | |
| "grad_norm": 0.7311117433105612, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4659, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.8998589562764456, | |
| "grad_norm": 0.6857771399213093, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4668, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.9111424541607898, | |
| "grad_norm": 0.7603898848747829, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4723, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.922425952045134, | |
| "grad_norm": 0.7346850752363808, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4631, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.933709449929478, | |
| "grad_norm": 0.7524612932802797, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4686, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.944992947813822, | |
| "grad_norm": 0.7353041598333038, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4641, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.9562764456981663, | |
| "grad_norm": 0.7285422773911787, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4748, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.9675599435825104, | |
| "grad_norm": 0.7448074753267844, | |
| "learning_rate": 5e-06, | |
| "loss": 0.472, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.9788434414668545, | |
| "grad_norm": 0.705662537995191, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4627, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.9901269393511987, | |
| "grad_norm": 0.7412402310259647, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4672, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.9991537376586743, | |
| "eval_loss": 0.6162799000740051, | |
| "eval_runtime": 701.2713, | |
| "eval_samples_per_second": 17.028, | |
| "eval_steps_per_second": 0.533, | |
| "step": 2658 | |
| }, | |
| { | |
| "epoch": 2.9991537376586743, | |
| "step": 2658, | |
| "total_flos": 5064195066298368.0, | |
| "train_loss": 0.5339989464833739, | |
| "train_runtime": 122516.7944, | |
| "train_samples_per_second": 5.555, | |
| "train_steps_per_second": 0.022 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2658, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5064195066298368.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |