diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4178 @@ +{ + "best_global_step": 5600, + "best_metric": 1.2055819034576416, + "best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-5600", + "epoch": 2.9254471863167515, + "eval_steps": 200, + "global_step": 5600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005222613918266093, + "grad_norm": 27.89534568786621, + "learning_rate": 7.000000000000001e-07, + "loss": 3.4324, + "step": 10 + }, + { + "epoch": 0.010445227836532185, + "grad_norm": 12.811655044555664, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.1666, + "step": 20 + }, + { + "epoch": 0.015667841754798278, + "grad_norm": 9.790305137634277, + "learning_rate": 2.7000000000000004e-06, + "loss": 2.7959, + "step": 30 + }, + { + "epoch": 0.02089045567306437, + "grad_norm": 8.068702697753906, + "learning_rate": 3.7e-06, + "loss": 2.5106, + "step": 40 + }, + { + "epoch": 0.02611306959133046, + "grad_norm": 8.072208404541016, + "learning_rate": 4.7e-06, + "loss": 2.4756, + "step": 50 + }, + { + "epoch": 0.031335683509596556, + "grad_norm": 8.231782913208008, + "learning_rate": 5.7e-06, + "loss": 2.433, + "step": 60 + }, + { + "epoch": 0.036558297427862645, + "grad_norm": 9.866532325744629, + "learning_rate": 6.700000000000001e-06, + "loss": 2.3698, + "step": 70 + }, + { + "epoch": 0.04178091134612874, + "grad_norm": 9.043497085571289, + "learning_rate": 7.7e-06, + "loss": 2.3509, + "step": 80 + }, + { + "epoch": 0.04700352526439483, + "grad_norm": 8.417478561401367, + "learning_rate": 8.700000000000001e-06, + "loss": 2.3521, + "step": 90 + }, + { + "epoch": 0.05222613918266092, + "grad_norm": 8.51689624786377, + "learning_rate": 9.7e-06, + "loss": 2.3839, + "step": 100 + }, + { + "epoch": 0.057448753100927015, + "grad_norm": 8.41511058807373, + "learning_rate": 9.995398369708125e-06, + "loss": 2.1144, + "step": 110 + }, + { + "epoch": 0.06267136701919311, + "grad_norm": 9.541606903076172, + "learning_rate": 9.988824612148304e-06, + "loss": 2.1907, + "step": 120 + }, + { + "epoch": 0.0678939809374592, + "grad_norm": 8.702176094055176, + "learning_rate": 9.982250854588484e-06, + "loss": 2.1522, + "step": 130 + }, + { + "epoch": 0.07311659485572529, + "grad_norm": 7.285041809082031, + "learning_rate": 9.975677097028661e-06, + "loss": 2.289, + "step": 140 + }, + { + "epoch": 0.07833920877399138, + "grad_norm": 8.327420234680176, + "learning_rate": 9.96910333946884e-06, + "loss": 2.1041, + "step": 150 + }, + { + "epoch": 0.08356182269225748, + "grad_norm": 9.352178573608398, + "learning_rate": 9.96252958190902e-06, + "loss": 2.1858, + "step": 160 + }, + { + "epoch": 0.08878443661052357, + "grad_norm": 10.03535270690918, + "learning_rate": 9.955955824349198e-06, + "loss": 2.092, + "step": 170 + }, + { + "epoch": 0.09400705052878966, + "grad_norm": 8.207446098327637, + "learning_rate": 9.949382066789379e-06, + "loss": 2.0972, + "step": 180 + }, + { + "epoch": 0.09922966444705575, + "grad_norm": 10.35864543914795, + "learning_rate": 9.942808309229556e-06, + "loss": 2.0366, + "step": 190 + }, + { + "epoch": 0.10445227836532184, + "grad_norm": 8.569424629211426, + "learning_rate": 9.936234551669736e-06, + "loss": 2.0236, + "step": 200 + }, + { + "epoch": 0.10445227836532184, + "eval_loss": 2.0600883960723877, + "eval_runtime": 46.6244, + "eval_samples_per_second": 36.505, + "eval_steps_per_second": 4.568, + "step": 200 + }, + { + "epoch": 0.10967489228358794, + "grad_norm": 10.646219253540039, + "learning_rate": 9.929660794109915e-06, + "loss": 2.1485, + "step": 210 + }, + { + "epoch": 0.11489750620185403, + "grad_norm": 8.741375923156738, + "learning_rate": 9.923087036550093e-06, + "loss": 2.0663, + "step": 220 + }, + { + "epoch": 0.12012012012012012, + "grad_norm": 8.898869514465332, + "learning_rate": 9.916513278990272e-06, + "loss": 1.8793, + "step": 230 + }, + { + "epoch": 0.12534273403838622, + "grad_norm": 10.121482849121094, + "learning_rate": 9.90993952143045e-06, + "loss": 1.9377, + "step": 240 + }, + { + "epoch": 0.1305653479566523, + "grad_norm": 9.474202156066895, + "learning_rate": 9.903365763870629e-06, + "loss": 2.0534, + "step": 250 + }, + { + "epoch": 0.1357879618749184, + "grad_norm": 7.948584079742432, + "learning_rate": 9.896792006310808e-06, + "loss": 1.9262, + "step": 260 + }, + { + "epoch": 0.1410105757931845, + "grad_norm": 9.371474266052246, + "learning_rate": 9.890218248750986e-06, + "loss": 1.9202, + "step": 270 + }, + { + "epoch": 0.14623318971145058, + "grad_norm": 8.858964920043945, + "learning_rate": 9.883644491191165e-06, + "loss": 2.0666, + "step": 280 + }, + { + "epoch": 0.15145580362971667, + "grad_norm": 8.085942268371582, + "learning_rate": 9.877070733631345e-06, + "loss": 1.9757, + "step": 290 + }, + { + "epoch": 0.15667841754798276, + "grad_norm": 9.249700546264648, + "learning_rate": 9.870496976071522e-06, + "loss": 1.9912, + "step": 300 + }, + { + "epoch": 0.16190103146624885, + "grad_norm": 9.418254852294922, + "learning_rate": 9.863923218511702e-06, + "loss": 1.8573, + "step": 310 + }, + { + "epoch": 0.16712364538451496, + "grad_norm": 7.633480072021484, + "learning_rate": 9.857349460951881e-06, + "loss": 1.8782, + "step": 320 + }, + { + "epoch": 0.17234625930278105, + "grad_norm": 8.046884536743164, + "learning_rate": 9.850775703392059e-06, + "loss": 2.0481, + "step": 330 + }, + { + "epoch": 0.17756887322104714, + "grad_norm": 8.936604499816895, + "learning_rate": 9.844201945832238e-06, + "loss": 1.9284, + "step": 340 + }, + { + "epoch": 0.18279148713931323, + "grad_norm": 8.811029434204102, + "learning_rate": 9.837628188272417e-06, + "loss": 1.9936, + "step": 350 + }, + { + "epoch": 0.18801410105757932, + "grad_norm": 8.724671363830566, + "learning_rate": 9.831054430712597e-06, + "loss": 1.6656, + "step": 360 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 8.415898323059082, + "learning_rate": 9.824480673152776e-06, + "loss": 1.7273, + "step": 370 + }, + { + "epoch": 0.1984593288941115, + "grad_norm": 8.664811134338379, + "learning_rate": 9.817906915592954e-06, + "loss": 1.865, + "step": 380 + }, + { + "epoch": 0.2036819428123776, + "grad_norm": 8.129899978637695, + "learning_rate": 9.811333158033133e-06, + "loss": 1.7854, + "step": 390 + }, + { + "epoch": 0.20890455673064368, + "grad_norm": 9.967806816101074, + "learning_rate": 9.80475940047331e-06, + "loss": 1.7001, + "step": 400 + }, + { + "epoch": 0.20890455673064368, + "eval_loss": 1.8084577322006226, + "eval_runtime": 46.1675, + "eval_samples_per_second": 36.866, + "eval_steps_per_second": 4.614, + "step": 400 + }, + { + "epoch": 0.2141271706489098, + "grad_norm": 8.425873756408691, + "learning_rate": 9.79818564291349e-06, + "loss": 1.8895, + "step": 410 + }, + { + "epoch": 0.21934978456717588, + "grad_norm": 7.594646453857422, + "learning_rate": 9.79161188535367e-06, + "loss": 1.7879, + "step": 420 + }, + { + "epoch": 0.22457239848544197, + "grad_norm": 8.4673433303833, + "learning_rate": 9.785038127793847e-06, + "loss": 1.8445, + "step": 430 + }, + { + "epoch": 0.22979501240370806, + "grad_norm": 8.08828353881836, + "learning_rate": 9.778464370234027e-06, + "loss": 1.7835, + "step": 440 + }, + { + "epoch": 0.23501762632197415, + "grad_norm": 9.548452377319336, + "learning_rate": 9.771890612674206e-06, + "loss": 1.8427, + "step": 450 + }, + { + "epoch": 0.24024024024024024, + "grad_norm": 9.219648361206055, + "learning_rate": 9.765316855114384e-06, + "loss": 1.6848, + "step": 460 + }, + { + "epoch": 0.24546285415850633, + "grad_norm": 7.592192649841309, + "learning_rate": 9.758743097554563e-06, + "loss": 1.683, + "step": 470 + }, + { + "epoch": 0.25068546807677244, + "grad_norm": 7.887657165527344, + "learning_rate": 9.752169339994742e-06, + "loss": 1.7546, + "step": 480 + }, + { + "epoch": 0.2559080819950385, + "grad_norm": 8.5076904296875, + "learning_rate": 9.74559558243492e-06, + "loss": 1.6915, + "step": 490 + }, + { + "epoch": 0.2611306959133046, + "grad_norm": 8.41952896118164, + "learning_rate": 9.7390218248751e-06, + "loss": 1.6883, + "step": 500 + }, + { + "epoch": 0.2663533098315707, + "grad_norm": 8.776701927185059, + "learning_rate": 9.732448067315277e-06, + "loss": 1.8154, + "step": 510 + }, + { + "epoch": 0.2715759237498368, + "grad_norm": 7.871417045593262, + "learning_rate": 9.725874309755458e-06, + "loss": 1.7861, + "step": 520 + }, + { + "epoch": 0.27679853766810286, + "grad_norm": 8.374330520629883, + "learning_rate": 9.719300552195636e-06, + "loss": 1.7334, + "step": 530 + }, + { + "epoch": 0.282021151586369, + "grad_norm": 9.533878326416016, + "learning_rate": 9.712726794635815e-06, + "loss": 1.642, + "step": 540 + }, + { + "epoch": 0.2872437655046351, + "grad_norm": 8.845860481262207, + "learning_rate": 9.706153037075994e-06, + "loss": 1.5368, + "step": 550 + }, + { + "epoch": 0.29246637942290116, + "grad_norm": 8.25372314453125, + "learning_rate": 9.699579279516172e-06, + "loss": 1.8538, + "step": 560 + }, + { + "epoch": 0.2976889933411673, + "grad_norm": 8.217201232910156, + "learning_rate": 9.693005521956351e-06, + "loss": 1.5753, + "step": 570 + }, + { + "epoch": 0.30291160725943334, + "grad_norm": 9.172762870788574, + "learning_rate": 9.68643176439653e-06, + "loss": 1.485, + "step": 580 + }, + { + "epoch": 0.30813422117769945, + "grad_norm": 8.109066009521484, + "learning_rate": 9.679858006836708e-06, + "loss": 1.6386, + "step": 590 + }, + { + "epoch": 0.3133568350959655, + "grad_norm": 9.576763153076172, + "learning_rate": 9.673284249276888e-06, + "loss": 1.6593, + "step": 600 + }, + { + "epoch": 0.3133568350959655, + "eval_loss": 1.669974684715271, + "eval_runtime": 46.1982, + "eval_samples_per_second": 36.841, + "eval_steps_per_second": 4.611, + "step": 600 + }, + { + "epoch": 0.31857944901423163, + "grad_norm": 8.07900333404541, + "learning_rate": 9.666710491717067e-06, + "loss": 1.5609, + "step": 610 + }, + { + "epoch": 0.3238020629324977, + "grad_norm": 7.784631729125977, + "learning_rate": 9.660136734157245e-06, + "loss": 1.598, + "step": 620 + }, + { + "epoch": 0.3290246768507638, + "grad_norm": 7.267651557922363, + "learning_rate": 9.653562976597424e-06, + "loss": 1.693, + "step": 630 + }, + { + "epoch": 0.3342472907690299, + "grad_norm": 7.9032816886901855, + "learning_rate": 9.646989219037603e-06, + "loss": 1.5721, + "step": 640 + }, + { + "epoch": 0.339469904687296, + "grad_norm": 7.756552696228027, + "learning_rate": 9.640415461477781e-06, + "loss": 1.688, + "step": 650 + }, + { + "epoch": 0.3446925186055621, + "grad_norm": 7.442072868347168, + "learning_rate": 9.63384170391796e-06, + "loss": 1.6674, + "step": 660 + }, + { + "epoch": 0.34991513252382817, + "grad_norm": 7.6043548583984375, + "learning_rate": 9.627267946358138e-06, + "loss": 1.5278, + "step": 670 + }, + { + "epoch": 0.3551377464420943, + "grad_norm": 8.23291015625, + "learning_rate": 9.620694188798317e-06, + "loss": 1.588, + "step": 680 + }, + { + "epoch": 0.36036036036036034, + "grad_norm": 7.537237644195557, + "learning_rate": 9.614120431238497e-06, + "loss": 1.4416, + "step": 690 + }, + { + "epoch": 0.36558297427862646, + "grad_norm": 8.757375717163086, + "learning_rate": 9.607546673678676e-06, + "loss": 1.8258, + "step": 700 + }, + { + "epoch": 0.3708055881968925, + "grad_norm": 8.787406921386719, + "learning_rate": 9.600972916118855e-06, + "loss": 1.5436, + "step": 710 + }, + { + "epoch": 0.37602820211515864, + "grad_norm": 8.105976104736328, + "learning_rate": 9.594399158559033e-06, + "loss": 1.6407, + "step": 720 + }, + { + "epoch": 0.38125081603342476, + "grad_norm": 9.059203147888184, + "learning_rate": 9.587825400999212e-06, + "loss": 1.5917, + "step": 730 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 7.951854228973389, + "learning_rate": 9.581251643439392e-06, + "loss": 1.6077, + "step": 740 + }, + { + "epoch": 0.39169604386995693, + "grad_norm": 7.4102396965026855, + "learning_rate": 9.57467788587957e-06, + "loss": 1.7028, + "step": 750 + }, + { + "epoch": 0.396918657788223, + "grad_norm": 8.980511665344238, + "learning_rate": 9.568104128319749e-06, + "loss": 1.5217, + "step": 760 + }, + { + "epoch": 0.4021412717064891, + "grad_norm": 7.856898784637451, + "learning_rate": 9.561530370759928e-06, + "loss": 1.6657, + "step": 770 + }, + { + "epoch": 0.4073638856247552, + "grad_norm": 7.5405802726745605, + "learning_rate": 9.554956613200106e-06, + "loss": 1.5447, + "step": 780 + }, + { + "epoch": 0.4125864995430213, + "grad_norm": 7.326587200164795, + "learning_rate": 9.548382855640285e-06, + "loss": 1.4387, + "step": 790 + }, + { + "epoch": 0.41780911346128735, + "grad_norm": 7.001702785491943, + "learning_rate": 9.541809098080463e-06, + "loss": 1.5789, + "step": 800 + }, + { + "epoch": 0.41780911346128735, + "eval_loss": 1.5911635160446167, + "eval_runtime": 46.2223, + "eval_samples_per_second": 36.822, + "eval_steps_per_second": 4.608, + "step": 800 + }, + { + "epoch": 0.42303172737955347, + "grad_norm": 10.446452140808105, + "learning_rate": 9.535235340520642e-06, + "loss": 1.5838, + "step": 810 + }, + { + "epoch": 0.4282543412978196, + "grad_norm": 8.23008918762207, + "learning_rate": 9.528661582960821e-06, + "loss": 1.6463, + "step": 820 + }, + { + "epoch": 0.43347695521608565, + "grad_norm": 10.593551635742188, + "learning_rate": 9.522087825400999e-06, + "loss": 1.6947, + "step": 830 + }, + { + "epoch": 0.43869956913435176, + "grad_norm": 11.24173641204834, + "learning_rate": 9.515514067841178e-06, + "loss": 1.4659, + "step": 840 + }, + { + "epoch": 0.4439221830526178, + "grad_norm": 8.380345344543457, + "learning_rate": 9.508940310281358e-06, + "loss": 1.6864, + "step": 850 + }, + { + "epoch": 0.44914479697088394, + "grad_norm": 9.692063331604004, + "learning_rate": 9.502366552721535e-06, + "loss": 1.5631, + "step": 860 + }, + { + "epoch": 0.45436741088915, + "grad_norm": 7.017058372497559, + "learning_rate": 9.495792795161716e-06, + "loss": 1.5841, + "step": 870 + }, + { + "epoch": 0.4595900248074161, + "grad_norm": 7.2586822509765625, + "learning_rate": 9.489219037601894e-06, + "loss": 1.3882, + "step": 880 + }, + { + "epoch": 0.4648126387256822, + "grad_norm": 7.41990852355957, + "learning_rate": 9.482645280042073e-06, + "loss": 1.6894, + "step": 890 + }, + { + "epoch": 0.4700352526439483, + "grad_norm": 8.770058631896973, + "learning_rate": 9.476071522482253e-06, + "loss": 1.586, + "step": 900 + }, + { + "epoch": 0.4752578665622144, + "grad_norm": 7.8078742027282715, + "learning_rate": 9.46949776492243e-06, + "loss": 1.4878, + "step": 910 + }, + { + "epoch": 0.4804804804804805, + "grad_norm": 6.9704203605651855, + "learning_rate": 9.46292400736261e-06, + "loss": 1.4678, + "step": 920 + }, + { + "epoch": 0.4857030943987466, + "grad_norm": 8.024680137634277, + "learning_rate": 9.456350249802787e-06, + "loss": 1.5597, + "step": 930 + }, + { + "epoch": 0.49092570831701265, + "grad_norm": 8.284809112548828, + "learning_rate": 9.449776492242967e-06, + "loss": 1.571, + "step": 940 + }, + { + "epoch": 0.49614832223527877, + "grad_norm": 8.523279190063477, + "learning_rate": 9.443202734683146e-06, + "loss": 1.5098, + "step": 950 + }, + { + "epoch": 0.5013709361535449, + "grad_norm": 7.113609790802002, + "learning_rate": 9.436628977123324e-06, + "loss": 1.6931, + "step": 960 + }, + { + "epoch": 0.506593550071811, + "grad_norm": 8.182168006896973, + "learning_rate": 9.430055219563503e-06, + "loss": 1.3979, + "step": 970 + }, + { + "epoch": 0.511816163990077, + "grad_norm": 6.154737949371338, + "learning_rate": 9.423481462003682e-06, + "loss": 1.6433, + "step": 980 + }, + { + "epoch": 0.5170387779083431, + "grad_norm": 7.272144794464111, + "learning_rate": 9.41690770444386e-06, + "loss": 1.6019, + "step": 990 + }, + { + "epoch": 0.5222613918266092, + "grad_norm": 9.126653671264648, + "learning_rate": 9.41033394688404e-06, + "loss": 1.4966, + "step": 1000 + }, + { + "epoch": 0.5222613918266092, + "eval_loss": 1.519429087638855, + "eval_runtime": 46.2139, + "eval_samples_per_second": 36.829, + "eval_steps_per_second": 4.609, + "step": 1000 + }, + { + "epoch": 0.5274840057448753, + "grad_norm": 6.704170227050781, + "learning_rate": 9.403760189324219e-06, + "loss": 1.3031, + "step": 1010 + }, + { + "epoch": 0.5327066196631414, + "grad_norm": 7.243802070617676, + "learning_rate": 9.397186431764396e-06, + "loss": 1.6704, + "step": 1020 + }, + { + "epoch": 0.5379292335814075, + "grad_norm": 8.116981506347656, + "learning_rate": 9.390612674204576e-06, + "loss": 1.6063, + "step": 1030 + }, + { + "epoch": 0.5431518474996736, + "grad_norm": 8.39439868927002, + "learning_rate": 9.384038916644755e-06, + "loss": 1.6131, + "step": 1040 + }, + { + "epoch": 0.5483744614179397, + "grad_norm": 7.224155426025391, + "learning_rate": 9.377465159084934e-06, + "loss": 1.4247, + "step": 1050 + }, + { + "epoch": 0.5535970753362057, + "grad_norm": 7.421390056610107, + "learning_rate": 9.370891401525112e-06, + "loss": 1.4148, + "step": 1060 + }, + { + "epoch": 0.5588196892544719, + "grad_norm": 7.829183578491211, + "learning_rate": 9.364317643965291e-06, + "loss": 1.3336, + "step": 1070 + }, + { + "epoch": 0.564042303172738, + "grad_norm": 7.551697254180908, + "learning_rate": 9.35774388640547e-06, + "loss": 1.3667, + "step": 1080 + }, + { + "epoch": 0.569264917091004, + "grad_norm": 10.012944221496582, + "learning_rate": 9.351170128845648e-06, + "loss": 1.4699, + "step": 1090 + }, + { + "epoch": 0.5744875310092702, + "grad_norm": 7.096408367156982, + "learning_rate": 9.344596371285828e-06, + "loss": 1.3641, + "step": 1100 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 6.685956954956055, + "learning_rate": 9.338022613726007e-06, + "loss": 1.4036, + "step": 1110 + }, + { + "epoch": 0.5849327588458023, + "grad_norm": 6.154745101928711, + "learning_rate": 9.331448856166185e-06, + "loss": 1.3924, + "step": 1120 + }, + { + "epoch": 0.5901553727640684, + "grad_norm": 8.996173858642578, + "learning_rate": 9.324875098606364e-06, + "loss": 1.3899, + "step": 1130 + }, + { + "epoch": 0.5953779866823345, + "grad_norm": 7.491435527801514, + "learning_rate": 9.318301341046544e-06, + "loss": 1.406, + "step": 1140 + }, + { + "epoch": 0.6006006006006006, + "grad_norm": 8.112247467041016, + "learning_rate": 9.311727583486721e-06, + "loss": 1.3555, + "step": 1150 + }, + { + "epoch": 0.6058232145188667, + "grad_norm": 6.320436000823975, + "learning_rate": 9.3051538259269e-06, + "loss": 1.4438, + "step": 1160 + }, + { + "epoch": 0.6110458284371327, + "grad_norm": 8.957696914672852, + "learning_rate": 9.29858006836708e-06, + "loss": 1.5721, + "step": 1170 + }, + { + "epoch": 0.6162684423553989, + "grad_norm": 7.852613925933838, + "learning_rate": 9.292006310807258e-06, + "loss": 1.3379, + "step": 1180 + }, + { + "epoch": 0.621491056273665, + "grad_norm": 8.566577911376953, + "learning_rate": 9.285432553247437e-06, + "loss": 1.2671, + "step": 1190 + }, + { + "epoch": 0.626713670191931, + "grad_norm": 8.17737102508545, + "learning_rate": 9.278858795687615e-06, + "loss": 1.5569, + "step": 1200 + }, + { + "epoch": 0.626713670191931, + "eval_loss": 1.4577302932739258, + "eval_runtime": 46.2429, + "eval_samples_per_second": 36.806, + "eval_steps_per_second": 4.606, + "step": 1200 + }, + { + "epoch": 0.6319362841101972, + "grad_norm": 9.098960876464844, + "learning_rate": 9.272285038127796e-06, + "loss": 1.3189, + "step": 1210 + }, + { + "epoch": 0.6371588980284633, + "grad_norm": 7.19639778137207, + "learning_rate": 9.265711280567973e-06, + "loss": 1.4387, + "step": 1220 + }, + { + "epoch": 0.6423815119467293, + "grad_norm": 12.178171157836914, + "learning_rate": 9.259137523008153e-06, + "loss": 1.3847, + "step": 1230 + }, + { + "epoch": 0.6476041258649954, + "grad_norm": 7.771910667419434, + "learning_rate": 9.252563765448332e-06, + "loss": 1.4767, + "step": 1240 + }, + { + "epoch": 0.6528267397832616, + "grad_norm": 7.271229267120361, + "learning_rate": 9.24599000788851e-06, + "loss": 1.6648, + "step": 1250 + }, + { + "epoch": 0.6580493537015276, + "grad_norm": 6.665337562561035, + "learning_rate": 9.239416250328689e-06, + "loss": 1.5588, + "step": 1260 + }, + { + "epoch": 0.6632719676197937, + "grad_norm": 9.990988731384277, + "learning_rate": 9.232842492768868e-06, + "loss": 1.5604, + "step": 1270 + }, + { + "epoch": 0.6684945815380599, + "grad_norm": 11.0624418258667, + "learning_rate": 9.226268735209046e-06, + "loss": 1.3764, + "step": 1280 + }, + { + "epoch": 0.6737171954563259, + "grad_norm": 7.9689788818359375, + "learning_rate": 9.219694977649225e-06, + "loss": 1.4476, + "step": 1290 + }, + { + "epoch": 0.678939809374592, + "grad_norm": 9.779829978942871, + "learning_rate": 9.213121220089405e-06, + "loss": 1.6465, + "step": 1300 + }, + { + "epoch": 0.684162423292858, + "grad_norm": 13.22938060760498, + "learning_rate": 9.206547462529582e-06, + "loss": 1.3137, + "step": 1310 + }, + { + "epoch": 0.6893850372111242, + "grad_norm": 7.279387950897217, + "learning_rate": 9.199973704969762e-06, + "loss": 1.3996, + "step": 1320 + }, + { + "epoch": 0.6946076511293903, + "grad_norm": 8.289338111877441, + "learning_rate": 9.19339994740994e-06, + "loss": 1.6241, + "step": 1330 + }, + { + "epoch": 0.6998302650476563, + "grad_norm": 6.582716464996338, + "learning_rate": 9.186826189850119e-06, + "loss": 1.4579, + "step": 1340 + }, + { + "epoch": 0.7050528789659224, + "grad_norm": 6.584691524505615, + "learning_rate": 9.180252432290298e-06, + "loss": 1.4906, + "step": 1350 + }, + { + "epoch": 0.7102754928841886, + "grad_norm": 7.73353910446167, + "learning_rate": 9.173678674730476e-06, + "loss": 1.4636, + "step": 1360 + }, + { + "epoch": 0.7154981068024546, + "grad_norm": 8.247365951538086, + "learning_rate": 9.167104917170655e-06, + "loss": 1.322, + "step": 1370 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 10.286720275878906, + "learning_rate": 9.160531159610834e-06, + "loss": 1.3642, + "step": 1380 + }, + { + "epoch": 0.7259433346389869, + "grad_norm": 8.615984916687012, + "learning_rate": 9.153957402051014e-06, + "loss": 1.2931, + "step": 1390 + }, + { + "epoch": 0.7311659485572529, + "grad_norm": 12.748590469360352, + "learning_rate": 9.147383644491193e-06, + "loss": 1.2309, + "step": 1400 + }, + { + "epoch": 0.7311659485572529, + "eval_loss": 1.4139466285705566, + "eval_runtime": 46.1975, + "eval_samples_per_second": 36.842, + "eval_steps_per_second": 4.611, + "step": 1400 + }, + { + "epoch": 0.736388562475519, + "grad_norm": 7.815068244934082, + "learning_rate": 9.14080988693137e-06, + "loss": 1.4541, + "step": 1410 + }, + { + "epoch": 0.741611176393785, + "grad_norm": 7.568152904510498, + "learning_rate": 9.13423612937155e-06, + "loss": 1.4903, + "step": 1420 + }, + { + "epoch": 0.7468337903120512, + "grad_norm": 7.310227394104004, + "learning_rate": 9.12766237181173e-06, + "loss": 1.4544, + "step": 1430 + }, + { + "epoch": 0.7520564042303173, + "grad_norm": 8.279141426086426, + "learning_rate": 9.121088614251907e-06, + "loss": 1.5115, + "step": 1440 + }, + { + "epoch": 0.7572790181485833, + "grad_norm": 6.896315574645996, + "learning_rate": 9.114514856692086e-06, + "loss": 1.3089, + "step": 1450 + }, + { + "epoch": 0.7625016320668495, + "grad_norm": 8.386058807373047, + "learning_rate": 9.107941099132264e-06, + "loss": 1.2252, + "step": 1460 + }, + { + "epoch": 0.7677242459851156, + "grad_norm": 6.841330528259277, + "learning_rate": 9.101367341572443e-06, + "loss": 1.519, + "step": 1470 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 12.19370174407959, + "learning_rate": 9.094793584012623e-06, + "loss": 1.2249, + "step": 1480 + }, + { + "epoch": 0.7781694738216477, + "grad_norm": 7.211881637573242, + "learning_rate": 9.0882198264528e-06, + "loss": 1.3279, + "step": 1490 + }, + { + "epoch": 0.7833920877399139, + "grad_norm": 7.7247724533081055, + "learning_rate": 9.08164606889298e-06, + "loss": 1.3658, + "step": 1500 + }, + { + "epoch": 0.7886147016581799, + "grad_norm": 6.654493808746338, + "learning_rate": 9.075072311333159e-06, + "loss": 1.2192, + "step": 1510 + }, + { + "epoch": 0.793837315576446, + "grad_norm": 9.495121955871582, + "learning_rate": 9.068498553773337e-06, + "loss": 1.4915, + "step": 1520 + }, + { + "epoch": 0.799059929494712, + "grad_norm": 8.51004409790039, + "learning_rate": 9.061924796213516e-06, + "loss": 1.3852, + "step": 1530 + }, + { + "epoch": 0.8042825434129782, + "grad_norm": 8.426909446716309, + "learning_rate": 9.055351038653695e-06, + "loss": 1.5215, + "step": 1540 + }, + { + "epoch": 0.8095051573312443, + "grad_norm": 5.057459831237793, + "learning_rate": 9.048777281093875e-06, + "loss": 1.3362, + "step": 1550 + }, + { + "epoch": 0.8147277712495103, + "grad_norm": 7.3518571853637695, + "learning_rate": 9.042203523534054e-06, + "loss": 1.5133, + "step": 1560 + }, + { + "epoch": 0.8199503851677765, + "grad_norm": 7.912439823150635, + "learning_rate": 9.035629765974232e-06, + "loss": 1.3317, + "step": 1570 + }, + { + "epoch": 0.8251729990860426, + "grad_norm": 9.68945598602295, + "learning_rate": 9.029056008414411e-06, + "loss": 1.3286, + "step": 1580 + }, + { + "epoch": 0.8303956130043086, + "grad_norm": 6.489112377166748, + "learning_rate": 9.022482250854589e-06, + "loss": 1.2283, + "step": 1590 + }, + { + "epoch": 0.8356182269225747, + "grad_norm": 7.945755481719971, + "learning_rate": 9.015908493294768e-06, + "loss": 1.2544, + "step": 1600 + }, + { + "epoch": 0.8356182269225747, + "eval_loss": 1.3756215572357178, + "eval_runtime": 46.2433, + "eval_samples_per_second": 36.805, + "eval_steps_per_second": 4.606, + "step": 1600 + }, + { + "epoch": 0.8408408408408409, + "grad_norm": 7.036093235015869, + "learning_rate": 9.009334735734947e-06, + "loss": 1.4884, + "step": 1610 + }, + { + "epoch": 0.8460634547591069, + "grad_norm": 5.541379928588867, + "learning_rate": 9.002760978175125e-06, + "loss": 1.4642, + "step": 1620 + }, + { + "epoch": 0.851286068677373, + "grad_norm": 7.85528564453125, + "learning_rate": 8.996187220615304e-06, + "loss": 1.4053, + "step": 1630 + }, + { + "epoch": 0.8565086825956392, + "grad_norm": 7.868051052093506, + "learning_rate": 8.989613463055484e-06, + "loss": 1.1438, + "step": 1640 + }, + { + "epoch": 0.8617312965139052, + "grad_norm": 6.309744834899902, + "learning_rate": 8.983039705495661e-06, + "loss": 1.2862, + "step": 1650 + }, + { + "epoch": 0.8669539104321713, + "grad_norm": 8.591092109680176, + "learning_rate": 8.97646594793584e-06, + "loss": 1.3227, + "step": 1660 + }, + { + "epoch": 0.8721765243504374, + "grad_norm": 7.7726149559021, + "learning_rate": 8.96989219037602e-06, + "loss": 1.1849, + "step": 1670 + }, + { + "epoch": 0.8773991382687035, + "grad_norm": 7.683166980743408, + "learning_rate": 8.963318432816198e-06, + "loss": 1.598, + "step": 1680 + }, + { + "epoch": 0.8826217521869696, + "grad_norm": 8.678107261657715, + "learning_rate": 8.956744675256377e-06, + "loss": 1.3754, + "step": 1690 + }, + { + "epoch": 0.8878443661052356, + "grad_norm": 6.267312049865723, + "learning_rate": 8.950170917696556e-06, + "loss": 1.2975, + "step": 1700 + }, + { + "epoch": 0.8930669800235017, + "grad_norm": 7.084597587585449, + "learning_rate": 8.943597160136734e-06, + "loss": 1.1706, + "step": 1710 + }, + { + "epoch": 0.8982895939417679, + "grad_norm": 7.456942558288574, + "learning_rate": 8.937023402576913e-06, + "loss": 1.3991, + "step": 1720 + }, + { + "epoch": 0.903512207860034, + "grad_norm": 7.471573829650879, + "learning_rate": 8.930449645017093e-06, + "loss": 1.2524, + "step": 1730 + }, + { + "epoch": 0.9087348217783, + "grad_norm": 10.539592742919922, + "learning_rate": 8.923875887457272e-06, + "loss": 1.4209, + "step": 1740 + }, + { + "epoch": 0.9139574356965662, + "grad_norm": 7.277865886688232, + "learning_rate": 8.91730212989745e-06, + "loss": 1.3516, + "step": 1750 + }, + { + "epoch": 0.9191800496148322, + "grad_norm": 9.74389362335205, + "learning_rate": 8.910728372337629e-06, + "loss": 1.3255, + "step": 1760 + }, + { + "epoch": 0.9244026635330983, + "grad_norm": 10.399345397949219, + "learning_rate": 8.904154614777808e-06, + "loss": 1.3837, + "step": 1770 + }, + { + "epoch": 0.9296252774513644, + "grad_norm": 7.868115425109863, + "learning_rate": 8.897580857217986e-06, + "loss": 1.3837, + "step": 1780 + }, + { + "epoch": 0.9348478913696305, + "grad_norm": 6.750741004943848, + "learning_rate": 8.891007099658165e-06, + "loss": 1.2667, + "step": 1790 + }, + { + "epoch": 0.9400705052878966, + "grad_norm": 6.125620365142822, + "learning_rate": 8.884433342098345e-06, + "loss": 1.338, + "step": 1800 + }, + { + "epoch": 0.9400705052878966, + "eval_loss": 1.3472024202346802, + "eval_runtime": 46.2182, + "eval_samples_per_second": 36.825, + "eval_steps_per_second": 4.609, + "step": 1800 + }, + { + "epoch": 0.9452931192061627, + "grad_norm": 10.77094841003418, + "learning_rate": 8.877859584538522e-06, + "loss": 1.3902, + "step": 1810 + }, + { + "epoch": 0.9505157331244288, + "grad_norm": 5.590978145599365, + "learning_rate": 8.871285826978702e-06, + "loss": 1.2914, + "step": 1820 + }, + { + "epoch": 0.9557383470426949, + "grad_norm": 6.462856292724609, + "learning_rate": 8.864712069418881e-06, + "loss": 1.5046, + "step": 1830 + }, + { + "epoch": 0.960960960960961, + "grad_norm": 7.771232604980469, + "learning_rate": 8.858138311859059e-06, + "loss": 1.4848, + "step": 1840 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 7.693990230560303, + "learning_rate": 8.851564554299238e-06, + "loss": 1.396, + "step": 1850 + }, + { + "epoch": 0.9714061887974932, + "grad_norm": 6.7986159324646, + "learning_rate": 8.844990796739416e-06, + "loss": 1.2407, + "step": 1860 + }, + { + "epoch": 0.9766288027157592, + "grad_norm": 8.107544898986816, + "learning_rate": 8.838417039179595e-06, + "loss": 1.1383, + "step": 1870 + }, + { + "epoch": 0.9818514166340253, + "grad_norm": 8.577611923217773, + "learning_rate": 8.831843281619774e-06, + "loss": 1.4034, + "step": 1880 + }, + { + "epoch": 0.9870740305522914, + "grad_norm": 8.117776870727539, + "learning_rate": 8.825269524059952e-06, + "loss": 1.5123, + "step": 1890 + }, + { + "epoch": 0.9922966444705575, + "grad_norm": 6.738471984863281, + "learning_rate": 8.818695766500133e-06, + "loss": 1.3251, + "step": 1900 + }, + { + "epoch": 0.9975192583888236, + "grad_norm": 7.631872177124023, + "learning_rate": 8.81212200894031e-06, + "loss": 1.3811, + "step": 1910 + }, + { + "epoch": 1.0031335683509597, + "grad_norm": 7.611673831939697, + "learning_rate": 8.80554825138049e-06, + "loss": 1.1202, + "step": 1920 + }, + { + "epoch": 1.0083561822692257, + "grad_norm": 10.344902992248535, + "learning_rate": 8.79897449382067e-06, + "loss": 1.1347, + "step": 1930 + }, + { + "epoch": 1.0135787961874918, + "grad_norm": 7.126277923583984, + "learning_rate": 8.792400736260847e-06, + "loss": 1.1396, + "step": 1940 + }, + { + "epoch": 1.0188014101057579, + "grad_norm": 5.496697425842285, + "learning_rate": 8.785826978701027e-06, + "loss": 1.1543, + "step": 1950 + }, + { + "epoch": 1.024024024024024, + "grad_norm": 7.818673610687256, + "learning_rate": 8.779253221141206e-06, + "loss": 1.0574, + "step": 1960 + }, + { + "epoch": 1.0292466379422902, + "grad_norm": 7.81503438949585, + "learning_rate": 8.772679463581384e-06, + "loss": 1.1081, + "step": 1970 + }, + { + "epoch": 1.0344692518605563, + "grad_norm": 7.696155548095703, + "learning_rate": 8.766105706021563e-06, + "loss": 1.0364, + "step": 1980 + }, + { + "epoch": 1.0396918657788223, + "grad_norm": 6.808041095733643, + "learning_rate": 8.75953194846174e-06, + "loss": 1.2301, + "step": 1990 + }, + { + "epoch": 1.0449144796970884, + "grad_norm": 9.0170316696167, + "learning_rate": 8.75295819090192e-06, + "loss": 0.9611, + "step": 2000 + }, + { + "epoch": 1.0449144796970884, + "eval_loss": 1.3424558639526367, + "eval_runtime": 46.1975, + "eval_samples_per_second": 36.842, + "eval_steps_per_second": 4.611, + "step": 2000 + }, + { + "epoch": 1.0501370936153545, + "grad_norm": 7.797222137451172, + "learning_rate": 8.7463844333421e-06, + "loss": 1.0953, + "step": 2010 + }, + { + "epoch": 1.0553597075336205, + "grad_norm": 8.089154243469238, + "learning_rate": 8.739810675782277e-06, + "loss": 1.1462, + "step": 2020 + }, + { + "epoch": 1.0605823214518866, + "grad_norm": 7.9765625, + "learning_rate": 8.733894293978439e-06, + "loss": 0.9677, + "step": 2030 + }, + { + "epoch": 1.0658049353701529, + "grad_norm": 7.094350814819336, + "learning_rate": 8.727320536418618e-06, + "loss": 1.0856, + "step": 2040 + }, + { + "epoch": 1.071027549288419, + "grad_norm": 6.9282307624816895, + "learning_rate": 8.720746778858796e-06, + "loss": 0.9677, + "step": 2050 + }, + { + "epoch": 1.076250163206685, + "grad_norm": 7.576324462890625, + "learning_rate": 8.714173021298975e-06, + "loss": 0.8877, + "step": 2060 + }, + { + "epoch": 1.081472777124951, + "grad_norm": 6.454078674316406, + "learning_rate": 8.707599263739153e-06, + "loss": 1.0766, + "step": 2070 + }, + { + "epoch": 1.086695391043217, + "grad_norm": 6.857527732849121, + "learning_rate": 8.701025506179332e-06, + "loss": 1.1231, + "step": 2080 + }, + { + "epoch": 1.0919180049614832, + "grad_norm": 9.354379653930664, + "learning_rate": 8.694451748619512e-06, + "loss": 1.0554, + "step": 2090 + }, + { + "epoch": 1.0971406188797492, + "grad_norm": 7.970911502838135, + "learning_rate": 8.68787799105969e-06, + "loss": 1.0672, + "step": 2100 + }, + { + "epoch": 1.1023632327980155, + "grad_norm": 7.781393527984619, + "learning_rate": 8.681304233499869e-06, + "loss": 1.1021, + "step": 2110 + }, + { + "epoch": 1.1075858467162816, + "grad_norm": 11.121278762817383, + "learning_rate": 8.674730475940048e-06, + "loss": 1.0507, + "step": 2120 + }, + { + "epoch": 1.1128084606345476, + "grad_norm": 6.972072124481201, + "learning_rate": 8.668156718380227e-06, + "loss": 0.9596, + "step": 2130 + }, + { + "epoch": 1.1180310745528137, + "grad_norm": 8.896551132202148, + "learning_rate": 8.661582960820407e-06, + "loss": 1.0482, + "step": 2140 + }, + { + "epoch": 1.1232536884710798, + "grad_norm": 7.05515718460083, + "learning_rate": 8.655009203260584e-06, + "loss": 1.0665, + "step": 2150 + }, + { + "epoch": 1.1284763023893458, + "grad_norm": 7.063408851623535, + "learning_rate": 8.648435445700764e-06, + "loss": 0.9991, + "step": 2160 + }, + { + "epoch": 1.1336989163076119, + "grad_norm": 7.457261085510254, + "learning_rate": 8.641861688140943e-06, + "loss": 1.1902, + "step": 2170 + }, + { + "epoch": 1.1389215302258782, + "grad_norm": 8.556297302246094, + "learning_rate": 8.63528793058112e-06, + "loss": 1.2003, + "step": 2180 + }, + { + "epoch": 1.1441441441441442, + "grad_norm": 7.71785306930542, + "learning_rate": 8.6287141730213e-06, + "loss": 1.1433, + "step": 2190 + }, + { + "epoch": 1.1493667580624103, + "grad_norm": 5.20206356048584, + "learning_rate": 8.622140415461478e-06, + "loss": 0.9814, + "step": 2200 + }, + { + "epoch": 1.1493667580624103, + "eval_loss": 1.3300807476043701, + "eval_runtime": 46.2098, + "eval_samples_per_second": 36.832, + "eval_steps_per_second": 4.609, + "step": 2200 + }, + { + "epoch": 1.1545893719806763, + "grad_norm": 10.628904342651367, + "learning_rate": 8.615566657901657e-06, + "loss": 1.1436, + "step": 2210 + }, + { + "epoch": 1.1598119858989424, + "grad_norm": 9.307092666625977, + "learning_rate": 8.608992900341836e-06, + "loss": 1.0982, + "step": 2220 + }, + { + "epoch": 1.1650345998172085, + "grad_norm": 5.672645092010498, + "learning_rate": 8.602419142782014e-06, + "loss": 0.9906, + "step": 2230 + }, + { + "epoch": 1.1702572137354745, + "grad_norm": 7.028037071228027, + "learning_rate": 8.595845385222193e-06, + "loss": 1.1416, + "step": 2240 + }, + { + "epoch": 1.1754798276537408, + "grad_norm": 7.749709606170654, + "learning_rate": 8.589271627662373e-06, + "loss": 1.1821, + "step": 2250 + }, + { + "epoch": 1.1807024415720069, + "grad_norm": 8.712191581726074, + "learning_rate": 8.58269787010255e-06, + "loss": 1.0771, + "step": 2260 + }, + { + "epoch": 1.185925055490273, + "grad_norm": 6.774427890777588, + "learning_rate": 8.57612411254273e-06, + "loss": 1.1228, + "step": 2270 + }, + { + "epoch": 1.191147669408539, + "grad_norm": 9.674090385437012, + "learning_rate": 8.569550354982909e-06, + "loss": 1.0714, + "step": 2280 + }, + { + "epoch": 1.196370283326805, + "grad_norm": 11.087074279785156, + "learning_rate": 8.562976597423088e-06, + "loss": 1.0104, + "step": 2290 + }, + { + "epoch": 1.2015928972450711, + "grad_norm": 7.747646331787109, + "learning_rate": 8.556402839863268e-06, + "loss": 1.1871, + "step": 2300 + }, + { + "epoch": 1.2068155111633372, + "grad_norm": 6.897591590881348, + "learning_rate": 8.549829082303445e-06, + "loss": 0.9774, + "step": 2310 + }, + { + "epoch": 1.2120381250816032, + "grad_norm": 8.087874412536621, + "learning_rate": 8.543255324743625e-06, + "loss": 1.0487, + "step": 2320 + }, + { + "epoch": 1.2172607389998695, + "grad_norm": 8.251788139343262, + "learning_rate": 8.536681567183802e-06, + "loss": 1.1489, + "step": 2330 + }, + { + "epoch": 1.2224833529181356, + "grad_norm": 8.342131614685059, + "learning_rate": 8.530107809623982e-06, + "loss": 1.0752, + "step": 2340 + }, + { + "epoch": 1.2277059668364017, + "grad_norm": 10.579221725463867, + "learning_rate": 8.523534052064161e-06, + "loss": 1.0812, + "step": 2350 + }, + { + "epoch": 1.2329285807546677, + "grad_norm": 5.970239639282227, + "learning_rate": 8.516960294504339e-06, + "loss": 0.9434, + "step": 2360 + }, + { + "epoch": 1.2381511946729338, + "grad_norm": 7.801792621612549, + "learning_rate": 8.510386536944518e-06, + "loss": 1.1259, + "step": 2370 + }, + { + "epoch": 1.2433738085911998, + "grad_norm": 6.949589252471924, + "learning_rate": 8.503812779384697e-06, + "loss": 1.0598, + "step": 2380 + }, + { + "epoch": 1.248596422509466, + "grad_norm": 7.705820083618164, + "learning_rate": 8.497239021824875e-06, + "loss": 1.0536, + "step": 2390 + }, + { + "epoch": 1.253819036427732, + "grad_norm": 7.674275875091553, + "learning_rate": 8.490665264265054e-06, + "loss": 0.9851, + "step": 2400 + }, + { + "epoch": 1.253819036427732, + "eval_loss": 1.307387351989746, + "eval_runtime": 46.2194, + "eval_samples_per_second": 36.824, + "eval_steps_per_second": 4.608, + "step": 2400 + }, + { + "epoch": 1.2590416503459982, + "grad_norm": 7.880123615264893, + "learning_rate": 8.484091506705234e-06, + "loss": 1.1418, + "step": 2410 + }, + { + "epoch": 1.2642642642642643, + "grad_norm": 8.38881778717041, + "learning_rate": 8.477517749145411e-06, + "loss": 1.1259, + "step": 2420 + }, + { + "epoch": 1.2694868781825304, + "grad_norm": 8.130038261413574, + "learning_rate": 8.47094399158559e-06, + "loss": 1.0038, + "step": 2430 + }, + { + "epoch": 1.2747094921007964, + "grad_norm": 8.897841453552246, + "learning_rate": 8.46437023402577e-06, + "loss": 1.0479, + "step": 2440 + }, + { + "epoch": 1.2799321060190625, + "grad_norm": 5.228075981140137, + "learning_rate": 8.457796476465948e-06, + "loss": 0.8703, + "step": 2450 + }, + { + "epoch": 1.2851547199373288, + "grad_norm": 7.763029098510742, + "learning_rate": 8.451222718906127e-06, + "loss": 0.9687, + "step": 2460 + }, + { + "epoch": 1.2903773338555946, + "grad_norm": 8.072653770446777, + "learning_rate": 8.444648961346307e-06, + "loss": 1.0566, + "step": 2470 + }, + { + "epoch": 1.295599947773861, + "grad_norm": 5.563382148742676, + "learning_rate": 8.438075203786486e-06, + "loss": 1.0113, + "step": 2480 + }, + { + "epoch": 1.300822561692127, + "grad_norm": 7.0666608810424805, + "learning_rate": 8.431501446226664e-06, + "loss": 1.0784, + "step": 2490 + }, + { + "epoch": 1.306045175610393, + "grad_norm": 7.217309474945068, + "learning_rate": 8.424927688666843e-06, + "loss": 1.0815, + "step": 2500 + }, + { + "epoch": 1.311267789528659, + "grad_norm": 7.389529705047607, + "learning_rate": 8.418353931107022e-06, + "loss": 0.9945, + "step": 2510 + }, + { + "epoch": 1.3164904034469251, + "grad_norm": 8.694013595581055, + "learning_rate": 8.4117801735472e-06, + "loss": 0.9028, + "step": 2520 + }, + { + "epoch": 1.3217130173651912, + "grad_norm": 10.238712310791016, + "learning_rate": 8.40520641598738e-06, + "loss": 1.0965, + "step": 2530 + }, + { + "epoch": 1.3269356312834573, + "grad_norm": 5.861429214477539, + "learning_rate": 8.398632658427559e-06, + "loss": 0.9861, + "step": 2540 + }, + { + "epoch": 1.3321582452017235, + "grad_norm": 8.313767433166504, + "learning_rate": 8.392058900867736e-06, + "loss": 0.977, + "step": 2550 + }, + { + "epoch": 1.3373808591199896, + "grad_norm": 8.89799976348877, + "learning_rate": 8.385485143307916e-06, + "loss": 1.0753, + "step": 2560 + }, + { + "epoch": 1.3426034730382557, + "grad_norm": 9.983473777770996, + "learning_rate": 8.378911385748095e-06, + "loss": 1.0387, + "step": 2570 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 7.438155651092529, + "learning_rate": 8.372337628188273e-06, + "loss": 1.0159, + "step": 2580 + }, + { + "epoch": 1.3530487008747878, + "grad_norm": 5.991061687469482, + "learning_rate": 8.365763870628452e-06, + "loss": 0.941, + "step": 2590 + }, + { + "epoch": 1.3582713147930539, + "grad_norm": 8.431944847106934, + "learning_rate": 8.35919011306863e-06, + "loss": 1.0113, + "step": 2600 + }, + { + "epoch": 1.3582713147930539, + "eval_loss": 1.291563630104065, + "eval_runtime": 46.1904, + "eval_samples_per_second": 36.848, + "eval_steps_per_second": 4.611, + "step": 2600 + }, + { + "epoch": 1.36349392871132, + "grad_norm": 6.396197319030762, + "learning_rate": 8.352616355508809e-06, + "loss": 1.0564, + "step": 2610 + }, + { + "epoch": 1.3687165426295862, + "grad_norm": 8.248652458190918, + "learning_rate": 8.346042597948988e-06, + "loss": 1.035, + "step": 2620 + }, + { + "epoch": 1.3739391565478523, + "grad_norm": 10.068962097167969, + "learning_rate": 8.339468840389166e-06, + "loss": 1.1366, + "step": 2630 + }, + { + "epoch": 1.3791617704661183, + "grad_norm": 7.027943134307861, + "learning_rate": 8.332895082829347e-06, + "loss": 1.0452, + "step": 2640 + }, + { + "epoch": 1.3843843843843844, + "grad_norm": 8.624917030334473, + "learning_rate": 8.326321325269525e-06, + "loss": 0.9896, + "step": 2650 + }, + { + "epoch": 1.3896069983026504, + "grad_norm": 10.41063117980957, + "learning_rate": 8.319747567709704e-06, + "loss": 0.8731, + "step": 2660 + }, + { + "epoch": 1.3948296122209165, + "grad_norm": 9.194129943847656, + "learning_rate": 8.313173810149883e-06, + "loss": 1.1184, + "step": 2670 + }, + { + "epoch": 1.4000522261391826, + "grad_norm": 7.924031734466553, + "learning_rate": 8.306600052590061e-06, + "loss": 0.9832, + "step": 2680 + }, + { + "epoch": 1.4052748400574488, + "grad_norm": 7.612093925476074, + "learning_rate": 8.30002629503024e-06, + "loss": 0.8867, + "step": 2690 + }, + { + "epoch": 1.410497453975715, + "grad_norm": 5.16526985168457, + "learning_rate": 8.29345253747042e-06, + "loss": 0.9798, + "step": 2700 + }, + { + "epoch": 1.415720067893981, + "grad_norm": 7.717186450958252, + "learning_rate": 8.286878779910597e-06, + "loss": 0.9842, + "step": 2710 + }, + { + "epoch": 1.420942681812247, + "grad_norm": 7.59783411026001, + "learning_rate": 8.280305022350777e-06, + "loss": 1.1226, + "step": 2720 + }, + { + "epoch": 1.426165295730513, + "grad_norm": 7.938925266265869, + "learning_rate": 8.273731264790954e-06, + "loss": 1.1191, + "step": 2730 + }, + { + "epoch": 1.4313879096487792, + "grad_norm": 5.337181091308594, + "learning_rate": 8.267157507231134e-06, + "loss": 0.9871, + "step": 2740 + }, + { + "epoch": 1.4366105235670452, + "grad_norm": 8.043785095214844, + "learning_rate": 8.260583749671313e-06, + "loss": 1.013, + "step": 2750 + }, + { + "epoch": 1.4418331374853115, + "grad_norm": 8.082165718078613, + "learning_rate": 8.25400999211149e-06, + "loss": 1.1487, + "step": 2760 + }, + { + "epoch": 1.4470557514035776, + "grad_norm": 8.345974922180176, + "learning_rate": 8.24743623455167e-06, + "loss": 1.1525, + "step": 2770 + }, + { + "epoch": 1.4522783653218436, + "grad_norm": 7.04867696762085, + "learning_rate": 8.24086247699185e-06, + "loss": 0.8678, + "step": 2780 + }, + { + "epoch": 1.4575009792401097, + "grad_norm": 10.37601375579834, + "learning_rate": 8.234288719432027e-06, + "loss": 1.0822, + "step": 2790 + }, + { + "epoch": 1.4627235931583757, + "grad_norm": 7.350039958953857, + "learning_rate": 8.227714961872206e-06, + "loss": 0.9471, + "step": 2800 + }, + { + "epoch": 1.4627235931583757, + "eval_loss": 1.282351016998291, + "eval_runtime": 46.2302, + "eval_samples_per_second": 36.816, + "eval_steps_per_second": 4.607, + "step": 2800 + }, + { + "epoch": 1.4679462070766418, + "grad_norm": 8.35920524597168, + "learning_rate": 8.221141204312386e-06, + "loss": 1.0057, + "step": 2810 + }, + { + "epoch": 1.4731688209949079, + "grad_norm": 7.395483016967773, + "learning_rate": 8.214567446752565e-06, + "loss": 1.0621, + "step": 2820 + }, + { + "epoch": 1.4783914349131742, + "grad_norm": 7.589978218078613, + "learning_rate": 8.207993689192744e-06, + "loss": 1.1316, + "step": 2830 + }, + { + "epoch": 1.4836140488314402, + "grad_norm": 6.801819324493408, + "learning_rate": 8.201419931632922e-06, + "loss": 1.0843, + "step": 2840 + }, + { + "epoch": 1.4888366627497063, + "grad_norm": 7.094258785247803, + "learning_rate": 8.194846174073101e-06, + "loss": 1.0899, + "step": 2850 + }, + { + "epoch": 1.4940592766679723, + "grad_norm": 7.351092338562012, + "learning_rate": 8.18827241651328e-06, + "loss": 0.8335, + "step": 2860 + }, + { + "epoch": 1.4992818905862384, + "grad_norm": 7.691007614135742, + "learning_rate": 8.181698658953458e-06, + "loss": 0.951, + "step": 2870 + }, + { + "epoch": 1.5045045045045045, + "grad_norm": 6.353541851043701, + "learning_rate": 8.175124901393638e-06, + "loss": 0.9797, + "step": 2880 + }, + { + "epoch": 1.5097271184227705, + "grad_norm": 8.544390678405762, + "learning_rate": 8.168551143833815e-06, + "loss": 0.9335, + "step": 2890 + }, + { + "epoch": 1.5149497323410368, + "grad_norm": 7.043111324310303, + "learning_rate": 8.161977386273995e-06, + "loss": 1.094, + "step": 2900 + }, + { + "epoch": 1.5201723462593026, + "grad_norm": 8.29129695892334, + "learning_rate": 8.155403628714174e-06, + "loss": 1.031, + "step": 2910 + }, + { + "epoch": 1.525394960177569, + "grad_norm": 9.749144554138184, + "learning_rate": 8.148829871154352e-06, + "loss": 0.989, + "step": 2920 + }, + { + "epoch": 1.530617574095835, + "grad_norm": 7.455598831176758, + "learning_rate": 8.142256113594531e-06, + "loss": 1.0981, + "step": 2930 + }, + { + "epoch": 1.535840188014101, + "grad_norm": 8.99416732788086, + "learning_rate": 8.13568235603471e-06, + "loss": 0.9406, + "step": 2940 + }, + { + "epoch": 1.541062801932367, + "grad_norm": 8.05884075164795, + "learning_rate": 8.129108598474888e-06, + "loss": 0.9374, + "step": 2950 + }, + { + "epoch": 1.5462854158506332, + "grad_norm": 8.008783340454102, + "learning_rate": 8.122534840915067e-06, + "loss": 1.2935, + "step": 2960 + }, + { + "epoch": 1.5515080297688995, + "grad_norm": 4.450300693511963, + "learning_rate": 8.115961083355247e-06, + "loss": 0.998, + "step": 2970 + }, + { + "epoch": 1.5567306436871653, + "grad_norm": 8.787833213806152, + "learning_rate": 8.109387325795426e-06, + "loss": 1.1033, + "step": 2980 + }, + { + "epoch": 1.5619532576054316, + "grad_norm": 8.741140365600586, + "learning_rate": 8.102813568235605e-06, + "loss": 1.0123, + "step": 2990 + }, + { + "epoch": 1.5671758715236976, + "grad_norm": 7.553483486175537, + "learning_rate": 8.096239810675783e-06, + "loss": 0.9479, + "step": 3000 + }, + { + "epoch": 1.5671758715236976, + "eval_loss": 1.2631675004959106, + "eval_runtime": 46.3906, + "eval_samples_per_second": 36.688, + "eval_steps_per_second": 4.591, + "step": 3000 + }, + { + "epoch": 1.5723984854419637, + "grad_norm": 7.2385478019714355, + "learning_rate": 8.089666053115962e-06, + "loss": 0.9502, + "step": 3010 + }, + { + "epoch": 1.5776210993602298, + "grad_norm": 7.736097812652588, + "learning_rate": 8.08309229555614e-06, + "loss": 1.0187, + "step": 3020 + }, + { + "epoch": 1.5828437132784958, + "grad_norm": 10.500167846679688, + "learning_rate": 8.07651853799632e-06, + "loss": 1.1638, + "step": 3030 + }, + { + "epoch": 1.588066327196762, + "grad_norm": 7.112147808074951, + "learning_rate": 8.069944780436499e-06, + "loss": 0.9987, + "step": 3040 + }, + { + "epoch": 1.593288941115028, + "grad_norm": 7.6480536460876465, + "learning_rate": 8.063371022876676e-06, + "loss": 0.9721, + "step": 3050 + }, + { + "epoch": 1.5985115550332942, + "grad_norm": 5.637914657592773, + "learning_rate": 8.056797265316856e-06, + "loss": 1.0136, + "step": 3060 + }, + { + "epoch": 1.6037341689515603, + "grad_norm": 9.559964179992676, + "learning_rate": 8.050223507757035e-06, + "loss": 0.99, + "step": 3070 + }, + { + "epoch": 1.6089567828698264, + "grad_norm": 9.080857276916504, + "learning_rate": 8.043649750197213e-06, + "loss": 0.9547, + "step": 3080 + }, + { + "epoch": 1.6141793967880924, + "grad_norm": 7.321665287017822, + "learning_rate": 8.037075992637392e-06, + "loss": 1.1887, + "step": 3090 + }, + { + "epoch": 1.6194020107063585, + "grad_norm": 7.967632293701172, + "learning_rate": 8.030502235077571e-06, + "loss": 1.0033, + "step": 3100 + }, + { + "epoch": 1.6246246246246248, + "grad_norm": 8.761786460876465, + "learning_rate": 8.023928477517749e-06, + "loss": 1.0964, + "step": 3110 + }, + { + "epoch": 1.6298472385428906, + "grad_norm": 8.108577728271484, + "learning_rate": 8.017354719957928e-06, + "loss": 0.9006, + "step": 3120 + }, + { + "epoch": 1.6350698524611569, + "grad_norm": 9.67638874053955, + "learning_rate": 8.010780962398108e-06, + "loss": 0.9444, + "step": 3130 + }, + { + "epoch": 1.640292466379423, + "grad_norm": 5.613959312438965, + "learning_rate": 8.004207204838285e-06, + "loss": 0.9477, + "step": 3140 + }, + { + "epoch": 1.645515080297689, + "grad_norm": 10.248137474060059, + "learning_rate": 7.997633447278465e-06, + "loss": 0.973, + "step": 3150 + }, + { + "epoch": 1.650737694215955, + "grad_norm": 8.074259757995605, + "learning_rate": 7.991059689718644e-06, + "loss": 0.9694, + "step": 3160 + }, + { + "epoch": 1.6559603081342211, + "grad_norm": 7.905124187469482, + "learning_rate": 7.984485932158824e-06, + "loss": 0.9383, + "step": 3170 + }, + { + "epoch": 1.6611829220524874, + "grad_norm": 7.854189395904541, + "learning_rate": 7.977912174599001e-06, + "loss": 0.9687, + "step": 3180 + }, + { + "epoch": 1.6664055359707532, + "grad_norm": 8.677661895751953, + "learning_rate": 7.97133841703918e-06, + "loss": 1.0772, + "step": 3190 + }, + { + "epoch": 1.6716281498890195, + "grad_norm": 6.915931701660156, + "learning_rate": 7.96476465947936e-06, + "loss": 0.9937, + "step": 3200 + }, + { + "epoch": 1.6716281498890195, + "eval_loss": 1.2522579431533813, + "eval_runtime": 46.2527, + "eval_samples_per_second": 36.798, + "eval_steps_per_second": 4.605, + "step": 3200 + }, + { + "epoch": 1.6768507638072856, + "grad_norm": 7.813302993774414, + "learning_rate": 7.958190901919538e-06, + "loss": 0.9355, + "step": 3210 + }, + { + "epoch": 1.6820733777255517, + "grad_norm": 7.921642780303955, + "learning_rate": 7.951617144359717e-06, + "loss": 1.0172, + "step": 3220 + }, + { + "epoch": 1.6872959916438177, + "grad_norm": 6.058948040008545, + "learning_rate": 7.945043386799896e-06, + "loss": 0.922, + "step": 3230 + }, + { + "epoch": 1.6925186055620838, + "grad_norm": 6.718349456787109, + "learning_rate": 7.938469629240074e-06, + "loss": 0.9229, + "step": 3240 + }, + { + "epoch": 1.69774121948035, + "grad_norm": 7.593595504760742, + "learning_rate": 7.931895871680253e-06, + "loss": 0.8853, + "step": 3250 + }, + { + "epoch": 1.702963833398616, + "grad_norm": 8.581092834472656, + "learning_rate": 7.925322114120433e-06, + "loss": 0.9894, + "step": 3260 + }, + { + "epoch": 1.7081864473168822, + "grad_norm": 9.009191513061523, + "learning_rate": 7.91874835656061e-06, + "loss": 0.9499, + "step": 3270 + }, + { + "epoch": 1.7134090612351482, + "grad_norm": 7.549342155456543, + "learning_rate": 7.91217459900079e-06, + "loss": 1.0529, + "step": 3280 + }, + { + "epoch": 1.7186316751534143, + "grad_norm": 7.8952317237854, + "learning_rate": 7.905600841440967e-06, + "loss": 1.0283, + "step": 3290 + }, + { + "epoch": 1.7238542890716804, + "grad_norm": 4.43769645690918, + "learning_rate": 7.899027083881147e-06, + "loss": 0.8977, + "step": 3300 + }, + { + "epoch": 1.7290769029899464, + "grad_norm": 9.048827171325684, + "learning_rate": 7.892453326321326e-06, + "loss": 1.0214, + "step": 3310 + }, + { + "epoch": 1.7342995169082127, + "grad_norm": 8.52785873413086, + "learning_rate": 7.885879568761504e-06, + "loss": 1.0758, + "step": 3320 + }, + { + "epoch": 1.7395221308264786, + "grad_norm": 9.804828643798828, + "learning_rate": 7.879305811201685e-06, + "loss": 1.0046, + "step": 3330 + }, + { + "epoch": 1.7447447447447448, + "grad_norm": 5.0326642990112305, + "learning_rate": 7.872732053641862e-06, + "loss": 1.0151, + "step": 3340 + }, + { + "epoch": 1.7499673586630107, + "grad_norm": 8.352339744567871, + "learning_rate": 7.866158296082042e-06, + "loss": 1.1599, + "step": 3350 + }, + { + "epoch": 1.755189972581277, + "grad_norm": 8.454385757446289, + "learning_rate": 7.859584538522221e-06, + "loss": 1.021, + "step": 3360 + }, + { + "epoch": 1.760412586499543, + "grad_norm": 6.878560543060303, + "learning_rate": 7.853010780962399e-06, + "loss": 1.0795, + "step": 3370 + }, + { + "epoch": 1.765635200417809, + "grad_norm": 7.027520179748535, + "learning_rate": 7.846437023402578e-06, + "loss": 0.9958, + "step": 3380 + }, + { + "epoch": 1.7708578143360754, + "grad_norm": 6.573943614959717, + "learning_rate": 7.839863265842757e-06, + "loss": 0.9701, + "step": 3390 + }, + { + "epoch": 1.7760804282543412, + "grad_norm": 8.435296058654785, + "learning_rate": 7.833289508282935e-06, + "loss": 1.2055, + "step": 3400 + }, + { + "epoch": 1.7760804282543412, + "eval_loss": 1.2281008958816528, + "eval_runtime": 46.491, + "eval_samples_per_second": 36.609, + "eval_steps_per_second": 4.582, + "step": 3400 + }, + { + "epoch": 1.7813030421726075, + "grad_norm": 8.427298545837402, + "learning_rate": 7.826715750723114e-06, + "loss": 0.8982, + "step": 3410 + }, + { + "epoch": 1.7865256560908733, + "grad_norm": 6.709393501281738, + "learning_rate": 7.820141993163292e-06, + "loss": 0.9613, + "step": 3420 + }, + { + "epoch": 1.7917482700091396, + "grad_norm": 7.055306911468506, + "learning_rate": 7.813568235603471e-06, + "loss": 1.0241, + "step": 3430 + }, + { + "epoch": 1.7969708839274057, + "grad_norm": 6.494068145751953, + "learning_rate": 7.80699447804365e-06, + "loss": 0.9973, + "step": 3440 + }, + { + "epoch": 1.8021934978456717, + "grad_norm": 7.806325912475586, + "learning_rate": 7.800420720483828e-06, + "loss": 1.0703, + "step": 3450 + }, + { + "epoch": 1.807416111763938, + "grad_norm": 7.981148719787598, + "learning_rate": 7.793846962924008e-06, + "loss": 1.0622, + "step": 3460 + }, + { + "epoch": 1.8126387256822039, + "grad_norm": 11.118639945983887, + "learning_rate": 7.787273205364187e-06, + "loss": 0.951, + "step": 3470 + }, + { + "epoch": 1.8178613396004701, + "grad_norm": 6.713343620300293, + "learning_rate": 7.780699447804365e-06, + "loss": 0.979, + "step": 3480 + }, + { + "epoch": 1.823083953518736, + "grad_norm": 7.850152492523193, + "learning_rate": 7.774125690244544e-06, + "loss": 1.0042, + "step": 3490 + }, + { + "epoch": 1.8283065674370023, + "grad_norm": 9.41398811340332, + "learning_rate": 7.767551932684723e-06, + "loss": 0.9911, + "step": 3500 + }, + { + "epoch": 1.8335291813552683, + "grad_norm": 5.341313362121582, + "learning_rate": 7.760978175124903e-06, + "loss": 1.1345, + "step": 3510 + }, + { + "epoch": 1.8387517952735344, + "grad_norm": 7.4594950675964355, + "learning_rate": 7.754404417565082e-06, + "loss": 1.0226, + "step": 3520 + }, + { + "epoch": 1.8439744091918004, + "grad_norm": 7.304319381713867, + "learning_rate": 7.74783066000526e-06, + "loss": 1.033, + "step": 3530 + }, + { + "epoch": 1.8491970231100665, + "grad_norm": 7.15964412689209, + "learning_rate": 7.741256902445439e-06, + "loss": 1.0393, + "step": 3540 + }, + { + "epoch": 1.8544196370283328, + "grad_norm": 7.043346405029297, + "learning_rate": 7.734683144885617e-06, + "loss": 0.9624, + "step": 3550 + }, + { + "epoch": 1.8596422509465986, + "grad_norm": 7.86375617980957, + "learning_rate": 7.728109387325796e-06, + "loss": 0.9693, + "step": 3560 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 7.74959659576416, + "learning_rate": 7.721535629765975e-06, + "loss": 0.9708, + "step": 3570 + }, + { + "epoch": 1.870087478783131, + "grad_norm": 9.733150482177734, + "learning_rate": 7.714961872206153e-06, + "loss": 1.2004, + "step": 3580 + }, + { + "epoch": 1.875310092701397, + "grad_norm": 5.977094650268555, + "learning_rate": 7.708388114646332e-06, + "loss": 0.9956, + "step": 3590 + }, + { + "epoch": 1.880532706619663, + "grad_norm": 5.454172134399414, + "learning_rate": 7.701814357086512e-06, + "loss": 0.9481, + "step": 3600 + }, + { + "epoch": 1.880532706619663, + "eval_loss": 1.215065598487854, + "eval_runtime": 46.3976, + "eval_samples_per_second": 36.683, + "eval_steps_per_second": 4.591, + "step": 3600 + }, + { + "epoch": 1.8857553205379292, + "grad_norm": 7.248448848724365, + "learning_rate": 7.69524059952669e-06, + "loss": 0.9859, + "step": 3610 + }, + { + "epoch": 1.8909779344561954, + "grad_norm": 7.5254669189453125, + "learning_rate": 7.688666841966869e-06, + "loss": 0.9096, + "step": 3620 + }, + { + "epoch": 1.8962005483744613, + "grad_norm": 5.095489978790283, + "learning_rate": 7.682093084407048e-06, + "loss": 0.9428, + "step": 3630 + }, + { + "epoch": 1.9014231622927276, + "grad_norm": 6.897408485412598, + "learning_rate": 7.675519326847226e-06, + "loss": 0.9568, + "step": 3640 + }, + { + "epoch": 1.9066457762109936, + "grad_norm": 12.702198028564453, + "learning_rate": 7.668945569287405e-06, + "loss": 0.8563, + "step": 3650 + }, + { + "epoch": 1.9118683901292597, + "grad_norm": 8.203999519348145, + "learning_rate": 7.662371811727584e-06, + "loss": 0.9841, + "step": 3660 + }, + { + "epoch": 1.9170910040475257, + "grad_norm": 6.950149059295654, + "learning_rate": 7.655798054167764e-06, + "loss": 0.9489, + "step": 3670 + }, + { + "epoch": 1.9223136179657918, + "grad_norm": 9.259650230407715, + "learning_rate": 7.649224296607941e-06, + "loss": 0.9244, + "step": 3680 + }, + { + "epoch": 1.927536231884058, + "grad_norm": 8.09697151184082, + "learning_rate": 7.64265053904812e-06, + "loss": 0.9717, + "step": 3690 + }, + { + "epoch": 1.932758845802324, + "grad_norm": 12.38032054901123, + "learning_rate": 7.6360767814883e-06, + "loss": 0.8376, + "step": 3700 + }, + { + "epoch": 1.9379814597205902, + "grad_norm": 6.492051601409912, + "learning_rate": 7.629503023928478e-06, + "loss": 0.9907, + "step": 3710 + }, + { + "epoch": 1.9432040736388563, + "grad_norm": 6.897779941558838, + "learning_rate": 7.622929266368657e-06, + "loss": 0.8404, + "step": 3720 + }, + { + "epoch": 1.9484266875571223, + "grad_norm": 5.3351240158081055, + "learning_rate": 7.616355508808836e-06, + "loss": 0.8059, + "step": 3730 + }, + { + "epoch": 1.9536493014753884, + "grad_norm": 6.559781551361084, + "learning_rate": 7.609781751249015e-06, + "loss": 1.0037, + "step": 3740 + }, + { + "epoch": 1.9588719153936545, + "grad_norm": 8.780523300170898, + "learning_rate": 7.6032079936891934e-06, + "loss": 0.9477, + "step": 3750 + }, + { + "epoch": 1.9640945293119207, + "grad_norm": 6.601717472076416, + "learning_rate": 7.596634236129372e-06, + "loss": 0.9746, + "step": 3760 + }, + { + "epoch": 1.9693171432301866, + "grad_norm": 6.906192779541016, + "learning_rate": 7.590060478569551e-06, + "loss": 0.9532, + "step": 3770 + }, + { + "epoch": 1.9745397571484529, + "grad_norm": 8.66415023803711, + "learning_rate": 7.58348672100973e-06, + "loss": 0.9325, + "step": 3780 + }, + { + "epoch": 1.979762371066719, + "grad_norm": 7.726676940917969, + "learning_rate": 7.576912963449908e-06, + "loss": 1.1076, + "step": 3790 + }, + { + "epoch": 1.984984984984985, + "grad_norm": 9.380224227905273, + "learning_rate": 7.570339205890087e-06, + "loss": 1.0061, + "step": 3800 + }, + { + "epoch": 1.984984984984985, + "eval_loss": 1.206125020980835, + "eval_runtime": 46.4741, + "eval_samples_per_second": 36.623, + "eval_steps_per_second": 4.583, + "step": 3800 + }, + { + "epoch": 1.990207598903251, + "grad_norm": 8.470906257629395, + "learning_rate": 7.563765448330266e-06, + "loss": 1.0322, + "step": 3810 + }, + { + "epoch": 1.9954302128215171, + "grad_norm": 7.907584190368652, + "learning_rate": 7.557191690770445e-06, + "loss": 0.8892, + "step": 3820 + }, + { + "epoch": 2.001044522783653, + "grad_norm": 5.961308479309082, + "learning_rate": 7.550617933210623e-06, + "loss": 0.8805, + "step": 3830 + }, + { + "epoch": 2.0062671367019194, + "grad_norm": 5.583798408508301, + "learning_rate": 7.544044175650803e-06, + "loss": 0.6595, + "step": 3840 + }, + { + "epoch": 2.011489750620185, + "grad_norm": 7.540070533752441, + "learning_rate": 7.537470418090982e-06, + "loss": 0.7234, + "step": 3850 + }, + { + "epoch": 2.0167123645384515, + "grad_norm": 7.355820655822754, + "learning_rate": 7.53089666053116e-06, + "loss": 0.8085, + "step": 3860 + }, + { + "epoch": 2.0219349784567178, + "grad_norm": 8.060832977294922, + "learning_rate": 7.52432290297134e-06, + "loss": 0.8341, + "step": 3870 + }, + { + "epoch": 2.0271575923749836, + "grad_norm": 7.983650207519531, + "learning_rate": 7.517749145411518e-06, + "loss": 0.8054, + "step": 3880 + }, + { + "epoch": 2.03238020629325, + "grad_norm": 7.6987810134887695, + "learning_rate": 7.511175387851697e-06, + "loss": 0.6774, + "step": 3890 + }, + { + "epoch": 2.0376028202115157, + "grad_norm": 10.233553886413574, + "learning_rate": 7.504601630291876e-06, + "loss": 0.7152, + "step": 3900 + }, + { + "epoch": 2.042825434129782, + "grad_norm": 6.328879356384277, + "learning_rate": 7.4980278727320545e-06, + "loss": 0.8492, + "step": 3910 + }, + { + "epoch": 2.048048048048048, + "grad_norm": 5.196915149688721, + "learning_rate": 7.491454115172233e-06, + "loss": 0.756, + "step": 3920 + }, + { + "epoch": 2.053270661966314, + "grad_norm": 6.743471145629883, + "learning_rate": 7.4848803576124115e-06, + "loss": 0.8221, + "step": 3930 + }, + { + "epoch": 2.0584932758845804, + "grad_norm": 10.30760383605957, + "learning_rate": 7.478306600052591e-06, + "loss": 0.93, + "step": 3940 + }, + { + "epoch": 2.0637158898028463, + "grad_norm": 9.386149406433105, + "learning_rate": 7.471732842492769e-06, + "loss": 0.714, + "step": 3950 + }, + { + "epoch": 2.0689385037211125, + "grad_norm": 7.039390563964844, + "learning_rate": 7.465159084932948e-06, + "loss": 0.74, + "step": 3960 + }, + { + "epoch": 2.0741611176393784, + "grad_norm": 6.082592010498047, + "learning_rate": 7.458585327373127e-06, + "loss": 0.7808, + "step": 3970 + }, + { + "epoch": 2.0793837315576447, + "grad_norm": 8.188385009765625, + "learning_rate": 7.452011569813306e-06, + "loss": 0.8064, + "step": 3980 + }, + { + "epoch": 2.0846063454759105, + "grad_norm": 7.643196105957031, + "learning_rate": 7.445437812253484e-06, + "loss": 0.7309, + "step": 3990 + }, + { + "epoch": 2.089828959394177, + "grad_norm": 8.841309547424316, + "learning_rate": 7.438864054693663e-06, + "loss": 0.7749, + "step": 4000 + }, + { + "epoch": 2.089828959394177, + "eval_loss": 1.2640846967697144, + "eval_runtime": 46.3887, + "eval_samples_per_second": 36.69, + "eval_steps_per_second": 4.592, + "step": 4000 + }, + { + "epoch": 2.095051573312443, + "grad_norm": 7.499510765075684, + "learning_rate": 7.432290297133842e-06, + "loss": 0.6646, + "step": 4010 + }, + { + "epoch": 2.100274187230709, + "grad_norm": 8.0418119430542, + "learning_rate": 7.425716539574021e-06, + "loss": 0.6537, + "step": 4020 + }, + { + "epoch": 2.105496801148975, + "grad_norm": 8.836782455444336, + "learning_rate": 7.419142782014201e-06, + "loss": 0.5779, + "step": 4030 + }, + { + "epoch": 2.110719415067241, + "grad_norm": 4.353819370269775, + "learning_rate": 7.412569024454379e-06, + "loss": 0.7561, + "step": 4040 + }, + { + "epoch": 2.1159420289855073, + "grad_norm": 8.226792335510254, + "learning_rate": 7.405995266894558e-06, + "loss": 0.6666, + "step": 4050 + }, + { + "epoch": 2.121164642903773, + "grad_norm": 8.977775573730469, + "learning_rate": 7.399421509334736e-06, + "loss": 0.7941, + "step": 4060 + }, + { + "epoch": 2.1263872568220394, + "grad_norm": 7.1391706466674805, + "learning_rate": 7.392847751774916e-06, + "loss": 0.8272, + "step": 4070 + }, + { + "epoch": 2.1316098707403057, + "grad_norm": 7.537032604217529, + "learning_rate": 7.386273994215094e-06, + "loss": 0.7583, + "step": 4080 + }, + { + "epoch": 2.1368324846585716, + "grad_norm": 9.434386253356934, + "learning_rate": 7.379700236655273e-06, + "loss": 0.6792, + "step": 4090 + }, + { + "epoch": 2.142055098576838, + "grad_norm": 8.694379806518555, + "learning_rate": 7.373126479095452e-06, + "loss": 0.8661, + "step": 4100 + }, + { + "epoch": 2.1472777124951037, + "grad_norm": 7.044814586639404, + "learning_rate": 7.3665527215356304e-06, + "loss": 0.6704, + "step": 4110 + }, + { + "epoch": 2.15250032641337, + "grad_norm": 8.827573776245117, + "learning_rate": 7.359978963975809e-06, + "loss": 0.765, + "step": 4120 + }, + { + "epoch": 2.157722940331636, + "grad_norm": 9.122138023376465, + "learning_rate": 7.3534052064159874e-06, + "loss": 0.674, + "step": 4130 + }, + { + "epoch": 2.162945554249902, + "grad_norm": 8.409847259521484, + "learning_rate": 7.346831448856167e-06, + "loss": 0.718, + "step": 4140 + }, + { + "epoch": 2.1681681681681684, + "grad_norm": 6.554537773132324, + "learning_rate": 7.340257691296345e-06, + "loss": 0.7853, + "step": 4150 + }, + { + "epoch": 2.173390782086434, + "grad_norm": 8.142253875732422, + "learning_rate": 7.333683933736524e-06, + "loss": 0.6993, + "step": 4160 + }, + { + "epoch": 2.1786133960047005, + "grad_norm": 6.217634677886963, + "learning_rate": 7.327110176176703e-06, + "loss": 0.7372, + "step": 4170 + }, + { + "epoch": 2.1838360099229663, + "grad_norm": 8.371528625488281, + "learning_rate": 7.320536418616882e-06, + "loss": 0.68, + "step": 4180 + }, + { + "epoch": 2.1890586238412326, + "grad_norm": 7.018899917602539, + "learning_rate": 7.313962661057061e-06, + "loss": 0.7883, + "step": 4190 + }, + { + "epoch": 2.1942812377594985, + "grad_norm": 7.779040336608887, + "learning_rate": 7.30738890349724e-06, + "loss": 0.7449, + "step": 4200 + }, + { + "epoch": 2.1942812377594985, + "eval_loss": 1.26158607006073, + "eval_runtime": 46.3831, + "eval_samples_per_second": 36.694, + "eval_steps_per_second": 4.592, + "step": 4200 + }, + { + "epoch": 2.1995038516777647, + "grad_norm": 9.133999824523926, + "learning_rate": 7.300815145937419e-06, + "loss": 0.7521, + "step": 4210 + }, + { + "epoch": 2.204726465596031, + "grad_norm": 10.447097778320312, + "learning_rate": 7.294241388377597e-06, + "loss": 0.8424, + "step": 4220 + }, + { + "epoch": 2.209949079514297, + "grad_norm": 8.978377342224121, + "learning_rate": 7.287667630817777e-06, + "loss": 0.7258, + "step": 4230 + }, + { + "epoch": 2.215171693432563, + "grad_norm": 9.498069763183594, + "learning_rate": 7.281093873257955e-06, + "loss": 0.7952, + "step": 4240 + }, + { + "epoch": 2.220394307350829, + "grad_norm": 7.864907264709473, + "learning_rate": 7.274520115698134e-06, + "loss": 0.7921, + "step": 4250 + }, + { + "epoch": 2.2256169212690953, + "grad_norm": 8.536930084228516, + "learning_rate": 7.267946358138312e-06, + "loss": 0.8745, + "step": 4260 + }, + { + "epoch": 2.230839535187361, + "grad_norm": 8.639608383178711, + "learning_rate": 7.2613726005784915e-06, + "loss": 0.7734, + "step": 4270 + }, + { + "epoch": 2.2360621491056274, + "grad_norm": 9.74687385559082, + "learning_rate": 7.255456218774653e-06, + "loss": 0.7512, + "step": 4280 + }, + { + "epoch": 2.2412847630238932, + "grad_norm": 5.085870742797852, + "learning_rate": 7.248882461214831e-06, + "loss": 0.7072, + "step": 4290 + }, + { + "epoch": 2.2465073769421595, + "grad_norm": 8.538248062133789, + "learning_rate": 7.24230870365501e-06, + "loss": 0.7686, + "step": 4300 + }, + { + "epoch": 2.251729990860426, + "grad_norm": 7.15536642074585, + "learning_rate": 7.235734946095189e-06, + "loss": 0.7064, + "step": 4310 + }, + { + "epoch": 2.2569526047786916, + "grad_norm": 6.049582004547119, + "learning_rate": 7.229161188535368e-06, + "loss": 0.8178, + "step": 4320 + }, + { + "epoch": 2.262175218696958, + "grad_norm": 7.065965175628662, + "learning_rate": 7.222587430975546e-06, + "loss": 0.676, + "step": 4330 + }, + { + "epoch": 2.2673978326152238, + "grad_norm": 8.788618087768555, + "learning_rate": 7.216013673415725e-06, + "loss": 0.743, + "step": 4340 + }, + { + "epoch": 2.27262044653349, + "grad_norm": 7.415438175201416, + "learning_rate": 7.209439915855904e-06, + "loss": 0.7076, + "step": 4350 + }, + { + "epoch": 2.2778430604517563, + "grad_norm": 5.244143486022949, + "learning_rate": 7.2028661582960824e-06, + "loss": 0.6873, + "step": 4360 + }, + { + "epoch": 2.283065674370022, + "grad_norm": 8.050501823425293, + "learning_rate": 7.196292400736261e-06, + "loss": 0.7375, + "step": 4370 + }, + { + "epoch": 2.2882882882882885, + "grad_norm": 9.382434844970703, + "learning_rate": 7.18971864317644e-06, + "loss": 0.7662, + "step": 4380 + }, + { + "epoch": 2.2935109022065543, + "grad_norm": 9.074572563171387, + "learning_rate": 7.183144885616619e-06, + "loss": 0.7775, + "step": 4390 + }, + { + "epoch": 2.2987335161248206, + "grad_norm": 9.558895111083984, + "learning_rate": 7.176571128056797e-06, + "loss": 0.8324, + "step": 4400 + }, + { + "epoch": 2.2987335161248206, + "eval_loss": 1.246055006980896, + "eval_runtime": 46.3387, + "eval_samples_per_second": 36.73, + "eval_steps_per_second": 4.597, + "step": 4400 + }, + { + "epoch": 2.3039561300430864, + "grad_norm": 7.064749717712402, + "learning_rate": 7.1699973704969775e-06, + "loss": 0.6728, + "step": 4410 + }, + { + "epoch": 2.3091787439613527, + "grad_norm": 8.828902244567871, + "learning_rate": 7.163423612937156e-06, + "loss": 0.8473, + "step": 4420 + }, + { + "epoch": 2.3144013578796185, + "grad_norm": 6.347407817840576, + "learning_rate": 7.1568498553773345e-06, + "loss": 0.6786, + "step": 4430 + }, + { + "epoch": 2.319623971797885, + "grad_norm": 5.167905807495117, + "learning_rate": 7.150276097817514e-06, + "loss": 0.6778, + "step": 4440 + }, + { + "epoch": 2.324846585716151, + "grad_norm": 9.10237979888916, + "learning_rate": 7.143702340257692e-06, + "loss": 0.7664, + "step": 4450 + }, + { + "epoch": 2.330069199634417, + "grad_norm": 8.039263725280762, + "learning_rate": 7.137128582697871e-06, + "loss": 0.716, + "step": 4460 + }, + { + "epoch": 2.3352918135526832, + "grad_norm": 5.672702789306641, + "learning_rate": 7.130554825138049e-06, + "loss": 0.7147, + "step": 4470 + }, + { + "epoch": 2.340514427470949, + "grad_norm": 7.255804538726807, + "learning_rate": 7.123981067578229e-06, + "loss": 0.7427, + "step": 4480 + }, + { + "epoch": 2.3457370413892153, + "grad_norm": 7.924871444702148, + "learning_rate": 7.117407310018407e-06, + "loss": 0.7103, + "step": 4490 + }, + { + "epoch": 2.3509596553074816, + "grad_norm": 6.1431708335876465, + "learning_rate": 7.110833552458586e-06, + "loss": 0.6292, + "step": 4500 + }, + { + "epoch": 2.3561822692257475, + "grad_norm": 9.339113235473633, + "learning_rate": 7.104259794898765e-06, + "loss": 0.8119, + "step": 4510 + }, + { + "epoch": 2.3614048831440138, + "grad_norm": 8.561595916748047, + "learning_rate": 7.0976860373389435e-06, + "loss": 0.7959, + "step": 4520 + }, + { + "epoch": 2.3666274970622796, + "grad_norm": 8.174324035644531, + "learning_rate": 7.091112279779122e-06, + "loss": 0.6731, + "step": 4530 + }, + { + "epoch": 2.371850110980546, + "grad_norm": 8.378727912902832, + "learning_rate": 7.0845385222193005e-06, + "loss": 0.707, + "step": 4540 + }, + { + "epoch": 2.3770727248988117, + "grad_norm": 8.307716369628906, + "learning_rate": 7.07796476465948e-06, + "loss": 0.8, + "step": 4550 + }, + { + "epoch": 2.382295338817078, + "grad_norm": 9.376656532287598, + "learning_rate": 7.071391007099658e-06, + "loss": 0.832, + "step": 4560 + }, + { + "epoch": 2.387517952735344, + "grad_norm": 8.780073165893555, + "learning_rate": 7.064817249539837e-06, + "loss": 0.6899, + "step": 4570 + }, + { + "epoch": 2.39274056665361, + "grad_norm": 11.08247184753418, + "learning_rate": 7.058243491980017e-06, + "loss": 0.6915, + "step": 4580 + }, + { + "epoch": 2.3979631805718764, + "grad_norm": 7.881500720977783, + "learning_rate": 7.0516697344201955e-06, + "loss": 0.6109, + "step": 4590 + }, + { + "epoch": 2.4031857944901422, + "grad_norm": 8.939738273620605, + "learning_rate": 7.045095976860374e-06, + "loss": 0.6944, + "step": 4600 + }, + { + "epoch": 2.4031857944901422, + "eval_loss": 1.247771143913269, + "eval_runtime": 46.3198, + "eval_samples_per_second": 36.745, + "eval_steps_per_second": 4.598, + "step": 4600 + }, + { + "epoch": 2.4084084084084085, + "grad_norm": 10.969844818115234, + "learning_rate": 7.038522219300553e-06, + "loss": 0.7537, + "step": 4610 + }, + { + "epoch": 2.4136310223266744, + "grad_norm": 7.069279670715332, + "learning_rate": 7.031948461740732e-06, + "loss": 0.6966, + "step": 4620 + }, + { + "epoch": 2.4188536362449407, + "grad_norm": 9.71384334564209, + "learning_rate": 7.02537470418091e-06, + "loss": 0.7609, + "step": 4630 + }, + { + "epoch": 2.4240762501632065, + "grad_norm": 6.699389934539795, + "learning_rate": 7.01880094662109e-06, + "loss": 0.6327, + "step": 4640 + }, + { + "epoch": 2.4292988640814728, + "grad_norm": 8.881025314331055, + "learning_rate": 7.012227189061268e-06, + "loss": 0.8115, + "step": 4650 + }, + { + "epoch": 2.434521477999739, + "grad_norm": 6.328512191772461, + "learning_rate": 7.005653431501447e-06, + "loss": 0.7266, + "step": 4660 + }, + { + "epoch": 2.439744091918005, + "grad_norm": 6.8868279457092285, + "learning_rate": 6.999079673941625e-06, + "loss": 0.7584, + "step": 4670 + }, + { + "epoch": 2.444966705836271, + "grad_norm": 8.252555847167969, + "learning_rate": 6.992505916381805e-06, + "loss": 0.7738, + "step": 4680 + }, + { + "epoch": 2.450189319754537, + "grad_norm": 6.152466297149658, + "learning_rate": 6.985932158821983e-06, + "loss": 0.7227, + "step": 4690 + }, + { + "epoch": 2.4554119336728033, + "grad_norm": 8.81529426574707, + "learning_rate": 6.979358401262162e-06, + "loss": 0.7793, + "step": 4700 + }, + { + "epoch": 2.460634547591069, + "grad_norm": 11.22712516784668, + "learning_rate": 6.972784643702341e-06, + "loss": 0.7545, + "step": 4710 + }, + { + "epoch": 2.4658571615093354, + "grad_norm": 8.15162181854248, + "learning_rate": 6.9662108861425194e-06, + "loss": 0.8213, + "step": 4720 + }, + { + "epoch": 2.4710797754276017, + "grad_norm": 10.381488800048828, + "learning_rate": 6.959637128582698e-06, + "loss": 0.7706, + "step": 4730 + }, + { + "epoch": 2.4763023893458675, + "grad_norm": 5.261969566345215, + "learning_rate": 6.9530633710228764e-06, + "loss": 0.7216, + "step": 4740 + }, + { + "epoch": 2.481525003264134, + "grad_norm": 7.4040117263793945, + "learning_rate": 6.946489613463056e-06, + "loss": 0.6696, + "step": 4750 + }, + { + "epoch": 2.4867476171823997, + "grad_norm": 6.897395133972168, + "learning_rate": 6.939915855903235e-06, + "loss": 0.7881, + "step": 4760 + }, + { + "epoch": 2.491970231100666, + "grad_norm": 10.105294227600098, + "learning_rate": 6.9333420983434145e-06, + "loss": 0.6668, + "step": 4770 + }, + { + "epoch": 2.497192845018932, + "grad_norm": 6.959880828857422, + "learning_rate": 6.926768340783593e-06, + "loss": 0.7377, + "step": 4780 + }, + { + "epoch": 2.502415458937198, + "grad_norm": 10.688835144042969, + "learning_rate": 6.9201945832237715e-06, + "loss": 0.7178, + "step": 4790 + }, + { + "epoch": 2.507638072855464, + "grad_norm": 9.69953441619873, + "learning_rate": 6.91362082566395e-06, + "loss": 0.7491, + "step": 4800 + }, + { + "epoch": 2.507638072855464, + "eval_loss": 1.2456127405166626, + "eval_runtime": 46.9191, + "eval_samples_per_second": 36.275, + "eval_steps_per_second": 4.54, + "step": 4800 + }, + { + "epoch": 2.51286068677373, + "grad_norm": 7.690919399261475, + "learning_rate": 6.907047068104129e-06, + "loss": 0.7302, + "step": 4810 + }, + { + "epoch": 2.5180833006919965, + "grad_norm": 7.342029571533203, + "learning_rate": 6.900473310544308e-06, + "loss": 0.694, + "step": 4820 + }, + { + "epoch": 2.5233059146102623, + "grad_norm": 7.751829147338867, + "learning_rate": 6.893899552984486e-06, + "loss": 0.6835, + "step": 4830 + }, + { + "epoch": 2.5285285285285286, + "grad_norm": 7.896656036376953, + "learning_rate": 6.887325795424666e-06, + "loss": 0.7437, + "step": 4840 + }, + { + "epoch": 2.5337511424467944, + "grad_norm": 7.776586055755615, + "learning_rate": 6.880752037864844e-06, + "loss": 0.6117, + "step": 4850 + }, + { + "epoch": 2.5389737563650607, + "grad_norm": 9.255610466003418, + "learning_rate": 6.874178280305023e-06, + "loss": 0.7713, + "step": 4860 + }, + { + "epoch": 2.544196370283327, + "grad_norm": 8.877751350402832, + "learning_rate": 6.867604522745201e-06, + "loss": 0.6392, + "step": 4870 + }, + { + "epoch": 2.549418984201593, + "grad_norm": 5.893667221069336, + "learning_rate": 6.8610307651853805e-06, + "loss": 0.758, + "step": 4880 + }, + { + "epoch": 2.554641598119859, + "grad_norm": 5.860952854156494, + "learning_rate": 6.854457007625559e-06, + "loss": 0.6937, + "step": 4890 + }, + { + "epoch": 2.559864212038125, + "grad_norm": 6.213397979736328, + "learning_rate": 6.8478832500657375e-06, + "loss": 0.6206, + "step": 4900 + }, + { + "epoch": 2.5650868259563913, + "grad_norm": 7.741673946380615, + "learning_rate": 6.841309492505917e-06, + "loss": 0.8378, + "step": 4910 + }, + { + "epoch": 2.5703094398746575, + "grad_norm": 7.0414838790893555, + "learning_rate": 6.834735734946095e-06, + "loss": 0.7588, + "step": 4920 + }, + { + "epoch": 2.5755320537929234, + "grad_norm": 8.802332878112793, + "learning_rate": 6.828161977386275e-06, + "loss": 0.7929, + "step": 4930 + }, + { + "epoch": 2.580754667711189, + "grad_norm": 8.432696342468262, + "learning_rate": 6.821588219826454e-06, + "loss": 0.8244, + "step": 4940 + }, + { + "epoch": 2.5859772816294555, + "grad_norm": 4.923203945159912, + "learning_rate": 6.8150144622666325e-06, + "loss": 0.7292, + "step": 4950 + }, + { + "epoch": 2.591199895547722, + "grad_norm": 8.111337661743164, + "learning_rate": 6.808440704706811e-06, + "loss": 0.7311, + "step": 4960 + }, + { + "epoch": 2.5964225094659876, + "grad_norm": 8.650612831115723, + "learning_rate": 6.80186694714699e-06, + "loss": 0.7428, + "step": 4970 + }, + { + "epoch": 2.601645123384254, + "grad_norm": 9.554792404174805, + "learning_rate": 6.795293189587169e-06, + "loss": 0.817, + "step": 4980 + }, + { + "epoch": 2.6068677373025197, + "grad_norm": 9.558310508728027, + "learning_rate": 6.788719432027347e-06, + "loss": 0.7322, + "step": 4990 + }, + { + "epoch": 2.612090351220786, + "grad_norm": 7.577544689178467, + "learning_rate": 6.782145674467526e-06, + "loss": 0.7557, + "step": 5000 + }, + { + "epoch": 2.612090351220786, + "eval_loss": 1.2190589904785156, + "eval_runtime": 46.4012, + "eval_samples_per_second": 36.68, + "eval_steps_per_second": 4.59, + "step": 5000 + }, + { + "epoch": 2.6173129651390523, + "grad_norm": 9.112717628479004, + "learning_rate": 6.775571916907705e-06, + "loss": 0.7545, + "step": 5010 + }, + { + "epoch": 2.622535579057318, + "grad_norm": 9.887261390686035, + "learning_rate": 6.768998159347884e-06, + "loss": 0.7087, + "step": 5020 + }, + { + "epoch": 2.6277581929755844, + "grad_norm": 9.290141105651855, + "learning_rate": 6.762424401788062e-06, + "loss": 0.7582, + "step": 5030 + }, + { + "epoch": 2.6329808068938503, + "grad_norm": 5.160643100738525, + "learning_rate": 6.755850644228242e-06, + "loss": 0.7339, + "step": 5040 + }, + { + "epoch": 2.6382034208121166, + "grad_norm": 7.384275913238525, + "learning_rate": 6.74927688666842e-06, + "loss": 0.6564, + "step": 5050 + }, + { + "epoch": 2.6434260347303824, + "grad_norm": 10.130136489868164, + "learning_rate": 6.742703129108599e-06, + "loss": 0.7619, + "step": 5060 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 5.019765377044678, + "learning_rate": 6.736129371548777e-06, + "loss": 0.6594, + "step": 5070 + }, + { + "epoch": 2.6538712625669145, + "grad_norm": 8.21762752532959, + "learning_rate": 6.7295556139889564e-06, + "loss": 0.7934, + "step": 5080 + }, + { + "epoch": 2.659093876485181, + "grad_norm": 8.9004487991333, + "learning_rate": 6.722981856429135e-06, + "loss": 0.7252, + "step": 5090 + }, + { + "epoch": 2.664316490403447, + "grad_norm": 8.407866477966309, + "learning_rate": 6.716408098869315e-06, + "loss": 0.7514, + "step": 5100 + }, + { + "epoch": 2.669539104321713, + "grad_norm": 12.734916687011719, + "learning_rate": 6.709834341309494e-06, + "loss": 0.7225, + "step": 5110 + }, + { + "epoch": 2.674761718239979, + "grad_norm": 7.250882625579834, + "learning_rate": 6.703260583749672e-06, + "loss": 0.6623, + "step": 5120 + }, + { + "epoch": 2.679984332158245, + "grad_norm": 6.4962663650512695, + "learning_rate": 6.696686826189851e-06, + "loss": 0.7286, + "step": 5130 + }, + { + "epoch": 2.6852069460765113, + "grad_norm": 6.720248222351074, + "learning_rate": 6.69011306863003e-06, + "loss": 0.6411, + "step": 5140 + }, + { + "epoch": 2.6904295599947776, + "grad_norm": 7.895679950714111, + "learning_rate": 6.6835393110702085e-06, + "loss": 0.761, + "step": 5150 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 9.329249382019043, + "learning_rate": 6.676965553510387e-06, + "loss": 0.6963, + "step": 5160 + }, + { + "epoch": 2.7008747878313093, + "grad_norm": 7.362915515899658, + "learning_rate": 6.670391795950566e-06, + "loss": 0.7546, + "step": 5170 + }, + { + "epoch": 2.7060974017495756, + "grad_norm": 9.045941352844238, + "learning_rate": 6.663818038390745e-06, + "loss": 0.6971, + "step": 5180 + }, + { + "epoch": 2.711320015667842, + "grad_norm": 12.443278312683105, + "learning_rate": 6.657244280830923e-06, + "loss": 0.7261, + "step": 5190 + }, + { + "epoch": 2.7165426295861077, + "grad_norm": 9.796110153198242, + "learning_rate": 6.650670523271102e-06, + "loss": 0.7577, + "step": 5200 + }, + { + "epoch": 2.7165426295861077, + "eval_loss": 1.2298212051391602, + "eval_runtime": 46.3629, + "eval_samples_per_second": 36.71, + "eval_steps_per_second": 4.594, + "step": 5200 + }, + { + "epoch": 2.721765243504374, + "grad_norm": 6.790576457977295, + "learning_rate": 6.644096765711281e-06, + "loss": 0.7437, + "step": 5210 + }, + { + "epoch": 2.72698785742264, + "grad_norm": 6.432572841644287, + "learning_rate": 6.63752300815146e-06, + "loss": 0.6898, + "step": 5220 + }, + { + "epoch": 2.732210471340906, + "grad_norm": 6.689311504364014, + "learning_rate": 6.630949250591638e-06, + "loss": 0.736, + "step": 5230 + }, + { + "epoch": 2.7374330852591724, + "grad_norm": 8.386734962463379, + "learning_rate": 6.6243754930318175e-06, + "loss": 0.7809, + "step": 5240 + }, + { + "epoch": 2.7426556991774382, + "grad_norm": 8.303974151611328, + "learning_rate": 6.617801735471996e-06, + "loss": 0.7536, + "step": 5250 + }, + { + "epoch": 2.7478783130957045, + "grad_norm": 10.667426109313965, + "learning_rate": 6.6112279779121745e-06, + "loss": 0.8118, + "step": 5260 + }, + { + "epoch": 2.7531009270139704, + "grad_norm": 6.878983974456787, + "learning_rate": 6.604654220352355e-06, + "loss": 0.7044, + "step": 5270 + }, + { + "epoch": 2.7583235409322366, + "grad_norm": 9.650110244750977, + "learning_rate": 6.598080462792533e-06, + "loss": 0.7329, + "step": 5280 + }, + { + "epoch": 2.763546154850503, + "grad_norm": 6.491201877593994, + "learning_rate": 6.591506705232712e-06, + "loss": 0.6498, + "step": 5290 + }, + { + "epoch": 2.7687687687687688, + "grad_norm": 8.977750778198242, + "learning_rate": 6.584932947672891e-06, + "loss": 0.7035, + "step": 5300 + }, + { + "epoch": 2.7739913826870346, + "grad_norm": 6.382355213165283, + "learning_rate": 6.5783591901130695e-06, + "loss": 0.6769, + "step": 5310 + }, + { + "epoch": 2.779213996605301, + "grad_norm": 9.238271713256836, + "learning_rate": 6.571785432553248e-06, + "loss": 0.7547, + "step": 5320 + }, + { + "epoch": 2.784436610523567, + "grad_norm": 7.663956165313721, + "learning_rate": 6.5652116749934265e-06, + "loss": 0.7363, + "step": 5330 + }, + { + "epoch": 2.789659224441833, + "grad_norm": 10.917364120483398, + "learning_rate": 6.558637917433606e-06, + "loss": 0.7218, + "step": 5340 + }, + { + "epoch": 2.7948818383600993, + "grad_norm": 8.472268104553223, + "learning_rate": 6.552064159873784e-06, + "loss": 0.7329, + "step": 5350 + }, + { + "epoch": 2.800104452278365, + "grad_norm": 9.487913131713867, + "learning_rate": 6.545490402313963e-06, + "loss": 0.7496, + "step": 5360 + }, + { + "epoch": 2.8053270661966314, + "grad_norm": 7.357760906219482, + "learning_rate": 6.538916644754142e-06, + "loss": 0.6209, + "step": 5370 + }, + { + "epoch": 2.8105496801148977, + "grad_norm": 8.204305648803711, + "learning_rate": 6.532342887194321e-06, + "loss": 0.7739, + "step": 5380 + }, + { + "epoch": 2.8157722940331635, + "grad_norm": 6.225192070007324, + "learning_rate": 6.525769129634499e-06, + "loss": 0.6911, + "step": 5390 + }, + { + "epoch": 2.82099490795143, + "grad_norm": 9.028958320617676, + "learning_rate": 6.519195372074678e-06, + "loss": 0.7801, + "step": 5400 + }, + { + "epoch": 2.82099490795143, + "eval_loss": 1.220414400100708, + "eval_runtime": 46.3932, + "eval_samples_per_second": 36.686, + "eval_steps_per_second": 4.591, + "step": 5400 + }, + { + "epoch": 2.8262175218696957, + "grad_norm": 8.968846321105957, + "learning_rate": 6.512621614514857e-06, + "loss": 0.7202, + "step": 5410 + }, + { + "epoch": 2.831440135787962, + "grad_norm": 8.162824630737305, + "learning_rate": 6.506047856955036e-06, + "loss": 0.733, + "step": 5420 + }, + { + "epoch": 2.8366627497062282, + "grad_norm": 8.892393112182617, + "learning_rate": 6.499474099395214e-06, + "loss": 0.8351, + "step": 5430 + }, + { + "epoch": 2.841885363624494, + "grad_norm": 7.610676288604736, + "learning_rate": 6.492900341835394e-06, + "loss": 0.6797, + "step": 5440 + }, + { + "epoch": 2.84710797754276, + "grad_norm": 6.808664798736572, + "learning_rate": 6.486326584275573e-06, + "loss": 0.7261, + "step": 5450 + }, + { + "epoch": 2.852330591461026, + "grad_norm": 12.52797794342041, + "learning_rate": 6.479752826715751e-06, + "loss": 0.7066, + "step": 5460 + }, + { + "epoch": 2.8575532053792925, + "grad_norm": 7.522469520568848, + "learning_rate": 6.473179069155931e-06, + "loss": 0.6178, + "step": 5470 + }, + { + "epoch": 2.8627758192975583, + "grad_norm": 6.208746910095215, + "learning_rate": 6.466605311596109e-06, + "loss": 0.663, + "step": 5480 + }, + { + "epoch": 2.8679984332158246, + "grad_norm": 7.486382007598877, + "learning_rate": 6.460031554036288e-06, + "loss": 0.652, + "step": 5490 + }, + { + "epoch": 2.8732210471340904, + "grad_norm": 7.342718124389648, + "learning_rate": 6.453457796476467e-06, + "loss": 0.7086, + "step": 5500 + }, + { + "epoch": 2.8784436610523567, + "grad_norm": 11.11368465423584, + "learning_rate": 6.4468840389166455e-06, + "loss": 0.7731, + "step": 5510 + }, + { + "epoch": 2.883666274970623, + "grad_norm": 5.823308944702148, + "learning_rate": 6.440310281356824e-06, + "loss": 0.6701, + "step": 5520 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 9.190802574157715, + "learning_rate": 6.4337365237970025e-06, + "loss": 0.8433, + "step": 5530 + }, + { + "epoch": 2.894111502807155, + "grad_norm": 7.263264179229736, + "learning_rate": 6.427162766237182e-06, + "loss": 0.6523, + "step": 5540 + }, + { + "epoch": 2.899334116725421, + "grad_norm": 9.167975425720215, + "learning_rate": 6.42058900867736e-06, + "loss": 0.6965, + "step": 5550 + }, + { + "epoch": 2.9045567306436872, + "grad_norm": 7.3201375007629395, + "learning_rate": 6.414015251117539e-06, + "loss": 0.7339, + "step": 5560 + }, + { + "epoch": 2.9097793445619535, + "grad_norm": 8.229689598083496, + "learning_rate": 6.407441493557718e-06, + "loss": 0.7504, + "step": 5570 + }, + { + "epoch": 2.9150019584802194, + "grad_norm": 8.792162895202637, + "learning_rate": 6.400867735997897e-06, + "loss": 0.7689, + "step": 5580 + }, + { + "epoch": 2.920224572398485, + "grad_norm": 9.17432689666748, + "learning_rate": 6.394293978438075e-06, + "loss": 0.7747, + "step": 5590 + }, + { + "epoch": 2.9254471863167515, + "grad_norm": 6.218533039093018, + "learning_rate": 6.387720220878254e-06, + "loss": 0.8414, + "step": 5600 + }, + { + "epoch": 2.9254471863167515, + "eval_loss": 1.2055819034576416, + "eval_runtime": 46.3894, + "eval_samples_per_second": 36.689, + "eval_steps_per_second": 4.592, + "step": 5600 + } + ], + "logging_steps": 10, + "max_steps": 15312, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.063042284067226e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}