{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2195, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022779043280182234, "grad_norm": 1.461890268182302, "learning_rate": 5e-06, "loss": 0.7557, "step": 10 }, { "epoch": 0.04555808656036447, "grad_norm": 0.8438010452400335, "learning_rate": 5e-06, "loss": 0.6991, "step": 20 }, { "epoch": 0.0683371298405467, "grad_norm": 0.9176367932491145, "learning_rate": 5e-06, "loss": 0.6807, "step": 30 }, { "epoch": 0.09111617312072894, "grad_norm": 0.8591490414558551, "learning_rate": 5e-06, "loss": 0.6795, "step": 40 }, { "epoch": 0.11389521640091116, "grad_norm": 0.8079754565291697, "learning_rate": 5e-06, "loss": 0.6751, "step": 50 }, { "epoch": 0.1366742596810934, "grad_norm": 0.676950753600941, "learning_rate": 5e-06, "loss": 0.6551, "step": 60 }, { "epoch": 0.15945330296127563, "grad_norm": 0.4397948544262224, "learning_rate": 5e-06, "loss": 0.6519, "step": 70 }, { "epoch": 0.18223234624145787, "grad_norm": 0.36816486280988786, "learning_rate": 5e-06, "loss": 0.6552, "step": 80 }, { "epoch": 0.20501138952164008, "grad_norm": 0.31467376370284666, "learning_rate": 5e-06, "loss": 0.6394, "step": 90 }, { "epoch": 0.22779043280182232, "grad_norm": 0.2990731503632509, "learning_rate": 5e-06, "loss": 0.6413, "step": 100 }, { "epoch": 0.2505694760820046, "grad_norm": 0.34146204165056326, "learning_rate": 5e-06, "loss": 0.6325, "step": 110 }, { "epoch": 0.2733485193621868, "grad_norm": 0.3118885479261177, "learning_rate": 5e-06, "loss": 0.6411, "step": 120 }, { "epoch": 0.296127562642369, "grad_norm": 0.29309780131472263, "learning_rate": 5e-06, "loss": 0.6422, "step": 130 }, { "epoch": 0.31890660592255127, "grad_norm": 0.3076465447126475, "learning_rate": 5e-06, "loss": 0.6443, "step": 140 }, { "epoch": 0.3416856492027335, "grad_norm": 0.3146670257810527, "learning_rate": 5e-06, "loss": 0.6369, "step": 150 }, { "epoch": 0.36446469248291574, "grad_norm": 0.3089804621013958, "learning_rate": 5e-06, "loss": 0.6424, "step": 160 }, { "epoch": 0.38724373576309795, "grad_norm": 0.34694834193027213, "learning_rate": 5e-06, "loss": 0.6335, "step": 170 }, { "epoch": 0.41002277904328016, "grad_norm": 0.3056788305992376, "learning_rate": 5e-06, "loss": 0.6399, "step": 180 }, { "epoch": 0.4328018223234624, "grad_norm": 0.30612032878616063, "learning_rate": 5e-06, "loss": 0.6313, "step": 190 }, { "epoch": 0.45558086560364464, "grad_norm": 0.3055698368680009, "learning_rate": 5e-06, "loss": 0.6408, "step": 200 }, { "epoch": 0.4783599088838269, "grad_norm": 0.3093335714854928, "learning_rate": 5e-06, "loss": 0.6357, "step": 210 }, { "epoch": 0.5011389521640092, "grad_norm": 0.2994802389469234, "learning_rate": 5e-06, "loss": 0.6396, "step": 220 }, { "epoch": 0.5239179954441914, "grad_norm": 0.3262302042973422, "learning_rate": 5e-06, "loss": 0.6351, "step": 230 }, { "epoch": 0.5466970387243736, "grad_norm": 0.3003963990271382, "learning_rate": 5e-06, "loss": 0.6317, "step": 240 }, { "epoch": 0.5694760820045558, "grad_norm": 0.3387577414932059, "learning_rate": 5e-06, "loss": 0.6354, "step": 250 }, { "epoch": 0.592255125284738, "grad_norm": 0.30285545820421356, "learning_rate": 5e-06, "loss": 0.6335, "step": 260 }, { "epoch": 0.6150341685649203, "grad_norm": 0.3155840468520353, "learning_rate": 5e-06, "loss": 0.6313, "step": 270 }, { "epoch": 0.6378132118451025, "grad_norm": 0.31986432306959706, "learning_rate": 5e-06, "loss": 0.6429, "step": 280 }, { "epoch": 0.6605922551252847, "grad_norm": 0.3215717385758522, "learning_rate": 5e-06, "loss": 0.6306, "step": 290 }, { "epoch": 0.683371298405467, "grad_norm": 0.2843677230367682, "learning_rate": 5e-06, "loss": 0.6288, "step": 300 }, { "epoch": 0.7061503416856492, "grad_norm": 0.29860979950924554, "learning_rate": 5e-06, "loss": 0.6286, "step": 310 }, { "epoch": 0.7289293849658315, "grad_norm": 0.3421102512943671, "learning_rate": 5e-06, "loss": 0.6304, "step": 320 }, { "epoch": 0.7517084282460137, "grad_norm": 0.30230237595313497, "learning_rate": 5e-06, "loss": 0.6259, "step": 330 }, { "epoch": 0.7744874715261959, "grad_norm": 0.3242375947167446, "learning_rate": 5e-06, "loss": 0.6316, "step": 340 }, { "epoch": 0.7972665148063781, "grad_norm": 0.33505088740414596, "learning_rate": 5e-06, "loss": 0.6396, "step": 350 }, { "epoch": 0.8200455580865603, "grad_norm": 0.3181545099474234, "learning_rate": 5e-06, "loss": 0.6313, "step": 360 }, { "epoch": 0.8428246013667426, "grad_norm": 0.33228646342910256, "learning_rate": 5e-06, "loss": 0.6246, "step": 370 }, { "epoch": 0.8656036446469249, "grad_norm": 0.31771307346794053, "learning_rate": 5e-06, "loss": 0.6364, "step": 380 }, { "epoch": 0.8883826879271071, "grad_norm": 0.33335301062788486, "learning_rate": 5e-06, "loss": 0.6356, "step": 390 }, { "epoch": 0.9111617312072893, "grad_norm": 0.3008063121883294, "learning_rate": 5e-06, "loss": 0.6389, "step": 400 }, { "epoch": 0.9339407744874715, "grad_norm": 0.2849573758523548, "learning_rate": 5e-06, "loss": 0.6344, "step": 410 }, { "epoch": 0.9567198177676538, "grad_norm": 0.30446671994288743, "learning_rate": 5e-06, "loss": 0.6218, "step": 420 }, { "epoch": 0.979498861047836, "grad_norm": 0.34683096229419913, "learning_rate": 5e-06, "loss": 0.6342, "step": 430 }, { "epoch": 1.0, "eval_loss": 0.6256291270256042, "eval_runtime": 113.4259, "eval_samples_per_second": 104.235, "eval_steps_per_second": 0.414, "step": 439 }, { "epoch": 1.0022779043280183, "grad_norm": 0.3602620020640227, "learning_rate": 5e-06, "loss": 0.6154, "step": 440 }, { "epoch": 1.0250569476082005, "grad_norm": 0.34395688755773274, "learning_rate": 5e-06, "loss": 0.6008, "step": 450 }, { "epoch": 1.0478359908883828, "grad_norm": 0.28793755227416584, "learning_rate": 5e-06, "loss": 0.5988, "step": 460 }, { "epoch": 1.070615034168565, "grad_norm": 0.3130290296959807, "learning_rate": 5e-06, "loss": 0.5963, "step": 470 }, { "epoch": 1.0933940774487472, "grad_norm": 0.3499557098228171, "learning_rate": 5e-06, "loss": 0.6032, "step": 480 }, { "epoch": 1.1161731207289294, "grad_norm": 0.3167538370762415, "learning_rate": 5e-06, "loss": 0.6081, "step": 490 }, { "epoch": 1.1389521640091116, "grad_norm": 0.3349317939917438, "learning_rate": 5e-06, "loss": 0.5983, "step": 500 }, { "epoch": 1.1617312072892938, "grad_norm": 0.3292658287931989, "learning_rate": 5e-06, "loss": 0.5955, "step": 510 }, { "epoch": 1.184510250569476, "grad_norm": 0.2967678480397173, "learning_rate": 5e-06, "loss": 0.6023, "step": 520 }, { "epoch": 1.2072892938496582, "grad_norm": 0.30361211541373534, "learning_rate": 5e-06, "loss": 0.5979, "step": 530 }, { "epoch": 1.2300683371298406, "grad_norm": 0.3144938982592506, "learning_rate": 5e-06, "loss": 0.6031, "step": 540 }, { "epoch": 1.2528473804100229, "grad_norm": 0.3224243344188798, "learning_rate": 5e-06, "loss": 0.6057, "step": 550 }, { "epoch": 1.275626423690205, "grad_norm": 0.2909142280815913, "learning_rate": 5e-06, "loss": 0.6016, "step": 560 }, { "epoch": 1.2984054669703873, "grad_norm": 0.27281571584371533, "learning_rate": 5e-06, "loss": 0.5977, "step": 570 }, { "epoch": 1.3211845102505695, "grad_norm": 0.29393369913014705, "learning_rate": 5e-06, "loss": 0.599, "step": 580 }, { "epoch": 1.3439635535307517, "grad_norm": 0.2806426163732274, "learning_rate": 5e-06, "loss": 0.5936, "step": 590 }, { "epoch": 1.366742596810934, "grad_norm": 0.32181121642041943, "learning_rate": 5e-06, "loss": 0.6035, "step": 600 }, { "epoch": 1.3895216400911161, "grad_norm": 0.33057935312052084, "learning_rate": 5e-06, "loss": 0.5998, "step": 610 }, { "epoch": 1.4123006833712983, "grad_norm": 0.29776416393925226, "learning_rate": 5e-06, "loss": 0.6048, "step": 620 }, { "epoch": 1.4350797266514808, "grad_norm": 0.3187235177646908, "learning_rate": 5e-06, "loss": 0.604, "step": 630 }, { "epoch": 1.4578587699316627, "grad_norm": 0.30109896478239506, "learning_rate": 5e-06, "loss": 0.5943, "step": 640 }, { "epoch": 1.4806378132118452, "grad_norm": 0.3058536488410727, "learning_rate": 5e-06, "loss": 0.5969, "step": 650 }, { "epoch": 1.5034168564920274, "grad_norm": 0.2961055030713261, "learning_rate": 5e-06, "loss": 0.5934, "step": 660 }, { "epoch": 1.5261958997722096, "grad_norm": 0.32268254074465025, "learning_rate": 5e-06, "loss": 0.6031, "step": 670 }, { "epoch": 1.5489749430523918, "grad_norm": 0.2884659126878991, "learning_rate": 5e-06, "loss": 0.5933, "step": 680 }, { "epoch": 1.571753986332574, "grad_norm": 0.297377445772241, "learning_rate": 5e-06, "loss": 0.5927, "step": 690 }, { "epoch": 1.5945330296127562, "grad_norm": 0.2927878668275797, "learning_rate": 5e-06, "loss": 0.6108, "step": 700 }, { "epoch": 1.6173120728929384, "grad_norm": 0.29960294143917904, "learning_rate": 5e-06, "loss": 0.5971, "step": 710 }, { "epoch": 1.6400911161731209, "grad_norm": 0.2823864099624513, "learning_rate": 5e-06, "loss": 0.599, "step": 720 }, { "epoch": 1.6628701594533029, "grad_norm": 0.3298411305027824, "learning_rate": 5e-06, "loss": 0.5988, "step": 730 }, { "epoch": 1.6856492027334853, "grad_norm": 0.32096887145187725, "learning_rate": 5e-06, "loss": 0.5942, "step": 740 }, { "epoch": 1.7084282460136673, "grad_norm": 0.326904120172968, "learning_rate": 5e-06, "loss": 0.5976, "step": 750 }, { "epoch": 1.7312072892938497, "grad_norm": 0.3004047360631492, "learning_rate": 5e-06, "loss": 0.5948, "step": 760 }, { "epoch": 1.753986332574032, "grad_norm": 0.2935646513952348, "learning_rate": 5e-06, "loss": 0.5949, "step": 770 }, { "epoch": 1.7767653758542141, "grad_norm": 0.3125619213493301, "learning_rate": 5e-06, "loss": 0.6001, "step": 780 }, { "epoch": 1.7995444191343963, "grad_norm": 0.3169307013507772, "learning_rate": 5e-06, "loss": 0.5981, "step": 790 }, { "epoch": 1.8223234624145785, "grad_norm": 0.29918234015264383, "learning_rate": 5e-06, "loss": 0.5894, "step": 800 }, { "epoch": 1.845102505694761, "grad_norm": 0.3132325711095288, "learning_rate": 5e-06, "loss": 0.59, "step": 810 }, { "epoch": 1.867881548974943, "grad_norm": 0.32890776445477105, "learning_rate": 5e-06, "loss": 0.5906, "step": 820 }, { "epoch": 1.8906605922551254, "grad_norm": 0.3219739904566219, "learning_rate": 5e-06, "loss": 0.597, "step": 830 }, { "epoch": 1.9134396355353074, "grad_norm": 0.30897624435850635, "learning_rate": 5e-06, "loss": 0.5986, "step": 840 }, { "epoch": 1.9362186788154898, "grad_norm": 0.2987799249080493, "learning_rate": 5e-06, "loss": 0.5973, "step": 850 }, { "epoch": 1.958997722095672, "grad_norm": 0.28864632125618506, "learning_rate": 5e-06, "loss": 0.5976, "step": 860 }, { "epoch": 1.9817767653758542, "grad_norm": 0.30847409916462015, "learning_rate": 5e-06, "loss": 0.5978, "step": 870 }, { "epoch": 2.0, "eval_loss": 0.6193838715553284, "eval_runtime": 113.1154, "eval_samples_per_second": 104.522, "eval_steps_per_second": 0.416, "step": 878 }, { "epoch": 2.0045558086560367, "grad_norm": 0.3137021968628789, "learning_rate": 5e-06, "loss": 0.5872, "step": 880 }, { "epoch": 2.0273348519362187, "grad_norm": 0.30071543886182633, "learning_rate": 5e-06, "loss": 0.556, "step": 890 }, { "epoch": 2.050113895216401, "grad_norm": 0.3181241053043949, "learning_rate": 5e-06, "loss": 0.5683, "step": 900 }, { "epoch": 2.072892938496583, "grad_norm": 0.3215476859380607, "learning_rate": 5e-06, "loss": 0.5671, "step": 910 }, { "epoch": 2.0956719817767655, "grad_norm": 0.29933719760378436, "learning_rate": 5e-06, "loss": 0.5729, "step": 920 }, { "epoch": 2.1184510250569475, "grad_norm": 0.2937608437190206, "learning_rate": 5e-06, "loss": 0.5655, "step": 930 }, { "epoch": 2.14123006833713, "grad_norm": 0.310137570277187, "learning_rate": 5e-06, "loss": 0.5684, "step": 940 }, { "epoch": 2.164009111617312, "grad_norm": 0.2880285833232218, "learning_rate": 5e-06, "loss": 0.5635, "step": 950 }, { "epoch": 2.1867881548974943, "grad_norm": 0.29687571567703486, "learning_rate": 5e-06, "loss": 0.5684, "step": 960 }, { "epoch": 2.2095671981776768, "grad_norm": 0.3083759467790449, "learning_rate": 5e-06, "loss": 0.5694, "step": 970 }, { "epoch": 2.2323462414578588, "grad_norm": 0.2960376868080064, "learning_rate": 5e-06, "loss": 0.5618, "step": 980 }, { "epoch": 2.255125284738041, "grad_norm": 0.2729683850426089, "learning_rate": 5e-06, "loss": 0.5659, "step": 990 }, { "epoch": 2.277904328018223, "grad_norm": 0.3676983350737448, "learning_rate": 5e-06, "loss": 0.5677, "step": 1000 }, { "epoch": 2.3006833712984056, "grad_norm": 0.29979543582966117, "learning_rate": 5e-06, "loss": 0.5657, "step": 1010 }, { "epoch": 2.3234624145785876, "grad_norm": 0.3150574450739465, "learning_rate": 5e-06, "loss": 0.5695, "step": 1020 }, { "epoch": 2.34624145785877, "grad_norm": 0.29866514403413835, "learning_rate": 5e-06, "loss": 0.57, "step": 1030 }, { "epoch": 2.369020501138952, "grad_norm": 0.30022038641134147, "learning_rate": 5e-06, "loss": 0.5678, "step": 1040 }, { "epoch": 2.3917995444191344, "grad_norm": 0.30344605993782325, "learning_rate": 5e-06, "loss": 0.5655, "step": 1050 }, { "epoch": 2.4145785876993164, "grad_norm": 0.30571626801721524, "learning_rate": 5e-06, "loss": 0.5764, "step": 1060 }, { "epoch": 2.437357630979499, "grad_norm": 0.32949832175555666, "learning_rate": 5e-06, "loss": 0.5791, "step": 1070 }, { "epoch": 2.4601366742596813, "grad_norm": 0.3149321897462233, "learning_rate": 5e-06, "loss": 0.5743, "step": 1080 }, { "epoch": 2.4829157175398633, "grad_norm": 0.30371196782165644, "learning_rate": 5e-06, "loss": 0.5726, "step": 1090 }, { "epoch": 2.5056947608200457, "grad_norm": 0.3550601237225024, "learning_rate": 5e-06, "loss": 0.5779, "step": 1100 }, { "epoch": 2.5284738041002277, "grad_norm": 0.32449640072708563, "learning_rate": 5e-06, "loss": 0.5696, "step": 1110 }, { "epoch": 2.55125284738041, "grad_norm": 0.29536363274710853, "learning_rate": 5e-06, "loss": 0.5708, "step": 1120 }, { "epoch": 2.574031890660592, "grad_norm": 0.32326444150399974, "learning_rate": 5e-06, "loss": 0.5687, "step": 1130 }, { "epoch": 2.5968109339407746, "grad_norm": 0.29854931304839455, "learning_rate": 5e-06, "loss": 0.57, "step": 1140 }, { "epoch": 2.619589977220957, "grad_norm": 0.2774546049422405, "learning_rate": 5e-06, "loss": 0.5687, "step": 1150 }, { "epoch": 2.642369020501139, "grad_norm": 0.320847736330343, "learning_rate": 5e-06, "loss": 0.5683, "step": 1160 }, { "epoch": 2.665148063781321, "grad_norm": 0.3038825483920144, "learning_rate": 5e-06, "loss": 0.5756, "step": 1170 }, { "epoch": 2.6879271070615034, "grad_norm": 0.31626522926626655, "learning_rate": 5e-06, "loss": 0.5688, "step": 1180 }, { "epoch": 2.710706150341686, "grad_norm": 0.31475420899136236, "learning_rate": 5e-06, "loss": 0.5668, "step": 1190 }, { "epoch": 2.733485193621868, "grad_norm": 0.3032040716975987, "learning_rate": 5e-06, "loss": 0.5735, "step": 1200 }, { "epoch": 2.7562642369020502, "grad_norm": 0.294489594430855, "learning_rate": 5e-06, "loss": 0.5746, "step": 1210 }, { "epoch": 2.7790432801822322, "grad_norm": 0.28352737426646324, "learning_rate": 5e-06, "loss": 0.5645, "step": 1220 }, { "epoch": 2.8018223234624147, "grad_norm": 0.3019187072329014, "learning_rate": 5e-06, "loss": 0.5662, "step": 1230 }, { "epoch": 2.8246013667425967, "grad_norm": 0.29450570760369266, "learning_rate": 5e-06, "loss": 0.5776, "step": 1240 }, { "epoch": 2.847380410022779, "grad_norm": 0.28917558370883034, "learning_rate": 5e-06, "loss": 0.5766, "step": 1250 }, { "epoch": 2.8701594533029615, "grad_norm": 0.28348244461550015, "learning_rate": 5e-06, "loss": 0.5712, "step": 1260 }, { "epoch": 2.8929384965831435, "grad_norm": 0.29607489274032067, "learning_rate": 5e-06, "loss": 0.5739, "step": 1270 }, { "epoch": 2.9157175398633255, "grad_norm": 0.3154865371104941, "learning_rate": 5e-06, "loss": 0.5664, "step": 1280 }, { "epoch": 2.938496583143508, "grad_norm": 0.3261178018397659, "learning_rate": 5e-06, "loss": 0.5603, "step": 1290 }, { "epoch": 2.9612756264236904, "grad_norm": 0.2965242333469874, "learning_rate": 5e-06, "loss": 0.5668, "step": 1300 }, { "epoch": 2.9840546697038723, "grad_norm": 0.2979844145574236, "learning_rate": 5e-06, "loss": 0.5732, "step": 1310 }, { "epoch": 3.0, "eval_loss": 0.6193576455116272, "eval_runtime": 113.4465, "eval_samples_per_second": 104.217, "eval_steps_per_second": 0.414, "step": 1317 }, { "epoch": 3.0068337129840548, "grad_norm": 0.2956883479073517, "learning_rate": 5e-06, "loss": 0.5578, "step": 1320 }, { "epoch": 3.0296127562642368, "grad_norm": 0.3283040695026618, "learning_rate": 5e-06, "loss": 0.5379, "step": 1330 }, { "epoch": 3.052391799544419, "grad_norm": 0.28542889086490736, "learning_rate": 5e-06, "loss": 0.5411, "step": 1340 }, { "epoch": 3.075170842824601, "grad_norm": 0.31853253784878555, "learning_rate": 5e-06, "loss": 0.5424, "step": 1350 }, { "epoch": 3.0979498861047836, "grad_norm": 0.2987374379369755, "learning_rate": 5e-06, "loss": 0.5382, "step": 1360 }, { "epoch": 3.120728929384966, "grad_norm": 0.28369399180470734, "learning_rate": 5e-06, "loss": 0.5396, "step": 1370 }, { "epoch": 3.143507972665148, "grad_norm": 0.29848307489101883, "learning_rate": 5e-06, "loss": 0.5379, "step": 1380 }, { "epoch": 3.1662870159453305, "grad_norm": 0.28281172058487525, "learning_rate": 5e-06, "loss": 0.5378, "step": 1390 }, { "epoch": 3.1890660592255125, "grad_norm": 0.326068339866218, "learning_rate": 5e-06, "loss": 0.5355, "step": 1400 }, { "epoch": 3.211845102505695, "grad_norm": 0.286394634763708, "learning_rate": 5e-06, "loss": 0.5392, "step": 1410 }, { "epoch": 3.234624145785877, "grad_norm": 0.29481950476140845, "learning_rate": 5e-06, "loss": 0.5453, "step": 1420 }, { "epoch": 3.2574031890660593, "grad_norm": 0.30132819765081525, "learning_rate": 5e-06, "loss": 0.5462, "step": 1430 }, { "epoch": 3.2801822323462413, "grad_norm": 0.2995713700746164, "learning_rate": 5e-06, "loss": 0.5389, "step": 1440 }, { "epoch": 3.3029612756264237, "grad_norm": 0.30326731163716586, "learning_rate": 5e-06, "loss": 0.536, "step": 1450 }, { "epoch": 3.3257403189066057, "grad_norm": 0.3083162582031878, "learning_rate": 5e-06, "loss": 0.5386, "step": 1460 }, { "epoch": 3.348519362186788, "grad_norm": 0.29491218271079633, "learning_rate": 5e-06, "loss": 0.5399, "step": 1470 }, { "epoch": 3.3712984054669706, "grad_norm": 0.32397760989963276, "learning_rate": 5e-06, "loss": 0.5424, "step": 1480 }, { "epoch": 3.3940774487471526, "grad_norm": 0.3109701142211013, "learning_rate": 5e-06, "loss": 0.541, "step": 1490 }, { "epoch": 3.416856492027335, "grad_norm": 0.28649653583977264, "learning_rate": 5e-06, "loss": 0.5383, "step": 1500 }, { "epoch": 3.439635535307517, "grad_norm": 0.29162775892122766, "learning_rate": 5e-06, "loss": 0.543, "step": 1510 }, { "epoch": 3.4624145785876994, "grad_norm": 0.328813316428343, "learning_rate": 5e-06, "loss": 0.5402, "step": 1520 }, { "epoch": 3.4851936218678814, "grad_norm": 0.296329700264349, "learning_rate": 5e-06, "loss": 0.548, "step": 1530 }, { "epoch": 3.507972665148064, "grad_norm": 0.32679718874242103, "learning_rate": 5e-06, "loss": 0.5481, "step": 1540 }, { "epoch": 3.5307517084282463, "grad_norm": 0.2978227357429331, "learning_rate": 5e-06, "loss": 0.5436, "step": 1550 }, { "epoch": 3.5535307517084282, "grad_norm": 0.30550394149901167, "learning_rate": 5e-06, "loss": 0.5429, "step": 1560 }, { "epoch": 3.5763097949886102, "grad_norm": 0.34688194348484774, "learning_rate": 5e-06, "loss": 0.5458, "step": 1570 }, { "epoch": 3.5990888382687927, "grad_norm": 0.2994450574269458, "learning_rate": 5e-06, "loss": 0.546, "step": 1580 }, { "epoch": 3.621867881548975, "grad_norm": 0.31537321423200465, "learning_rate": 5e-06, "loss": 0.5422, "step": 1590 }, { "epoch": 3.644646924829157, "grad_norm": 0.3063603045559125, "learning_rate": 5e-06, "loss": 0.5359, "step": 1600 }, { "epoch": 3.6674259681093395, "grad_norm": 0.2916735901907535, "learning_rate": 5e-06, "loss": 0.5422, "step": 1610 }, { "epoch": 3.6902050113895215, "grad_norm": 0.30734650480956516, "learning_rate": 5e-06, "loss": 0.5427, "step": 1620 }, { "epoch": 3.712984054669704, "grad_norm": 0.3033502937728263, "learning_rate": 5e-06, "loss": 0.5419, "step": 1630 }, { "epoch": 3.735763097949886, "grad_norm": 0.30259245354328573, "learning_rate": 5e-06, "loss": 0.5384, "step": 1640 }, { "epoch": 3.7585421412300684, "grad_norm": 0.31130340672642876, "learning_rate": 5e-06, "loss": 0.552, "step": 1650 }, { "epoch": 3.781321184510251, "grad_norm": 0.30596028970571365, "learning_rate": 5e-06, "loss": 0.5416, "step": 1660 }, { "epoch": 3.8041002277904328, "grad_norm": 0.29635873101912724, "learning_rate": 5e-06, "loss": 0.5497, "step": 1670 }, { "epoch": 3.8268792710706148, "grad_norm": 0.3197535185114282, "learning_rate": 5e-06, "loss": 0.5429, "step": 1680 }, { "epoch": 3.849658314350797, "grad_norm": 0.3011450510960511, "learning_rate": 5e-06, "loss": 0.5451, "step": 1690 }, { "epoch": 3.8724373576309796, "grad_norm": 0.29556191119994085, "learning_rate": 5e-06, "loss": 0.5422, "step": 1700 }, { "epoch": 3.8952164009111616, "grad_norm": 0.29782230107739593, "learning_rate": 5e-06, "loss": 0.5463, "step": 1710 }, { "epoch": 3.917995444191344, "grad_norm": 0.2786017141276205, "learning_rate": 5e-06, "loss": 0.5505, "step": 1720 }, { "epoch": 3.940774487471526, "grad_norm": 0.3011821733566416, "learning_rate": 5e-06, "loss": 0.5496, "step": 1730 }, { "epoch": 3.9635535307517085, "grad_norm": 0.2891980461006773, "learning_rate": 5e-06, "loss": 0.5511, "step": 1740 }, { "epoch": 3.9863325740318905, "grad_norm": 0.29323206246213684, "learning_rate": 5e-06, "loss": 0.5461, "step": 1750 }, { "epoch": 4.0, "eval_loss": 0.6239981055259705, "eval_runtime": 113.6401, "eval_samples_per_second": 104.039, "eval_steps_per_second": 0.414, "step": 1756 }, { "epoch": 4.009111617312073, "grad_norm": 0.31845782541537054, "learning_rate": 5e-06, "loss": 0.5355, "step": 1760 }, { "epoch": 4.031890660592255, "grad_norm": 0.3247571121989434, "learning_rate": 5e-06, "loss": 0.5112, "step": 1770 }, { "epoch": 4.054669703872437, "grad_norm": 0.33205526411756225, "learning_rate": 5e-06, "loss": 0.5155, "step": 1780 }, { "epoch": 4.077448747152619, "grad_norm": 0.2961306281740012, "learning_rate": 5e-06, "loss": 0.5107, "step": 1790 }, { "epoch": 4.100227790432802, "grad_norm": 0.29404582667456747, "learning_rate": 5e-06, "loss": 0.5168, "step": 1800 }, { "epoch": 4.123006833712984, "grad_norm": 0.29099721485432206, "learning_rate": 5e-06, "loss": 0.5126, "step": 1810 }, { "epoch": 4.145785876993166, "grad_norm": 0.2850302733912273, "learning_rate": 5e-06, "loss": 0.5134, "step": 1820 }, { "epoch": 4.168564920273348, "grad_norm": 0.28612051891525286, "learning_rate": 5e-06, "loss": 0.5093, "step": 1830 }, { "epoch": 4.191343963553531, "grad_norm": 0.29474345150964504, "learning_rate": 5e-06, "loss": 0.5174, "step": 1840 }, { "epoch": 4.214123006833713, "grad_norm": 0.3002301077876599, "learning_rate": 5e-06, "loss": 0.5172, "step": 1850 }, { "epoch": 4.236902050113895, "grad_norm": 0.2920498114457919, "learning_rate": 5e-06, "loss": 0.5121, "step": 1860 }, { "epoch": 4.259681093394078, "grad_norm": 0.2942506005889262, "learning_rate": 5e-06, "loss": 0.5101, "step": 1870 }, { "epoch": 4.28246013667426, "grad_norm": 0.3069703095242122, "learning_rate": 5e-06, "loss": 0.5144, "step": 1880 }, { "epoch": 4.305239179954442, "grad_norm": 0.2997675139173423, "learning_rate": 5e-06, "loss": 0.517, "step": 1890 }, { "epoch": 4.328018223234624, "grad_norm": 0.3017224318798671, "learning_rate": 5e-06, "loss": 0.5163, "step": 1900 }, { "epoch": 4.350797266514807, "grad_norm": 0.32710267583639663, "learning_rate": 5e-06, "loss": 0.517, "step": 1910 }, { "epoch": 4.373576309794989, "grad_norm": 0.35589318746923176, "learning_rate": 5e-06, "loss": 0.5148, "step": 1920 }, { "epoch": 4.396355353075171, "grad_norm": 0.29856323049173306, "learning_rate": 5e-06, "loss": 0.5167, "step": 1930 }, { "epoch": 4.4191343963553535, "grad_norm": 0.31561434261206783, "learning_rate": 5e-06, "loss": 0.51, "step": 1940 }, { "epoch": 4.4419134396355355, "grad_norm": 0.32343506635297187, "learning_rate": 5e-06, "loss": 0.5172, "step": 1950 }, { "epoch": 4.4646924829157175, "grad_norm": 0.2920601079152555, "learning_rate": 5e-06, "loss": 0.5134, "step": 1960 }, { "epoch": 4.4874715261958995, "grad_norm": 0.30047616053441856, "learning_rate": 5e-06, "loss": 0.5123, "step": 1970 }, { "epoch": 4.510250569476082, "grad_norm": 0.299324693496857, "learning_rate": 5e-06, "loss": 0.5177, "step": 1980 }, { "epoch": 4.533029612756264, "grad_norm": 0.30340189870468204, "learning_rate": 5e-06, "loss": 0.5095, "step": 1990 }, { "epoch": 4.555808656036446, "grad_norm": 0.32578537933466617, "learning_rate": 5e-06, "loss": 0.5126, "step": 2000 }, { "epoch": 4.578587699316628, "grad_norm": 0.3214168201654836, "learning_rate": 5e-06, "loss": 0.516, "step": 2010 }, { "epoch": 4.601366742596811, "grad_norm": 0.33535913909657655, "learning_rate": 5e-06, "loss": 0.516, "step": 2020 }, { "epoch": 4.624145785876993, "grad_norm": 0.34950298355241916, "learning_rate": 5e-06, "loss": 0.5159, "step": 2030 }, { "epoch": 4.646924829157175, "grad_norm": 0.3067631933457454, "learning_rate": 5e-06, "loss": 0.5103, "step": 2040 }, { "epoch": 4.669703872437358, "grad_norm": 0.31011573141446475, "learning_rate": 5e-06, "loss": 0.526, "step": 2050 }, { "epoch": 4.69248291571754, "grad_norm": 0.31485826733211975, "learning_rate": 5e-06, "loss": 0.5205, "step": 2060 }, { "epoch": 4.715261958997722, "grad_norm": 0.2981899092207014, "learning_rate": 5e-06, "loss": 0.5128, "step": 2070 }, { "epoch": 4.738041002277904, "grad_norm": 0.30070781244970324, "learning_rate": 5e-06, "loss": 0.5195, "step": 2080 }, { "epoch": 4.760820045558087, "grad_norm": 0.29089650375946846, "learning_rate": 5e-06, "loss": 0.5192, "step": 2090 }, { "epoch": 4.783599088838269, "grad_norm": 0.32942349791391345, "learning_rate": 5e-06, "loss": 0.5155, "step": 2100 }, { "epoch": 4.806378132118451, "grad_norm": 0.30690284165223813, "learning_rate": 5e-06, "loss": 0.5157, "step": 2110 }, { "epoch": 4.829157175398633, "grad_norm": 0.29755186369998715, "learning_rate": 5e-06, "loss": 0.515, "step": 2120 }, { "epoch": 4.851936218678816, "grad_norm": 0.29726224376764365, "learning_rate": 5e-06, "loss": 0.52, "step": 2130 }, { "epoch": 4.874715261958998, "grad_norm": 0.3121093770179605, "learning_rate": 5e-06, "loss": 0.5214, "step": 2140 }, { "epoch": 4.89749430523918, "grad_norm": 0.2921989069170297, "learning_rate": 5e-06, "loss": 0.5141, "step": 2150 }, { "epoch": 4.920273348519363, "grad_norm": 0.3137625481692737, "learning_rate": 5e-06, "loss": 0.5191, "step": 2160 }, { "epoch": 4.943052391799545, "grad_norm": 0.31268104737726976, "learning_rate": 5e-06, "loss": 0.5204, "step": 2170 }, { "epoch": 4.965831435079727, "grad_norm": 0.31240087794789345, "learning_rate": 5e-06, "loss": 0.5214, "step": 2180 }, { "epoch": 4.988610478359909, "grad_norm": 0.27920850439212735, "learning_rate": 5e-06, "loss": 0.5179, "step": 2190 }, { "epoch": 5.0, "eval_loss": 0.6346195340156555, "eval_runtime": 112.3467, "eval_samples_per_second": 105.237, "eval_steps_per_second": 0.418, "step": 2195 }, { "epoch": 5.0, "step": 2195, "total_flos": 4601905523195904.0, "train_loss": 0.5737499028512177, "train_runtime": 30529.5931, "train_samples_per_second": 36.787, "train_steps_per_second": 0.072 } ], "logging_steps": 10, "max_steps": 2195, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4601905523195904.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }