| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 2195, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.022779043280182234, | |
| "grad_norm": 1.461890268182302, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7557, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04555808656036447, | |
| "grad_norm": 0.8438010452400335, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6991, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0683371298405467, | |
| "grad_norm": 0.9176367932491145, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6807, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09111617312072894, | |
| "grad_norm": 0.8591490414558551, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6795, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.11389521640091116, | |
| "grad_norm": 0.8079754565291697, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6751, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1366742596810934, | |
| "grad_norm": 0.676950753600941, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6551, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.15945330296127563, | |
| "grad_norm": 0.4397948544262224, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6519, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.18223234624145787, | |
| "grad_norm": 0.36816486280988786, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6552, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.20501138952164008, | |
| "grad_norm": 0.31467376370284666, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6394, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.22779043280182232, | |
| "grad_norm": 0.2990731503632509, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6413, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2505694760820046, | |
| "grad_norm": 0.34146204165056326, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6325, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2733485193621868, | |
| "grad_norm": 0.3118885479261177, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6411, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.296127562642369, | |
| "grad_norm": 0.29309780131472263, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6422, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.31890660592255127, | |
| "grad_norm": 0.3076465447126475, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6443, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3416856492027335, | |
| "grad_norm": 0.3146670257810527, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6369, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.36446469248291574, | |
| "grad_norm": 0.3089804621013958, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6424, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.38724373576309795, | |
| "grad_norm": 0.34694834193027213, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6335, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.41002277904328016, | |
| "grad_norm": 0.3056788305992376, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6399, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4328018223234624, | |
| "grad_norm": 0.30612032878616063, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6313, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.45558086560364464, | |
| "grad_norm": 0.3055698368680009, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6408, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4783599088838269, | |
| "grad_norm": 0.3093335714854928, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6357, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5011389521640092, | |
| "grad_norm": 0.2994802389469234, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6396, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5239179954441914, | |
| "grad_norm": 0.3262302042973422, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6351, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5466970387243736, | |
| "grad_norm": 0.3003963990271382, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6317, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5694760820045558, | |
| "grad_norm": 0.3387577414932059, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6354, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.592255125284738, | |
| "grad_norm": 0.30285545820421356, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6335, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6150341685649203, | |
| "grad_norm": 0.3155840468520353, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6313, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6378132118451025, | |
| "grad_norm": 0.31986432306959706, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6429, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6605922551252847, | |
| "grad_norm": 0.3215717385758522, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6306, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.683371298405467, | |
| "grad_norm": 0.2843677230367682, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6288, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7061503416856492, | |
| "grad_norm": 0.29860979950924554, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6286, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7289293849658315, | |
| "grad_norm": 0.3421102512943671, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6304, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7517084282460137, | |
| "grad_norm": 0.30230237595313497, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6259, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7744874715261959, | |
| "grad_norm": 0.3242375947167446, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6316, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7972665148063781, | |
| "grad_norm": 0.33505088740414596, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6396, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8200455580865603, | |
| "grad_norm": 0.3181545099474234, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6313, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8428246013667426, | |
| "grad_norm": 0.33228646342910256, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6246, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8656036446469249, | |
| "grad_norm": 0.31771307346794053, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6364, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8883826879271071, | |
| "grad_norm": 0.33335301062788486, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6356, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9111617312072893, | |
| "grad_norm": 0.3008063121883294, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6389, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9339407744874715, | |
| "grad_norm": 0.2849573758523548, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6344, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9567198177676538, | |
| "grad_norm": 0.30446671994288743, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6218, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.979498861047836, | |
| "grad_norm": 0.34683096229419913, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6342, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.6256291270256042, | |
| "eval_runtime": 113.4259, | |
| "eval_samples_per_second": 104.235, | |
| "eval_steps_per_second": 0.414, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.0022779043280183, | |
| "grad_norm": 0.3602620020640227, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6154, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.0250569476082005, | |
| "grad_norm": 0.34395688755773274, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6008, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.0478359908883828, | |
| "grad_norm": 0.28793755227416584, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5988, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.070615034168565, | |
| "grad_norm": 0.3130290296959807, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5963, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.0933940774487472, | |
| "grad_norm": 0.3499557098228171, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6032, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.1161731207289294, | |
| "grad_norm": 0.3167538370762415, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6081, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.1389521640091116, | |
| "grad_norm": 0.3349317939917438, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5983, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.1617312072892938, | |
| "grad_norm": 0.3292658287931989, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5955, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.184510250569476, | |
| "grad_norm": 0.2967678480397173, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6023, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.2072892938496582, | |
| "grad_norm": 0.30361211541373534, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5979, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.2300683371298406, | |
| "grad_norm": 0.3144938982592506, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6031, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.2528473804100229, | |
| "grad_norm": 0.3224243344188798, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6057, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.275626423690205, | |
| "grad_norm": 0.2909142280815913, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6016, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.2984054669703873, | |
| "grad_norm": 0.27281571584371533, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5977, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.3211845102505695, | |
| "grad_norm": 0.29393369913014705, | |
| "learning_rate": 5e-06, | |
| "loss": 0.599, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.3439635535307517, | |
| "grad_norm": 0.2806426163732274, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5936, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.366742596810934, | |
| "grad_norm": 0.32181121642041943, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6035, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.3895216400911161, | |
| "grad_norm": 0.33057935312052084, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5998, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.4123006833712983, | |
| "grad_norm": 0.29776416393925226, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6048, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.4350797266514808, | |
| "grad_norm": 0.3187235177646908, | |
| "learning_rate": 5e-06, | |
| "loss": 0.604, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.4578587699316627, | |
| "grad_norm": 0.30109896478239506, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5943, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.4806378132118452, | |
| "grad_norm": 0.3058536488410727, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5969, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.5034168564920274, | |
| "grad_norm": 0.2961055030713261, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5934, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.5261958997722096, | |
| "grad_norm": 0.32268254074465025, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6031, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.5489749430523918, | |
| "grad_norm": 0.2884659126878991, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5933, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.571753986332574, | |
| "grad_norm": 0.297377445772241, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5927, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.5945330296127562, | |
| "grad_norm": 0.2927878668275797, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6108, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.6173120728929384, | |
| "grad_norm": 0.29960294143917904, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5971, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.6400911161731209, | |
| "grad_norm": 0.2823864099624513, | |
| "learning_rate": 5e-06, | |
| "loss": 0.599, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.6628701594533029, | |
| "grad_norm": 0.3298411305027824, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5988, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.6856492027334853, | |
| "grad_norm": 0.32096887145187725, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5942, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.7084282460136673, | |
| "grad_norm": 0.326904120172968, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5976, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.7312072892938497, | |
| "grad_norm": 0.3004047360631492, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5948, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.753986332574032, | |
| "grad_norm": 0.2935646513952348, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5949, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.7767653758542141, | |
| "grad_norm": 0.3125619213493301, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6001, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.7995444191343963, | |
| "grad_norm": 0.3169307013507772, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5981, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.8223234624145785, | |
| "grad_norm": 0.29918234015264383, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5894, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.845102505694761, | |
| "grad_norm": 0.3132325711095288, | |
| "learning_rate": 5e-06, | |
| "loss": 0.59, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.867881548974943, | |
| "grad_norm": 0.32890776445477105, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5906, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.8906605922551254, | |
| "grad_norm": 0.3219739904566219, | |
| "learning_rate": 5e-06, | |
| "loss": 0.597, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.9134396355353074, | |
| "grad_norm": 0.30897624435850635, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5986, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.9362186788154898, | |
| "grad_norm": 0.2987799249080493, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5973, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.958997722095672, | |
| "grad_norm": 0.28864632125618506, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5976, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.9817767653758542, | |
| "grad_norm": 0.30847409916462015, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5978, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.6193838715553284, | |
| "eval_runtime": 113.1154, | |
| "eval_samples_per_second": 104.522, | |
| "eval_steps_per_second": 0.416, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 2.0045558086560367, | |
| "grad_norm": 0.3137021968628789, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5872, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.0273348519362187, | |
| "grad_norm": 0.30071543886182633, | |
| "learning_rate": 5e-06, | |
| "loss": 0.556, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.050113895216401, | |
| "grad_norm": 0.3181241053043949, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5683, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.072892938496583, | |
| "grad_norm": 0.3215476859380607, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5671, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.0956719817767655, | |
| "grad_norm": 0.29933719760378436, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5729, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.1184510250569475, | |
| "grad_norm": 0.2937608437190206, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5655, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.14123006833713, | |
| "grad_norm": 0.310137570277187, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5684, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.164009111617312, | |
| "grad_norm": 0.2880285833232218, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5635, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.1867881548974943, | |
| "grad_norm": 0.29687571567703486, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5684, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.2095671981776768, | |
| "grad_norm": 0.3083759467790449, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5694, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.2323462414578588, | |
| "grad_norm": 0.2960376868080064, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5618, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.255125284738041, | |
| "grad_norm": 0.2729683850426089, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5659, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.277904328018223, | |
| "grad_norm": 0.3676983350737448, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5677, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.3006833712984056, | |
| "grad_norm": 0.29979543582966117, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5657, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.3234624145785876, | |
| "grad_norm": 0.3150574450739465, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5695, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.34624145785877, | |
| "grad_norm": 0.29866514403413835, | |
| "learning_rate": 5e-06, | |
| "loss": 0.57, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.369020501138952, | |
| "grad_norm": 0.30022038641134147, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5678, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.3917995444191344, | |
| "grad_norm": 0.30344605993782325, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5655, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.4145785876993164, | |
| "grad_norm": 0.30571626801721524, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5764, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.437357630979499, | |
| "grad_norm": 0.32949832175555666, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5791, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.4601366742596813, | |
| "grad_norm": 0.3149321897462233, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5743, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.4829157175398633, | |
| "grad_norm": 0.30371196782165644, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5726, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.5056947608200457, | |
| "grad_norm": 0.3550601237225024, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5779, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.5284738041002277, | |
| "grad_norm": 0.32449640072708563, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5696, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.55125284738041, | |
| "grad_norm": 0.29536363274710853, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5708, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.574031890660592, | |
| "grad_norm": 0.32326444150399974, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5687, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.5968109339407746, | |
| "grad_norm": 0.29854931304839455, | |
| "learning_rate": 5e-06, | |
| "loss": 0.57, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.619589977220957, | |
| "grad_norm": 0.2774546049422405, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5687, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.642369020501139, | |
| "grad_norm": 0.320847736330343, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5683, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.665148063781321, | |
| "grad_norm": 0.3038825483920144, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5756, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.6879271070615034, | |
| "grad_norm": 0.31626522926626655, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5688, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.710706150341686, | |
| "grad_norm": 0.31475420899136236, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5668, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.733485193621868, | |
| "grad_norm": 0.3032040716975987, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5735, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.7562642369020502, | |
| "grad_norm": 0.294489594430855, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5746, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.7790432801822322, | |
| "grad_norm": 0.28352737426646324, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5645, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.8018223234624147, | |
| "grad_norm": 0.3019187072329014, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5662, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.8246013667425967, | |
| "grad_norm": 0.29450570760369266, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5776, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.847380410022779, | |
| "grad_norm": 0.28917558370883034, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5766, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.8701594533029615, | |
| "grad_norm": 0.28348244461550015, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5712, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.8929384965831435, | |
| "grad_norm": 0.29607489274032067, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5739, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.9157175398633255, | |
| "grad_norm": 0.3154865371104941, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5664, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.938496583143508, | |
| "grad_norm": 0.3261178018397659, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5603, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.9612756264236904, | |
| "grad_norm": 0.2965242333469874, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5668, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.9840546697038723, | |
| "grad_norm": 0.2979844145574236, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5732, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.6193576455116272, | |
| "eval_runtime": 113.4465, | |
| "eval_samples_per_second": 104.217, | |
| "eval_steps_per_second": 0.414, | |
| "step": 1317 | |
| }, | |
| { | |
| "epoch": 3.0068337129840548, | |
| "grad_norm": 0.2956883479073517, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5578, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 3.0296127562642368, | |
| "grad_norm": 0.3283040695026618, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5379, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 3.052391799544419, | |
| "grad_norm": 0.28542889086490736, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5411, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 3.075170842824601, | |
| "grad_norm": 0.31853253784878555, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5424, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.0979498861047836, | |
| "grad_norm": 0.2987374379369755, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5382, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 3.120728929384966, | |
| "grad_norm": 0.28369399180470734, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5396, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 3.143507972665148, | |
| "grad_norm": 0.29848307489101883, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5379, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 3.1662870159453305, | |
| "grad_norm": 0.28281172058487525, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5378, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 3.1890660592255125, | |
| "grad_norm": 0.326068339866218, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5355, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.211845102505695, | |
| "grad_norm": 0.286394634763708, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5392, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 3.234624145785877, | |
| "grad_norm": 0.29481950476140845, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5453, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 3.2574031890660593, | |
| "grad_norm": 0.30132819765081525, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5462, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 3.2801822323462413, | |
| "grad_norm": 0.2995713700746164, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5389, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 3.3029612756264237, | |
| "grad_norm": 0.30326731163716586, | |
| "learning_rate": 5e-06, | |
| "loss": 0.536, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.3257403189066057, | |
| "grad_norm": 0.3083162582031878, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5386, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 3.348519362186788, | |
| "grad_norm": 0.29491218271079633, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5399, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 3.3712984054669706, | |
| "grad_norm": 0.32397760989963276, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5424, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 3.3940774487471526, | |
| "grad_norm": 0.3109701142211013, | |
| "learning_rate": 5e-06, | |
| "loss": 0.541, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 3.416856492027335, | |
| "grad_norm": 0.28649653583977264, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5383, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.439635535307517, | |
| "grad_norm": 0.29162775892122766, | |
| "learning_rate": 5e-06, | |
| "loss": 0.543, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 3.4624145785876994, | |
| "grad_norm": 0.328813316428343, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5402, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 3.4851936218678814, | |
| "grad_norm": 0.296329700264349, | |
| "learning_rate": 5e-06, | |
| "loss": 0.548, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.507972665148064, | |
| "grad_norm": 0.32679718874242103, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5481, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.5307517084282463, | |
| "grad_norm": 0.2978227357429331, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5436, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.5535307517084282, | |
| "grad_norm": 0.30550394149901167, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5429, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.5763097949886102, | |
| "grad_norm": 0.34688194348484774, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5458, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.5990888382687927, | |
| "grad_norm": 0.2994450574269458, | |
| "learning_rate": 5e-06, | |
| "loss": 0.546, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 3.621867881548975, | |
| "grad_norm": 0.31537321423200465, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5422, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 3.644646924829157, | |
| "grad_norm": 0.3063603045559125, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5359, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.6674259681093395, | |
| "grad_norm": 0.2916735901907535, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5422, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 3.6902050113895215, | |
| "grad_norm": 0.30734650480956516, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5427, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 3.712984054669704, | |
| "grad_norm": 0.3033502937728263, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5419, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 3.735763097949886, | |
| "grad_norm": 0.30259245354328573, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5384, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 3.7585421412300684, | |
| "grad_norm": 0.31130340672642876, | |
| "learning_rate": 5e-06, | |
| "loss": 0.552, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.781321184510251, | |
| "grad_norm": 0.30596028970571365, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5416, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 3.8041002277904328, | |
| "grad_norm": 0.29635873101912724, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5497, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 3.8268792710706148, | |
| "grad_norm": 0.3197535185114282, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5429, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.849658314350797, | |
| "grad_norm": 0.3011450510960511, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5451, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.8724373576309796, | |
| "grad_norm": 0.29556191119994085, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5422, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.8952164009111616, | |
| "grad_norm": 0.29782230107739593, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5463, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.917995444191344, | |
| "grad_norm": 0.2786017141276205, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5505, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.940774487471526, | |
| "grad_norm": 0.3011821733566416, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5496, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.9635535307517085, | |
| "grad_norm": 0.2891980461006773, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5511, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.9863325740318905, | |
| "grad_norm": 0.29323206246213684, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5461, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.6239981055259705, | |
| "eval_runtime": 113.6401, | |
| "eval_samples_per_second": 104.039, | |
| "eval_steps_per_second": 0.414, | |
| "step": 1756 | |
| }, | |
| { | |
| "epoch": 4.009111617312073, | |
| "grad_norm": 0.31845782541537054, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5355, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 4.031890660592255, | |
| "grad_norm": 0.3247571121989434, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5112, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 4.054669703872437, | |
| "grad_norm": 0.33205526411756225, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5155, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 4.077448747152619, | |
| "grad_norm": 0.2961306281740012, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5107, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 4.100227790432802, | |
| "grad_norm": 0.29404582667456747, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5168, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.123006833712984, | |
| "grad_norm": 0.29099721485432206, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5126, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 4.145785876993166, | |
| "grad_norm": 0.2850302733912273, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5134, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 4.168564920273348, | |
| "grad_norm": 0.28612051891525286, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5093, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 4.191343963553531, | |
| "grad_norm": 0.29474345150964504, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5174, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 4.214123006833713, | |
| "grad_norm": 0.3002301077876599, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5172, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 4.236902050113895, | |
| "grad_norm": 0.2920498114457919, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5121, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 4.259681093394078, | |
| "grad_norm": 0.2942506005889262, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5101, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 4.28246013667426, | |
| "grad_norm": 0.3069703095242122, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5144, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 4.305239179954442, | |
| "grad_norm": 0.2997675139173423, | |
| "learning_rate": 5e-06, | |
| "loss": 0.517, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 4.328018223234624, | |
| "grad_norm": 0.3017224318798671, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5163, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.350797266514807, | |
| "grad_norm": 0.32710267583639663, | |
| "learning_rate": 5e-06, | |
| "loss": 0.517, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 4.373576309794989, | |
| "grad_norm": 0.35589318746923176, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5148, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 4.396355353075171, | |
| "grad_norm": 0.29856323049173306, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5167, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 4.4191343963553535, | |
| "grad_norm": 0.31561434261206783, | |
| "learning_rate": 5e-06, | |
| "loss": 0.51, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 4.4419134396355355, | |
| "grad_norm": 0.32343506635297187, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5172, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 4.4646924829157175, | |
| "grad_norm": 0.2920601079152555, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5134, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 4.4874715261958995, | |
| "grad_norm": 0.30047616053441856, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5123, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 4.510250569476082, | |
| "grad_norm": 0.299324693496857, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5177, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 4.533029612756264, | |
| "grad_norm": 0.30340189870468204, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5095, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 4.555808656036446, | |
| "grad_norm": 0.32578537933466617, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5126, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.578587699316628, | |
| "grad_norm": 0.3214168201654836, | |
| "learning_rate": 5e-06, | |
| "loss": 0.516, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 4.601366742596811, | |
| "grad_norm": 0.33535913909657655, | |
| "learning_rate": 5e-06, | |
| "loss": 0.516, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 4.624145785876993, | |
| "grad_norm": 0.34950298355241916, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5159, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 4.646924829157175, | |
| "grad_norm": 0.3067631933457454, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5103, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 4.669703872437358, | |
| "grad_norm": 0.31011573141446475, | |
| "learning_rate": 5e-06, | |
| "loss": 0.526, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 4.69248291571754, | |
| "grad_norm": 0.31485826733211975, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5205, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 4.715261958997722, | |
| "grad_norm": 0.2981899092207014, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5128, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 4.738041002277904, | |
| "grad_norm": 0.30070781244970324, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5195, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 4.760820045558087, | |
| "grad_norm": 0.29089650375946846, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5192, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 4.783599088838269, | |
| "grad_norm": 0.32942349791391345, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5155, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.806378132118451, | |
| "grad_norm": 0.30690284165223813, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5157, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 4.829157175398633, | |
| "grad_norm": 0.29755186369998715, | |
| "learning_rate": 5e-06, | |
| "loss": 0.515, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 4.851936218678816, | |
| "grad_norm": 0.29726224376764365, | |
| "learning_rate": 5e-06, | |
| "loss": 0.52, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 4.874715261958998, | |
| "grad_norm": 0.3121093770179605, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5214, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 4.89749430523918, | |
| "grad_norm": 0.2921989069170297, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5141, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 4.920273348519363, | |
| "grad_norm": 0.3137625481692737, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5191, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 4.943052391799545, | |
| "grad_norm": 0.31268104737726976, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5204, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 4.965831435079727, | |
| "grad_norm": 0.31240087794789345, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5214, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 4.988610478359909, | |
| "grad_norm": 0.27920850439212735, | |
| "learning_rate": 5e-06, | |
| "loss": 0.5179, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.6346195340156555, | |
| "eval_runtime": 112.3467, | |
| "eval_samples_per_second": 105.237, | |
| "eval_steps_per_second": 0.418, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 2195, | |
| "total_flos": 4601905523195904.0, | |
| "train_loss": 0.5737499028512177, | |
| "train_runtime": 30529.5931, | |
| "train_samples_per_second": 36.787, | |
| "train_steps_per_second": 0.072 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2195, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4601905523195904.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |