| { | |
| "best_global_step": 208, | |
| "best_metric": 0.305007666349411, | |
| "best_model_checkpoint": "tmp/out/512-96-ft-l1-r2.1_common_channel_fcmCtx8_fcmLayers2_fcmChMixingTrue_stride24_bs512_lr0.0003_89b6/checkpoint-208", | |
| "epoch": 104.0, | |
| "eval_steps": 500, | |
| "global_step": 208, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.6618412137031555, | |
| "learning_rate": 0.0002999998149449555, | |
| "loss": 0.3922, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.358045369386673, | |
| "eval_runtime": 0.9433, | |
| "eval_samples_per_second": 314.866, | |
| "eval_steps_per_second": 1.06, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.465237021446228, | |
| "learning_rate": 0.0002999983345073394, | |
| "loss": 0.3871, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.35514307022094727, | |
| "eval_runtime": 0.984, | |
| "eval_samples_per_second": 301.829, | |
| "eval_steps_per_second": 1.016, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.3141915500164032, | |
| "learning_rate": 0.00029999537364671844, | |
| "loss": 0.3814, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.3537431061267853, | |
| "eval_runtime": 1.0649, | |
| "eval_samples_per_second": 278.894, | |
| "eval_steps_per_second": 0.939, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.3491910696029663, | |
| "learning_rate": 0.0002999909323923152, | |
| "loss": 0.3733, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.3517496585845947, | |
| "eval_runtime": 0.9441, | |
| "eval_samples_per_second": 314.571, | |
| "eval_steps_per_second": 1.059, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.29127219319343567, | |
| "learning_rate": 0.000299985010787963, | |
| "loss": 0.3749, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.35048815608024597, | |
| "eval_runtime": 0.9185, | |
| "eval_samples_per_second": 323.353, | |
| "eval_steps_per_second": 1.089, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.33912065625190735, | |
| "learning_rate": 0.0002999776088921057, | |
| "loss": 0.3708, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.35033392906188965, | |
| "eval_runtime": 0.9493, | |
| "eval_samples_per_second": 312.868, | |
| "eval_steps_per_second": 1.053, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.311758428812027, | |
| "learning_rate": 0.0002999687267777971, | |
| "loss": 0.3685, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.35117536783218384, | |
| "eval_runtime": 0.7975, | |
| "eval_samples_per_second": 372.426, | |
| "eval_steps_per_second": 1.254, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.30109912157058716, | |
| "learning_rate": 0.0002999583645327, | |
| "loss": 0.3639, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.35066911578178406, | |
| "eval_runtime": 0.7655, | |
| "eval_samples_per_second": 388.003, | |
| "eval_steps_per_second": 1.306, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.3332655131816864, | |
| "learning_rate": 0.0002999465222590856, | |
| "loss": 0.3655, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.350115031003952, | |
| "eval_runtime": 0.7681, | |
| "eval_samples_per_second": 386.655, | |
| "eval_steps_per_second": 1.302, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.36181315779685974, | |
| "learning_rate": 0.00029993320007383234, | |
| "loss": 0.3627, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.35086938738822937, | |
| "eval_runtime": 0.7303, | |
| "eval_samples_per_second": 406.705, | |
| "eval_steps_per_second": 1.369, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 0.3522287607192993, | |
| "learning_rate": 0.0002999183981084249, | |
| "loss": 0.3575, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.3501550853252411, | |
| "eval_runtime": 0.6638, | |
| "eval_samples_per_second": 447.418, | |
| "eval_steps_per_second": 1.506, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.48265793919563293, | |
| "learning_rate": 0.0002999021165089526, | |
| "loss": 0.356, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.34953632950782776, | |
| "eval_runtime": 0.6847, | |
| "eval_samples_per_second": 433.784, | |
| "eval_steps_per_second": 1.461, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 0.452486515045166, | |
| "learning_rate": 0.0002998843554361083, | |
| "loss": 0.3572, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.34998443722724915, | |
| "eval_runtime": 0.6957, | |
| "eval_samples_per_second": 426.915, | |
| "eval_steps_per_second": 1.437, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 0.5479589700698853, | |
| "learning_rate": 0.0002998651150651866, | |
| "loss": 0.3602, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.3498837649822235, | |
| "eval_runtime": 0.7349, | |
| "eval_samples_per_second": 404.119, | |
| "eval_steps_per_second": 1.361, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.43432193994522095, | |
| "learning_rate": 0.00029984439558608224, | |
| "loss": 0.3552, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.34781429171562195, | |
| "eval_runtime": 0.6365, | |
| "eval_samples_per_second": 466.635, | |
| "eval_steps_per_second": 1.571, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.35709846019744873, | |
| "learning_rate": 0.00029982219720328803, | |
| "loss": 0.3442, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.34714677929878235, | |
| "eval_runtime": 0.7389, | |
| "eval_samples_per_second": 401.956, | |
| "eval_steps_per_second": 1.353, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.3510877192020416, | |
| "learning_rate": 0.00029979852013589306, | |
| "loss": 0.3466, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.3476266860961914, | |
| "eval_runtime": 0.582, | |
| "eval_samples_per_second": 510.287, | |
| "eval_steps_per_second": 1.718, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.6324598789215088, | |
| "learning_rate": 0.0002997733646175805, | |
| "loss": 0.3495, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.34712979197502136, | |
| "eval_runtime": 0.6306, | |
| "eval_samples_per_second": 470.991, | |
| "eval_steps_per_second": 1.586, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.3290903866291046, | |
| "learning_rate": 0.00029974673089662506, | |
| "loss": 0.343, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.3468128442764282, | |
| "eval_runtime": 0.6393, | |
| "eval_samples_per_second": 464.534, | |
| "eval_steps_per_second": 1.564, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.3430045545101166, | |
| "learning_rate": 0.00029971861923589095, | |
| "loss": 0.342, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.3459309935569763, | |
| "eval_runtime": 0.7921, | |
| "eval_samples_per_second": 374.931, | |
| "eval_steps_per_second": 1.262, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 0.39738816022872925, | |
| "learning_rate": 0.0002996890299128288, | |
| "loss": 0.3444, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.3452853560447693, | |
| "eval_runtime": 0.7918, | |
| "eval_samples_per_second": 375.108, | |
| "eval_steps_per_second": 1.263, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 0.3685533106327057, | |
| "learning_rate": 0.00029965796321947336, | |
| "loss": 0.3368, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 0.34537574648857117, | |
| "eval_runtime": 0.715, | |
| "eval_samples_per_second": 415.38, | |
| "eval_steps_per_second": 1.399, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 0.41565972566604614, | |
| "learning_rate": 0.00029962541946244024, | |
| "loss": 0.3337, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 0.34441086649894714, | |
| "eval_runtime": 0.5609, | |
| "eval_samples_per_second": 529.532, | |
| "eval_steps_per_second": 1.783, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 0.36327725648880005, | |
| "learning_rate": 0.00029959139896292323, | |
| "loss": 0.3342, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.34298038482666016, | |
| "eval_runtime": 0.672, | |
| "eval_samples_per_second": 441.965, | |
| "eval_steps_per_second": 1.488, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.4043494164943695, | |
| "learning_rate": 0.0002995559020566911, | |
| "loss": 0.3414, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.3428707420825958, | |
| "eval_runtime": 0.6633, | |
| "eval_samples_per_second": 447.772, | |
| "eval_steps_per_second": 1.508, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 0.40012046694755554, | |
| "learning_rate": 0.0002995189290940837, | |
| "loss": 0.3387, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 0.3441244959831238, | |
| "eval_runtime": 0.7445, | |
| "eval_samples_per_second": 398.927, | |
| "eval_steps_per_second": 1.343, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 0.4684393107891083, | |
| "learning_rate": 0.0002994804804400094, | |
| "loss": 0.3343, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 0.3433794677257538, | |
| "eval_runtime": 0.681, | |
| "eval_samples_per_second": 436.153, | |
| "eval_steps_per_second": 1.469, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 0.4722350239753723, | |
| "learning_rate": 0.00029944055647394087, | |
| "loss": 0.3366, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 0.3417203724384308, | |
| "eval_runtime": 0.6481, | |
| "eval_samples_per_second": 458.289, | |
| "eval_steps_per_second": 1.543, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 0.5819035768508911, | |
| "learning_rate": 0.0002993991575899116, | |
| "loss": 0.3338, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 0.3421356976032257, | |
| "eval_runtime": 0.8299, | |
| "eval_samples_per_second": 357.891, | |
| "eval_steps_per_second": 1.205, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.4985131621360779, | |
| "learning_rate": 0.0002993562841965118, | |
| "loss": 0.3314, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.34163638949394226, | |
| "eval_runtime": 0.6224, | |
| "eval_samples_per_second": 477.149, | |
| "eval_steps_per_second": 1.607, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 0.5123302340507507, | |
| "learning_rate": 0.00029931193671688453, | |
| "loss": 0.33, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 0.33960819244384766, | |
| "eval_runtime": 0.7702, | |
| "eval_samples_per_second": 385.59, | |
| "eval_steps_per_second": 1.298, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 0.34553447365760803, | |
| "learning_rate": 0.0002992661155887215, | |
| "loss": 0.3253, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 0.33889296650886536, | |
| "eval_runtime": 0.6787, | |
| "eval_samples_per_second": 437.601, | |
| "eval_steps_per_second": 1.473, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "grad_norm": 0.4334942698478699, | |
| "learning_rate": 0.0002992188212642587, | |
| "loss": 0.3221, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 0.3390357494354248, | |
| "eval_runtime": 0.7883, | |
| "eval_samples_per_second": 376.773, | |
| "eval_steps_per_second": 1.269, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 0.5000684261322021, | |
| "learning_rate": 0.0002991700542102722, | |
| "loss": 0.3228, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_loss": 0.3387429118156433, | |
| "eval_runtime": 0.7066, | |
| "eval_samples_per_second": 420.345, | |
| "eval_steps_per_second": 1.415, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 0.5403483510017395, | |
| "learning_rate": 0.000299119814908073, | |
| "loss": 0.3246, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 0.337623655796051, | |
| "eval_runtime": 0.5666, | |
| "eval_samples_per_second": 524.176, | |
| "eval_steps_per_second": 1.765, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 0.3801553547382355, | |
| "learning_rate": 0.00029906810385350283, | |
| "loss": 0.3215, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 0.3370514214038849, | |
| "eval_runtime": 0.7169, | |
| "eval_samples_per_second": 414.265, | |
| "eval_steps_per_second": 1.395, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "grad_norm": 0.6648715138435364, | |
| "learning_rate": 0.00029901492155692876, | |
| "loss": 0.3205, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_loss": 0.3367176651954651, | |
| "eval_runtime": 0.6219, | |
| "eval_samples_per_second": 477.599, | |
| "eval_steps_per_second": 1.608, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 0.38467201590538025, | |
| "learning_rate": 0.0002989602685432388, | |
| "loss": 0.3175, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_loss": 0.3385818302631378, | |
| "eval_runtime": 0.7477, | |
| "eval_samples_per_second": 397.199, | |
| "eval_steps_per_second": 1.337, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "grad_norm": 0.5086773633956909, | |
| "learning_rate": 0.00029890414535183583, | |
| "loss": 0.3188, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_loss": 0.33492594957351685, | |
| "eval_runtime": 0.7572, | |
| "eval_samples_per_second": 392.254, | |
| "eval_steps_per_second": 1.321, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 0.4003610610961914, | |
| "learning_rate": 0.00029884655253663344, | |
| "loss": 0.3197, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 0.33226659893989563, | |
| "eval_runtime": 0.738, | |
| "eval_samples_per_second": 402.452, | |
| "eval_steps_per_second": 1.355, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "grad_norm": 0.4991908371448517, | |
| "learning_rate": 0.00029878749066604925, | |
| "loss": 0.313, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_loss": 0.331875741481781, | |
| "eval_runtime": 0.7609, | |
| "eval_samples_per_second": 390.343, | |
| "eval_steps_per_second": 1.314, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 0.450600802898407, | |
| "learning_rate": 0.0002987269603230001, | |
| "loss": 0.3142, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_loss": 0.33316686749458313, | |
| "eval_runtime": 0.6259, | |
| "eval_samples_per_second": 474.552, | |
| "eval_steps_per_second": 1.598, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "grad_norm": 0.7004669308662415, | |
| "learning_rate": 0.00029866496210489605, | |
| "loss": 0.3112, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_loss": 0.3317732810974121, | |
| "eval_runtime": 0.7352, | |
| "eval_samples_per_second": 403.996, | |
| "eval_steps_per_second": 1.36, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 0.4675546884536743, | |
| "learning_rate": 0.0002986014966236345, | |
| "loss": 0.3118, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_loss": 0.32955238223075867, | |
| "eval_runtime": 0.8122, | |
| "eval_samples_per_second": 365.683, | |
| "eval_steps_per_second": 1.231, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 0.5740877389907837, | |
| "learning_rate": 0.00029853656450559414, | |
| "loss": 0.3083, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_loss": 0.32967710494995117, | |
| "eval_runtime": 0.7609, | |
| "eval_samples_per_second": 390.346, | |
| "eval_steps_per_second": 1.314, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 0.5421927571296692, | |
| "learning_rate": 0.00029847016639162867, | |
| "loss": 0.3125, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_loss": 0.33168312907218933, | |
| "eval_runtime": 0.6488, | |
| "eval_samples_per_second": 457.788, | |
| "eval_steps_per_second": 1.541, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "grad_norm": 0.6857365369796753, | |
| "learning_rate": 0.0002984023029370609, | |
| "loss": 0.3035, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_loss": 0.3298337161540985, | |
| "eval_runtime": 0.6488, | |
| "eval_samples_per_second": 457.751, | |
| "eval_steps_per_second": 1.541, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 0.8388419151306152, | |
| "learning_rate": 0.0002983329748116755, | |
| "loss": 0.3031, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_loss": 0.32884910702705383, | |
| "eval_runtime": 0.6776, | |
| "eval_samples_per_second": 438.283, | |
| "eval_steps_per_second": 1.476, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "grad_norm": 0.46255478262901306, | |
| "learning_rate": 0.00029826218269971314, | |
| "loss": 0.305, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_loss": 0.3276749551296234, | |
| "eval_runtime": 0.733, | |
| "eval_samples_per_second": 405.188, | |
| "eval_steps_per_second": 1.364, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 0.44940125942230225, | |
| "learning_rate": 0.0002981899272998634, | |
| "loss": 0.302, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_loss": 0.3282870352268219, | |
| "eval_runtime": 0.7162, | |
| "eval_samples_per_second": 414.71, | |
| "eval_steps_per_second": 1.396, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "grad_norm": 0.5168047547340393, | |
| "learning_rate": 0.0002981162093252579, | |
| "loss": 0.2989, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "eval_loss": 0.33124735951423645, | |
| "eval_runtime": 0.629, | |
| "eval_samples_per_second": 472.143, | |
| "eval_steps_per_second": 1.59, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "grad_norm": 0.5312138795852661, | |
| "learning_rate": 0.00029804102950346334, | |
| "loss": 0.2957, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "eval_loss": 0.3266555964946747, | |
| "eval_runtime": 0.6431, | |
| "eval_samples_per_second": 461.793, | |
| "eval_steps_per_second": 1.555, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "grad_norm": 0.5349692702293396, | |
| "learning_rate": 0.0002979643885764741, | |
| "loss": 0.2975, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "eval_loss": 0.3233616352081299, | |
| "eval_runtime": 0.8057, | |
| "eval_samples_per_second": 368.612, | |
| "eval_steps_per_second": 1.241, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "grad_norm": 0.36133870482444763, | |
| "learning_rate": 0.00029788628730070533, | |
| "loss": 0.2957, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "eval_loss": 0.323390930891037, | |
| "eval_runtime": 0.6815, | |
| "eval_samples_per_second": 435.829, | |
| "eval_steps_per_second": 1.467, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 0.4435558021068573, | |
| "learning_rate": 0.00029780672644698494, | |
| "loss": 0.2956, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "eval_loss": 0.3230491578578949, | |
| "eval_runtime": 0.6254, | |
| "eval_samples_per_second": 474.93, | |
| "eval_steps_per_second": 1.599, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "grad_norm": 0.4963320791721344, | |
| "learning_rate": 0.0002977257068005465, | |
| "loss": 0.2927, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "eval_loss": 0.321744829416275, | |
| "eval_runtime": 0.7031, | |
| "eval_samples_per_second": 422.43, | |
| "eval_steps_per_second": 1.422, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "grad_norm": 0.47572246193885803, | |
| "learning_rate": 0.0002976432291610213, | |
| "loss": 0.2945, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "eval_loss": 0.32216939330101013, | |
| "eval_runtime": 0.77, | |
| "eval_samples_per_second": 385.727, | |
| "eval_steps_per_second": 1.299, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "grad_norm": 0.4532373547554016, | |
| "learning_rate": 0.00029755929434243023, | |
| "loss": 0.2912, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "eval_loss": 0.32353639602661133, | |
| "eval_runtime": 0.7486, | |
| "eval_samples_per_second": 396.744, | |
| "eval_steps_per_second": 1.336, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "grad_norm": 0.37667471170425415, | |
| "learning_rate": 0.00029747390317317603, | |
| "loss": 0.2892, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "eval_loss": 0.3225230276584625, | |
| "eval_runtime": 0.6999, | |
| "eval_samples_per_second": 424.335, | |
| "eval_steps_per_second": 1.429, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 0.48530539870262146, | |
| "learning_rate": 0.0002973870564960352, | |
| "loss": 0.2874, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "eval_loss": 0.32104259729385376, | |
| "eval_runtime": 0.6522, | |
| "eval_samples_per_second": 455.417, | |
| "eval_steps_per_second": 1.533, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "grad_norm": 0.46407485008239746, | |
| "learning_rate": 0.00029729875516814935, | |
| "loss": 0.2836, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "eval_loss": 0.32013222575187683, | |
| "eval_runtime": 0.8233, | |
| "eval_samples_per_second": 360.757, | |
| "eval_steps_per_second": 1.215, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "grad_norm": 0.6122850179672241, | |
| "learning_rate": 0.0002972090000610169, | |
| "loss": 0.2894, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "eval_loss": 0.3198564350605011, | |
| "eval_runtime": 0.6743, | |
| "eval_samples_per_second": 440.478, | |
| "eval_steps_per_second": 1.483, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "grad_norm": 0.5037839412689209, | |
| "learning_rate": 0.0002971177920604845, | |
| "loss": 0.2891, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "eval_loss": 0.31861403584480286, | |
| "eval_runtime": 0.6524, | |
| "eval_samples_per_second": 455.235, | |
| "eval_steps_per_second": 1.533, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "grad_norm": 0.4292280972003937, | |
| "learning_rate": 0.00029702513206673827, | |
| "loss": 0.2895, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "eval_loss": 0.32153990864753723, | |
| "eval_runtime": 0.6418, | |
| "eval_samples_per_second": 462.761, | |
| "eval_steps_per_second": 1.558, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 0.4504665732383728, | |
| "learning_rate": 0.000296931020994295, | |
| "loss": 0.2829, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "eval_loss": 0.3232710063457489, | |
| "eval_runtime": 0.7745, | |
| "eval_samples_per_second": 383.487, | |
| "eval_steps_per_second": 1.291, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "grad_norm": 0.6219745874404907, | |
| "learning_rate": 0.000296835459771993, | |
| "loss": 0.2859, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "eval_loss": 0.3181236982345581, | |
| "eval_runtime": 0.7429, | |
| "eval_samples_per_second": 399.762, | |
| "eval_steps_per_second": 1.346, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "grad_norm": 0.37324589490890503, | |
| "learning_rate": 0.0002967384493429829, | |
| "loss": 0.2798, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "eval_loss": 0.31664231419563293, | |
| "eval_runtime": 0.6029, | |
| "eval_samples_per_second": 492.583, | |
| "eval_steps_per_second": 1.659, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "grad_norm": 0.48705121874809265, | |
| "learning_rate": 0.0002966399906647185, | |
| "loss": 0.281, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "eval_loss": 0.3166642189025879, | |
| "eval_runtime": 0.7131, | |
| "eval_samples_per_second": 416.468, | |
| "eval_steps_per_second": 1.402, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "grad_norm": 0.4066949188709259, | |
| "learning_rate": 0.0002965400847089472, | |
| "loss": 0.2822, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "eval_loss": 0.3151547312736511, | |
| "eval_runtime": 0.7062, | |
| "eval_samples_per_second": 420.548, | |
| "eval_steps_per_second": 1.416, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 0.3984168469905853, | |
| "learning_rate": 0.00029643873246170045, | |
| "loss": 0.2801, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "eval_loss": 0.314954549074173, | |
| "eval_runtime": 0.6916, | |
| "eval_samples_per_second": 429.454, | |
| "eval_steps_per_second": 1.446, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "grad_norm": 0.4139808118343353, | |
| "learning_rate": 0.000296335934923284, | |
| "loss": 0.2815, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "eval_loss": 0.31498512625694275, | |
| "eval_runtime": 0.6363, | |
| "eval_samples_per_second": 466.781, | |
| "eval_steps_per_second": 1.572, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "grad_norm": 0.4868161678314209, | |
| "learning_rate": 0.0002962316931082681, | |
| "loss": 0.2781, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "eval_loss": 0.31632447242736816, | |
| "eval_runtime": 0.7076, | |
| "eval_samples_per_second": 419.738, | |
| "eval_steps_per_second": 1.413, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "grad_norm": 0.41822776198387146, | |
| "learning_rate": 0.0002961260080454773, | |
| "loss": 0.2771, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "eval_loss": 0.31577062606811523, | |
| "eval_runtime": 0.6483, | |
| "eval_samples_per_second": 458.136, | |
| "eval_steps_per_second": 1.543, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "grad_norm": 0.5603657960891724, | |
| "learning_rate": 0.0002960188807779805, | |
| "loss": 0.2732, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "eval_loss": 0.31435203552246094, | |
| "eval_runtime": 0.691, | |
| "eval_samples_per_second": 429.801, | |
| "eval_steps_per_second": 1.447, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "grad_norm": 0.5573254227638245, | |
| "learning_rate": 0.0002959103123630807, | |
| "loss": 0.2766, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "eval_loss": 0.3145710527896881, | |
| "eval_runtime": 0.5527, | |
| "eval_samples_per_second": 537.332, | |
| "eval_steps_per_second": 1.809, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "grad_norm": 0.4642890393733978, | |
| "learning_rate": 0.0002958003038723042, | |
| "loss": 0.2798, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "eval_loss": 0.3158363401889801, | |
| "eval_runtime": 0.6733, | |
| "eval_samples_per_second": 441.12, | |
| "eval_steps_per_second": 1.485, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 77.0, | |
| "grad_norm": 0.4938775300979614, | |
| "learning_rate": 0.00029568885639139053, | |
| "loss": 0.2753, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 77.0, | |
| "eval_loss": 0.31477540731430054, | |
| "eval_runtime": 0.7417, | |
| "eval_samples_per_second": 400.422, | |
| "eval_steps_per_second": 1.348, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 78.0, | |
| "grad_norm": 0.39144381880760193, | |
| "learning_rate": 0.00029557597102028123, | |
| "loss": 0.2732, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 78.0, | |
| "eval_loss": 0.31341299414634705, | |
| "eval_runtime": 0.7854, | |
| "eval_samples_per_second": 378.137, | |
| "eval_steps_per_second": 1.273, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 79.0, | |
| "grad_norm": 0.4929453432559967, | |
| "learning_rate": 0.00029546164887310933, | |
| "loss": 0.2711, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 79.0, | |
| "eval_loss": 0.31385812163352966, | |
| "eval_runtime": 0.6657, | |
| "eval_samples_per_second": 446.114, | |
| "eval_steps_per_second": 1.502, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "grad_norm": 0.40074649453163147, | |
| "learning_rate": 0.0002953458910781883, | |
| "loss": 0.2677, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "eval_loss": 0.3135402500629425, | |
| "eval_runtime": 0.6596, | |
| "eval_samples_per_second": 450.303, | |
| "eval_steps_per_second": 1.516, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 81.0, | |
| "grad_norm": 0.41622215509414673, | |
| "learning_rate": 0.0002952286987780008, | |
| "loss": 0.2668, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 81.0, | |
| "eval_loss": 0.31341665983200073, | |
| "eval_runtime": 0.5644, | |
| "eval_samples_per_second": 526.236, | |
| "eval_steps_per_second": 1.772, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 82.0, | |
| "grad_norm": 0.5020734667778015, | |
| "learning_rate": 0.0002951100731291876, | |
| "loss": 0.2734, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 82.0, | |
| "eval_loss": 0.3154440224170685, | |
| "eval_runtime": 0.7427, | |
| "eval_samples_per_second": 399.916, | |
| "eval_steps_per_second": 1.347, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 83.0, | |
| "grad_norm": 0.7120861411094666, | |
| "learning_rate": 0.0002949900153025359, | |
| "loss": 0.2712, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 83.0, | |
| "eval_loss": 0.3155328929424286, | |
| "eval_runtime": 0.734, | |
| "eval_samples_per_second": 404.611, | |
| "eval_steps_per_second": 1.362, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 84.0, | |
| "grad_norm": 0.5501344799995422, | |
| "learning_rate": 0.00029486852648296806, | |
| "loss": 0.2705, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 84.0, | |
| "eval_loss": 0.31369757652282715, | |
| "eval_runtime": 0.6073, | |
| "eval_samples_per_second": 489.085, | |
| "eval_steps_per_second": 1.647, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "grad_norm": 0.3931790292263031, | |
| "learning_rate": 0.00029474560786952957, | |
| "loss": 0.273, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "eval_loss": 0.3132490813732147, | |
| "eval_runtime": 0.6683, | |
| "eval_samples_per_second": 444.398, | |
| "eval_steps_per_second": 1.496, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 86.0, | |
| "grad_norm": 0.4931921064853668, | |
| "learning_rate": 0.00029462126067537756, | |
| "loss": 0.2713, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 86.0, | |
| "eval_loss": 0.3119468688964844, | |
| "eval_runtime": 0.5093, | |
| "eval_samples_per_second": 583.176, | |
| "eval_steps_per_second": 1.964, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 87.0, | |
| "grad_norm": 0.5810508131980896, | |
| "learning_rate": 0.00029449548612776866, | |
| "loss": 0.2665, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 87.0, | |
| "eval_loss": 0.31254178285598755, | |
| "eval_runtime": 0.7282, | |
| "eval_samples_per_second": 407.876, | |
| "eval_steps_per_second": 1.373, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 88.0, | |
| "grad_norm": 0.6227275729179382, | |
| "learning_rate": 0.00029436828546804686, | |
| "loss": 0.2719, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 88.0, | |
| "eval_loss": 0.3148057162761688, | |
| "eval_runtime": 0.7497, | |
| "eval_samples_per_second": 396.169, | |
| "eval_steps_per_second": 1.334, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 89.0, | |
| "grad_norm": 0.6895466446876526, | |
| "learning_rate": 0.0002942396599516314, | |
| "loss": 0.2666, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 89.0, | |
| "eval_loss": 0.31230518221855164, | |
| "eval_runtime": 0.6871, | |
| "eval_samples_per_second": 432.239, | |
| "eval_steps_per_second": 1.455, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "grad_norm": 0.6018730401992798, | |
| "learning_rate": 0.0002941096108480041, | |
| "loss": 0.2641, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "eval_loss": 0.3139157295227051, | |
| "eval_runtime": 0.6814, | |
| "eval_samples_per_second": 435.86, | |
| "eval_steps_per_second": 1.468, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 91.0, | |
| "grad_norm": 0.5773816704750061, | |
| "learning_rate": 0.0002939781394406971, | |
| "loss": 0.2679, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 91.0, | |
| "eval_loss": 0.314519464969635, | |
| "eval_runtime": 0.6242, | |
| "eval_samples_per_second": 475.778, | |
| "eval_steps_per_second": 1.602, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 92.0, | |
| "grad_norm": 0.430232971906662, | |
| "learning_rate": 0.00029384524702728013, | |
| "loss": 0.2636, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 92.0, | |
| "eval_loss": 0.3126896917819977, | |
| "eval_runtime": 0.686, | |
| "eval_samples_per_second": 432.948, | |
| "eval_steps_per_second": 1.458, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 93.0, | |
| "grad_norm": 0.5966992974281311, | |
| "learning_rate": 0.0002937109349193477, | |
| "loss": 0.2662, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 93.0, | |
| "eval_loss": 0.3127867877483368, | |
| "eval_runtime": 0.704, | |
| "eval_samples_per_second": 421.861, | |
| "eval_steps_per_second": 1.42, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 94.0, | |
| "grad_norm": 0.5436354875564575, | |
| "learning_rate": 0.000293575204442506, | |
| "loss": 0.2625, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 94.0, | |
| "eval_loss": 0.3121253550052643, | |
| "eval_runtime": 0.6839, | |
| "eval_samples_per_second": 434.267, | |
| "eval_steps_per_second": 1.462, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "grad_norm": 0.6525291800498962, | |
| "learning_rate": 0.00029343805693636017, | |
| "loss": 0.2643, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "eval_loss": 0.30872872471809387, | |
| "eval_runtime": 0.7643, | |
| "eval_samples_per_second": 388.597, | |
| "eval_steps_per_second": 1.308, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 96.0, | |
| "grad_norm": 0.41923660039901733, | |
| "learning_rate": 0.0002932994937545007, | |
| "loss": 0.2605, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 96.0, | |
| "eval_loss": 0.30773305892944336, | |
| "eval_runtime": 0.621, | |
| "eval_samples_per_second": 478.226, | |
| "eval_steps_per_second": 1.61, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 97.0, | |
| "grad_norm": 0.49382027983665466, | |
| "learning_rate": 0.0002931595162644901, | |
| "loss": 0.2613, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 97.0, | |
| "eval_loss": 0.3081483542919159, | |
| "eval_runtime": 0.5459, | |
| "eval_samples_per_second": 544.096, | |
| "eval_steps_per_second": 1.832, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 98.0, | |
| "grad_norm": 0.3870932459831238, | |
| "learning_rate": 0.0002930181258478499, | |
| "loss": 0.2655, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 98.0, | |
| "eval_loss": 0.3087702691555023, | |
| "eval_runtime": 0.7521, | |
| "eval_samples_per_second": 394.875, | |
| "eval_steps_per_second": 1.33, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 99.0, | |
| "grad_norm": 0.3989209532737732, | |
| "learning_rate": 0.00029287532390004633, | |
| "loss": 0.2605, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 99.0, | |
| "eval_loss": 0.30880430340766907, | |
| "eval_runtime": 0.6471, | |
| "eval_samples_per_second": 458.95, | |
| "eval_steps_per_second": 1.545, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "grad_norm": 0.6288752555847168, | |
| "learning_rate": 0.00029273111183047697, | |
| "loss": 0.2624, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "eval_loss": 0.30914193391799927, | |
| "eval_runtime": 0.7927, | |
| "eval_samples_per_second": 374.659, | |
| "eval_steps_per_second": 1.261, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 101.0, | |
| "grad_norm": 0.7080681324005127, | |
| "learning_rate": 0.0002925854910624568, | |
| "loss": 0.2532, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 101.0, | |
| "eval_loss": 0.3055001199245453, | |
| "eval_runtime": 0.6808, | |
| "eval_samples_per_second": 436.251, | |
| "eval_steps_per_second": 1.469, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 102.0, | |
| "grad_norm": 0.6294286251068115, | |
| "learning_rate": 0.00029243846303320386, | |
| "loss": 0.2603, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 102.0, | |
| "eval_loss": 0.3056044280529022, | |
| "eval_runtime": 0.7558, | |
| "eval_samples_per_second": 392.961, | |
| "eval_steps_per_second": 1.323, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 103.0, | |
| "grad_norm": 0.6036813259124756, | |
| "learning_rate": 0.0002922900291938255, | |
| "loss": 0.2576, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 103.0, | |
| "eval_loss": 0.30621686577796936, | |
| "eval_runtime": 0.7737, | |
| "eval_samples_per_second": 383.867, | |
| "eval_steps_per_second": 1.292, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 104.0, | |
| "grad_norm": 0.44163164496421814, | |
| "learning_rate": 0.00029214019100930384, | |
| "loss": 0.2532, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 104.0, | |
| "eval_loss": 0.305007666349411, | |
| "eval_runtime": 0.7019, | |
| "eval_samples_per_second": 423.12, | |
| "eval_steps_per_second": 1.425, | |
| "step": 208 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 2000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1000, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 10, | |
| "early_stopping_threshold": 1e-05 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1925114350927872.0, | |
| "train_batch_size": 512, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |