{ "best_global_step": 208, "best_metric": 0.305007666349411, "best_model_checkpoint": "tmp/out/512-96-ft-l1-r2.1_common_channel_fcmCtx8_fcmLayers2_fcmChMixingTrue_stride24_bs512_lr0.0003_89b6/checkpoint-208", "epoch": 104.0, "eval_steps": 500, "global_step": 208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.6618412137031555, "learning_rate": 0.0002999998149449555, "loss": 0.3922, "step": 2 }, { "epoch": 1.0, "eval_loss": 0.358045369386673, "eval_runtime": 0.9433, "eval_samples_per_second": 314.866, "eval_steps_per_second": 1.06, "step": 2 }, { "epoch": 2.0, "grad_norm": 0.465237021446228, "learning_rate": 0.0002999983345073394, "loss": 0.3871, "step": 4 }, { "epoch": 2.0, "eval_loss": 0.35514307022094727, "eval_runtime": 0.984, "eval_samples_per_second": 301.829, "eval_steps_per_second": 1.016, "step": 4 }, { "epoch": 3.0, "grad_norm": 0.3141915500164032, "learning_rate": 0.00029999537364671844, "loss": 0.3814, "step": 6 }, { "epoch": 3.0, "eval_loss": 0.3537431061267853, "eval_runtime": 1.0649, "eval_samples_per_second": 278.894, "eval_steps_per_second": 0.939, "step": 6 }, { "epoch": 4.0, "grad_norm": 0.3491910696029663, "learning_rate": 0.0002999909323923152, "loss": 0.3733, "step": 8 }, { "epoch": 4.0, "eval_loss": 0.3517496585845947, "eval_runtime": 0.9441, "eval_samples_per_second": 314.571, "eval_steps_per_second": 1.059, "step": 8 }, { "epoch": 5.0, "grad_norm": 0.29127219319343567, "learning_rate": 0.000299985010787963, "loss": 0.3749, "step": 10 }, { "epoch": 5.0, "eval_loss": 0.35048815608024597, "eval_runtime": 0.9185, "eval_samples_per_second": 323.353, "eval_steps_per_second": 1.089, "step": 10 }, { "epoch": 6.0, "grad_norm": 0.33912065625190735, "learning_rate": 0.0002999776088921057, "loss": 0.3708, "step": 12 }, { "epoch": 6.0, "eval_loss": 0.35033392906188965, "eval_runtime": 0.9493, "eval_samples_per_second": 312.868, "eval_steps_per_second": 1.053, "step": 12 }, { "epoch": 7.0, "grad_norm": 0.311758428812027, "learning_rate": 0.0002999687267777971, "loss": 0.3685, "step": 14 }, { "epoch": 7.0, "eval_loss": 0.35117536783218384, "eval_runtime": 0.7975, "eval_samples_per_second": 372.426, "eval_steps_per_second": 1.254, "step": 14 }, { "epoch": 8.0, "grad_norm": 0.30109912157058716, "learning_rate": 0.0002999583645327, "loss": 0.3639, "step": 16 }, { "epoch": 8.0, "eval_loss": 0.35066911578178406, "eval_runtime": 0.7655, "eval_samples_per_second": 388.003, "eval_steps_per_second": 1.306, "step": 16 }, { "epoch": 9.0, "grad_norm": 0.3332655131816864, "learning_rate": 0.0002999465222590856, "loss": 0.3655, "step": 18 }, { "epoch": 9.0, "eval_loss": 0.350115031003952, "eval_runtime": 0.7681, "eval_samples_per_second": 386.655, "eval_steps_per_second": 1.302, "step": 18 }, { "epoch": 10.0, "grad_norm": 0.36181315779685974, "learning_rate": 0.00029993320007383234, "loss": 0.3627, "step": 20 }, { "epoch": 10.0, "eval_loss": 0.35086938738822937, "eval_runtime": 0.7303, "eval_samples_per_second": 406.705, "eval_steps_per_second": 1.369, "step": 20 }, { "epoch": 11.0, "grad_norm": 0.3522287607192993, "learning_rate": 0.0002999183981084249, "loss": 0.3575, "step": 22 }, { "epoch": 11.0, "eval_loss": 0.3501550853252411, "eval_runtime": 0.6638, "eval_samples_per_second": 447.418, "eval_steps_per_second": 1.506, "step": 22 }, { "epoch": 12.0, "grad_norm": 0.48265793919563293, "learning_rate": 0.0002999021165089526, "loss": 0.356, "step": 24 }, { "epoch": 12.0, "eval_loss": 0.34953632950782776, "eval_runtime": 0.6847, "eval_samples_per_second": 433.784, "eval_steps_per_second": 1.461, "step": 24 }, { "epoch": 13.0, "grad_norm": 0.452486515045166, "learning_rate": 0.0002998843554361083, "loss": 0.3572, "step": 26 }, { "epoch": 13.0, "eval_loss": 0.34998443722724915, "eval_runtime": 0.6957, "eval_samples_per_second": 426.915, "eval_steps_per_second": 1.437, "step": 26 }, { "epoch": 14.0, "grad_norm": 0.5479589700698853, "learning_rate": 0.0002998651150651866, "loss": 0.3602, "step": 28 }, { "epoch": 14.0, "eval_loss": 0.3498837649822235, "eval_runtime": 0.7349, "eval_samples_per_second": 404.119, "eval_steps_per_second": 1.361, "step": 28 }, { "epoch": 15.0, "grad_norm": 0.43432193994522095, "learning_rate": 0.00029984439558608224, "loss": 0.3552, "step": 30 }, { "epoch": 15.0, "eval_loss": 0.34781429171562195, "eval_runtime": 0.6365, "eval_samples_per_second": 466.635, "eval_steps_per_second": 1.571, "step": 30 }, { "epoch": 16.0, "grad_norm": 0.35709846019744873, "learning_rate": 0.00029982219720328803, "loss": 0.3442, "step": 32 }, { "epoch": 16.0, "eval_loss": 0.34714677929878235, "eval_runtime": 0.7389, "eval_samples_per_second": 401.956, "eval_steps_per_second": 1.353, "step": 32 }, { "epoch": 17.0, "grad_norm": 0.3510877192020416, "learning_rate": 0.00029979852013589306, "loss": 0.3466, "step": 34 }, { "epoch": 17.0, "eval_loss": 0.3476266860961914, "eval_runtime": 0.582, "eval_samples_per_second": 510.287, "eval_steps_per_second": 1.718, "step": 34 }, { "epoch": 18.0, "grad_norm": 0.6324598789215088, "learning_rate": 0.0002997733646175805, "loss": 0.3495, "step": 36 }, { "epoch": 18.0, "eval_loss": 0.34712979197502136, "eval_runtime": 0.6306, "eval_samples_per_second": 470.991, "eval_steps_per_second": 1.586, "step": 36 }, { "epoch": 19.0, "grad_norm": 0.3290903866291046, "learning_rate": 0.00029974673089662506, "loss": 0.343, "step": 38 }, { "epoch": 19.0, "eval_loss": 0.3468128442764282, "eval_runtime": 0.6393, "eval_samples_per_second": 464.534, "eval_steps_per_second": 1.564, "step": 38 }, { "epoch": 20.0, "grad_norm": 0.3430045545101166, "learning_rate": 0.00029971861923589095, "loss": 0.342, "step": 40 }, { "epoch": 20.0, "eval_loss": 0.3459309935569763, "eval_runtime": 0.7921, "eval_samples_per_second": 374.931, "eval_steps_per_second": 1.262, "step": 40 }, { "epoch": 21.0, "grad_norm": 0.39738816022872925, "learning_rate": 0.0002996890299128288, "loss": 0.3444, "step": 42 }, { "epoch": 21.0, "eval_loss": 0.3452853560447693, "eval_runtime": 0.7918, "eval_samples_per_second": 375.108, "eval_steps_per_second": 1.263, "step": 42 }, { "epoch": 22.0, "grad_norm": 0.3685533106327057, "learning_rate": 0.00029965796321947336, "loss": 0.3368, "step": 44 }, { "epoch": 22.0, "eval_loss": 0.34537574648857117, "eval_runtime": 0.715, "eval_samples_per_second": 415.38, "eval_steps_per_second": 1.399, "step": 44 }, { "epoch": 23.0, "grad_norm": 0.41565972566604614, "learning_rate": 0.00029962541946244024, "loss": 0.3337, "step": 46 }, { "epoch": 23.0, "eval_loss": 0.34441086649894714, "eval_runtime": 0.5609, "eval_samples_per_second": 529.532, "eval_steps_per_second": 1.783, "step": 46 }, { "epoch": 24.0, "grad_norm": 0.36327725648880005, "learning_rate": 0.00029959139896292323, "loss": 0.3342, "step": 48 }, { "epoch": 24.0, "eval_loss": 0.34298038482666016, "eval_runtime": 0.672, "eval_samples_per_second": 441.965, "eval_steps_per_second": 1.488, "step": 48 }, { "epoch": 25.0, "grad_norm": 0.4043494164943695, "learning_rate": 0.0002995559020566911, "loss": 0.3414, "step": 50 }, { "epoch": 25.0, "eval_loss": 0.3428707420825958, "eval_runtime": 0.6633, "eval_samples_per_second": 447.772, "eval_steps_per_second": 1.508, "step": 50 }, { "epoch": 26.0, "grad_norm": 0.40012046694755554, "learning_rate": 0.0002995189290940837, "loss": 0.3387, "step": 52 }, { "epoch": 26.0, "eval_loss": 0.3441244959831238, "eval_runtime": 0.7445, "eval_samples_per_second": 398.927, "eval_steps_per_second": 1.343, "step": 52 }, { "epoch": 27.0, "grad_norm": 0.4684393107891083, "learning_rate": 0.0002994804804400094, "loss": 0.3343, "step": 54 }, { "epoch": 27.0, "eval_loss": 0.3433794677257538, "eval_runtime": 0.681, "eval_samples_per_second": 436.153, "eval_steps_per_second": 1.469, "step": 54 }, { "epoch": 28.0, "grad_norm": 0.4722350239753723, "learning_rate": 0.00029944055647394087, "loss": 0.3366, "step": 56 }, { "epoch": 28.0, "eval_loss": 0.3417203724384308, "eval_runtime": 0.6481, "eval_samples_per_second": 458.289, "eval_steps_per_second": 1.543, "step": 56 }, { "epoch": 29.0, "grad_norm": 0.5819035768508911, "learning_rate": 0.0002993991575899116, "loss": 0.3338, "step": 58 }, { "epoch": 29.0, "eval_loss": 0.3421356976032257, "eval_runtime": 0.8299, "eval_samples_per_second": 357.891, "eval_steps_per_second": 1.205, "step": 58 }, { "epoch": 30.0, "grad_norm": 0.4985131621360779, "learning_rate": 0.0002993562841965118, "loss": 0.3314, "step": 60 }, { "epoch": 30.0, "eval_loss": 0.34163638949394226, "eval_runtime": 0.6224, "eval_samples_per_second": 477.149, "eval_steps_per_second": 1.607, "step": 60 }, { "epoch": 31.0, "grad_norm": 0.5123302340507507, "learning_rate": 0.00029931193671688453, "loss": 0.33, "step": 62 }, { "epoch": 31.0, "eval_loss": 0.33960819244384766, "eval_runtime": 0.7702, "eval_samples_per_second": 385.59, "eval_steps_per_second": 1.298, "step": 62 }, { "epoch": 32.0, "grad_norm": 0.34553447365760803, "learning_rate": 0.0002992661155887215, "loss": 0.3253, "step": 64 }, { "epoch": 32.0, "eval_loss": 0.33889296650886536, "eval_runtime": 0.6787, "eval_samples_per_second": 437.601, "eval_steps_per_second": 1.473, "step": 64 }, { "epoch": 33.0, "grad_norm": 0.4334942698478699, "learning_rate": 0.0002992188212642587, "loss": 0.3221, "step": 66 }, { "epoch": 33.0, "eval_loss": 0.3390357494354248, "eval_runtime": 0.7883, "eval_samples_per_second": 376.773, "eval_steps_per_second": 1.269, "step": 66 }, { "epoch": 34.0, "grad_norm": 0.5000684261322021, "learning_rate": 0.0002991700542102722, "loss": 0.3228, "step": 68 }, { "epoch": 34.0, "eval_loss": 0.3387429118156433, "eval_runtime": 0.7066, "eval_samples_per_second": 420.345, "eval_steps_per_second": 1.415, "step": 68 }, { "epoch": 35.0, "grad_norm": 0.5403483510017395, "learning_rate": 0.000299119814908073, "loss": 0.3246, "step": 70 }, { "epoch": 35.0, "eval_loss": 0.337623655796051, "eval_runtime": 0.5666, "eval_samples_per_second": 524.176, "eval_steps_per_second": 1.765, "step": 70 }, { "epoch": 36.0, "grad_norm": 0.3801553547382355, "learning_rate": 0.00029906810385350283, "loss": 0.3215, "step": 72 }, { "epoch": 36.0, "eval_loss": 0.3370514214038849, "eval_runtime": 0.7169, "eval_samples_per_second": 414.265, "eval_steps_per_second": 1.395, "step": 72 }, { "epoch": 37.0, "grad_norm": 0.6648715138435364, "learning_rate": 0.00029901492155692876, "loss": 0.3205, "step": 74 }, { "epoch": 37.0, "eval_loss": 0.3367176651954651, "eval_runtime": 0.6219, "eval_samples_per_second": 477.599, "eval_steps_per_second": 1.608, "step": 74 }, { "epoch": 38.0, "grad_norm": 0.38467201590538025, "learning_rate": 0.0002989602685432388, "loss": 0.3175, "step": 76 }, { "epoch": 38.0, "eval_loss": 0.3385818302631378, "eval_runtime": 0.7477, "eval_samples_per_second": 397.199, "eval_steps_per_second": 1.337, "step": 76 }, { "epoch": 39.0, "grad_norm": 0.5086773633956909, "learning_rate": 0.00029890414535183583, "loss": 0.3188, "step": 78 }, { "epoch": 39.0, "eval_loss": 0.33492594957351685, "eval_runtime": 0.7572, "eval_samples_per_second": 392.254, "eval_steps_per_second": 1.321, "step": 78 }, { "epoch": 40.0, "grad_norm": 0.4003610610961914, "learning_rate": 0.00029884655253663344, "loss": 0.3197, "step": 80 }, { "epoch": 40.0, "eval_loss": 0.33226659893989563, "eval_runtime": 0.738, "eval_samples_per_second": 402.452, "eval_steps_per_second": 1.355, "step": 80 }, { "epoch": 41.0, "grad_norm": 0.4991908371448517, "learning_rate": 0.00029878749066604925, "loss": 0.313, "step": 82 }, { "epoch": 41.0, "eval_loss": 0.331875741481781, "eval_runtime": 0.7609, "eval_samples_per_second": 390.343, "eval_steps_per_second": 1.314, "step": 82 }, { "epoch": 42.0, "grad_norm": 0.450600802898407, "learning_rate": 0.0002987269603230001, "loss": 0.3142, "step": 84 }, { "epoch": 42.0, "eval_loss": 0.33316686749458313, "eval_runtime": 0.6259, "eval_samples_per_second": 474.552, "eval_steps_per_second": 1.598, "step": 84 }, { "epoch": 43.0, "grad_norm": 0.7004669308662415, "learning_rate": 0.00029866496210489605, "loss": 0.3112, "step": 86 }, { "epoch": 43.0, "eval_loss": 0.3317732810974121, "eval_runtime": 0.7352, "eval_samples_per_second": 403.996, "eval_steps_per_second": 1.36, "step": 86 }, { "epoch": 44.0, "grad_norm": 0.4675546884536743, "learning_rate": 0.0002986014966236345, "loss": 0.3118, "step": 88 }, { "epoch": 44.0, "eval_loss": 0.32955238223075867, "eval_runtime": 0.8122, "eval_samples_per_second": 365.683, "eval_steps_per_second": 1.231, "step": 88 }, { "epoch": 45.0, "grad_norm": 0.5740877389907837, "learning_rate": 0.00029853656450559414, "loss": 0.3083, "step": 90 }, { "epoch": 45.0, "eval_loss": 0.32967710494995117, "eval_runtime": 0.7609, "eval_samples_per_second": 390.346, "eval_steps_per_second": 1.314, "step": 90 }, { "epoch": 46.0, "grad_norm": 0.5421927571296692, "learning_rate": 0.00029847016639162867, "loss": 0.3125, "step": 92 }, { "epoch": 46.0, "eval_loss": 0.33168312907218933, "eval_runtime": 0.6488, "eval_samples_per_second": 457.788, "eval_steps_per_second": 1.541, "step": 92 }, { "epoch": 47.0, "grad_norm": 0.6857365369796753, "learning_rate": 0.0002984023029370609, "loss": 0.3035, "step": 94 }, { "epoch": 47.0, "eval_loss": 0.3298337161540985, "eval_runtime": 0.6488, "eval_samples_per_second": 457.751, "eval_steps_per_second": 1.541, "step": 94 }, { "epoch": 48.0, "grad_norm": 0.8388419151306152, "learning_rate": 0.0002983329748116755, "loss": 0.3031, "step": 96 }, { "epoch": 48.0, "eval_loss": 0.32884910702705383, "eval_runtime": 0.6776, "eval_samples_per_second": 438.283, "eval_steps_per_second": 1.476, "step": 96 }, { "epoch": 49.0, "grad_norm": 0.46255478262901306, "learning_rate": 0.00029826218269971314, "loss": 0.305, "step": 98 }, { "epoch": 49.0, "eval_loss": 0.3276749551296234, "eval_runtime": 0.733, "eval_samples_per_second": 405.188, "eval_steps_per_second": 1.364, "step": 98 }, { "epoch": 50.0, "grad_norm": 0.44940125942230225, "learning_rate": 0.0002981899272998634, "loss": 0.302, "step": 100 }, { "epoch": 50.0, "eval_loss": 0.3282870352268219, "eval_runtime": 0.7162, "eval_samples_per_second": 414.71, "eval_steps_per_second": 1.396, "step": 100 }, { "epoch": 51.0, "grad_norm": 0.5168047547340393, "learning_rate": 0.0002981162093252579, "loss": 0.2989, "step": 102 }, { "epoch": 51.0, "eval_loss": 0.33124735951423645, "eval_runtime": 0.629, "eval_samples_per_second": 472.143, "eval_steps_per_second": 1.59, "step": 102 }, { "epoch": 52.0, "grad_norm": 0.5312138795852661, "learning_rate": 0.00029804102950346334, "loss": 0.2957, "step": 104 }, { "epoch": 52.0, "eval_loss": 0.3266555964946747, "eval_runtime": 0.6431, "eval_samples_per_second": 461.793, "eval_steps_per_second": 1.555, "step": 104 }, { "epoch": 53.0, "grad_norm": 0.5349692702293396, "learning_rate": 0.0002979643885764741, "loss": 0.2975, "step": 106 }, { "epoch": 53.0, "eval_loss": 0.3233616352081299, "eval_runtime": 0.8057, "eval_samples_per_second": 368.612, "eval_steps_per_second": 1.241, "step": 106 }, { "epoch": 54.0, "grad_norm": 0.36133870482444763, "learning_rate": 0.00029788628730070533, "loss": 0.2957, "step": 108 }, { "epoch": 54.0, "eval_loss": 0.323390930891037, "eval_runtime": 0.6815, "eval_samples_per_second": 435.829, "eval_steps_per_second": 1.467, "step": 108 }, { "epoch": 55.0, "grad_norm": 0.4435558021068573, "learning_rate": 0.00029780672644698494, "loss": 0.2956, "step": 110 }, { "epoch": 55.0, "eval_loss": 0.3230491578578949, "eval_runtime": 0.6254, "eval_samples_per_second": 474.93, "eval_steps_per_second": 1.599, "step": 110 }, { "epoch": 56.0, "grad_norm": 0.4963320791721344, "learning_rate": 0.0002977257068005465, "loss": 0.2927, "step": 112 }, { "epoch": 56.0, "eval_loss": 0.321744829416275, "eval_runtime": 0.7031, "eval_samples_per_second": 422.43, "eval_steps_per_second": 1.422, "step": 112 }, { "epoch": 57.0, "grad_norm": 0.47572246193885803, "learning_rate": 0.0002976432291610213, "loss": 0.2945, "step": 114 }, { "epoch": 57.0, "eval_loss": 0.32216939330101013, "eval_runtime": 0.77, "eval_samples_per_second": 385.727, "eval_steps_per_second": 1.299, "step": 114 }, { "epoch": 58.0, "grad_norm": 0.4532373547554016, "learning_rate": 0.00029755929434243023, "loss": 0.2912, "step": 116 }, { "epoch": 58.0, "eval_loss": 0.32353639602661133, "eval_runtime": 0.7486, "eval_samples_per_second": 396.744, "eval_steps_per_second": 1.336, "step": 116 }, { "epoch": 59.0, "grad_norm": 0.37667471170425415, "learning_rate": 0.00029747390317317603, "loss": 0.2892, "step": 118 }, { "epoch": 59.0, "eval_loss": 0.3225230276584625, "eval_runtime": 0.6999, "eval_samples_per_second": 424.335, "eval_steps_per_second": 1.429, "step": 118 }, { "epoch": 60.0, "grad_norm": 0.48530539870262146, "learning_rate": 0.0002973870564960352, "loss": 0.2874, "step": 120 }, { "epoch": 60.0, "eval_loss": 0.32104259729385376, "eval_runtime": 0.6522, "eval_samples_per_second": 455.417, "eval_steps_per_second": 1.533, "step": 120 }, { "epoch": 61.0, "grad_norm": 0.46407485008239746, "learning_rate": 0.00029729875516814935, "loss": 0.2836, "step": 122 }, { "epoch": 61.0, "eval_loss": 0.32013222575187683, "eval_runtime": 0.8233, "eval_samples_per_second": 360.757, "eval_steps_per_second": 1.215, "step": 122 }, { "epoch": 62.0, "grad_norm": 0.6122850179672241, "learning_rate": 0.0002972090000610169, "loss": 0.2894, "step": 124 }, { "epoch": 62.0, "eval_loss": 0.3198564350605011, "eval_runtime": 0.6743, "eval_samples_per_second": 440.478, "eval_steps_per_second": 1.483, "step": 124 }, { "epoch": 63.0, "grad_norm": 0.5037839412689209, "learning_rate": 0.0002971177920604845, "loss": 0.2891, "step": 126 }, { "epoch": 63.0, "eval_loss": 0.31861403584480286, "eval_runtime": 0.6524, "eval_samples_per_second": 455.235, "eval_steps_per_second": 1.533, "step": 126 }, { "epoch": 64.0, "grad_norm": 0.4292280972003937, "learning_rate": 0.00029702513206673827, "loss": 0.2895, "step": 128 }, { "epoch": 64.0, "eval_loss": 0.32153990864753723, "eval_runtime": 0.6418, "eval_samples_per_second": 462.761, "eval_steps_per_second": 1.558, "step": 128 }, { "epoch": 65.0, "grad_norm": 0.4504665732383728, "learning_rate": 0.000296931020994295, "loss": 0.2829, "step": 130 }, { "epoch": 65.0, "eval_loss": 0.3232710063457489, "eval_runtime": 0.7745, "eval_samples_per_second": 383.487, "eval_steps_per_second": 1.291, "step": 130 }, { "epoch": 66.0, "grad_norm": 0.6219745874404907, "learning_rate": 0.000296835459771993, "loss": 0.2859, "step": 132 }, { "epoch": 66.0, "eval_loss": 0.3181236982345581, "eval_runtime": 0.7429, "eval_samples_per_second": 399.762, "eval_steps_per_second": 1.346, "step": 132 }, { "epoch": 67.0, "grad_norm": 0.37324589490890503, "learning_rate": 0.0002967384493429829, "loss": 0.2798, "step": 134 }, { "epoch": 67.0, "eval_loss": 0.31664231419563293, "eval_runtime": 0.6029, "eval_samples_per_second": 492.583, "eval_steps_per_second": 1.659, "step": 134 }, { "epoch": 68.0, "grad_norm": 0.48705121874809265, "learning_rate": 0.0002966399906647185, "loss": 0.281, "step": 136 }, { "epoch": 68.0, "eval_loss": 0.3166642189025879, "eval_runtime": 0.7131, "eval_samples_per_second": 416.468, "eval_steps_per_second": 1.402, "step": 136 }, { "epoch": 69.0, "grad_norm": 0.4066949188709259, "learning_rate": 0.0002965400847089472, "loss": 0.2822, "step": 138 }, { "epoch": 69.0, "eval_loss": 0.3151547312736511, "eval_runtime": 0.7062, "eval_samples_per_second": 420.548, "eval_steps_per_second": 1.416, "step": 138 }, { "epoch": 70.0, "grad_norm": 0.3984168469905853, "learning_rate": 0.00029643873246170045, "loss": 0.2801, "step": 140 }, { "epoch": 70.0, "eval_loss": 0.314954549074173, "eval_runtime": 0.6916, "eval_samples_per_second": 429.454, "eval_steps_per_second": 1.446, "step": 140 }, { "epoch": 71.0, "grad_norm": 0.4139808118343353, "learning_rate": 0.000296335934923284, "loss": 0.2815, "step": 142 }, { "epoch": 71.0, "eval_loss": 0.31498512625694275, "eval_runtime": 0.6363, "eval_samples_per_second": 466.781, "eval_steps_per_second": 1.572, "step": 142 }, { "epoch": 72.0, "grad_norm": 0.4868161678314209, "learning_rate": 0.0002962316931082681, "loss": 0.2781, "step": 144 }, { "epoch": 72.0, "eval_loss": 0.31632447242736816, "eval_runtime": 0.7076, "eval_samples_per_second": 419.738, "eval_steps_per_second": 1.413, "step": 144 }, { "epoch": 73.0, "grad_norm": 0.41822776198387146, "learning_rate": 0.0002961260080454773, "loss": 0.2771, "step": 146 }, { "epoch": 73.0, "eval_loss": 0.31577062606811523, "eval_runtime": 0.6483, "eval_samples_per_second": 458.136, "eval_steps_per_second": 1.543, "step": 146 }, { "epoch": 74.0, "grad_norm": 0.5603657960891724, "learning_rate": 0.0002960188807779805, "loss": 0.2732, "step": 148 }, { "epoch": 74.0, "eval_loss": 0.31435203552246094, "eval_runtime": 0.691, "eval_samples_per_second": 429.801, "eval_steps_per_second": 1.447, "step": 148 }, { "epoch": 75.0, "grad_norm": 0.5573254227638245, "learning_rate": 0.0002959103123630807, "loss": 0.2766, "step": 150 }, { "epoch": 75.0, "eval_loss": 0.3145710527896881, "eval_runtime": 0.5527, "eval_samples_per_second": 537.332, "eval_steps_per_second": 1.809, "step": 150 }, { "epoch": 76.0, "grad_norm": 0.4642890393733978, "learning_rate": 0.0002958003038723042, "loss": 0.2798, "step": 152 }, { "epoch": 76.0, "eval_loss": 0.3158363401889801, "eval_runtime": 0.6733, "eval_samples_per_second": 441.12, "eval_steps_per_second": 1.485, "step": 152 }, { "epoch": 77.0, "grad_norm": 0.4938775300979614, "learning_rate": 0.00029568885639139053, "loss": 0.2753, "step": 154 }, { "epoch": 77.0, "eval_loss": 0.31477540731430054, "eval_runtime": 0.7417, "eval_samples_per_second": 400.422, "eval_steps_per_second": 1.348, "step": 154 }, { "epoch": 78.0, "grad_norm": 0.39144381880760193, "learning_rate": 0.00029557597102028123, "loss": 0.2732, "step": 156 }, { "epoch": 78.0, "eval_loss": 0.31341299414634705, "eval_runtime": 0.7854, "eval_samples_per_second": 378.137, "eval_steps_per_second": 1.273, "step": 156 }, { "epoch": 79.0, "grad_norm": 0.4929453432559967, "learning_rate": 0.00029546164887310933, "loss": 0.2711, "step": 158 }, { "epoch": 79.0, "eval_loss": 0.31385812163352966, "eval_runtime": 0.6657, "eval_samples_per_second": 446.114, "eval_steps_per_second": 1.502, "step": 158 }, { "epoch": 80.0, "grad_norm": 0.40074649453163147, "learning_rate": 0.0002953458910781883, "loss": 0.2677, "step": 160 }, { "epoch": 80.0, "eval_loss": 0.3135402500629425, "eval_runtime": 0.6596, "eval_samples_per_second": 450.303, "eval_steps_per_second": 1.516, "step": 160 }, { "epoch": 81.0, "grad_norm": 0.41622215509414673, "learning_rate": 0.0002952286987780008, "loss": 0.2668, "step": 162 }, { "epoch": 81.0, "eval_loss": 0.31341665983200073, "eval_runtime": 0.5644, "eval_samples_per_second": 526.236, "eval_steps_per_second": 1.772, "step": 162 }, { "epoch": 82.0, "grad_norm": 0.5020734667778015, "learning_rate": 0.0002951100731291876, "loss": 0.2734, "step": 164 }, { "epoch": 82.0, "eval_loss": 0.3154440224170685, "eval_runtime": 0.7427, "eval_samples_per_second": 399.916, "eval_steps_per_second": 1.347, "step": 164 }, { "epoch": 83.0, "grad_norm": 0.7120861411094666, "learning_rate": 0.0002949900153025359, "loss": 0.2712, "step": 166 }, { "epoch": 83.0, "eval_loss": 0.3155328929424286, "eval_runtime": 0.734, "eval_samples_per_second": 404.611, "eval_steps_per_second": 1.362, "step": 166 }, { "epoch": 84.0, "grad_norm": 0.5501344799995422, "learning_rate": 0.00029486852648296806, "loss": 0.2705, "step": 168 }, { "epoch": 84.0, "eval_loss": 0.31369757652282715, "eval_runtime": 0.6073, "eval_samples_per_second": 489.085, "eval_steps_per_second": 1.647, "step": 168 }, { "epoch": 85.0, "grad_norm": 0.3931790292263031, "learning_rate": 0.00029474560786952957, "loss": 0.273, "step": 170 }, { "epoch": 85.0, "eval_loss": 0.3132490813732147, "eval_runtime": 0.6683, "eval_samples_per_second": 444.398, "eval_steps_per_second": 1.496, "step": 170 }, { "epoch": 86.0, "grad_norm": 0.4931921064853668, "learning_rate": 0.00029462126067537756, "loss": 0.2713, "step": 172 }, { "epoch": 86.0, "eval_loss": 0.3119468688964844, "eval_runtime": 0.5093, "eval_samples_per_second": 583.176, "eval_steps_per_second": 1.964, "step": 172 }, { "epoch": 87.0, "grad_norm": 0.5810508131980896, "learning_rate": 0.00029449548612776866, "loss": 0.2665, "step": 174 }, { "epoch": 87.0, "eval_loss": 0.31254178285598755, "eval_runtime": 0.7282, "eval_samples_per_second": 407.876, "eval_steps_per_second": 1.373, "step": 174 }, { "epoch": 88.0, "grad_norm": 0.6227275729179382, "learning_rate": 0.00029436828546804686, "loss": 0.2719, "step": 176 }, { "epoch": 88.0, "eval_loss": 0.3148057162761688, "eval_runtime": 0.7497, "eval_samples_per_second": 396.169, "eval_steps_per_second": 1.334, "step": 176 }, { "epoch": 89.0, "grad_norm": 0.6895466446876526, "learning_rate": 0.0002942396599516314, "loss": 0.2666, "step": 178 }, { "epoch": 89.0, "eval_loss": 0.31230518221855164, "eval_runtime": 0.6871, "eval_samples_per_second": 432.239, "eval_steps_per_second": 1.455, "step": 178 }, { "epoch": 90.0, "grad_norm": 0.6018730401992798, "learning_rate": 0.0002941096108480041, "loss": 0.2641, "step": 180 }, { "epoch": 90.0, "eval_loss": 0.3139157295227051, "eval_runtime": 0.6814, "eval_samples_per_second": 435.86, "eval_steps_per_second": 1.468, "step": 180 }, { "epoch": 91.0, "grad_norm": 0.5773816704750061, "learning_rate": 0.0002939781394406971, "loss": 0.2679, "step": 182 }, { "epoch": 91.0, "eval_loss": 0.314519464969635, "eval_runtime": 0.6242, "eval_samples_per_second": 475.778, "eval_steps_per_second": 1.602, "step": 182 }, { "epoch": 92.0, "grad_norm": 0.430232971906662, "learning_rate": 0.00029384524702728013, "loss": 0.2636, "step": 184 }, { "epoch": 92.0, "eval_loss": 0.3126896917819977, "eval_runtime": 0.686, "eval_samples_per_second": 432.948, "eval_steps_per_second": 1.458, "step": 184 }, { "epoch": 93.0, "grad_norm": 0.5966992974281311, "learning_rate": 0.0002937109349193477, "loss": 0.2662, "step": 186 }, { "epoch": 93.0, "eval_loss": 0.3127867877483368, "eval_runtime": 0.704, "eval_samples_per_second": 421.861, "eval_steps_per_second": 1.42, "step": 186 }, { "epoch": 94.0, "grad_norm": 0.5436354875564575, "learning_rate": 0.000293575204442506, "loss": 0.2625, "step": 188 }, { "epoch": 94.0, "eval_loss": 0.3121253550052643, "eval_runtime": 0.6839, "eval_samples_per_second": 434.267, "eval_steps_per_second": 1.462, "step": 188 }, { "epoch": 95.0, "grad_norm": 0.6525291800498962, "learning_rate": 0.00029343805693636017, "loss": 0.2643, "step": 190 }, { "epoch": 95.0, "eval_loss": 0.30872872471809387, "eval_runtime": 0.7643, "eval_samples_per_second": 388.597, "eval_steps_per_second": 1.308, "step": 190 }, { "epoch": 96.0, "grad_norm": 0.41923660039901733, "learning_rate": 0.0002932994937545007, "loss": 0.2605, "step": 192 }, { "epoch": 96.0, "eval_loss": 0.30773305892944336, "eval_runtime": 0.621, "eval_samples_per_second": 478.226, "eval_steps_per_second": 1.61, "step": 192 }, { "epoch": 97.0, "grad_norm": 0.49382027983665466, "learning_rate": 0.0002931595162644901, "loss": 0.2613, "step": 194 }, { "epoch": 97.0, "eval_loss": 0.3081483542919159, "eval_runtime": 0.5459, "eval_samples_per_second": 544.096, "eval_steps_per_second": 1.832, "step": 194 }, { "epoch": 98.0, "grad_norm": 0.3870932459831238, "learning_rate": 0.0002930181258478499, "loss": 0.2655, "step": 196 }, { "epoch": 98.0, "eval_loss": 0.3087702691555023, "eval_runtime": 0.7521, "eval_samples_per_second": 394.875, "eval_steps_per_second": 1.33, "step": 196 }, { "epoch": 99.0, "grad_norm": 0.3989209532737732, "learning_rate": 0.00029287532390004633, "loss": 0.2605, "step": 198 }, { "epoch": 99.0, "eval_loss": 0.30880430340766907, "eval_runtime": 0.6471, "eval_samples_per_second": 458.95, "eval_steps_per_second": 1.545, "step": 198 }, { "epoch": 100.0, "grad_norm": 0.6288752555847168, "learning_rate": 0.00029273111183047697, "loss": 0.2624, "step": 200 }, { "epoch": 100.0, "eval_loss": 0.30914193391799927, "eval_runtime": 0.7927, "eval_samples_per_second": 374.659, "eval_steps_per_second": 1.261, "step": 200 }, { "epoch": 101.0, "grad_norm": 0.7080681324005127, "learning_rate": 0.0002925854910624568, "loss": 0.2532, "step": 202 }, { "epoch": 101.0, "eval_loss": 0.3055001199245453, "eval_runtime": 0.6808, "eval_samples_per_second": 436.251, "eval_steps_per_second": 1.469, "step": 202 }, { "epoch": 102.0, "grad_norm": 0.6294286251068115, "learning_rate": 0.00029243846303320386, "loss": 0.2603, "step": 204 }, { "epoch": 102.0, "eval_loss": 0.3056044280529022, "eval_runtime": 0.7558, "eval_samples_per_second": 392.961, "eval_steps_per_second": 1.323, "step": 204 }, { "epoch": 103.0, "grad_norm": 0.6036813259124756, "learning_rate": 0.0002922900291938255, "loss": 0.2576, "step": 206 }, { "epoch": 103.0, "eval_loss": 0.30621686577796936, "eval_runtime": 0.7737, "eval_samples_per_second": 383.867, "eval_steps_per_second": 1.292, "step": 206 }, { "epoch": 104.0, "grad_norm": 0.44163164496421814, "learning_rate": 0.00029214019100930384, "loss": 0.2532, "step": 208 }, { "epoch": 104.0, "eval_loss": 0.305007666349411, "eval_runtime": 0.7019, "eval_samples_per_second": 423.12, "eval_steps_per_second": 1.425, "step": 208 } ], "logging_steps": 500, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1000, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 1e-05 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1925114350927872.0, "train_batch_size": 512, "trial_name": null, "trial_params": null }