{ "best_global_step": 348, "best_metric": 0.5661588907241821, "best_model_checkpoint": "/workspace/scripts/soutputs/8ca8a9ea-9ae3-4938-9713-015819984d61_0/checkpoint-348", "epoch": 0.997134670487106, "eval_steps": 500, "global_step": 348, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014326647564469915, "grad_norm": 1.8568024635314941, "learning_rate": 2.3557306798236462e-05, "loss": 0.9115, "step": 5 }, { "epoch": 0.02865329512893983, "grad_norm": 0.9094077944755554, "learning_rate": 5.300394029603203e-05, "loss": 0.7201, "step": 10 }, { "epoch": 0.04297994269340974, "grad_norm": 0.7323216199874878, "learning_rate": 8.245057379382762e-05, "loss": 0.6833, "step": 15 }, { "epoch": 0.05730659025787966, "grad_norm": 0.5553655624389648, "learning_rate": 0.00011189720729162319, "loss": 0.6641, "step": 20 }, { "epoch": 0.07163323782234957, "grad_norm": 0.5094953179359436, "learning_rate": 0.00014134384078941877, "loss": 0.6485, "step": 25 }, { "epoch": 0.08595988538681948, "grad_norm": 0.4796569347381592, "learning_rate": 0.00017079047428721436, "loss": 0.6399, "step": 30 }, { "epoch": 0.10028653295128939, "grad_norm": 0.49097880721092224, "learning_rate": 0.00020023710778500992, "loss": 0.6322, "step": 35 }, { "epoch": 0.11461318051575932, "grad_norm": 0.4515115022659302, "learning_rate": 0.00020611968072263296, "loss": 0.6384, "step": 40 }, { "epoch": 0.12893982808022922, "grad_norm": 0.4155406057834625, "learning_rate": 0.0002060922453495023, "loss": 0.6088, "step": 45 }, { "epoch": 0.14326647564469913, "grad_norm": 0.4288005232810974, "learning_rate": 0.00020604371285965804, "loss": 0.6074, "step": 50 }, { "epoch": 0.15759312320916904, "grad_norm": 0.41716331243515015, "learning_rate": 0.000205974094945471, "loss": 0.604, "step": 55 }, { "epoch": 0.17191977077363896, "grad_norm": 0.412203311920166, "learning_rate": 0.00020588340837917924, "loss": 0.6138, "step": 60 }, { "epoch": 0.18624641833810887, "grad_norm": 0.3845170736312866, "learning_rate": 0.00020577167500884718, "loss": 0.6036, "step": 65 }, { "epoch": 0.20057306590257878, "grad_norm": 0.4012996256351471, "learning_rate": 0.00020563892175310208, "loss": 0.6313, "step": 70 }, { "epoch": 0.2148997134670487, "grad_norm": 0.4158208966255188, "learning_rate": 0.0002054851805946488, "loss": 0.6115, "step": 75 }, { "epoch": 0.22922636103151864, "grad_norm": 0.409179151058197, "learning_rate": 0.00020531048857256465, "loss": 0.6039, "step": 80 }, { "epoch": 0.24355300859598855, "grad_norm": 0.3829016387462616, "learning_rate": 0.00020511488777337586, "loss": 0.6224, "step": 85 }, { "epoch": 0.25787965616045844, "grad_norm": 0.3995232582092285, "learning_rate": 0.00020489842532091834, "loss": 0.616, "step": 90 }, { "epoch": 0.2722063037249284, "grad_norm": 0.3849766254425049, "learning_rate": 0.00020466115336498453, "loss": 0.6012, "step": 95 }, { "epoch": 0.28653295128939826, "grad_norm": 0.3875938951969147, "learning_rate": 0.00020440312906875961, "loss": 0.5982, "step": 100 }, { "epoch": 0.3008595988538682, "grad_norm": 0.4086308777332306, "learning_rate": 0.0002041244145950498, "loss": 0.6051, "step": 105 }, { "epoch": 0.3151862464183381, "grad_norm": 0.40100717544555664, "learning_rate": 0.00020382507709130636, "loss": 0.6121, "step": 110 }, { "epoch": 0.32951289398280803, "grad_norm": 0.42525944113731384, "learning_rate": 0.0002035051886734482, "loss": 0.6111, "step": 115 }, { "epoch": 0.3438395415472779, "grad_norm": 0.4180887043476105, "learning_rate": 0.00020316482640848823, "loss": 0.5977, "step": 120 }, { "epoch": 0.35816618911174786, "grad_norm": 0.38851866126060486, "learning_rate": 0.00020280407229596612, "loss": 0.6037, "step": 125 }, { "epoch": 0.37249283667621774, "grad_norm": 0.39135101437568665, "learning_rate": 0.0002024230132481934, "loss": 0.5931, "step": 130 }, { "epoch": 0.3868194842406877, "grad_norm": 0.3942822813987732, "learning_rate": 0.00020202174106931448, "loss": 0.572, "step": 135 }, { "epoch": 0.40114613180515757, "grad_norm": 0.38815903663635254, "learning_rate": 0.0002016003524331895, "loss": 0.6004, "step": 140 }, { "epoch": 0.4154727793696275, "grad_norm": 0.3702361285686493, "learning_rate": 0.00020115894886010366, "loss": 0.5913, "step": 145 }, { "epoch": 0.4297994269340974, "grad_norm": 0.37867555022239685, "learning_rate": 0.00020069763669230918, "loss": 0.5811, "step": 150 }, { "epoch": 0.44412607449856734, "grad_norm": 0.36923325061798096, "learning_rate": 0.00020021652706840554, "loss": 0.5953, "step": 155 }, { "epoch": 0.4584527220630373, "grad_norm": 0.3655109703540802, "learning_rate": 0.00019971573589656414, "loss": 0.5863, "step": 160 }, { "epoch": 0.47277936962750716, "grad_norm": 0.37171751260757446, "learning_rate": 0.00019919538382660374, "loss": 0.5939, "step": 165 }, { "epoch": 0.4871060171919771, "grad_norm": 0.37251758575439453, "learning_rate": 0.00019865559622092392, "loss": 0.5895, "step": 170 }, { "epoch": 0.501432664756447, "grad_norm": 0.3676713705062866, "learning_rate": 0.00019809650312430275, "loss": 0.5701, "step": 175 }, { "epoch": 0.5157593123209169, "grad_norm": 0.3767331540584564, "learning_rate": 0.0001975182392325668, "loss": 0.5874, "step": 180 }, { "epoch": 0.5300859598853869, "grad_norm": 0.3921726942062378, "learning_rate": 0.00019692094386014036, "loss": 0.5729, "step": 185 }, { "epoch": 0.5444126074498568, "grad_norm": 0.37424615025520325, "learning_rate": 0.00019630476090648182, "loss": 0.5826, "step": 190 }, { "epoch": 0.5587392550143266, "grad_norm": 0.37583500146865845, "learning_rate": 0.00019566983882141615, "loss": 0.5687, "step": 195 }, { "epoch": 0.5730659025787965, "grad_norm": 0.38613978028297424, "learning_rate": 0.00019501633056936998, "loss": 0.5918, "step": 200 }, { "epoch": 0.5873925501432665, "grad_norm": 0.3772313594818115, "learning_rate": 0.00019434439359252017, "loss": 0.5884, "step": 205 }, { "epoch": 0.6017191977077364, "grad_norm": 0.3849615752696991, "learning_rate": 0.00019365418977286276, "loss": 0.5598, "step": 210 }, { "epoch": 0.6160458452722063, "grad_norm": 0.3819197118282318, "learning_rate": 0.0001929458853932128, "loss": 0.5803, "step": 215 }, { "epoch": 0.6303724928366762, "grad_norm": 0.3772597908973694, "learning_rate": 0.00019221965109714363, "loss": 0.5858, "step": 220 }, { "epoch": 0.6446991404011462, "grad_norm": 0.37635743618011475, "learning_rate": 0.00019147566184787585, "loss": 0.5827, "step": 225 }, { "epoch": 0.6590257879656161, "grad_norm": 0.3801959455013275, "learning_rate": 0.00019071409688612524, "loss": 0.5683, "step": 230 }, { "epoch": 0.673352435530086, "grad_norm": 0.3805936872959137, "learning_rate": 0.00018993513968692063, "loss": 0.5771, "step": 235 }, { "epoch": 0.6876790830945558, "grad_norm": 0.3774188160896301, "learning_rate": 0.00018913897791540107, "loss": 0.5631, "step": 240 }, { "epoch": 0.7020057306590258, "grad_norm": 0.37649068236351013, "learning_rate": 0.00018832580338160425, "loss": 0.5669, "step": 245 }, { "epoch": 0.7163323782234957, "grad_norm": 0.38752481341362, "learning_rate": 0.00018749581199425556, "loss": 0.5743, "step": 250 }, { "epoch": 0.7306590257879656, "grad_norm": 0.38894984126091003, "learning_rate": 0.0001866492037135702, "loss": 0.579, "step": 255 }, { "epoch": 0.7449856733524355, "grad_norm": 0.36124807596206665, "learning_rate": 0.00018578618250307912, "loss": 0.5752, "step": 260 }, { "epoch": 0.7593123209169055, "grad_norm": 0.3647681474685669, "learning_rate": 0.00018490695628049046, "loss": 0.573, "step": 265 }, { "epoch": 0.7736389684813754, "grad_norm": 0.38366076350212097, "learning_rate": 0.0001840117368675982, "loss": 0.5677, "step": 270 }, { "epoch": 0.7879656160458453, "grad_norm": 0.36573946475982666, "learning_rate": 0.0001831007399392506, "loss": 0.5652, "step": 275 }, { "epoch": 0.8022922636103151, "grad_norm": 0.354879230260849, "learning_rate": 0.00018217418497139, "loss": 0.5648, "step": 280 }, { "epoch": 0.8166189111747851, "grad_norm": 0.36587202548980713, "learning_rate": 0.00018123229518817702, "loss": 0.566, "step": 285 }, { "epoch": 0.830945558739255, "grad_norm": 0.3714900314807892, "learning_rate": 0.0001802752975082119, "loss": 0.57, "step": 290 }, { "epoch": 0.8452722063037249, "grad_norm": 0.3702372610569, "learning_rate": 0.00017930342248986537, "loss": 0.5569, "step": 295 }, { "epoch": 0.8595988538681948, "grad_norm": 0.36726057529449463, "learning_rate": 0.00017831690427573326, "loss": 0.5491, "step": 300 }, { "epoch": 0.8739255014326648, "grad_norm": 0.36911457777023315, "learning_rate": 0.00017731598053622675, "loss": 0.5596, "step": 305 }, { "epoch": 0.8882521489971347, "grad_norm": 0.38149625062942505, "learning_rate": 0.00017630089241231375, "loss": 0.5736, "step": 310 }, { "epoch": 0.9025787965616046, "grad_norm": 0.3648279011249542, "learning_rate": 0.00017527188445742308, "loss": 0.5605, "step": 315 }, { "epoch": 0.9169054441260746, "grad_norm": 0.3748702108860016, "learning_rate": 0.00017422920457852738, "loss": 0.5802, "step": 320 }, { "epoch": 0.9312320916905444, "grad_norm": 0.3691900372505188, "learning_rate": 0.00017317310397641764, "loss": 0.5431, "step": 325 }, { "epoch": 0.9455587392550143, "grad_norm": 0.3742789328098297, "learning_rate": 0.0001721038370851842, "loss": 0.5647, "step": 330 }, { "epoch": 0.9598853868194842, "grad_norm": 0.36382776498794556, "learning_rate": 0.00017102166151091922, "loss": 0.5631, "step": 335 }, { "epoch": 0.9742120343839542, "grad_norm": 0.3645402193069458, "learning_rate": 0.00016992683796965424, "loss": 0.5671, "step": 340 }, { "epoch": 0.9885386819484241, "grad_norm": 0.3582296669483185, "learning_rate": 0.000168819630224549, "loss": 0.564, "step": 345 }, { "epoch": 0.997134670487106, "eval_loss": 0.5661588907241821, "eval_runtime": 3.0246, "eval_samples_per_second": 13.886, "eval_steps_per_second": 13.886, "step": 348 } ], "logging_steps": 5, "max_steps": 1047, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.990390930375967e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }