{
  "best_global_step": 348,
  "best_metric": 0.5661588907241821,
  "best_model_checkpoint": "/workspace/scripts/soutputs/8ca8a9ea-9ae3-4938-9713-015819984d61_0/checkpoint-348",
  "epoch": 0.997134670487106,
  "eval_steps": 500,
  "global_step": 348,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014326647564469915,
      "grad_norm": 1.8568024635314941,
      "learning_rate": 2.3557306798236462e-05,
      "loss": 0.9115,
      "step": 5
    },
    {
      "epoch": 0.02865329512893983,
      "grad_norm": 0.9094077944755554,
      "learning_rate": 5.300394029603203e-05,
      "loss": 0.7201,
      "step": 10
    },
    {
      "epoch": 0.04297994269340974,
      "grad_norm": 0.7323216199874878,
      "learning_rate": 8.245057379382762e-05,
      "loss": 0.6833,
      "step": 15
    },
    {
      "epoch": 0.05730659025787966,
      "grad_norm": 0.5553655624389648,
      "learning_rate": 0.00011189720729162319,
      "loss": 0.6641,
      "step": 20
    },
    {
      "epoch": 0.07163323782234957,
      "grad_norm": 0.5094953179359436,
      "learning_rate": 0.00014134384078941877,
      "loss": 0.6485,
      "step": 25
    },
    {
      "epoch": 0.08595988538681948,
      "grad_norm": 0.4796569347381592,
      "learning_rate": 0.00017079047428721436,
      "loss": 0.6399,
      "step": 30
    },
    {
      "epoch": 0.10028653295128939,
      "grad_norm": 0.49097880721092224,
      "learning_rate": 0.00020023710778500992,
      "loss": 0.6322,
      "step": 35
    },
    {
      "epoch": 0.11461318051575932,
      "grad_norm": 0.4515115022659302,
      "learning_rate": 0.00020611968072263296,
      "loss": 0.6384,
      "step": 40
    },
    {
      "epoch": 0.12893982808022922,
      "grad_norm": 0.4155406057834625,
      "learning_rate": 0.0002060922453495023,
      "loss": 0.6088,
      "step": 45
    },
    {
      "epoch": 0.14326647564469913,
      "grad_norm": 0.4288005232810974,
      "learning_rate": 0.00020604371285965804,
      "loss": 0.6074,
      "step": 50
    },
    {
      "epoch": 0.15759312320916904,
      "grad_norm": 0.41716331243515015,
      "learning_rate": 0.000205974094945471,
      "loss": 0.604,
      "step": 55
    },
    {
      "epoch": 0.17191977077363896,
      "grad_norm": 0.412203311920166,
      "learning_rate": 0.00020588340837917924,
      "loss": 0.6138,
      "step": 60
    },
    {
      "epoch": 0.18624641833810887,
      "grad_norm": 0.3845170736312866,
      "learning_rate": 0.00020577167500884718,
      "loss": 0.6036,
      "step": 65
    },
    {
      "epoch": 0.20057306590257878,
      "grad_norm": 0.4012996256351471,
      "learning_rate": 0.00020563892175310208,
      "loss": 0.6313,
      "step": 70
    },
    {
      "epoch": 0.2148997134670487,
      "grad_norm": 0.4158208966255188,
      "learning_rate": 0.0002054851805946488,
      "loss": 0.6115,
      "step": 75
    },
    {
      "epoch": 0.22922636103151864,
      "grad_norm": 0.409179151058197,
      "learning_rate": 0.00020531048857256465,
      "loss": 0.6039,
      "step": 80
    },
    {
      "epoch": 0.24355300859598855,
      "grad_norm": 0.3829016387462616,
      "learning_rate": 0.00020511488777337586,
      "loss": 0.6224,
      "step": 85
    },
    {
      "epoch": 0.25787965616045844,
      "grad_norm": 0.3995232582092285,
      "learning_rate": 0.00020489842532091834,
      "loss": 0.616,
      "step": 90
    },
    {
      "epoch": 0.2722063037249284,
      "grad_norm": 0.3849766254425049,
      "learning_rate": 0.00020466115336498453,
      "loss": 0.6012,
      "step": 95
    },
    {
      "epoch": 0.28653295128939826,
      "grad_norm": 0.3875938951969147,
      "learning_rate": 0.00020440312906875961,
      "loss": 0.5982,
      "step": 100
    },
    {
      "epoch": 0.3008595988538682,
      "grad_norm": 0.4086308777332306,
      "learning_rate": 0.0002041244145950498,
      "loss": 0.6051,
      "step": 105
    },
    {
      "epoch": 0.3151862464183381,
      "grad_norm": 0.40100717544555664,
      "learning_rate": 0.00020382507709130636,
      "loss": 0.6121,
      "step": 110
    },
    {
      "epoch": 0.32951289398280803,
      "grad_norm": 0.42525944113731384,
      "learning_rate": 0.0002035051886734482,
      "loss": 0.6111,
      "step": 115
    },
    {
      "epoch": 0.3438395415472779,
      "grad_norm": 0.4180887043476105,
      "learning_rate": 0.00020316482640848823,
      "loss": 0.5977,
      "step": 120
    },
    {
      "epoch": 0.35816618911174786,
      "grad_norm": 0.38851866126060486,
      "learning_rate": 0.00020280407229596612,
      "loss": 0.6037,
      "step": 125
    },
    {
      "epoch": 0.37249283667621774,
      "grad_norm": 0.39135101437568665,
      "learning_rate": 0.0002024230132481934,
      "loss": 0.5931,
      "step": 130
    },
    {
      "epoch": 0.3868194842406877,
      "grad_norm": 0.3942822813987732,
      "learning_rate": 0.00020202174106931448,
      "loss": 0.572,
      "step": 135
    },
    {
      "epoch": 0.40114613180515757,
      "grad_norm": 0.38815903663635254,
      "learning_rate": 0.0002016003524331895,
      "loss": 0.6004,
      "step": 140
    },
    {
      "epoch": 0.4154727793696275,
      "grad_norm": 0.3702361285686493,
      "learning_rate": 0.00020115894886010366,
      "loss": 0.5913,
      "step": 145
    },
    {
      "epoch": 0.4297994269340974,
      "grad_norm": 0.37867555022239685,
      "learning_rate": 0.00020069763669230918,
      "loss": 0.5811,
      "step": 150
    },
    {
      "epoch": 0.44412607449856734,
      "grad_norm": 0.36923325061798096,
      "learning_rate": 0.00020021652706840554,
      "loss": 0.5953,
      "step": 155
    },
    {
      "epoch": 0.4584527220630373,
      "grad_norm": 0.3655109703540802,
      "learning_rate": 0.00019971573589656414,
      "loss": 0.5863,
      "step": 160
    },
    {
      "epoch": 0.47277936962750716,
      "grad_norm": 0.37171751260757446,
      "learning_rate": 0.00019919538382660374,
      "loss": 0.5939,
      "step": 165
    },
    {
      "epoch": 0.4871060171919771,
      "grad_norm": 0.37251758575439453,
      "learning_rate": 0.00019865559622092392,
      "loss": 0.5895,
      "step": 170
    },
    {
      "epoch": 0.501432664756447,
      "grad_norm": 0.3676713705062866,
      "learning_rate": 0.00019809650312430275,
      "loss": 0.5701,
      "step": 175
    },
    {
      "epoch": 0.5157593123209169,
      "grad_norm": 0.3767331540584564,
      "learning_rate": 0.0001975182392325668,
      "loss": 0.5874,
      "step": 180
    },
    {
      "epoch": 0.5300859598853869,
      "grad_norm": 0.3921726942062378,
      "learning_rate": 0.00019692094386014036,
      "loss": 0.5729,
      "step": 185
    },
    {
      "epoch": 0.5444126074498568,
      "grad_norm": 0.37424615025520325,
      "learning_rate": 0.00019630476090648182,
      "loss": 0.5826,
      "step": 190
    },
    {
      "epoch": 0.5587392550143266,
      "grad_norm": 0.37583500146865845,
      "learning_rate": 0.00019566983882141615,
      "loss": 0.5687,
      "step": 195
    },
    {
      "epoch": 0.5730659025787965,
      "grad_norm": 0.38613978028297424,
      "learning_rate": 0.00019501633056936998,
      "loss": 0.5918,
      "step": 200
    },
    {
      "epoch": 0.5873925501432665,
      "grad_norm": 0.3772313594818115,
      "learning_rate": 0.00019434439359252017,
      "loss": 0.5884,
      "step": 205
    },
    {
      "epoch": 0.6017191977077364,
      "grad_norm": 0.3849615752696991,
      "learning_rate": 0.00019365418977286276,
      "loss": 0.5598,
      "step": 210
    },
    {
      "epoch": 0.6160458452722063,
      "grad_norm": 0.3819197118282318,
      "learning_rate": 0.0001929458853932128,
      "loss": 0.5803,
      "step": 215
    },
    {
      "epoch": 0.6303724928366762,
      "grad_norm": 0.3772597908973694,
      "learning_rate": 0.00019221965109714363,
      "loss": 0.5858,
      "step": 220
    },
    {
      "epoch": 0.6446991404011462,
      "grad_norm": 0.37635743618011475,
      "learning_rate": 0.00019147566184787585,
      "loss": 0.5827,
      "step": 225
    },
    {
      "epoch": 0.6590257879656161,
      "grad_norm": 0.3801959455013275,
      "learning_rate": 0.00019071409688612524,
      "loss": 0.5683,
      "step": 230
    },
    {
      "epoch": 0.673352435530086,
      "grad_norm": 0.3805936872959137,
      "learning_rate": 0.00018993513968692063,
      "loss": 0.5771,
      "step": 235
    },
    {
      "epoch": 0.6876790830945558,
      "grad_norm": 0.3774188160896301,
      "learning_rate": 0.00018913897791540107,
      "loss": 0.5631,
      "step": 240
    },
    {
      "epoch": 0.7020057306590258,
      "grad_norm": 0.37649068236351013,
      "learning_rate": 0.00018832580338160425,
      "loss": 0.5669,
      "step": 245
    },
    {
      "epoch": 0.7163323782234957,
      "grad_norm": 0.38752481341362,
      "learning_rate": 0.00018749581199425556,
      "loss": 0.5743,
      "step": 250
    },
    {
      "epoch": 0.7306590257879656,
      "grad_norm": 0.38894984126091003,
      "learning_rate": 0.0001866492037135702,
      "loss": 0.579,
      "step": 255
    },
    {
      "epoch": 0.7449856733524355,
      "grad_norm": 0.36124807596206665,
      "learning_rate": 0.00018578618250307912,
      "loss": 0.5752,
      "step": 260
    },
    {
      "epoch": 0.7593123209169055,
      "grad_norm": 0.3647681474685669,
      "learning_rate": 0.00018490695628049046,
      "loss": 0.573,
      "step": 265
    },
    {
      "epoch": 0.7736389684813754,
      "grad_norm": 0.38366076350212097,
      "learning_rate": 0.0001840117368675982,
      "loss": 0.5677,
      "step": 270
    },
    {
      "epoch": 0.7879656160458453,
      "grad_norm": 0.36573946475982666,
      "learning_rate": 0.0001831007399392506,
      "loss": 0.5652,
      "step": 275
    },
    {
      "epoch": 0.8022922636103151,
      "grad_norm": 0.354879230260849,
      "learning_rate": 0.00018217418497139,
      "loss": 0.5648,
      "step": 280
    },
    {
      "epoch": 0.8166189111747851,
      "grad_norm": 0.36587202548980713,
      "learning_rate": 0.00018123229518817702,
      "loss": 0.566,
      "step": 285
    },
    {
      "epoch": 0.830945558739255,
      "grad_norm": 0.3714900314807892,
      "learning_rate": 0.0001802752975082119,
      "loss": 0.57,
      "step": 290
    },
    {
      "epoch": 0.8452722063037249,
      "grad_norm": 0.3702372610569,
      "learning_rate": 0.00017930342248986537,
      "loss": 0.5569,
      "step": 295
    },
    {
      "epoch": 0.8595988538681948,
      "grad_norm": 0.36726057529449463,
      "learning_rate": 0.00017831690427573326,
      "loss": 0.5491,
      "step": 300
    },
    {
      "epoch": 0.8739255014326648,
      "grad_norm": 0.36911457777023315,
      "learning_rate": 0.00017731598053622675,
      "loss": 0.5596,
      "step": 305
    },
    {
      "epoch": 0.8882521489971347,
      "grad_norm": 0.38149625062942505,
      "learning_rate": 0.00017630089241231375,
      "loss": 0.5736,
      "step": 310
    },
    {
      "epoch": 0.9025787965616046,
      "grad_norm": 0.3648279011249542,
      "learning_rate": 0.00017527188445742308,
      "loss": 0.5605,
      "step": 315
    },
    {
      "epoch": 0.9169054441260746,
      "grad_norm": 0.3748702108860016,
      "learning_rate": 0.00017422920457852738,
      "loss": 0.5802,
      "step": 320
    },
    {
      "epoch": 0.9312320916905444,
      "grad_norm": 0.3691900372505188,
      "learning_rate": 0.00017317310397641764,
      "loss": 0.5431,
      "step": 325
    },
    {
      "epoch": 0.9455587392550143,
      "grad_norm": 0.3742789328098297,
      "learning_rate": 0.0001721038370851842,
      "loss": 0.5647,
      "step": 330
    },
    {
      "epoch": 0.9598853868194842,
      "grad_norm": 0.36382776498794556,
      "learning_rate": 0.00017102166151091922,
      "loss": 0.5631,
      "step": 335
    },
    {
      "epoch": 0.9742120343839542,
      "grad_norm": 0.3645402193069458,
      "learning_rate": 0.00016992683796965424,
      "loss": 0.5671,
      "step": 340
    },
    {
      "epoch": 0.9885386819484241,
      "grad_norm": 0.3582296669483185,
      "learning_rate": 0.000168819630224549,
      "loss": 0.564,
      "step": 345
    },
    {
      "epoch": 0.997134670487106,
      "eval_loss": 0.5661588907241821,
      "eval_runtime": 3.0246,
      "eval_samples_per_second": 13.886,
      "eval_steps_per_second": 13.886,
      "step": 348
    }
  ],
  "logging_steps": 5,
  "max_steps": 1047,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.990390930375967e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}