{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.012497188132670149, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.4994376265340298e-05, "grad_norm": 22.25, "learning_rate": 0.0008, "loss": 12.4316, "step": 1 }, { "epoch": 0.00012497188132670149, "grad_norm": 4.40625, "learning_rate": 0.0007999840035991902, "loss": 13.3525, "step": 5 }, { "epoch": 0.00024994376265340297, "grad_norm": 4.5, "learning_rate": 0.000799964008098178, "loss": 14.8491, "step": 10 }, { "epoch": 0.00037491564398010446, "grad_norm": 6.84375, "learning_rate": 0.0007999440125971656, "loss": 8.8491, "step": 15 }, { "epoch": 0.0004998875253068059, "grad_norm": 3.75, "learning_rate": 0.0007999240170961534, "loss": 7.9633, "step": 20 }, { "epoch": 0.0006248594066335074, "grad_norm": 4.75, "learning_rate": 0.0007999040215951412, "loss": 7.787, "step": 25 }, { "epoch": 0.0007498312879602089, "grad_norm": 15.3125, "learning_rate": 0.0007998840260941289, "loss": 7.6196, "step": 30 }, { "epoch": 0.0008748031692869104, "grad_norm": 3.03125, "learning_rate": 0.0007998640305931166, "loss": 7.453, "step": 35 }, { "epoch": 0.0009997750506136119, "grad_norm": 3.921875, "learning_rate": 0.0007998440350921043, "loss": 7.3384, "step": 40 }, { "epoch": 0.0011247469319403134, "grad_norm": 3.203125, "learning_rate": 0.000799824039591092, "loss": 7.2074, "step": 45 }, { "epoch": 0.0012497188132670149, "grad_norm": 3.28125, "learning_rate": 0.0007998040440900798, "loss": 7.1685, "step": 50 }, { "epoch": 0.0013746906945937163, "grad_norm": 3.578125, "learning_rate": 0.0007997840485890674, "loss": 7.2311, "step": 55 }, { "epoch": 0.0014996625759204178, "grad_norm": 7.28125, "learning_rate": 0.0007997640530880552, "loss": 7.1694, "step": 60 }, { "epoch": 0.0016246344572471193, "grad_norm": 3.0, "learning_rate": 0.0007997440575870429, "loss": 7.1179, "step": 65 }, { "epoch": 0.0017496063385738208, "grad_norm": 2.453125, "learning_rate": 0.0007997240620860307, "loss": 7.1143, "step": 70 }, { "epoch": 0.0018745782199005223, "grad_norm": 3.46875, "learning_rate": 0.0007997040665850185, "loss": 7.0387, "step": 75 }, { "epoch": 0.0019995501012272238, "grad_norm": 3.484375, "learning_rate": 0.0007996840710840061, "loss": 6.825, "step": 80 }, { "epoch": 0.0021245219825539252, "grad_norm": 2.859375, "learning_rate": 0.0007996640755829939, "loss": 6.7811, "step": 85 }, { "epoch": 0.0022494938638806267, "grad_norm": 2.640625, "learning_rate": 0.0007996440800819816, "loss": 6.821, "step": 90 }, { "epoch": 0.0023744657452073282, "grad_norm": 3.0, "learning_rate": 0.0007996240845809693, "loss": 6.7552, "step": 95 }, { "epoch": 0.0024994376265340297, "grad_norm": 3.953125, "learning_rate": 0.000799604089079957, "loss": 6.7139, "step": 100 }, { "epoch": 0.002624409507860731, "grad_norm": 2.296875, "learning_rate": 0.0007995840935789447, "loss": 6.7687, "step": 105 }, { "epoch": 0.0027493813891874327, "grad_norm": 2.65625, "learning_rate": 0.0007995640980779325, "loss": 6.6156, "step": 110 }, { "epoch": 0.002874353270514134, "grad_norm": 4.28125, "learning_rate": 0.0007995441025769203, "loss": 6.5183, "step": 115 }, { "epoch": 0.0029993251518408356, "grad_norm": 2.4375, "learning_rate": 0.000799524107075908, "loss": 6.5023, "step": 120 }, { "epoch": 0.003124297033167537, "grad_norm": 3.0, "learning_rate": 0.0007995041115748957, "loss": 6.3342, "step": 125 }, { "epoch": 0.0032492689144942386, "grad_norm": 3.296875, "learning_rate": 0.0007994841160738834, "loss": 6.416, "step": 130 }, { "epoch": 0.00337424079582094, "grad_norm": 3.90625, "learning_rate": 0.0007994641205728712, "loss": 6.3348, "step": 135 }, { "epoch": 0.0034992126771476416, "grad_norm": 2.578125, "learning_rate": 0.0007994441250718589, "loss": 6.3827, "step": 140 }, { "epoch": 0.003624184558474343, "grad_norm": 2.5, "learning_rate": 0.0007994241295708465, "loss": 6.4047, "step": 145 }, { "epoch": 0.0037491564398010446, "grad_norm": 2.78125, "learning_rate": 0.0007994041340698343, "loss": 6.1781, "step": 150 }, { "epoch": 0.003874128321127746, "grad_norm": 3.375, "learning_rate": 0.000799384138568822, "loss": 6.2532, "step": 155 }, { "epoch": 0.0039991002024544475, "grad_norm": 2.34375, "learning_rate": 0.0007993641430678099, "loss": 6.2372, "step": 160 }, { "epoch": 0.004124072083781149, "grad_norm": 2.796875, "learning_rate": 0.0007993441475667975, "loss": 6.0455, "step": 165 }, { "epoch": 0.0042490439651078505, "grad_norm": 2.6875, "learning_rate": 0.0007993241520657852, "loss": 6.1694, "step": 170 }, { "epoch": 0.004374015846434552, "grad_norm": 2.0625, "learning_rate": 0.000799304156564773, "loss": 6.045, "step": 175 }, { "epoch": 0.0044989877277612535, "grad_norm": 2.765625, "learning_rate": 0.0007992841610637607, "loss": 6.0104, "step": 180 }, { "epoch": 0.004623959609087955, "grad_norm": 1.796875, "learning_rate": 0.0007992641655627485, "loss": 5.9327, "step": 185 }, { "epoch": 0.0047489314904146564, "grad_norm": 2.765625, "learning_rate": 0.0007992441700617361, "loss": 5.9164, "step": 190 }, { "epoch": 0.004873903371741358, "grad_norm": 2.265625, "learning_rate": 0.0007992241745607238, "loss": 5.9775, "step": 195 }, { "epoch": 0.004998875253068059, "grad_norm": 2.421875, "learning_rate": 0.0007992041790597116, "loss": 5.6958, "step": 200 }, { "epoch": 0.005123847134394761, "grad_norm": 2.546875, "learning_rate": 0.0007991841835586994, "loss": 5.8286, "step": 205 }, { "epoch": 0.005248819015721462, "grad_norm": 2.359375, "learning_rate": 0.000799164188057687, "loss": 5.7053, "step": 210 }, { "epoch": 0.005373790897048164, "grad_norm": 2.28125, "learning_rate": 0.0007991441925566748, "loss": 5.725, "step": 215 }, { "epoch": 0.005498762778374865, "grad_norm": 1.875, "learning_rate": 0.0007991241970556625, "loss": 5.6576, "step": 220 }, { "epoch": 0.005623734659701567, "grad_norm": 2.3125, "learning_rate": 0.0007991042015546503, "loss": 5.7389, "step": 225 }, { "epoch": 0.005748706541028268, "grad_norm": 2.546875, "learning_rate": 0.000799084206053638, "loss": 5.4624, "step": 230 }, { "epoch": 0.00587367842235497, "grad_norm": 2.625, "learning_rate": 0.0007990642105526257, "loss": 5.6441, "step": 235 }, { "epoch": 0.005998650303681671, "grad_norm": 2.5, "learning_rate": 0.0007990442150516134, "loss": 5.552, "step": 240 }, { "epoch": 0.006123622185008373, "grad_norm": 1.640625, "learning_rate": 0.0007990242195506011, "loss": 5.3983, "step": 245 }, { "epoch": 0.006248594066335074, "grad_norm": 2.640625, "learning_rate": 0.000799004224049589, "loss": 5.3757, "step": 250 }, { "epoch": 0.006373565947661776, "grad_norm": 2.25, "learning_rate": 0.0007989842285485766, "loss": 5.4723, "step": 255 }, { "epoch": 0.006498537828988477, "grad_norm": 2.25, "learning_rate": 0.0007989642330475643, "loss": 5.301, "step": 260 }, { "epoch": 0.006623509710315179, "grad_norm": 1.90625, "learning_rate": 0.0007989442375465521, "loss": 5.4499, "step": 265 }, { "epoch": 0.00674848159164188, "grad_norm": 2.65625, "learning_rate": 0.0007989242420455398, "loss": 5.5597, "step": 270 }, { "epoch": 0.006873453472968582, "grad_norm": 2.25, "learning_rate": 0.0007989042465445275, "loss": 5.2123, "step": 275 }, { "epoch": 0.006998425354295283, "grad_norm": 1.71875, "learning_rate": 0.0007988842510435152, "loss": 5.3393, "step": 280 }, { "epoch": 0.007123397235621985, "grad_norm": 2.390625, "learning_rate": 0.000798864255542503, "loss": 5.2593, "step": 285 }, { "epoch": 0.007248369116948686, "grad_norm": 2.125, "learning_rate": 0.0007988442600414907, "loss": 5.2776, "step": 290 }, { "epoch": 0.007373340998275388, "grad_norm": 1.84375, "learning_rate": 0.0007988242645404785, "loss": 5.2529, "step": 295 }, { "epoch": 0.007498312879602089, "grad_norm": 2.3125, "learning_rate": 0.0007988042690394662, "loss": 5.0664, "step": 300 }, { "epoch": 0.007623284760928791, "grad_norm": 1.5234375, "learning_rate": 0.0007987842735384539, "loss": 5.3028, "step": 305 }, { "epoch": 0.007748256642255492, "grad_norm": 1.7578125, "learning_rate": 0.0007987642780374416, "loss": 5.2152, "step": 310 }, { "epoch": 0.007873228523582194, "grad_norm": 1.765625, "learning_rate": 0.0007987442825364294, "loss": 4.97, "step": 315 }, { "epoch": 0.007998200404908895, "grad_norm": 2.203125, "learning_rate": 0.000798724287035417, "loss": 5.1328, "step": 320 }, { "epoch": 0.008123172286235597, "grad_norm": 1.765625, "learning_rate": 0.0007987042915344048, "loss": 5.0475, "step": 325 }, { "epoch": 0.008248144167562298, "grad_norm": 1.78125, "learning_rate": 0.0007986842960333925, "loss": 5.009, "step": 330 }, { "epoch": 0.008373116048889, "grad_norm": 1.90625, "learning_rate": 0.0007986643005323803, "loss": 5.0827, "step": 335 }, { "epoch": 0.008498087930215701, "grad_norm": 1.609375, "learning_rate": 0.000798644305031368, "loss": 5.0896, "step": 340 }, { "epoch": 0.008623059811542403, "grad_norm": 1.6640625, "learning_rate": 0.0007986243095303557, "loss": 4.9185, "step": 345 }, { "epoch": 0.008748031692869104, "grad_norm": 2.25, "learning_rate": 0.0007986043140293435, "loss": 5.0279, "step": 350 }, { "epoch": 0.008873003574195806, "grad_norm": 2.140625, "learning_rate": 0.0007985843185283312, "loss": 4.8811, "step": 355 }, { "epoch": 0.008997975455522507, "grad_norm": 1.640625, "learning_rate": 0.0007985643230273188, "loss": 5.0377, "step": 360 }, { "epoch": 0.00912294733684921, "grad_norm": 2.078125, "learning_rate": 0.0007985443275263066, "loss": 4.9903, "step": 365 }, { "epoch": 0.00924791921817591, "grad_norm": 2.15625, "learning_rate": 0.0007985243320252943, "loss": 4.8109, "step": 370 }, { "epoch": 0.009372891099502612, "grad_norm": 1.859375, "learning_rate": 0.0007985043365242821, "loss": 4.8445, "step": 375 }, { "epoch": 0.009497862980829313, "grad_norm": 2.171875, "learning_rate": 0.0007984843410232698, "loss": 4.887, "step": 380 }, { "epoch": 0.009622834862156015, "grad_norm": 1.96875, "learning_rate": 0.0007984643455222575, "loss": 4.8373, "step": 385 }, { "epoch": 0.009747806743482716, "grad_norm": 1.7890625, "learning_rate": 0.0007984443500212453, "loss": 4.8948, "step": 390 }, { "epoch": 0.009872778624809418, "grad_norm": 2.09375, "learning_rate": 0.000798424354520233, "loss": 4.9098, "step": 395 }, { "epoch": 0.009997750506136119, "grad_norm": 1.703125, "learning_rate": 0.0007984043590192208, "loss": 4.6948, "step": 400 }, { "epoch": 0.010122722387462821, "grad_norm": 1.9453125, "learning_rate": 0.0007983843635182084, "loss": 4.7561, "step": 405 }, { "epoch": 0.010247694268789522, "grad_norm": 2.171875, "learning_rate": 0.0007983643680171961, "loss": 4.6634, "step": 410 }, { "epoch": 0.010372666150116224, "grad_norm": 2.140625, "learning_rate": 0.0007983443725161839, "loss": 4.6339, "step": 415 }, { "epoch": 0.010497638031442925, "grad_norm": 1.953125, "learning_rate": 0.0007983243770151716, "loss": 4.7292, "step": 420 }, { "epoch": 0.010622609912769627, "grad_norm": 1.4453125, "learning_rate": 0.0007983043815141594, "loss": 4.6211, "step": 425 }, { "epoch": 0.010747581794096328, "grad_norm": 1.6484375, "learning_rate": 0.0007982843860131471, "loss": 4.6857, "step": 430 }, { "epoch": 0.01087255367542303, "grad_norm": 1.8359375, "learning_rate": 0.0007982643905121348, "loss": 4.6335, "step": 435 }, { "epoch": 0.01099752555674973, "grad_norm": 1.6484375, "learning_rate": 0.0007982443950111226, "loss": 4.7285, "step": 440 }, { "epoch": 0.011122497438076433, "grad_norm": 1.609375, "learning_rate": 0.0007982243995101103, "loss": 4.7707, "step": 445 }, { "epoch": 0.011247469319403134, "grad_norm": 1.78125, "learning_rate": 0.000798204404009098, "loss": 4.6769, "step": 450 }, { "epoch": 0.011372441200729836, "grad_norm": 1.390625, "learning_rate": 0.0007981844085080857, "loss": 4.7311, "step": 455 }, { "epoch": 0.011497413082056537, "grad_norm": 1.84375, "learning_rate": 0.0007981644130070734, "loss": 4.6677, "step": 460 }, { "epoch": 0.011622384963383239, "grad_norm": 1.328125, "learning_rate": 0.0007981444175060612, "loss": 4.6862, "step": 465 }, { "epoch": 0.01174735684470994, "grad_norm": 2.40625, "learning_rate": 0.0007981244220050488, "loss": 4.5674, "step": 470 }, { "epoch": 0.011872328726036642, "grad_norm": 1.8671875, "learning_rate": 0.0007981044265040366, "loss": 4.5283, "step": 475 }, { "epoch": 0.011997300607363343, "grad_norm": 1.625, "learning_rate": 0.0007980844310030244, "loss": 4.5966, "step": 480 }, { "epoch": 0.012122272488690045, "grad_norm": 1.7421875, "learning_rate": 0.0007980644355020121, "loss": 4.4785, "step": 485 }, { "epoch": 0.012247244370016746, "grad_norm": 1.765625, "learning_rate": 0.0007980444400009999, "loss": 4.5966, "step": 490 }, { "epoch": 0.012372216251343448, "grad_norm": 1.8671875, "learning_rate": 0.0007980244444999875, "loss": 4.6428, "step": 495 }, { "epoch": 0.012497188132670149, "grad_norm": 1.609375, "learning_rate": 0.0007980044489989753, "loss": 4.5225, "step": 500 } ], "logging_steps": 5, "max_steps": 200045, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2266079346688e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }