{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 2304,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0390625,
      "grad_norm": 2.857225616936394,
      "learning_rate": 6.493506493506493e-07,
      "loss": 0.7578,
      "step": 30
    },
    {
      "epoch": 0.078125,
      "grad_norm": 1.3044758629614164,
      "learning_rate": 1.2987012987012986e-06,
      "loss": 0.5677,
      "step": 60
    },
    {
      "epoch": 0.1171875,
      "grad_norm": 1.1974358130130436,
      "learning_rate": 1.9480519480519483e-06,
      "loss": 0.5243,
      "step": 90
    },
    {
      "epoch": 0.15625,
      "grad_norm": 1.0778991302776368,
      "learning_rate": 2.597402597402597e-06,
      "loss": 0.5118,
      "step": 120
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 1.0101444029426376,
      "learning_rate": 3.246753246753247e-06,
      "loss": 0.5146,
      "step": 150
    },
    {
      "epoch": 0.234375,
      "grad_norm": 0.97813783772122,
      "learning_rate": 3.896103896103897e-06,
      "loss": 0.5121,
      "step": 180
    },
    {
      "epoch": 0.2734375,
      "grad_norm": 1.0146147755881423,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.5172,
      "step": 210
    },
    {
      "epoch": 0.3125,
      "grad_norm": 1.0002914434860946,
      "learning_rate": 4.999767464405452e-06,
      "loss": 0.5053,
      "step": 240
    },
    {
      "epoch": 0.3515625,
      "grad_norm": 0.9909790519322216,
      "learning_rate": 4.995634701567892e-06,
      "loss": 0.5197,
      "step": 270
    },
    {
      "epoch": 0.390625,
      "grad_norm": 0.9471114669156748,
      "learning_rate": 4.986344312601082e-06,
      "loss": 0.5087,
      "step": 300
    },
    {
      "epoch": 0.4296875,
      "grad_norm": 0.990508902551153,
      "learning_rate": 4.971915497571788e-06,
      "loss": 0.5126,
      "step": 330
    },
    {
      "epoch": 0.46875,
      "grad_norm": 0.9299422099207572,
      "learning_rate": 4.9523780759216764e-06,
      "loss": 0.5144,
      "step": 360
    },
    {
      "epoch": 0.5078125,
      "grad_norm": 0.8743256439010634,
      "learning_rate": 4.927772424840702e-06,
      "loss": 0.5083,
      "step": 390
    },
    {
      "epoch": 0.546875,
      "grad_norm": 0.9036662054182406,
      "learning_rate": 4.898149395821218e-06,
      "loss": 0.5117,
      "step": 420
    },
    {
      "epoch": 0.5859375,
      "grad_norm": 0.8810653800350299,
      "learning_rate": 4.863570209565277e-06,
      "loss": 0.5107,
      "step": 450
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.8846561337964834,
      "learning_rate": 4.824106329462313e-06,
      "loss": 0.5098,
      "step": 480
    },
    {
      "epoch": 0.6640625,
      "grad_norm": 0.8997382537130586,
      "learning_rate": 4.779839313898675e-06,
      "loss": 0.5152,
      "step": 510
    },
    {
      "epoch": 0.703125,
      "grad_norm": 0.939340943654687,
      "learning_rate": 4.730860647704252e-06,
      "loss": 0.5078,
      "step": 540
    },
    {
      "epoch": 0.7421875,
      "grad_norm": 0.845279140634368,
      "learning_rate": 4.677271553084515e-06,
      "loss": 0.5078,
      "step": 570
    },
    {
      "epoch": 0.78125,
      "grad_norm": 0.9487929366289761,
      "learning_rate": 4.6191827804287236e-06,
      "loss": 0.5073,
      "step": 600
    },
    {
      "epoch": 0.8203125,
      "grad_norm": 0.9162235285381133,
      "learning_rate": 4.556714379426634e-06,
      "loss": 0.5194,
      "step": 630
    },
    {
      "epoch": 0.859375,
      "grad_norm": 0.8817621527988113,
      "learning_rate": 4.489995450966714e-06,
      "loss": 0.4997,
      "step": 660
    },
    {
      "epoch": 0.8984375,
      "grad_norm": 0.8573037310302468,
      "learning_rate": 4.419163880328615e-06,
      "loss": 0.5008,
      "step": 690
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.9279340785042338,
      "learning_rate": 4.344366052221316e-06,
      "loss": 0.5037,
      "step": 720
    },
    {
      "epoch": 0.9765625,
      "grad_norm": 0.8075055611747111,
      "learning_rate": 4.265756548255823e-06,
      "loss": 0.4977,
      "step": 750
    },
    {
      "epoch": 1.015625,
      "grad_norm": 1.098595770652355,
      "learning_rate": 4.183497827477687e-06,
      "loss": 0.4782,
      "step": 780
    },
    {
      "epoch": 1.0546875,
      "grad_norm": 0.9988970588938918,
      "learning_rate": 4.097759890619539e-06,
      "loss": 0.439,
      "step": 810
    },
    {
      "epoch": 1.09375,
      "grad_norm": 0.9731524721989655,
      "learning_rate": 4.00871992876753e-06,
      "loss": 0.4441,
      "step": 840
    },
    {
      "epoch": 1.1328125,
      "grad_norm": 0.9099368305482215,
      "learning_rate": 3.916561957167765e-06,
      "loss": 0.4438,
      "step": 870
    },
    {
      "epoch": 1.171875,
      "grad_norm": 1.0659953860812488,
      "learning_rate": 3.82147643492952e-06,
      "loss": 0.44,
      "step": 900
    },
    {
      "epoch": 1.2109375,
      "grad_norm": 0.9779085212603401,
      "learning_rate": 3.723659871411196e-06,
      "loss": 0.4406,
      "step": 930
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.9719837374463801,
      "learning_rate": 3.623314420102467e-06,
      "loss": 0.4464,
      "step": 960
    },
    {
      "epoch": 1.2890625,
      "grad_norm": 1.0182265044301695,
      "learning_rate": 3.5206474608419385e-06,
      "loss": 0.4462,
      "step": 990
    },
    {
      "epoch": 1.328125,
      "grad_norm": 0.9265295707713885,
      "learning_rate": 3.415871171233709e-06,
      "loss": 0.4412,
      "step": 1020
    },
    {
      "epoch": 1.3671875,
      "grad_norm": 1.0015451544453786,
      "learning_rate": 3.3092020881486085e-06,
      "loss": 0.4395,
      "step": 1050
    },
    {
      "epoch": 1.40625,
      "grad_norm": 0.93431276827024,
      "learning_rate": 3.2008606602163023e-06,
      "loss": 0.4425,
      "step": 1080
    },
    {
      "epoch": 1.4453125,
      "grad_norm": 0.9255587183512949,
      "learning_rate": 3.091070792233124e-06,
      "loss": 0.439,
      "step": 1110
    },
    {
      "epoch": 1.484375,
      "grad_norm": 0.9343974096084879,
      "learning_rate": 2.9800593824272027e-06,
      "loss": 0.4354,
      "step": 1140
    },
    {
      "epoch": 1.5234375,
      "grad_norm": 1.042370945691791,
      "learning_rate": 2.8680558535371688e-06,
      "loss": 0.4404,
      "step": 1170
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.9126651351226419,
      "learning_rate": 2.7552916786735744e-06,
      "loss": 0.4431,
      "step": 1200
    },
    {
      "epoch": 1.6015625,
      "grad_norm": 0.9963009170045803,
      "learning_rate": 2.641999902942882e-06,
      "loss": 0.43,
      "step": 1230
    },
    {
      "epoch": 1.640625,
      "grad_norm": 0.9703924532720508,
      "learning_rate": 2.5284146618226807e-06,
      "loss": 0.449,
      "step": 1260
    },
    {
      "epoch": 1.6796875,
      "grad_norm": 0.8989408268444277,
      "learning_rate": 2.414770697283471e-06,
      "loss": 0.4387,
      "step": 1290
    },
    {
      "epoch": 1.71875,
      "grad_norm": 0.9709563268893221,
      "learning_rate": 2.3013028726570436e-06,
      "loss": 0.444,
      "step": 1320
    },
    {
      "epoch": 1.7578125,
      "grad_norm": 0.9557991402725722,
      "learning_rate": 2.188245687254035e-06,
      "loss": 0.4394,
      "step": 1350
    },
    {
      "epoch": 1.796875,
      "grad_norm": 0.9603778434937646,
      "learning_rate": 2.075832791733802e-06,
      "loss": 0.4473,
      "step": 1380
    },
    {
      "epoch": 1.8359375,
      "grad_norm": 0.9476806986189421,
      "learning_rate": 1.9642965052281618e-06,
      "loss": 0.4404,
      "step": 1410
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.9085522492818641,
      "learning_rate": 1.8538673352169467e-06,
      "loss": 0.4446,
      "step": 1440
    },
    {
      "epoch": 1.9140625,
      "grad_norm": 0.9480501458847437,
      "learning_rate": 1.744773501147627e-06,
      "loss": 0.4236,
      "step": 1470
    },
    {
      "epoch": 1.953125,
      "grad_norm": 0.9099438022581319,
      "learning_rate": 1.6372404627835182e-06,
      "loss": 0.4352,
      "step": 1500
    },
    {
      "epoch": 1.9921875,
      "grad_norm": 0.9130000879221961,
      "learning_rate": 1.5314904542553099e-06,
      "loss": 0.4344,
      "step": 1530
    },
    {
      "epoch": 2.03125,
      "grad_norm": 1.072785526043702,
      "learning_rate": 1.4277420247788842e-06,
      "loss": 0.3877,
      "step": 1560
    },
    {
      "epoch": 2.0703125,
      "grad_norm": 1.0213733525555977,
      "learning_rate": 1.3262095869885907e-06,
      "loss": 0.3748,
      "step": 1590
    },
    {
      "epoch": 2.109375,
      "grad_norm": 1.0378622376674729,
      "learning_rate": 1.227102973819426e-06,
      "loss": 0.3801,
      "step": 1620
    },
    {
      "epoch": 2.1484375,
      "grad_norm": 1.0636201329760862,
      "learning_rate": 1.1306270048538966e-06,
      "loss": 0.3623,
      "step": 1650
    },
    {
      "epoch": 2.1875,
      "grad_norm": 1.065350906800766,
      "learning_rate": 1.0369810630297658e-06,
      "loss": 0.3652,
      "step": 1680
    },
    {
      "epoch": 2.2265625,
      "grad_norm": 1.065580953707456,
      "learning_rate": 9.463586825834939e-07,
      "loss": 0.3724,
      "step": 1710
    },
    {
      "epoch": 2.265625,
      "grad_norm": 1.1044573731710503,
      "learning_rate": 8.589471490809473e-07,
      "loss": 0.3639,
      "step": 1740
    },
    {
      "epoch": 2.3046875,
      "grad_norm": 1.057219345369191,
      "learning_rate": 7.749271123619889e-07,
      "loss": 0.3665,
      "step": 1770
    },
    {
      "epoch": 2.34375,
      "grad_norm": 1.120692356243222,
      "learning_rate": 6.944722131988394e-07,
      "loss": 0.3624,
      "step": 1800
    },
    {
      "epoch": 2.3828125,
      "grad_norm": 1.1354171819466858,
      "learning_rate": 6.177487244398009e-07,
      "loss": 0.3629,
      "step": 1830
    },
    {
      "epoch": 2.421875,
      "grad_norm": 1.0454720627484864,
      "learning_rate": 5.449152073799616e-07,
      "loss": 0.3739,
      "step": 1860
    },
    {
      "epoch": 2.4609375,
      "grad_norm": 1.1194418224635865,
      "learning_rate": 4.761221840690586e-07,
      "loss": 0.3723,
      "step": 1890
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.097996189333425,
      "learning_rate": 4.115118262337128e-07,
      "loss": 0.377,
      "step": 1920
    },
    {
      "epoch": 2.5390625,
      "grad_norm": 1.0208876828373903,
      "learning_rate": 3.512176614569418e-07,
      "loss": 0.3676,
      "step": 1950
    },
    {
      "epoch": 2.578125,
      "grad_norm": 1.022735887067447,
      "learning_rate": 2.9536429722216207e-07,
      "loss": 0.3714,
      "step": 1980
    },
    {
      "epoch": 2.6171875,
      "grad_norm": 1.04711360457167,
      "learning_rate": 2.440671633920075e-07,
      "loss": 0.3733,
      "step": 2010
    },
    {
      "epoch": 2.65625,
      "grad_norm": 1.0415975853212511,
      "learning_rate": 1.9743227365415092e-07,
      "loss": 0.3694,
      "step": 2040
    },
    {
      "epoch": 2.6953125,
      "grad_norm": 1.106325100873404,
      "learning_rate": 1.5555600642715442e-07,
      "loss": 0.3747,
      "step": 2070
    },
    {
      "epoch": 2.734375,
      "grad_norm": 1.0448870665618393,
      "learning_rate": 1.1852490567913655e-07,
      "loss": 0.3611,
      "step": 2100
    },
    {
      "epoch": 2.7734375,
      "grad_norm": 1.1214742422046289,
      "learning_rate": 8.641550207089039e-08,
      "loss": 0.3686,
      "step": 2130
    },
    {
      "epoch": 2.8125,
      "grad_norm": 1.0756561712922115,
      "learning_rate": 5.92941547931028e-08,
      "loss": 0.3716,
      "step": 2160
    },
    {
      "epoch": 2.8515625,
      "grad_norm": 1.057575658588282,
      "learning_rate": 3.7216914424527686e-08,
      "loss": 0.3624,
      "step": 2190
    },
    {
      "epoch": 2.890625,
      "grad_norm": 1.0540930132763664,
      "learning_rate": 2.0229407094547736e-08,
      "loss": 0.369,
      "step": 2220
    },
    {
      "epoch": 2.9296875,
      "grad_norm": 1.105125578397597,
      "learning_rate": 8.366740189520716e-09,
      "loss": 0.3668,
      "step": 2250
    },
    {
      "epoch": 2.96875,
      "grad_norm": 0.9862762687662256,
      "learning_rate": 1.6534297977804925e-09,
      "loss": 0.3621,
      "step": 2280
    },
    {
      "epoch": 3.0,
      "step": 2304,
      "total_flos": 415352546656256.0,
      "train_loss": 0.4438822174237834,
      "train_runtime": 11247.4679,
      "train_samples_per_second": 26.212,
      "train_steps_per_second": 0.205
    }
  ],
  "logging_steps": 30,
  "max_steps": 2304,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 256,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 415352546656256.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}