{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4233, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007559650366170565, "grad_norm": 32.25, "learning_rate": 7.311320754716981e-07, "loss": 0.6915, "step": 32 }, { "epoch": 0.01511930073234113, "grad_norm": 14.5625, "learning_rate": 1.4858490566037737e-06, "loss": 0.6792, "step": 64 }, { "epoch": 0.022678951098511695, "grad_norm": 16.875, "learning_rate": 2.2405660377358494e-06, "loss": 0.5993, "step": 96 }, { "epoch": 0.03023860146468226, "grad_norm": 11.5625, "learning_rate": 2.995283018867925e-06, "loss": 0.5179, "step": 128 }, { "epoch": 0.03779825183085282, "grad_norm": 14.5625, "learning_rate": 3.7500000000000005e-06, "loss": 0.5111, "step": 160 }, { "epoch": 0.04535790219702339, "grad_norm": 9.8125, "learning_rate": 4.504716981132076e-06, "loss": 0.5109, "step": 192 }, { "epoch": 0.05291755256319395, "grad_norm": 7.34375, "learning_rate": 5.259433962264151e-06, "loss": 0.5106, "step": 224 }, { "epoch": 0.06047720292936452, "grad_norm": 6.1875, "learning_rate": 6.014150943396226e-06, "loss": 0.4514, "step": 256 }, { "epoch": 0.06803685329553508, "grad_norm": 7.875, "learning_rate": 6.768867924528303e-06, "loss": 0.4862, "step": 288 }, { "epoch": 0.07559650366170564, "grad_norm": 6.03125, "learning_rate": 7.523584905660378e-06, "loss": 0.4711, "step": 320 }, { "epoch": 0.08315615402787621, "grad_norm": 6.1875, "learning_rate": 8.278301886792453e-06, "loss": 0.4783, "step": 352 }, { "epoch": 0.09071580439404678, "grad_norm": 7.78125, "learning_rate": 9.03301886792453e-06, "loss": 0.4457, "step": 384 }, { "epoch": 0.09827545476021735, "grad_norm": 6.59375, "learning_rate": 9.787735849056604e-06, "loss": 0.5125, "step": 416 }, { "epoch": 0.1058351051263879, "grad_norm": 13.5625, "learning_rate": 9.99910037719311e-06, "loss": 0.4171, "step": 448 }, { "epoch": 0.11339475549255847, "grad_norm": 3.921875, "learning_rate": 9.994856381944038e-06, "loss": 0.4538, "step": 480 }, { "epoch": 0.12095440585872903, "grad_norm": 6.15625, "learning_rate": 9.987133217483066e-06, "loss": 0.4629, "step": 512 }, { "epoch": 0.1285140562248996, "grad_norm": 6.28125, "learning_rate": 9.975936263383488e-06, "loss": 0.4744, "step": 544 }, { "epoch": 0.13607370659107015, "grad_norm": 7.125, "learning_rate": 9.96127331888816e-06, "loss": 0.4292, "step": 576 }, { "epoch": 0.14363335695724072, "grad_norm": 5.15625, "learning_rate": 9.943154597476943e-06, "loss": 0.4558, "step": 608 }, { "epoch": 0.1511930073234113, "grad_norm": 10.6875, "learning_rate": 9.921592719752486e-06, "loss": 0.448, "step": 640 }, { "epoch": 0.15875265768958186, "grad_norm": 8.125, "learning_rate": 9.896602704649348e-06, "loss": 0.4117, "step": 672 }, { "epoch": 0.16631230805575242, "grad_norm": 4.84375, "learning_rate": 9.868201958972548e-06, "loss": 0.4303, "step": 704 }, { "epoch": 0.173871958421923, "grad_norm": 5.0625, "learning_rate": 9.836410265272857e-06, "loss": 0.4402, "step": 736 }, { "epoch": 0.18143160878809356, "grad_norm": 8.875, "learning_rate": 9.801249768067246e-06, "loss": 0.4242, "step": 768 }, { "epoch": 0.18899125915426412, "grad_norm": 5.625, "learning_rate": 9.762744958414113e-06, "loss": 0.4771, "step": 800 }, { "epoch": 0.1965509095204347, "grad_norm": 14.875, "learning_rate": 9.720922656854032e-06, "loss": 0.4497, "step": 832 }, { "epoch": 0.20411055988660523, "grad_norm": 6.3125, "learning_rate": 9.675811994727897e-06, "loss": 0.4141, "step": 864 }, { "epoch": 0.2116702102527758, "grad_norm": 6.15625, "learning_rate": 9.627444393885463e-06, "loss": 0.432, "step": 896 }, { "epoch": 0.21922986061894637, "grad_norm": 6.28125, "learning_rate": 9.575853544798453e-06, "loss": 0.4253, "step": 928 }, { "epoch": 0.22678951098511693, "grad_norm": 7.4375, "learning_rate": 9.521075383093452e-06, "loss": 0.4334, "step": 960 }, { "epoch": 0.2343491613512875, "grad_norm": 9.0625, "learning_rate": 9.463148064520913e-06, "loss": 0.4595, "step": 992 }, { "epoch": 0.24190881171745807, "grad_norm": 6.71875, "learning_rate": 9.402111938377776e-06, "loss": 0.4401, "step": 1024 }, { "epoch": 0.24946846208362863, "grad_norm": 9.0625, "learning_rate": 9.338009519402132e-06, "loss": 0.4216, "step": 1056 }, { "epoch": 0.2570281124497992, "grad_norm": 7.59375, "learning_rate": 9.270885458159576e-06, "loss": 0.4391, "step": 1088 }, { "epoch": 0.26458776281596974, "grad_norm": 7.875, "learning_rate": 9.200786509941827e-06, "loss": 0.4116, "step": 1120 }, { "epoch": 0.2721474131821403, "grad_norm": 11.5625, "learning_rate": 9.127761502199325e-06, "loss": 0.4004, "step": 1152 }, { "epoch": 0.2797070635483109, "grad_norm": 7.8125, "learning_rate": 9.051861300530438e-06, "loss": 0.4261, "step": 1184 }, { "epoch": 0.28726671391448144, "grad_norm": 6.53125, "learning_rate": 8.973138773251015e-06, "loss": 0.4075, "step": 1216 }, { "epoch": 0.294826364280652, "grad_norm": 6.78125, "learning_rate": 8.891648754568943e-06, "loss": 0.4398, "step": 1248 }, { "epoch": 0.3023860146468226, "grad_norm": 7.40625, "learning_rate": 8.807448006389343e-06, "loss": 0.4517, "step": 1280 }, { "epoch": 0.30994566501299314, "grad_norm": 4.3125, "learning_rate": 8.720595178777063e-06, "loss": 0.4254, "step": 1312 }, { "epoch": 0.3175053153791637, "grad_norm": 7.25, "learning_rate": 8.631150769103934e-06, "loss": 0.441, "step": 1344 }, { "epoch": 0.3250649657453343, "grad_norm": 6.65625, "learning_rate": 8.539177079909315e-06, "loss": 0.4337, "step": 1376 }, { "epoch": 0.33262461611150484, "grad_norm": 5.8125, "learning_rate": 8.444738175503222e-06, "loss": 0.4537, "step": 1408 }, { "epoch": 0.3401842664776754, "grad_norm": 5.5625, "learning_rate": 8.347899837342315e-06, "loss": 0.4071, "step": 1440 }, { "epoch": 0.347743916843846, "grad_norm": 7.875, "learning_rate": 8.2487295182098e-06, "loss": 0.4612, "step": 1472 }, { "epoch": 0.35530356721001655, "grad_norm": 5.1875, "learning_rate": 8.147296295231158e-06, "loss": 0.4296, "step": 1504 }, { "epoch": 0.3628632175761871, "grad_norm": 8.625, "learning_rate": 8.04367082175845e-06, "loss": 0.4491, "step": 1536 }, { "epoch": 0.3704228679423577, "grad_norm": 5.25, "learning_rate": 7.937925278156698e-06, "loss": 0.4132, "step": 1568 }, { "epoch": 0.37798251830852825, "grad_norm": 6.8125, "learning_rate": 7.830133321526615e-06, "loss": 0.4068, "step": 1600 }, { "epoch": 0.3855421686746988, "grad_norm": 4.375, "learning_rate": 7.720370034398741e-06, "loss": 0.4499, "step": 1632 }, { "epoch": 0.3931018190408694, "grad_norm": 6.96875, "learning_rate": 7.608711872434648e-06, "loss": 0.4256, "step": 1664 }, { "epoch": 0.40066146940703995, "grad_norm": 9.875, "learning_rate": 7.495236611171741e-06, "loss": 0.428, "step": 1696 }, { "epoch": 0.40822111977321046, "grad_norm": 5.90625, "learning_rate": 7.3800232918486715e-06, "loss": 0.4146, "step": 1728 }, { "epoch": 0.41578077013938103, "grad_norm": 6.4375, "learning_rate": 7.263152166349122e-06, "loss": 0.4476, "step": 1760 }, { "epoch": 0.4233404205055516, "grad_norm": 7.3125, "learning_rate": 7.144704641302337e-06, "loss": 0.4387, "step": 1792 }, { "epoch": 0.43090007087172216, "grad_norm": 6.84375, "learning_rate": 7.024763221379289e-06, "loss": 0.4276, "step": 1824 }, { "epoch": 0.43845972123789273, "grad_norm": 7.25, "learning_rate": 6.903411451824033e-06, "loss": 0.4482, "step": 1856 }, { "epoch": 0.4460193716040633, "grad_norm": 6.9375, "learning_rate": 6.780733860260216e-06, "loss": 0.4187, "step": 1888 }, { "epoch": 0.45357902197023386, "grad_norm": 6.09375, "learning_rate": 6.6568158978133455e-06, "loss": 0.402, "step": 1920 }, { "epoch": 0.46113867233640443, "grad_norm": 5.34375, "learning_rate": 6.531743879589754e-06, "loss": 0.4157, "step": 1952 }, { "epoch": 0.468698322702575, "grad_norm": 5.8125, "learning_rate": 6.405604924553797e-06, "loss": 0.4771, "step": 1984 }, { "epoch": 0.47625797306874557, "grad_norm": 6.53125, "learning_rate": 6.278486894845084e-06, "loss": 0.4408, "step": 2016 }, { "epoch": 0.48381762343491613, "grad_norm": 5.96875, "learning_rate": 6.150478334578085e-06, "loss": 0.4434, "step": 2048 }, { "epoch": 0.4913772738010867, "grad_norm": 8.3125, "learning_rate": 6.021668408166688e-06, "loss": 0.4214, "step": 2080 }, { "epoch": 0.49893692416725727, "grad_norm": 4.28125, "learning_rate": 5.892146838216687e-06, "loss": 0.4164, "step": 2112 }, { "epoch": 0.5064965745334278, "grad_norm": 4.25, "learning_rate": 5.762003843029466e-06, "loss": 0.426, "step": 2144 }, { "epoch": 0.5140562248995983, "grad_norm": 6.96875, "learning_rate": 5.631330073760413e-06, "loss": 0.4205, "step": 2176 }, { "epoch": 0.5216158752657689, "grad_norm": 6.6875, "learning_rate": 5.500216551275807e-06, "loss": 0.4429, "step": 2208 }, { "epoch": 0.5291755256319395, "grad_norm": 7.15625, "learning_rate": 5.368754602752213e-06, "loss": 0.431, "step": 2240 }, { "epoch": 0.53673517599811, "grad_norm": 5.28125, "learning_rate": 5.237035798062489e-06, "loss": 0.4224, "step": 2272 }, { "epoch": 0.5442948263642806, "grad_norm": 8.125, "learning_rate": 5.105151885992754e-06, "loss": 0.4194, "step": 2304 }, { "epoch": 0.5518544767304512, "grad_norm": 5.5, "learning_rate": 4.9731947303347485e-06, "loss": 0.434, "step": 2336 }, { "epoch": 0.5594141270966217, "grad_norm": 6.09375, "learning_rate": 4.841256245898055e-06, "loss": 0.4308, "step": 2368 }, { "epoch": 0.5669737774627923, "grad_norm": 4.3125, "learning_rate": 4.709428334486816e-06, "loss": 0.3907, "step": 2400 }, { "epoch": 0.5745334278289629, "grad_norm": 6.96875, "learning_rate": 4.577802820885482e-06, "loss": 0.4226, "step": 2432 }, { "epoch": 0.5820930781951335, "grad_norm": 5.75, "learning_rate": 4.446471388898236e-06, "loss": 0.4216, "step": 2464 }, { "epoch": 0.589652728561304, "grad_norm": 6.46875, "learning_rate": 4.315525517486586e-06, "loss": 0.4632, "step": 2496 }, { "epoch": 0.5972123789274746, "grad_norm": 3.46875, "learning_rate": 4.185056417049674e-06, "loss": 0.4304, "step": 2528 }, { "epoch": 0.6047720292936452, "grad_norm": 8.25, "learning_rate": 4.055154965891625e-06, "loss": 0.451, "step": 2560 }, { "epoch": 0.6123316796598157, "grad_norm": 6.78125, "learning_rate": 3.925911646920235e-06, "loss": 0.3851, "step": 2592 }, { "epoch": 0.6198913300259863, "grad_norm": 6.84375, "learning_rate": 3.797416484621057e-06, "loss": 0.4486, "step": 2624 }, { "epoch": 0.6274509803921569, "grad_norm": 5.28125, "learning_rate": 3.669758982350821e-06, "loss": 0.4258, "step": 2656 }, { "epoch": 0.6350106307583274, "grad_norm": 6.53125, "learning_rate": 3.5430280599938204e-06, "loss": 0.4303, "step": 2688 }, { "epoch": 0.642570281124498, "grad_norm": 12.1875, "learning_rate": 3.4173119920247454e-06, "loss": 0.4466, "step": 2720 }, { "epoch": 0.6501299314906686, "grad_norm": 5.1875, "learning_rate": 3.2926983460210564e-06, "loss": 0.4131, "step": 2752 }, { "epoch": 0.6576895818568391, "grad_norm": 4.6875, "learning_rate": 3.1692739216677483e-06, "loss": 0.4672, "step": 2784 }, { "epoch": 0.6652492322230097, "grad_norm": 5.5625, "learning_rate": 3.0471246902970032e-06, "loss": 0.4291, "step": 2816 }, { "epoch": 0.6728088825891803, "grad_norm": 4.71875, "learning_rate": 2.926335735004817e-06, "loss": 0.4264, "step": 2848 }, { "epoch": 0.6803685329553508, "grad_norm": 6.0625, "learning_rate": 2.8069911913863414e-06, "loss": 0.422, "step": 2880 }, { "epoch": 0.6879281833215214, "grad_norm": 6.9375, "learning_rate": 2.689174188931202e-06, "loss": 0.4005, "step": 2912 }, { "epoch": 0.695487833687692, "grad_norm": 6.125, "learning_rate": 2.5729667931196103e-06, "loss": 0.4137, "step": 2944 }, { "epoch": 0.7030474840538625, "grad_norm": 4.0625, "learning_rate": 2.4584499482596274e-06, "loss": 0.4145, "step": 2976 }, { "epoch": 0.7106071344200331, "grad_norm": 6.1875, "learning_rate": 2.3457034211053703e-06, "loss": 0.4601, "step": 3008 }, { "epoch": 0.7181667847862037, "grad_norm": 9.0, "learning_rate": 2.234805745295457e-06, "loss": 0.4238, "step": 3040 }, { "epoch": 0.7257264351523742, "grad_norm": 15.625, "learning_rate": 2.125834166650354e-06, "loss": 0.4579, "step": 3072 }, { "epoch": 0.7332860855185448, "grad_norm": 4.8125, "learning_rate": 2.018864589366778e-06, "loss": 0.4183, "step": 3104 }, { "epoch": 0.7408457358847154, "grad_norm": 6.1875, "learning_rate": 1.9139715231466014e-06, "loss": 0.4387, "step": 3136 }, { "epoch": 0.7484053862508859, "grad_norm": 7.03125, "learning_rate": 1.811228031297077e-06, "loss": 0.4367, "step": 3168 }, { "epoch": 0.7559650366170565, "grad_norm": 5.65625, "learning_rate": 1.7107056798385763e-06, "loss": 0.451, "step": 3200 }, { "epoch": 0.7635246869832271, "grad_norm": 6.65625, "learning_rate": 1.6124744876552373e-06, "loss": 0.4101, "step": 3232 }, { "epoch": 0.7710843373493976, "grad_norm": 6.8125, "learning_rate": 1.5166028777232884e-06, "loss": 0.3734, "step": 3264 }, { "epoch": 0.7786439877155682, "grad_norm": 5.34375, "learning_rate": 1.4231576294510013e-06, "loss": 0.4194, "step": 3296 }, { "epoch": 0.7862036380817388, "grad_norm": 7.8125, "learning_rate": 1.3322038321634567e-06, "loss": 0.465, "step": 3328 }, { "epoch": 0.7937632884479093, "grad_norm": 5.75, "learning_rate": 1.2438048397645558e-06, "loss": 0.4751, "step": 3360 }, { "epoch": 0.8013229388140799, "grad_norm": 8.75, "learning_rate": 1.1580222266078367e-06, "loss": 0.401, "step": 3392 }, { "epoch": 0.8088825891802505, "grad_norm": 8.375, "learning_rate": 1.0749157446068242e-06, "loss": 0.4418, "step": 3424 }, { "epoch": 0.8164422395464209, "grad_norm": 7.375, "learning_rate": 9.945432816148175e-07, "loss": 0.4405, "step": 3456 }, { "epoch": 0.8240018899125915, "grad_norm": 4.84375, "learning_rate": 9.169608211030783e-07, "loss": 0.4298, "step": 3488 }, { "epoch": 0.8315615402787621, "grad_norm": 5.78125, "learning_rate": 8.422224031655313e-07, "loss": 0.4156, "step": 3520 }, { "epoch": 0.8391211906449326, "grad_norm": 21.25, "learning_rate": 7.703800868771e-07, "loss": 0.4467, "step": 3552 }, { "epoch": 0.8466808410111032, "grad_norm": 14.3125, "learning_rate": 7.014839140319485e-07, "loss": 0.4443, "step": 3584 }, { "epoch": 0.8542404913772738, "grad_norm": 6.59375, "learning_rate": 6.355818742868447e-07, "loss": 0.4381, "step": 3616 }, { "epoch": 0.8618001417434443, "grad_norm": 5.46875, "learning_rate": 5.727198717339511e-07, "loss": 0.405, "step": 3648 }, { "epoch": 0.8693597921096149, "grad_norm": 4.3125, "learning_rate": 5.129416929263031e-07, "loss": 0.4161, "step": 3680 }, { "epoch": 0.8769194424757855, "grad_norm": 6.84375, "learning_rate": 4.5628897637827354e-07, "loss": 0.4294, "step": 3712 }, { "epoch": 0.884479092841956, "grad_norm": 6.0, "learning_rate": 4.028011835622492e-07, "loss": 0.4084, "step": 3744 }, { "epoch": 0.8920387432081266, "grad_norm": 5.1875, "learning_rate": 3.525155714217227e-07, "loss": 0.3902, "step": 3776 }, { "epoch": 0.8995983935742972, "grad_norm": 5.8125, "learning_rate": 3.054671664199543e-07, "loss": 0.423, "step": 3808 }, { "epoch": 0.9071580439404677, "grad_norm": 6.75, "learning_rate": 2.616887401422796e-07, "loss": 0.4108, "step": 3840 }, { "epoch": 0.9147176943066383, "grad_norm": 5.9375, "learning_rate": 2.212107864690438e-07, "loss": 0.4546, "step": 3872 }, { "epoch": 0.9222773446728089, "grad_norm": 8.0625, "learning_rate": 1.8406150033507764e-07, "loss": 0.4434, "step": 3904 }, { "epoch": 0.9298369950389794, "grad_norm": 10.4375, "learning_rate": 1.502667580905054e-07, "loss": 0.4149, "step": 3936 }, { "epoch": 0.93739664540515, "grad_norm": 6.0625, "learning_rate": 1.1985009947656278e-07, "loss": 0.4504, "step": 3968 }, { "epoch": 0.9449562957713206, "grad_norm": 5.625, "learning_rate": 9.283271122898174e-08, "loss": 0.437, "step": 4000 }, { "epoch": 0.9525159461374911, "grad_norm": 6.90625, "learning_rate": 6.923341232035863e-08, "loss": 0.4205, "step": 4032 }, { "epoch": 0.9600755965036617, "grad_norm": 9.625, "learning_rate": 4.9068640851792636e-08, "loss": 0.4221, "step": 4064 }, { "epoch": 0.9676352468698323, "grad_norm": 5.09375, "learning_rate": 3.235244260292147e-08, "loss": 0.4172, "step": 4096 }, { "epoch": 0.9751948972360028, "grad_norm": 6.0, "learning_rate": 1.909646124832576e-08, "loss": 0.415, "step": 4128 }, { "epoch": 0.9827545476021734, "grad_norm": 7.34375, "learning_rate": 9.30993024712279e-09, "loss": 0.4298, "step": 4160 }, { "epoch": 0.990314197968344, "grad_norm": 5.25, "learning_rate": 2.999666411398483e-09, "loss": 0.3673, "step": 4192 }, { "epoch": 0.9978738483345145, "grad_norm": 5.28125, "learning_rate": 1.7006515795336963e-10, "loss": 0.422, "step": 4224 }, { "epoch": 1.0, "step": 4233, "total_flos": 7.498056185637274e+16, "train_loss": 0.4399125433520113, "train_runtime": 1541.2359, "train_samples_per_second": 10.985, "train_steps_per_second": 2.746 } ], "logging_steps": 32, "max_steps": 4233, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.498056185637274e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }