{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.976, "eval_steps": 500, "global_step": 93, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 6.086521555575525, "learning_rate": 2.0000000000000003e-06, "loss": 0.8709, "step": 1 }, { "epoch": 0.064, "grad_norm": 5.862669428136825, "learning_rate": 4.000000000000001e-06, "loss": 0.8625, "step": 2 }, { "epoch": 0.096, "grad_norm": 5.665430905084772, "learning_rate": 6e-06, "loss": 0.8709, "step": 3 }, { "epoch": 0.128, "grad_norm": 3.9717562012437484, "learning_rate": 8.000000000000001e-06, "loss": 0.7764, "step": 4 }, { "epoch": 0.16, "grad_norm": 2.2462420454815217, "learning_rate": 1e-05, "loss": 0.7977, "step": 5 }, { "epoch": 0.192, "grad_norm": 4.275087411533909, "learning_rate": 1.2e-05, "loss": 0.8235, "step": 6 }, { "epoch": 0.224, "grad_norm": 4.540127453722676, "learning_rate": 1.4e-05, "loss": 0.7536, "step": 7 }, { "epoch": 0.256, "grad_norm": 5.958818552449376, "learning_rate": 1.6000000000000003e-05, "loss": 0.7817, "step": 8 }, { "epoch": 0.288, "grad_norm": 4.615179916049989, "learning_rate": 1.8e-05, "loss": 0.7716, "step": 9 }, { "epoch": 0.32, "grad_norm": 2.3655854780686663, "learning_rate": 2e-05, "loss": 0.6932, "step": 10 }, { "epoch": 0.352, "grad_norm": 2.478544642789441, "learning_rate": 1.9992837548163315e-05, "loss": 0.7029, "step": 11 }, { "epoch": 0.384, "grad_norm": 1.9787874769183103, "learning_rate": 1.9971360452796523e-05, "loss": 0.7003, "step": 12 }, { "epoch": 0.416, "grad_norm": 1.3386567233728712, "learning_rate": 1.993559947963185e-05, "loss": 0.6677, "step": 13 }, { "epoch": 0.448, "grad_norm": 1.177972539602327, "learning_rate": 1.9885605855918887e-05, "loss": 0.6495, "step": 14 }, { "epoch": 0.48, "grad_norm": 1.1019717904815112, "learning_rate": 1.9821451197042028e-05, "loss": 0.5967, "step": 15 }, { "epoch": 0.512, "grad_norm": 0.972479825971204, "learning_rate": 1.9743227403932135e-05, "loss": 0.6233, "step": 16 }, { "epoch": 0.544, "grad_norm": 0.9169380714246801, "learning_rate": 1.9651046531419335e-05, "loss": 0.5939, "step": 17 }, { "epoch": 0.576, "grad_norm": 0.7484870741589357, "learning_rate": 1.9545040627715554e-05, "loss": 0.5913, "step": 18 }, { "epoch": 0.608, "grad_norm": 0.8433187788934118, "learning_rate": 1.942536154525673e-05, "loss": 0.5926, "step": 19 }, { "epoch": 0.64, "grad_norm": 0.7004246074241133, "learning_rate": 1.9292180723175656e-05, "loss": 0.5679, "step": 20 }, { "epoch": 0.672, "grad_norm": 0.9310125911747377, "learning_rate": 1.9145688941717074e-05, "loss": 0.5644, "step": 21 }, { "epoch": 0.704, "grad_norm": 0.7970495324653322, "learning_rate": 1.8986096048946826e-05, "loss": 0.608, "step": 22 }, { "epoch": 0.736, "grad_norm": 0.8089072723260873, "learning_rate": 1.881363066014649e-05, "loss": 0.5677, "step": 23 }, { "epoch": 0.768, "grad_norm": 1.0367436501959517, "learning_rate": 1.862853983032423e-05, "loss": 0.6124, "step": 24 }, { "epoch": 0.8, "grad_norm": 0.6549565017520195, "learning_rate": 1.8431088700310846e-05, "loss": 0.565, "step": 25 }, { "epoch": 0.832, "grad_norm": 0.8100479455474157, "learning_rate": 1.8221560116948103e-05, "loss": 0.5536, "step": 26 }, { "epoch": 0.864, "grad_norm": 0.6655173946648272, "learning_rate": 1.8000254227913346e-05, "loss": 0.5927, "step": 27 }, { "epoch": 0.896, "grad_norm": 0.6489458832967611, "learning_rate": 1.7767488051760858e-05, "loss": 0.5996, "step": 28 }, { "epoch": 0.928, "grad_norm": 0.6556791371183601, "learning_rate": 1.7523595023795814e-05, "loss": 0.5565, "step": 29 }, { "epoch": 0.96, "grad_norm": 0.6656680446819412, "learning_rate": 1.7268924518431437e-05, "loss": 0.5691, "step": 30 }, { "epoch": 0.992, "grad_norm": 0.6975728008241743, "learning_rate": 1.700384134871351e-05, "loss": 0.5634, "step": 31 }, { "epoch": 1.024, "grad_norm": 1.0956954168489788, "learning_rate": 1.672872524372919e-05, "loss": 0.9416, "step": 32 }, { "epoch": 1.056, "grad_norm": 0.48720084397077673, "learning_rate": 1.644397030464877e-05, "loss": 0.4704, "step": 33 }, { "epoch": 1.088, "grad_norm": 0.7718389138248645, "learning_rate": 1.614998444017954e-05, "loss": 0.5326, "step": 34 }, { "epoch": 1.12, "grad_norm": 0.6152935642153501, "learning_rate": 1.5847188782240473e-05, "loss": 0.5174, "step": 35 }, { "epoch": 1.152, "grad_norm": 0.5860900719025863, "learning_rate": 1.5536017082694846e-05, "loss": 0.5031, "step": 36 }, { "epoch": 1.184, "grad_norm": 0.9098930960008285, "learning_rate": 1.5216915092004847e-05, "loss": 0.4993, "step": 37 }, { "epoch": 1.216, "grad_norm": 0.6433773311840126, "learning_rate": 1.4890339920698334e-05, "loss": 0.4832, "step": 38 }, { "epoch": 1.248, "grad_norm": 0.574508872316764, "learning_rate": 1.4556759384562418e-05, "loss": 0.5301, "step": 39 }, { "epoch": 1.28, "grad_norm": 0.6177654141668454, "learning_rate": 1.421665133450184e-05, "loss": 0.4809, "step": 40 }, { "epoch": 1.312, "grad_norm": 0.5915067861936215, "learning_rate": 1.3870502972022175e-05, "loss": 0.4947, "step": 41 }, { "epoch": 1.3439999999999999, "grad_norm": 0.6003229031077386, "learning_rate": 1.351881015131833e-05, "loss": 0.5285, "step": 42 }, { "epoch": 1.376, "grad_norm": 0.5286636651815204, "learning_rate": 1.316207666896824e-05, "loss": 0.5134, "step": 43 }, { "epoch": 1.408, "grad_norm": 0.5729916541561343, "learning_rate": 1.2800813542249073e-05, "loss": 0.4392, "step": 44 }, { "epoch": 1.44, "grad_norm": 0.5389286220120857, "learning_rate": 1.2435538277109919e-05, "loss": 0.4811, "step": 45 }, { "epoch": 1.472, "grad_norm": 0.5162482888115615, "learning_rate": 1.206677412684953e-05, "loss": 0.5603, "step": 46 }, { "epoch": 1.504, "grad_norm": 0.5836229105696747, "learning_rate": 1.1695049342560969e-05, "loss": 0.452, "step": 47 }, { "epoch": 1.536, "grad_norm": 0.5251402934459473, "learning_rate": 1.1320896416417026e-05, "loss": 0.5007, "step": 48 }, { "epoch": 1.568, "grad_norm": 0.49581981068641073, "learning_rate": 1.0944851318880314e-05, "loss": 0.4815, "step": 49 }, { "epoch": 1.6, "grad_norm": 0.6328540516916794, "learning_rate": 1.0567452730930743e-05, "loss": 0.559, "step": 50 }, { "epoch": 1.6320000000000001, "grad_norm": 0.48288716820093464, "learning_rate": 1.0189241272410191e-05, "loss": 0.4568, "step": 51 }, { "epoch": 1.6640000000000001, "grad_norm": 0.43453357593747727, "learning_rate": 9.810758727589814e-06, "loss": 0.5136, "step": 52 }, { "epoch": 1.696, "grad_norm": 0.43509974292321796, "learning_rate": 9.43254726906926e-06, "loss": 0.4613, "step": 53 }, { "epoch": 1.728, "grad_norm": 0.5105120033270906, "learning_rate": 9.055148681119688e-06, "loss": 0.5104, "step": 54 }, { "epoch": 1.76, "grad_norm": 0.4655178250091279, "learning_rate": 8.67910358358298e-06, "loss": 0.515, "step": 55 }, { "epoch": 1.792, "grad_norm": 0.4271675109217912, "learning_rate": 8.304950657439034e-06, "loss": 0.4594, "step": 56 }, { "epoch": 1.8239999999999998, "grad_norm": 0.49279836130724164, "learning_rate": 7.93322587315047e-06, "loss": 0.5102, "step": 57 }, { "epoch": 1.8559999999999999, "grad_norm": 0.4312602763649386, "learning_rate": 7.564461722890082e-06, "loss": 0.461, "step": 58 }, { "epoch": 1.888, "grad_norm": 0.39722581396114753, "learning_rate": 7.199186457750931e-06, "loss": 0.5036, "step": 59 }, { "epoch": 1.92, "grad_norm": 0.3707840503108745, "learning_rate": 6.837923331031761e-06, "loss": 0.4449, "step": 60 }, { "epoch": 1.952, "grad_norm": 0.5482704117501644, "learning_rate": 6.48118984868167e-06, "loss": 0.492, "step": 61 }, { "epoch": 1.984, "grad_norm": 0.37776373909746913, "learning_rate": 6.129497027977829e-06, "loss": 0.4131, "step": 62 }, { "epoch": 2.016, "grad_norm": 0.8547764035968157, "learning_rate": 5.78334866549816e-06, "loss": 0.8721, "step": 63 }, { "epoch": 2.048, "grad_norm": 0.4825739846598577, "learning_rate": 5.443240615437586e-06, "loss": 0.4129, "step": 64 }, { "epoch": 2.08, "grad_norm": 0.48613435690189827, "learning_rate": 5.109660079301668e-06, "loss": 0.4016, "step": 65 }, { "epoch": 2.112, "grad_norm": 0.5173122404541247, "learning_rate": 4.783084907995156e-06, "loss": 0.4602, "step": 66 }, { "epoch": 2.144, "grad_norm": 0.4177033225672536, "learning_rate": 4.463982917305155e-06, "loss": 0.4306, "step": 67 }, { "epoch": 2.176, "grad_norm": 0.47335210102568437, "learning_rate": 4.152811217759529e-06, "loss": 0.4554, "step": 68 }, { "epoch": 2.208, "grad_norm": 0.5212910634677221, "learning_rate": 3.850015559820465e-06, "loss": 0.4453, "step": 69 }, { "epoch": 2.24, "grad_norm": 0.4937857543597342, "learning_rate": 3.5560296953512296e-06, "loss": 0.4501, "step": 70 }, { "epoch": 2.2720000000000002, "grad_norm": 0.456794357715941, "learning_rate": 3.2712747562708115e-06, "loss": 0.4418, "step": 71 }, { "epoch": 2.304, "grad_norm": 0.41067614923626466, "learning_rate": 2.9961586512864947e-06, "loss": 0.3938, "step": 72 }, { "epoch": 2.336, "grad_norm": 0.5398460548951893, "learning_rate": 2.7310754815685627e-06, "loss": 0.5264, "step": 73 }, { "epoch": 2.368, "grad_norm": 0.39421897779539533, "learning_rate": 2.4764049762041874e-06, "loss": 0.4214, "step": 74 }, { "epoch": 2.4, "grad_norm": 0.45677772727826993, "learning_rate": 2.2325119482391466e-06, "loss": 0.4438, "step": 75 }, { "epoch": 2.432, "grad_norm": 0.3614990193323558, "learning_rate": 1.9997457720866554e-06, "loss": 0.3733, "step": 76 }, { "epoch": 2.464, "grad_norm": 0.4657826146229637, "learning_rate": 1.7784398830519002e-06, "loss": 0.4528, "step": 77 }, { "epoch": 2.496, "grad_norm": 0.37950794177613073, "learning_rate": 1.5689112996891576e-06, "loss": 0.4219, "step": 78 }, { "epoch": 2.528, "grad_norm": 0.40926112347112487, "learning_rate": 1.3714601696757713e-06, "loss": 0.4557, "step": 79 }, { "epoch": 2.56, "grad_norm": 0.3444002577522251, "learning_rate": 1.1863693398535115e-06, "loss": 0.3943, "step": 80 }, { "epoch": 2.592, "grad_norm": 0.3404781664005299, "learning_rate": 1.01390395105318e-06, "loss": 0.4281, "step": 81 }, { "epoch": 2.624, "grad_norm": 0.3448953746542976, "learning_rate": 8.543110582829272e-07, "loss": 0.4293, "step": 82 }, { "epoch": 2.656, "grad_norm": 0.320896402093994, "learning_rate": 7.078192768243486e-07, "loss": 0.3955, "step": 83 }, { "epoch": 2.6879999999999997, "grad_norm": 0.3635340073918507, "learning_rate": 5.746384547432738e-07, "loss": 0.4496, "step": 84 }, { "epoch": 2.7199999999999998, "grad_norm": 0.3153399588065185, "learning_rate": 4.549593722844492e-07, "loss": 0.3784, "step": 85 }, { "epoch": 2.752, "grad_norm": 0.32229126417822246, "learning_rate": 3.4895346858066723e-07, "loss": 0.4526, "step": 86 }, { "epoch": 2.784, "grad_norm": 0.33084039803529036, "learning_rate": 2.5677259606786686e-07, "loss": 0.3698, "step": 87 }, { "epoch": 2.816, "grad_norm": 0.3521643205936658, "learning_rate": 1.7854880295797406e-07, "loss": 0.4528, "step": 88 }, { "epoch": 2.848, "grad_norm": 0.3379615978848595, "learning_rate": 1.1439414408111471e-07, "loss": 0.4464, "step": 89 }, { "epoch": 2.88, "grad_norm": 0.3383417229866119, "learning_rate": 6.440052036815081e-08, "loss": 0.3912, "step": 90 }, { "epoch": 2.912, "grad_norm": 0.3088668241871979, "learning_rate": 2.86395472034795e-08, "loss": 0.4426, "step": 91 }, { "epoch": 2.944, "grad_norm": 0.32476651809192353, "learning_rate": 7.162451836685291e-09, "loss": 0.4281, "step": 92 }, { "epoch": 2.976, "grad_norm": 0.3358967063808013, "learning_rate": 0.0, "loss": 0.4736, "step": 93 }, { "epoch": 2.976, "step": 93, "total_flos": 1.818538711009198e+17, "train_loss": 0.53916282679445, "train_runtime": 9093.0931, "train_samples_per_second": 0.989, "train_steps_per_second": 0.01 } ], "logging_steps": 1.0, "max_steps": 93, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.818538711009198e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }