{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 5000.0, "global_step": 29652, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.050586806960744635, "grad_norm": 3.026057720184326, "learning_rate": 4.9156886550654254e-05, "loss": 7.7479, "step": 500 }, { "epoch": 0.10117361392148927, "grad_norm": 2.428532361984253, "learning_rate": 4.831377310130851e-05, "loss": 7.602, "step": 1000 }, { "epoch": 0.1517604208822339, "grad_norm": 3.12160587310791, "learning_rate": 4.7470659651962764e-05, "loss": 7.5609, "step": 1500 }, { "epoch": 0.20234722784297854, "grad_norm": 2.7097604274749756, "learning_rate": 4.662754620261703e-05, "loss": 7.5615, "step": 2000 }, { "epoch": 0.2529340348037232, "grad_norm": 2.684849500656128, "learning_rate": 4.578443275327128e-05, "loss": 7.5591, "step": 2500 }, { "epoch": 0.3035208417644678, "grad_norm": 2.7275538444519043, "learning_rate": 4.494131930392554e-05, "loss": 7.5242, "step": 3000 }, { "epoch": 0.35410764872521244, "grad_norm": 3.8471314907073975, "learning_rate": 4.409820585457979e-05, "loss": 7.5216, "step": 3500 }, { "epoch": 0.4046944556859571, "grad_norm": 3.268939971923828, "learning_rate": 4.325509240523405e-05, "loss": 7.5121, "step": 4000 }, { "epoch": 0.45528126264670177, "grad_norm": 4.609728813171387, "learning_rate": 4.24119789558883e-05, "loss": 7.4998, "step": 4500 }, { "epoch": 0.5058680696074463, "grad_norm": 3.0856144428253174, "learning_rate": 4.156886550654257e-05, "loss": 7.4945, "step": 5000 }, { "epoch": 0.556454876568191, "grad_norm": 3.188549757003784, "learning_rate": 4.072575205719682e-05, "loss": 7.4944, "step": 5500 }, { "epoch": 0.6070416835289356, "grad_norm": 3.380218982696533, "learning_rate": 3.988263860785108e-05, "loss": 7.5203, "step": 6000 }, { "epoch": 0.6576284904896803, "grad_norm": 3.2636780738830566, "learning_rate": 3.903952515850533e-05, "loss": 7.4899, "step": 6500 }, { "epoch": 0.7082152974504249, "grad_norm": 3.4519433975219727, "learning_rate": 3.819641170915959e-05, "loss": 7.4763, "step": 7000 }, { "epoch": 0.7588021044111696, "grad_norm": 3.5375866889953613, "learning_rate": 3.735329825981384e-05, "loss": 7.4911, "step": 7500 }, { "epoch": 0.8093889113719142, "grad_norm": 3.090251922607422, "learning_rate": 3.65101848104681e-05, "loss": 7.4606, "step": 8000 }, { "epoch": 0.8599757183326588, "grad_norm": 2.6117258071899414, "learning_rate": 3.5667071361122356e-05, "loss": 7.4925, "step": 8500 }, { "epoch": 0.9105625252934035, "grad_norm": 3.3298494815826416, "learning_rate": 3.482395791177661e-05, "loss": 7.4626, "step": 9000 }, { "epoch": 0.9611493322541481, "grad_norm": 3.267622232437134, "learning_rate": 3.398084446243087e-05, "loss": 7.4583, "step": 9500 }, { "epoch": 1.0117361392148927, "grad_norm": 3.9384641647338867, "learning_rate": 3.313773101308512e-05, "loss": 7.471, "step": 10000 }, { "epoch": 1.0623229461756374, "grad_norm": 3.3493237495422363, "learning_rate": 3.229461756373938e-05, "loss": 7.4833, "step": 10500 }, { "epoch": 1.112909753136382, "grad_norm": 4.062368869781494, "learning_rate": 3.145150411439363e-05, "loss": 7.4637, "step": 11000 }, { "epoch": 1.1634965600971268, "grad_norm": 3.0285532474517822, "learning_rate": 3.0608390665047894e-05, "loss": 7.4524, "step": 11500 }, { "epoch": 1.2140833670578712, "grad_norm": 3.087231159210205, "learning_rate": 2.976527721570215e-05, "loss": 7.4797, "step": 12000 }, { "epoch": 1.264670174018616, "grad_norm": 2.7964272499084473, "learning_rate": 2.8922163766356404e-05, "loss": 7.4641, "step": 12500 }, { "epoch": 1.3152569809793606, "grad_norm": 3.68481183052063, "learning_rate": 2.807905031701066e-05, "loss": 7.452, "step": 13000 }, { "epoch": 1.3658437879401053, "grad_norm": 3.3050220012664795, "learning_rate": 2.7235936867664915e-05, "loss": 7.4593, "step": 13500 }, { "epoch": 1.41643059490085, "grad_norm": 2.939967155456543, "learning_rate": 2.639282341831917e-05, "loss": 7.4565, "step": 14000 }, { "epoch": 1.4670174018615945, "grad_norm": 2.9299378395080566, "learning_rate": 2.5549709968973428e-05, "loss": 7.4769, "step": 14500 }, { "epoch": 1.5176042088223392, "grad_norm": 4.469327449798584, "learning_rate": 2.470659651962768e-05, "loss": 7.4498, "step": 15000 }, { "epoch": 1.5681910157830838, "grad_norm": 3.3183658123016357, "learning_rate": 2.386348307028194e-05, "loss": 7.4527, "step": 15500 }, { "epoch": 1.6187778227438283, "grad_norm": 3.6595232486724854, "learning_rate": 2.3020369620936194e-05, "loss": 7.4346, "step": 16000 }, { "epoch": 1.669364629704573, "grad_norm": 3.1423637866973877, "learning_rate": 2.217725617159045e-05, "loss": 7.4697, "step": 16500 }, { "epoch": 1.7199514366653177, "grad_norm": 2.9798882007598877, "learning_rate": 2.1334142722244707e-05, "loss": 7.4587, "step": 17000 }, { "epoch": 1.7705382436260622, "grad_norm": 3.496962547302246, "learning_rate": 2.0491029272898962e-05, "loss": 7.4842, "step": 17500 }, { "epoch": 1.821125050586807, "grad_norm": 3.2860915660858154, "learning_rate": 1.9647915823553217e-05, "loss": 7.4505, "step": 18000 }, { "epoch": 1.8717118575475515, "grad_norm": 3.7444324493408203, "learning_rate": 1.8804802374207476e-05, "loss": 7.467, "step": 18500 }, { "epoch": 1.9222986645082962, "grad_norm": 12.527898788452148, "learning_rate": 1.796168892486173e-05, "loss": 7.4629, "step": 19000 }, { "epoch": 1.972885471469041, "grad_norm": 3.0026357173919678, "learning_rate": 1.7118575475515986e-05, "loss": 7.4508, "step": 19500 }, { "epoch": 2.0234722784297854, "grad_norm": 3.080428123474121, "learning_rate": 1.6275462026170245e-05, "loss": 7.4562, "step": 20000 }, { "epoch": 2.0740590853905303, "grad_norm": 3.0115535259246826, "learning_rate": 1.54323485768245e-05, "loss": 7.4614, "step": 20500 }, { "epoch": 2.1246458923512748, "grad_norm": 3.0066559314727783, "learning_rate": 1.4589235127478753e-05, "loss": 7.4522, "step": 21000 }, { "epoch": 2.1752326993120192, "grad_norm": 3.1596481800079346, "learning_rate": 1.3746121678133012e-05, "loss": 7.4373, "step": 21500 }, { "epoch": 2.225819506272764, "grad_norm": 3.6992828845977783, "learning_rate": 1.2903008228787267e-05, "loss": 7.4649, "step": 22000 }, { "epoch": 2.2764063132335086, "grad_norm": 2.809210777282715, "learning_rate": 1.2059894779441522e-05, "loss": 7.4695, "step": 22500 }, { "epoch": 2.3269931201942535, "grad_norm": 3.017029047012329, "learning_rate": 1.1216781330095779e-05, "loss": 7.439, "step": 23000 }, { "epoch": 2.377579927154998, "grad_norm": 3.747455596923828, "learning_rate": 1.0373667880750036e-05, "loss": 7.4468, "step": 23500 }, { "epoch": 2.4281667341157425, "grad_norm": 2.9909703731536865, "learning_rate": 9.53055443140429e-06, "loss": 7.4568, "step": 24000 }, { "epoch": 2.4787535410764874, "grad_norm": 2.6719205379486084, "learning_rate": 8.687440982058546e-06, "loss": 7.4599, "step": 24500 }, { "epoch": 2.529340348037232, "grad_norm": 3.0623087882995605, "learning_rate": 7.844327532712801e-06, "loss": 7.4613, "step": 25000 }, { "epoch": 2.5799271549979768, "grad_norm": 3.4007935523986816, "learning_rate": 7.001214083367058e-06, "loss": 7.4496, "step": 25500 }, { "epoch": 2.6305139619587212, "grad_norm": 3.7014873027801514, "learning_rate": 6.158100634021314e-06, "loss": 7.4569, "step": 26000 }, { "epoch": 2.6811007689194657, "grad_norm": 2.988811492919922, "learning_rate": 5.314987184675571e-06, "loss": 7.4381, "step": 26500 }, { "epoch": 2.7316875758802106, "grad_norm": 4.864758014678955, "learning_rate": 4.471873735329827e-06, "loss": 7.4846, "step": 27000 }, { "epoch": 2.782274382840955, "grad_norm": 2.810575246810913, "learning_rate": 3.628760285984082e-06, "loss": 7.4511, "step": 27500 }, { "epoch": 2.8328611898017, "grad_norm": 3.2787814140319824, "learning_rate": 2.785646836638338e-06, "loss": 7.4568, "step": 28000 }, { "epoch": 2.8834479967624445, "grad_norm": 3.12109637260437, "learning_rate": 1.942533387292594e-06, "loss": 7.4372, "step": 28500 }, { "epoch": 2.934034803723189, "grad_norm": 3.1420302391052246, "learning_rate": 1.0994199379468503e-06, "loss": 7.4484, "step": 29000 }, { "epoch": 2.9846216106839334, "grad_norm": 3.052818775177002, "learning_rate": 2.5630648860110616e-07, "loss": 7.461, "step": 29500 }, { "epoch": 3.0, "step": 29652, "total_flos": 1.2851147000984371e+17, "train_loss": 7.479052871051595, "train_runtime": 34661.0306, "train_samples_per_second": 1.711, "train_steps_per_second": 0.855 } ], "logging_steps": 500, "max_steps": 29652, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2851147000984371e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }