{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 5000,
  "global_step": 29652,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.050586806960744635,
      "grad_norm": 3.026057720184326,
      "learning_rate": 4.9156886550654254e-05,
      "loss": 7.7479,
      "step": 500
    },
    {
      "epoch": 0.10117361392148927,
      "grad_norm": 2.428532361984253,
      "learning_rate": 4.831377310130851e-05,
      "loss": 7.602,
      "step": 1000
    },
    {
      "epoch": 0.1517604208822339,
      "grad_norm": 3.12160587310791,
      "learning_rate": 4.7470659651962764e-05,
      "loss": 7.5609,
      "step": 1500
    },
    {
      "epoch": 0.20234722784297854,
      "grad_norm": 2.7097604274749756,
      "learning_rate": 4.662754620261703e-05,
      "loss": 7.5615,
      "step": 2000
    },
    {
      "epoch": 0.2529340348037232,
      "grad_norm": 2.684849500656128,
      "learning_rate": 4.578443275327128e-05,
      "loss": 7.5591,
      "step": 2500
    },
    {
      "epoch": 0.3035208417644678,
      "grad_norm": 2.7275538444519043,
      "learning_rate": 4.494131930392554e-05,
      "loss": 7.5242,
      "step": 3000
    },
    {
      "epoch": 0.35410764872521244,
      "grad_norm": 3.8471314907073975,
      "learning_rate": 4.409820585457979e-05,
      "loss": 7.5216,
      "step": 3500
    },
    {
      "epoch": 0.4046944556859571,
      "grad_norm": 3.268939971923828,
      "learning_rate": 4.325509240523405e-05,
      "loss": 7.5121,
      "step": 4000
    },
    {
      "epoch": 0.45528126264670177,
      "grad_norm": 4.609728813171387,
      "learning_rate": 4.24119789558883e-05,
      "loss": 7.4998,
      "step": 4500
    },
    {
      "epoch": 0.5058680696074463,
      "grad_norm": 3.0856144428253174,
      "learning_rate": 4.156886550654257e-05,
      "loss": 7.4945,
      "step": 5000
    },
    {
      "epoch": 0.556454876568191,
      "grad_norm": 3.188549757003784,
      "learning_rate": 4.072575205719682e-05,
      "loss": 7.4944,
      "step": 5500
    },
    {
      "epoch": 0.6070416835289356,
      "grad_norm": 3.380218982696533,
      "learning_rate": 3.988263860785108e-05,
      "loss": 7.5203,
      "step": 6000
    },
    {
      "epoch": 0.6576284904896803,
      "grad_norm": 3.2636780738830566,
      "learning_rate": 3.903952515850533e-05,
      "loss": 7.4899,
      "step": 6500
    },
    {
      "epoch": 0.7082152974504249,
      "grad_norm": 3.4519433975219727,
      "learning_rate": 3.819641170915959e-05,
      "loss": 7.4763,
      "step": 7000
    },
    {
      "epoch": 0.7588021044111696,
      "grad_norm": 3.5375866889953613,
      "learning_rate": 3.735329825981384e-05,
      "loss": 7.4911,
      "step": 7500
    },
    {
      "epoch": 0.8093889113719142,
      "grad_norm": 3.090251922607422,
      "learning_rate": 3.65101848104681e-05,
      "loss": 7.4606,
      "step": 8000
    },
    {
      "epoch": 0.8599757183326588,
      "grad_norm": 2.6117258071899414,
      "learning_rate": 3.5667071361122356e-05,
      "loss": 7.4925,
      "step": 8500
    },
    {
      "epoch": 0.9105625252934035,
      "grad_norm": 3.3298494815826416,
      "learning_rate": 3.482395791177661e-05,
      "loss": 7.4626,
      "step": 9000
    },
    {
      "epoch": 0.9611493322541481,
      "grad_norm": 3.267622232437134,
      "learning_rate": 3.398084446243087e-05,
      "loss": 7.4583,
      "step": 9500
    },
    {
      "epoch": 1.0117361392148927,
      "grad_norm": 3.9384641647338867,
      "learning_rate": 3.313773101308512e-05,
      "loss": 7.471,
      "step": 10000
    },
    {
      "epoch": 1.0623229461756374,
      "grad_norm": 3.3493237495422363,
      "learning_rate": 3.229461756373938e-05,
      "loss": 7.4833,
      "step": 10500
    },
    {
      "epoch": 1.112909753136382,
      "grad_norm": 4.062368869781494,
      "learning_rate": 3.145150411439363e-05,
      "loss": 7.4637,
      "step": 11000
    },
    {
      "epoch": 1.1634965600971268,
      "grad_norm": 3.0285532474517822,
      "learning_rate": 3.0608390665047894e-05,
      "loss": 7.4524,
      "step": 11500
    },
    {
      "epoch": 1.2140833670578712,
      "grad_norm": 3.087231159210205,
      "learning_rate": 2.976527721570215e-05,
      "loss": 7.4797,
      "step": 12000
    },
    {
      "epoch": 1.264670174018616,
      "grad_norm": 2.7964272499084473,
      "learning_rate": 2.8922163766356404e-05,
      "loss": 7.4641,
      "step": 12500
    },
    {
      "epoch": 1.3152569809793606,
      "grad_norm": 3.68481183052063,
      "learning_rate": 2.807905031701066e-05,
      "loss": 7.452,
      "step": 13000
    },
    {
      "epoch": 1.3658437879401053,
      "grad_norm": 3.3050220012664795,
      "learning_rate": 2.7235936867664915e-05,
      "loss": 7.4593,
      "step": 13500
    },
    {
      "epoch": 1.41643059490085,
      "grad_norm": 2.939967155456543,
      "learning_rate": 2.639282341831917e-05,
      "loss": 7.4565,
      "step": 14000
    },
    {
      "epoch": 1.4670174018615945,
      "grad_norm": 2.9299378395080566,
      "learning_rate": 2.5549709968973428e-05,
      "loss": 7.4769,
      "step": 14500
    },
    {
      "epoch": 1.5176042088223392,
      "grad_norm": 4.469327449798584,
      "learning_rate": 2.470659651962768e-05,
      "loss": 7.4498,
      "step": 15000
    },
    {
      "epoch": 1.5681910157830838,
      "grad_norm": 3.3183658123016357,
      "learning_rate": 2.386348307028194e-05,
      "loss": 7.4527,
      "step": 15500
    },
    {
      "epoch": 1.6187778227438283,
      "grad_norm": 3.6595232486724854,
      "learning_rate": 2.3020369620936194e-05,
      "loss": 7.4346,
      "step": 16000
    },
    {
      "epoch": 1.669364629704573,
      "grad_norm": 3.1423637866973877,
      "learning_rate": 2.217725617159045e-05,
      "loss": 7.4697,
      "step": 16500
    },
    {
      "epoch": 1.7199514366653177,
      "grad_norm": 2.9798882007598877,
      "learning_rate": 2.1334142722244707e-05,
      "loss": 7.4587,
      "step": 17000
    },
    {
      "epoch": 1.7705382436260622,
      "grad_norm": 3.496962547302246,
      "learning_rate": 2.0491029272898962e-05,
      "loss": 7.4842,
      "step": 17500
    },
    {
      "epoch": 1.821125050586807,
      "grad_norm": 3.2860915660858154,
      "learning_rate": 1.9647915823553217e-05,
      "loss": 7.4505,
      "step": 18000
    },
    {
      "epoch": 1.8717118575475515,
      "grad_norm": 3.7444324493408203,
      "learning_rate": 1.8804802374207476e-05,
      "loss": 7.467,
      "step": 18500
    },
    {
      "epoch": 1.9222986645082962,
      "grad_norm": 12.527898788452148,
      "learning_rate": 1.796168892486173e-05,
      "loss": 7.4629,
      "step": 19000
    },
    {
      "epoch": 1.972885471469041,
      "grad_norm": 3.0026357173919678,
      "learning_rate": 1.7118575475515986e-05,
      "loss": 7.4508,
      "step": 19500
    },
    {
      "epoch": 2.0234722784297854,
      "grad_norm": 3.080428123474121,
      "learning_rate": 1.6275462026170245e-05,
      "loss": 7.4562,
      "step": 20000
    },
    {
      "epoch": 2.0740590853905303,
      "grad_norm": 3.0115535259246826,
      "learning_rate": 1.54323485768245e-05,
      "loss": 7.4614,
      "step": 20500
    },
    {
      "epoch": 2.1246458923512748,
      "grad_norm": 3.0066559314727783,
      "learning_rate": 1.4589235127478753e-05,
      "loss": 7.4522,
      "step": 21000
    },
    {
      "epoch": 2.1752326993120192,
      "grad_norm": 3.1596481800079346,
      "learning_rate": 1.3746121678133012e-05,
      "loss": 7.4373,
      "step": 21500
    },
    {
      "epoch": 2.225819506272764,
      "grad_norm": 3.6992828845977783,
      "learning_rate": 1.2903008228787267e-05,
      "loss": 7.4649,
      "step": 22000
    },
    {
      "epoch": 2.2764063132335086,
      "grad_norm": 2.809210777282715,
      "learning_rate": 1.2059894779441522e-05,
      "loss": 7.4695,
      "step": 22500
    },
    {
      "epoch": 2.3269931201942535,
      "grad_norm": 3.017029047012329,
      "learning_rate": 1.1216781330095779e-05,
      "loss": 7.439,
      "step": 23000
    },
    {
      "epoch": 2.377579927154998,
      "grad_norm": 3.747455596923828,
      "learning_rate": 1.0373667880750036e-05,
      "loss": 7.4468,
      "step": 23500
    },
    {
      "epoch": 2.4281667341157425,
      "grad_norm": 2.9909703731536865,
      "learning_rate": 9.53055443140429e-06,
      "loss": 7.4568,
      "step": 24000
    },
    {
      "epoch": 2.4787535410764874,
      "grad_norm": 2.6719205379486084,
      "learning_rate": 8.687440982058546e-06,
      "loss": 7.4599,
      "step": 24500
    },
    {
      "epoch": 2.529340348037232,
      "grad_norm": 3.0623087882995605,
      "learning_rate": 7.844327532712801e-06,
      "loss": 7.4613,
      "step": 25000
    },
    {
      "epoch": 2.5799271549979768,
      "grad_norm": 3.4007935523986816,
      "learning_rate": 7.001214083367058e-06,
      "loss": 7.4496,
      "step": 25500
    },
    {
      "epoch": 2.6305139619587212,
      "grad_norm": 3.7014873027801514,
      "learning_rate": 6.158100634021314e-06,
      "loss": 7.4569,
      "step": 26000
    },
    {
      "epoch": 2.6811007689194657,
      "grad_norm": 2.988811492919922,
      "learning_rate": 5.314987184675571e-06,
      "loss": 7.4381,
      "step": 26500
    },
    {
      "epoch": 2.7316875758802106,
      "grad_norm": 4.864758014678955,
      "learning_rate": 4.471873735329827e-06,
      "loss": 7.4846,
      "step": 27000
    },
    {
      "epoch": 2.782274382840955,
      "grad_norm": 2.810575246810913,
      "learning_rate": 3.628760285984082e-06,
      "loss": 7.4511,
      "step": 27500
    },
    {
      "epoch": 2.8328611898017,
      "grad_norm": 3.2787814140319824,
      "learning_rate": 2.785646836638338e-06,
      "loss": 7.4568,
      "step": 28000
    },
    {
      "epoch": 2.8834479967624445,
      "grad_norm": 3.12109637260437,
      "learning_rate": 1.942533387292594e-06,
      "loss": 7.4372,
      "step": 28500
    },
    {
      "epoch": 2.934034803723189,
      "grad_norm": 3.1420302391052246,
      "learning_rate": 1.0994199379468503e-06,
      "loss": 7.4484,
      "step": 29000
    },
    {
      "epoch": 2.9846216106839334,
      "grad_norm": 3.052818775177002,
      "learning_rate": 2.5630648860110616e-07,
      "loss": 7.461,
      "step": 29500
    },
    {
      "epoch": 3.0,
      "step": 29652,
      "total_flos": 1.2851147000984371e+17,
      "train_loss": 7.479052871051595,
      "train_runtime": 34661.0306,
      "train_samples_per_second": 1.711,
      "train_steps_per_second": 0.855
    }
  ],
  "logging_steps": 500,
  "max_steps": 29652,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2851147000984371e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}