| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.2621231979030144, | |
| "eval_steps": 500, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00436871996505024, | |
| "grad_norm": 9.839941024780273, | |
| "learning_rate": 8e-05, | |
| "loss": 2.5246, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.00873743993010048, | |
| "grad_norm": 13.773455619812012, | |
| "learning_rate": 0.00018, | |
| "loss": 1.1343, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01310615989515072, | |
| "grad_norm": 5.6580424308776855, | |
| "learning_rate": 0.0001999997582552296, | |
| "loss": 0.7712, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01747487986020096, | |
| "grad_norm": 5.294467926025391, | |
| "learning_rate": 0.0001999987761691029, | |
| "loss": 0.73, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.021843599825251202, | |
| "grad_norm": 2.8633503913879395, | |
| "learning_rate": 0.00019999703863998527, | |
| "loss": 0.7289, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.02621231979030144, | |
| "grad_norm": 3.2836177349090576, | |
| "learning_rate": 0.00019999454568100293, | |
| "loss": 0.4686, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03058103975535168, | |
| "grad_norm": 4.878258228302002, | |
| "learning_rate": 0.00019999129731098898, | |
| "loss": 0.6629, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.03494975972040192, | |
| "grad_norm": 2.899914026260376, | |
| "learning_rate": 0.00019998729355448326, | |
| "loss": 0.6038, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.039318479685452164, | |
| "grad_norm": 3.289844274520874, | |
| "learning_rate": 0.00019998253444173235, | |
| "loss": 0.4573, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.043687199650502405, | |
| "grad_norm": 2.957254648208618, | |
| "learning_rate": 0.00019997702000868896, | |
| "loss": 0.594, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.048055919615552646, | |
| "grad_norm": 3.171276807785034, | |
| "learning_rate": 0.00019997075029701207, | |
| "loss": 0.5719, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.05242463958060288, | |
| "grad_norm": 2.55605149269104, | |
| "learning_rate": 0.0001999637253540663, | |
| "loss": 0.5971, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.05679335954565312, | |
| "grad_norm": 2.127289295196533, | |
| "learning_rate": 0.00019995594523292178, | |
| "loss": 0.5712, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.06116207951070336, | |
| "grad_norm": 3.3928685188293457, | |
| "learning_rate": 0.00019994740999235359, | |
| "loss": 0.5712, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0655307994757536, | |
| "grad_norm": 2.6700279712677, | |
| "learning_rate": 0.00019993811969684142, | |
| "loss": 0.427, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.06989951944080385, | |
| "grad_norm": 2.6936633586883545, | |
| "learning_rate": 0.00019992807441656898, | |
| "loss": 0.5321, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.07426823940585409, | |
| "grad_norm": 3.9897687435150146, | |
| "learning_rate": 0.00019991727422742362, | |
| "loss": 0.6025, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.07863695937090433, | |
| "grad_norm": 2.3496663570404053, | |
| "learning_rate": 0.00019990571921099553, | |
| "loss": 0.5975, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08300567933595457, | |
| "grad_norm": 3.3796467781066895, | |
| "learning_rate": 0.0001998934094545774, | |
| "loss": 0.5255, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.08737439930100481, | |
| "grad_norm": 3.1103007793426514, | |
| "learning_rate": 0.00019988034505116352, | |
| "loss": 0.4946, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09174311926605505, | |
| "grad_norm": 2.002304792404175, | |
| "learning_rate": 0.00019986652609944926, | |
| "loss": 0.425, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.09611183923110529, | |
| "grad_norm": 1.7572168111801147, | |
| "learning_rate": 0.00019985195270383018, | |
| "loss": 0.6073, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10048055919615553, | |
| "grad_norm": 2.745215654373169, | |
| "learning_rate": 0.00019983662497440133, | |
| "loss": 0.586, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.10484927916120576, | |
| "grad_norm": 1.8170915842056274, | |
| "learning_rate": 0.0001998205430269564, | |
| "loss": 0.5255, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.109217999126256, | |
| "grad_norm": 1.4944056272506714, | |
| "learning_rate": 0.00019980370698298677, | |
| "loss": 0.4219, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.11358671909130624, | |
| "grad_norm": 1.6616989374160767, | |
| "learning_rate": 0.00019978611696968074, | |
| "loss": 0.4231, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.11795543905635648, | |
| "grad_norm": 2.0523645877838135, | |
| "learning_rate": 0.00019976777311992247, | |
| "loss": 0.5298, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.12232415902140673, | |
| "grad_norm": 2.065765619277954, | |
| "learning_rate": 0.00019974867557229098, | |
| "loss": 0.5228, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.12669287898645698, | |
| "grad_norm": 1.7283438444137573, | |
| "learning_rate": 0.00019972882447105912, | |
| "loss": 0.3452, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.1310615989515072, | |
| "grad_norm": 2.655750274658203, | |
| "learning_rate": 0.00019970821996619244, | |
| "loss": 0.508, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.13543031891655744, | |
| "grad_norm": 2.67799973487854, | |
| "learning_rate": 0.0001996868622133482, | |
| "loss": 0.4359, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.1397990388816077, | |
| "grad_norm": 1.6298809051513672, | |
| "learning_rate": 0.00019966475137387396, | |
| "loss": 0.5447, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14416775884665792, | |
| "grad_norm": 1.4772286415100098, | |
| "learning_rate": 0.00019964188761480657, | |
| "loss": 0.4105, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.14853647881170817, | |
| "grad_norm": 2.2986271381378174, | |
| "learning_rate": 0.00019961827110887083, | |
| "loss": 0.603, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1529051987767584, | |
| "grad_norm": 2.8261911869049072, | |
| "learning_rate": 0.00019959390203447817, | |
| "loss": 0.4649, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.15727391874180865, | |
| "grad_norm": 1.7771011590957642, | |
| "learning_rate": 0.00019956878057572524, | |
| "loss": 0.4394, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16164263870685888, | |
| "grad_norm": 1.7315421104431152, | |
| "learning_rate": 0.00019954290692239274, | |
| "loss": 0.5289, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.16601135867190914, | |
| "grad_norm": 1.6124423742294312, | |
| "learning_rate": 0.00019951628126994373, | |
| "loss": 0.4173, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17038007863695936, | |
| "grad_norm": 1.792577862739563, | |
| "learning_rate": 0.00019948890381952232, | |
| "loss": 0.4331, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.17474879860200962, | |
| "grad_norm": 1.9038774967193604, | |
| "learning_rate": 0.000199460774777952, | |
| "loss": 0.4247, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.17911751856705985, | |
| "grad_norm": 2.457122802734375, | |
| "learning_rate": 0.00019943189435773432, | |
| "loss": 0.4519, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.1834862385321101, | |
| "grad_norm": 1.97683584690094, | |
| "learning_rate": 0.00019940226277704706, | |
| "loss": 0.4761, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.18785495849716033, | |
| "grad_norm": 2.1646862030029297, | |
| "learning_rate": 0.0001993718802597426, | |
| "loss": 0.5294, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.19222367846221058, | |
| "grad_norm": 1.565412998199463, | |
| "learning_rate": 0.00019934074703534637, | |
| "loss": 0.3999, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1965923984272608, | |
| "grad_norm": 2.4315876960754395, | |
| "learning_rate": 0.00019930886333905504, | |
| "loss": 0.378, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.20096111839231107, | |
| "grad_norm": 2.7567529678344727, | |
| "learning_rate": 0.00019927622941173467, | |
| "loss": 0.5075, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2053298383573613, | |
| "grad_norm": 1.8640387058258057, | |
| "learning_rate": 0.00019924284549991902, | |
| "loss": 0.4749, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.20969855832241152, | |
| "grad_norm": 2.090924024581909, | |
| "learning_rate": 0.00019920871185580757, | |
| "loss": 0.4353, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.21406727828746178, | |
| "grad_norm": 1.9691081047058105, | |
| "learning_rate": 0.00019917382873726376, | |
| "loss": 0.4051, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.218435998252512, | |
| "grad_norm": 1.8130213022232056, | |
| "learning_rate": 0.0001991381964078128, | |
| "loss": 0.526, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.22280471821756226, | |
| "grad_norm": 2.078805923461914, | |
| "learning_rate": 0.00019910181513664, | |
| "loss": 0.5654, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.22717343818261249, | |
| "grad_norm": 2.0686287879943848, | |
| "learning_rate": 0.0001990646851985884, | |
| "loss": 0.43, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.23154215814766274, | |
| "grad_norm": 1.475821614265442, | |
| "learning_rate": 0.00019902680687415705, | |
| "loss": 0.355, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.23591087811271297, | |
| "grad_norm": 1.901236891746521, | |
| "learning_rate": 0.0001989881804494985, | |
| "loss": 0.4522, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24027959807776322, | |
| "grad_norm": 1.2583553791046143, | |
| "learning_rate": 0.00019894880621641704, | |
| "loss": 0.3869, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.24464831804281345, | |
| "grad_norm": 1.712336540222168, | |
| "learning_rate": 0.00019890868447236613, | |
| "loss": 0.454, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.2490170380078637, | |
| "grad_norm": 2.3967206478118896, | |
| "learning_rate": 0.00019886781552044634, | |
| "loss": 0.4074, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.25338575797291396, | |
| "grad_norm": 2.0578925609588623, | |
| "learning_rate": 0.0001988261996694032, | |
| "loss": 0.4268, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.2577544779379642, | |
| "grad_norm": 1.7411088943481445, | |
| "learning_rate": 0.0001987838372336245, | |
| "loss": 0.334, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.2621231979030144, | |
| "grad_norm": 1.8145533800125122, | |
| "learning_rate": 0.0001987407285331382, | |
| "loss": 0.4019, | |
| "step": 300 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 5725, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 147111217127424.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |