| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.17474879860200962, | |
| "eval_steps": 500, | |
| "global_step": 200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00436871996505024, | |
| "grad_norm": 9.839941024780273, | |
| "learning_rate": 8e-05, | |
| "loss": 2.5246, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.00873743993010048, | |
| "grad_norm": 13.773455619812012, | |
| "learning_rate": 0.00018, | |
| "loss": 1.1343, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01310615989515072, | |
| "grad_norm": 5.6580424308776855, | |
| "learning_rate": 0.0001999997582552296, | |
| "loss": 0.7712, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01747487986020096, | |
| "grad_norm": 5.294467926025391, | |
| "learning_rate": 0.0001999987761691029, | |
| "loss": 0.73, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.021843599825251202, | |
| "grad_norm": 2.8633503913879395, | |
| "learning_rate": 0.00019999703863998527, | |
| "loss": 0.7289, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.02621231979030144, | |
| "grad_norm": 3.2836177349090576, | |
| "learning_rate": 0.00019999454568100293, | |
| "loss": 0.4686, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03058103975535168, | |
| "grad_norm": 4.878258228302002, | |
| "learning_rate": 0.00019999129731098898, | |
| "loss": 0.6629, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.03494975972040192, | |
| "grad_norm": 2.899914026260376, | |
| "learning_rate": 0.00019998729355448326, | |
| "loss": 0.6038, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.039318479685452164, | |
| "grad_norm": 3.289844274520874, | |
| "learning_rate": 0.00019998253444173235, | |
| "loss": 0.4573, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.043687199650502405, | |
| "grad_norm": 2.957254648208618, | |
| "learning_rate": 0.00019997702000868896, | |
| "loss": 0.594, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.048055919615552646, | |
| "grad_norm": 3.171276807785034, | |
| "learning_rate": 0.00019997075029701207, | |
| "loss": 0.5719, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.05242463958060288, | |
| "grad_norm": 2.55605149269104, | |
| "learning_rate": 0.0001999637253540663, | |
| "loss": 0.5971, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.05679335954565312, | |
| "grad_norm": 2.127289295196533, | |
| "learning_rate": 0.00019995594523292178, | |
| "loss": 0.5712, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.06116207951070336, | |
| "grad_norm": 3.3928685188293457, | |
| "learning_rate": 0.00019994740999235359, | |
| "loss": 0.5712, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0655307994757536, | |
| "grad_norm": 2.6700279712677, | |
| "learning_rate": 0.00019993811969684142, | |
| "loss": 0.427, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.06989951944080385, | |
| "grad_norm": 2.6936633586883545, | |
| "learning_rate": 0.00019992807441656898, | |
| "loss": 0.5321, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.07426823940585409, | |
| "grad_norm": 3.9897687435150146, | |
| "learning_rate": 0.00019991727422742362, | |
| "loss": 0.6025, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.07863695937090433, | |
| "grad_norm": 2.3496663570404053, | |
| "learning_rate": 0.00019990571921099553, | |
| "loss": 0.5975, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08300567933595457, | |
| "grad_norm": 3.3796467781066895, | |
| "learning_rate": 0.0001998934094545774, | |
| "loss": 0.5255, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.08737439930100481, | |
| "grad_norm": 3.1103007793426514, | |
| "learning_rate": 0.00019988034505116352, | |
| "loss": 0.4946, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09174311926605505, | |
| "grad_norm": 2.002304792404175, | |
| "learning_rate": 0.00019986652609944926, | |
| "loss": 0.425, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.09611183923110529, | |
| "grad_norm": 1.7572168111801147, | |
| "learning_rate": 0.00019985195270383018, | |
| "loss": 0.6073, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10048055919615553, | |
| "grad_norm": 2.745215654373169, | |
| "learning_rate": 0.00019983662497440133, | |
| "loss": 0.586, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.10484927916120576, | |
| "grad_norm": 1.8170915842056274, | |
| "learning_rate": 0.0001998205430269564, | |
| "loss": 0.5255, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.109217999126256, | |
| "grad_norm": 1.4944056272506714, | |
| "learning_rate": 0.00019980370698298677, | |
| "loss": 0.4219, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.11358671909130624, | |
| "grad_norm": 1.6616989374160767, | |
| "learning_rate": 0.00019978611696968074, | |
| "loss": 0.4231, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.11795543905635648, | |
| "grad_norm": 2.0523645877838135, | |
| "learning_rate": 0.00019976777311992247, | |
| "loss": 0.5298, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.12232415902140673, | |
| "grad_norm": 2.065765619277954, | |
| "learning_rate": 0.00019974867557229098, | |
| "loss": 0.5228, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.12669287898645698, | |
| "grad_norm": 1.7283438444137573, | |
| "learning_rate": 0.00019972882447105912, | |
| "loss": 0.3452, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.1310615989515072, | |
| "grad_norm": 2.655750274658203, | |
| "learning_rate": 0.00019970821996619244, | |
| "loss": 0.508, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.13543031891655744, | |
| "grad_norm": 2.67799973487854, | |
| "learning_rate": 0.0001996868622133482, | |
| "loss": 0.4359, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.1397990388816077, | |
| "grad_norm": 1.6298809051513672, | |
| "learning_rate": 0.00019966475137387396, | |
| "loss": 0.5447, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14416775884665792, | |
| "grad_norm": 1.4772286415100098, | |
| "learning_rate": 0.00019964188761480657, | |
| "loss": 0.4105, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.14853647881170817, | |
| "grad_norm": 2.2986271381378174, | |
| "learning_rate": 0.00019961827110887083, | |
| "loss": 0.603, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1529051987767584, | |
| "grad_norm": 2.8261911869049072, | |
| "learning_rate": 0.00019959390203447817, | |
| "loss": 0.4649, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.15727391874180865, | |
| "grad_norm": 1.7771011590957642, | |
| "learning_rate": 0.00019956878057572524, | |
| "loss": 0.4394, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16164263870685888, | |
| "grad_norm": 1.7315421104431152, | |
| "learning_rate": 0.00019954290692239274, | |
| "loss": 0.5289, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.16601135867190914, | |
| "grad_norm": 1.6124423742294312, | |
| "learning_rate": 0.00019951628126994373, | |
| "loss": 0.4173, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17038007863695936, | |
| "grad_norm": 1.792577862739563, | |
| "learning_rate": 0.00019948890381952232, | |
| "loss": 0.4331, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.17474879860200962, | |
| "grad_norm": 1.9038774967193604, | |
| "learning_rate": 0.000199460774777952, | |
| "loss": 0.4247, | |
| "step": 200 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 5725, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 96943031820288.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |