{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.8243488794669895,
  "eval_steps": 500,
  "global_step": 4518,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04037956793862306,
      "grad_norm": 3.5625,
      "learning_rate": 5.878836833602585e-05,
      "loss": 0.1271,
      "step": 100
    },
    {
      "epoch": 0.08075913587724612,
      "grad_norm": 2.953125,
      "learning_rate": 5.7576736672051694e-05,
      "loss": 0.0977,
      "step": 200
    },
    {
      "epoch": 0.12113870381586916,
      "grad_norm": 1.7265625,
      "learning_rate": 5.636510500807755e-05,
      "loss": 0.077,
      "step": 300
    },
    {
      "epoch": 0.16151827175449224,
      "grad_norm": 4.90625,
      "learning_rate": 5.5153473344103394e-05,
      "loss": 0.0692,
      "step": 400
    },
    {
      "epoch": 0.20189783969311528,
      "grad_norm": 4.15625,
      "learning_rate": 5.394184168012924e-05,
      "loss": 0.0566,
      "step": 500
    },
    {
      "epoch": 0.24227740763173833,
      "grad_norm": 2.421875,
      "learning_rate": 5.2730210016155086e-05,
      "loss": 0.0594,
      "step": 600
    },
    {
      "epoch": 0.2826569755703614,
      "grad_norm": 2.109375,
      "learning_rate": 5.1518578352180936e-05,
      "loss": 0.0591,
      "step": 700
    },
    {
      "epoch": 0.3230365435089845,
      "grad_norm": 3.21875,
      "learning_rate": 5.030694668820679e-05,
      "loss": 0.068,
      "step": 800
    },
    {
      "epoch": 0.3634161114476075,
      "grad_norm": 2.921875,
      "learning_rate": 4.9095315024232635e-05,
      "loss": 0.0563,
      "step": 900
    },
    {
      "epoch": 0.40379567938623057,
      "grad_norm": 1.0,
      "learning_rate": 4.7883683360258485e-05,
      "loss": 0.0562,
      "step": 1000
    },
    {
      "epoch": 0.44417524732485364,
      "grad_norm": 2.46875,
      "learning_rate": 4.667205169628433e-05,
      "loss": 0.0601,
      "step": 1100
    },
    {
      "epoch": 0.48455481526347666,
      "grad_norm": 1.6953125,
      "learning_rate": 4.546042003231018e-05,
      "loss": 0.0455,
      "step": 1200
    },
    {
      "epoch": 0.5249343832020997,
      "grad_norm": 1.3359375,
      "learning_rate": 4.424878836833603e-05,
      "loss": 0.0472,
      "step": 1300
    },
    {
      "epoch": 0.5653139511407228,
      "grad_norm": 1.78125,
      "learning_rate": 4.303715670436188e-05,
      "loss": 0.0561,
      "step": 1400
    },
    {
      "epoch": 0.6056935190793459,
      "grad_norm": 1.390625,
      "learning_rate": 4.1825525040387727e-05,
      "loss": 0.0517,
      "step": 1500
    },
    {
      "epoch": 0.646073087017969,
      "grad_norm": 0.90625,
      "learning_rate": 4.061389337641357e-05,
      "loss": 0.0449,
      "step": 1600
    },
    {
      "epoch": 0.6864526549565919,
      "grad_norm": 3.28125,
      "learning_rate": 3.940226171243942e-05,
      "loss": 0.0425,
      "step": 1700
    },
    {
      "epoch": 0.726832222895215,
      "grad_norm": 1.546875,
      "learning_rate": 3.819063004846526e-05,
      "loss": 0.0462,
      "step": 1800
    },
    {
      "epoch": 0.7672117908338381,
      "grad_norm": 1.53125,
      "learning_rate": 3.697899838449112e-05,
      "loss": 0.0488,
      "step": 1900
    },
    {
      "epoch": 0.8075913587724611,
      "grad_norm": 5.28125,
      "learning_rate": 3.576736672051697e-05,
      "loss": 0.0447,
      "step": 2000
    },
    {
      "epoch": 0.8479709267110842,
      "grad_norm": 2.859375,
      "learning_rate": 3.455573505654281e-05,
      "loss": 0.0477,
      "step": 2100
    },
    {
      "epoch": 0.8883504946497073,
      "grad_norm": 2.4375,
      "learning_rate": 3.334410339256866e-05,
      "loss": 0.0489,
      "step": 2200
    },
    {
      "epoch": 0.9287300625883304,
      "grad_norm": 0.69921875,
      "learning_rate": 3.2132471728594504e-05,
      "loss": 0.0508,
      "step": 2300
    },
    {
      "epoch": 0.9691096305269533,
      "grad_norm": 2.34375,
      "learning_rate": 3.092084006462036e-05,
      "loss": 0.0396,
      "step": 2400
    },
    {
      "epoch": 1.0094891984655765,
      "grad_norm": 1.6015625,
      "learning_rate": 2.9709208400646203e-05,
      "loss": 0.0486,
      "step": 2500
    },
    {
      "epoch": 1.0498687664041995,
      "grad_norm": 4.6875,
      "learning_rate": 2.8497576736672053e-05,
      "loss": 0.0349,
      "step": 2600
    },
    {
      "epoch": 1.0902483343428226,
      "grad_norm": 0.671875,
      "learning_rate": 2.72859450726979e-05,
      "loss": 0.0277,
      "step": 2700
    },
    {
      "epoch": 1.1306279022814456,
      "grad_norm": 1.1484375,
      "learning_rate": 2.607431340872375e-05,
      "loss": 0.0291,
      "step": 2800
    },
    {
      "epoch": 1.1710074702200686,
      "grad_norm": 3.890625,
      "learning_rate": 2.4862681744749595e-05,
      "loss": 0.0327,
      "step": 2900
    },
    {
      "epoch": 1.2113870381586918,
      "grad_norm": 1.2109375,
      "learning_rate": 2.3651050080775445e-05,
      "loss": 0.0324,
      "step": 3000
    },
    {
      "epoch": 1.2517666060973147,
      "grad_norm": 1.953125,
      "learning_rate": 2.2439418416801295e-05,
      "loss": 0.0337,
      "step": 3100
    },
    {
      "epoch": 1.292146174035938,
      "grad_norm": 3.46875,
      "learning_rate": 2.122778675282714e-05,
      "loss": 0.0318,
      "step": 3200
    },
    {
      "epoch": 1.3325257419745609,
      "grad_norm": 0.703125,
      "learning_rate": 2.0016155088852987e-05,
      "loss": 0.0347,
      "step": 3300
    },
    {
      "epoch": 1.3729053099131838,
      "grad_norm": 2.875,
      "learning_rate": 1.8804523424878837e-05,
      "loss": 0.0299,
      "step": 3400
    },
    {
      "epoch": 1.413284877851807,
      "grad_norm": 2.078125,
      "learning_rate": 1.7592891760904683e-05,
      "loss": 0.0292,
      "step": 3500
    },
    {
      "epoch": 1.45366444579043,
      "grad_norm": 0.80078125,
      "learning_rate": 1.6381260096930536e-05,
      "loss": 0.0321,
      "step": 3600
    },
    {
      "epoch": 1.4940440137290532,
      "grad_norm": 2.359375,
      "learning_rate": 1.5169628432956381e-05,
      "loss": 0.0305,
      "step": 3700
    },
    {
      "epoch": 1.5344235816676761,
      "grad_norm": 0.6171875,
      "learning_rate": 1.395799676898223e-05,
      "loss": 0.0289,
      "step": 3800
    },
    {
      "epoch": 1.574803149606299,
      "grad_norm": 2.765625,
      "learning_rate": 1.2746365105008077e-05,
      "loss": 0.0304,
      "step": 3900
    },
    {
      "epoch": 1.6151827175449223,
      "grad_norm": 1.78125,
      "learning_rate": 1.1534733441033925e-05,
      "loss": 0.0346,
      "step": 4000
    },
    {
      "epoch": 1.6555622854835454,
      "grad_norm": 1.0,
      "learning_rate": 1.0323101777059775e-05,
      "loss": 0.033,
      "step": 4100
    },
    {
      "epoch": 1.6959418534221684,
      "grad_norm": 1.8828125,
      "learning_rate": 9.111470113085623e-06,
      "loss": 0.0328,
      "step": 4200
    },
    {
      "epoch": 1.7363214213607914,
      "grad_norm": 1.40625,
      "learning_rate": 7.89983844911147e-06,
      "loss": 0.0306,
      "step": 4300
    },
    {
      "epoch": 1.7767009892994143,
      "grad_norm": 3.375,
      "learning_rate": 6.6882067851373186e-06,
      "loss": 0.0303,
      "step": 4400
    },
    {
      "epoch": 1.8170805572380375,
      "grad_norm": 1.328125,
      "learning_rate": 5.4765751211631666e-06,
      "loss": 0.0312,
      "step": 4500
    }
  ],
  "logging_steps": 100,
  "max_steps": 4952,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 502,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}