{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 48,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0625,
      "grad_norm": 6.530343055725098,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.101,
      "step": 1
    },
    {
      "epoch": 0.125,
      "grad_norm": 6.764411926269531,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.0766,
      "step": 2
    },
    {
      "epoch": 0.1875,
      "grad_norm": 6.143789768218994,
      "learning_rate": 6e-06,
      "loss": 1.0221,
      "step": 3
    },
    {
      "epoch": 0.25,
      "grad_norm": 4.986618518829346,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.045,
      "step": 4
    },
    {
      "epoch": 0.3125,
      "grad_norm": 2.9099574089050293,
      "learning_rate": 1e-05,
      "loss": 0.9587,
      "step": 5
    },
    {
      "epoch": 0.375,
      "grad_norm": 5.138437271118164,
      "learning_rate": 9.986661418317759e-06,
      "loss": 0.9389,
      "step": 6
    },
    {
      "epoch": 0.4375,
      "grad_norm": 5.72763204574585,
      "learning_rate": 9.946716840375552e-06,
      "loss": 0.9971,
      "step": 7
    },
    {
      "epoch": 0.5,
      "grad_norm": 6.512542247772217,
      "learning_rate": 9.880379387779637e-06,
      "loss": 0.9481,
      "step": 8
    },
    {
      "epoch": 0.5625,
      "grad_norm": 4.968111991882324,
      "learning_rate": 9.78800299954203e-06,
      "loss": 0.9156,
      "step": 9
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.9014463424682617,
      "learning_rate": 9.670080543662742e-06,
      "loss": 0.8799,
      "step": 10
    },
    {
      "epoch": 0.6875,
      "grad_norm": 2.4352829456329346,
      "learning_rate": 9.527241187465735e-06,
      "loss": 0.9048,
      "step": 11
    },
    {
      "epoch": 0.75,
      "grad_norm": 2.237089157104492,
      "learning_rate": 9.36024704071904e-06,
      "loss": 0.8642,
      "step": 12
    },
    {
      "epoch": 0.8125,
      "grad_norm": 1.9950438737869263,
      "learning_rate": 9.16998908944939e-06,
      "loss": 0.849,
      "step": 13
    },
    {
      "epoch": 0.875,
      "grad_norm": 1.2918341159820557,
      "learning_rate": 8.957482442146271e-06,
      "loss": 0.8796,
      "step": 14
    },
    {
      "epoch": 0.9375,
      "grad_norm": 1.4066290855407715,
      "learning_rate": 8.72386091371891e-06,
      "loss": 0.7775,
      "step": 15
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.3670933246612549,
      "learning_rate": 8.470370976103171e-06,
      "loss": 0.7598,
      "step": 16
    },
    {
      "epoch": 1.0625,
      "grad_norm": 1.154747486114502,
      "learning_rate": 8.198365107794457e-06,
      "loss": 0.8226,
      "step": 17
    },
    {
      "epoch": 1.125,
      "grad_norm": 0.9220020771026611,
      "learning_rate": 7.909294577789765e-06,
      "loss": 0.7433,
      "step": 18
    },
    {
      "epoch": 1.1875,
      "grad_norm": 1.0542454719543457,
      "learning_rate": 7.604701702439652e-06,
      "loss": 0.7302,
      "step": 19
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.983396589756012,
      "learning_rate": 7.286211616523193e-06,
      "loss": 0.8066,
      "step": 20
    },
    {
      "epoch": 1.3125,
      "grad_norm": 0.8708000779151917,
      "learning_rate": 6.95552360245078e-06,
      "loss": 0.7155,
      "step": 21
    },
    {
      "epoch": 1.375,
      "grad_norm": 0.8514654040336609,
      "learning_rate": 6.614402023857231e-06,
      "loss": 0.7416,
      "step": 22
    },
    {
      "epoch": 1.4375,
      "grad_norm": 0.7727275490760803,
      "learning_rate": 6.264666911958404e-06,
      "loss": 0.7355,
      "step": 23
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.7927507162094116,
      "learning_rate": 5.908184254897183e-06,
      "loss": 0.7514,
      "step": 24
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.8321970105171204,
      "learning_rate": 5.546856041889374e-06,
      "loss": 0.7772,
      "step": 25
    },
    {
      "epoch": 1.625,
      "grad_norm": 0.749494731426239,
      "learning_rate": 5.182610115288296e-06,
      "loss": 0.7391,
      "step": 26
    },
    {
      "epoch": 1.6875,
      "grad_norm": 0.680793821811676,
      "learning_rate": 4.817389884711706e-06,
      "loss": 0.7486,
      "step": 27
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.6702898144721985,
      "learning_rate": 4.4531439581106295e-06,
      "loss": 0.7909,
      "step": 28
    },
    {
      "epoch": 1.8125,
      "grad_norm": 0.5399229526519775,
      "learning_rate": 4.091815745102818e-06,
      "loss": 0.7535,
      "step": 29
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.5391576290130615,
      "learning_rate": 3.7353330880415963e-06,
      "loss": 0.7276,
      "step": 30
    },
    {
      "epoch": 1.9375,
      "grad_norm": 0.6090447306632996,
      "learning_rate": 3.3855979761427705e-06,
      "loss": 0.7982,
      "step": 31
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.571646511554718,
      "learning_rate": 3.044476397549221e-06,
      "loss": 0.7689,
      "step": 32
    },
    {
      "epoch": 2.0625,
      "grad_norm": 0.5187521576881409,
      "learning_rate": 2.7137883834768076e-06,
      "loss": 0.735,
      "step": 33
    },
    {
      "epoch": 2.125,
      "grad_norm": 0.5552323460578918,
      "learning_rate": 2.3952982975603494e-06,
      "loss": 0.772,
      "step": 34
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.4510865807533264,
      "learning_rate": 2.0907054222102367e-06,
      "loss": 0.7111,
      "step": 35
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.46868014335632324,
      "learning_rate": 1.8016348922055448e-06,
      "loss": 0.6825,
      "step": 36
    },
    {
      "epoch": 2.3125,
      "grad_norm": 0.49473336338996887,
      "learning_rate": 1.5296290238968303e-06,
      "loss": 0.6869,
      "step": 37
    },
    {
      "epoch": 2.375,
      "grad_norm": 0.43930625915527344,
      "learning_rate": 1.2761390862810907e-06,
      "loss": 0.6477,
      "step": 38
    },
    {
      "epoch": 2.4375,
      "grad_norm": 0.43224358558654785,
      "learning_rate": 1.04251755785373e-06,
      "loss": 0.7017,
      "step": 39
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.4535328447818756,
      "learning_rate": 8.30010910550611e-07,
      "loss": 0.6958,
      "step": 40
    },
    {
      "epoch": 2.5625,
      "grad_norm": 0.44307926297187805,
      "learning_rate": 6.397529592809615e-07,
      "loss": 0.6983,
      "step": 41
    },
    {
      "epoch": 2.625,
      "grad_norm": 0.44362759590148926,
      "learning_rate": 4.727588125342669e-07,
      "loss": 0.6935,
      "step": 42
    },
    {
      "epoch": 2.6875,
      "grad_norm": 0.38771650195121765,
      "learning_rate": 3.299194563372604e-07,
      "loss": 0.7581,
      "step": 43
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.45481687784194946,
      "learning_rate": 2.1199700045797077e-07,
      "loss": 0.7712,
      "step": 44
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.42201128602027893,
      "learning_rate": 1.196206122203647e-07,
      "loss": 0.7006,
      "step": 45
    },
    {
      "epoch": 2.875,
      "grad_norm": 0.47298797965049744,
      "learning_rate": 5.3283159624448745e-08,
      "loss": 0.7299,
      "step": 46
    },
    {
      "epoch": 2.9375,
      "grad_norm": 0.4228404462337494,
      "learning_rate": 1.333858168224178e-08,
      "loss": 0.7749,
      "step": 47
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.40613028407096863,
      "learning_rate": 0.0,
      "loss": 0.666,
      "step": 48
    },
    {
      "epoch": 3.0,
      "step": 48,
      "total_flos": 41283677585408.0,
      "train_loss": 0.8019564437369505,
      "train_runtime": 1597.112,
      "train_samples_per_second": 2.859,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 48,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 41283677585408.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}