{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 189,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 5.599682807922363,
      "learning_rate": 0.0001997790438338385,
      "loss": 1.683,
      "num_input_tokens_seen": 4896,
      "step": 5,
      "train_runtime": 118.4256,
      "train_tokens_per_second": 41.342
    },
    {
      "epoch": 0.16,
      "grad_norm": 4.600591659545898,
      "learning_rate": 0.00019888308262251285,
      "loss": 0.9406,
      "num_input_tokens_seen": 10144,
      "step": 10,
      "train_runtime": 247.2151,
      "train_tokens_per_second": 41.033
    },
    {
      "epoch": 0.24,
      "grad_norm": 5.516849994659424,
      "learning_rate": 0.00019730448705798239,
      "loss": 1.1972,
      "num_input_tokens_seen": 14768,
      "step": 15,
      "train_runtime": 367.0408,
      "train_tokens_per_second": 40.235
    },
    {
      "epoch": 0.32,
      "grad_norm": 4.443480968475342,
      "learning_rate": 0.0001950541548947829,
      "loss": 0.8463,
      "num_input_tokens_seen": 19456,
      "step": 20,
      "train_runtime": 488.2663,
      "train_tokens_per_second": 39.847
    },
    {
      "epoch": 0.4,
      "grad_norm": 6.511438846588135,
      "learning_rate": 0.00019214762118704076,
      "loss": 0.9397,
      "num_input_tokens_seen": 24240,
      "step": 25,
      "train_runtime": 611.1324,
      "train_tokens_per_second": 39.664
    },
    {
      "epoch": 0.48,
      "grad_norm": 5.14411735534668,
      "learning_rate": 0.00018860495104301345,
      "loss": 0.8065,
      "num_input_tokens_seen": 28928,
      "step": 30,
      "train_runtime": 732.2813,
      "train_tokens_per_second": 39.504
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.8415708541870117,
      "learning_rate": 0.0001844506011066308,
      "loss": 1.0692,
      "num_input_tokens_seen": 33712,
      "step": 35,
      "train_runtime": 854.4771,
      "train_tokens_per_second": 39.453
    },
    {
      "epoch": 0.64,
      "grad_norm": 4.072882175445557,
      "learning_rate": 0.00017971325072229226,
      "loss": 1.1197,
      "num_input_tokens_seen": 38384,
      "step": 40,
      "train_runtime": 975.508,
      "train_tokens_per_second": 39.348
    },
    {
      "epoch": 0.72,
      "grad_norm": 5.347297191619873,
      "learning_rate": 0.00017442560394846516,
      "loss": 0.9444,
      "num_input_tokens_seen": 43232,
      "step": 45,
      "train_runtime": 1098.6527,
      "train_tokens_per_second": 39.35
    },
    {
      "epoch": 0.8,
      "grad_norm": 4.867551326751709,
      "learning_rate": 0.0001686241637868734,
      "loss": 1.1552,
      "num_input_tokens_seen": 47600,
      "step": 50,
      "train_runtime": 1216.8692,
      "train_tokens_per_second": 39.117
    },
    {
      "epoch": 0.88,
      "grad_norm": 4.36644172668457,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.7799,
      "num_input_tokens_seen": 52416,
      "step": 55,
      "train_runtime": 1339.2743,
      "train_tokens_per_second": 39.138
    },
    {
      "epoch": 0.96,
      "grad_norm": 5.242959499359131,
      "learning_rate": 0.00015564337355766412,
      "loss": 1.0976,
      "num_input_tokens_seen": 57024,
      "step": 60,
      "train_runtime": 1460.6574,
      "train_tokens_per_second": 39.04
    },
    {
      "epoch": 1.032,
      "grad_norm": 2.891428232192993,
      "learning_rate": 0.00014855363571801523,
      "loss": 0.6118,
      "num_input_tokens_seen": 61376,
      "step": 65,
      "train_runtime": 1571.9701,
      "train_tokens_per_second": 39.044
    },
    {
      "epoch": 1.112,
      "grad_norm": 3.863312244415283,
      "learning_rate": 0.00014112871031306119,
      "loss": 0.4582,
      "num_input_tokens_seen": 65808,
      "step": 70,
      "train_runtime": 1688.4702,
      "train_tokens_per_second": 38.975
    },
    {
      "epoch": 1.192,
      "grad_norm": 8.035051345825195,
      "learning_rate": 0.00013341985493931877,
      "loss": 0.4555,
      "num_input_tokens_seen": 70800,
      "step": 75,
      "train_runtime": 1816.0492,
      "train_tokens_per_second": 38.986
    },
    {
      "epoch": 1.272,
      "grad_norm": 6.827272891998291,
      "learning_rate": 0.0001254802872894655,
      "loss": 0.5214,
      "num_input_tokens_seen": 75744,
      "step": 80,
      "train_runtime": 1941.6476,
      "train_tokens_per_second": 39.01
    },
    {
      "epoch": 1.3519999999999999,
      "grad_norm": 3.403914451599121,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.396,
      "num_input_tokens_seen": 80400,
      "step": 85,
      "train_runtime": 2063.1926,
      "train_tokens_per_second": 38.969
    },
    {
      "epoch": 1.432,
      "grad_norm": 5.601797103881836,
      "learning_rate": 0.00010912947110386484,
      "loss": 0.494,
      "num_input_tokens_seen": 85040,
      "step": 90,
      "train_runtime": 2183.6375,
      "train_tokens_per_second": 38.944
    },
    {
      "epoch": 1.512,
      "grad_norm": 4.063144683837891,
      "learning_rate": 0.00010083109959960973,
      "loss": 0.4208,
      "num_input_tokens_seen": 89472,
      "step": 95,
      "train_runtime": 2301.5727,
      "train_tokens_per_second": 38.874
    },
    {
      "epoch": 1.592,
      "grad_norm": 3.933701515197754,
      "learning_rate": 9.252699064135758e-05,
      "loss": 0.3598,
      "num_input_tokens_seen": 94128,
      "step": 100,
      "train_runtime": 2423.5983,
      "train_tokens_per_second": 38.838
    },
    {
      "epoch": 1.6720000000000002,
      "grad_norm": 2.3344547748565674,
      "learning_rate": 8.427447122476148e-05,
      "loss": 0.3386,
      "num_input_tokens_seen": 99120,
      "step": 105,
      "train_runtime": 2556.3976,
      "train_tokens_per_second": 38.773
    },
    {
      "epoch": 1.752,
      "grad_norm": 6.219064712524414,
      "learning_rate": 7.613051219968623e-05,
      "loss": 0.363,
      "num_input_tokens_seen": 104224,
      "step": 110,
      "train_runtime": 2687.8442,
      "train_tokens_per_second": 38.776
    },
    {
      "epoch": 1.8319999999999999,
      "grad_norm": 4.835460662841797,
      "learning_rate": 6.815133497483157e-05,
      "loss": 0.4988,
      "num_input_tokens_seen": 108352,
      "step": 115,
      "train_runtime": 2797.943,
      "train_tokens_per_second": 38.726
    },
    {
      "epoch": 1.912,
      "grad_norm": 3.8654727935791016,
      "learning_rate": 6.039202339608432e-05,
      "loss": 0.492,
      "num_input_tokens_seen": 113216,
      "step": 120,
      "train_runtime": 2921.7812,
      "train_tokens_per_second": 38.749
    },
    {
      "epoch": 1.992,
      "grad_norm": 2.8310158252716064,
      "learning_rate": 5.290614347797802e-05,
      "loss": 0.3521,
      "num_input_tokens_seen": 118128,
      "step": 125,
      "train_runtime": 3048.0813,
      "train_tokens_per_second": 38.755
    },
    {
      "epoch": 2.064,
      "grad_norm": 2.7480006217956543,
      "learning_rate": 4.574537361342407e-05,
      "loss": 0.2833,
      "num_input_tokens_seen": 121984,
      "step": 130,
      "train_runtime": 3150.1518,
      "train_tokens_per_second": 38.723
    },
    {
      "epoch": 2.144,
      "grad_norm": 2.6361961364746094,
      "learning_rate": 3.89591478145437e-05,
      "loss": 0.1752,
      "num_input_tokens_seen": 126272,
      "step": 135,
      "train_runtime": 3263.6021,
      "train_tokens_per_second": 38.691
    },
    {
      "epoch": 2.224,
      "grad_norm": 2.188568115234375,
      "learning_rate": 3.259431444746846e-05,
      "loss": 0.1497,
      "num_input_tokens_seen": 131184,
      "step": 140,
      "train_runtime": 3388.4424,
      "train_tokens_per_second": 38.715
    },
    {
      "epoch": 2.304,
      "grad_norm": 2.502239942550659,
      "learning_rate": 2.669481281701739e-05,
      "loss": 0.123,
      "num_input_tokens_seen": 136256,
      "step": 145,
      "train_runtime": 3517.0188,
      "train_tokens_per_second": 38.742
    },
    {
      "epoch": 2.384,
      "grad_norm": 5.557635307312012,
      "learning_rate": 2.1301369833931117e-05,
      "loss": 0.1846,
      "num_input_tokens_seen": 140816,
      "step": 150,
      "train_runtime": 3636.002,
      "train_tokens_per_second": 38.728
    },
    {
      "epoch": 2.464,
      "grad_norm": 7.517292022705078,
      "learning_rate": 1.6451218858706374e-05,
      "loss": 0.1589,
      "num_input_tokens_seen": 145824,
      "step": 155,
      "train_runtime": 3761.3932,
      "train_tokens_per_second": 38.769
    },
    {
      "epoch": 2.544,
      "grad_norm": 4.320331573486328,
      "learning_rate": 1.2177842662977135e-05,
      "loss": 0.2244,
      "num_input_tokens_seen": 150720,
      "step": 160,
      "train_runtime": 3886.3506,
      "train_tokens_per_second": 38.782
    },
    {
      "epoch": 2.624,
      "grad_norm": 5.886272430419922,
      "learning_rate": 8.510742282896544e-06,
      "loss": 0.243,
      "num_input_tokens_seen": 155408,
      "step": 165,
      "train_runtime": 4010.9688,
      "train_tokens_per_second": 38.746
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 4.565557479858398,
      "learning_rate": 5.475233360227516e-06,
      "loss": 0.1998,
      "num_input_tokens_seen": 159616,
      "step": 170,
      "train_runtime": 4124.7853,
      "train_tokens_per_second": 38.697
    },
    {
      "epoch": 2.784,
      "grad_norm": 5.315474987030029,
      "learning_rate": 3.092271377092215e-06,
      "loss": 0.1511,
      "num_input_tokens_seen": 164544,
      "step": 175,
      "train_runtime": 4249.8174,
      "train_tokens_per_second": 38.718
    },
    {
      "epoch": 2.864,
      "grad_norm": 2.225633382797241,
      "learning_rate": 1.378306990862177e-06,
      "loss": 0.1868,
      "num_input_tokens_seen": 169728,
      "step": 180,
      "train_runtime": 4382.478,
      "train_tokens_per_second": 38.729
    },
    {
      "epoch": 2.944,
      "grad_norm": 3.31820011138916,
      "learning_rate": 3.451724678784518e-07,
      "loss": 0.2023,
      "num_input_tokens_seen": 174320,
      "step": 185,
      "train_runtime": 4501.8922,
      "train_tokens_per_second": 38.721
    },
    {
      "epoch": 3.0,
      "num_input_tokens_seen": 177776,
      "step": 189,
      "total_flos": 7606997652996096.0,
      "train_loss": 0.5502152733071141,
      "train_runtime": 4598.4523,
      "train_samples_per_second": 0.652,
      "train_steps_per_second": 0.041
    }
  ],
  "logging_steps": 5,
  "max_steps": 189,
  "num_input_tokens_seen": 177776,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7606997652996096.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}