{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 5.599682807922363, "learning_rate": 0.0001997790438338385, "loss": 1.683, "num_input_tokens_seen": 4896, "step": 5, "train_runtime": 118.4256, "train_tokens_per_second": 41.342 }, { "epoch": 0.16, "grad_norm": 4.600591659545898, "learning_rate": 0.00019888308262251285, "loss": 0.9406, "num_input_tokens_seen": 10144, "step": 10, "train_runtime": 247.2151, "train_tokens_per_second": 41.033 }, { "epoch": 0.24, "grad_norm": 5.516849994659424, "learning_rate": 0.00019730448705798239, "loss": 1.1972, "num_input_tokens_seen": 14768, "step": 15, "train_runtime": 367.0408, "train_tokens_per_second": 40.235 }, { "epoch": 0.32, "grad_norm": 4.443480968475342, "learning_rate": 0.0001950541548947829, "loss": 0.8463, "num_input_tokens_seen": 19456, "step": 20, "train_runtime": 488.2663, "train_tokens_per_second": 39.847 }, { "epoch": 0.4, "grad_norm": 6.511438846588135, "learning_rate": 0.00019214762118704076, "loss": 0.9397, "num_input_tokens_seen": 24240, "step": 25, "train_runtime": 611.1324, "train_tokens_per_second": 39.664 }, { "epoch": 0.48, "grad_norm": 5.14411735534668, "learning_rate": 0.00018860495104301345, "loss": 0.8065, "num_input_tokens_seen": 28928, "step": 30, "train_runtime": 732.2813, "train_tokens_per_second": 39.504 }, { "epoch": 0.56, "grad_norm": 3.8415708541870117, "learning_rate": 0.0001844506011066308, "loss": 1.0692, "num_input_tokens_seen": 33712, "step": 35, "train_runtime": 854.4771, "train_tokens_per_second": 39.453 }, { "epoch": 0.64, "grad_norm": 4.072882175445557, "learning_rate": 0.00017971325072229226, "loss": 1.1197, "num_input_tokens_seen": 38384, "step": 40, "train_runtime": 975.508, "train_tokens_per_second": 39.348 }, { "epoch": 0.72, "grad_norm": 5.347297191619873, "learning_rate": 0.00017442560394846516, "loss": 0.9444, "num_input_tokens_seen": 43232, "step": 45, "train_runtime": 1098.6527, "train_tokens_per_second": 39.35 }, { "epoch": 0.8, "grad_norm": 4.867551326751709, "learning_rate": 0.0001686241637868734, "loss": 1.1552, "num_input_tokens_seen": 47600, "step": 50, "train_runtime": 1216.8692, "train_tokens_per_second": 39.117 }, { "epoch": 0.88, "grad_norm": 4.36644172668457, "learning_rate": 0.00016234898018587337, "loss": 0.7799, "num_input_tokens_seen": 52416, "step": 55, "train_runtime": 1339.2743, "train_tokens_per_second": 39.138 }, { "epoch": 0.96, "grad_norm": 5.242959499359131, "learning_rate": 0.00015564337355766412, "loss": 1.0976, "num_input_tokens_seen": 57024, "step": 60, "train_runtime": 1460.6574, "train_tokens_per_second": 39.04 }, { "epoch": 1.032, "grad_norm": 2.891428232192993, "learning_rate": 0.00014855363571801523, "loss": 0.6118, "num_input_tokens_seen": 61376, "step": 65, "train_runtime": 1571.9701, "train_tokens_per_second": 39.044 }, { "epoch": 1.112, "grad_norm": 3.863312244415283, "learning_rate": 0.00014112871031306119, "loss": 0.4582, "num_input_tokens_seen": 65808, "step": 70, "train_runtime": 1688.4702, "train_tokens_per_second": 38.975 }, { "epoch": 1.192, "grad_norm": 8.035051345825195, "learning_rate": 0.00013341985493931877, "loss": 0.4555, "num_input_tokens_seen": 70800, "step": 75, "train_runtime": 1816.0492, "train_tokens_per_second": 38.986 }, { "epoch": 1.272, "grad_norm": 6.827272891998291, "learning_rate": 0.0001254802872894655, "loss": 0.5214, "num_input_tokens_seen": 75744, "step": 80, "train_runtime": 1941.6476, "train_tokens_per_second": 39.01 }, { "epoch": 1.3519999999999999, "grad_norm": 3.403914451599121, "learning_rate": 0.00011736481776669306, "loss": 0.396, "num_input_tokens_seen": 80400, "step": 85, "train_runtime": 2063.1926, "train_tokens_per_second": 38.969 }, { "epoch": 1.432, "grad_norm": 5.601797103881836, "learning_rate": 0.00010912947110386484, "loss": 0.494, "num_input_tokens_seen": 85040, "step": 90, "train_runtime": 2183.6375, "train_tokens_per_second": 38.944 }, { "epoch": 1.512, "grad_norm": 4.063144683837891, "learning_rate": 0.00010083109959960973, "loss": 0.4208, "num_input_tokens_seen": 89472, "step": 95, "train_runtime": 2301.5727, "train_tokens_per_second": 38.874 }, { "epoch": 1.592, "grad_norm": 3.933701515197754, "learning_rate": 9.252699064135758e-05, "loss": 0.3598, "num_input_tokens_seen": 94128, "step": 100, "train_runtime": 2423.5983, "train_tokens_per_second": 38.838 }, { "epoch": 1.6720000000000002, "grad_norm": 2.3344547748565674, "learning_rate": 8.427447122476148e-05, "loss": 0.3386, "num_input_tokens_seen": 99120, "step": 105, "train_runtime": 2556.3976, "train_tokens_per_second": 38.773 }, { "epoch": 1.752, "grad_norm": 6.219064712524414, "learning_rate": 7.613051219968623e-05, "loss": 0.363, "num_input_tokens_seen": 104224, "step": 110, "train_runtime": 2687.8442, "train_tokens_per_second": 38.776 }, { "epoch": 1.8319999999999999, "grad_norm": 4.835460662841797, "learning_rate": 6.815133497483157e-05, "loss": 0.4988, "num_input_tokens_seen": 108352, "step": 115, "train_runtime": 2797.943, "train_tokens_per_second": 38.726 }, { "epoch": 1.912, "grad_norm": 3.8654727935791016, "learning_rate": 6.039202339608432e-05, "loss": 0.492, "num_input_tokens_seen": 113216, "step": 120, "train_runtime": 2921.7812, "train_tokens_per_second": 38.749 }, { "epoch": 1.992, "grad_norm": 2.8310158252716064, "learning_rate": 5.290614347797802e-05, "loss": 0.3521, "num_input_tokens_seen": 118128, "step": 125, "train_runtime": 3048.0813, "train_tokens_per_second": 38.755 }, { "epoch": 2.064, "grad_norm": 2.7480006217956543, "learning_rate": 4.574537361342407e-05, "loss": 0.2833, "num_input_tokens_seen": 121984, "step": 130, "train_runtime": 3150.1518, "train_tokens_per_second": 38.723 }, { "epoch": 2.144, "grad_norm": 2.6361961364746094, "learning_rate": 3.89591478145437e-05, "loss": 0.1752, "num_input_tokens_seen": 126272, "step": 135, "train_runtime": 3263.6021, "train_tokens_per_second": 38.691 }, { "epoch": 2.224, "grad_norm": 2.188568115234375, "learning_rate": 3.259431444746846e-05, "loss": 0.1497, "num_input_tokens_seen": 131184, "step": 140, "train_runtime": 3388.4424, "train_tokens_per_second": 38.715 }, { "epoch": 2.304, "grad_norm": 2.502239942550659, "learning_rate": 2.669481281701739e-05, "loss": 0.123, "num_input_tokens_seen": 136256, "step": 145, "train_runtime": 3517.0188, "train_tokens_per_second": 38.742 }, { "epoch": 2.384, "grad_norm": 5.557635307312012, "learning_rate": 2.1301369833931117e-05, "loss": 0.1846, "num_input_tokens_seen": 140816, "step": 150, "train_runtime": 3636.002, "train_tokens_per_second": 38.728 }, { "epoch": 2.464, "grad_norm": 7.517292022705078, "learning_rate": 1.6451218858706374e-05, "loss": 0.1589, "num_input_tokens_seen": 145824, "step": 155, "train_runtime": 3761.3932, "train_tokens_per_second": 38.769 }, { "epoch": 2.544, "grad_norm": 4.320331573486328, "learning_rate": 1.2177842662977135e-05, "loss": 0.2244, "num_input_tokens_seen": 150720, "step": 160, "train_runtime": 3886.3506, "train_tokens_per_second": 38.782 }, { "epoch": 2.624, "grad_norm": 5.886272430419922, "learning_rate": 8.510742282896544e-06, "loss": 0.243, "num_input_tokens_seen": 155408, "step": 165, "train_runtime": 4010.9688, "train_tokens_per_second": 38.746 }, { "epoch": 2.7039999999999997, "grad_norm": 4.565557479858398, "learning_rate": 5.475233360227516e-06, "loss": 0.1998, "num_input_tokens_seen": 159616, "step": 170, "train_runtime": 4124.7853, "train_tokens_per_second": 38.697 }, { "epoch": 2.784, "grad_norm": 5.315474987030029, "learning_rate": 3.092271377092215e-06, "loss": 0.1511, "num_input_tokens_seen": 164544, "step": 175, "train_runtime": 4249.8174, "train_tokens_per_second": 38.718 }, { "epoch": 2.864, "grad_norm": 2.225633382797241, "learning_rate": 1.378306990862177e-06, "loss": 0.1868, "num_input_tokens_seen": 169728, "step": 180, "train_runtime": 4382.478, "train_tokens_per_second": 38.729 }, { "epoch": 2.944, "grad_norm": 3.31820011138916, "learning_rate": 3.451724678784518e-07, "loss": 0.2023, "num_input_tokens_seen": 174320, "step": 185, "train_runtime": 4501.8922, "train_tokens_per_second": 38.721 }, { "epoch": 3.0, "num_input_tokens_seen": 177776, "step": 189, "total_flos": 7606997652996096.0, "train_loss": 0.5502152733071141, "train_runtime": 4598.4523, "train_samples_per_second": 0.652, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 189, "num_input_tokens_seen": 177776, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7606997652996096.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }