{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 2.1790294647216797, "learning_rate": 1.5625e-05, "loss": 0.8982, "step": 5 }, { "epoch": 0.032, "grad_norm": 1.2921503782272339, "learning_rate": 3.125e-05, "loss": 0.7771, "step": 10 }, { "epoch": 0.048, "grad_norm": 1.0662598609924316, "learning_rate": 4.6875e-05, "loss": 0.7065, "step": 15 }, { "epoch": 0.064, "grad_norm": 0.8291209936141968, "learning_rate": 4.9979726739605334e-05, "loss": 0.6569, "step": 20 }, { "epoch": 0.08, "grad_norm": 0.8745065927505493, "learning_rate": 4.989742922931149e-05, "loss": 0.6506, "step": 25 }, { "epoch": 0.096, "grad_norm": 0.6823293566703796, "learning_rate": 4.975207191995552e-05, "loss": 0.6579, "step": 30 }, { "epoch": 0.112, "grad_norm": 0.7768641114234924, "learning_rate": 4.95440640639845e-05, "loss": 0.6842, "step": 35 }, { "epoch": 0.128, "grad_norm": 0.6644716262817383, "learning_rate": 4.927399130600373e-05, "loss": 0.6327, "step": 40 }, { "epoch": 0.144, "grad_norm": 0.6596384644508362, "learning_rate": 4.894261403389862e-05, "loss": 0.6664, "step": 45 }, { "epoch": 0.16, "grad_norm": 0.6109484434127808, "learning_rate": 4.855086523796815e-05, "loss": 0.6369, "step": 50 }, { "epoch": 0.176, "grad_norm": 0.6302951574325562, "learning_rate": 4.8099847884097434e-05, "loss": 0.6217, "step": 55 }, { "epoch": 0.192, "grad_norm": 0.6894915699958801, "learning_rate": 4.7590831808365293e-05, "loss": 0.6364, "step": 60 }, { "epoch": 0.208, "grad_norm": 0.6988873481750488, "learning_rate": 4.702525014183007e-05, "loss": 0.6244, "step": 65 }, { "epoch": 0.224, "grad_norm": 0.6387168765068054, "learning_rate": 4.6404695275559475e-05, "loss": 0.614, "step": 70 }, { "epoch": 0.24, "grad_norm": 0.6547486186027527, "learning_rate": 4.57309143772652e-05, "loss": 0.6034, "step": 75 }, { "epoch": 0.256, "grad_norm": 0.6504953503608704, "learning_rate": 4.500580447216489e-05, "loss": 0.6265, "step": 80 }, { "epoch": 0.272, "grad_norm": 0.5533855557441711, "learning_rate": 4.423140710192144e-05, "loss": 0.6178, "step": 85 }, { "epoch": 0.288, "grad_norm": 0.5795829892158508, "learning_rate": 4.340990257669732e-05, "loss": 0.6243, "step": 90 }, { "epoch": 0.304, "grad_norm": 0.5757337808609009, "learning_rate": 4.254360383650716e-05, "loss": 0.605, "step": 95 }, { "epoch": 0.32, "grad_norm": 0.6556103825569153, "learning_rate": 4.163494993915196e-05, "loss": 0.6046, "step": 100 }, { "epoch": 0.336, "grad_norm": 0.6166912913322449, "learning_rate": 4.0686499193069595e-05, "loss": 0.6243, "step": 105 }, { "epoch": 0.352, "grad_norm": 0.6043514609336853, "learning_rate": 3.970092195443604e-05, "loss": 0.6182, "step": 110 }, { "epoch": 0.368, "grad_norm": 0.5503015518188477, "learning_rate": 3.8680993108796956e-05, "loss": 0.619, "step": 115 }, { "epoch": 0.384, "grad_norm": 12.36988353729248, "learning_rate": 3.7629584258397646e-05, "loss": 0.5871, "step": 120 }, { "epoch": 0.4, "grad_norm": 0.5975585579872131, "learning_rate": 3.65496556372078e-05, "loss": 0.6111, "step": 125 }, { "epoch": 0.416, "grad_norm": 0.5625191330909729, "learning_rate": 3.5444247776404274e-05, "loss": 0.5999, "step": 130 }, { "epoch": 0.432, "grad_norm": 0.5412049889564514, "learning_rate": 3.4316472943777736e-05, "loss": 0.5846, "step": 135 }, { "epoch": 0.448, "grad_norm": 0.5721265077590942, "learning_rate": 3.316950638116532e-05, "loss": 0.5975, "step": 140 }, { "epoch": 0.464, "grad_norm": 0.5761215090751648, "learning_rate": 3.2006577364580284e-05, "loss": 0.6136, "step": 145 }, { "epoch": 0.48, "grad_norm": 0.5800392627716064, "learning_rate": 3.083096011220896e-05, "loss": 0.5969, "step": 150 }, { "epoch": 0.496, "grad_norm": 0.5748460292816162, "learning_rate": 2.9645964565873207e-05, "loss": 0.6043, "step": 155 }, { "epoch": 0.512, "grad_norm": 0.5393357872962952, "learning_rate": 2.845492707191334e-05, "loss": 0.5814, "step": 160 }, { "epoch": 0.528, "grad_norm": 0.5506784319877625, "learning_rate": 2.7261200987729242e-05, "loss": 0.5676, "step": 165 }, { "epoch": 0.544, "grad_norm": 0.5413019061088562, "learning_rate": 2.606814724042701e-05, "loss": 0.6135, "step": 170 }, { "epoch": 0.56, "grad_norm": 0.5511438846588135, "learning_rate": 2.4879124864153163e-05, "loss": 0.5744, "step": 175 }, { "epoch": 0.576, "grad_norm": 0.5734318494796753, "learning_rate": 2.36974815427584e-05, "loss": 0.5972, "step": 180 }, { "epoch": 0.592, "grad_norm": 0.5379422307014465, "learning_rate": 2.252654418441808e-05, "loss": 0.5717, "step": 185 }, { "epoch": 0.608, "grad_norm": 0.5357218980789185, "learning_rate": 2.136960955474649e-05, "loss": 0.6015, "step": 190 }, { "epoch": 0.624, "grad_norm": 0.5256953835487366, "learning_rate": 2.0229934994777195e-05, "loss": 0.5913, "step": 195 }, { "epoch": 0.64, "grad_norm": 0.5051993131637573, "learning_rate": 1.911072924994306e-05, "loss": 0.579, "step": 200 }, { "epoch": 0.656, "grad_norm": 0.5518410205841064, "learning_rate": 1.801514343587688e-05, "loss": 0.5805, "step": 205 }, { "epoch": 0.672, "grad_norm": 0.50865238904953, "learning_rate": 1.6946262166468175e-05, "loss": 0.5962, "step": 210 }, { "epoch": 0.688, "grad_norm": 0.5044121146202087, "learning_rate": 1.590709486915524e-05, "loss": 0.5689, "step": 215 }, { "epoch": 0.704, "grad_norm": 0.4880557060241699, "learning_rate": 1.4900567311904107e-05, "loss": 0.5671, "step": 220 }, { "epoch": 0.72, "grad_norm": 0.5397293567657471, "learning_rate": 1.392951336573011e-05, "loss": 0.5693, "step": 225 }, { "epoch": 0.736, "grad_norm": 0.5184421539306641, "learning_rate": 1.2996667025954618e-05, "loss": 0.5693, "step": 230 }, { "epoch": 0.752, "grad_norm": 0.5067721009254456, "learning_rate": 1.2104654714661188e-05, "loss": 0.5693, "step": 235 }, { "epoch": 0.768, "grad_norm": 0.4883371591567993, "learning_rate": 1.1255987886023202e-05, "loss": 0.5749, "step": 240 }, { "epoch": 0.784, "grad_norm": 0.4902109205722809, "learning_rate": 1.0453055955322938e-05, "loss": 0.5791, "step": 245 }, { "epoch": 0.8, "grad_norm": 0.4747011065483093, "learning_rate": 9.698119571570258e-06, "loss": 0.5795, "step": 250 }, { "epoch": 0.816, "grad_norm": 0.4571741223335266, "learning_rate": 8.993304252661744e-06, "loss": 0.5657, "step": 255 }, { "epoch": 0.832, "grad_norm": 0.48521485924720764, "learning_rate": 8.340594401000496e-06, "loss": 0.5773, "step": 260 }, { "epoch": 0.848, "grad_norm": 0.4582931399345398, "learning_rate": 7.741827716425654e-06, "loss": 0.5822, "step": 265 }, { "epoch": 0.864, "grad_norm": 0.4858649671077728, "learning_rate": 7.198690022181837e-06, "loss": 0.5826, "step": 270 }, { "epoch": 0.88, "grad_norm": 0.49482160806655884, "learning_rate": 6.712710518496049e-06, "loss": 0.5588, "step": 275 }, { "epoch": 0.896, "grad_norm": 0.4916088581085205, "learning_rate": 6.285257477125605e-06, "loss": 0.5716, "step": 280 }, { "epoch": 0.912, "grad_norm": 0.4659317433834076, "learning_rate": 5.9175343889989275e-06, "loss": 0.5621, "step": 285 }, { "epoch": 0.928, "grad_norm": 0.5164335370063782, "learning_rate": 5.610576575795573e-06, "loss": 0.55, "step": 290 }, { "epoch": 0.944, "grad_norm": 0.5051558017730713, "learning_rate": 5.36524827500562e-06, "loss": 0.5709, "step": 295 }, { "epoch": 0.96, "grad_norm": 0.4648708403110504, "learning_rate": 5.182240206675272e-06, "loss": 0.5661, "step": 300 }, { "epoch": 0.976, "grad_norm": 0.47396937012672424, "learning_rate": 5.06206762868959e-06, "loss": 0.5582, "step": 305 }, { "epoch": 0.992, "grad_norm": 0.46491411328315735, "learning_rate": 5.005068886067688e-06, "loss": 0.5557, "step": 310 }, { "epoch": 0.9984, "step": 312, "total_flos": 34173306634240.0, "train_loss": 0.6066549909420502, "train_runtime": 5973.3628, "train_samples_per_second": 1.674, "train_steps_per_second": 0.052 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 34173306634240.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }