{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 500,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 2.1790294647216797,
      "learning_rate": 1.5625e-05,
      "loss": 0.8982,
      "step": 5
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.2921503782272339,
      "learning_rate": 3.125e-05,
      "loss": 0.7771,
      "step": 10
    },
    {
      "epoch": 0.048,
      "grad_norm": 1.0662598609924316,
      "learning_rate": 4.6875e-05,
      "loss": 0.7065,
      "step": 15
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.8291209936141968,
      "learning_rate": 4.9979726739605334e-05,
      "loss": 0.6569,
      "step": 20
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.8745065927505493,
      "learning_rate": 4.989742922931149e-05,
      "loss": 0.6506,
      "step": 25
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.6823293566703796,
      "learning_rate": 4.975207191995552e-05,
      "loss": 0.6579,
      "step": 30
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.7768641114234924,
      "learning_rate": 4.95440640639845e-05,
      "loss": 0.6842,
      "step": 35
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.6644716262817383,
      "learning_rate": 4.927399130600373e-05,
      "loss": 0.6327,
      "step": 40
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.6596384644508362,
      "learning_rate": 4.894261403389862e-05,
      "loss": 0.6664,
      "step": 45
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.6109484434127808,
      "learning_rate": 4.855086523796815e-05,
      "loss": 0.6369,
      "step": 50
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.6302951574325562,
      "learning_rate": 4.8099847884097434e-05,
      "loss": 0.6217,
      "step": 55
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.6894915699958801,
      "learning_rate": 4.7590831808365293e-05,
      "loss": 0.6364,
      "step": 60
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.6988873481750488,
      "learning_rate": 4.702525014183007e-05,
      "loss": 0.6244,
      "step": 65
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.6387168765068054,
      "learning_rate": 4.6404695275559475e-05,
      "loss": 0.614,
      "step": 70
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.6547486186027527,
      "learning_rate": 4.57309143772652e-05,
      "loss": 0.6034,
      "step": 75
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.6504953503608704,
      "learning_rate": 4.500580447216489e-05,
      "loss": 0.6265,
      "step": 80
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.5533855557441711,
      "learning_rate": 4.423140710192144e-05,
      "loss": 0.6178,
      "step": 85
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.5795829892158508,
      "learning_rate": 4.340990257669732e-05,
      "loss": 0.6243,
      "step": 90
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.5757337808609009,
      "learning_rate": 4.254360383650716e-05,
      "loss": 0.605,
      "step": 95
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.6556103825569153,
      "learning_rate": 4.163494993915196e-05,
      "loss": 0.6046,
      "step": 100
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.6166912913322449,
      "learning_rate": 4.0686499193069595e-05,
      "loss": 0.6243,
      "step": 105
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.6043514609336853,
      "learning_rate": 3.970092195443604e-05,
      "loss": 0.6182,
      "step": 110
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.5503015518188477,
      "learning_rate": 3.8680993108796956e-05,
      "loss": 0.619,
      "step": 115
    },
    {
      "epoch": 0.384,
      "grad_norm": 12.36988353729248,
      "learning_rate": 3.7629584258397646e-05,
      "loss": 0.5871,
      "step": 120
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.5975585579872131,
      "learning_rate": 3.65496556372078e-05,
      "loss": 0.6111,
      "step": 125
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.5625191330909729,
      "learning_rate": 3.5444247776404274e-05,
      "loss": 0.5999,
      "step": 130
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.5412049889564514,
      "learning_rate": 3.4316472943777736e-05,
      "loss": 0.5846,
      "step": 135
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.5721265077590942,
      "learning_rate": 3.316950638116532e-05,
      "loss": 0.5975,
      "step": 140
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.5761215090751648,
      "learning_rate": 3.2006577364580284e-05,
      "loss": 0.6136,
      "step": 145
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.5800392627716064,
      "learning_rate": 3.083096011220896e-05,
      "loss": 0.5969,
      "step": 150
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.5748460292816162,
      "learning_rate": 2.9645964565873207e-05,
      "loss": 0.6043,
      "step": 155
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.5393357872962952,
      "learning_rate": 2.845492707191334e-05,
      "loss": 0.5814,
      "step": 160
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.5506784319877625,
      "learning_rate": 2.7261200987729242e-05,
      "loss": 0.5676,
      "step": 165
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.5413019061088562,
      "learning_rate": 2.606814724042701e-05,
      "loss": 0.6135,
      "step": 170
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.5511438846588135,
      "learning_rate": 2.4879124864153163e-05,
      "loss": 0.5744,
      "step": 175
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.5734318494796753,
      "learning_rate": 2.36974815427584e-05,
      "loss": 0.5972,
      "step": 180
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.5379422307014465,
      "learning_rate": 2.252654418441808e-05,
      "loss": 0.5717,
      "step": 185
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.5357218980789185,
      "learning_rate": 2.136960955474649e-05,
      "loss": 0.6015,
      "step": 190
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.5256953835487366,
      "learning_rate": 2.0229934994777195e-05,
      "loss": 0.5913,
      "step": 195
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.5051993131637573,
      "learning_rate": 1.911072924994306e-05,
      "loss": 0.579,
      "step": 200
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.5518410205841064,
      "learning_rate": 1.801514343587688e-05,
      "loss": 0.5805,
      "step": 205
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.50865238904953,
      "learning_rate": 1.6946262166468175e-05,
      "loss": 0.5962,
      "step": 210
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.5044121146202087,
      "learning_rate": 1.590709486915524e-05,
      "loss": 0.5689,
      "step": 215
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.4880557060241699,
      "learning_rate": 1.4900567311904107e-05,
      "loss": 0.5671,
      "step": 220
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.5397293567657471,
      "learning_rate": 1.392951336573011e-05,
      "loss": 0.5693,
      "step": 225
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.5184421539306641,
      "learning_rate": 1.2996667025954618e-05,
      "loss": 0.5693,
      "step": 230
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.5067721009254456,
      "learning_rate": 1.2104654714661188e-05,
      "loss": 0.5693,
      "step": 235
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.4883371591567993,
      "learning_rate": 1.1255987886023202e-05,
      "loss": 0.5749,
      "step": 240
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.4902109205722809,
      "learning_rate": 1.0453055955322938e-05,
      "loss": 0.5791,
      "step": 245
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.4747011065483093,
      "learning_rate": 9.698119571570258e-06,
      "loss": 0.5795,
      "step": 250
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.4571741223335266,
      "learning_rate": 8.993304252661744e-06,
      "loss": 0.5657,
      "step": 255
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.48521485924720764,
      "learning_rate": 8.340594401000496e-06,
      "loss": 0.5773,
      "step": 260
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.4582931399345398,
      "learning_rate": 7.741827716425654e-06,
      "loss": 0.5822,
      "step": 265
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.4858649671077728,
      "learning_rate": 7.198690022181837e-06,
      "loss": 0.5826,
      "step": 270
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.49482160806655884,
      "learning_rate": 6.712710518496049e-06,
      "loss": 0.5588,
      "step": 275
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.4916088581085205,
      "learning_rate": 6.285257477125605e-06,
      "loss": 0.5716,
      "step": 280
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.4659317433834076,
      "learning_rate": 5.9175343889989275e-06,
      "loss": 0.5621,
      "step": 285
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.5164335370063782,
      "learning_rate": 5.610576575795573e-06,
      "loss": 0.55,
      "step": 290
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.5051558017730713,
      "learning_rate": 5.36524827500562e-06,
      "loss": 0.5709,
      "step": 295
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.4648708403110504,
      "learning_rate": 5.182240206675272e-06,
      "loss": 0.5661,
      "step": 300
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.47396937012672424,
      "learning_rate": 5.06206762868959e-06,
      "loss": 0.5582,
      "step": 305
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.46491411328315735,
      "learning_rate": 5.005068886067688e-06,
      "loss": 0.5557,
      "step": 310
    },
    {
      "epoch": 0.9984,
      "step": 312,
      "total_flos": 34173306634240.0,
      "train_loss": 0.6066549909420502,
      "train_runtime": 5973.3628,
      "train_samples_per_second": 1.674,
      "train_steps_per_second": 0.052
    }
  ],
  "logging_steps": 5,
  "max_steps": 312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 34173306634240.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}