{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9710982658959537, "eval_steps": 500, "global_step": 258, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.057803468208092484, "grad_norm": 0.7586865425109863, "learning_rate": 9.615384615384616e-06, "loss": 1.5305, "step": 5 }, { "epoch": 0.11560693641618497, "grad_norm": 1.5047568082809448, "learning_rate": 1.923076923076923e-05, "loss": 1.3586, "step": 10 }, { "epoch": 0.17341040462427745, "grad_norm": 1.0730587244033813, "learning_rate": 2.8846153846153845e-05, "loss": 1.3282, "step": 15 }, { "epoch": 0.23121387283236994, "grad_norm": 1.0204637050628662, "learning_rate": 3.846153846153846e-05, "loss": 1.246, "step": 20 }, { "epoch": 0.28901734104046245, "grad_norm": 0.5301410555839539, "learning_rate": 4.8076923076923084e-05, "loss": 1.3034, "step": 25 }, { "epoch": 0.3468208092485549, "grad_norm": 0.723696768283844, "learning_rate": 4.996333534627809e-05, "loss": 1.1816, "step": 30 }, { "epoch": 0.4046242774566474, "grad_norm": 1.3614885807037354, "learning_rate": 4.981456948708014e-05, "loss": 1.2341, "step": 35 }, { "epoch": 0.4624277456647399, "grad_norm": 1.0017260313034058, "learning_rate": 4.95520920685539e-05, "loss": 1.2838, "step": 40 }, { "epoch": 0.5202312138728323, "grad_norm": 0.6403581500053406, "learning_rate": 4.9177105880720173e-05, "loss": 1.2135, "step": 45 }, { "epoch": 0.5780346820809249, "grad_norm": 0.5783727765083313, "learning_rate": 4.869132927957007e-05, "loss": 1.11, "step": 50 }, { "epoch": 0.6358381502890174, "grad_norm": 0.7453054189682007, "learning_rate": 4.8096988312782174e-05, "loss": 1.2103, "step": 55 }, { "epoch": 0.6936416184971098, "grad_norm": 1.236171007156372, "learning_rate": 4.73968065189672e-05, "loss": 1.2226, "step": 60 }, { "epoch": 0.7514450867052023, "grad_norm": 0.44787439703941345, "learning_rate": 4.6593992447184586e-05, "loss": 1.1403, "step": 65 }, { "epoch": 0.8092485549132948, "grad_norm": 0.7945877313613892, "learning_rate": 4.5692224953922266e-05, "loss": 1.1933, "step": 70 }, { "epoch": 0.8670520231213873, "grad_norm": 1.6053190231323242, "learning_rate": 4.469563634491554e-05, "loss": 1.1941, "step": 75 }, { "epoch": 0.9248554913294798, "grad_norm": 1.0948492288589478, "learning_rate": 4.360879343905676e-05, "loss": 1.3349, "step": 80 }, { "epoch": 0.9826589595375722, "grad_norm": 0.726474940776825, "learning_rate": 4.243667664116956e-05, "loss": 1.3004, "step": 85 }, { "epoch": 1.0346820809248556, "grad_norm": 1.4559762477874756, "learning_rate": 4.118465711954569e-05, "loss": 1.0116, "step": 90 }, { "epoch": 1.092485549132948, "grad_norm": 1.5781135559082031, "learning_rate": 3.985847219282725e-05, "loss": 0.8764, "step": 95 }, { "epoch": 1.1502890173410405, "grad_norm": 0.6205704212188721, "learning_rate": 3.8464199039022605e-05, "loss": 0.9051, "step": 100 }, { "epoch": 1.208092485549133, "grad_norm": 1.4496972560882568, "learning_rate": 3.700822684713349e-05, "loss": 0.9408, "step": 105 }, { "epoch": 1.2658959537572254, "grad_norm": 0.4967881143093109, "learning_rate": 3.5497227539006614e-05, "loss": 0.7376, "step": 110 }, { "epoch": 1.323699421965318, "grad_norm": 1.4739594459533691, "learning_rate": 3.3938125195576e-05, "loss": 0.9192, "step": 115 }, { "epoch": 1.3815028901734103, "grad_norm": 1.443954348564148, "learning_rate": 3.233806432759837e-05, "loss": 0.7502, "step": 120 }, { "epoch": 1.439306358381503, "grad_norm": 0.8857870697975159, "learning_rate": 3.070437713627965e-05, "loss": 0.7896, "step": 125 }, { "epoch": 1.4971098265895955, "grad_norm": 0.49113208055496216, "learning_rate": 2.9044549913819124e-05, "loss": 0.7826, "step": 130 }, { "epoch": 1.5549132947976878, "grad_norm": 0.5606523752212524, "learning_rate": 2.7366188737839026e-05, "loss": 0.7622, "step": 135 }, { "epoch": 1.6127167630057804, "grad_norm": 0.5834754705429077, "learning_rate": 2.5676984616903367e-05, "loss": 0.6622, "step": 140 }, { "epoch": 1.6705202312138727, "grad_norm": 0.9665216207504272, "learning_rate": 2.3984678246844677e-05, "loss": 0.809, "step": 145 }, { "epoch": 1.7283236994219653, "grad_norm": 0.7581700086593628, "learning_rate": 2.2297024539401463e-05, "loss": 0.7095, "step": 150 }, { "epoch": 1.7861271676300579, "grad_norm": 0.8531942367553711, "learning_rate": 2.0621757085711734e-05, "loss": 0.8316, "step": 155 }, { "epoch": 1.8439306358381504, "grad_norm": 1.121618390083313, "learning_rate": 1.8966552717507364e-05, "loss": 0.7683, "step": 160 }, { "epoch": 1.9017341040462428, "grad_norm": 1.0460470914840698, "learning_rate": 1.7338996328405526e-05, "loss": 0.7656, "step": 165 }, { "epoch": 1.9595375722543351, "grad_norm": 1.1561076641082764, "learning_rate": 1.574654611650214e-05, "loss": 0.7079, "step": 170 }, { "epoch": 2.0115606936416186, "grad_norm": 0.6570937037467957, "learning_rate": 1.4196499407541359e-05, "loss": 0.7448, "step": 175 }, { "epoch": 2.069364161849711, "grad_norm": 0.8971176147460938, "learning_rate": 1.2695959215274816e-05, "loss": 0.5049, "step": 180 }, { "epoch": 2.1271676300578033, "grad_norm": 0.7877609133720398, "learning_rate": 1.125180169224613e-05, "loss": 0.4581, "step": 185 }, { "epoch": 2.184971098265896, "grad_norm": 1.099473476409912, "learning_rate": 9.870644620155877e-06, "loss": 0.4307, "step": 190 }, { "epoch": 2.2427745664739884, "grad_norm": 1.1364312171936035, "learning_rate": 8.558817084198387e-06, "loss": 0.4858, "step": 195 }, { "epoch": 2.300578034682081, "grad_norm": 0.7978260517120361, "learning_rate": 7.3223304703363135e-06, "loss": 0.4745, "step": 200 }, { "epoch": 2.3583815028901736, "grad_norm": 0.6734775900840759, "learning_rate": 6.166850918416406e-06, "loss": 0.5683, "step": 205 }, { "epoch": 2.416184971098266, "grad_norm": 0.9826372265815735, "learning_rate": 5.097673357358907e-06, "loss": 0.466, "step": 210 }, { "epoch": 2.4739884393063583, "grad_norm": 1.0204384326934814, "learning_rate": 4.119697241402998e-06, "loss": 0.4577, "step": 215 }, { "epoch": 2.531791907514451, "grad_norm": 1.038061261177063, "learning_rate": 3.2374040985957004e-06, "loss": 0.3862, "step": 220 }, { "epoch": 2.5895953757225434, "grad_norm": 0.9037131071090698, "learning_rate": 2.4548369944073004e-06, "loss": 0.4205, "step": 225 }, { "epoch": 2.647398843930636, "grad_norm": 0.7115334272384644, "learning_rate": 1.7755820045802145e-06, "loss": 0.3581, "step": 230 }, { "epoch": 2.705202312138728, "grad_norm": 0.8673137426376343, "learning_rate": 1.2027517821111112e-06, "loss": 0.4342, "step": 235 }, { "epoch": 2.7630057803468207, "grad_norm": 0.8103125691413879, "learning_rate": 7.389712936697129e-07, "loss": 0.4275, "step": 240 }, { "epoch": 2.820809248554913, "grad_norm": 0.638612687587738, "learning_rate": 3.8636579081657577e-07, "loss": 0.4198, "step": 245 }, { "epoch": 2.878612716763006, "grad_norm": 0.8524174690246582, "learning_rate": 1.4655107114101007e-07, "loss": 0.5151, "step": 250 }, { "epoch": 2.9364161849710984, "grad_norm": 0.9686955809593201, "learning_rate": 2.0626073947138668e-08, "loss": 0.4155, "step": 255 }, { "epoch": 2.9710982658959537, "step": 258, "total_flos": 2.017433878246195e+16, "train_loss": 0.8397066662477892, "train_runtime": 2578.3607, "train_samples_per_second": 0.804, "train_steps_per_second": 0.1 } ], "logging_steps": 5, "max_steps": 258, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.017433878246195e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }