{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.896, "eval_steps": 500, "global_step": 45, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.064, "grad_norm": 5.852497326444738, "learning_rate": 2.0000000000000003e-06, "loss": 0.8383, "step": 1 }, { "epoch": 0.128, "grad_norm": 5.787772418526806, "learning_rate": 4.000000000000001e-06, "loss": 0.8521, "step": 2 }, { "epoch": 0.192, "grad_norm": 5.713055325706916, "learning_rate": 6e-06, "loss": 0.8751, "step": 3 }, { "epoch": 0.256, "grad_norm": 4.005603131976, "learning_rate": 8.000000000000001e-06, "loss": 0.8204, "step": 4 }, { "epoch": 0.32, "grad_norm": 2.268867405051262, "learning_rate": 1e-05, "loss": 0.7586, "step": 5 }, { "epoch": 0.384, "grad_norm": 4.478369538237628, "learning_rate": 9.984586668665641e-06, "loss": 0.833, "step": 6 }, { "epoch": 0.448, "grad_norm": 4.7650131577888875, "learning_rate": 9.938441702975689e-06, "loss": 0.7763, "step": 7 }, { "epoch": 0.512, "grad_norm": 5.6864668038677, "learning_rate": 9.861849601988384e-06, "loss": 0.7636, "step": 8 }, { "epoch": 0.576, "grad_norm": 4.79068846647948, "learning_rate": 9.755282581475769e-06, "loss": 0.7728, "step": 9 }, { "epoch": 0.64, "grad_norm": 3.326502888431679, "learning_rate": 9.619397662556434e-06, "loss": 0.7261, "step": 10 }, { "epoch": 0.704, "grad_norm": 2.361650123105416, "learning_rate": 9.45503262094184e-06, "loss": 0.7025, "step": 11 }, { "epoch": 0.768, "grad_norm": 1.862931151952466, "learning_rate": 9.263200821770462e-06, "loss": 0.7031, "step": 12 }, { "epoch": 0.832, "grad_norm": 1.8033085031946665, "learning_rate": 9.045084971874738e-06, "loss": 0.7152, "step": 13 }, { "epoch": 0.896, "grad_norm": 1.227596985397878, "learning_rate": 8.802029828000157e-06, "loss": 0.6626, "step": 14 }, { "epoch": 0.96, "grad_norm": 1.101269544121175, "learning_rate": 8.535533905932739e-06, "loss": 0.6864, "step": 15 }, { "epoch": 1.032, "grad_norm": 1.9983144030760098, "learning_rate": 8.247240241650918e-06, "loss": 0.9603, "step": 16 }, { "epoch": 1.096, "grad_norm": 1.2735480355795505, "learning_rate": 7.938926261462366e-06, "loss": 0.7434, "step": 17 }, { "epoch": 1.16, "grad_norm": 0.7798050990201479, "learning_rate": 7.612492823579744e-06, "loss": 0.5232, "step": 18 }, { "epoch": 1.224, "grad_norm": 0.8629547467441678, "learning_rate": 7.269952498697734e-06, "loss": 0.6181, "step": 19 }, { "epoch": 1.288, "grad_norm": 0.8952639060585076, "learning_rate": 6.913417161825449e-06, "loss": 0.5387, "step": 20 }, { "epoch": 1.3519999999999999, "grad_norm": 0.7901264746326077, "learning_rate": 6.545084971874738e-06, "loss": 0.6179, "step": 21 }, { "epoch": 1.416, "grad_norm": 0.7130594562261059, "learning_rate": 6.1672268192795285e-06, "loss": 0.5751, "step": 22 }, { "epoch": 1.48, "grad_norm": 0.7275128809810312, "learning_rate": 5.782172325201155e-06, "loss": 0.5951, "step": 23 }, { "epoch": 1.544, "grad_norm": 0.7113626762984582, "learning_rate": 5.392295478639226e-06, "loss": 0.5711, "step": 24 }, { "epoch": 1.608, "grad_norm": 0.6395825616748008, "learning_rate": 5e-06, "loss": 0.6038, "step": 25 }, { "epoch": 1.6720000000000002, "grad_norm": 0.610635665061908, "learning_rate": 4.6077045213607765e-06, "loss": 0.6156, "step": 26 }, { "epoch": 1.736, "grad_norm": 0.6114077452144967, "learning_rate": 4.217827674798845e-06, "loss": 0.5486, "step": 27 }, { "epoch": 1.8, "grad_norm": 0.5620424477890631, "learning_rate": 3.832773180720475e-06, "loss": 0.5681, "step": 28 }, { "epoch": 1.8639999999999999, "grad_norm": 0.5770441629493418, "learning_rate": 3.4549150281252635e-06, "loss": 0.5366, "step": 29 }, { "epoch": 1.928, "grad_norm": 0.5615233017951738, "learning_rate": 3.0865828381745515e-06, "loss": 0.6139, "step": 30 }, { "epoch": 1.992, "grad_norm": 0.8472516836617299, "learning_rate": 2.7300475013022666e-06, "loss": 0.8237, "step": 31 }, { "epoch": 2.064, "grad_norm": 0.6380101645966922, "learning_rate": 2.387507176420256e-06, "loss": 0.5648, "step": 32 }, { "epoch": 2.128, "grad_norm": 0.5245770932317405, "learning_rate": 2.061073738537635e-06, "loss": 0.514, "step": 33 }, { "epoch": 2.192, "grad_norm": 0.49348454549991827, "learning_rate": 1.7527597583490825e-06, "loss": 0.5383, "step": 34 }, { "epoch": 2.2560000000000002, "grad_norm": 0.5103682114078478, "learning_rate": 1.4644660940672628e-06, "loss": 0.5411, "step": 35 }, { "epoch": 2.32, "grad_norm": 0.4690779131302028, "learning_rate": 1.1979701719998454e-06, "loss": 0.5695, "step": 36 }, { "epoch": 2.384, "grad_norm": 0.47394478810549673, "learning_rate": 9.549150281252633e-07, "loss": 0.5191, "step": 37 }, { "epoch": 2.448, "grad_norm": 0.5130264124034409, "learning_rate": 7.367991782295392e-07, "loss": 0.5229, "step": 38 }, { "epoch": 2.512, "grad_norm": 0.46041338211828164, "learning_rate": 5.449673790581611e-07, "loss": 0.5372, "step": 39 }, { "epoch": 2.576, "grad_norm": 0.47737752103280856, "learning_rate": 3.8060233744356634e-07, "loss": 0.5532, "step": 40 }, { "epoch": 2.64, "grad_norm": 0.45118553205170914, "learning_rate": 2.447174185242324e-07, "loss": 0.5248, "step": 41 }, { "epoch": 2.7039999999999997, "grad_norm": 0.463336138608449, "learning_rate": 1.3815039801161723e-07, "loss": 0.5512, "step": 42 }, { "epoch": 2.768, "grad_norm": 0.4418816559904613, "learning_rate": 6.15582970243117e-08, "loss": 0.5395, "step": 43 }, { "epoch": 2.832, "grad_norm": 0.42606502837865573, "learning_rate": 1.541333133436018e-08, "loss": 0.4979, "step": 44 }, { "epoch": 2.896, "grad_norm": 0.443669920016136, "learning_rate": 0.0, "loss": 0.552, "step": 45 }, { "epoch": 2.896, "step": 45, "total_flos": 5.682962857616998e+16, "train_loss": 0.6458813110987346, "train_runtime": 3188.9509, "train_samples_per_second": 0.937, "train_steps_per_second": 0.014 } ], "logging_steps": 1.0, "max_steps": 45, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.682962857616998e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }