{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.896,
  "eval_steps": 500,
  "global_step": 45,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.064,
      "grad_norm": 5.852497326444738,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.8383,
      "step": 1
    },
    {
      "epoch": 0.128,
      "grad_norm": 5.787772418526806,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.8521,
      "step": 2
    },
    {
      "epoch": 0.192,
      "grad_norm": 5.713055325706916,
      "learning_rate": 6e-06,
      "loss": 0.8751,
      "step": 3
    },
    {
      "epoch": 0.256,
      "grad_norm": 4.005603131976,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.8204,
      "step": 4
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.268867405051262,
      "learning_rate": 1e-05,
      "loss": 0.7586,
      "step": 5
    },
    {
      "epoch": 0.384,
      "grad_norm": 4.478369538237628,
      "learning_rate": 9.984586668665641e-06,
      "loss": 0.833,
      "step": 6
    },
    {
      "epoch": 0.448,
      "grad_norm": 4.7650131577888875,
      "learning_rate": 9.938441702975689e-06,
      "loss": 0.7763,
      "step": 7
    },
    {
      "epoch": 0.512,
      "grad_norm": 5.6864668038677,
      "learning_rate": 9.861849601988384e-06,
      "loss": 0.7636,
      "step": 8
    },
    {
      "epoch": 0.576,
      "grad_norm": 4.79068846647948,
      "learning_rate": 9.755282581475769e-06,
      "loss": 0.7728,
      "step": 9
    },
    {
      "epoch": 0.64,
      "grad_norm": 3.326502888431679,
      "learning_rate": 9.619397662556434e-06,
      "loss": 0.7261,
      "step": 10
    },
    {
      "epoch": 0.704,
      "grad_norm": 2.361650123105416,
      "learning_rate": 9.45503262094184e-06,
      "loss": 0.7025,
      "step": 11
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.862931151952466,
      "learning_rate": 9.263200821770462e-06,
      "loss": 0.7031,
      "step": 12
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.8033085031946665,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.7152,
      "step": 13
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.227596985397878,
      "learning_rate": 8.802029828000157e-06,
      "loss": 0.6626,
      "step": 14
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.101269544121175,
      "learning_rate": 8.535533905932739e-06,
      "loss": 0.6864,
      "step": 15
    },
    {
      "epoch": 1.032,
      "grad_norm": 1.9983144030760098,
      "learning_rate": 8.247240241650918e-06,
      "loss": 0.9603,
      "step": 16
    },
    {
      "epoch": 1.096,
      "grad_norm": 1.2735480355795505,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.7434,
      "step": 17
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.7798050990201479,
      "learning_rate": 7.612492823579744e-06,
      "loss": 0.5232,
      "step": 18
    },
    {
      "epoch": 1.224,
      "grad_norm": 0.8629547467441678,
      "learning_rate": 7.269952498697734e-06,
      "loss": 0.6181,
      "step": 19
    },
    {
      "epoch": 1.288,
      "grad_norm": 0.8952639060585076,
      "learning_rate": 6.913417161825449e-06,
      "loss": 0.5387,
      "step": 20
    },
    {
      "epoch": 1.3519999999999999,
      "grad_norm": 0.7901264746326077,
      "learning_rate": 6.545084971874738e-06,
      "loss": 0.6179,
      "step": 21
    },
    {
      "epoch": 1.416,
      "grad_norm": 0.7130594562261059,
      "learning_rate": 6.1672268192795285e-06,
      "loss": 0.5751,
      "step": 22
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.7275128809810312,
      "learning_rate": 5.782172325201155e-06,
      "loss": 0.5951,
      "step": 23
    },
    {
      "epoch": 1.544,
      "grad_norm": 0.7113626762984582,
      "learning_rate": 5.392295478639226e-06,
      "loss": 0.5711,
      "step": 24
    },
    {
      "epoch": 1.608,
      "grad_norm": 0.6395825616748008,
      "learning_rate": 5e-06,
      "loss": 0.6038,
      "step": 25
    },
    {
      "epoch": 1.6720000000000002,
      "grad_norm": 0.610635665061908,
      "learning_rate": 4.6077045213607765e-06,
      "loss": 0.6156,
      "step": 26
    },
    {
      "epoch": 1.736,
      "grad_norm": 0.6114077452144967,
      "learning_rate": 4.217827674798845e-06,
      "loss": 0.5486,
      "step": 27
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.5620424477890631,
      "learning_rate": 3.832773180720475e-06,
      "loss": 0.5681,
      "step": 28
    },
    {
      "epoch": 1.8639999999999999,
      "grad_norm": 0.5770441629493418,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.5366,
      "step": 29
    },
    {
      "epoch": 1.928,
      "grad_norm": 0.5615233017951738,
      "learning_rate": 3.0865828381745515e-06,
      "loss": 0.6139,
      "step": 30
    },
    {
      "epoch": 1.992,
      "grad_norm": 0.8472516836617299,
      "learning_rate": 2.7300475013022666e-06,
      "loss": 0.8237,
      "step": 31
    },
    {
      "epoch": 2.064,
      "grad_norm": 0.6380101645966922,
      "learning_rate": 2.387507176420256e-06,
      "loss": 0.5648,
      "step": 32
    },
    {
      "epoch": 2.128,
      "grad_norm": 0.5245770932317405,
      "learning_rate": 2.061073738537635e-06,
      "loss": 0.514,
      "step": 33
    },
    {
      "epoch": 2.192,
      "grad_norm": 0.49348454549991827,
      "learning_rate": 1.7527597583490825e-06,
      "loss": 0.5383,
      "step": 34
    },
    {
      "epoch": 2.2560000000000002,
      "grad_norm": 0.5103682114078478,
      "learning_rate": 1.4644660940672628e-06,
      "loss": 0.5411,
      "step": 35
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.4690779131302028,
      "learning_rate": 1.1979701719998454e-06,
      "loss": 0.5695,
      "step": 36
    },
    {
      "epoch": 2.384,
      "grad_norm": 0.47394478810549673,
      "learning_rate": 9.549150281252633e-07,
      "loss": 0.5191,
      "step": 37
    },
    {
      "epoch": 2.448,
      "grad_norm": 0.5130264124034409,
      "learning_rate": 7.367991782295392e-07,
      "loss": 0.5229,
      "step": 38
    },
    {
      "epoch": 2.512,
      "grad_norm": 0.46041338211828164,
      "learning_rate": 5.449673790581611e-07,
      "loss": 0.5372,
      "step": 39
    },
    {
      "epoch": 2.576,
      "grad_norm": 0.47737752103280856,
      "learning_rate": 3.8060233744356634e-07,
      "loss": 0.5532,
      "step": 40
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.45118553205170914,
      "learning_rate": 2.447174185242324e-07,
      "loss": 0.5248,
      "step": 41
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 0.463336138608449,
      "learning_rate": 1.3815039801161723e-07,
      "loss": 0.5512,
      "step": 42
    },
    {
      "epoch": 2.768,
      "grad_norm": 0.4418816559904613,
      "learning_rate": 6.15582970243117e-08,
      "loss": 0.5395,
      "step": 43
    },
    {
      "epoch": 2.832,
      "grad_norm": 0.42606502837865573,
      "learning_rate": 1.541333133436018e-08,
      "loss": 0.4979,
      "step": 44
    },
    {
      "epoch": 2.896,
      "grad_norm": 0.443669920016136,
      "learning_rate": 0.0,
      "loss": 0.552,
      "step": 45
    },
    {
      "epoch": 2.896,
      "step": 45,
      "total_flos": 5.682962857616998e+16,
      "train_loss": 0.6458813110987346,
      "train_runtime": 3188.9509,
      "train_samples_per_second": 0.937,
      "train_steps_per_second": 0.014
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 45,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.682962857616998e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}