{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 83.33333333333333, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.9060055136680603, "epoch": 3.3333333333333335, "grad_norm": 1.0164649486541748, "learning_rate": 1.5e-05, "loss": 4.1011, "mean_token_accuracy": 0.4009168431162834, "num_tokens": 5512.0, "step": 10 }, { "entropy": 1.9487085342407227, "epoch": 6.666666666666667, "grad_norm": 0.9628961682319641, "learning_rate": 3.1666666666666666e-05, "loss": 4.1923, "mean_token_accuracy": 0.39442635476589205, "num_tokens": 10326.0, "step": 20 }, { "entropy": 1.8993895709514619, "epoch": 10.0, "grad_norm": 1.177027702331543, "learning_rate": 4.8333333333333334e-05, "loss": 4.0334, "mean_token_accuracy": 0.40877673774957657, "num_tokens": 15462.0, "step": 30 }, { "entropy": 1.9950154840946197, "epoch": 13.333333333333334, "grad_norm": 1.436251163482666, "learning_rate": 4.9863047384206835e-05, "loss": 3.9342, "mean_token_accuracy": 0.40931957215070724, "num_tokens": 20681.0, "step": 40 }, { "entropy": 2.067208158969879, "epoch": 16.666666666666668, "grad_norm": 1.078131079673767, "learning_rate": 4.939155700460536e-05, "loss": 3.7698, "mean_token_accuracy": 0.42098456621170044, "num_tokens": 26109.0, "step": 50 }, { "entropy": 2.193948692083359, "epoch": 20.0, "grad_norm": 1.1934932470321655, "learning_rate": 4.859021265983959e-05, "loss": 3.6115, "mean_token_accuracy": 0.43236732184886933, "num_tokens": 31244.0, "step": 60 }, { "entropy": 2.4151065587997436, "epoch": 23.333333333333332, "grad_norm": 0.8595532774925232, "learning_rate": 4.7469851157479177e-05, "loss": 3.5401, "mean_token_accuracy": 0.42654525190591813, "num_tokens": 36738.0, "step": 70 }, { "entropy": 2.4695091366767885, "epoch": 26.666666666666668, "grad_norm": 0.9981549978256226, "learning_rate": 4.604562346488144e-05, "loss": 3.3173, "mean_token_accuracy": 0.4566218167543411, "num_tokens": 41886.0, "step": 80 }, { "entropy": 2.4938253283500673, "epoch": 30.0, "grad_norm": 0.5646995902061462, "learning_rate": 4.43367898183491e-05, "loss": 3.1822, "mean_token_accuracy": 0.4696041285991669, "num_tokens": 46983.0, "step": 90 }, { "entropy": 2.564500260353088, "epoch": 33.333333333333336, "grad_norm": 0.8602616786956787, "learning_rate": 4.2366459261474933e-05, "loss": 3.1409, "mean_token_accuracy": 0.47821263819932935, "num_tokens": 52150.0, "step": 100 }, { "entropy": 2.5386017203330993, "epoch": 36.666666666666664, "grad_norm": 0.7394922971725464, "learning_rate": 4.0161277134970345e-05, "loss": 2.9888, "mean_token_accuracy": 0.49289675503969194, "num_tokens": 57240.0, "step": 110 }, { "entropy": 2.5600393176078797, "epoch": 40.0, "grad_norm": 0.6606329083442688, "learning_rate": 3.775106474414188e-05, "loss": 2.9445, "mean_token_accuracy": 0.5016584262251854, "num_tokens": 62806.0, "step": 120 }, { "entropy": 2.547697913646698, "epoch": 43.333333333333336, "grad_norm": 0.8845142126083374, "learning_rate": 3.516841607689501e-05, "loss": 2.9015, "mean_token_accuracy": 0.5097966089844703, "num_tokens": 67877.0, "step": 130 }, { "entropy": 2.5584489583969114, "epoch": 46.666666666666664, "grad_norm": 0.8559042811393738, "learning_rate": 3.244825702596205e-05, "loss": 2.9204, "mean_token_accuracy": 0.503237409889698, "num_tokens": 73030.0, "step": 140 }, { "entropy": 2.479199802875519, "epoch": 50.0, "grad_norm": 0.9662401080131531, "learning_rate": 2.9627373076117863e-05, "loss": 2.7848, "mean_token_accuracy": 0.5199230208992958, "num_tokens": 78133.0, "step": 150 }, { "entropy": 2.451095724105835, "epoch": 53.333333333333336, "grad_norm": 0.7248209714889526, "learning_rate": 2.674391184360313e-05, "loss": 2.72, "mean_token_accuracy": 0.530137126147747, "num_tokens": 83587.0, "step": 160 }, { "entropy": 2.4100263833999636, "epoch": 56.666666666666664, "grad_norm": 0.8213902115821838, "learning_rate": 2.3836867195056335e-05, "loss": 2.6899, "mean_token_accuracy": 0.5364036962389946, "num_tokens": 88745.0, "step": 170 }, { "entropy": 2.3743161976337435, "epoch": 60.0, "grad_norm": 0.9216499924659729, "learning_rate": 2.0945551922360818e-05, "loss": 2.6164, "mean_token_accuracy": 0.5464533418416977, "num_tokens": 93971.0, "step": 180 }, { "entropy": 2.3309614181518556, "epoch": 63.333333333333336, "grad_norm": 0.9028099179267883, "learning_rate": 1.8109066104575023e-05, "loss": 2.577, "mean_token_accuracy": 0.5494851708412171, "num_tokens": 99176.0, "step": 190 }, { "entropy": 2.3660804867744445, "epoch": 66.66666666666667, "grad_norm": 1.1050912141799927, "learning_rate": 1.5365768346438797e-05, "loss": 2.6015, "mean_token_accuracy": 0.5429237529635429, "num_tokens": 104369.0, "step": 200 }, { "entropy": 2.2412913620471953, "epoch": 70.0, "grad_norm": 1.1248013973236084, "learning_rate": 1.2752757044047827e-05, "loss": 2.4471, "mean_token_accuracy": 0.5652078241109848, "num_tokens": 109867.0, "step": 210 }, { "entropy": 2.2687363147735597, "epoch": 73.33333333333333, "grad_norm": 0.8138008117675781, "learning_rate": 1.0305368692688174e-05, "loss": 2.4754, "mean_token_accuracy": 0.5577909976243973, "num_tokens": 115316.0, "step": 220 }, { "entropy": 2.2360614061355593, "epoch": 76.66666666666667, "grad_norm": 0.9668932557106018, "learning_rate": 8.056700021356694e-06, "loss": 2.4623, "mean_token_accuracy": 0.5630308628082276, "num_tokens": 120487.0, "step": 230 }, { "entropy": 2.28078556060791, "epoch": 80.0, "grad_norm": 0.9901016354560852, "learning_rate": 6.037160416278278e-06, "loss": 2.5002, "mean_token_accuracy": 0.5567112416028976, "num_tokens": 125629.0, "step": 240 }, { "entropy": 2.2194084882736207, "epoch": 83.33333333333333, "grad_norm": 1.047966480255127, "learning_rate": 4.274060686123959e-06, "loss": 2.418, "mean_token_accuracy": 0.5643551826477051, "num_tokens": 130811.0, "step": 250 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5636363954499584e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }