{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 100.0,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.9060055136680603,
      "epoch": 3.3333333333333335,
      "grad_norm": 1.0164649486541748,
      "learning_rate": 1.5e-05,
      "loss": 4.1011,
      "mean_token_accuracy": 0.4009168431162834,
      "num_tokens": 5512.0,
      "step": 10
    },
    {
      "entropy": 1.9487085342407227,
      "epoch": 6.666666666666667,
      "grad_norm": 0.9628961682319641,
      "learning_rate": 3.1666666666666666e-05,
      "loss": 4.1923,
      "mean_token_accuracy": 0.39442635476589205,
      "num_tokens": 10326.0,
      "step": 20
    },
    {
      "entropy": 1.8993895709514619,
      "epoch": 10.0,
      "grad_norm": 1.177027702331543,
      "learning_rate": 4.8333333333333334e-05,
      "loss": 4.0334,
      "mean_token_accuracy": 0.40877673774957657,
      "num_tokens": 15462.0,
      "step": 30
    },
    {
      "entropy": 1.9950154840946197,
      "epoch": 13.333333333333334,
      "grad_norm": 1.436251163482666,
      "learning_rate": 4.9863047384206835e-05,
      "loss": 3.9342,
      "mean_token_accuracy": 0.40931957215070724,
      "num_tokens": 20681.0,
      "step": 40
    },
    {
      "entropy": 2.067208158969879,
      "epoch": 16.666666666666668,
      "grad_norm": 1.078131079673767,
      "learning_rate": 4.939155700460536e-05,
      "loss": 3.7698,
      "mean_token_accuracy": 0.42098456621170044,
      "num_tokens": 26109.0,
      "step": 50
    },
    {
      "entropy": 2.193948692083359,
      "epoch": 20.0,
      "grad_norm": 1.1934932470321655,
      "learning_rate": 4.859021265983959e-05,
      "loss": 3.6115,
      "mean_token_accuracy": 0.43236732184886933,
      "num_tokens": 31244.0,
      "step": 60
    },
    {
      "entropy": 2.4151065587997436,
      "epoch": 23.333333333333332,
      "grad_norm": 0.8595532774925232,
      "learning_rate": 4.7469851157479177e-05,
      "loss": 3.5401,
      "mean_token_accuracy": 0.42654525190591813,
      "num_tokens": 36738.0,
      "step": 70
    },
    {
      "entropy": 2.4695091366767885,
      "epoch": 26.666666666666668,
      "grad_norm": 0.9981549978256226,
      "learning_rate": 4.604562346488144e-05,
      "loss": 3.3173,
      "mean_token_accuracy": 0.4566218167543411,
      "num_tokens": 41886.0,
      "step": 80
    },
    {
      "entropy": 2.4938253283500673,
      "epoch": 30.0,
      "grad_norm": 0.5646995902061462,
      "learning_rate": 4.43367898183491e-05,
      "loss": 3.1822,
      "mean_token_accuracy": 0.4696041285991669,
      "num_tokens": 46983.0,
      "step": 90
    },
    {
      "entropy": 2.564500260353088,
      "epoch": 33.333333333333336,
      "grad_norm": 0.8602616786956787,
      "learning_rate": 4.2366459261474933e-05,
      "loss": 3.1409,
      "mean_token_accuracy": 0.47821263819932935,
      "num_tokens": 52150.0,
      "step": 100
    },
    {
      "entropy": 2.5386017203330993,
      "epoch": 36.666666666666664,
      "grad_norm": 0.7394922971725464,
      "learning_rate": 4.0161277134970345e-05,
      "loss": 2.9888,
      "mean_token_accuracy": 0.49289675503969194,
      "num_tokens": 57240.0,
      "step": 110
    },
    {
      "entropy": 2.5600393176078797,
      "epoch": 40.0,
      "grad_norm": 0.6606329083442688,
      "learning_rate": 3.775106474414188e-05,
      "loss": 2.9445,
      "mean_token_accuracy": 0.5016584262251854,
      "num_tokens": 62806.0,
      "step": 120
    },
    {
      "entropy": 2.547697913646698,
      "epoch": 43.333333333333336,
      "grad_norm": 0.8845142126083374,
      "learning_rate": 3.516841607689501e-05,
      "loss": 2.9015,
      "mean_token_accuracy": 0.5097966089844703,
      "num_tokens": 67877.0,
      "step": 130
    },
    {
      "entropy": 2.5584489583969114,
      "epoch": 46.666666666666664,
      "grad_norm": 0.8559042811393738,
      "learning_rate": 3.244825702596205e-05,
      "loss": 2.9204,
      "mean_token_accuracy": 0.503237409889698,
      "num_tokens": 73030.0,
      "step": 140
    },
    {
      "entropy": 2.479199802875519,
      "epoch": 50.0,
      "grad_norm": 0.9662401080131531,
      "learning_rate": 2.9627373076117863e-05,
      "loss": 2.7848,
      "mean_token_accuracy": 0.5199230208992958,
      "num_tokens": 78133.0,
      "step": 150
    },
    {
      "entropy": 2.451095724105835,
      "epoch": 53.333333333333336,
      "grad_norm": 0.7248209714889526,
      "learning_rate": 2.674391184360313e-05,
      "loss": 2.72,
      "mean_token_accuracy": 0.530137126147747,
      "num_tokens": 83587.0,
      "step": 160
    },
    {
      "entropy": 2.4100263833999636,
      "epoch": 56.666666666666664,
      "grad_norm": 0.8213902115821838,
      "learning_rate": 2.3836867195056335e-05,
      "loss": 2.6899,
      "mean_token_accuracy": 0.5364036962389946,
      "num_tokens": 88745.0,
      "step": 170
    },
    {
      "entropy": 2.3743161976337435,
      "epoch": 60.0,
      "grad_norm": 0.9216499924659729,
      "learning_rate": 2.0945551922360818e-05,
      "loss": 2.6164,
      "mean_token_accuracy": 0.5464533418416977,
      "num_tokens": 93971.0,
      "step": 180
    },
    {
      "entropy": 2.3309614181518556,
      "epoch": 63.333333333333336,
      "grad_norm": 0.9028099179267883,
      "learning_rate": 1.8109066104575023e-05,
      "loss": 2.577,
      "mean_token_accuracy": 0.5494851708412171,
      "num_tokens": 99176.0,
      "step": 190
    },
    {
      "entropy": 2.3660804867744445,
      "epoch": 66.66666666666667,
      "grad_norm": 1.1050912141799927,
      "learning_rate": 1.5365768346438797e-05,
      "loss": 2.6015,
      "mean_token_accuracy": 0.5429237529635429,
      "num_tokens": 104369.0,
      "step": 200
    },
    {
      "entropy": 2.2412913620471953,
      "epoch": 70.0,
      "grad_norm": 1.1248013973236084,
      "learning_rate": 1.2752757044047827e-05,
      "loss": 2.4471,
      "mean_token_accuracy": 0.5652078241109848,
      "num_tokens": 109867.0,
      "step": 210
    },
    {
      "entropy": 2.2687363147735597,
      "epoch": 73.33333333333333,
      "grad_norm": 0.8138008117675781,
      "learning_rate": 1.0305368692688174e-05,
      "loss": 2.4754,
      "mean_token_accuracy": 0.5577909976243973,
      "num_tokens": 115316.0,
      "step": 220
    },
    {
      "entropy": 2.2360614061355593,
      "epoch": 76.66666666666667,
      "grad_norm": 0.9668932557106018,
      "learning_rate": 8.056700021356694e-06,
      "loss": 2.4623,
      "mean_token_accuracy": 0.5630308628082276,
      "num_tokens": 120487.0,
      "step": 230
    },
    {
      "entropy": 2.28078556060791,
      "epoch": 80.0,
      "grad_norm": 0.9901016354560852,
      "learning_rate": 6.037160416278278e-06,
      "loss": 2.5002,
      "mean_token_accuracy": 0.5567112416028976,
      "num_tokens": 125629.0,
      "step": 240
    },
    {
      "entropy": 2.2194084882736207,
      "epoch": 83.33333333333333,
      "grad_norm": 1.047966480255127,
      "learning_rate": 4.274060686123959e-06,
      "loss": 2.418,
      "mean_token_accuracy": 0.5643551826477051,
      "num_tokens": 130811.0,
      "step": 250
    },
    {
      "entropy": 2.2408297181129457,
      "epoch": 86.66666666666667,
      "grad_norm": 1.1455246210098267,
      "learning_rate": 2.7912437301749026e-06,
      "loss": 2.4294,
      "mean_token_accuracy": 0.5679019063711166,
      "num_tokens": 135921.0,
      "step": 260
    },
    {
      "entropy": 2.2160379528999328,
      "epoch": 90.0,
      "grad_norm": 1.1439515352249146,
      "learning_rate": 1.6087621040117157e-06,
      "loss": 2.4361,
      "mean_token_accuracy": 0.5633732363581657,
      "num_tokens": 140722.0,
      "step": 270
    },
    {
      "entropy": 2.277903026342392,
      "epoch": 93.33333333333333,
      "grad_norm": 1.191327452659607,
      "learning_rate": 7.426068431000882e-07,
      "loss": 2.4947,
      "mean_token_accuracy": 0.5539214983582497,
      "num_tokens": 145887.0,
      "step": 280
    },
    {
      "entropy": 2.1880038678646088,
      "epoch": 96.66666666666667,
      "grad_norm": 1.1344318389892578,
      "learning_rate": 2.0449121146845774e-07,
      "loss": 2.3935,
      "mean_token_accuracy": 0.5718189179897308,
      "num_tokens": 150980.0,
      "step": 290
    },
    {
      "entropy": 2.2173574447631834,
      "epoch": 100.0,
      "grad_norm": 1.0592862367630005,
      "learning_rate": 1.692299905944883e-09,
      "loss": 2.4204,
      "mean_token_accuracy": 0.5628041237592697,
      "num_tokens": 156525.0,
      "step": 300
    }
  ],
  "logging_steps": 10,
  "max_steps": 300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8686345743478784e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}