| { |
| "best_global_step": 100, |
| "best_metric": 0.03781535476446152, |
| "best_model_checkpoint": "outputs/task17_validation_overfit/checkpoint-100", |
| "epoch": 14.307692307692308, |
| "eval_steps": 10, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.6475584894418716, |
| "epoch": 0.7692307692307693, |
| "grad_norm": 2.0882303714752197, |
| "learning_rate": 0.00019999999999999998, |
| "loss": 2.3588, |
| "mean_token_accuracy": 0.4615461818873882, |
| "num_tokens": 20320.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.9301502803961437, |
| "epoch": 1.4615384615384617, |
| "grad_norm": 18.46731948852539, |
| "learning_rate": 0.0002993207883859627, |
| "loss": 1.856, |
| "mean_token_accuracy": 0.5255880124039121, |
| "num_tokens": 40053.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 1.4615384615384617, |
| "eval_entropy": 1.9869511425495148, |
| "eval_loss": 1.886334776878357, |
| "eval_mean_token_accuracy": 0.551825076341629, |
| "eval_num_tokens": 40053.0, |
| "eval_runtime": 37.77, |
| "eval_samples_per_second": 0.688, |
| "eval_steps_per_second": 0.106, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.751060888171196, |
| "epoch": 2.1538461538461537, |
| "grad_norm": 0.5462398529052734, |
| "learning_rate": 0.0002951923052094534, |
| "loss": 1.6511, |
| "mean_token_accuracy": 0.5682276897132397, |
| "num_tokens": 57906.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.4793969199061394, |
| "epoch": 2.9230769230769234, |
| "grad_norm": 0.487332284450531, |
| "learning_rate": 0.0002874162686148104, |
| "loss": 1.4654, |
| "mean_token_accuracy": 0.6025816597044468, |
| "num_tokens": 78531.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 2.9230769230769234, |
| "eval_entropy": 1.3884812891483307, |
| "eval_loss": 1.270280361175537, |
| "eval_mean_token_accuracy": 0.6523506939411163, |
| "eval_num_tokens": 78531.0, |
| "eval_runtime": 22.9108, |
| "eval_samples_per_second": 1.135, |
| "eval_steps_per_second": 0.175, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.3133439835574892, |
| "epoch": 3.6153846153846154, |
| "grad_norm": 0.639244794845581, |
| "learning_rate": 0.0002761880299246772, |
| "loss": 1.2799, |
| "mean_token_accuracy": 0.6443595662713051, |
| "num_tokens": 97104.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.154199248386754, |
| "epoch": 4.3076923076923075, |
| "grad_norm": 0.6938906311988831, |
| "learning_rate": 0.0002617896674513632, |
| "loss": 1.1023, |
| "mean_token_accuracy": 0.681688571969668, |
| "num_tokens": 116150.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 4.3076923076923075, |
| "eval_entropy": 0.9741831719875336, |
| "eval_loss": 0.9136677980422974, |
| "eval_mean_token_accuracy": 0.7347625344991684, |
| "eval_num_tokens": 116150.0, |
| "eval_runtime": 536.3859, |
| "eval_samples_per_second": 0.048, |
| "eval_steps_per_second": 0.007, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.9346346780657768, |
| "epoch": 5.0, |
| "grad_norm": 1.3876852989196777, |
| "learning_rate": 0.00024458290006267833, |
| "loss": 0.931, |
| "mean_token_accuracy": 0.7168039282162985, |
| "num_tokens": 134635.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.79690430611372, |
| "epoch": 5.769230769230769, |
| "grad_norm": 0.877850353717804, |
| "learning_rate": 0.000225, |
| "loss": 0.6813, |
| "mean_token_accuracy": 0.7895070888102055, |
| "num_tokens": 155653.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "eval_entropy": 0.5750217065215111, |
| "eval_loss": 0.520744264125824, |
| "eval_mean_token_accuracy": 0.8456338793039322, |
| "eval_num_tokens": 155653.0, |
| "eval_runtime": 67.6644, |
| "eval_samples_per_second": 0.384, |
| "eval_steps_per_second": 0.059, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.5762933008372784, |
| "epoch": 6.461538461538462, |
| "grad_norm": 0.9030962586402893, |
| "learning_rate": 0.00020353293323878074, |
| "loss": 0.5085, |
| "mean_token_accuracy": 0.8485587851868736, |
| "num_tokens": 174302.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.4478672668337822, |
| "epoch": 7.153846153846154, |
| "grad_norm": 1.0197559595108032, |
| "learning_rate": 0.0001807210002097786, |
| "loss": 0.3625, |
| "mean_token_accuracy": 0.8866865825321939, |
| "num_tokens": 192210.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 7.153846153846154, |
| "eval_entropy": 0.38245467841625214, |
| "eval_loss": 0.2492842972278595, |
| "eval_mean_token_accuracy": 0.9312677532434464, |
| "eval_num_tokens": 192210.0, |
| "eval_runtime": 1107.4416, |
| "eval_samples_per_second": 0.023, |
| "eval_steps_per_second": 0.004, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.3418547097593546, |
| "epoch": 7.923076923076923, |
| "grad_norm": 1.294338583946228, |
| "learning_rate": 0.00015713728737356137, |
| "loss": 0.2487, |
| "mean_token_accuracy": 0.924527121335268, |
| "num_tokens": 213051.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.2615162552230888, |
| "epoch": 8.615384615384615, |
| "grad_norm": 0.9452798366546631, |
| "learning_rate": 0.00013337427001484836, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9510174145301183, |
| "num_tokens": 232471.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 8.615384615384615, |
| "eval_entropy": 0.20260344073176384, |
| "eval_loss": 0.11910783499479294, |
| "eval_mean_token_accuracy": 0.9681012034416199, |
| "eval_num_tokens": 232471.0, |
| "eval_runtime": 33.7613, |
| "eval_samples_per_second": 0.77, |
| "eval_steps_per_second": 0.118, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.2021638828640183, |
| "epoch": 9.307692307692308, |
| "grad_norm": 0.7335841655731201, |
| "learning_rate": 0.00011002892794649476, |
| "loss": 0.1273, |
| "mean_token_accuracy": 0.9625586718320847, |
| "num_tokens": 250996.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.17093008839421803, |
| "epoch": 10.0, |
| "grad_norm": 1.0585453510284424, |
| "learning_rate": 8.768774804971705e-05, |
| "loss": 0.1003, |
| "mean_token_accuracy": 0.9728530612256792, |
| "num_tokens": 269270.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_entropy": 0.13076619990170002, |
| "eval_loss": 0.0703834816813469, |
| "eval_mean_token_accuracy": 0.9829248040914536, |
| "eval_num_tokens": 269270.0, |
| "eval_runtime": 82.837, |
| "eval_samples_per_second": 0.314, |
| "eval_steps_per_second": 0.048, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.1335297274403274, |
| "epoch": 10.76923076923077, |
| "grad_norm": 0.8506995439529419, |
| "learning_rate": 6.691199042008345e-05, |
| "loss": 0.0697, |
| "mean_token_accuracy": 0.9821603178977967, |
| "num_tokens": 289991.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.10489617474377155, |
| "epoch": 11.461538461538462, |
| "grad_norm": 0.5317821502685547, |
| "learning_rate": 4.8223588266430186e-05, |
| "loss": 0.056, |
| "mean_token_accuracy": 0.9858150093091859, |
| "num_tokens": 309159.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 11.461538461538462, |
| "eval_entropy": 0.08914025872945786, |
| "eval_loss": 0.048635080456733704, |
| "eval_mean_token_accuracy": 0.9872374981641769, |
| "eval_num_tokens": 309159.0, |
| "eval_runtime": 84.8378, |
| "eval_samples_per_second": 0.306, |
| "eval_steps_per_second": 0.047, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.09739745118551785, |
| "epoch": 12.153846153846153, |
| "grad_norm": 0.33153483271598816, |
| "learning_rate": 3.209203578858191e-05, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9853963330388069, |
| "num_tokens": 327369.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.08829495287500322, |
| "epoch": 12.923076923076923, |
| "grad_norm": 0.3288092613220215, |
| "learning_rate": 1.892259343953226e-05, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9884146824479103, |
| "num_tokens": 347845.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 12.923076923076923, |
| "eval_entropy": 0.07780754193663597, |
| "eval_loss": 0.04022412747144699, |
| "eval_mean_token_accuracy": 0.9897375255823135, |
| "eval_num_tokens": 347845.0, |
| "eval_runtime": 34.1594, |
| "eval_samples_per_second": 0.761, |
| "eval_steps_per_second": 0.117, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.08478014786831206, |
| "epoch": 13.615384615384615, |
| "grad_norm": 0.3467889726161957, |
| "learning_rate": 9.046106882113751e-06, |
| "loss": 0.0405, |
| "mean_token_accuracy": 0.9897107548183866, |
| "num_tokens": 366470.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.08046578098502424, |
| "epoch": 14.307692307692308, |
| "grad_norm": 0.31346267461776733, |
| "learning_rate": 2.710695410593994e-06, |
| "loss": 0.0396, |
| "mean_token_accuracy": 0.989463752342595, |
| "num_tokens": 385277.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 14.307692307692308, |
| "eval_entropy": 0.07398746535181999, |
| "eval_loss": 0.03781535476446152, |
| "eval_mean_token_accuracy": 0.9904184341430664, |
| "eval_num_tokens": 385277.0, |
| "eval_runtime": 52.9007, |
| "eval_samples_per_second": 0.491, |
| "eval_steps_per_second": 0.076, |
| "step": 100 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 105, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 15, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6483535351259136.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|