{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11976047904191617,
      "grad_norm": 14.0,
      "learning_rate": 2.076923076923077e-05,
      "loss": 1.8289,
      "mean_token_accuracy": 0.683072866499424,
      "num_tokens": 587660.0,
      "step": 10
    },
    {
      "epoch": 0.23952095808383234,
      "grad_norm": 0.73046875,
      "learning_rate": 2.998391588898347e-05,
      "loss": 0.2049,
      "mean_token_accuracy": 0.9667805030941963,
      "num_tokens": 1175555.0,
      "step": 20
    },
    {
      "epoch": 0.3592814371257485,
      "grad_norm": 0.365234375,
      "learning_rate": 2.9885748985121637e-05,
      "loss": 0.0489,
      "mean_token_accuracy": 0.9926777228713035,
      "num_tokens": 1763305.0,
      "step": 30
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 0.765625,
      "learning_rate": 2.9698934686822084e-05,
      "loss": 0.0412,
      "mean_token_accuracy": 0.9933341085910797,
      "num_tokens": 2351068.0,
      "step": 40
    },
    {
      "epoch": 0.5988023952095808,
      "grad_norm": 0.3671875,
      "learning_rate": 2.9424585507859205e-05,
      "loss": 0.0293,
      "mean_token_accuracy": 0.9942759811878205,
      "num_tokens": 2938562.0,
      "step": 50
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 0.24609375,
      "learning_rate": 2.9064335248447247e-05,
      "loss": 0.0252,
      "mean_token_accuracy": 0.9948339760303497,
      "num_tokens": 3526034.0,
      "step": 60
    },
    {
      "epoch": 0.8383233532934131,
      "grad_norm": 0.20703125,
      "learning_rate": 2.8620329265657207e-05,
      "loss": 0.024,
      "mean_token_accuracy": 0.9949740320444107,
      "num_tokens": 4113493.0,
      "step": 70
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 0.2197265625,
      "learning_rate": 2.8095211697417823e-05,
      "loss": 0.0234,
      "mean_token_accuracy": 0.9949663072824478,
      "num_tokens": 4701481.0,
      "step": 80
    },
    {
      "epoch": 1.0718562874251496,
      "grad_norm": 0.22265625,
      "learning_rate": 2.7492109716184062e-05,
      "loss": 0.0212,
      "mean_token_accuracy": 0.9952872379830009,
      "num_tokens": 5250575.0,
      "step": 90
    },
    {
      "epoch": 1.1916167664670658,
      "grad_norm": 0.2001953125,
      "learning_rate": 2.6814614906045513e-05,
      "loss": 0.02,
      "mean_token_accuracy": 0.9955174118280411,
      "num_tokens": 5837944.0,
      "step": 100
    },
    {
      "epoch": 1.311377245508982,
      "grad_norm": 0.185546875,
      "learning_rate": 2.6066761874177395e-05,
      "loss": 0.0194,
      "mean_token_accuracy": 0.9955940291285514,
      "num_tokens": 6425447.0,
      "step": 110
    },
    {
      "epoch": 1.4311377245508983,
      "grad_norm": 0.1845703125,
      "learning_rate": 2.525300422400696e-05,
      "loss": 0.0192,
      "mean_token_accuracy": 0.9956039682030677,
      "num_tokens": 7013639.0,
      "step": 120
    },
    {
      "epoch": 1.5508982035928143,
      "grad_norm": 0.166015625,
      "learning_rate": 2.437818803317959e-05,
      "loss": 0.0198,
      "mean_token_accuracy": 0.9954764798283577,
      "num_tokens": 7601116.0,
      "step": 130
    },
    {
      "epoch": 1.6706586826347305,
      "grad_norm": 0.1748046875,
      "learning_rate": 2.3447522994268103e-05,
      "loss": 0.0186,
      "mean_token_accuracy": 0.99561026096344,
      "num_tokens": 8189057.0,
      "step": 140
    },
    {
      "epoch": 1.7904191616766467,
      "grad_norm": 0.201171875,
      "learning_rate": 2.2466551390087846e-05,
      "loss": 0.019,
      "mean_token_accuracy": 0.9955548778176307,
      "num_tokens": 8777083.0,
      "step": 150
    },
    {
      "epoch": 1.910179640718563,
      "grad_norm": 0.2099609375,
      "learning_rate": 2.1441115088375194e-05,
      "loss": 0.0189,
      "mean_token_accuracy": 0.9956801995635033,
      "num_tokens": 9365009.0,
      "step": 160
    },
    {
      "epoch": 2.0239520958083834,
      "grad_norm": 0.1953125,
      "learning_rate": 2.0377320752382335e-05,
      "loss": 0.0172,
      "mean_token_accuracy": 0.9959440262694108,
      "num_tokens": 9913586.0,
      "step": 170
    },
    {
      "epoch": 2.143712574850299,
      "grad_norm": 0.1630859375,
      "learning_rate": 1.9281503474565655e-05,
      "loss": 0.0165,
      "mean_token_accuracy": 0.9959981262683868,
      "num_tokens": 10501231.0,
      "step": 180
    },
    {
      "epoch": 2.2634730538922154,
      "grad_norm": 0.173828125,
      "learning_rate": 1.8160189049935895e-05,
      "loss": 0.0162,
      "mean_token_accuracy": 0.9961728557944298,
      "num_tokens": 11088786.0,
      "step": 190
    },
    {
      "epoch": 2.3832335329341316,
      "grad_norm": 0.201171875,
      "learning_rate": 1.702005511373924e-05,
      "loss": 0.0162,
      "mean_token_accuracy": 0.9960243076086044,
      "num_tokens": 11676462.0,
      "step": 200
    },
    {
      "epoch": 2.502994011976048,
      "grad_norm": 0.2001953125,
      "learning_rate": 1.5867891374901648e-05,
      "loss": 0.0152,
      "mean_token_accuracy": 0.9962586492300034,
      "num_tokens": 12264351.0,
      "step": 210
    },
    {
      "epoch": 2.622754491017964,
      "grad_norm": 0.1708984375,
      "learning_rate": 1.4710559182053653e-05,
      "loss": 0.0163,
      "mean_token_accuracy": 0.9960203751921654,
      "num_tokens": 12852542.0,
      "step": 220
    },
    {
      "epoch": 2.7425149700598803,
      "grad_norm": 0.169921875,
      "learning_rate": 1.3554950662927453e-05,
      "loss": 0.0158,
      "mean_token_accuracy": 0.9961689054965973,
      "num_tokens": 13440289.0,
      "step": 230
    },
    {
      "epoch": 2.8622754491017965,
      "grad_norm": 0.185546875,
      "learning_rate": 1.2407947680458721e-05,
      "loss": 0.0154,
      "mean_token_accuracy": 0.9962015345692634,
      "num_tokens": 14027846.0,
      "step": 240
    },
    {
      "epoch": 2.9820359281437128,
      "grad_norm": 0.177734375,
      "learning_rate": 1.1276380850017229e-05,
      "loss": 0.0153,
      "mean_token_accuracy": 0.9962929576635361,
      "num_tokens": 14615277.0,
      "step": 250
    },
    {
      "epoch": 3.095808383233533,
      "grad_norm": 0.1591796875,
      "learning_rate": 1.0166988861826149e-05,
      "loss": 0.0147,
      "mean_token_accuracy": 0.9963565531529879,
      "num_tokens": 15164712.0,
      "step": 260
    },
    {
      "epoch": 3.215568862275449,
      "grad_norm": 0.1787109375,
      "learning_rate": 9.086378350812725e-06,
      "loss": 0.0146,
      "mean_token_accuracy": 0.9963974550366401,
      "num_tokens": 15752265.0,
      "step": 270
    },
    {
      "epoch": 3.3353293413173652,
      "grad_norm": 0.1611328125,
      "learning_rate": 8.040984552872646e-06,
      "loss": 0.0144,
      "mean_token_accuracy": 0.9964193448424339,
      "num_tokens": 16340543.0,
      "step": 280
    },
    {
      "epoch": 3.4550898203592815,
      "grad_norm": 0.1748046875,
      "learning_rate": 7.037032981847467e-06,
      "loss": 0.0139,
      "mean_token_accuracy": 0.9965293854475021,
      "num_tokens": 16927958.0,
      "step": 290
    },
    {
      "epoch": 3.5748502994011977,
      "grad_norm": 0.205078125,
      "learning_rate": 6.080502355435702e-06,
      "loss": 0.0144,
      "mean_token_accuracy": 0.9964624375104905,
      "num_tokens": 17515766.0,
      "step": 300
    },
    {
      "epoch": 3.694610778443114,
      "grad_norm": 0.1728515625,
      "learning_rate": 5.177088990820725e-06,
      "loss": 0.0142,
      "mean_token_accuracy": 0.9964716985821724,
      "num_tokens": 18103351.0,
      "step": 310
    },
    {
      "epoch": 3.81437125748503,
      "grad_norm": 0.1806640625,
      "learning_rate": 4.332172882046057e-06,
      "loss": 0.0146,
      "mean_token_accuracy": 0.9963340446352958,
      "num_tokens": 18691113.0,
      "step": 320
    },
    {
      "epoch": 3.934131736526946,
      "grad_norm": 0.1826171875,
      "learning_rate": 3.5507856611536055e-06,
      "loss": 0.0142,
      "mean_token_accuracy": 0.996476624906063,
      "num_tokens": 19278565.0,
      "step": 330
    },
    {
      "epoch": 4.047904191616767,
      "grad_norm": 0.1748046875,
      "learning_rate": 2.8375806338821853e-06,
      "loss": 0.0141,
      "mean_token_accuracy": 0.9965244704171231,
      "num_tokens": 19827553.0,
      "step": 340
    },
    {
      "epoch": 4.167664670658683,
      "grad_norm": 0.1689453125,
      "learning_rate": 2.1968050683693077e-06,
      "loss": 0.014,
      "mean_token_accuracy": 0.9965354189276695,
      "num_tokens": 20415350.0,
      "step": 350
    },
    {
      "epoch": 4.287425149700598,
      "grad_norm": 0.1767578125,
      "learning_rate": 1.6322749018821415e-06,
      "loss": 0.0138,
      "mean_token_accuracy": 0.9965612664818764,
      "num_tokens": 21002648.0,
      "step": 360
    },
    {
      "epoch": 4.407185628742515,
      "grad_norm": 0.1572265625,
      "learning_rate": 1.1473520162037699e-06,
      "loss": 0.0137,
      "mean_token_accuracy": 0.9965482473373413,
      "num_tokens": 21590640.0,
      "step": 370
    },
    {
      "epoch": 4.526946107784431,
      "grad_norm": 0.166015625,
      "learning_rate": 7.449242170039994e-07,
      "loss": 0.0143,
      "mean_token_accuracy": 0.996511273086071,
      "num_tokens": 22178449.0,
      "step": 380
    },
    {
      "epoch": 4.6467065868263475,
      "grad_norm": 0.1787109375,
      "learning_rate": 4.273880364213245e-07,
      "loss": 0.0137,
      "mean_token_accuracy": 0.9965626612305641,
      "num_tokens": 22766041.0,
      "step": 390
    },
    {
      "epoch": 4.766467065868263,
      "grad_norm": 0.1748046875,
      "learning_rate": 1.966344612698101e-07,
      "loss": 0.0141,
      "mean_token_accuracy": 0.996521869301796,
      "num_tokens": 23353886.0,
      "step": 400
    },
    {
      "epoch": 4.88622754491018,
      "grad_norm": 0.1630859375,
      "learning_rate": 5.403767186210218e-08,
      "loss": 0.0141,
      "mean_token_accuracy": 0.9965488821268081,
      "num_tokens": 23941445.0,
      "step": 410
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.279296875,
      "learning_rate": 4.4685851095238595e-10,
      "loss": 0.0138,
      "mean_token_accuracy": 0.996547807204096,
      "num_tokens": 24490520.0,
      "step": 420
    }
  ],
  "logging_steps": 10,
  "max_steps": 420,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4492608056601805e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}