{
  "best_global_step": 3060,
  "best_metric": 0.44437724351882935,
  "best_model_checkpoint": "./sft_output_simpleQA\\checkpoint-3060",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 15300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16339869281045752,
      "grad_norm": 1.7589987516403198,
      "learning_rate": 1.9347712418300655e-05,
      "loss": 0.6565,
      "mean_token_accuracy": 0.8448051940202713,
      "num_tokens": 260778.0,
      "step": 500
    },
    {
      "epoch": 0.32679738562091504,
      "grad_norm": 1.7202545404434204,
      "learning_rate": 1.8694117647058824e-05,
      "loss": 0.5658,
      "mean_token_accuracy": 0.8607444787025451,
      "num_tokens": 532143.0,
      "step": 1000
    },
    {
      "epoch": 0.49019607843137253,
      "grad_norm": 0.5248209834098816,
      "learning_rate": 1.8040522875816995e-05,
      "loss": 0.4945,
      "mean_token_accuracy": 0.8776222993135452,
      "num_tokens": 794641.0,
      "step": 1500
    },
    {
      "epoch": 0.6535947712418301,
      "grad_norm": 1.153851866722107,
      "learning_rate": 1.7386928104575163e-05,
      "loss": 0.4709,
      "mean_token_accuracy": 0.8819570996761322,
      "num_tokens": 1067466.0,
      "step": 2000
    },
    {
      "epoch": 0.8169934640522876,
      "grad_norm": 2.172006607055664,
      "learning_rate": 1.6733333333333335e-05,
      "loss": 0.4283,
      "mean_token_accuracy": 0.8932461794614792,
      "num_tokens": 1332041.0,
      "step": 2500
    },
    {
      "epoch": 0.9803921568627451,
      "grad_norm": 1.1672557592391968,
      "learning_rate": 1.6079738562091506e-05,
      "loss": 0.4424,
      "mean_token_accuracy": 0.8902202717065811,
      "num_tokens": 1591566.0,
      "step": 3000
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.44437724351882935,
      "eval_mean_token_accuracy": 0.8896721144309219,
      "eval_num_tokens": 1620476.0,
      "eval_runtime": 17.1879,
      "eval_samples_per_second": 22.225,
      "eval_steps_per_second": 22.225,
      "step": 3060
    },
    {
      "epoch": 1.1437908496732025,
      "grad_norm": 1.5445764064788818,
      "learning_rate": 1.5426143790849674e-05,
      "loss": 0.2976,
      "mean_token_accuracy": 0.9238177012205124,
      "num_tokens": 1847540.0,
      "step": 3500
    },
    {
      "epoch": 1.3071895424836601,
      "grad_norm": 1.4802546501159668,
      "learning_rate": 1.4772549019607844e-05,
      "loss": 0.301,
      "mean_token_accuracy": 0.9228217270374298,
      "num_tokens": 2114882.0,
      "step": 4000
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 1.7694408893585205,
      "learning_rate": 1.4118954248366016e-05,
      "loss": 0.275,
      "mean_token_accuracy": 0.9289815596342087,
      "num_tokens": 2383543.0,
      "step": 4500
    },
    {
      "epoch": 1.6339869281045751,
      "grad_norm": 0.5041053295135498,
      "learning_rate": 1.3465359477124185e-05,
      "loss": 0.2656,
      "mean_token_accuracy": 0.9312243936061859,
      "num_tokens": 2655408.0,
      "step": 5000
    },
    {
      "epoch": 1.7973856209150327,
      "grad_norm": 1.395111083984375,
      "learning_rate": 1.2811764705882355e-05,
      "loss": 0.277,
      "mean_token_accuracy": 0.9283499264717102,
      "num_tokens": 2909181.0,
      "step": 5500
    },
    {
      "epoch": 1.9607843137254903,
      "grad_norm": 0.5925441980361938,
      "learning_rate": 1.2158169934640525e-05,
      "loss": 0.2716,
      "mean_token_accuracy": 0.9297851510047913,
      "num_tokens": 3175738.0,
      "step": 6000
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.444907546043396,
      "eval_mean_token_accuracy": 0.8944824228736119,
      "eval_num_tokens": 3240952.0,
      "eval_runtime": 16.6457,
      "eval_samples_per_second": 22.949,
      "eval_steps_per_second": 22.949,
      "step": 6120
    },
    {
      "epoch": 2.1241830065359477,
      "grad_norm": 0.823498010635376,
      "learning_rate": 1.1504575163398695e-05,
      "loss": 0.178,
      "mean_token_accuracy": 0.9529191081523896,
      "num_tokens": 3438042.0,
      "step": 6500
    },
    {
      "epoch": 2.287581699346405,
      "grad_norm": 2.165581703186035,
      "learning_rate": 1.0850980392156865e-05,
      "loss": 0.1592,
      "mean_token_accuracy": 0.9576994030475616,
      "num_tokens": 3704561.0,
      "step": 7000
    },
    {
      "epoch": 2.450980392156863,
      "grad_norm": 0.9315876960754395,
      "learning_rate": 1.0197385620915034e-05,
      "loss": 0.1613,
      "mean_token_accuracy": 0.9569584859609603,
      "num_tokens": 3962939.0,
      "step": 7500
    },
    {
      "epoch": 2.6143790849673203,
      "grad_norm": 0.6542718410491943,
      "learning_rate": 9.543790849673204e-06,
      "loss": 0.1498,
      "mean_token_accuracy": 0.9596330618858337,
      "num_tokens": 4227835.0,
      "step": 8000
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 2.1195223331451416,
      "learning_rate": 8.890196078431374e-06,
      "loss": 0.1571,
      "mean_token_accuracy": 0.9582107998132706,
      "num_tokens": 4490222.0,
      "step": 8500
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 0.9135161638259888,
      "learning_rate": 8.236601307189544e-06,
      "loss": 0.1617,
      "mean_token_accuracy": 0.9568131999969482,
      "num_tokens": 4768317.0,
      "step": 9000
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.48607146739959717,
      "eval_mean_token_accuracy": 0.8936509206032878,
      "eval_num_tokens": 4861428.0,
      "eval_runtime": 18.0359,
      "eval_samples_per_second": 21.18,
      "eval_steps_per_second": 21.18,
      "step": 9180
    },
    {
      "epoch": 3.104575163398693,
      "grad_norm": 0.5220202803611755,
      "learning_rate": 7.5830065359477136e-06,
      "loss": 0.1193,
      "mean_token_accuracy": 0.9686885585784912,
      "num_tokens": 5033546.0,
      "step": 9500
    },
    {
      "epoch": 3.2679738562091503,
      "grad_norm": 1.7922545671463013,
      "learning_rate": 6.929411764705883e-06,
      "loss": 0.0853,
      "mean_token_accuracy": 0.9778412123918533,
      "num_tokens": 5299950.0,
      "step": 10000
    },
    {
      "epoch": 3.431372549019608,
      "grad_norm": 0.9012777209281921,
      "learning_rate": 6.275816993464052e-06,
      "loss": 0.0882,
      "mean_token_accuracy": 0.9765507321357727,
      "num_tokens": 5562491.0,
      "step": 10500
    },
    {
      "epoch": 3.5947712418300655,
      "grad_norm": 0.7348634600639343,
      "learning_rate": 5.622222222222222e-06,
      "loss": 0.083,
      "mean_token_accuracy": 0.9779657925367355,
      "num_tokens": 5831882.0,
      "step": 11000
    },
    {
      "epoch": 3.758169934640523,
      "grad_norm": 1.3759405612945557,
      "learning_rate": 4.968627450980393e-06,
      "loss": 0.0859,
      "mean_token_accuracy": 0.9769166078567505,
      "num_tokens": 6089238.0,
      "step": 11500
    },
    {
      "epoch": 3.9215686274509802,
      "grad_norm": 1.0385228395462036,
      "learning_rate": 4.315032679738563e-06,
      "loss": 0.0905,
      "mean_token_accuracy": 0.9758910204172134,
      "num_tokens": 6358110.0,
      "step": 12000
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.5520662665367126,
      "eval_mean_token_accuracy": 0.8913699647518977,
      "eval_num_tokens": 6481904.0,
      "eval_runtime": 18.076,
      "eval_samples_per_second": 21.133,
      "eval_steps_per_second": 21.133,
      "step": 12240
    },
    {
      "epoch": 4.084967320261438,
      "grad_norm": 1.1012531518936157,
      "learning_rate": 3.6614379084967324e-06,
      "loss": 0.0645,
      "mean_token_accuracy": 0.9829886084794999,
      "num_tokens": 6621827.0,
      "step": 12500
    },
    {
      "epoch": 4.248366013071895,
      "grad_norm": 1.51634681224823,
      "learning_rate": 3.007843137254902e-06,
      "loss": 0.0492,
      "mean_token_accuracy": 0.987127070903778,
      "num_tokens": 6878025.0,
      "step": 13000
    },
    {
      "epoch": 4.411764705882353,
      "grad_norm": 0.771700382232666,
      "learning_rate": 2.354248366013072e-06,
      "loss": 0.0565,
      "mean_token_accuracy": 0.9855971633195877,
      "num_tokens": 7144101.0,
      "step": 13500
    },
    {
      "epoch": 4.57516339869281,
      "grad_norm": 1.2798362970352173,
      "learning_rate": 1.7006535947712418e-06,
      "loss": 0.051,
      "mean_token_accuracy": 0.9865847392082214,
      "num_tokens": 7416906.0,
      "step": 14000
    },
    {
      "epoch": 4.738562091503268,
      "grad_norm": 0.4460426867008209,
      "learning_rate": 1.0470588235294118e-06,
      "loss": 0.0511,
      "mean_token_accuracy": 0.9866908140182495,
      "num_tokens": 7684903.0,
      "step": 14500
    },
    {
      "epoch": 4.901960784313726,
      "grad_norm": 0.559505045413971,
      "learning_rate": 3.934640522875818e-07,
      "loss": 0.051,
      "mean_token_accuracy": 0.9866579355001449,
      "num_tokens": 7945394.0,
      "step": 15000
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.6016601324081421,
      "eval_mean_token_accuracy": 0.8893632412112821,
      "eval_num_tokens": 8102380.0,
      "eval_runtime": 18.1768,
      "eval_samples_per_second": 21.016,
      "eval_steps_per_second": 21.016,
      "step": 15300
    }
  ],
  "logging_steps": 500,
  "max_steps": 15300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.739895902823936e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}