{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9878260869565216, "eval_steps": 500, "global_step": 213, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06956521739130435, "grad_norm": 1.1811336278915405, "learning_rate": 4.99565044765711e-05, "loss": 0.7566, "num_input_tokens_seen": 137920, "step": 5 }, { "epoch": 0.1391304347826087, "grad_norm": 0.6466936469078064, "learning_rate": 4.978006327248537e-05, "loss": 0.6594, "num_input_tokens_seen": 271824, "step": 10 }, { "epoch": 0.20869565217391303, "grad_norm": 0.7790480852127075, "learning_rate": 4.946891632198452e-05, "loss": 0.6611, "num_input_tokens_seen": 411200, "step": 15 }, { "epoch": 0.2782608695652174, "grad_norm": 0.6762462258338928, "learning_rate": 4.9024755036496795e-05, "loss": 0.6368, "num_input_tokens_seen": 544816, "step": 20 }, { "epoch": 0.34782608695652173, "grad_norm": 0.5914238095283508, "learning_rate": 4.8449993900474187e-05, "loss": 0.6475, "num_input_tokens_seen": 682528, "step": 25 }, { "epoch": 0.41739130434782606, "grad_norm": 0.7006980776786804, "learning_rate": 4.774775734612604e-05, "loss": 0.6333, "num_input_tokens_seen": 813456, "step": 30 }, { "epoch": 0.48695652173913045, "grad_norm": 0.6066564321517944, "learning_rate": 4.6921862768838855e-05, "loss": 0.581, "num_input_tokens_seen": 949040, "step": 35 }, { "epoch": 0.5565217391304348, "grad_norm": 0.5768821835517883, "learning_rate": 4.597679977561122e-05, "loss": 0.5915, "num_input_tokens_seen": 1085984, "step": 40 }, { "epoch": 0.6260869565217392, "grad_norm": 0.7307961583137512, "learning_rate": 4.491770577931057e-05, "loss": 0.5869, "num_input_tokens_seen": 1217520, "step": 45 }, { "epoch": 0.6956521739130435, "grad_norm": 0.6931986808776855, "learning_rate": 4.375033807142267e-05, "loss": 0.5684, "num_input_tokens_seen": 1353968, "step": 50 }, { "epoch": 0.7652173913043478, "grad_norm": 0.5990815758705139, "learning_rate": 4.2481042525107854e-05, "loss": 0.5967, "num_input_tokens_seen": 1491728, "step": 55 }, { "epoch": 0.8347826086956521, "grad_norm": 0.5957231521606445, "learning_rate": 4.111671909869582e-05, "loss": 0.5831, "num_input_tokens_seen": 1625184, "step": 60 }, { "epoch": 0.9043478260869565, "grad_norm": 0.6507815718650818, "learning_rate": 3.9664784327143955e-05, "loss": 0.5782, "num_input_tokens_seen": 1758384, "step": 65 }, { "epoch": 0.9739130434782609, "grad_norm": 0.6880366802215576, "learning_rate": 3.813313100535747e-05, "loss": 0.5842, "num_input_tokens_seen": 1893088, "step": 70 }, { "epoch": 1.0556521739130436, "grad_norm": 0.6173655390739441, "learning_rate": 3.653008528253509e-05, "loss": 0.5629, "num_input_tokens_seen": 2051200, "step": 75 }, { "epoch": 1.1252173913043477, "grad_norm": 0.7451260685920715, "learning_rate": 3.486436140077764e-05, "loss": 0.5549, "num_input_tokens_seen": 2183136, "step": 80 }, { "epoch": 1.1947826086956521, "grad_norm": 0.6699721217155457, "learning_rate": 3.3145014324002944e-05, "loss": 0.5263, "num_input_tokens_seen": 2317616, "step": 85 }, { "epoch": 1.2643478260869565, "grad_norm": 0.6695335507392883, "learning_rate": 3.1381390514678696e-05, "loss": 0.505, "num_input_tokens_seen": 2457440, "step": 90 }, { "epoch": 1.333913043478261, "grad_norm": 0.7196683287620544, "learning_rate": 2.9583077125953716e-05, "loss": 0.555, "num_input_tokens_seen": 2592320, "step": 95 }, { "epoch": 1.4034782608695653, "grad_norm": 0.6481020450592041, "learning_rate": 2.775984988538175e-05, "loss": 0.5266, "num_input_tokens_seen": 2729856, "step": 100 }, { "epoch": 1.4730434782608697, "grad_norm": 0.8577863574028015, "learning_rate": 2.592161995354479e-05, "loss": 0.5364, "num_input_tokens_seen": 2866464, "step": 105 }, { "epoch": 1.542608695652174, "grad_norm": 0.7868128418922424, "learning_rate": 2.4078380046455222e-05, "loss": 0.5215, "num_input_tokens_seen": 3003392, "step": 110 }, { "epoch": 1.6121739130434782, "grad_norm": 0.8645579218864441, "learning_rate": 2.224015011461826e-05, "loss": 0.5354, "num_input_tokens_seen": 3141984, "step": 115 }, { "epoch": 1.6817391304347826, "grad_norm": 0.6744593381881714, "learning_rate": 2.0416922874046293e-05, "loss": 0.5098, "num_input_tokens_seen": 3281264, "step": 120 }, { "epoch": 1.7513043478260868, "grad_norm": 0.808380126953125, "learning_rate": 1.8618609485321313e-05, "loss": 0.4939, "num_input_tokens_seen": 3418208, "step": 125 }, { "epoch": 1.8208695652173912, "grad_norm": 0.7624004483222961, "learning_rate": 1.6854985675997066e-05, "loss": 0.4919, "num_input_tokens_seen": 3552624, "step": 130 }, { "epoch": 1.8904347826086956, "grad_norm": 0.8070163726806641, "learning_rate": 1.5135638599222368e-05, "loss": 0.5245, "num_input_tokens_seen": 3686752, "step": 135 }, { "epoch": 1.96, "grad_norm": 0.9934524297714233, "learning_rate": 1.3469914717464916e-05, "loss": 0.5566, "num_input_tokens_seen": 3809216, "step": 140 }, { "epoch": 2.0417391304347827, "grad_norm": 0.855629563331604, "learning_rate": 1.1866868994642535e-05, "loss": 0.5672, "num_input_tokens_seen": 3971664, "step": 145 }, { "epoch": 2.111304347826087, "grad_norm": 0.8984305262565613, "learning_rate": 1.0335215672856046e-05, "loss": 0.5116, "num_input_tokens_seen": 4108736, "step": 150 }, { "epoch": 2.1808695652173915, "grad_norm": 0.9083377718925476, "learning_rate": 8.883280901304187e-06, "loss": 0.4855, "num_input_tokens_seen": 4246544, "step": 155 }, { "epoch": 2.2504347826086954, "grad_norm": 0.8181976079940796, "learning_rate": 7.518957474892149e-06, "loss": 0.4634, "num_input_tokens_seen": 4385600, "step": 160 }, { "epoch": 2.32, "grad_norm": 0.8508033156394958, "learning_rate": 6.2496619285773356e-06, "loss": 0.4784, "num_input_tokens_seen": 4515424, "step": 165 }, { "epoch": 2.3895652173913042, "grad_norm": 0.813805103302002, "learning_rate": 5.082294220689435e-06, "loss": 0.4562, "num_input_tokens_seen": 4656832, "step": 170 }, { "epoch": 2.4591304347826086, "grad_norm": 0.8060736656188965, "learning_rate": 4.023200224388787e-06, "loss": 0.4635, "num_input_tokens_seen": 4787584, "step": 175 }, { "epoch": 2.528695652173913, "grad_norm": 0.9117311239242554, "learning_rate": 3.078137231161146e-06, "loss": 0.5231, "num_input_tokens_seen": 4920640, "step": 180 }, { "epoch": 2.5982608695652174, "grad_norm": 0.751384437084198, "learning_rate": 2.2522426538739566e-06, "loss": 0.4908, "num_input_tokens_seen": 5061360, "step": 185 }, { "epoch": 2.667826086956522, "grad_norm": 0.9320093989372253, "learning_rate": 1.5500060995258137e-06, "loss": 0.5252, "num_input_tokens_seen": 5192048, "step": 190 }, { "epoch": 2.737391304347826, "grad_norm": 0.7602404356002808, "learning_rate": 9.75244963503205e-07, "loss": 0.4728, "num_input_tokens_seen": 5317552, "step": 195 }, { "epoch": 2.8069565217391306, "grad_norm": 0.7692611813545227, "learning_rate": 5.310836780154899e-07, "loss": 0.4781, "num_input_tokens_seen": 5456864, "step": 200 }, { "epoch": 2.8765217391304345, "grad_norm": 0.9036366939544678, "learning_rate": 2.1993672751463579e-07, "loss": 0.5074, "num_input_tokens_seen": 5592320, "step": 205 }, { "epoch": 2.9460869565217394, "grad_norm": 0.8938913345336914, "learning_rate": 4.3495523428899174e-08, "loss": 0.454, "num_input_tokens_seen": 5728832, "step": 210 }, { "epoch": 2.9878260869565216, "num_input_tokens_seen": 5812176, "step": 213, "total_flos": 1.3025650857335194e+17, "train_loss": 0.5454816756673821, "train_runtime": 5250.0748, "train_samples_per_second": 0.657, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 213, "num_input_tokens_seen": 5812176, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3025650857335194e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }