{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06369426751592357, "grad_norm": 1.8735829591751099, "learning_rate": 1.6000000000000003e-05, "loss": 3.2478, "step": 5 }, { "epoch": 0.12738853503184713, "grad_norm": 1.7458922863006592, "learning_rate": 1.9655172413793106e-05, "loss": 3.1764, "step": 10 }, { "epoch": 0.1910828025477707, "grad_norm": 1.80453622341156, "learning_rate": 1.9224137931034484e-05, "loss": 2.9433, "step": 15 }, { "epoch": 0.25477707006369427, "grad_norm": 1.8682630062103271, "learning_rate": 1.8793103448275863e-05, "loss": 2.5689, "step": 20 }, { "epoch": 0.3184713375796178, "grad_norm": 2.233153820037842, "learning_rate": 1.8362068965517245e-05, "loss": 2.1608, "step": 25 }, { "epoch": 0.3821656050955414, "grad_norm": 1.8297864198684692, "learning_rate": 1.7931034482758623e-05, "loss": 1.7308, "step": 30 }, { "epoch": 0.445859872611465, "grad_norm": 1.6991928815841675, "learning_rate": 1.7500000000000002e-05, "loss": 1.3536, "step": 35 }, { "epoch": 0.5095541401273885, "grad_norm": 1.4975662231445312, "learning_rate": 1.706896551724138e-05, "loss": 1.0481, "step": 40 }, { "epoch": 0.5732484076433121, "grad_norm": 1.4604802131652832, "learning_rate": 1.663793103448276e-05, "loss": 0.8506, "step": 45 }, { "epoch": 0.6369426751592356, "grad_norm": 0.740442156791687, "learning_rate": 1.6206896551724137e-05, "loss": 0.7286, "step": 50 }, { "epoch": 0.7006369426751592, "grad_norm": 0.5034754276275635, "learning_rate": 1.577586206896552e-05, "loss": 0.6376, "step": 55 }, { "epoch": 0.7643312101910829, "grad_norm": 0.5653797388076782, "learning_rate": 1.5344827586206898e-05, "loss": 0.5873, "step": 60 }, { "epoch": 0.8280254777070064, "grad_norm": 0.6510421633720398, "learning_rate": 1.4913793103448278e-05, "loss": 0.5896, "step": 65 }, { "epoch": 0.89171974522293, "grad_norm": 0.4169202148914337, "learning_rate": 1.4482758620689657e-05, "loss": 0.5866, "step": 70 }, { "epoch": 0.9554140127388535, "grad_norm": 0.5169458985328674, "learning_rate": 1.4051724137931035e-05, "loss": 0.6369, "step": 75 }, { "epoch": 1.0127388535031847, "grad_norm": 0.5400104522705078, "learning_rate": 1.3620689655172414e-05, "loss": 0.5814, "step": 80 }, { "epoch": 1.0764331210191083, "grad_norm": 0.44644486904144287, "learning_rate": 1.3189655172413794e-05, "loss": 0.5647, "step": 85 }, { "epoch": 1.1401273885350318, "grad_norm": 0.5076264142990112, "learning_rate": 1.2758620689655174e-05, "loss": 0.5476, "step": 90 }, { "epoch": 1.2038216560509554, "grad_norm": 0.46874696016311646, "learning_rate": 1.2327586206896553e-05, "loss": 0.4927, "step": 95 }, { "epoch": 1.267515923566879, "grad_norm": 0.5135082006454468, "learning_rate": 1.1896551724137933e-05, "loss": 0.5551, "step": 100 }, { "epoch": 1.3312101910828025, "grad_norm": 0.5747122168540955, "learning_rate": 1.1465517241379311e-05, "loss": 0.5505, "step": 105 }, { "epoch": 1.394904458598726, "grad_norm": 0.5480216145515442, "learning_rate": 1.103448275862069e-05, "loss": 0.5135, "step": 110 }, { "epoch": 1.4585987261146496, "grad_norm": 0.5516991019248962, "learning_rate": 1.060344827586207e-05, "loss": 0.5969, "step": 115 }, { "epoch": 1.5222929936305731, "grad_norm": 0.6291227340698242, "learning_rate": 1.0172413793103449e-05, "loss": 0.521, "step": 120 }, { "epoch": 1.5859872611464967, "grad_norm": 0.5867863893508911, "learning_rate": 9.741379310344829e-06, "loss": 0.5218, "step": 125 }, { "epoch": 1.6496815286624202, "grad_norm": 0.6907349824905396, "learning_rate": 9.310344827586207e-06, "loss": 0.5387, "step": 130 }, { "epoch": 1.7133757961783438, "grad_norm": 0.7726341485977173, "learning_rate": 8.879310344827588e-06, "loss": 0.4955, "step": 135 }, { "epoch": 1.7770700636942676, "grad_norm": 0.6937519907951355, "learning_rate": 8.448275862068966e-06, "loss": 0.522, "step": 140 }, { "epoch": 1.8407643312101911, "grad_norm": 0.7441688776016235, "learning_rate": 8.017241379310345e-06, "loss": 0.4902, "step": 145 }, { "epoch": 1.9044585987261147, "grad_norm": 0.6664876937866211, "learning_rate": 7.586206896551724e-06, "loss": 0.5205, "step": 150 }, { "epoch": 1.9681528662420382, "grad_norm": 0.6684409976005554, "learning_rate": 7.155172413793104e-06, "loss": 0.5223, "step": 155 }, { "epoch": 2.0254777070063694, "grad_norm": 0.7442811727523804, "learning_rate": 6.724137931034484e-06, "loss": 0.5559, "step": 160 }, { "epoch": 2.089171974522293, "grad_norm": 0.7137691974639893, "learning_rate": 6.293103448275862e-06, "loss": 0.4983, "step": 165 }, { "epoch": 2.1528662420382165, "grad_norm": 0.7240712642669678, "learning_rate": 5.862068965517242e-06, "loss": 0.5343, "step": 170 }, { "epoch": 2.21656050955414, "grad_norm": 0.8344106078147888, "learning_rate": 5.431034482758621e-06, "loss": 0.4778, "step": 175 }, { "epoch": 2.2802547770700636, "grad_norm": 0.8354288935661316, "learning_rate": 5e-06, "loss": 0.4551, "step": 180 }, { "epoch": 2.343949044585987, "grad_norm": 0.8019914627075195, "learning_rate": 4.56896551724138e-06, "loss": 0.4619, "step": 185 }, { "epoch": 2.4076433121019107, "grad_norm": 0.8863728642463684, "learning_rate": 4.137931034482759e-06, "loss": 0.4663, "step": 190 }, { "epoch": 2.4713375796178343, "grad_norm": 0.8233481645584106, "learning_rate": 3.7068965517241385e-06, "loss": 0.4788, "step": 195 }, { "epoch": 2.535031847133758, "grad_norm": 0.9060497283935547, "learning_rate": 3.2758620689655175e-06, "loss": 0.4514, "step": 200 }, { "epoch": 2.5987261146496814, "grad_norm": 0.8957809805870056, "learning_rate": 2.844827586206897e-06, "loss": 0.4729, "step": 205 }, { "epoch": 2.662420382165605, "grad_norm": 0.9156747460365295, "learning_rate": 2.4137931034482762e-06, "loss": 0.5263, "step": 210 }, { "epoch": 2.7261146496815285, "grad_norm": 0.9548827409744263, "learning_rate": 1.982758620689655e-06, "loss": 0.4613, "step": 215 }, { "epoch": 2.789808917197452, "grad_norm": 0.9629204273223877, "learning_rate": 1.5517241379310346e-06, "loss": 0.507, "step": 220 }, { "epoch": 2.853503184713376, "grad_norm": 0.9625599980354309, "learning_rate": 1.120689655172414e-06, "loss": 0.4884, "step": 225 }, { "epoch": 2.917197452229299, "grad_norm": 0.9648425579071045, "learning_rate": 6.896551724137931e-07, "loss": 0.467, "step": 230 }, { "epoch": 2.980891719745223, "grad_norm": 0.93361496925354, "learning_rate": 2.5862068965517245e-07, "loss": 0.4868, "step": 235 } ], "logging_steps": 5, "max_steps": 237, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.973223653835571e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }