{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02004008016032064, "grad_norm": 0.9890378713607788, "learning_rate": 1.9047619047619047e-06, "loss": 1.4301, "step": 5 }, { "epoch": 0.04008016032064128, "grad_norm": 0.863508939743042, "learning_rate": 4.2857142857142855e-06, "loss": 1.4641, "step": 10 }, { "epoch": 0.06012024048096192, "grad_norm": 0.6787753701210022, "learning_rate": 6.666666666666667e-06, "loss": 1.4197, "step": 15 }, { "epoch": 0.08016032064128256, "grad_norm": 0.6681222319602966, "learning_rate": 9.047619047619047e-06, "loss": 1.4312, "step": 20 }, { "epoch": 0.10020040080160321, "grad_norm": 0.5663285255432129, "learning_rate": 1.1428571428571429e-05, "loss": 1.3627, "step": 25 }, { "epoch": 0.12024048096192384, "grad_norm": 0.5837746858596802, "learning_rate": 1.380952380952381e-05, "loss": 1.3997, "step": 30 }, { "epoch": 0.1402805611222445, "grad_norm": 0.5583487153053284, "learning_rate": 1.619047619047619e-05, "loss": 1.3404, "step": 35 }, { "epoch": 0.16032064128256512, "grad_norm": 0.4913857877254486, "learning_rate": 1.8571428571428572e-05, "loss": 1.3386, "step": 40 }, { "epoch": 0.18036072144288579, "grad_norm": 0.573939323425293, "learning_rate": 2.095238095238095e-05, "loss": 1.3201, "step": 45 }, { "epoch": 0.20040080160320642, "grad_norm": 0.508334219455719, "learning_rate": 2.3333333333333336e-05, "loss": 1.2503, "step": 50 }, { "epoch": 0.22044088176352705, "grad_norm": 0.5239691734313965, "learning_rate": 2.5714285714285714e-05, "loss": 1.2509, "step": 55 }, { "epoch": 0.24048096192384769, "grad_norm": 0.7545399069786072, "learning_rate": 2.8095238095238096e-05, "loss": 1.2877, "step": 60 }, { "epoch": 0.2605210420841683, "grad_norm": 0.4638131260871887, "learning_rate": 2.9999947463720068e-05, "loss": 1.219, "step": 65 }, { "epoch": 0.280561122244489, "grad_norm": 0.5463809370994568, "learning_rate": 2.9998108732563026e-05, "loss": 1.2131, "step": 70 }, { "epoch": 0.30060120240480964, "grad_norm": 0.5909023284912109, "learning_rate": 2.9993643555405922e-05, "loss": 1.1617, "step": 75 }, { "epoch": 0.32064128256513025, "grad_norm": 0.4846280515193939, "learning_rate": 2.9986552714183782e-05, "loss": 1.2422, "step": 80 }, { "epoch": 0.3406813627254509, "grad_norm": 0.6445205807685852, "learning_rate": 2.9976837450633944e-05, "loss": 1.1748, "step": 85 }, { "epoch": 0.36072144288577157, "grad_norm": 0.6564697623252869, "learning_rate": 2.996449946607859e-05, "loss": 1.1738, "step": 90 }, { "epoch": 0.3807615230460922, "grad_norm": 0.7153516411781311, "learning_rate": 2.9949540921126824e-05, "loss": 1.1438, "step": 95 }, { "epoch": 0.40080160320641284, "grad_norm": 0.6607214212417603, "learning_rate": 2.9931964435296292e-05, "loss": 1.1229, "step": 100 }, { "epoch": 0.42084168336673344, "grad_norm": 0.6068851351737976, "learning_rate": 2.991177308655447e-05, "loss": 1.1227, "step": 105 }, { "epoch": 0.4408817635270541, "grad_norm": 0.6417985558509827, "learning_rate": 2.988897041077966e-05, "loss": 1.1068, "step": 110 }, { "epoch": 0.46092184368737477, "grad_norm": 0.6902046203613281, "learning_rate": 2.9863560401141773e-05, "loss": 1.1214, "step": 115 }, { "epoch": 0.48096192384769537, "grad_norm": 0.6481944918632507, "learning_rate": 2.9835547507403067e-05, "loss": 1.0765, "step": 120 }, { "epoch": 0.501002004008016, "grad_norm": 0.7215368747711182, "learning_rate": 2.980493663513891e-05, "loss": 1.0368, "step": 125 }, { "epoch": 0.5210420841683366, "grad_norm": 0.7066978216171265, "learning_rate": 2.9771733144878706e-05, "loss": 0.9668, "step": 130 }, { "epoch": 0.5410821643286573, "grad_norm": 0.7489072680473328, "learning_rate": 2.9735942851167202e-05, "loss": 1.0239, "step": 135 }, { "epoch": 0.561122244488978, "grad_norm": 0.7932925224304199, "learning_rate": 2.9697572021546216e-05, "loss": 0.9795, "step": 140 }, { "epoch": 0.5811623246492986, "grad_norm": 0.9287444949150085, "learning_rate": 2.9656627375457102e-05, "loss": 0.9915, "step": 145 }, { "epoch": 0.6012024048096193, "grad_norm": 0.8538782596588135, "learning_rate": 2.961311608306403e-05, "loss": 1.0117, "step": 150 }, { "epoch": 0.6212424849699398, "grad_norm": 0.773285984992981, "learning_rate": 2.956704576399838e-05, "loss": 0.9697, "step": 155 }, { "epoch": 0.6412825651302605, "grad_norm": 0.9210479855537415, "learning_rate": 2.9518424486024382e-05, "loss": 0.9589, "step": 160 }, { "epoch": 0.6613226452905812, "grad_norm": 0.7253521680831909, "learning_rate": 2.9467260763626323e-05, "loss": 0.9559, "step": 165 }, { "epoch": 0.6813627254509018, "grad_norm": 1.1397440433502197, "learning_rate": 2.9413563556517483e-05, "loss": 0.9394, "step": 170 }, { "epoch": 0.7014028056112225, "grad_norm": 0.9524412751197815, "learning_rate": 2.935734226807114e-05, "loss": 0.9114, "step": 175 }, { "epoch": 0.7214428857715431, "grad_norm": 0.8871064186096191, "learning_rate": 2.9298606743673854e-05, "loss": 0.865, "step": 180 }, { "epoch": 0.7414829659318637, "grad_norm": 0.9255661368370056, "learning_rate": 2.9237367269001362e-05, "loss": 0.9108, "step": 185 }, { "epoch": 0.7615230460921844, "grad_norm": 0.9735074639320374, "learning_rate": 2.9173634568217366e-05, "loss": 0.8465, "step": 190 }, { "epoch": 0.781563126252505, "grad_norm": 0.9207878112792969, "learning_rate": 2.9107419802095546e-05, "loss": 0.8566, "step": 195 }, { "epoch": 0.8016032064128257, "grad_norm": 0.9188950657844543, "learning_rate": 2.9038734566065068e-05, "loss": 0.815, "step": 200 }, { "epoch": 0.8216432865731463, "grad_norm": 1.1056013107299805, "learning_rate": 2.8967590888180052e-05, "loss": 0.8299, "step": 205 }, { "epoch": 0.8416833667334669, "grad_norm": 1.0484192371368408, "learning_rate": 2.8894001227013213e-05, "loss": 0.8266, "step": 210 }, { "epoch": 0.8617234468937875, "grad_norm": 0.9629917144775391, "learning_rate": 2.8817978469474137e-05, "loss": 0.8386, "step": 215 }, { "epoch": 0.8817635270541082, "grad_norm": 0.9744315147399902, "learning_rate": 2.873953592855255e-05, "loss": 0.7977, "step": 220 }, { "epoch": 0.9018036072144289, "grad_norm": 1.0901851654052734, "learning_rate": 2.8658687340986962e-05, "loss": 0.7714, "step": 225 }, { "epoch": 0.9218436873747495, "grad_norm": 1.034933090209961, "learning_rate": 2.8575446864859115e-05, "loss": 0.8256, "step": 230 }, { "epoch": 0.9418837675350702, "grad_norm": 1.0317702293395996, "learning_rate": 2.8489829077114644e-05, "loss": 0.7881, "step": 235 }, { "epoch": 0.9619238476953907, "grad_norm": 1.095956802368164, "learning_rate": 2.8401848971010376e-05, "loss": 0.7371, "step": 240 }, { "epoch": 0.9819639278557114, "grad_norm": 1.2121639251708984, "learning_rate": 2.8311521953488738e-05, "loss": 0.7487, "step": 245 }, { "epoch": 1.0, "grad_norm": 1.322547197341919, "learning_rate": 2.8218863842479712e-05, "loss": 0.7738, "step": 250 } ], "logging_steps": 5, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.345331284690862e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }