{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02004008016032064,
      "grad_norm": 0.9890378713607788,
      "learning_rate": 1.9047619047619047e-06,
      "loss": 1.4301,
      "step": 5
    },
    {
      "epoch": 0.04008016032064128,
      "grad_norm": 0.863508939743042,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 1.4641,
      "step": 10
    },
    {
      "epoch": 0.06012024048096192,
      "grad_norm": 0.6787753701210022,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.4197,
      "step": 15
    },
    {
      "epoch": 0.08016032064128256,
      "grad_norm": 0.6681222319602966,
      "learning_rate": 9.047619047619047e-06,
      "loss": 1.4312,
      "step": 20
    },
    {
      "epoch": 0.10020040080160321,
      "grad_norm": 0.5663285255432129,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 1.3627,
      "step": 25
    },
    {
      "epoch": 0.12024048096192384,
      "grad_norm": 0.5837746858596802,
      "learning_rate": 1.380952380952381e-05,
      "loss": 1.3997,
      "step": 30
    },
    {
      "epoch": 0.1402805611222445,
      "grad_norm": 0.5583487153053284,
      "learning_rate": 1.619047619047619e-05,
      "loss": 1.3404,
      "step": 35
    },
    {
      "epoch": 0.16032064128256512,
      "grad_norm": 0.4913857877254486,
      "learning_rate": 1.8571428571428572e-05,
      "loss": 1.3386,
      "step": 40
    },
    {
      "epoch": 0.18036072144288579,
      "grad_norm": 0.573939323425293,
      "learning_rate": 2.095238095238095e-05,
      "loss": 1.3201,
      "step": 45
    },
    {
      "epoch": 0.20040080160320642,
      "grad_norm": 0.508334219455719,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 1.2503,
      "step": 50
    },
    {
      "epoch": 0.22044088176352705,
      "grad_norm": 0.5239691734313965,
      "learning_rate": 2.5714285714285714e-05,
      "loss": 1.2509,
      "step": 55
    },
    {
      "epoch": 0.24048096192384769,
      "grad_norm": 0.7545399069786072,
      "learning_rate": 2.8095238095238096e-05,
      "loss": 1.2877,
      "step": 60
    },
    {
      "epoch": 0.2605210420841683,
      "grad_norm": 0.4638131260871887,
      "learning_rate": 2.9999947463720068e-05,
      "loss": 1.219,
      "step": 65
    },
    {
      "epoch": 0.280561122244489,
      "grad_norm": 0.5463809370994568,
      "learning_rate": 2.9998108732563026e-05,
      "loss": 1.2131,
      "step": 70
    },
    {
      "epoch": 0.30060120240480964,
      "grad_norm": 0.5909023284912109,
      "learning_rate": 2.9993643555405922e-05,
      "loss": 1.1617,
      "step": 75
    },
    {
      "epoch": 0.32064128256513025,
      "grad_norm": 0.4846280515193939,
      "learning_rate": 2.9986552714183782e-05,
      "loss": 1.2422,
      "step": 80
    },
    {
      "epoch": 0.3406813627254509,
      "grad_norm": 0.6445205807685852,
      "learning_rate": 2.9976837450633944e-05,
      "loss": 1.1748,
      "step": 85
    },
    {
      "epoch": 0.36072144288577157,
      "grad_norm": 0.6564697623252869,
      "learning_rate": 2.996449946607859e-05,
      "loss": 1.1738,
      "step": 90
    },
    {
      "epoch": 0.3807615230460922,
      "grad_norm": 0.7153516411781311,
      "learning_rate": 2.9949540921126824e-05,
      "loss": 1.1438,
      "step": 95
    },
    {
      "epoch": 0.40080160320641284,
      "grad_norm": 0.6607214212417603,
      "learning_rate": 2.9931964435296292e-05,
      "loss": 1.1229,
      "step": 100
    },
    {
      "epoch": 0.42084168336673344,
      "grad_norm": 0.6068851351737976,
      "learning_rate": 2.991177308655447e-05,
      "loss": 1.1227,
      "step": 105
    },
    {
      "epoch": 0.4408817635270541,
      "grad_norm": 0.6417985558509827,
      "learning_rate": 2.988897041077966e-05,
      "loss": 1.1068,
      "step": 110
    },
    {
      "epoch": 0.46092184368737477,
      "grad_norm": 0.6902046203613281,
      "learning_rate": 2.9863560401141773e-05,
      "loss": 1.1214,
      "step": 115
    },
    {
      "epoch": 0.48096192384769537,
      "grad_norm": 0.6481944918632507,
      "learning_rate": 2.9835547507403067e-05,
      "loss": 1.0765,
      "step": 120
    },
    {
      "epoch": 0.501002004008016,
      "grad_norm": 0.7215368747711182,
      "learning_rate": 2.980493663513891e-05,
      "loss": 1.0368,
      "step": 125
    },
    {
      "epoch": 0.5210420841683366,
      "grad_norm": 0.7066978216171265,
      "learning_rate": 2.9771733144878706e-05,
      "loss": 0.9668,
      "step": 130
    },
    {
      "epoch": 0.5410821643286573,
      "grad_norm": 0.7489072680473328,
      "learning_rate": 2.9735942851167202e-05,
      "loss": 1.0239,
      "step": 135
    },
    {
      "epoch": 0.561122244488978,
      "grad_norm": 0.7932925224304199,
      "learning_rate": 2.9697572021546216e-05,
      "loss": 0.9795,
      "step": 140
    },
    {
      "epoch": 0.5811623246492986,
      "grad_norm": 0.9287444949150085,
      "learning_rate": 2.9656627375457102e-05,
      "loss": 0.9915,
      "step": 145
    },
    {
      "epoch": 0.6012024048096193,
      "grad_norm": 0.8538782596588135,
      "learning_rate": 2.961311608306403e-05,
      "loss": 1.0117,
      "step": 150
    },
    {
      "epoch": 0.6212424849699398,
      "grad_norm": 0.773285984992981,
      "learning_rate": 2.956704576399838e-05,
      "loss": 0.9697,
      "step": 155
    },
    {
      "epoch": 0.6412825651302605,
      "grad_norm": 0.9210479855537415,
      "learning_rate": 2.9518424486024382e-05,
      "loss": 0.9589,
      "step": 160
    },
    {
      "epoch": 0.6613226452905812,
      "grad_norm": 0.7253521680831909,
      "learning_rate": 2.9467260763626323e-05,
      "loss": 0.9559,
      "step": 165
    },
    {
      "epoch": 0.6813627254509018,
      "grad_norm": 1.1397440433502197,
      "learning_rate": 2.9413563556517483e-05,
      "loss": 0.9394,
      "step": 170
    },
    {
      "epoch": 0.7014028056112225,
      "grad_norm": 0.9524412751197815,
      "learning_rate": 2.935734226807114e-05,
      "loss": 0.9114,
      "step": 175
    },
    {
      "epoch": 0.7214428857715431,
      "grad_norm": 0.8871064186096191,
      "learning_rate": 2.9298606743673854e-05,
      "loss": 0.865,
      "step": 180
    },
    {
      "epoch": 0.7414829659318637,
      "grad_norm": 0.9255661368370056,
      "learning_rate": 2.9237367269001362e-05,
      "loss": 0.9108,
      "step": 185
    },
    {
      "epoch": 0.7615230460921844,
      "grad_norm": 0.9735074639320374,
      "learning_rate": 2.9173634568217366e-05,
      "loss": 0.8465,
      "step": 190
    },
    {
      "epoch": 0.781563126252505,
      "grad_norm": 0.9207878112792969,
      "learning_rate": 2.9107419802095546e-05,
      "loss": 0.8566,
      "step": 195
    },
    {
      "epoch": 0.8016032064128257,
      "grad_norm": 0.9188950657844543,
      "learning_rate": 2.9038734566065068e-05,
      "loss": 0.815,
      "step": 200
    },
    {
      "epoch": 0.8216432865731463,
      "grad_norm": 1.1056013107299805,
      "learning_rate": 2.8967590888180052e-05,
      "loss": 0.8299,
      "step": 205
    },
    {
      "epoch": 0.8416833667334669,
      "grad_norm": 1.0484192371368408,
      "learning_rate": 2.8894001227013213e-05,
      "loss": 0.8266,
      "step": 210
    },
    {
      "epoch": 0.8617234468937875,
      "grad_norm": 0.9629917144775391,
      "learning_rate": 2.8817978469474137e-05,
      "loss": 0.8386,
      "step": 215
    },
    {
      "epoch": 0.8817635270541082,
      "grad_norm": 0.9744315147399902,
      "learning_rate": 2.873953592855255e-05,
      "loss": 0.7977,
      "step": 220
    },
    {
      "epoch": 0.9018036072144289,
      "grad_norm": 1.0901851654052734,
      "learning_rate": 2.8658687340986962e-05,
      "loss": 0.7714,
      "step": 225
    },
    {
      "epoch": 0.9218436873747495,
      "grad_norm": 1.034933090209961,
      "learning_rate": 2.8575446864859115e-05,
      "loss": 0.8256,
      "step": 230
    },
    {
      "epoch": 0.9418837675350702,
      "grad_norm": 1.0317702293395996,
      "learning_rate": 2.8489829077114644e-05,
      "loss": 0.7881,
      "step": 235
    },
    {
      "epoch": 0.9619238476953907,
      "grad_norm": 1.095956802368164,
      "learning_rate": 2.8401848971010376e-05,
      "loss": 0.7371,
      "step": 240
    },
    {
      "epoch": 0.9819639278557114,
      "grad_norm": 1.2121639251708984,
      "learning_rate": 2.8311521953488738e-05,
      "loss": 0.7487,
      "step": 245
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.322547197341919,
      "learning_rate": 2.8218863842479712e-05,
      "loss": 0.7738,
      "step": 250
    }
  ],
  "logging_steps": 5,
  "max_steps": 1250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.345331284690862e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}