{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22168033695411218, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005542008423852805, "grad_norm": 0.07243233174085617, "learning_rate": 1.2315270935960592e-05, "loss": 1.4594, "step": 25 }, { "epoch": 0.01108401684770561, "grad_norm": 0.40484485030174255, "learning_rate": 2.4630541871921184e-05, "loss": 2.2032, "step": 50 }, { "epoch": 0.016626025271558414, "grad_norm": 0.06850667297840118, "learning_rate": 3.694581280788178e-05, "loss": 1.2931, "step": 75 }, { "epoch": 0.02216803369541122, "grad_norm": 0.4395073354244232, "learning_rate": 4.926108374384237e-05, "loss": 1.5698, "step": 100 }, { "epoch": 0.027710042119264023, "grad_norm": 0.077068030834198, "learning_rate": 6.157635467980296e-05, "loss": 1.0537, "step": 125 }, { "epoch": 0.03325205054311683, "grad_norm": 0.3282291293144226, "learning_rate": 7.389162561576355e-05, "loss": 0.9749, "step": 150 }, { "epoch": 0.03879405896696963, "grad_norm": 0.0593000203371048, "learning_rate": 8.620689655172413e-05, "loss": 0.9349, "step": 175 }, { "epoch": 0.04433606739082244, "grad_norm": 0.25612473487854004, "learning_rate": 9.852216748768474e-05, "loss": 0.8974, "step": 200 }, { "epoch": 0.04987807581467524, "grad_norm": 0.0757347121834755, "learning_rate": 0.00011083743842364534, "loss": 0.9081, "step": 225 }, { "epoch": 0.055420084238528046, "grad_norm": 0.14145499467849731, "learning_rate": 0.00012315270935960593, "loss": 0.8607, "step": 250 }, { "epoch": 0.06096209266238085, "grad_norm": 0.07710155844688416, "learning_rate": 0.00013546798029556652, "loss": 0.8973, "step": 275 }, { "epoch": 0.06650410108623366, "grad_norm": 0.14791467785835266, "learning_rate": 0.0001477832512315271, "loss": 0.7924, "step": 300 }, { "epoch": 0.07204610951008646, "grad_norm": 0.07742594182491302, "learning_rate": 0.00016009852216748767, "loss": 0.8698, "step": 325 }, { "epoch": 0.07758811793393926, "grad_norm": 0.14303487539291382, "learning_rate": 0.00017241379310344826, "loss": 0.786, "step": 350 }, { "epoch": 0.08313012635779206, "grad_norm": 0.0865108072757721, "learning_rate": 0.00018472906403940888, "loss": 0.8606, "step": 375 }, { "epoch": 0.08867213478164487, "grad_norm": 0.7533164024353027, "learning_rate": 0.00019704433497536947, "loss": 0.807, "step": 400 }, { "epoch": 0.09421414320549767, "grad_norm": 0.08325570821762085, "learning_rate": 0.00019999896617927833, "loss": 0.8635, "step": 425 }, { "epoch": 0.09975615162935048, "grad_norm": 0.1043543666601181, "learning_rate": 0.0001999944557842899, "loss": 0.7825, "step": 450 }, { "epoch": 0.10529816005320328, "grad_norm": 0.07949995994567871, "learning_rate": 0.0001999863658806385, "loss": 0.8379, "step": 475 }, { "epoch": 0.11084016847705609, "grad_norm": 0.12020070850849152, "learning_rate": 0.00019997469675791905, "loss": 0.768, "step": 500 }, { "epoch": 0.11638217690090889, "grad_norm": 0.0803595781326294, "learning_rate": 0.00019995944883385196, "loss": 0.8487, "step": 525 }, { "epoch": 0.1219241853247617, "grad_norm": 0.11509452760219574, "learning_rate": 0.0001999406226542682, "loss": 0.7787, "step": 550 }, { "epoch": 0.1274661937486145, "grad_norm": 0.07928384840488434, "learning_rate": 0.00019991821889308987, "loss": 0.8357, "step": 575 }, { "epoch": 0.1330082021724673, "grad_norm": 0.09423446655273438, "learning_rate": 0.00019989223835230606, "loss": 0.7564, "step": 600 }, { "epoch": 0.1385502105963201, "grad_norm": 0.0835939422249794, "learning_rate": 0.000199862681961944, "loss": 0.8568, "step": 625 }, { "epoch": 0.1440922190201729, "grad_norm": 0.09292898327112198, "learning_rate": 0.0001998295507800359, "loss": 0.7612, "step": 650 }, { "epoch": 0.1496342274440257, "grad_norm": 0.07704215496778488, "learning_rate": 0.00019979284599258107, "loss": 0.8263, "step": 675 }, { "epoch": 0.15517623586787851, "grad_norm": 0.10980474948883057, "learning_rate": 0.0001997525689135034, "loss": 0.7677, "step": 700 }, { "epoch": 0.16071824429173132, "grad_norm": 0.08016064018011093, "learning_rate": 0.0001997087209846043, "loss": 0.8344, "step": 725 }, { "epoch": 0.16626025271558412, "grad_norm": 0.0950881615281105, "learning_rate": 0.0001996613037755113, "loss": 0.769, "step": 750 }, { "epoch": 0.17180226113943692, "grad_norm": 0.07932984828948975, "learning_rate": 0.00019961031898362152, "loss": 0.8156, "step": 775 }, { "epoch": 0.17734426956328975, "grad_norm": 0.09336528927087784, "learning_rate": 0.00019955576843404128, "loss": 0.7767, "step": 800 }, { "epoch": 0.18288627798714255, "grad_norm": 0.08560346812009811, "learning_rate": 0.00019949765407952042, "loss": 0.8228, "step": 825 }, { "epoch": 0.18842828641099535, "grad_norm": 0.08475169539451599, "learning_rate": 0.00019943597800038267, "loss": 0.7669, "step": 850 }, { "epoch": 0.19397029483484815, "grad_norm": 0.09038034081459045, "learning_rate": 0.00019937074240445105, "loss": 0.8182, "step": 875 }, { "epoch": 0.19951230325870095, "grad_norm": 0.09195873886346817, "learning_rate": 0.0001993019496269688, "loss": 0.7598, "step": 900 }, { "epoch": 0.20505431168255375, "grad_norm": 0.08655796200037003, "learning_rate": 0.0001992296021305159, "loss": 0.8167, "step": 925 }, { "epoch": 0.21059632010640655, "grad_norm": 0.08353498578071594, "learning_rate": 0.00019915370250492084, "loss": 0.7486, "step": 950 }, { "epoch": 0.21613832853025935, "grad_norm": 0.09225723147392273, "learning_rate": 0.0001990742534671679, "loss": 0.8138, "step": 975 }, { "epoch": 0.22168033695411218, "grad_norm": 0.12104763090610504, "learning_rate": 0.00019899125786129997, "loss": 0.7153, "step": 1000 } ], "logging_steps": 25, "max_steps": 13533, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.78733273181696e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }