{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9918991899189917, "eval_steps": 500, "global_step": 831, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.036003600360036005, "grad_norm": 8.28188267008514, "learning_rate": 5e-06, "loss": 1.0366, "step": 10 }, { "epoch": 0.07200720072007201, "grad_norm": 2.372440318983394, "learning_rate": 5e-06, "loss": 0.911, "step": 20 }, { "epoch": 0.10801080108010801, "grad_norm": 1.5430691151708913, "learning_rate": 5e-06, "loss": 0.8737, "step": 30 }, { "epoch": 0.14401440144014402, "grad_norm": 1.6852866024979738, "learning_rate": 5e-06, "loss": 0.8448, "step": 40 }, { "epoch": 0.18001800180018002, "grad_norm": 1.0422319477358, "learning_rate": 5e-06, "loss": 0.825, "step": 50 }, { "epoch": 0.21602160216021601, "grad_norm": 0.9965056562001428, "learning_rate": 5e-06, "loss": 0.8107, "step": 60 }, { "epoch": 0.252025202520252, "grad_norm": 0.7739884302751978, "learning_rate": 5e-06, "loss": 0.7958, "step": 70 }, { "epoch": 0.28802880288028804, "grad_norm": 1.1803788120317176, "learning_rate": 5e-06, "loss": 0.7931, "step": 80 }, { "epoch": 0.324032403240324, "grad_norm": 1.0324340981283762, "learning_rate": 5e-06, "loss": 0.7803, "step": 90 }, { "epoch": 0.36003600360036003, "grad_norm": 0.9575288046763553, "learning_rate": 5e-06, "loss": 0.7766, "step": 100 }, { "epoch": 0.39603960396039606, "grad_norm": 0.8374754661300629, "learning_rate": 5e-06, "loss": 0.7723, "step": 110 }, { "epoch": 0.43204320432043203, "grad_norm": 0.7907545033736981, "learning_rate": 5e-06, "loss": 0.7647, "step": 120 }, { "epoch": 0.46804680468046805, "grad_norm": 0.7676769317854443, "learning_rate": 5e-06, "loss": 0.7671, "step": 130 }, { "epoch": 0.504050405040504, "grad_norm": 0.6051545961732036, "learning_rate": 5e-06, "loss": 0.7621, "step": 140 }, { "epoch": 0.54005400540054, "grad_norm": 0.6744258472614542, "learning_rate": 5e-06, "loss": 0.7577, "step": 150 }, { "epoch": 0.5760576057605761, "grad_norm": 0.7199869824208848, "learning_rate": 5e-06, "loss": 0.7576, "step": 160 }, { "epoch": 0.6120612061206121, "grad_norm": 0.8152068316855611, "learning_rate": 5e-06, "loss": 0.7546, "step": 170 }, { "epoch": 0.648064806480648, "grad_norm": 0.6341559903827244, "learning_rate": 5e-06, "loss": 0.756, "step": 180 }, { "epoch": 0.684068406840684, "grad_norm": 0.6367964073514584, "learning_rate": 5e-06, "loss": 0.753, "step": 190 }, { "epoch": 0.7200720072007201, "grad_norm": 0.647151007125146, "learning_rate": 5e-06, "loss": 0.7568, "step": 200 }, { "epoch": 0.7560756075607561, "grad_norm": 0.7248014312732076, "learning_rate": 5e-06, "loss": 0.7512, "step": 210 }, { "epoch": 0.7920792079207921, "grad_norm": 0.7677811996827786, "learning_rate": 5e-06, "loss": 0.7455, "step": 220 }, { "epoch": 0.828082808280828, "grad_norm": 0.6716581315266134, "learning_rate": 5e-06, "loss": 0.7442, "step": 230 }, { "epoch": 0.8640864086408641, "grad_norm": 0.6700748281170444, "learning_rate": 5e-06, "loss": 0.7463, "step": 240 }, { "epoch": 0.9000900090009001, "grad_norm": 0.7119873555561343, "learning_rate": 5e-06, "loss": 0.7433, "step": 250 }, { "epoch": 0.9360936093609361, "grad_norm": 0.7505227904025245, "learning_rate": 5e-06, "loss": 0.7417, "step": 260 }, { "epoch": 0.9720972097209721, "grad_norm": 0.6278240730412481, "learning_rate": 5e-06, "loss": 0.7392, "step": 270 }, { "epoch": 0.9972997299729973, "eval_loss": 0.7396969199180603, "eval_runtime": 292.6851, "eval_samples_per_second": 25.574, "eval_steps_per_second": 0.4, "step": 277 }, { "epoch": 1.008100810081008, "grad_norm": 1.160799237623972, "learning_rate": 5e-06, "loss": 0.7435, "step": 280 }, { "epoch": 1.0441044104410442, "grad_norm": 0.8183376727189909, "learning_rate": 5e-06, "loss": 0.6915, "step": 290 }, { "epoch": 1.08010801080108, "grad_norm": 0.8424246758574473, "learning_rate": 5e-06, "loss": 0.6946, "step": 300 }, { "epoch": 1.116111611161116, "grad_norm": 0.7487195203197069, "learning_rate": 5e-06, "loss": 0.6902, "step": 310 }, { "epoch": 1.1521152115211521, "grad_norm": 0.7322920833129869, "learning_rate": 5e-06, "loss": 0.6898, "step": 320 }, { "epoch": 1.188118811881188, "grad_norm": 0.9645673518101977, "learning_rate": 5e-06, "loss": 0.6853, "step": 330 }, { "epoch": 1.2241224122412242, "grad_norm": 1.4283577735373754, "learning_rate": 5e-06, "loss": 0.6863, "step": 340 }, { "epoch": 1.2601260126012601, "grad_norm": 1.0313433086608226, "learning_rate": 5e-06, "loss": 0.689, "step": 350 }, { "epoch": 1.296129612961296, "grad_norm": 0.6108529584385496, "learning_rate": 5e-06, "loss": 0.6872, "step": 360 }, { "epoch": 1.3321332133213322, "grad_norm": 0.5789421153850809, "learning_rate": 5e-06, "loss": 0.6815, "step": 370 }, { "epoch": 1.368136813681368, "grad_norm": 0.62526363928803, "learning_rate": 5e-06, "loss": 0.6871, "step": 380 }, { "epoch": 1.4041404140414042, "grad_norm": 0.6010956062715849, "learning_rate": 5e-06, "loss": 0.6809, "step": 390 }, { "epoch": 1.4401440144014401, "grad_norm": 0.7964635599244039, "learning_rate": 5e-06, "loss": 0.6885, "step": 400 }, { "epoch": 1.476147614761476, "grad_norm": 0.5804300574827144, "learning_rate": 5e-06, "loss": 0.6851, "step": 410 }, { "epoch": 1.5121512151215122, "grad_norm": 0.5856848362882421, "learning_rate": 5e-06, "loss": 0.6872, "step": 420 }, { "epoch": 1.5481548154815483, "grad_norm": 0.6958357748276202, "learning_rate": 5e-06, "loss": 0.6872, "step": 430 }, { "epoch": 1.5841584158415842, "grad_norm": 0.6768954740097681, "learning_rate": 5e-06, "loss": 0.6816, "step": 440 }, { "epoch": 1.6201620162016201, "grad_norm": 0.6656001302473941, "learning_rate": 5e-06, "loss": 0.6852, "step": 450 }, { "epoch": 1.656165616561656, "grad_norm": 0.5847297445098926, "learning_rate": 5e-06, "loss": 0.6837, "step": 460 }, { "epoch": 1.6921692169216922, "grad_norm": 0.6677237827833459, "learning_rate": 5e-06, "loss": 0.6828, "step": 470 }, { "epoch": 1.7281728172817283, "grad_norm": 0.6717288073606101, "learning_rate": 5e-06, "loss": 0.6892, "step": 480 }, { "epoch": 1.7641764176417642, "grad_norm": 0.6026508515544339, "learning_rate": 5e-06, "loss": 0.6895, "step": 490 }, { "epoch": 1.8001800180018002, "grad_norm": 0.7348397512725225, "learning_rate": 5e-06, "loss": 0.6888, "step": 500 }, { "epoch": 1.836183618361836, "grad_norm": 0.6767034208815822, "learning_rate": 5e-06, "loss": 0.6822, "step": 510 }, { "epoch": 1.8721872187218722, "grad_norm": 0.5868328971723439, "learning_rate": 5e-06, "loss": 0.6856, "step": 520 }, { "epoch": 1.9081908190819084, "grad_norm": 0.6657122165386851, "learning_rate": 5e-06, "loss": 0.6858, "step": 530 }, { "epoch": 1.9441944194419443, "grad_norm": 0.6843562239717784, "learning_rate": 5e-06, "loss": 0.6854, "step": 540 }, { "epoch": 1.9801980198019802, "grad_norm": 0.6325820368205368, "learning_rate": 5e-06, "loss": 0.6894, "step": 550 }, { "epoch": 1.9981998199819984, "eval_loss": 0.7263253331184387, "eval_runtime": 294.6365, "eval_samples_per_second": 25.404, "eval_steps_per_second": 0.397, "step": 555 }, { "epoch": 2.016201620162016, "grad_norm": 0.8901113520582664, "learning_rate": 5e-06, "loss": 0.6805, "step": 560 }, { "epoch": 2.052205220522052, "grad_norm": 0.8312586311675406, "learning_rate": 5e-06, "loss": 0.6302, "step": 570 }, { "epoch": 2.0882088208820884, "grad_norm": 0.645523505968572, "learning_rate": 5e-06, "loss": 0.6315, "step": 580 }, { "epoch": 2.1242124212421243, "grad_norm": 0.6812801702191339, "learning_rate": 5e-06, "loss": 0.6301, "step": 590 }, { "epoch": 2.16021602160216, "grad_norm": 0.6483786618034394, "learning_rate": 5e-06, "loss": 0.6298, "step": 600 }, { "epoch": 2.196219621962196, "grad_norm": 0.6784530791335628, "learning_rate": 5e-06, "loss": 0.6354, "step": 610 }, { "epoch": 2.232223222322232, "grad_norm": 0.60627028648818, "learning_rate": 5e-06, "loss": 0.6319, "step": 620 }, { "epoch": 2.2682268226822684, "grad_norm": 0.6121470761934804, "learning_rate": 5e-06, "loss": 0.6336, "step": 630 }, { "epoch": 2.3042304230423043, "grad_norm": 0.7236081633441965, "learning_rate": 5e-06, "loss": 0.6345, "step": 640 }, { "epoch": 2.34023402340234, "grad_norm": 0.8548869591277164, "learning_rate": 5e-06, "loss": 0.6363, "step": 650 }, { "epoch": 2.376237623762376, "grad_norm": 0.647183532105941, "learning_rate": 5e-06, "loss": 0.6313, "step": 660 }, { "epoch": 2.412241224122412, "grad_norm": 0.6087053644736625, "learning_rate": 5e-06, "loss": 0.6357, "step": 670 }, { "epoch": 2.4482448244824484, "grad_norm": 0.7533233767115554, "learning_rate": 5e-06, "loss": 0.6383, "step": 680 }, { "epoch": 2.4842484248424843, "grad_norm": 0.8219213281281937, "learning_rate": 5e-06, "loss": 0.6376, "step": 690 }, { "epoch": 2.5202520252025202, "grad_norm": 0.5918279562239513, "learning_rate": 5e-06, "loss": 0.6358, "step": 700 }, { "epoch": 2.556255625562556, "grad_norm": 0.6812649937783365, "learning_rate": 5e-06, "loss": 0.6346, "step": 710 }, { "epoch": 2.592259225922592, "grad_norm": 0.7062046537729157, "learning_rate": 5e-06, "loss": 0.6341, "step": 720 }, { "epoch": 2.6282628262826284, "grad_norm": 0.6513878255971934, "learning_rate": 5e-06, "loss": 0.6345, "step": 730 }, { "epoch": 2.6642664266426643, "grad_norm": 0.6543385256924704, "learning_rate": 5e-06, "loss": 0.6346, "step": 740 }, { "epoch": 2.7002700270027002, "grad_norm": 0.737112200862458, "learning_rate": 5e-06, "loss": 0.6319, "step": 750 }, { "epoch": 2.736273627362736, "grad_norm": 0.7994135143651142, "learning_rate": 5e-06, "loss": 0.636, "step": 760 }, { "epoch": 2.772277227722772, "grad_norm": 0.6219351198221807, "learning_rate": 5e-06, "loss": 0.6426, "step": 770 }, { "epoch": 2.8082808280828084, "grad_norm": 0.7158062619534662, "learning_rate": 5e-06, "loss": 0.632, "step": 780 }, { "epoch": 2.8442844284428443, "grad_norm": 0.5601405956877966, "learning_rate": 5e-06, "loss": 0.6349, "step": 790 }, { "epoch": 2.8802880288028803, "grad_norm": 0.5993882826107069, "learning_rate": 5e-06, "loss": 0.6382, "step": 800 }, { "epoch": 2.916291629162916, "grad_norm": 0.5557179063729003, "learning_rate": 5e-06, "loss": 0.6362, "step": 810 }, { "epoch": 2.952295229522952, "grad_norm": 0.5645061472428777, "learning_rate": 5e-06, "loss": 0.6357, "step": 820 }, { "epoch": 2.9882988298829884, "grad_norm": 0.6301929405752535, "learning_rate": 5e-06, "loss": 0.6329, "step": 830 }, { "epoch": 2.9918991899189917, "eval_loss": 0.7277409434318542, "eval_runtime": 293.9341, "eval_samples_per_second": 25.465, "eval_steps_per_second": 0.398, "step": 831 }, { "epoch": 2.9918991899189917, "step": 831, "total_flos": 1391746571304960.0, "train_loss": 0.7029923594385278, "train_runtime": 48819.4117, "train_samples_per_second": 8.738, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 831, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1391746571304960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }