{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9918991899189917,
  "eval_steps": 500,
  "global_step": 831,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.036003600360036005,
      "grad_norm": 8.28188267008514,
      "learning_rate": 5e-06,
      "loss": 1.0366,
      "step": 10
    },
    {
      "epoch": 0.07200720072007201,
      "grad_norm": 2.372440318983394,
      "learning_rate": 5e-06,
      "loss": 0.911,
      "step": 20
    },
    {
      "epoch": 0.10801080108010801,
      "grad_norm": 1.5430691151708913,
      "learning_rate": 5e-06,
      "loss": 0.8737,
      "step": 30
    },
    {
      "epoch": 0.14401440144014402,
      "grad_norm": 1.6852866024979738,
      "learning_rate": 5e-06,
      "loss": 0.8448,
      "step": 40
    },
    {
      "epoch": 0.18001800180018002,
      "grad_norm": 1.0422319477358,
      "learning_rate": 5e-06,
      "loss": 0.825,
      "step": 50
    },
    {
      "epoch": 0.21602160216021601,
      "grad_norm": 0.9965056562001428,
      "learning_rate": 5e-06,
      "loss": 0.8107,
      "step": 60
    },
    {
      "epoch": 0.252025202520252,
      "grad_norm": 0.7739884302751978,
      "learning_rate": 5e-06,
      "loss": 0.7958,
      "step": 70
    },
    {
      "epoch": 0.28802880288028804,
      "grad_norm": 1.1803788120317176,
      "learning_rate": 5e-06,
      "loss": 0.7931,
      "step": 80
    },
    {
      "epoch": 0.324032403240324,
      "grad_norm": 1.0324340981283762,
      "learning_rate": 5e-06,
      "loss": 0.7803,
      "step": 90
    },
    {
      "epoch": 0.36003600360036003,
      "grad_norm": 0.9575288046763553,
      "learning_rate": 5e-06,
      "loss": 0.7766,
      "step": 100
    },
    {
      "epoch": 0.39603960396039606,
      "grad_norm": 0.8374754661300629,
      "learning_rate": 5e-06,
      "loss": 0.7723,
      "step": 110
    },
    {
      "epoch": 0.43204320432043203,
      "grad_norm": 0.7907545033736981,
      "learning_rate": 5e-06,
      "loss": 0.7647,
      "step": 120
    },
    {
      "epoch": 0.46804680468046805,
      "grad_norm": 0.7676769317854443,
      "learning_rate": 5e-06,
      "loss": 0.7671,
      "step": 130
    },
    {
      "epoch": 0.504050405040504,
      "grad_norm": 0.6051545961732036,
      "learning_rate": 5e-06,
      "loss": 0.7621,
      "step": 140
    },
    {
      "epoch": 0.54005400540054,
      "grad_norm": 0.6744258472614542,
      "learning_rate": 5e-06,
      "loss": 0.7577,
      "step": 150
    },
    {
      "epoch": 0.5760576057605761,
      "grad_norm": 0.7199869824208848,
      "learning_rate": 5e-06,
      "loss": 0.7576,
      "step": 160
    },
    {
      "epoch": 0.6120612061206121,
      "grad_norm": 0.8152068316855611,
      "learning_rate": 5e-06,
      "loss": 0.7546,
      "step": 170
    },
    {
      "epoch": 0.648064806480648,
      "grad_norm": 0.6341559903827244,
      "learning_rate": 5e-06,
      "loss": 0.756,
      "step": 180
    },
    {
      "epoch": 0.684068406840684,
      "grad_norm": 0.6367964073514584,
      "learning_rate": 5e-06,
      "loss": 0.753,
      "step": 190
    },
    {
      "epoch": 0.7200720072007201,
      "grad_norm": 0.647151007125146,
      "learning_rate": 5e-06,
      "loss": 0.7568,
      "step": 200
    },
    {
      "epoch": 0.7560756075607561,
      "grad_norm": 0.7248014312732076,
      "learning_rate": 5e-06,
      "loss": 0.7512,
      "step": 210
    },
    {
      "epoch": 0.7920792079207921,
      "grad_norm": 0.7677811996827786,
      "learning_rate": 5e-06,
      "loss": 0.7455,
      "step": 220
    },
    {
      "epoch": 0.828082808280828,
      "grad_norm": 0.6716581315266134,
      "learning_rate": 5e-06,
      "loss": 0.7442,
      "step": 230
    },
    {
      "epoch": 0.8640864086408641,
      "grad_norm": 0.6700748281170444,
      "learning_rate": 5e-06,
      "loss": 0.7463,
      "step": 240
    },
    {
      "epoch": 0.9000900090009001,
      "grad_norm": 0.7119873555561343,
      "learning_rate": 5e-06,
      "loss": 0.7433,
      "step": 250
    },
    {
      "epoch": 0.9360936093609361,
      "grad_norm": 0.7505227904025245,
      "learning_rate": 5e-06,
      "loss": 0.7417,
      "step": 260
    },
    {
      "epoch": 0.9720972097209721,
      "grad_norm": 0.6278240730412481,
      "learning_rate": 5e-06,
      "loss": 0.7392,
      "step": 270
    },
    {
      "epoch": 0.9972997299729973,
      "eval_loss": 0.7396969199180603,
      "eval_runtime": 292.6851,
      "eval_samples_per_second": 25.574,
      "eval_steps_per_second": 0.4,
      "step": 277
    },
    {
      "epoch": 1.008100810081008,
      "grad_norm": 1.160799237623972,
      "learning_rate": 5e-06,
      "loss": 0.7435,
      "step": 280
    },
    {
      "epoch": 1.0441044104410442,
      "grad_norm": 0.8183376727189909,
      "learning_rate": 5e-06,
      "loss": 0.6915,
      "step": 290
    },
    {
      "epoch": 1.08010801080108,
      "grad_norm": 0.8424246758574473,
      "learning_rate": 5e-06,
      "loss": 0.6946,
      "step": 300
    },
    {
      "epoch": 1.116111611161116,
      "grad_norm": 0.7487195203197069,
      "learning_rate": 5e-06,
      "loss": 0.6902,
      "step": 310
    },
    {
      "epoch": 1.1521152115211521,
      "grad_norm": 0.7322920833129869,
      "learning_rate": 5e-06,
      "loss": 0.6898,
      "step": 320
    },
    {
      "epoch": 1.188118811881188,
      "grad_norm": 0.9645673518101977,
      "learning_rate": 5e-06,
      "loss": 0.6853,
      "step": 330
    },
    {
      "epoch": 1.2241224122412242,
      "grad_norm": 1.4283577735373754,
      "learning_rate": 5e-06,
      "loss": 0.6863,
      "step": 340
    },
    {
      "epoch": 1.2601260126012601,
      "grad_norm": 1.0313433086608226,
      "learning_rate": 5e-06,
      "loss": 0.689,
      "step": 350
    },
    {
      "epoch": 1.296129612961296,
      "grad_norm": 0.6108529584385496,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 360
    },
    {
      "epoch": 1.3321332133213322,
      "grad_norm": 0.5789421153850809,
      "learning_rate": 5e-06,
      "loss": 0.6815,
      "step": 370
    },
    {
      "epoch": 1.368136813681368,
      "grad_norm": 0.62526363928803,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 380
    },
    {
      "epoch": 1.4041404140414042,
      "grad_norm": 0.6010956062715849,
      "learning_rate": 5e-06,
      "loss": 0.6809,
      "step": 390
    },
    {
      "epoch": 1.4401440144014401,
      "grad_norm": 0.7964635599244039,
      "learning_rate": 5e-06,
      "loss": 0.6885,
      "step": 400
    },
    {
      "epoch": 1.476147614761476,
      "grad_norm": 0.5804300574827144,
      "learning_rate": 5e-06,
      "loss": 0.6851,
      "step": 410
    },
    {
      "epoch": 1.5121512151215122,
      "grad_norm": 0.5856848362882421,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 420
    },
    {
      "epoch": 1.5481548154815483,
      "grad_norm": 0.6958357748276202,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 430
    },
    {
      "epoch": 1.5841584158415842,
      "grad_norm": 0.6768954740097681,
      "learning_rate": 5e-06,
      "loss": 0.6816,
      "step": 440
    },
    {
      "epoch": 1.6201620162016201,
      "grad_norm": 0.6656001302473941,
      "learning_rate": 5e-06,
      "loss": 0.6852,
      "step": 450
    },
    {
      "epoch": 1.656165616561656,
      "grad_norm": 0.5847297445098926,
      "learning_rate": 5e-06,
      "loss": 0.6837,
      "step": 460
    },
    {
      "epoch": 1.6921692169216922,
      "grad_norm": 0.6677237827833459,
      "learning_rate": 5e-06,
      "loss": 0.6828,
      "step": 470
    },
    {
      "epoch": 1.7281728172817283,
      "grad_norm": 0.6717288073606101,
      "learning_rate": 5e-06,
      "loss": 0.6892,
      "step": 480
    },
    {
      "epoch": 1.7641764176417642,
      "grad_norm": 0.6026508515544339,
      "learning_rate": 5e-06,
      "loss": 0.6895,
      "step": 490
    },
    {
      "epoch": 1.8001800180018002,
      "grad_norm": 0.7348397512725225,
      "learning_rate": 5e-06,
      "loss": 0.6888,
      "step": 500
    },
    {
      "epoch": 1.836183618361836,
      "grad_norm": 0.6767034208815822,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 510
    },
    {
      "epoch": 1.8721872187218722,
      "grad_norm": 0.5868328971723439,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 520
    },
    {
      "epoch": 1.9081908190819084,
      "grad_norm": 0.6657122165386851,
      "learning_rate": 5e-06,
      "loss": 0.6858,
      "step": 530
    },
    {
      "epoch": 1.9441944194419443,
      "grad_norm": 0.6843562239717784,
      "learning_rate": 5e-06,
      "loss": 0.6854,
      "step": 540
    },
    {
      "epoch": 1.9801980198019802,
      "grad_norm": 0.6325820368205368,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 550
    },
    {
      "epoch": 1.9981998199819984,
      "eval_loss": 0.7263253331184387,
      "eval_runtime": 294.6365,
      "eval_samples_per_second": 25.404,
      "eval_steps_per_second": 0.397,
      "step": 555
    },
    {
      "epoch": 2.016201620162016,
      "grad_norm": 0.8901113520582664,
      "learning_rate": 5e-06,
      "loss": 0.6805,
      "step": 560
    },
    {
      "epoch": 2.052205220522052,
      "grad_norm": 0.8312586311675406,
      "learning_rate": 5e-06,
      "loss": 0.6302,
      "step": 570
    },
    {
      "epoch": 2.0882088208820884,
      "grad_norm": 0.645523505968572,
      "learning_rate": 5e-06,
      "loss": 0.6315,
      "step": 580
    },
    {
      "epoch": 2.1242124212421243,
      "grad_norm": 0.6812801702191339,
      "learning_rate": 5e-06,
      "loss": 0.6301,
      "step": 590
    },
    {
      "epoch": 2.16021602160216,
      "grad_norm": 0.6483786618034394,
      "learning_rate": 5e-06,
      "loss": 0.6298,
      "step": 600
    },
    {
      "epoch": 2.196219621962196,
      "grad_norm": 0.6784530791335628,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 610
    },
    {
      "epoch": 2.232223222322232,
      "grad_norm": 0.60627028648818,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 620
    },
    {
      "epoch": 2.2682268226822684,
      "grad_norm": 0.6121470761934804,
      "learning_rate": 5e-06,
      "loss": 0.6336,
      "step": 630
    },
    {
      "epoch": 2.3042304230423043,
      "grad_norm": 0.7236081633441965,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 640
    },
    {
      "epoch": 2.34023402340234,
      "grad_norm": 0.8548869591277164,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 650
    },
    {
      "epoch": 2.376237623762376,
      "grad_norm": 0.647183532105941,
      "learning_rate": 5e-06,
      "loss": 0.6313,
      "step": 660
    },
    {
      "epoch": 2.412241224122412,
      "grad_norm": 0.6087053644736625,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 670
    },
    {
      "epoch": 2.4482448244824484,
      "grad_norm": 0.7533233767115554,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 680
    },
    {
      "epoch": 2.4842484248424843,
      "grad_norm": 0.8219213281281937,
      "learning_rate": 5e-06,
      "loss": 0.6376,
      "step": 690
    },
    {
      "epoch": 2.5202520252025202,
      "grad_norm": 0.5918279562239513,
      "learning_rate": 5e-06,
      "loss": 0.6358,
      "step": 700
    },
    {
      "epoch": 2.556255625562556,
      "grad_norm": 0.6812649937783365,
      "learning_rate": 5e-06,
      "loss": 0.6346,
      "step": 710
    },
    {
      "epoch": 2.592259225922592,
      "grad_norm": 0.7062046537729157,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 720
    },
    {
      "epoch": 2.6282628262826284,
      "grad_norm": 0.6513878255971934,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 730
    },
    {
      "epoch": 2.6642664266426643,
      "grad_norm": 0.6543385256924704,
      "learning_rate": 5e-06,
      "loss": 0.6346,
      "step": 740
    },
    {
      "epoch": 2.7002700270027002,
      "grad_norm": 0.737112200862458,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 750
    },
    {
      "epoch": 2.736273627362736,
      "grad_norm": 0.7994135143651142,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 760
    },
    {
      "epoch": 2.772277227722772,
      "grad_norm": 0.6219351198221807,
      "learning_rate": 5e-06,
      "loss": 0.6426,
      "step": 770
    },
    {
      "epoch": 2.8082808280828084,
      "grad_norm": 0.7158062619534662,
      "learning_rate": 5e-06,
      "loss": 0.632,
      "step": 780
    },
    {
      "epoch": 2.8442844284428443,
      "grad_norm": 0.5601405956877966,
      "learning_rate": 5e-06,
      "loss": 0.6349,
      "step": 790
    },
    {
      "epoch": 2.8802880288028803,
      "grad_norm": 0.5993882826107069,
      "learning_rate": 5e-06,
      "loss": 0.6382,
      "step": 800
    },
    {
      "epoch": 2.916291629162916,
      "grad_norm": 0.5557179063729003,
      "learning_rate": 5e-06,
      "loss": 0.6362,
      "step": 810
    },
    {
      "epoch": 2.952295229522952,
      "grad_norm": 0.5645061472428777,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 820
    },
    {
      "epoch": 2.9882988298829884,
      "grad_norm": 0.6301929405752535,
      "learning_rate": 5e-06,
      "loss": 0.6329,
      "step": 830
    },
    {
      "epoch": 2.9918991899189917,
      "eval_loss": 0.7277409434318542,
      "eval_runtime": 293.9341,
      "eval_samples_per_second": 25.465,
      "eval_steps_per_second": 0.398,
      "step": 831
    },
    {
      "epoch": 2.9918991899189917,
      "step": 831,
      "total_flos": 1391746571304960.0,
      "train_loss": 0.7029923594385278,
      "train_runtime": 48819.4117,
      "train_samples_per_second": 8.738,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 831,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1391746571304960.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}