| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 19173, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02607833933135138, |
| "grad_norm": 1.453355312347412, |
| "learning_rate": 4.8696083033432434e-05, |
| "loss": 4.3635, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05215667866270276, |
| "grad_norm": 2.793958902359009, |
| "learning_rate": 4.7392166066864866e-05, |
| "loss": 2.8808, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05215667866270276, |
| "eval_accuracy": 0.45157249686137624, |
| "eval_loss": 2.42598557472229, |
| "eval_runtime": 55.0824, |
| "eval_samples_per_second": 111.996, |
| "eval_steps_per_second": 3.504, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.07823501799405413, |
| "grad_norm": 1.6672502756118774, |
| "learning_rate": 4.608824910029729e-05, |
| "loss": 2.2617, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10431335732540552, |
| "grad_norm": 1.5480303764343262, |
| "learning_rate": 4.478433213372973e-05, |
| "loss": 1.9542, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10431335732540552, |
| "eval_accuracy": 0.5731481168970384, |
| "eval_loss": 1.7747852802276611, |
| "eval_runtime": 54.447, |
| "eval_samples_per_second": 113.303, |
| "eval_steps_per_second": 3.545, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1303916966567569, |
| "grad_norm": 1.4495408535003662, |
| "learning_rate": 4.348041516716216e-05, |
| "loss": 1.8025, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.15647003598810827, |
| "grad_norm": 1.5346728563308716, |
| "learning_rate": 4.2176498200594586e-05, |
| "loss": 1.7122, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15647003598810827, |
| "eval_accuracy": 0.6041759264585153, |
| "eval_loss": 1.601155400276184, |
| "eval_runtime": 54.5006, |
| "eval_samples_per_second": 113.191, |
| "eval_steps_per_second": 3.541, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.18254837531945967, |
| "grad_norm": 1.4614065885543823, |
| "learning_rate": 4.087258123402702e-05, |
| "loss": 1.6441, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.20862671465081103, |
| "grad_norm": 1.3059850931167603, |
| "learning_rate": 3.956866426745945e-05, |
| "loss": 1.5962, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.20862671465081103, |
| "eval_accuracy": 0.6221138169642397, |
| "eval_loss": 1.500780701637268, |
| "eval_runtime": 54.6417, |
| "eval_samples_per_second": 112.899, |
| "eval_steps_per_second": 3.532, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.2347050539821624, |
| "grad_norm": 1.243977427482605, |
| "learning_rate": 3.826474730089188e-05, |
| "loss": 1.5564, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.2607833933135138, |
| "grad_norm": 1.3383909463882446, |
| "learning_rate": 3.696083033432431e-05, |
| "loss": 1.5228, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2607833933135138, |
| "eval_accuracy": 0.6330674277641162, |
| "eval_loss": 1.4412604570388794, |
| "eval_runtime": 54.0686, |
| "eval_samples_per_second": 114.096, |
| "eval_steps_per_second": 3.57, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2868617326448652, |
| "grad_norm": 1.200243592262268, |
| "learning_rate": 3.5656913367756745e-05, |
| "loss": 1.495, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.31294007197621654, |
| "grad_norm": 1.1654722690582275, |
| "learning_rate": 3.435299640118918e-05, |
| "loss": 1.4706, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.31294007197621654, |
| "eval_accuracy": 0.6413079492629166, |
| "eval_loss": 1.3981218338012695, |
| "eval_runtime": 53.8081, |
| "eval_samples_per_second": 114.648, |
| "eval_steps_per_second": 3.587, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.33901841130756794, |
| "grad_norm": 1.2035291194915771, |
| "learning_rate": 3.30490794346216e-05, |
| "loss": 1.4546, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.36509675063891933, |
| "grad_norm": 1.1253894567489624, |
| "learning_rate": 3.1745162468054033e-05, |
| "loss": 1.4342, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.36509675063891933, |
| "eval_accuracy": 0.6485064619284103, |
| "eval_loss": 1.3601430654525757, |
| "eval_runtime": 54.0475, |
| "eval_samples_per_second": 114.14, |
| "eval_steps_per_second": 3.571, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3911750899702707, |
| "grad_norm": 1.087329626083374, |
| "learning_rate": 3.0441245501486465e-05, |
| "loss": 1.4142, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.41725342930162207, |
| "grad_norm": 1.2092727422714233, |
| "learning_rate": 2.9137328534918894e-05, |
| "loss": 1.4, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.41725342930162207, |
| "eval_accuracy": 0.6541551132194254, |
| "eval_loss": 1.3316493034362793, |
| "eval_runtime": 53.9118, |
| "eval_samples_per_second": 114.428, |
| "eval_steps_per_second": 3.58, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.44333176863297347, |
| "grad_norm": 1.1115912199020386, |
| "learning_rate": 2.7833411568351332e-05, |
| "loss": 1.3875, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.4694101079643248, |
| "grad_norm": 1.1426430940628052, |
| "learning_rate": 2.652949460178376e-05, |
| "loss": 1.3759, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4694101079643248, |
| "eval_accuracy": 0.6584719390475539, |
| "eval_loss": 1.3087505102157593, |
| "eval_runtime": 53.1728, |
| "eval_samples_per_second": 116.018, |
| "eval_steps_per_second": 3.63, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4954884472956762, |
| "grad_norm": 1.11874520778656, |
| "learning_rate": 2.5225577635216192e-05, |
| "loss": 1.3662, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5215667866270276, |
| "grad_norm": 1.1779828071594238, |
| "learning_rate": 2.392166066864862e-05, |
| "loss": 1.3551, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5215667866270276, |
| "eval_accuracy": 0.6621520873373268, |
| "eval_loss": 1.2907606363296509, |
| "eval_runtime": 52.8312, |
| "eval_samples_per_second": 116.768, |
| "eval_steps_per_second": 3.653, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.547645125958379, |
| "grad_norm": 1.0862423181533813, |
| "learning_rate": 2.2617743702081052e-05, |
| "loss": 1.3463, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5737234652897304, |
| "grad_norm": 1.142232894897461, |
| "learning_rate": 2.1313826735513484e-05, |
| "loss": 1.3322, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5737234652897304, |
| "eval_accuracy": 0.6656504862153292, |
| "eval_loss": 1.273500919342041, |
| "eval_runtime": 52.8062, |
| "eval_samples_per_second": 116.823, |
| "eval_steps_per_second": 3.655, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5998018046210817, |
| "grad_norm": 1.1338053941726685, |
| "learning_rate": 2.0009909768945913e-05, |
| "loss": 1.3289, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6258801439524331, |
| "grad_norm": 1.1444751024246216, |
| "learning_rate": 1.8705992802378344e-05, |
| "loss": 1.3179, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6258801439524331, |
| "eval_accuracy": 0.66850412628209, |
| "eval_loss": 1.2587136030197144, |
| "eval_runtime": 52.8166, |
| "eval_samples_per_second": 116.8, |
| "eval_steps_per_second": 3.654, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6519584832837845, |
| "grad_norm": 1.0998445749282837, |
| "learning_rate": 1.7402075835810776e-05, |
| "loss": 1.3157, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.6780368226151359, |
| "grad_norm": 1.1154826879501343, |
| "learning_rate": 1.6098158869243208e-05, |
| "loss": 1.3075, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6780368226151359, |
| "eval_accuracy": 0.6710853799156917, |
| "eval_loss": 1.2457832098007202, |
| "eval_runtime": 53.0417, |
| "eval_samples_per_second": 116.305, |
| "eval_steps_per_second": 3.639, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.7041151619464873, |
| "grad_norm": 1.135167121887207, |
| "learning_rate": 1.4794241902675638e-05, |
| "loss": 1.3038, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.7301935012778387, |
| "grad_norm": 1.1100085973739624, |
| "learning_rate": 1.349032493610807e-05, |
| "loss": 1.2997, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7301935012778387, |
| "eval_accuracy": 0.6729637212645385, |
| "eval_loss": 1.236211895942688, |
| "eval_runtime": 52.7336, |
| "eval_samples_per_second": 116.984, |
| "eval_steps_per_second": 3.66, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7562718406091901, |
| "grad_norm": 1.1251678466796875, |
| "learning_rate": 1.21864079695405e-05, |
| "loss": 1.2897, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7823501799405413, |
| "grad_norm": 1.146391749382019, |
| "learning_rate": 1.0882491002972931e-05, |
| "loss": 1.2869, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7823501799405413, |
| "eval_accuracy": 0.6746791695050157, |
| "eval_loss": 1.2276582717895508, |
| "eval_runtime": 52.7689, |
| "eval_samples_per_second": 116.906, |
| "eval_steps_per_second": 3.657, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.8084285192718927, |
| "grad_norm": 1.096614122390747, |
| "learning_rate": 9.578574036405362e-06, |
| "loss": 1.2794, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.8345068586032441, |
| "grad_norm": 1.1054586172103882, |
| "learning_rate": 8.274657069837793e-06, |
| "loss": 1.2766, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8345068586032441, |
| "eval_accuracy": 0.676915463705815, |
| "eval_loss": 1.2177897691726685, |
| "eval_runtime": 53.1333, |
| "eval_samples_per_second": 116.104, |
| "eval_steps_per_second": 3.632, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8605851979345955, |
| "grad_norm": 1.099358320236206, |
| "learning_rate": 6.970740103270223e-06, |
| "loss": 1.2751, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.8866635372659469, |
| "grad_norm": 1.1102577447891235, |
| "learning_rate": 5.666823136702655e-06, |
| "loss": 1.271, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8866635372659469, |
| "eval_accuracy": 0.6781523738263734, |
| "eval_loss": 1.2116661071777344, |
| "eval_runtime": 53.2421, |
| "eval_samples_per_second": 115.867, |
| "eval_steps_per_second": 3.625, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.9127418765972983, |
| "grad_norm": 1.1219637393951416, |
| "learning_rate": 4.362906170135086e-06, |
| "loss": 1.2648, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.9388202159286496, |
| "grad_norm": 1.1664655208587646, |
| "learning_rate": 3.058989203567517e-06, |
| "loss": 1.2624, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9388202159286496, |
| "eval_accuracy": 0.6793829456936877, |
| "eval_loss": 1.2054765224456787, |
| "eval_runtime": 54.1549, |
| "eval_samples_per_second": 113.914, |
| "eval_steps_per_second": 3.564, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.964898555260001, |
| "grad_norm": 1.1197658777236938, |
| "learning_rate": 1.7550722369999478e-06, |
| "loss": 1.2625, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.9909768945913524, |
| "grad_norm": 1.1014546155929565, |
| "learning_rate": 4.511552704323789e-07, |
| "loss": 1.2593, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9909768945913524, |
| "eval_accuracy": 0.6801362470917321, |
| "eval_loss": 1.202091932296753, |
| "eval_runtime": 52.9233, |
| "eval_samples_per_second": 116.565, |
| "eval_steps_per_second": 3.647, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 19173, |
| "total_flos": 3.20619433033728e+17, |
| "train_loss": 1.5428574307948617, |
| "train_runtime": 7427.5337, |
| "train_samples_per_second": 82.602, |
| "train_steps_per_second": 2.581 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 19173, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.20619433033728e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|