| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 1000, | |
| "global_step": 19173, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02607833933135138, | |
| "grad_norm": 1.7889361381530762, | |
| "learning_rate": 4.8696083033432434e-05, | |
| "loss": 4.3597, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.05215667866270276, | |
| "grad_norm": 4.862887859344482, | |
| "learning_rate": 4.7392166066864866e-05, | |
| "loss": 2.8882, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.05215667866270276, | |
| "eval_accuracy": 0.4476888906424723, | |
| "eval_loss": 2.4481201171875, | |
| "eval_runtime": 53.2088, | |
| "eval_samples_per_second": 115.939, | |
| "eval_steps_per_second": 3.627, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07823501799405413, | |
| "grad_norm": 2.319875478744507, | |
| "learning_rate": 4.608824910029729e-05, | |
| "loss": 2.2851, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.10431335732540552, | |
| "grad_norm": 1.8578275442123413, | |
| "learning_rate": 4.478433213372973e-05, | |
| "loss": 1.9734, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.10431335732540552, | |
| "eval_accuracy": 0.5687458831064476, | |
| "eval_loss": 1.797487497329712, | |
| "eval_runtime": 53.0283, | |
| "eval_samples_per_second": 116.334, | |
| "eval_steps_per_second": 3.64, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1303916966567569, | |
| "grad_norm": 1.4474238157272339, | |
| "learning_rate": 4.348041516716216e-05, | |
| "loss": 1.8203, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.15647003598810827, | |
| "grad_norm": 1.3132545948028564, | |
| "learning_rate": 4.2176498200594586e-05, | |
| "loss": 1.7272, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.15647003598810827, | |
| "eval_accuracy": 0.6016759609227673, | |
| "eval_loss": 1.6134130954742432, | |
| "eval_runtime": 52.5134, | |
| "eval_samples_per_second": 117.475, | |
| "eval_steps_per_second": 3.675, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.18254837531945967, | |
| "grad_norm": 1.4560322761535645, | |
| "learning_rate": 4.087258123402702e-05, | |
| "loss": 1.6575, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.20862671465081103, | |
| "grad_norm": 1.2237803936004639, | |
| "learning_rate": 3.956866426745945e-05, | |
| "loss": 1.6087, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.20862671465081103, | |
| "eval_accuracy": 0.6195010305207493, | |
| "eval_loss": 1.5135347843170166, | |
| "eval_runtime": 52.9452, | |
| "eval_samples_per_second": 116.517, | |
| "eval_steps_per_second": 3.645, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2347050539821624, | |
| "grad_norm": 1.2754594087600708, | |
| "learning_rate": 3.826474730089188e-05, | |
| "loss": 1.568, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.2607833933135138, | |
| "grad_norm": 1.2609021663665771, | |
| "learning_rate": 3.696083033432431e-05, | |
| "loss": 1.5337, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2607833933135138, | |
| "eval_accuracy": 0.6312996889343764, | |
| "eval_loss": 1.4511637687683105, | |
| "eval_runtime": 52.2902, | |
| "eval_samples_per_second": 117.976, | |
| "eval_steps_per_second": 3.691, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2868617326448652, | |
| "grad_norm": 1.185180425643921, | |
| "learning_rate": 3.5656913367756745e-05, | |
| "loss": 1.5055, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.31294007197621654, | |
| "grad_norm": 1.199750542640686, | |
| "learning_rate": 3.435299640118918e-05, | |
| "loss": 1.4808, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.31294007197621654, | |
| "eval_accuracy": 0.6399303299203424, | |
| "eval_loss": 1.4058290719985962, | |
| "eval_runtime": 52.2985, | |
| "eval_samples_per_second": 117.958, | |
| "eval_steps_per_second": 3.69, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.33901841130756794, | |
| "grad_norm": 1.1488664150238037, | |
| "learning_rate": 3.30490794346216e-05, | |
| "loss": 1.4643, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.36509675063891933, | |
| "grad_norm": 1.1860216856002808, | |
| "learning_rate": 3.1745162468054033e-05, | |
| "loss": 1.444, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.36509675063891933, | |
| "eval_accuracy": 0.6465880311277955, | |
| "eval_loss": 1.3705039024353027, | |
| "eval_runtime": 52.4523, | |
| "eval_samples_per_second": 117.612, | |
| "eval_steps_per_second": 3.68, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.3911750899702707, | |
| "grad_norm": 1.1179466247558594, | |
| "learning_rate": 3.0441245501486465e-05, | |
| "loss": 1.4237, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.41725342930162207, | |
| "grad_norm": 1.187573790550232, | |
| "learning_rate": 2.9137328534918894e-05, | |
| "loss": 1.4094, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.41725342930162207, | |
| "eval_accuracy": 0.6524089244507151, | |
| "eval_loss": 1.3407615423202515, | |
| "eval_runtime": 52.059, | |
| "eval_samples_per_second": 118.5, | |
| "eval_steps_per_second": 3.707, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.44333176863297347, | |
| "grad_norm": 1.1021448373794556, | |
| "learning_rate": 2.7833411568351332e-05, | |
| "loss": 1.3971, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.4694101079643248, | |
| "grad_norm": 1.1032530069351196, | |
| "learning_rate": 2.652949460178376e-05, | |
| "loss": 1.385, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.4694101079643248, | |
| "eval_accuracy": 0.6566206937313249, | |
| "eval_loss": 1.3190594911575317, | |
| "eval_runtime": 52.2008, | |
| "eval_samples_per_second": 118.178, | |
| "eval_steps_per_second": 3.697, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.4954884472956762, | |
| "grad_norm": 1.1370844841003418, | |
| "learning_rate": 2.5225577635216192e-05, | |
| "loss": 1.3754, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.5215667866270276, | |
| "grad_norm": 1.1347073316574097, | |
| "learning_rate": 2.392166066864862e-05, | |
| "loss": 1.364, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5215667866270276, | |
| "eval_accuracy": 0.6607722496061171, | |
| "eval_loss": 1.298751950263977, | |
| "eval_runtime": 52.8669, | |
| "eval_samples_per_second": 116.689, | |
| "eval_steps_per_second": 3.651, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.547645125958379, | |
| "grad_norm": 1.0973560810089111, | |
| "learning_rate": 2.2617743702081052e-05, | |
| "loss": 1.3556, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.5737234652897304, | |
| "grad_norm": 1.130311131477356, | |
| "learning_rate": 2.1313826735513484e-05, | |
| "loss": 1.3413, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.5737234652897304, | |
| "eval_accuracy": 0.6643421122894452, | |
| "eval_loss": 1.281341552734375, | |
| "eval_runtime": 52.871, | |
| "eval_samples_per_second": 116.68, | |
| "eval_steps_per_second": 3.65, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.5998018046210817, | |
| "grad_norm": 1.146698236465454, | |
| "learning_rate": 2.0009909768945913e-05, | |
| "loss": 1.3378, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.6258801439524331, | |
| "grad_norm": 1.1305065155029297, | |
| "learning_rate": 1.8705992802378344e-05, | |
| "loss": 1.3267, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6258801439524331, | |
| "eval_accuracy": 0.6669354086042105, | |
| "eval_loss": 1.2677369117736816, | |
| "eval_runtime": 52.3527, | |
| "eval_samples_per_second": 117.835, | |
| "eval_steps_per_second": 3.687, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6519584832837845, | |
| "grad_norm": 1.0919705629348755, | |
| "learning_rate": 1.7402075835810776e-05, | |
| "loss": 1.3242, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.6780368226151359, | |
| "grad_norm": 1.0956826210021973, | |
| "learning_rate": 1.6098158869243208e-05, | |
| "loss": 1.3161, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.6780368226151359, | |
| "eval_accuracy": 0.6696912811146832, | |
| "eval_loss": 1.253438949584961, | |
| "eval_runtime": 52.2006, | |
| "eval_samples_per_second": 118.179, | |
| "eval_steps_per_second": 3.697, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.7041151619464873, | |
| "grad_norm": 1.1230990886688232, | |
| "learning_rate": 1.4794241902675638e-05, | |
| "loss": 1.3122, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.7301935012778387, | |
| "grad_norm": 1.101837396621704, | |
| "learning_rate": 1.349032493610807e-05, | |
| "loss": 1.3083, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7301935012778387, | |
| "eval_accuracy": 0.6716822849149414, | |
| "eval_loss": 1.2439320087432861, | |
| "eval_runtime": 52.4583, | |
| "eval_samples_per_second": 117.598, | |
| "eval_steps_per_second": 3.679, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7562718406091901, | |
| "grad_norm": 1.1633756160736084, | |
| "learning_rate": 1.21864079695405e-05, | |
| "loss": 1.2988, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.7823501799405413, | |
| "grad_norm": 1.153860330581665, | |
| "learning_rate": 1.0882491002972931e-05, | |
| "loss": 1.2955, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.7823501799405413, | |
| "eval_accuracy": 0.6731286743052126, | |
| "eval_loss": 1.2366282939910889, | |
| "eval_runtime": 52.0235, | |
| "eval_samples_per_second": 118.581, | |
| "eval_steps_per_second": 3.71, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.8084285192718927, | |
| "grad_norm": 1.1294723749160767, | |
| "learning_rate": 9.578574036405362e-06, | |
| "loss": 1.2883, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.8345068586032441, | |
| "grad_norm": 1.1133977174758911, | |
| "learning_rate": 8.274657069837793e-06, | |
| "loss": 1.285, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.8345068586032441, | |
| "eval_accuracy": 0.6754039487634622, | |
| "eval_loss": 1.2262341976165771, | |
| "eval_runtime": 53.3, | |
| "eval_samples_per_second": 115.741, | |
| "eval_steps_per_second": 3.621, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.8605851979345955, | |
| "grad_norm": 1.113081693649292, | |
| "learning_rate": 6.970740103270223e-06, | |
| "loss": 1.2835, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.8866635372659469, | |
| "grad_norm": 1.0940190553665161, | |
| "learning_rate": 5.666823136702655e-06, | |
| "loss": 1.2796, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.8866635372659469, | |
| "eval_accuracy": 0.6767322881870647, | |
| "eval_loss": 1.2194445133209229, | |
| "eval_runtime": 53.2048, | |
| "eval_samples_per_second": 115.948, | |
| "eval_steps_per_second": 3.627, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.9127418765972983, | |
| "grad_norm": 1.1177361011505127, | |
| "learning_rate": 4.362906170135086e-06, | |
| "loss": 1.2731, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.9388202159286496, | |
| "grad_norm": 1.143923044204712, | |
| "learning_rate": 3.058989203567517e-06, | |
| "loss": 1.271, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.9388202159286496, | |
| "eval_accuracy": 0.6780031079624782, | |
| "eval_loss": 1.2132734060287476, | |
| "eval_runtime": 53.1787, | |
| "eval_samples_per_second": 116.005, | |
| "eval_steps_per_second": 3.629, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.964898555260001, | |
| "grad_norm": 1.1030622720718384, | |
| "learning_rate": 1.7550722369999478e-06, | |
| "loss": 1.2708, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.9909768945913524, | |
| "grad_norm": 1.1247570514678955, | |
| "learning_rate": 4.511552704323789e-07, | |
| "loss": 1.2678, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.9909768945913524, | |
| "eval_accuracy": 0.6787497541946164, | |
| "eval_loss": 1.210123062133789, | |
| "eval_runtime": 52.2652, | |
| "eval_samples_per_second": 118.033, | |
| "eval_steps_per_second": 3.693, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 19173, | |
| "total_flos": 3.20619433033728e+17, | |
| "train_loss": 1.5528178578381098, | |
| "train_runtime": 7442.3449, | |
| "train_samples_per_second": 82.437, | |
| "train_steps_per_second": 2.576 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 19173, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.20619433033728e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |