{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2675227394328518,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 7.3019086777975994,
      "learning_rate": 5e-06,
      "loss": 0.6939,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 5.150989333266983,
      "learning_rate": 1e-05,
      "loss": 0.7167,
      "step": 20
    },
    {
      "epoch": 0.02,
      "grad_norm": 3.640558809610037,
      "learning_rate": 1.5e-05,
      "loss": 0.5683,
      "step": 30
    },
    {
      "epoch": 0.02,
      "grad_norm": 7.517999397731128,
      "learning_rate": 2e-05,
      "loss": 0.5472,
      "step": 40
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.9687061679463425,
      "learning_rate": 2.5e-05,
      "loss": 0.4439,
      "step": 50
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.643479206523606,
      "learning_rate": 3e-05,
      "loss": 0.2486,
      "step": 60
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.2754773308695095,
      "learning_rate": 3.5e-05,
      "loss": 0.2217,
      "step": 70
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.7144730049127388,
      "learning_rate": 4e-05,
      "loss": 0.169,
      "step": 80
    },
    {
      "epoch": 0.05,
      "grad_norm": 3.4702829704135114,
      "learning_rate": 4.5e-05,
      "loss": 0.1994,
      "step": 90
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.3985127340985621,
      "learning_rate": 5e-05,
      "loss": 0.1612,
      "step": 100
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.375992184386137,
      "learning_rate": 4.982758620689655e-05,
      "loss": 0.1576,
      "step": 110
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.9528635753013313,
      "learning_rate": 4.9655172413793107e-05,
      "loss": 0.1393,
      "step": 120
    },
    {
      "epoch": 0.07,
      "grad_norm": 4.075169010198401,
      "learning_rate": 4.9482758620689655e-05,
      "loss": 0.1969,
      "step": 130
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.0953991165751207,
      "learning_rate": 4.931034482758621e-05,
      "loss": 0.1294,
      "step": 140
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.942660591044849,
      "learning_rate": 4.913793103448276e-05,
      "loss": 0.1306,
      "step": 150
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.1508904015728345,
      "learning_rate": 4.896551724137931e-05,
      "loss": 0.1526,
      "step": 160
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.9862795165358471,
      "learning_rate": 4.8793103448275864e-05,
      "loss": 0.1186,
      "step": 170
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.633061833817991,
      "learning_rate": 4.862068965517241e-05,
      "loss": 0.1457,
      "step": 180
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.8017052368446178,
      "learning_rate": 4.844827586206897e-05,
      "loss": 0.1234,
      "step": 190
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.1560694100709803,
      "learning_rate": 4.827586206896552e-05,
      "loss": 0.1346,
      "step": 200
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.5737689267430703,
      "learning_rate": 4.810344827586207e-05,
      "loss": 0.116,
      "step": 210
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.957864677854788,
      "learning_rate": 4.793103448275863e-05,
      "loss": 0.1692,
      "step": 220
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.215039223521855,
      "learning_rate": 4.7758620689655176e-05,
      "loss": 0.1245,
      "step": 230
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.370517239734168,
      "learning_rate": 4.7586206896551725e-05,
      "loss": 0.1476,
      "step": 240
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.7341334563022532,
      "learning_rate": 4.741379310344828e-05,
      "loss": 0.1236,
      "step": 250
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.5994298113068974,
      "learning_rate": 4.724137931034483e-05,
      "loss": 0.1161,
      "step": 260
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.5317433190951963,
      "learning_rate": 4.7068965517241385e-05,
      "loss": 0.1035,
      "step": 270
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.191977732539556,
      "learning_rate": 4.689655172413793e-05,
      "loss": 0.1427,
      "step": 280
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.6038667570691656,
      "learning_rate": 4.672413793103448e-05,
      "loss": 0.1225,
      "step": 290
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.577572731831179,
      "learning_rate": 4.655172413793104e-05,
      "loss": 0.1399,
      "step": 300
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.6199241001441385,
      "learning_rate": 4.6379310344827586e-05,
      "loss": 0.1242,
      "step": 310
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.236577821186196,
      "learning_rate": 4.6206896551724135e-05,
      "loss": 0.1656,
      "step": 320
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.7294690605254757,
      "learning_rate": 4.603448275862069e-05,
      "loss": 0.1382,
      "step": 330
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.196527516378511,
      "learning_rate": 4.586206896551724e-05,
      "loss": 0.1257,
      "step": 340
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.1057444340221463,
      "learning_rate": 4.5689655172413794e-05,
      "loss": 0.1238,
      "step": 350
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.5409556870328274,
      "learning_rate": 4.551724137931035e-05,
      "loss": 0.1383,
      "step": 360
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.5204083616874053,
      "learning_rate": 4.53448275862069e-05,
      "loss": 0.1068,
      "step": 370
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.3557725298931746,
      "learning_rate": 4.5172413793103454e-05,
      "loss": 0.1071,
      "step": 380
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.2601538460418644,
      "learning_rate": 4.5e-05,
      "loss": 0.125,
      "step": 390
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.9031725385762286,
      "learning_rate": 4.482758620689655e-05,
      "loss": 0.0991,
      "step": 400
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.3946050262183123,
      "learning_rate": 4.465517241379311e-05,
      "loss": 0.1156,
      "step": 410
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.097644875106397,
      "learning_rate": 4.4482758620689656e-05,
      "loss": 0.1366,
      "step": 420
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.37846299019108,
      "learning_rate": 4.431034482758621e-05,
      "loss": 0.126,
      "step": 430
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.8340152889320331,
      "learning_rate": 4.413793103448276e-05,
      "loss": 0.1066,
      "step": 440
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.8304505611337867,
      "learning_rate": 4.396551724137931e-05,
      "loss": 0.0868,
      "step": 450
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.550196490898523,
      "learning_rate": 4.3793103448275864e-05,
      "loss": 0.1286,
      "step": 460
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.176112247796248,
      "learning_rate": 4.362068965517241e-05,
      "loss": 0.1206,
      "step": 470
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.6589263894091213,
      "learning_rate": 4.344827586206897e-05,
      "loss": 0.1008,
      "step": 480
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.8349611508902046,
      "learning_rate": 4.327586206896552e-05,
      "loss": 0.1198,
      "step": 490
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.1218964920724126,
      "learning_rate": 4.3103448275862066e-05,
      "loss": 0.1166,
      "step": 500
    },
    {
      "epoch": 0.27,
      "eval_loss": 0.6078919172286987,
      "eval_runtime": 116.8471,
      "eval_samples_per_second": 11.288,
      "eval_steps_per_second": 2.824,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "total_flos": 14449508352000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}