{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 13196, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06062443164595332, "grad_norm": 11.75, "learning_rate": 9.998590021800365e-06, "loss": 1.3697, "memory/device_reserved (GiB)": 4.98, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 200, "tokens_per_second_per_gpu": 2604.8 }, { "epoch": 0.12124886329190664, "grad_norm": 11.625, "learning_rate": 9.987143623124135e-06, "loss": 1.3194, "memory/device_reserved (GiB)": 4.98, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 400, "tokens_per_second_per_gpu": 2728.55 }, { "epoch": 0.18187329493785995, "grad_norm": 5.1875, "learning_rate": 9.964219627493663e-06, "loss": 1.3388, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 6.73, "memory/max_allocated (GiB)": 6.73, "step": 600, "tokens_per_second_per_gpu": 1955.81 }, { "epoch": 0.24249772658381327, "grad_norm": 9.1875, "learning_rate": 9.929870793041122e-06, "loss": 1.3009, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 800, "tokens_per_second_per_gpu": 2947.03 }, { "epoch": 0.30312215822976657, "grad_norm": 6.59375, "learning_rate": 9.884176171445684e-06, "loss": 1.2999, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 1000, "tokens_per_second_per_gpu": 1335.52 }, { "epoch": 0.3637465898757199, "grad_norm": 8.8125, "learning_rate": 9.82724092600107e-06, "loss": 1.2861, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 1200, "tokens_per_second_per_gpu": 2361.09 }, { "epoch": 0.4243710215216732, "grad_norm": 9.1875, "learning_rate": 9.759196089588842e-06, "loss": 1.2967, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 1400, "tokens_per_second_per_gpu": 2718.04 }, { "epoch": 0.48499545316762654, "grad_norm": 9.125, "learning_rate": 9.68019826311448e-06, "loss": 1.2745, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 1600, "tokens_per_second_per_gpu": 3015.46 }, { "epoch": 0.5456198848135799, "grad_norm": 8.625, "learning_rate": 9.59042925510027e-06, "loss": 1.283, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 1800, "tokens_per_second_per_gpu": 3223.33 }, { "epoch": 0.6062443164595331, "grad_norm": 9.375, "learning_rate": 9.49009566326443e-06, "loss": 1.3139, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 2000, "tokens_per_second_per_gpu": 2673.11 }, { "epoch": 0.6668687481054865, "grad_norm": 7.71875, "learning_rate": 9.37942839904948e-06, "loss": 1.3725, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 2200, "tokens_per_second_per_gpu": 1867.41 }, { "epoch": 0.7274931797514398, "grad_norm": 7.84375, "learning_rate": 9.258682156194094e-06, "loss": 1.3003, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 2400, "tokens_per_second_per_gpu": 2598.4 }, { "epoch": 0.7881176113973931, "grad_norm": 10.6875, "learning_rate": 9.128134824571508e-06, "loss": 1.325, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 2600, "tokens_per_second_per_gpu": 2643.37 }, { "epoch": 0.8487420430433464, "grad_norm": 11.4375, "learning_rate": 8.988086850643474e-06, "loss": 1.3052, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 2800, "tokens_per_second_per_gpu": 3103.11 }, { "epoch": 0.9093664746892998, "grad_norm": 9.3125, "learning_rate": 8.838860546001651e-06, "loss": 1.3247, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 3000, "tokens_per_second_per_gpu": 2654.71 }, { "epoch": 0.9699909063352531, "grad_norm": 10.875, "learning_rate": 8.680799345587778e-06, "loss": 1.2945, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 3200, "tokens_per_second_per_gpu": 2909.79 }, { "epoch": 1.0306153379812064, "grad_norm": 8.25, "learning_rate": 8.514267017299784e-06, "loss": 1.2225, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 3400, "tokens_per_second_per_gpu": 3081.31 }, { "epoch": 1.0912397696271596, "grad_norm": 7.6875, "learning_rate": 8.339646824802882e-06, "loss": 1.1383, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 3600, "tokens_per_second_per_gpu": 1820.96 }, { "epoch": 1.151864201273113, "grad_norm": 9.125, "learning_rate": 8.15734064547238e-06, "loss": 1.0931, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 3800, "tokens_per_second_per_gpu": 2334.09 }, { "epoch": 1.2124886329190665, "grad_norm": 8.8125, "learning_rate": 7.967768045498217e-06, "loss": 1.0925, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 4000, "tokens_per_second_per_gpu": 2451.13 }, { "epoch": 1.2731130645650197, "grad_norm": 6.90625, "learning_rate": 7.771365314279794e-06, "loss": 1.0901, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 4200, "tokens_per_second_per_gpu": 1827.05 }, { "epoch": 1.333737496210973, "grad_norm": 21.25, "learning_rate": 7.568584460333408e-06, "loss": 1.1244, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 4400, "tokens_per_second_per_gpu": 3097.57 }, { "epoch": 1.3943619278569264, "grad_norm": 8.9375, "learning_rate": 7.3598921710231106e-06, "loss": 1.1705, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 4600, "tokens_per_second_per_gpu": 2846.72 }, { "epoch": 1.4549863595028796, "grad_norm": 18.75, "learning_rate": 7.1457687385091415e-06, "loss": 1.1692, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 4800, "tokens_per_second_per_gpu": 1278.69 }, { "epoch": 1.515610791148833, "grad_norm": 11.875, "learning_rate": 6.926706954385761e-06, "loss": 1.1225, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 5000, "tokens_per_second_per_gpu": 2819.93 }, { "epoch": 1.5762352227947862, "grad_norm": 9.1875, "learning_rate": 6.7032109755524384e-06, "loss": 1.155, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 5200, "tokens_per_second_per_gpu": 2809.42 }, { "epoch": 1.6368596544407397, "grad_norm": 7.96875, "learning_rate": 6.475795163928501e-06, "loss": 1.153, "memory/device_reserved (GiB)": 6.92, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 5400, "tokens_per_second_per_gpu": 2595.08 }, { "epoch": 1.697484086086693, "grad_norm": 7.21875, "learning_rate": 6.244982902681568e-06, "loss": 1.1702, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 6.73, "memory/max_allocated (GiB)": 6.73, "step": 5600, "tokens_per_second_per_gpu": 2620.26 }, { "epoch": 1.758108517732646, "grad_norm": 7.84375, "learning_rate": 6.011305391694152e-06, "loss": 1.1159, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 5800, "tokens_per_second_per_gpu": 2988.73 }, { "epoch": 1.8187329493785995, "grad_norm": 4.96875, "learning_rate": 5.775300425040592e-06, "loss": 1.1151, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 6000, "tokens_per_second_per_gpu": 2139.4 }, { "epoch": 1.879357381024553, "grad_norm": 7.90625, "learning_rate": 5.537511153287856e-06, "loss": 1.1135, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 6200, "tokens_per_second_per_gpu": 3019.1 }, { "epoch": 1.9399818126705062, "grad_norm": 7.96875, "learning_rate": 5.298484833468716e-06, "loss": 1.1488, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 6400, "tokens_per_second_per_gpu": 2381.51 }, { "epoch": 2.0006062443164594, "grad_norm": 10.4375, "learning_rate": 5.0587715696041685e-06, "loss": 1.1651, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 6600, "tokens_per_second_per_gpu": 3025.08 }, { "epoch": 2.061230675962413, "grad_norm": 9.0, "learning_rate": 4.8189230466736585e-06, "loss": 1.0324, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 6800, "tokens_per_second_per_gpu": 2687.95 }, { "epoch": 2.1218551076083663, "grad_norm": 8.9375, "learning_rate": 4.579491260946856e-06, "loss": 0.9765, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 7000, "tokens_per_second_per_gpu": 2745.94 }, { "epoch": 2.1824795392543193, "grad_norm": 9.875, "learning_rate": 4.341027249598999e-06, "loss": 1.0136, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 7200, "tokens_per_second_per_gpu": 2545.45 }, { "epoch": 2.2431039709002727, "grad_norm": 8.1875, "learning_rate": 4.104079822533548e-06, "loss": 1.0314, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 6.73, "memory/max_allocated (GiB)": 6.73, "step": 7400, "tokens_per_second_per_gpu": 3023.1 }, { "epoch": 2.303728402546226, "grad_norm": 7.90625, "learning_rate": 3.869194299330752e-06, "loss": 1.0273, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 7600, "tokens_per_second_per_gpu": 2507.53 }, { "epoch": 2.3643528341921796, "grad_norm": 8.3125, "learning_rate": 3.63691125422898e-06, "loss": 1.0177, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 7800, "tokens_per_second_per_gpu": 2346.84 }, { "epoch": 2.424977265838133, "grad_norm": 43.75, "learning_rate": 3.407765272027156e-06, "loss": 1.0256, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 8000, "tokens_per_second_per_gpu": 2946.41 }, { "epoch": 2.485601697484086, "grad_norm": 8.9375, "learning_rate": 3.1822837177715192e-06, "loss": 1.0005, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 8200, "tokens_per_second_per_gpu": 2745.12 }, { "epoch": 2.5462261291300394, "grad_norm": 10.375, "learning_rate": 2.960985523058174e-06, "loss": 1.0119, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 8400, "tokens_per_second_per_gpu": 2990.09 }, { "epoch": 2.606850560775993, "grad_norm": 5.3125, "learning_rate": 2.744379991744713e-06, "loss": 1.0293, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 8600, "tokens_per_second_per_gpu": 2229.96 }, { "epoch": 2.667474992421946, "grad_norm": 6.75, "learning_rate": 2.5329656278194625e-06, "loss": 1.0065, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 8800, "tokens_per_second_per_gpu": 1817.5 }, { "epoch": 2.7280994240678993, "grad_norm": 9.75, "learning_rate": 2.327228988125946e-06, "loss": 1.0057, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 9000, "tokens_per_second_per_gpu": 2842.21 }, { "epoch": 2.7887238557138527, "grad_norm": 9.5625, "learning_rate": 2.1276435625829543e-06, "loss": 0.9829, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 9200, "tokens_per_second_per_gpu": 3086.49 }, { "epoch": 2.849348287359806, "grad_norm": 9.5, "learning_rate": 1.9346686844773253e-06, "loss": 1.0178, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 9400, "tokens_per_second_per_gpu": 2586.56 }, { "epoch": 2.909972719005759, "grad_norm": 7.53125, "learning_rate": 1.7487484733373017e-06, "loss": 1.0099, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 9600, "tokens_per_second_per_gpu": 1847.53 }, { "epoch": 2.9705971506517126, "grad_norm": 12.5, "learning_rate": 1.5703108128194145e-06, "loss": 1.0223, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 9800, "tokens_per_second_per_gpu": 2763.65 }, { "epoch": 3.031221582297666, "grad_norm": 8.0625, "learning_rate": 1.3997663659611982e-06, "loss": 0.9828, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 10000, "tokens_per_second_per_gpu": 2827.68 }, { "epoch": 3.0918460139436195, "grad_norm": 11.75, "learning_rate": 1.2375076300660677e-06, "loss": 0.9693, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 10200, "tokens_per_second_per_gpu": 2538.93 }, { "epoch": 3.1524704455895725, "grad_norm": 4.78125, "learning_rate": 1.083908033395496e-06, "loss": 0.9715, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 10400, "tokens_per_second_per_gpu": 2718.63 }, { "epoch": 3.213094877235526, "grad_norm": 9.75, "learning_rate": 9.393210757473959e-07, "loss": 1.0057, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 10600, "tokens_per_second_per_gpu": 2702.32 }, { "epoch": 3.2737193088814793, "grad_norm": 8.25, "learning_rate": 8.040795148985875e-07, "loss": 0.9761, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 10800, "tokens_per_second_per_gpu": 2937.95 }, { "epoch": 3.3343437405274328, "grad_norm": 7.53125, "learning_rate": 6.784946007837395e-07, "loss": 0.9284, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 11000, "tokens_per_second_per_gpu": 2385.64 }, { "epoch": 3.3949681721733858, "grad_norm": 6.875, "learning_rate": 5.628553591732478e-07, "loss": 0.9631, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 5.61, "memory/max_allocated (GiB)": 5.61, "step": 11200, "tokens_per_second_per_gpu": 2425.36 }, { "epoch": 3.455592603819339, "grad_norm": 8.0625, "learning_rate": 4.57427926498627e-07, "loss": 0.986, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 11400, "tokens_per_second_per_gpu": 2226.29 }, { "epoch": 3.5162170354652926, "grad_norm": 9.875, "learning_rate": 3.6245493735626913e-07, "loss": 1.0002, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 11600, "tokens_per_second_per_gpu": 2925.67 }, { "epoch": 3.5768414671112456, "grad_norm": 9.8125, "learning_rate": 2.7815496609921053e-07, "loss": 0.9356, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 11800, "tokens_per_second_per_gpu": 2927.99 }, { "epoch": 3.637465898757199, "grad_norm": 11.4375, "learning_rate": 2.047220238020181e-07, "loss": 1.0038, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 12000, "tokens_per_second_per_gpu": 2644.97 }, { "epoch": 3.6980903304031525, "grad_norm": 8.6875, "learning_rate": 1.4232511175652108e-07, "loss": 0.9089, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 12200, "tokens_per_second_per_gpu": 1991.61 }, { "epoch": 3.758714762049106, "grad_norm": 9.8125, "learning_rate": 9.110783252598453e-08, "loss": 0.9737, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 12400, "tokens_per_second_per_gpu": 2470.36 }, { "epoch": 3.8193391936950594, "grad_norm": 38.0, "learning_rate": 5.118805945285188e-08, "loss": 0.9549, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 12600, "tokens_per_second_per_gpu": 3157.65 }, { "epoch": 3.8799636253410124, "grad_norm": 10.125, "learning_rate": 2.2657665380669824e-08, "loss": 0.9517, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 12800, "tokens_per_second_per_gpu": 2685.23 }, { "epoch": 3.940588056986966, "grad_norm": 11.875, "learning_rate": 5.5823112145286796e-09, "loss": 0.9891, "memory/device_reserved (GiB)": 6.96, "memory/max_active (GiB)": 4.63, "memory/max_allocated (GiB)": 4.63, "step": 13000, "tokens_per_second_per_gpu": 2045.68 } ], "logging_steps": 200, "max_steps": 13196, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 3299, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.304147161540198e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }