| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 13196, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.06062443164595332, | |
| "grad_norm": 11.75, | |
| "learning_rate": 9.998590021800365e-06, | |
| "loss": 1.3697, | |
| "memory/device_reserved (GiB)": 4.98, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 200, | |
| "tokens_per_second_per_gpu": 2604.8 | |
| }, | |
| { | |
| "epoch": 0.12124886329190664, | |
| "grad_norm": 11.625, | |
| "learning_rate": 9.987143623124135e-06, | |
| "loss": 1.3194, | |
| "memory/device_reserved (GiB)": 4.98, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 400, | |
| "tokens_per_second_per_gpu": 2728.55 | |
| }, | |
| { | |
| "epoch": 0.18187329493785995, | |
| "grad_norm": 5.1875, | |
| "learning_rate": 9.964219627493663e-06, | |
| "loss": 1.3388, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 6.73, | |
| "memory/max_allocated (GiB)": 6.73, | |
| "step": 600, | |
| "tokens_per_second_per_gpu": 1955.81 | |
| }, | |
| { | |
| "epoch": 0.24249772658381327, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.929870793041122e-06, | |
| "loss": 1.3009, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 800, | |
| "tokens_per_second_per_gpu": 2947.03 | |
| }, | |
| { | |
| "epoch": 0.30312215822976657, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 9.884176171445684e-06, | |
| "loss": 1.2999, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 1000, | |
| "tokens_per_second_per_gpu": 1335.52 | |
| }, | |
| { | |
| "epoch": 0.3637465898757199, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 9.82724092600107e-06, | |
| "loss": 1.2861, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 1200, | |
| "tokens_per_second_per_gpu": 2361.09 | |
| }, | |
| { | |
| "epoch": 0.4243710215216732, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.759196089588842e-06, | |
| "loss": 1.2967, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 1400, | |
| "tokens_per_second_per_gpu": 2718.04 | |
| }, | |
| { | |
| "epoch": 0.48499545316762654, | |
| "grad_norm": 9.125, | |
| "learning_rate": 9.68019826311448e-06, | |
| "loss": 1.2745, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 1600, | |
| "tokens_per_second_per_gpu": 3015.46 | |
| }, | |
| { | |
| "epoch": 0.5456198848135799, | |
| "grad_norm": 8.625, | |
| "learning_rate": 9.59042925510027e-06, | |
| "loss": 1.283, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 1800, | |
| "tokens_per_second_per_gpu": 3223.33 | |
| }, | |
| { | |
| "epoch": 0.6062443164595331, | |
| "grad_norm": 9.375, | |
| "learning_rate": 9.49009566326443e-06, | |
| "loss": 1.3139, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 2000, | |
| "tokens_per_second_per_gpu": 2673.11 | |
| }, | |
| { | |
| "epoch": 0.6668687481054865, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 9.37942839904948e-06, | |
| "loss": 1.3725, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 2200, | |
| "tokens_per_second_per_gpu": 1867.41 | |
| }, | |
| { | |
| "epoch": 0.7274931797514398, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 9.258682156194094e-06, | |
| "loss": 1.3003, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 2400, | |
| "tokens_per_second_per_gpu": 2598.4 | |
| }, | |
| { | |
| "epoch": 0.7881176113973931, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 9.128134824571508e-06, | |
| "loss": 1.325, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 2600, | |
| "tokens_per_second_per_gpu": 2643.37 | |
| }, | |
| { | |
| "epoch": 0.8487420430433464, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 8.988086850643474e-06, | |
| "loss": 1.3052, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 2800, | |
| "tokens_per_second_per_gpu": 3103.11 | |
| }, | |
| { | |
| "epoch": 0.9093664746892998, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 8.838860546001651e-06, | |
| "loss": 1.3247, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 3000, | |
| "tokens_per_second_per_gpu": 2654.71 | |
| }, | |
| { | |
| "epoch": 0.9699909063352531, | |
| "grad_norm": 10.875, | |
| "learning_rate": 8.680799345587778e-06, | |
| "loss": 1.2945, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 3200, | |
| "tokens_per_second_per_gpu": 2909.79 | |
| }, | |
| { | |
| "epoch": 1.0306153379812064, | |
| "grad_norm": 8.25, | |
| "learning_rate": 8.514267017299784e-06, | |
| "loss": 1.2225, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 3400, | |
| "tokens_per_second_per_gpu": 3081.31 | |
| }, | |
| { | |
| "epoch": 1.0912397696271596, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 8.339646824802882e-06, | |
| "loss": 1.1383, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 3600, | |
| "tokens_per_second_per_gpu": 1820.96 | |
| }, | |
| { | |
| "epoch": 1.151864201273113, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.15734064547238e-06, | |
| "loss": 1.0931, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 3800, | |
| "tokens_per_second_per_gpu": 2334.09 | |
| }, | |
| { | |
| "epoch": 1.2124886329190665, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 7.967768045498217e-06, | |
| "loss": 1.0925, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 4000, | |
| "tokens_per_second_per_gpu": 2451.13 | |
| }, | |
| { | |
| "epoch": 1.2731130645650197, | |
| "grad_norm": 6.90625, | |
| "learning_rate": 7.771365314279794e-06, | |
| "loss": 1.0901, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 4200, | |
| "tokens_per_second_per_gpu": 1827.05 | |
| }, | |
| { | |
| "epoch": 1.333737496210973, | |
| "grad_norm": 21.25, | |
| "learning_rate": 7.568584460333408e-06, | |
| "loss": 1.1244, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 4400, | |
| "tokens_per_second_per_gpu": 3097.57 | |
| }, | |
| { | |
| "epoch": 1.3943619278569264, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 7.3598921710231106e-06, | |
| "loss": 1.1705, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 4600, | |
| "tokens_per_second_per_gpu": 2846.72 | |
| }, | |
| { | |
| "epoch": 1.4549863595028796, | |
| "grad_norm": 18.75, | |
| "learning_rate": 7.1457687385091415e-06, | |
| "loss": 1.1692, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 4800, | |
| "tokens_per_second_per_gpu": 1278.69 | |
| }, | |
| { | |
| "epoch": 1.515610791148833, | |
| "grad_norm": 11.875, | |
| "learning_rate": 6.926706954385761e-06, | |
| "loss": 1.1225, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 5000, | |
| "tokens_per_second_per_gpu": 2819.93 | |
| }, | |
| { | |
| "epoch": 1.5762352227947862, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 6.7032109755524384e-06, | |
| "loss": 1.155, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 5200, | |
| "tokens_per_second_per_gpu": 2809.42 | |
| }, | |
| { | |
| "epoch": 1.6368596544407397, | |
| "grad_norm": 7.96875, | |
| "learning_rate": 6.475795163928501e-06, | |
| "loss": 1.153, | |
| "memory/device_reserved (GiB)": 6.92, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 5400, | |
| "tokens_per_second_per_gpu": 2595.08 | |
| }, | |
| { | |
| "epoch": 1.697484086086693, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 6.244982902681568e-06, | |
| "loss": 1.1702, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 6.73, | |
| "memory/max_allocated (GiB)": 6.73, | |
| "step": 5600, | |
| "tokens_per_second_per_gpu": 2620.26 | |
| }, | |
| { | |
| "epoch": 1.758108517732646, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 6.011305391694152e-06, | |
| "loss": 1.1159, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 5800, | |
| "tokens_per_second_per_gpu": 2988.73 | |
| }, | |
| { | |
| "epoch": 1.8187329493785995, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 5.775300425040592e-06, | |
| "loss": 1.1151, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 6000, | |
| "tokens_per_second_per_gpu": 2139.4 | |
| }, | |
| { | |
| "epoch": 1.879357381024553, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 5.537511153287856e-06, | |
| "loss": 1.1135, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 6200, | |
| "tokens_per_second_per_gpu": 3019.1 | |
| }, | |
| { | |
| "epoch": 1.9399818126705062, | |
| "grad_norm": 7.96875, | |
| "learning_rate": 5.298484833468716e-06, | |
| "loss": 1.1488, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 6400, | |
| "tokens_per_second_per_gpu": 2381.51 | |
| }, | |
| { | |
| "epoch": 2.0006062443164594, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 5.0587715696041685e-06, | |
| "loss": 1.1651, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 6600, | |
| "tokens_per_second_per_gpu": 3025.08 | |
| }, | |
| { | |
| "epoch": 2.061230675962413, | |
| "grad_norm": 9.0, | |
| "learning_rate": 4.8189230466736585e-06, | |
| "loss": 1.0324, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 6800, | |
| "tokens_per_second_per_gpu": 2687.95 | |
| }, | |
| { | |
| "epoch": 2.1218551076083663, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 4.579491260946856e-06, | |
| "loss": 0.9765, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 7000, | |
| "tokens_per_second_per_gpu": 2745.94 | |
| }, | |
| { | |
| "epoch": 2.1824795392543193, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.341027249598999e-06, | |
| "loss": 1.0136, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 7200, | |
| "tokens_per_second_per_gpu": 2545.45 | |
| }, | |
| { | |
| "epoch": 2.2431039709002727, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 4.104079822533548e-06, | |
| "loss": 1.0314, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 6.73, | |
| "memory/max_allocated (GiB)": 6.73, | |
| "step": 7400, | |
| "tokens_per_second_per_gpu": 3023.1 | |
| }, | |
| { | |
| "epoch": 2.303728402546226, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 3.869194299330752e-06, | |
| "loss": 1.0273, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 7600, | |
| "tokens_per_second_per_gpu": 2507.53 | |
| }, | |
| { | |
| "epoch": 2.3643528341921796, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 3.63691125422898e-06, | |
| "loss": 1.0177, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 7800, | |
| "tokens_per_second_per_gpu": 2346.84 | |
| }, | |
| { | |
| "epoch": 2.424977265838133, | |
| "grad_norm": 43.75, | |
| "learning_rate": 3.407765272027156e-06, | |
| "loss": 1.0256, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 8000, | |
| "tokens_per_second_per_gpu": 2946.41 | |
| }, | |
| { | |
| "epoch": 2.485601697484086, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 3.1822837177715192e-06, | |
| "loss": 1.0005, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 8200, | |
| "tokens_per_second_per_gpu": 2745.12 | |
| }, | |
| { | |
| "epoch": 2.5462261291300394, | |
| "grad_norm": 10.375, | |
| "learning_rate": 2.960985523058174e-06, | |
| "loss": 1.0119, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 8400, | |
| "tokens_per_second_per_gpu": 2990.09 | |
| }, | |
| { | |
| "epoch": 2.606850560775993, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 2.744379991744713e-06, | |
| "loss": 1.0293, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 8600, | |
| "tokens_per_second_per_gpu": 2229.96 | |
| }, | |
| { | |
| "epoch": 2.667474992421946, | |
| "grad_norm": 6.75, | |
| "learning_rate": 2.5329656278194625e-06, | |
| "loss": 1.0065, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 8800, | |
| "tokens_per_second_per_gpu": 1817.5 | |
| }, | |
| { | |
| "epoch": 2.7280994240678993, | |
| "grad_norm": 9.75, | |
| "learning_rate": 2.327228988125946e-06, | |
| "loss": 1.0057, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 9000, | |
| "tokens_per_second_per_gpu": 2842.21 | |
| }, | |
| { | |
| "epoch": 2.7887238557138527, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 2.1276435625829543e-06, | |
| "loss": 0.9829, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 9200, | |
| "tokens_per_second_per_gpu": 3086.49 | |
| }, | |
| { | |
| "epoch": 2.849348287359806, | |
| "grad_norm": 9.5, | |
| "learning_rate": 1.9346686844773253e-06, | |
| "loss": 1.0178, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 9400, | |
| "tokens_per_second_per_gpu": 2586.56 | |
| }, | |
| { | |
| "epoch": 2.909972719005759, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 1.7487484733373017e-06, | |
| "loss": 1.0099, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 9600, | |
| "tokens_per_second_per_gpu": 1847.53 | |
| }, | |
| { | |
| "epoch": 2.9705971506517126, | |
| "grad_norm": 12.5, | |
| "learning_rate": 1.5703108128194145e-06, | |
| "loss": 1.0223, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 9800, | |
| "tokens_per_second_per_gpu": 2763.65 | |
| }, | |
| { | |
| "epoch": 3.031221582297666, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 1.3997663659611982e-06, | |
| "loss": 0.9828, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 10000, | |
| "tokens_per_second_per_gpu": 2827.68 | |
| }, | |
| { | |
| "epoch": 3.0918460139436195, | |
| "grad_norm": 11.75, | |
| "learning_rate": 1.2375076300660677e-06, | |
| "loss": 0.9693, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 10200, | |
| "tokens_per_second_per_gpu": 2538.93 | |
| }, | |
| { | |
| "epoch": 3.1524704455895725, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 1.083908033395496e-06, | |
| "loss": 0.9715, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 10400, | |
| "tokens_per_second_per_gpu": 2718.63 | |
| }, | |
| { | |
| "epoch": 3.213094877235526, | |
| "grad_norm": 9.75, | |
| "learning_rate": 9.393210757473959e-07, | |
| "loss": 1.0057, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 10600, | |
| "tokens_per_second_per_gpu": 2702.32 | |
| }, | |
| { | |
| "epoch": 3.2737193088814793, | |
| "grad_norm": 8.25, | |
| "learning_rate": 8.040795148985875e-07, | |
| "loss": 0.9761, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 10800, | |
| "tokens_per_second_per_gpu": 2937.95 | |
| }, | |
| { | |
| "epoch": 3.3343437405274328, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 6.784946007837395e-07, | |
| "loss": 0.9284, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 11000, | |
| "tokens_per_second_per_gpu": 2385.64 | |
| }, | |
| { | |
| "epoch": 3.3949681721733858, | |
| "grad_norm": 6.875, | |
| "learning_rate": 5.628553591732478e-07, | |
| "loss": 0.9631, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 5.61, | |
| "memory/max_allocated (GiB)": 5.61, | |
| "step": 11200, | |
| "tokens_per_second_per_gpu": 2425.36 | |
| }, | |
| { | |
| "epoch": 3.455592603819339, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 4.57427926498627e-07, | |
| "loss": 0.986, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 11400, | |
| "tokens_per_second_per_gpu": 2226.29 | |
| }, | |
| { | |
| "epoch": 3.5162170354652926, | |
| "grad_norm": 9.875, | |
| "learning_rate": 3.6245493735626913e-07, | |
| "loss": 1.0002, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 11600, | |
| "tokens_per_second_per_gpu": 2925.67 | |
| }, | |
| { | |
| "epoch": 3.5768414671112456, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 2.7815496609921053e-07, | |
| "loss": 0.9356, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 11800, | |
| "tokens_per_second_per_gpu": 2927.99 | |
| }, | |
| { | |
| "epoch": 3.637465898757199, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 2.047220238020181e-07, | |
| "loss": 1.0038, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 12000, | |
| "tokens_per_second_per_gpu": 2644.97 | |
| }, | |
| { | |
| "epoch": 3.6980903304031525, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.4232511175652108e-07, | |
| "loss": 0.9089, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 12200, | |
| "tokens_per_second_per_gpu": 1991.61 | |
| }, | |
| { | |
| "epoch": 3.758714762049106, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 9.110783252598453e-08, | |
| "loss": 0.9737, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 12400, | |
| "tokens_per_second_per_gpu": 2470.36 | |
| }, | |
| { | |
| "epoch": 3.8193391936950594, | |
| "grad_norm": 38.0, | |
| "learning_rate": 5.118805945285188e-08, | |
| "loss": 0.9549, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 12600, | |
| "tokens_per_second_per_gpu": 3157.65 | |
| }, | |
| { | |
| "epoch": 3.8799636253410124, | |
| "grad_norm": 10.125, | |
| "learning_rate": 2.2657665380669824e-08, | |
| "loss": 0.9517, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 12800, | |
| "tokens_per_second_per_gpu": 2685.23 | |
| }, | |
| { | |
| "epoch": 3.940588056986966, | |
| "grad_norm": 11.875, | |
| "learning_rate": 5.5823112145286796e-09, | |
| "loss": 0.9891, | |
| "memory/device_reserved (GiB)": 6.96, | |
| "memory/max_active (GiB)": 4.63, | |
| "memory/max_allocated (GiB)": 4.63, | |
| "step": 13000, | |
| "tokens_per_second_per_gpu": 2045.68 | |
| } | |
| ], | |
| "logging_steps": 200, | |
| "max_steps": 13196, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 3299, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.304147161540198e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |