| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0030849320350912, | |
| "eval_steps": 500, | |
| "global_step": 651, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007712330087727755, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 2.0000000000000002e-07, | |
| "loss": 1.164, | |
| "memory/device_reserved (GiB)": 246.45, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 5, | |
| "tokens_per_second_per_gpu": 500.9, | |
| "total_tokens": 1287402 | |
| }, | |
| { | |
| "epoch": 0.01542466017545551, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 4.5000000000000003e-07, | |
| "loss": 1.1463, | |
| "memory/device_reserved (GiB)": 246.57, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 10, | |
| "tokens_per_second_per_gpu": 458.45, | |
| "total_tokens": 2568739 | |
| }, | |
| { | |
| "epoch": 0.023136990263183263, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 7.000000000000001e-07, | |
| "loss": 1.1346, | |
| "memory/device_reserved (GiB)": 246.57, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 15, | |
| "tokens_per_second_per_gpu": 487.44, | |
| "total_tokens": 3789106 | |
| }, | |
| { | |
| "epoch": 0.03084932035091102, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 9.500000000000001e-07, | |
| "loss": 1.1363, | |
| "memory/device_reserved (GiB)": 246.57, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 20, | |
| "tokens_per_second_per_gpu": 514.12, | |
| "total_tokens": 5093129 | |
| }, | |
| { | |
| "epoch": 0.038561650438638774, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 1.118, | |
| "memory/device_reserved (GiB)": 246.57, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 25, | |
| "tokens_per_second_per_gpu": 449.18, | |
| "total_tokens": 6340204 | |
| }, | |
| { | |
| "epoch": 0.046273980526366526, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.45e-06, | |
| "loss": 1.1324, | |
| "memory/device_reserved (GiB)": 246.57, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 30, | |
| "tokens_per_second_per_gpu": 450.16, | |
| "total_tokens": 7552866 | |
| }, | |
| { | |
| "epoch": 0.053986310614094285, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 1.7000000000000002e-06, | |
| "loss": 1.1225, | |
| "memory/device_reserved (GiB)": 246.57, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 35, | |
| "tokens_per_second_per_gpu": 517.87, | |
| "total_tokens": 8796011 | |
| }, | |
| { | |
| "epoch": 0.06169864070182204, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 1.9500000000000004e-06, | |
| "loss": 1.0583, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 40, | |
| "tokens_per_second_per_gpu": 478.29, | |
| "total_tokens": 10026103 | |
| }, | |
| { | |
| "epoch": 0.06941097078954979, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 2.2e-06, | |
| "loss": 1.0965, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 45, | |
| "tokens_per_second_per_gpu": 470.47, | |
| "total_tokens": 11340325 | |
| }, | |
| { | |
| "epoch": 0.07712330087727755, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 2.4500000000000003e-06, | |
| "loss": 1.1055, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 50, | |
| "tokens_per_second_per_gpu": 447.61, | |
| "total_tokens": 12618872 | |
| }, | |
| { | |
| "epoch": 0.08483563096500531, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 2.7000000000000004e-06, | |
| "loss": 1.0971, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 55, | |
| "tokens_per_second_per_gpu": 468.97, | |
| "total_tokens": 13906030 | |
| }, | |
| { | |
| "epoch": 0.09254796105273305, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 2.95e-06, | |
| "loss": 1.0837, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 60, | |
| "tokens_per_second_per_gpu": 443.28, | |
| "total_tokens": 15157386 | |
| }, | |
| { | |
| "epoch": 0.10026029114046081, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 3.2000000000000003e-06, | |
| "loss": 1.081, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 65, | |
| "tokens_per_second_per_gpu": 434.55, | |
| "total_tokens": 16483820 | |
| }, | |
| { | |
| "epoch": 0.10797262122818857, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 3.45e-06, | |
| "loss": 1.0501, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 70, | |
| "tokens_per_second_per_gpu": 446.29, | |
| "total_tokens": 17761687 | |
| }, | |
| { | |
| "epoch": 0.11568495131591632, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 3.7e-06, | |
| "loss": 1.0402, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 75, | |
| "tokens_per_second_per_gpu": 499.21, | |
| "total_tokens": 19043878 | |
| }, | |
| { | |
| "epoch": 0.12339728140364407, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 3.95e-06, | |
| "loss": 1.0604, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 80, | |
| "tokens_per_second_per_gpu": 453.36, | |
| "total_tokens": 20314866 | |
| }, | |
| { | |
| "epoch": 0.13110961149137182, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.2000000000000004e-06, | |
| "loss": 1.0382, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 85, | |
| "tokens_per_second_per_gpu": 453.89, | |
| "total_tokens": 21555557 | |
| }, | |
| { | |
| "epoch": 0.13882194157909958, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 4.450000000000001e-06, | |
| "loss": 1.0753, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 90, | |
| "tokens_per_second_per_gpu": 510.81, | |
| "total_tokens": 22834764 | |
| }, | |
| { | |
| "epoch": 0.14653427166682734, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 4.7e-06, | |
| "loss": 1.021, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 95, | |
| "tokens_per_second_per_gpu": 477.3, | |
| "total_tokens": 24078719 | |
| }, | |
| { | |
| "epoch": 0.1542466017545551, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 4.95e-06, | |
| "loss": 1.0379, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 100, | |
| "tokens_per_second_per_gpu": 495.26, | |
| "total_tokens": 25368597 | |
| }, | |
| { | |
| "epoch": 0.16195893184228285, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 4.999950130642566e-06, | |
| "loss": 1.0399, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 105, | |
| "tokens_per_second_per_gpu": 458.71, | |
| "total_tokens": 26667382 | |
| }, | |
| { | |
| "epoch": 0.16967126193001061, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 4.999747540342916e-06, | |
| "loss": 1.0052, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 110, | |
| "tokens_per_second_per_gpu": 454.85, | |
| "total_tokens": 27935546 | |
| }, | |
| { | |
| "epoch": 0.17738359201773837, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 4.999389126939546e-06, | |
| "loss": 0.9942, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 115, | |
| "tokens_per_second_per_gpu": 488.42, | |
| "total_tokens": 29232061 | |
| }, | |
| { | |
| "epoch": 0.1850959221054661, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 4.998874916411828e-06, | |
| "loss": 1.0166, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 120, | |
| "tokens_per_second_per_gpu": 479.94, | |
| "total_tokens": 30496024 | |
| }, | |
| { | |
| "epoch": 0.19280825219319386, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 4.99820494603199e-06, | |
| "loss": 0.9909, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 125, | |
| "tokens_per_second_per_gpu": 462.61, | |
| "total_tokens": 31699870 | |
| }, | |
| { | |
| "epoch": 0.20052058228092162, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.997379264362414e-06, | |
| "loss": 0.9874, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 130, | |
| "tokens_per_second_per_gpu": 467.47, | |
| "total_tokens": 32956914 | |
| }, | |
| { | |
| "epoch": 0.20823291236864938, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 4.996397931252118e-06, | |
| "loss": 1.0041, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 135, | |
| "tokens_per_second_per_gpu": 459.93, | |
| "total_tokens": 34182303 | |
| }, | |
| { | |
| "epoch": 0.21594524245637714, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 4.9952610178324206e-06, | |
| "loss": 1.0227, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 140, | |
| "tokens_per_second_per_gpu": 510.67, | |
| "total_tokens": 35453388 | |
| }, | |
| { | |
| "epoch": 0.2236575725441049, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 4.993968606511777e-06, | |
| "loss": 1.0007, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 145, | |
| "tokens_per_second_per_gpu": 448.95, | |
| "total_tokens": 36723948 | |
| }, | |
| { | |
| "epoch": 0.23136990263183263, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 4.9925207909698115e-06, | |
| "loss": 0.9714, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 150, | |
| "tokens_per_second_per_gpu": 460.62, | |
| "total_tokens": 37953157 | |
| }, | |
| { | |
| "epoch": 0.2390822327195604, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 4.990917676150527e-06, | |
| "loss": 1.0193, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 155, | |
| "tokens_per_second_per_gpu": 509.45, | |
| "total_tokens": 39278804 | |
| }, | |
| { | |
| "epoch": 0.24679456280728815, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 4.989159378254698e-06, | |
| "loss": 0.9887, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 160, | |
| "tokens_per_second_per_gpu": 491.62, | |
| "total_tokens": 40562259 | |
| }, | |
| { | |
| "epoch": 0.2545068928950159, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 4.9872460247314455e-06, | |
| "loss": 0.9637, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 165, | |
| "tokens_per_second_per_gpu": 423.64, | |
| "total_tokens": 41779016 | |
| }, | |
| { | |
| "epoch": 0.26221922298274364, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 4.9851777542690004e-06, | |
| "loss": 0.9812, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 170, | |
| "tokens_per_second_per_gpu": 504.59, | |
| "total_tokens": 43071677 | |
| }, | |
| { | |
| "epoch": 0.2699315530704714, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 4.9829547167846515e-06, | |
| "loss": 0.9513, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 175, | |
| "tokens_per_second_per_gpu": 440.97, | |
| "total_tokens": 44311747 | |
| }, | |
| { | |
| "epoch": 0.27764388315819916, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.9805770734138785e-06, | |
| "loss": 0.9504, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 180, | |
| "tokens_per_second_per_gpu": 463.76, | |
| "total_tokens": 45513801 | |
| }, | |
| { | |
| "epoch": 0.2853562132459269, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 4.978044996498671e-06, | |
| "loss": 0.9967, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 185, | |
| "tokens_per_second_per_gpu": 450.72, | |
| "total_tokens": 46737916 | |
| }, | |
| { | |
| "epoch": 0.2930685433336547, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.975358669575039e-06, | |
| "loss": 0.9927, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 190, | |
| "tokens_per_second_per_gpu": 482.76, | |
| "total_tokens": 48020729 | |
| }, | |
| { | |
| "epoch": 0.30078087342138243, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.972518287359707e-06, | |
| "loss": 0.9893, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 195, | |
| "tokens_per_second_per_gpu": 437.14, | |
| "total_tokens": 49235773 | |
| }, | |
| { | |
| "epoch": 0.3084932035091102, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 4.969524055735999e-06, | |
| "loss": 0.9344, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 200, | |
| "tokens_per_second_per_gpu": 428.14, | |
| "total_tokens": 50485073 | |
| }, | |
| { | |
| "epoch": 0.31620553359683795, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 4.9663761917389195e-06, | |
| "loss": 0.9669, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 205, | |
| "tokens_per_second_per_gpu": 513.34, | |
| "total_tokens": 51774221 | |
| }, | |
| { | |
| "epoch": 0.3239178636845657, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 4.9630749235394174e-06, | |
| "loss": 1.0206, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 210, | |
| "tokens_per_second_per_gpu": 470.02, | |
| "total_tokens": 53055366 | |
| }, | |
| { | |
| "epoch": 0.33163019377229347, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.959620490427851e-06, | |
| "loss": 1.0136, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 215, | |
| "tokens_per_second_per_gpu": 474.09, | |
| "total_tokens": 54318548 | |
| }, | |
| { | |
| "epoch": 0.33934252386002123, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 4.95601314279664e-06, | |
| "loss": 0.98, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 220, | |
| "tokens_per_second_per_gpu": 441.47, | |
| "total_tokens": 55573941 | |
| }, | |
| { | |
| "epoch": 0.347054853947749, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 4.952253142122116e-06, | |
| "loss": 0.9428, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 225, | |
| "tokens_per_second_per_gpu": 446.8, | |
| "total_tokens": 56816147 | |
| }, | |
| { | |
| "epoch": 0.35476718403547675, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 4.948340760945575e-06, | |
| "loss": 0.9925, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 230, | |
| "tokens_per_second_per_gpu": 440.95, | |
| "total_tokens": 58018929 | |
| }, | |
| { | |
| "epoch": 0.36247951412320445, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 4.944276282853515e-06, | |
| "loss": 1.0056, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 235, | |
| "tokens_per_second_per_gpu": 437.83, | |
| "total_tokens": 59236706 | |
| }, | |
| { | |
| "epoch": 0.3701918442109322, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 4.940060002457083e-06, | |
| "loss": 1.0271, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 240, | |
| "tokens_per_second_per_gpu": 478.28, | |
| "total_tokens": 60523938 | |
| }, | |
| { | |
| "epoch": 0.37790417429865997, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.935692225370723e-06, | |
| "loss": 0.9817, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 245, | |
| "tokens_per_second_per_gpu": 461.75, | |
| "total_tokens": 61782785 | |
| }, | |
| { | |
| "epoch": 0.3856165043863877, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 4.9311732681900195e-06, | |
| "loss": 1.0036, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 250, | |
| "tokens_per_second_per_gpu": 455.35, | |
| "total_tokens": 62995481 | |
| }, | |
| { | |
| "epoch": 0.3933288344741155, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 4.926503458468756e-06, | |
| "loss": 1.0026, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 255, | |
| "tokens_per_second_per_gpu": 485.03, | |
| "total_tokens": 64329450 | |
| }, | |
| { | |
| "epoch": 0.40104116456184324, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 4.921683134695161e-06, | |
| "loss": 1.0022, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 260, | |
| "tokens_per_second_per_gpu": 462.42, | |
| "total_tokens": 65577746 | |
| }, | |
| { | |
| "epoch": 0.408753494649571, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 4.916712646267386e-06, | |
| "loss": 1.0291, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 265, | |
| "tokens_per_second_per_gpu": 467.37, | |
| "total_tokens": 66836212 | |
| }, | |
| { | |
| "epoch": 0.41646582473729876, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 4.91159235346817e-06, | |
| "loss": 0.9912, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 270, | |
| "tokens_per_second_per_gpu": 515.37, | |
| "total_tokens": 68143788 | |
| }, | |
| { | |
| "epoch": 0.4241781548250265, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 4.906322627438728e-06, | |
| "loss": 0.965, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 275, | |
| "tokens_per_second_per_gpu": 482.58, | |
| "total_tokens": 69419203 | |
| }, | |
| { | |
| "epoch": 0.4318904849127543, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 4.900903850151851e-06, | |
| "loss": 0.9919, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 280, | |
| "tokens_per_second_per_gpu": 476.95, | |
| "total_tokens": 70708858 | |
| }, | |
| { | |
| "epoch": 0.43960281500048204, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 4.895336414384216e-06, | |
| "loss": 1.015, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 285, | |
| "tokens_per_second_per_gpu": 488.29, | |
| "total_tokens": 71965415 | |
| }, | |
| { | |
| "epoch": 0.4473151450882098, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 4.889620723687917e-06, | |
| "loss": 0.998, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 290, | |
| "tokens_per_second_per_gpu": 501.89, | |
| "total_tokens": 73325345 | |
| }, | |
| { | |
| "epoch": 0.45502747517593756, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 4.883757192361212e-06, | |
| "loss": 0.9773, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 295, | |
| "tokens_per_second_per_gpu": 469.94, | |
| "total_tokens": 74612028 | |
| }, | |
| { | |
| "epoch": 0.46273980526366526, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 4.8777462454184985e-06, | |
| "loss": 0.9877, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 300, | |
| "tokens_per_second_per_gpu": 476.32, | |
| "total_tokens": 75892822 | |
| }, | |
| { | |
| "epoch": 0.470452135351393, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 4.871588318559497e-06, | |
| "loss": 1.0243, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 305, | |
| "tokens_per_second_per_gpu": 451.66, | |
| "total_tokens": 77185408 | |
| }, | |
| { | |
| "epoch": 0.4781644654391208, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 4.86528385813768e-06, | |
| "loss": 0.9481, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 310, | |
| "tokens_per_second_per_gpu": 441.19, | |
| "total_tokens": 78404970 | |
| }, | |
| { | |
| "epoch": 0.48587679552684854, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.8588333211279105e-06, | |
| "loss": 0.9578, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 315, | |
| "tokens_per_second_per_gpu": 466.55, | |
| "total_tokens": 79635002 | |
| }, | |
| { | |
| "epoch": 0.4935891256145763, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 4.852237175093324e-06, | |
| "loss": 1.0071, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 320, | |
| "tokens_per_second_per_gpu": 452.13, | |
| "total_tokens": 80933611 | |
| }, | |
| { | |
| "epoch": 0.5013014557023041, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 4.845495898151433e-06, | |
| "loss": 0.944, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 325, | |
| "tokens_per_second_per_gpu": 433.83, | |
| "total_tokens": 82145228 | |
| }, | |
| { | |
| "epoch": 0.5090137857900318, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.838609978939473e-06, | |
| "loss": 1.0259, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 330, | |
| "tokens_per_second_per_gpu": 453.14, | |
| "total_tokens": 83357951 | |
| }, | |
| { | |
| "epoch": 0.5167261158777595, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 4.831579916578984e-06, | |
| "loss": 0.9701, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 335, | |
| "tokens_per_second_per_gpu": 466.64, | |
| "total_tokens": 84573768 | |
| }, | |
| { | |
| "epoch": 0.5244384459654873, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 4.824406220639634e-06, | |
| "loss": 1.005, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 340, | |
| "tokens_per_second_per_gpu": 460.44, | |
| "total_tokens": 85851432 | |
| }, | |
| { | |
| "epoch": 0.532150776053215, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 4.817089411102277e-06, | |
| "loss": 0.9687, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 345, | |
| "tokens_per_second_per_gpu": 440.77, | |
| "total_tokens": 87088346 | |
| }, | |
| { | |
| "epoch": 0.5398631061409428, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 4.809630018321269e-06, | |
| "loss": 0.9483, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 350, | |
| "tokens_per_second_per_gpu": 480.0, | |
| "total_tokens": 88317739 | |
| }, | |
| { | |
| "epoch": 0.5475754362286706, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 4.802028582986024e-06, | |
| "loss": 0.9666, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 355, | |
| "tokens_per_second_per_gpu": 476.5, | |
| "total_tokens": 89577574 | |
| }, | |
| { | |
| "epoch": 0.5552877663163983, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 4.794285656081819e-06, | |
| "loss": 0.9507, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 360, | |
| "tokens_per_second_per_gpu": 497.81, | |
| "total_tokens": 90828267 | |
| }, | |
| { | |
| "epoch": 0.5630000964041261, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.786401798849861e-06, | |
| "loss": 0.9405, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 365, | |
| "tokens_per_second_per_gpu": 481.32, | |
| "total_tokens": 92116882 | |
| }, | |
| { | |
| "epoch": 0.5707124264918538, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 4.778377582746601e-06, | |
| "loss": 1.0099, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 370, | |
| "tokens_per_second_per_gpu": 459.13, | |
| "total_tokens": 93354393 | |
| }, | |
| { | |
| "epoch": 0.5784247565795816, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 4.770213589402317e-06, | |
| "loss": 0.9608, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 375, | |
| "tokens_per_second_per_gpu": 485.77, | |
| "total_tokens": 94596707 | |
| }, | |
| { | |
| "epoch": 0.5861370866673093, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 4.7619104105789525e-06, | |
| "loss": 0.9747, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 380, | |
| "tokens_per_second_per_gpu": 449.15, | |
| "total_tokens": 95823690 | |
| }, | |
| { | |
| "epoch": 0.5938494167550371, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 4.753468648127223e-06, | |
| "loss": 0.9593, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 385, | |
| "tokens_per_second_per_gpu": 462.81, | |
| "total_tokens": 97064318 | |
| }, | |
| { | |
| "epoch": 0.6015617468427649, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 4.74488891394299e-06, | |
| "loss": 0.984, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 390, | |
| "tokens_per_second_per_gpu": 467.19, | |
| "total_tokens": 98318680 | |
| }, | |
| { | |
| "epoch": 0.6092740769304926, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.7361718299229125e-06, | |
| "loss": 0.9446, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 395, | |
| "tokens_per_second_per_gpu": 439.44, | |
| "total_tokens": 99522489 | |
| }, | |
| { | |
| "epoch": 0.6169864070182204, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 4.727318027919364e-06, | |
| "loss": 0.9796, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 400, | |
| "tokens_per_second_per_gpu": 441.41, | |
| "total_tokens": 100785080 | |
| }, | |
| { | |
| "epoch": 0.6246987371059481, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 4.718328149694636e-06, | |
| "loss": 1.0054, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 405, | |
| "tokens_per_second_per_gpu": 497.07, | |
| "total_tokens": 102084350 | |
| }, | |
| { | |
| "epoch": 0.6324110671936759, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 4.70920284687442e-06, | |
| "loss": 0.9891, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 410, | |
| "tokens_per_second_per_gpu": 482.3, | |
| "total_tokens": 103357801 | |
| }, | |
| { | |
| "epoch": 0.6401233972814037, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 4.699942780900575e-06, | |
| "loss": 0.9616, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 415, | |
| "tokens_per_second_per_gpu": 499.49, | |
| "total_tokens": 104613606 | |
| }, | |
| { | |
| "epoch": 0.6478357273691314, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 4.6905486229831814e-06, | |
| "loss": 0.9559, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 420, | |
| "tokens_per_second_per_gpu": 500.21, | |
| "total_tokens": 105880634 | |
| }, | |
| { | |
| "epoch": 0.6555480574568592, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 4.68102105405189e-06, | |
| "loss": 0.9997, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 425, | |
| "tokens_per_second_per_gpu": 473.56, | |
| "total_tokens": 107115671 | |
| }, | |
| { | |
| "epoch": 0.6632603875445869, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 4.671360764706566e-06, | |
| "loss": 0.9785, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 430, | |
| "tokens_per_second_per_gpu": 507.32, | |
| "total_tokens": 108431881 | |
| }, | |
| { | |
| "epoch": 0.6709727176323147, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 4.661568455167234e-06, | |
| "loss": 0.9736, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 435, | |
| "tokens_per_second_per_gpu": 446.16, | |
| "total_tokens": 109661764 | |
| }, | |
| { | |
| "epoch": 0.6786850477200425, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 4.6516448352233115e-06, | |
| "loss": 0.9528, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 440, | |
| "tokens_per_second_per_gpu": 464.06, | |
| "total_tokens": 110893822 | |
| }, | |
| { | |
| "epoch": 0.6863973778077702, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 4.641590624182177e-06, | |
| "loss": 0.9679, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 445, | |
| "tokens_per_second_per_gpu": 474.49, | |
| "total_tokens": 112161956 | |
| }, | |
| { | |
| "epoch": 0.694109707895498, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 4.631406550817017e-06, | |
| "loss": 0.9598, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 450, | |
| "tokens_per_second_per_gpu": 458.35, | |
| "total_tokens": 113387998 | |
| }, | |
| { | |
| "epoch": 0.7018220379832257, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.621093353314011e-06, | |
| "loss": 0.9754, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 455, | |
| "tokens_per_second_per_gpu": 458.23, | |
| "total_tokens": 114644259 | |
| }, | |
| { | |
| "epoch": 0.7095343680709535, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 4.610651779218818e-06, | |
| "loss": 0.9744, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 460, | |
| "tokens_per_second_per_gpu": 512.77, | |
| "total_tokens": 115933303 | |
| }, | |
| { | |
| "epoch": 0.7172466981586811, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 4.600082585382396e-06, | |
| "loss": 0.9725, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 465, | |
| "tokens_per_second_per_gpu": 535.52, | |
| "total_tokens": 117241165 | |
| }, | |
| { | |
| "epoch": 0.7249590282464089, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 4.5893865379061375e-06, | |
| "loss": 0.9523, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 470, | |
| "tokens_per_second_per_gpu": 446.49, | |
| "total_tokens": 118480571 | |
| }, | |
| { | |
| "epoch": 0.7326713583341367, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 4.578564412086345e-06, | |
| "loss": 0.9915, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 475, | |
| "tokens_per_second_per_gpu": 447.33, | |
| "total_tokens": 119748086 | |
| }, | |
| { | |
| "epoch": 0.7403836884218644, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 4.567616992358027e-06, | |
| "loss": 0.9736, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 480, | |
| "tokens_per_second_per_gpu": 455.33, | |
| "total_tokens": 121010074 | |
| }, | |
| { | |
| "epoch": 0.7480960185095922, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 4.556545072238046e-06, | |
| "loss": 0.9755, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 485, | |
| "tokens_per_second_per_gpu": 441.4, | |
| "total_tokens": 122291737 | |
| }, | |
| { | |
| "epoch": 0.7558083485973199, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 4.545349454267595e-06, | |
| "loss": 0.9839, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 490, | |
| "tokens_per_second_per_gpu": 451.9, | |
| "total_tokens": 123604957 | |
| }, | |
| { | |
| "epoch": 0.7635206786850477, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 4.534030949954029e-06, | |
| "loss": 0.9801, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 495, | |
| "tokens_per_second_per_gpu": 443.71, | |
| "total_tokens": 124876147 | |
| }, | |
| { | |
| "epoch": 0.7712330087727755, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 4.522590379712043e-06, | |
| "loss": 0.966, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 500, | |
| "tokens_per_second_per_gpu": 465.62, | |
| "total_tokens": 126142017 | |
| }, | |
| { | |
| "epoch": 0.7789453388605032, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 4.511028572804202e-06, | |
| "loss": 0.9528, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 505, | |
| "tokens_per_second_per_gpu": 474.94, | |
| "total_tokens": 127354629 | |
| }, | |
| { | |
| "epoch": 0.786657668948231, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 4.499346367280838e-06, | |
| "loss": 0.9615, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 510, | |
| "tokens_per_second_per_gpu": 495.08, | |
| "total_tokens": 128638347 | |
| }, | |
| { | |
| "epoch": 0.7943699990359587, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 4.4875446099193e-06, | |
| "loss": 0.9405, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 515, | |
| "tokens_per_second_per_gpu": 444.79, | |
| "total_tokens": 129925188 | |
| }, | |
| { | |
| "epoch": 0.8020823291236865, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 4.475624156162578e-06, | |
| "loss": 0.9918, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 520, | |
| "tokens_per_second_per_gpu": 466.79, | |
| "total_tokens": 131210797 | |
| }, | |
| { | |
| "epoch": 0.8097946592114142, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 4.463585870057293e-06, | |
| "loss": 0.9679, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 525, | |
| "tokens_per_second_per_gpu": 460.59, | |
| "total_tokens": 132460881 | |
| }, | |
| { | |
| "epoch": 0.817506989299142, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 4.4514306241910716e-06, | |
| "loss": 0.9684, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 530, | |
| "tokens_per_second_per_gpu": 476.54, | |
| "total_tokens": 133653075 | |
| }, | |
| { | |
| "epoch": 0.8252193193868698, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 4.439159299629297e-06, | |
| "loss": 0.9562, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 535, | |
| "tokens_per_second_per_gpu": 486.69, | |
| "total_tokens": 134977958 | |
| }, | |
| { | |
| "epoch": 0.8329316494745975, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 4.426772785851242e-06, | |
| "loss": 0.9743, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 540, | |
| "tokens_per_second_per_gpu": 494.38, | |
| "total_tokens": 136315997 | |
| }, | |
| { | |
| "epoch": 0.8406439795623253, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 4.414271980685597e-06, | |
| "loss": 0.9508, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 545, | |
| "tokens_per_second_per_gpu": 455.26, | |
| "total_tokens": 137602622 | |
| }, | |
| { | |
| "epoch": 0.848356309650053, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 4.401657790245395e-06, | |
| "loss": 0.944, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 550, | |
| "tokens_per_second_per_gpu": 452.81, | |
| "total_tokens": 138895921 | |
| }, | |
| { | |
| "epoch": 0.8560686397377808, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 4.388931128862326e-06, | |
| "loss": 0.9463, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 555, | |
| "tokens_per_second_per_gpu": 437.79, | |
| "total_tokens": 140113031 | |
| }, | |
| { | |
| "epoch": 0.8637809698255086, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 4.376092919020469e-06, | |
| "loss": 0.9808, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 560, | |
| "tokens_per_second_per_gpu": 463.15, | |
| "total_tokens": 141376436 | |
| }, | |
| { | |
| "epoch": 0.8714932999132363, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 4.363144091289421e-06, | |
| "loss": 0.9739, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 565, | |
| "tokens_per_second_per_gpu": 488.68, | |
| "total_tokens": 142666491 | |
| }, | |
| { | |
| "epoch": 0.8792056300009641, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.350085584256847e-06, | |
| "loss": 0.9755, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 570, | |
| "tokens_per_second_per_gpu": 444.08, | |
| "total_tokens": 143931345 | |
| }, | |
| { | |
| "epoch": 0.8869179600886918, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 4.336918344460451e-06, | |
| "loss": 0.9615, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 575, | |
| "tokens_per_second_per_gpu": 448.53, | |
| "total_tokens": 145200618 | |
| }, | |
| { | |
| "epoch": 0.8946302901764196, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 4.323643326319359e-06, | |
| "loss": 0.9772, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 580, | |
| "tokens_per_second_per_gpu": 484.18, | |
| "total_tokens": 146502772 | |
| }, | |
| { | |
| "epoch": 0.9023426202641474, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 4.310261492064944e-06, | |
| "loss": 0.9755, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 585, | |
| "tokens_per_second_per_gpu": 470.67, | |
| "total_tokens": 147770982 | |
| }, | |
| { | |
| "epoch": 0.9100549503518751, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.296773811671081e-06, | |
| "loss": 0.9743, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 590, | |
| "tokens_per_second_per_gpu": 467.73, | |
| "total_tokens": 149068406 | |
| }, | |
| { | |
| "epoch": 0.9177672804396028, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.283181262783832e-06, | |
| "loss": 0.9361, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 595, | |
| "tokens_per_second_per_gpu": 438.69, | |
| "total_tokens": 150281330 | |
| }, | |
| { | |
| "epoch": 0.9254796105273305, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 4.269484830650589e-06, | |
| "loss": 0.9654, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 600, | |
| "tokens_per_second_per_gpu": 453.7, | |
| "total_tokens": 151550646 | |
| }, | |
| { | |
| "epoch": 0.9331919406150583, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 4.255685508048654e-06, | |
| "loss": 0.9209, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 605, | |
| "tokens_per_second_per_gpu": 485.97, | |
| "total_tokens": 152800147 | |
| }, | |
| { | |
| "epoch": 0.940904270702786, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.241784295213281e-06, | |
| "loss": 0.9228, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 610, | |
| "tokens_per_second_per_gpu": 471.49, | |
| "total_tokens": 154041877 | |
| }, | |
| { | |
| "epoch": 0.9486166007905138, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 4.227782199765174e-06, | |
| "loss": 0.97, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 615, | |
| "tokens_per_second_per_gpu": 445.62, | |
| "total_tokens": 155243285 | |
| }, | |
| { | |
| "epoch": 0.9563289308782416, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 4.213680236637449e-06, | |
| "loss": 1.0037, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 620, | |
| "tokens_per_second_per_gpu": 499.17, | |
| "total_tokens": 156517899 | |
| }, | |
| { | |
| "epoch": 0.9640412609659693, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 4.19947942800207e-06, | |
| "loss": 0.9686, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 625, | |
| "tokens_per_second_per_gpu": 475.34, | |
| "total_tokens": 157838358 | |
| }, | |
| { | |
| "epoch": 0.9717535910536971, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.1851808031957545e-06, | |
| "loss": 0.9373, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 630, | |
| "tokens_per_second_per_gpu": 498.46, | |
| "total_tokens": 159098939 | |
| }, | |
| { | |
| "epoch": 0.9794659211414248, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 4.170785398645364e-06, | |
| "loss": 0.99, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 635, | |
| "tokens_per_second_per_gpu": 491.96, | |
| "total_tokens": 160426067 | |
| }, | |
| { | |
| "epoch": 0.9871782512291526, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 4.156294257792778e-06, | |
| "loss": 0.9816, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 640, | |
| "tokens_per_second_per_gpu": 470.02, | |
| "total_tokens": 161645752 | |
| }, | |
| { | |
| "epoch": 0.9948905813168804, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.141708431019264e-06, | |
| "loss": 0.9631, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 645, | |
| "tokens_per_second_per_gpu": 513.67, | |
| "total_tokens": 162985387 | |
| }, | |
| { | |
| "epoch": 1.0015424660175456, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 4.127028975569339e-06, | |
| "loss": 0.9654, | |
| "memory/device_reserved (GiB)": 246.75, | |
| "memory/max_active (GiB)": 211.16, | |
| "memory/max_allocated (GiB)": 211.16, | |
| "step": 650, | |
| "tokens_per_second_per_gpu": 398.45, | |
| "total_tokens": 164082525 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1945, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 217, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.6172120234563994e+20, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |