{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6694302516147691, "eval_steps": 500, "global_step": 434, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007712330087727755, "grad_norm": 0.84765625, "learning_rate": 2.0000000000000002e-07, "loss": 1.164, "memory/device_reserved (GiB)": 246.45, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 5, "tokens_per_second_per_gpu": 500.9, "total_tokens": 1287402 }, { "epoch": 0.01542466017545551, "grad_norm": 0.9765625, "learning_rate": 4.5000000000000003e-07, "loss": 1.1463, "memory/device_reserved (GiB)": 246.57, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 10, "tokens_per_second_per_gpu": 458.45, "total_tokens": 2568739 }, { "epoch": 0.023136990263183263, "grad_norm": 0.91015625, "learning_rate": 7.000000000000001e-07, "loss": 1.1346, "memory/device_reserved (GiB)": 246.57, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 15, "tokens_per_second_per_gpu": 487.44, "total_tokens": 3789106 }, { "epoch": 0.03084932035091102, "grad_norm": 0.90625, "learning_rate": 9.500000000000001e-07, "loss": 1.1363, "memory/device_reserved (GiB)": 246.57, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 20, "tokens_per_second_per_gpu": 514.12, "total_tokens": 5093129 }, { "epoch": 0.038561650438638774, "grad_norm": 1.0, "learning_rate": 1.2000000000000002e-06, "loss": 1.118, "memory/device_reserved (GiB)": 246.57, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 25, "tokens_per_second_per_gpu": 449.18, "total_tokens": 6340204 }, { "epoch": 0.046273980526366526, "grad_norm": 0.859375, "learning_rate": 1.45e-06, "loss": 1.1324, "memory/device_reserved (GiB)": 246.57, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 30, "tokens_per_second_per_gpu": 450.16, "total_tokens": 7552866 }, { "epoch": 0.053986310614094285, "grad_norm": 0.76171875, "learning_rate": 1.7000000000000002e-06, "loss": 1.1225, "memory/device_reserved (GiB)": 246.57, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 35, "tokens_per_second_per_gpu": 517.87, "total_tokens": 8796011 }, { "epoch": 0.06169864070182204, "grad_norm": 0.8046875, "learning_rate": 1.9500000000000004e-06, "loss": 1.0583, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 40, "tokens_per_second_per_gpu": 478.29, "total_tokens": 10026103 }, { "epoch": 0.06941097078954979, "grad_norm": 0.7578125, "learning_rate": 2.2e-06, "loss": 1.0965, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 45, "tokens_per_second_per_gpu": 470.47, "total_tokens": 11340325 }, { "epoch": 0.07712330087727755, "grad_norm": 0.65625, "learning_rate": 2.4500000000000003e-06, "loss": 1.1055, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 50, "tokens_per_second_per_gpu": 447.61, "total_tokens": 12618872 }, { "epoch": 0.08483563096500531, "grad_norm": 0.61328125, "learning_rate": 2.7000000000000004e-06, "loss": 1.0971, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 55, "tokens_per_second_per_gpu": 468.97, "total_tokens": 13906030 }, { "epoch": 0.09254796105273305, "grad_norm": 0.51171875, "learning_rate": 2.95e-06, "loss": 1.0837, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 60, "tokens_per_second_per_gpu": 443.28, "total_tokens": 15157386 }, { "epoch": 0.10026029114046081, "grad_norm": 0.474609375, "learning_rate": 3.2000000000000003e-06, "loss": 1.081, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 65, "tokens_per_second_per_gpu": 434.55, "total_tokens": 16483820 }, { "epoch": 0.10797262122818857, "grad_norm": 0.486328125, "learning_rate": 3.45e-06, "loss": 1.0501, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 70, "tokens_per_second_per_gpu": 446.29, "total_tokens": 17761687 }, { "epoch": 0.11568495131591632, "grad_norm": 0.37890625, "learning_rate": 3.7e-06, "loss": 1.0402, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 75, "tokens_per_second_per_gpu": 499.21, "total_tokens": 19043878 }, { "epoch": 0.12339728140364407, "grad_norm": 0.42578125, "learning_rate": 3.95e-06, "loss": 1.0604, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 80, "tokens_per_second_per_gpu": 453.36, "total_tokens": 20314866 }, { "epoch": 0.13110961149137182, "grad_norm": 0.373046875, "learning_rate": 4.2000000000000004e-06, "loss": 1.0382, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 85, "tokens_per_second_per_gpu": 453.89, "total_tokens": 21555557 }, { "epoch": 0.13882194157909958, "grad_norm": 0.330078125, "learning_rate": 4.450000000000001e-06, "loss": 1.0753, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 90, "tokens_per_second_per_gpu": 510.81, "total_tokens": 22834764 }, { "epoch": 0.14653427166682734, "grad_norm": 0.373046875, "learning_rate": 4.7e-06, "loss": 1.021, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 95, "tokens_per_second_per_gpu": 477.3, "total_tokens": 24078719 }, { "epoch": 0.1542466017545551, "grad_norm": 0.271484375, "learning_rate": 4.95e-06, "loss": 1.0379, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 100, "tokens_per_second_per_gpu": 495.26, "total_tokens": 25368597 }, { "epoch": 0.16195893184228285, "grad_norm": 0.283203125, "learning_rate": 4.999950130642566e-06, "loss": 1.0399, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 105, "tokens_per_second_per_gpu": 458.71, "total_tokens": 26667382 }, { "epoch": 0.16967126193001061, "grad_norm": 0.27734375, "learning_rate": 4.999747540342916e-06, "loss": 1.0052, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 110, "tokens_per_second_per_gpu": 454.85, "total_tokens": 27935546 }, { "epoch": 0.17738359201773837, "grad_norm": 0.287109375, "learning_rate": 4.999389126939546e-06, "loss": 0.9942, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 115, "tokens_per_second_per_gpu": 488.42, "total_tokens": 29232061 }, { "epoch": 0.1850959221054661, "grad_norm": 0.263671875, "learning_rate": 4.998874916411828e-06, "loss": 1.0166, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 120, "tokens_per_second_per_gpu": 479.94, "total_tokens": 30496024 }, { "epoch": 0.19280825219319386, "grad_norm": 0.271484375, "learning_rate": 4.99820494603199e-06, "loss": 0.9909, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 125, "tokens_per_second_per_gpu": 462.61, "total_tokens": 31699870 }, { "epoch": 0.20052058228092162, "grad_norm": 0.259765625, "learning_rate": 4.997379264362414e-06, "loss": 0.9874, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 130, "tokens_per_second_per_gpu": 467.47, "total_tokens": 32956914 }, { "epoch": 0.20823291236864938, "grad_norm": 0.251953125, "learning_rate": 4.996397931252118e-06, "loss": 1.0041, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 135, "tokens_per_second_per_gpu": 459.93, "total_tokens": 34182303 }, { "epoch": 0.21594524245637714, "grad_norm": 0.2314453125, "learning_rate": 4.9952610178324206e-06, "loss": 1.0227, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 140, "tokens_per_second_per_gpu": 510.67, "total_tokens": 35453388 }, { "epoch": 0.2236575725441049, "grad_norm": 0.2734375, "learning_rate": 4.993968606511777e-06, "loss": 1.0007, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 145, "tokens_per_second_per_gpu": 448.95, "total_tokens": 36723948 }, { "epoch": 0.23136990263183263, "grad_norm": 0.24609375, "learning_rate": 4.9925207909698115e-06, "loss": 0.9714, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 150, "tokens_per_second_per_gpu": 460.62, "total_tokens": 37953157 }, { "epoch": 0.2390822327195604, "grad_norm": 0.244140625, "learning_rate": 4.990917676150527e-06, "loss": 1.0193, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 155, "tokens_per_second_per_gpu": 509.45, "total_tokens": 39278804 }, { "epoch": 0.24679456280728815, "grad_norm": 0.2451171875, "learning_rate": 4.989159378254698e-06, "loss": 0.9887, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 160, "tokens_per_second_per_gpu": 491.62, "total_tokens": 40562259 }, { "epoch": 0.2545068928950159, "grad_norm": 0.2353515625, "learning_rate": 4.9872460247314455e-06, "loss": 0.9637, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 165, "tokens_per_second_per_gpu": 423.64, "total_tokens": 41779016 }, { "epoch": 0.26221922298274364, "grad_norm": 0.248046875, "learning_rate": 4.9851777542690004e-06, "loss": 0.9812, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 170, "tokens_per_second_per_gpu": 504.59, "total_tokens": 43071677 }, { "epoch": 0.2699315530704714, "grad_norm": 0.2412109375, "learning_rate": 4.9829547167846515e-06, "loss": 0.9513, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 175, "tokens_per_second_per_gpu": 440.97, "total_tokens": 44311747 }, { "epoch": 0.27764388315819916, "grad_norm": 0.2216796875, "learning_rate": 4.9805770734138785e-06, "loss": 0.9504, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 180, "tokens_per_second_per_gpu": 463.76, "total_tokens": 45513801 }, { "epoch": 0.2853562132459269, "grad_norm": 0.2490234375, "learning_rate": 4.978044996498671e-06, "loss": 0.9967, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 185, "tokens_per_second_per_gpu": 450.72, "total_tokens": 46737916 }, { "epoch": 0.2930685433336547, "grad_norm": 0.32421875, "learning_rate": 4.975358669575039e-06, "loss": 0.9927, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 190, "tokens_per_second_per_gpu": 482.76, "total_tokens": 48020729 }, { "epoch": 0.30078087342138243, "grad_norm": 0.259765625, "learning_rate": 4.972518287359707e-06, "loss": 0.9893, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 195, "tokens_per_second_per_gpu": 437.14, "total_tokens": 49235773 }, { "epoch": 0.3084932035091102, "grad_norm": 0.2314453125, "learning_rate": 4.969524055735999e-06, "loss": 0.9344, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 200, "tokens_per_second_per_gpu": 428.14, "total_tokens": 50485073 }, { "epoch": 0.31620553359683795, "grad_norm": 0.2294921875, "learning_rate": 4.9663761917389195e-06, "loss": 0.9669, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 205, "tokens_per_second_per_gpu": 513.34, "total_tokens": 51774221 }, { "epoch": 0.3239178636845657, "grad_norm": 0.2373046875, "learning_rate": 4.9630749235394174e-06, "loss": 1.0206, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 210, "tokens_per_second_per_gpu": 470.02, "total_tokens": 53055366 }, { "epoch": 0.33163019377229347, "grad_norm": 0.2216796875, "learning_rate": 4.959620490427851e-06, "loss": 1.0136, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 215, "tokens_per_second_per_gpu": 474.09, "total_tokens": 54318548 }, { "epoch": 0.33934252386002123, "grad_norm": 0.2353515625, "learning_rate": 4.95601314279664e-06, "loss": 0.98, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 220, "tokens_per_second_per_gpu": 441.47, "total_tokens": 55573941 }, { "epoch": 0.347054853947749, "grad_norm": 0.21875, "learning_rate": 4.952253142122116e-06, "loss": 0.9428, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 225, "tokens_per_second_per_gpu": 446.8, "total_tokens": 56816147 }, { "epoch": 0.35476718403547675, "grad_norm": 0.236328125, "learning_rate": 4.948340760945575e-06, "loss": 0.9925, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 230, "tokens_per_second_per_gpu": 440.95, "total_tokens": 58018929 }, { "epoch": 0.36247951412320445, "grad_norm": 0.216796875, "learning_rate": 4.944276282853515e-06, "loss": 1.0056, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 235, "tokens_per_second_per_gpu": 437.83, "total_tokens": 59236706 }, { "epoch": 0.3701918442109322, "grad_norm": 0.216796875, "learning_rate": 4.940060002457083e-06, "loss": 1.0271, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 240, "tokens_per_second_per_gpu": 478.28, "total_tokens": 60523938 }, { "epoch": 0.37790417429865997, "grad_norm": 0.2099609375, "learning_rate": 4.935692225370723e-06, "loss": 0.9817, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 245, "tokens_per_second_per_gpu": 461.75, "total_tokens": 61782785 }, { "epoch": 0.3856165043863877, "grad_norm": 0.2265625, "learning_rate": 4.9311732681900195e-06, "loss": 1.0036, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 250, "tokens_per_second_per_gpu": 455.35, "total_tokens": 62995481 }, { "epoch": 0.3933288344741155, "grad_norm": 0.2109375, "learning_rate": 4.926503458468756e-06, "loss": 1.0026, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 255, "tokens_per_second_per_gpu": 485.03, "total_tokens": 64329450 }, { "epoch": 0.40104116456184324, "grad_norm": 0.2294921875, "learning_rate": 4.921683134695161e-06, "loss": 1.0022, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 260, "tokens_per_second_per_gpu": 462.42, "total_tokens": 65577746 }, { "epoch": 0.408753494649571, "grad_norm": 0.2158203125, "learning_rate": 4.916712646267386e-06, "loss": 1.0291, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 265, "tokens_per_second_per_gpu": 467.37, "total_tokens": 66836212 }, { "epoch": 0.41646582473729876, "grad_norm": 0.2333984375, "learning_rate": 4.91159235346817e-06, "loss": 0.9912, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 270, "tokens_per_second_per_gpu": 515.37, "total_tokens": 68143788 }, { "epoch": 0.4241781548250265, "grad_norm": 0.2275390625, "learning_rate": 4.906322627438728e-06, "loss": 0.965, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 275, "tokens_per_second_per_gpu": 482.58, "total_tokens": 69419203 }, { "epoch": 0.4318904849127543, "grad_norm": 0.251953125, "learning_rate": 4.900903850151851e-06, "loss": 0.9919, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 280, "tokens_per_second_per_gpu": 476.95, "total_tokens": 70708858 }, { "epoch": 0.43960281500048204, "grad_norm": 0.2255859375, "learning_rate": 4.895336414384216e-06, "loss": 1.015, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 285, "tokens_per_second_per_gpu": 488.29, "total_tokens": 71965415 }, { "epoch": 0.4473151450882098, "grad_norm": 0.2470703125, "learning_rate": 4.889620723687917e-06, "loss": 0.998, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 290, "tokens_per_second_per_gpu": 501.89, "total_tokens": 73325345 }, { "epoch": 0.45502747517593756, "grad_norm": 0.24609375, "learning_rate": 4.883757192361212e-06, "loss": 0.9773, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 295, "tokens_per_second_per_gpu": 469.94, "total_tokens": 74612028 }, { "epoch": 0.46273980526366526, "grad_norm": 0.2255859375, "learning_rate": 4.8777462454184985e-06, "loss": 0.9877, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 300, "tokens_per_second_per_gpu": 476.32, "total_tokens": 75892822 }, { "epoch": 0.470452135351393, "grad_norm": 0.244140625, "learning_rate": 4.871588318559497e-06, "loss": 1.0243, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 305, "tokens_per_second_per_gpu": 451.66, "total_tokens": 77185408 }, { "epoch": 0.4781644654391208, "grad_norm": 0.220703125, "learning_rate": 4.86528385813768e-06, "loss": 0.9481, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 310, "tokens_per_second_per_gpu": 441.19, "total_tokens": 78404970 }, { "epoch": 0.48587679552684854, "grad_norm": 0.2216796875, "learning_rate": 4.8588333211279105e-06, "loss": 0.9578, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 315, "tokens_per_second_per_gpu": 466.55, "total_tokens": 79635002 }, { "epoch": 0.4935891256145763, "grad_norm": 0.212890625, "learning_rate": 4.852237175093324e-06, "loss": 1.0071, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 320, "tokens_per_second_per_gpu": 452.13, "total_tokens": 80933611 }, { "epoch": 0.5013014557023041, "grad_norm": 0.2177734375, "learning_rate": 4.845495898151433e-06, "loss": 0.944, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 325, "tokens_per_second_per_gpu": 433.83, "total_tokens": 82145228 }, { "epoch": 0.5090137857900318, "grad_norm": 0.2099609375, "learning_rate": 4.838609978939473e-06, "loss": 1.0259, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 330, "tokens_per_second_per_gpu": 453.14, "total_tokens": 83357951 }, { "epoch": 0.5167261158777595, "grad_norm": 0.236328125, "learning_rate": 4.831579916578984e-06, "loss": 0.9701, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 335, "tokens_per_second_per_gpu": 466.64, "total_tokens": 84573768 }, { "epoch": 0.5244384459654873, "grad_norm": 0.2060546875, "learning_rate": 4.824406220639634e-06, "loss": 1.005, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 340, "tokens_per_second_per_gpu": 460.44, "total_tokens": 85851432 }, { "epoch": 0.532150776053215, "grad_norm": 0.2138671875, "learning_rate": 4.817089411102277e-06, "loss": 0.9687, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 345, "tokens_per_second_per_gpu": 440.77, "total_tokens": 87088346 }, { "epoch": 0.5398631061409428, "grad_norm": 0.2294921875, "learning_rate": 4.809630018321269e-06, "loss": 0.9483, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 350, "tokens_per_second_per_gpu": 480.0, "total_tokens": 88317739 }, { "epoch": 0.5475754362286706, "grad_norm": 0.271484375, "learning_rate": 4.802028582986024e-06, "loss": 0.9666, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 355, "tokens_per_second_per_gpu": 476.5, "total_tokens": 89577574 }, { "epoch": 0.5552877663163983, "grad_norm": 0.240234375, "learning_rate": 4.794285656081819e-06, "loss": 0.9507, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 360, "tokens_per_second_per_gpu": 497.81, "total_tokens": 90828267 }, { "epoch": 0.5630000964041261, "grad_norm": 0.2099609375, "learning_rate": 4.786401798849861e-06, "loss": 0.9405, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 365, "tokens_per_second_per_gpu": 481.32, "total_tokens": 92116882 }, { "epoch": 0.5707124264918538, "grad_norm": 0.2392578125, "learning_rate": 4.778377582746601e-06, "loss": 1.0099, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 370, "tokens_per_second_per_gpu": 459.13, "total_tokens": 93354393 }, { "epoch": 0.5784247565795816, "grad_norm": 0.220703125, "learning_rate": 4.770213589402317e-06, "loss": 0.9608, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 375, "tokens_per_second_per_gpu": 485.77, "total_tokens": 94596707 }, { "epoch": 0.5861370866673093, "grad_norm": 0.2109375, "learning_rate": 4.7619104105789525e-06, "loss": 0.9747, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 380, "tokens_per_second_per_gpu": 449.15, "total_tokens": 95823690 }, { "epoch": 0.5938494167550371, "grad_norm": 0.234375, "learning_rate": 4.753468648127223e-06, "loss": 0.9593, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 385, "tokens_per_second_per_gpu": 462.81, "total_tokens": 97064318 }, { "epoch": 0.6015617468427649, "grad_norm": 0.2041015625, "learning_rate": 4.74488891394299e-06, "loss": 0.984, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 390, "tokens_per_second_per_gpu": 467.19, "total_tokens": 98318680 }, { "epoch": 0.6092740769304926, "grad_norm": 0.2216796875, "learning_rate": 4.7361718299229125e-06, "loss": 0.9446, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 395, "tokens_per_second_per_gpu": 439.44, "total_tokens": 99522489 }, { "epoch": 0.6169864070182204, "grad_norm": 0.2265625, "learning_rate": 4.727318027919364e-06, "loss": 0.9796, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 400, "tokens_per_second_per_gpu": 441.41, "total_tokens": 100785080 }, { "epoch": 0.6246987371059481, "grad_norm": 0.23046875, "learning_rate": 4.718328149694636e-06, "loss": 1.0054, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 405, "tokens_per_second_per_gpu": 497.07, "total_tokens": 102084350 }, { "epoch": 0.6324110671936759, "grad_norm": 0.236328125, "learning_rate": 4.70920284687442e-06, "loss": 0.9891, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 410, "tokens_per_second_per_gpu": 482.3, "total_tokens": 103357801 }, { "epoch": 0.6401233972814037, "grad_norm": 0.2294921875, "learning_rate": 4.699942780900575e-06, "loss": 0.9616, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 415, "tokens_per_second_per_gpu": 499.49, "total_tokens": 104613606 }, { "epoch": 0.6478357273691314, "grad_norm": 0.212890625, "learning_rate": 4.6905486229831814e-06, "loss": 0.9559, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 420, "tokens_per_second_per_gpu": 500.21, "total_tokens": 105880634 }, { "epoch": 0.6555480574568592, "grad_norm": 0.220703125, "learning_rate": 4.68102105405189e-06, "loss": 0.9997, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 425, "tokens_per_second_per_gpu": 473.56, "total_tokens": 107115671 }, { "epoch": 0.6632603875445869, "grad_norm": 0.212890625, "learning_rate": 4.671360764706566e-06, "loss": 0.9785, "memory/device_reserved (GiB)": 246.75, "memory/max_active (GiB)": 211.16, "memory/max_allocated (GiB)": 211.16, "step": 430, "tokens_per_second_per_gpu": 507.32, "total_tokens": 108431881 } ], "logging_steps": 5, "max_steps": 1945, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 217, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.746652599408562e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }