{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991843393148451, "eval_steps": 1225, "global_step": 1225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 1.9604277610778809, "eval_runtime": 3419.2806, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.146, "memory/device_reserved (GiB)": 59.93, "memory/max_active (GiB)": 57.99, "memory/max_allocated (GiB)": 57.99, "step": 0 }, { "epoch": 0.0008156606851549756, "grad_norm": 0.6348393559455872, "learning_rate": 0.0, "loss": 1.9553, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1, "tokens_per_second_per_gpu": 1639.07 }, { "epoch": 0.0016313213703099511, "grad_norm": 0.33999207615852356, "learning_rate": 4.081632653061225e-08, "loss": 1.8732, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 2, "tokens_per_second_per_gpu": 11.05 }, { "epoch": 0.0024469820554649264, "grad_norm": 1.0568342208862305, "learning_rate": 8.16326530612245e-08, "loss": 2.0511, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 3, "tokens_per_second_per_gpu": 9.65 }, { "epoch": 0.0032626427406199023, "grad_norm": 0.45948347449302673, "learning_rate": 1.2244897959183673e-07, "loss": 2.0169, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 4, "tokens_per_second_per_gpu": 8.82 }, { "epoch": 0.004078303425774877, "grad_norm": 0.427413672208786, "learning_rate": 1.63265306122449e-07, "loss": 2.0055, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 5, "tokens_per_second_per_gpu": 12.83 }, { "epoch": 0.004893964110929853, "grad_norm": 0.44444531202316284, "learning_rate": 2.0408163265306124e-07, "loss": 1.9156, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 6, "tokens_per_second_per_gpu": 8.19 }, { "epoch": 0.005709624796084829, "grad_norm": 0.5367342233657837, "learning_rate": 2.4489795918367347e-07, "loss": 1.8406, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 7, "tokens_per_second_per_gpu": 13.2 }, { "epoch": 0.0065252854812398045, "grad_norm": 0.3617013096809387, "learning_rate": 2.8571428571428575e-07, "loss": 1.8834, "memory/device_reserved (GiB)": 75.31, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 8, "tokens_per_second_per_gpu": 10.48 }, { "epoch": 0.00734094616639478, "grad_norm": 0.4110305905342102, "learning_rate": 3.26530612244898e-07, "loss": 1.9313, "memory/device_reserved (GiB)": 75.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 9, "tokens_per_second_per_gpu": 12.02 }, { "epoch": 0.008156606851549755, "grad_norm": 0.5980206727981567, "learning_rate": 3.673469387755102e-07, "loss": 1.9577, "memory/device_reserved (GiB)": 75.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 10, "tokens_per_second_per_gpu": 9.69 }, { "epoch": 0.00897226753670473, "grad_norm": 0.3745971620082855, "learning_rate": 4.081632653061225e-07, "loss": 2.0916, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 11, "tokens_per_second_per_gpu": 5.32 }, { "epoch": 0.009787928221859706, "grad_norm": 0.41269657015800476, "learning_rate": 4.489795918367347e-07, "loss": 1.8364, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 12, "tokens_per_second_per_gpu": 10.56 }, { "epoch": 0.010603588907014683, "grad_norm": 0.6282578110694885, "learning_rate": 4.897959183673469e-07, "loss": 2.019, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 13, "tokens_per_second_per_gpu": 10.1 }, { "epoch": 0.011419249592169658, "grad_norm": 0.4410882890224457, "learning_rate": 5.306122448979592e-07, "loss": 2.0098, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 14, "tokens_per_second_per_gpu": 7.14 }, { "epoch": 0.012234910277324634, "grad_norm": 0.5374980568885803, "learning_rate": 5.714285714285715e-07, "loss": 1.9771, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 15, "tokens_per_second_per_gpu": 14.0 }, { "epoch": 0.013050570962479609, "grad_norm": 0.3347431719303131, "learning_rate": 6.122448979591837e-07, "loss": 2.0082, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 16, "tokens_per_second_per_gpu": 17.04 }, { "epoch": 0.013866231647634585, "grad_norm": 0.5574456453323364, "learning_rate": 6.53061224489796e-07, "loss": 1.8819, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 17, "tokens_per_second_per_gpu": 10.0 }, { "epoch": 0.01468189233278956, "grad_norm": 0.46337175369262695, "learning_rate": 6.938775510204082e-07, "loss": 1.9275, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 18, "tokens_per_second_per_gpu": 7.77 }, { "epoch": 0.015497553017944535, "grad_norm": 0.573352038860321, "learning_rate": 7.346938775510204e-07, "loss": 1.9539, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 19, "tokens_per_second_per_gpu": 11.88 }, { "epoch": 0.01631321370309951, "grad_norm": 0.5387033224105835, "learning_rate": 7.755102040816327e-07, "loss": 1.9769, "memory/device_reserved (GiB)": 75.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 20, "tokens_per_second_per_gpu": 9.64 }, { "epoch": 0.017128874388254486, "grad_norm": 0.4464549124240875, "learning_rate": 8.16326530612245e-07, "loss": 1.9476, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 21, "tokens_per_second_per_gpu": 7.59 }, { "epoch": 0.01794453507340946, "grad_norm": 0.3910236954689026, "learning_rate": 8.571428571428572e-07, "loss": 1.9558, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 22, "tokens_per_second_per_gpu": 4.6 }, { "epoch": 0.018760195758564437, "grad_norm": 0.37353578209877014, "learning_rate": 8.979591836734694e-07, "loss": 1.9154, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 23, "tokens_per_second_per_gpu": 16.59 }, { "epoch": 0.01957585644371941, "grad_norm": 0.40477487444877625, "learning_rate": 9.387755102040817e-07, "loss": 1.9768, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 24, "tokens_per_second_per_gpu": 5.74 }, { "epoch": 0.020391517128874388, "grad_norm": 0.3659735321998596, "learning_rate": 9.795918367346939e-07, "loss": 1.9407, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 25, "tokens_per_second_per_gpu": 14.35 }, { "epoch": 0.021207177814029365, "grad_norm": 0.7394002676010132, "learning_rate": 1.020408163265306e-06, "loss": 1.9461, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 26, "tokens_per_second_per_gpu": 9.23 }, { "epoch": 0.02202283849918434, "grad_norm": 0.44476643204689026, "learning_rate": 1.0612244897959184e-06, "loss": 1.9605, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 27, "tokens_per_second_per_gpu": 8.79 }, { "epoch": 0.022838499184339316, "grad_norm": 0.3480445444583893, "learning_rate": 1.1020408163265306e-06, "loss": 2.0688, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 28, "tokens_per_second_per_gpu": 16.26 }, { "epoch": 0.02365415986949429, "grad_norm": 0.3581937253475189, "learning_rate": 1.142857142857143e-06, "loss": 1.8412, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 29, "tokens_per_second_per_gpu": 8.74 }, { "epoch": 0.024469820554649267, "grad_norm": 0.44773343205451965, "learning_rate": 1.1836734693877552e-06, "loss": 1.968, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 30, "tokens_per_second_per_gpu": 14.99 }, { "epoch": 0.02528548123980424, "grad_norm": 0.6487011909484863, "learning_rate": 1.2244897959183673e-06, "loss": 1.9781, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 31, "tokens_per_second_per_gpu": 6.2 }, { "epoch": 0.026101141924959218, "grad_norm": 0.47234439849853516, "learning_rate": 1.2653061224489797e-06, "loss": 1.9405, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 32, "tokens_per_second_per_gpu": 11.67 }, { "epoch": 0.026916802610114192, "grad_norm": 0.37792879343032837, "learning_rate": 1.306122448979592e-06, "loss": 1.8645, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 33, "tokens_per_second_per_gpu": 8.8 }, { "epoch": 0.02773246329526917, "grad_norm": 0.38162070512771606, "learning_rate": 1.346938775510204e-06, "loss": 1.8048, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 34, "tokens_per_second_per_gpu": 9.82 }, { "epoch": 0.028548123980424143, "grad_norm": 0.4825076162815094, "learning_rate": 1.3877551020408165e-06, "loss": 1.9485, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 35, "tokens_per_second_per_gpu": 11.64 }, { "epoch": 0.02936378466557912, "grad_norm": 0.45664966106414795, "learning_rate": 1.4285714285714286e-06, "loss": 1.9444, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 36, "tokens_per_second_per_gpu": 10.65 }, { "epoch": 0.030179445350734094, "grad_norm": 0.4269927442073822, "learning_rate": 1.4693877551020408e-06, "loss": 1.9887, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 37, "tokens_per_second_per_gpu": 14.32 }, { "epoch": 0.03099510603588907, "grad_norm": 0.34001636505126953, "learning_rate": 1.5102040816326532e-06, "loss": 1.9121, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 38, "tokens_per_second_per_gpu": 11.35 }, { "epoch": 0.03181076672104405, "grad_norm": 0.3767380118370056, "learning_rate": 1.5510204081632654e-06, "loss": 1.8613, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 39, "tokens_per_second_per_gpu": 12.33 }, { "epoch": 0.03262642740619902, "grad_norm": 0.3435960114002228, "learning_rate": 1.5918367346938775e-06, "loss": 1.8585, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 40, "tokens_per_second_per_gpu": 16.49 }, { "epoch": 0.033442088091353996, "grad_norm": 0.4967412054538727, "learning_rate": 1.63265306122449e-06, "loss": 1.9495, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 41, "tokens_per_second_per_gpu": 11.68 }, { "epoch": 0.03425774877650897, "grad_norm": 0.3683670163154602, "learning_rate": 1.673469387755102e-06, "loss": 1.7929, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 42, "tokens_per_second_per_gpu": 13.78 }, { "epoch": 0.03507340946166395, "grad_norm": 0.46631813049316406, "learning_rate": 1.7142857142857145e-06, "loss": 1.9415, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 43, "tokens_per_second_per_gpu": 10.42 }, { "epoch": 0.03588907014681892, "grad_norm": 0.4533991515636444, "learning_rate": 1.7551020408163264e-06, "loss": 1.8555, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 44, "tokens_per_second_per_gpu": 21.18 }, { "epoch": 0.0367047308319739, "grad_norm": 0.48453959822654724, "learning_rate": 1.7959183673469388e-06, "loss": 1.9029, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 45, "tokens_per_second_per_gpu": 9.35 }, { "epoch": 0.037520391517128875, "grad_norm": 0.5037137269973755, "learning_rate": 1.8367346938775512e-06, "loss": 1.8665, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 46, "tokens_per_second_per_gpu": 7.31 }, { "epoch": 0.03833605220228385, "grad_norm": 0.29343292117118835, "learning_rate": 1.8775510204081634e-06, "loss": 1.946, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 47, "tokens_per_second_per_gpu": 15.46 }, { "epoch": 0.03915171288743882, "grad_norm": 0.34195050597190857, "learning_rate": 1.9183673469387756e-06, "loss": 1.9831, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 48, "tokens_per_second_per_gpu": 10.6 }, { "epoch": 0.0399673735725938, "grad_norm": 0.43533310294151306, "learning_rate": 1.9591836734693877e-06, "loss": 2.0406, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 49, "tokens_per_second_per_gpu": 9.33 }, { "epoch": 0.040783034257748776, "grad_norm": 0.4667811691761017, "learning_rate": 2.0000000000000003e-06, "loss": 2.0437, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 50, "tokens_per_second_per_gpu": 13.26 }, { "epoch": 0.041598694942903754, "grad_norm": 0.5717320442199707, "learning_rate": 2.040816326530612e-06, "loss": 1.9639, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 51, "tokens_per_second_per_gpu": 8.79 }, { "epoch": 0.04241435562805873, "grad_norm": 0.40679931640625, "learning_rate": 2.0816326530612247e-06, "loss": 1.9111, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 52, "tokens_per_second_per_gpu": 9.87 }, { "epoch": 0.0432300163132137, "grad_norm": 0.4831007421016693, "learning_rate": 2.122448979591837e-06, "loss": 1.9413, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 53, "tokens_per_second_per_gpu": 17.47 }, { "epoch": 0.04404567699836868, "grad_norm": 0.5955955386161804, "learning_rate": 2.163265306122449e-06, "loss": 2.0074, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 54, "tokens_per_second_per_gpu": 7.55 }, { "epoch": 0.044861337683523655, "grad_norm": 0.411893367767334, "learning_rate": 2.204081632653061e-06, "loss": 1.9131, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 55, "tokens_per_second_per_gpu": 5.92 }, { "epoch": 0.04567699836867863, "grad_norm": 0.4156520664691925, "learning_rate": 2.2448979591836734e-06, "loss": 1.9144, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 56, "tokens_per_second_per_gpu": 9.29 }, { "epoch": 0.0464926590538336, "grad_norm": 0.4573107361793518, "learning_rate": 2.285714285714286e-06, "loss": 1.9006, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 57, "tokens_per_second_per_gpu": 12.57 }, { "epoch": 0.04730831973898858, "grad_norm": 0.7593570351600647, "learning_rate": 2.326530612244898e-06, "loss": 1.8322, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 58, "tokens_per_second_per_gpu": 13.26 }, { "epoch": 0.04812398042414356, "grad_norm": 0.4180932343006134, "learning_rate": 2.3673469387755103e-06, "loss": 1.9156, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 59, "tokens_per_second_per_gpu": 8.59 }, { "epoch": 0.048939641109298535, "grad_norm": 0.537712574005127, "learning_rate": 2.4081632653061225e-06, "loss": 1.7654, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 60, "tokens_per_second_per_gpu": 8.53 }, { "epoch": 0.049755301794453505, "grad_norm": 0.4493379294872284, "learning_rate": 2.4489795918367347e-06, "loss": 1.9755, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 61, "tokens_per_second_per_gpu": 15.83 }, { "epoch": 0.05057096247960848, "grad_norm": 0.424493670463562, "learning_rate": 2.4897959183673473e-06, "loss": 2.0056, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 62, "tokens_per_second_per_gpu": 8.52 }, { "epoch": 0.05138662316476346, "grad_norm": 0.3909122943878174, "learning_rate": 2.5306122448979594e-06, "loss": 1.9019, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 63, "tokens_per_second_per_gpu": 9.72 }, { "epoch": 0.052202283849918436, "grad_norm": 0.36254364252090454, "learning_rate": 2.5714285714285716e-06, "loss": 1.8711, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 64, "tokens_per_second_per_gpu": 20.47 }, { "epoch": 0.05301794453507341, "grad_norm": 0.41211870312690735, "learning_rate": 2.612244897959184e-06, "loss": 1.8332, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 65, "tokens_per_second_per_gpu": 13.23 }, { "epoch": 0.053833605220228384, "grad_norm": 0.2782273590564728, "learning_rate": 2.653061224489796e-06, "loss": 1.9176, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 66, "tokens_per_second_per_gpu": 11.49 }, { "epoch": 0.05464926590538336, "grad_norm": 0.3749746084213257, "learning_rate": 2.693877551020408e-06, "loss": 1.816, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 67, "tokens_per_second_per_gpu": 11.73 }, { "epoch": 0.05546492659053834, "grad_norm": 0.34983018040657043, "learning_rate": 2.7346938775510203e-06, "loss": 1.8174, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 68, "tokens_per_second_per_gpu": 12.04 }, { "epoch": 0.05628058727569331, "grad_norm": 0.3992249071598053, "learning_rate": 2.775510204081633e-06, "loss": 1.9402, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 69, "tokens_per_second_per_gpu": 16.81 }, { "epoch": 0.057096247960848286, "grad_norm": 0.3737384080886841, "learning_rate": 2.816326530612245e-06, "loss": 1.8778, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 70, "tokens_per_second_per_gpu": 11.64 }, { "epoch": 0.05791190864600326, "grad_norm": 1.317840576171875, "learning_rate": 2.8571428571428573e-06, "loss": 1.8753, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 71, "tokens_per_second_per_gpu": 15.99 }, { "epoch": 0.05872756933115824, "grad_norm": 0.50225830078125, "learning_rate": 2.8979591836734694e-06, "loss": 1.9502, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 72, "tokens_per_second_per_gpu": 10.47 }, { "epoch": 0.05954323001631321, "grad_norm": 0.5393797159194946, "learning_rate": 2.9387755102040816e-06, "loss": 2.0252, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 73, "tokens_per_second_per_gpu": 12.25 }, { "epoch": 0.06035889070146819, "grad_norm": 0.5161297917366028, "learning_rate": 2.979591836734694e-06, "loss": 1.7385, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 74, "tokens_per_second_per_gpu": 20.27 }, { "epoch": 0.061174551386623165, "grad_norm": 0.40634122490882874, "learning_rate": 3.0204081632653064e-06, "loss": 1.8129, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 75, "tokens_per_second_per_gpu": 11.3 }, { "epoch": 0.06199021207177814, "grad_norm": 1.7107641696929932, "learning_rate": 3.0612244897959185e-06, "loss": 1.9759, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 76, "tokens_per_second_per_gpu": 12.34 }, { "epoch": 0.06280587275693311, "grad_norm": 0.4087129533290863, "learning_rate": 3.1020408163265307e-06, "loss": 1.9124, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 77, "tokens_per_second_per_gpu": 13.29 }, { "epoch": 0.0636215334420881, "grad_norm": 0.46592018008232117, "learning_rate": 3.1428571428571433e-06, "loss": 1.8621, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 78, "tokens_per_second_per_gpu": 15.21 }, { "epoch": 0.06443719412724307, "grad_norm": 0.4075947403907776, "learning_rate": 3.183673469387755e-06, "loss": 1.8218, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 79, "tokens_per_second_per_gpu": 16.62 }, { "epoch": 0.06525285481239804, "grad_norm": 0.4880898594856262, "learning_rate": 3.2244897959183672e-06, "loss": 1.8591, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 80, "tokens_per_second_per_gpu": 12.44 }, { "epoch": 0.06606851549755302, "grad_norm": 0.5287392735481262, "learning_rate": 3.26530612244898e-06, "loss": 1.8899, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 81, "tokens_per_second_per_gpu": 11.28 }, { "epoch": 0.06688417618270799, "grad_norm": 0.4736325144767761, "learning_rate": 3.306122448979592e-06, "loss": 1.7647, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 82, "tokens_per_second_per_gpu": 8.83 }, { "epoch": 0.06769983686786298, "grad_norm": 0.42015793919563293, "learning_rate": 3.346938775510204e-06, "loss": 1.856, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 83, "tokens_per_second_per_gpu": 14.43 }, { "epoch": 0.06851549755301795, "grad_norm": 0.4492851793766022, "learning_rate": 3.3877551020408164e-06, "loss": 1.8882, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 84, "tokens_per_second_per_gpu": 8.71 }, { "epoch": 0.06933115823817292, "grad_norm": 0.3998938202857971, "learning_rate": 3.428571428571429e-06, "loss": 1.8661, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 85, "tokens_per_second_per_gpu": 12.74 }, { "epoch": 0.0701468189233279, "grad_norm": 0.4164998233318329, "learning_rate": 3.469387755102041e-06, "loss": 1.907, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 86, "tokens_per_second_per_gpu": 13.98 }, { "epoch": 0.07096247960848287, "grad_norm": 0.4323839545249939, "learning_rate": 3.510204081632653e-06, "loss": 1.8721, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 87, "tokens_per_second_per_gpu": 9.14 }, { "epoch": 0.07177814029363784, "grad_norm": 0.4599860906600952, "learning_rate": 3.551020408163266e-06, "loss": 1.7586, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 88, "tokens_per_second_per_gpu": 9.77 }, { "epoch": 0.07259380097879282, "grad_norm": 0.6686786413192749, "learning_rate": 3.5918367346938777e-06, "loss": 1.8355, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 89, "tokens_per_second_per_gpu": 15.78 }, { "epoch": 0.0734094616639478, "grad_norm": 0.4024296700954437, "learning_rate": 3.63265306122449e-06, "loss": 1.7524, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 90, "tokens_per_second_per_gpu": 8.29 }, { "epoch": 0.07422512234910278, "grad_norm": 0.3434332013130188, "learning_rate": 3.6734693877551024e-06, "loss": 1.8173, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 91, "tokens_per_second_per_gpu": 15.87 }, { "epoch": 0.07504078303425775, "grad_norm": 0.40535256266593933, "learning_rate": 3.7142857142857146e-06, "loss": 1.804, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 92, "tokens_per_second_per_gpu": 14.11 }, { "epoch": 0.07585644371941272, "grad_norm": 0.3459508717060089, "learning_rate": 3.7551020408163268e-06, "loss": 1.7282, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 93, "tokens_per_second_per_gpu": 15.19 }, { "epoch": 0.0766721044045677, "grad_norm": 0.3481232523918152, "learning_rate": 3.7959183673469385e-06, "loss": 1.8058, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 94, "tokens_per_second_per_gpu": 4.98 }, { "epoch": 0.07748776508972267, "grad_norm": 0.3750293552875519, "learning_rate": 3.836734693877551e-06, "loss": 1.8359, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 95, "tokens_per_second_per_gpu": 10.35 }, { "epoch": 0.07830342577487764, "grad_norm": 0.5111256837844849, "learning_rate": 3.877551020408164e-06, "loss": 1.7988, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 96, "tokens_per_second_per_gpu": 11.01 }, { "epoch": 0.07911908646003263, "grad_norm": 0.4040432274341583, "learning_rate": 3.9183673469387755e-06, "loss": 1.8231, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 97, "tokens_per_second_per_gpu": 4.3 }, { "epoch": 0.0799347471451876, "grad_norm": 0.38245025277137756, "learning_rate": 3.959183673469388e-06, "loss": 1.7931, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 98, "tokens_per_second_per_gpu": 18.09 }, { "epoch": 0.08075040783034258, "grad_norm": 0.43279772996902466, "learning_rate": 4.000000000000001e-06, "loss": 1.8691, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 99, "tokens_per_second_per_gpu": 7.05 }, { "epoch": 0.08156606851549755, "grad_norm": 0.24266256392002106, "learning_rate": 4.040816326530612e-06, "loss": 1.7811, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 100, "tokens_per_second_per_gpu": 9.42 }, { "epoch": 0.08238172920065252, "grad_norm": 0.4181773364543915, "learning_rate": 4.081632653061224e-06, "loss": 1.7368, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 101, "tokens_per_second_per_gpu": 7.57 }, { "epoch": 0.08319738988580751, "grad_norm": 0.4386857748031616, "learning_rate": 4.122448979591837e-06, "loss": 1.814, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 102, "tokens_per_second_per_gpu": 10.61 }, { "epoch": 0.08401305057096248, "grad_norm": 0.3691022992134094, "learning_rate": 4.163265306122449e-06, "loss": 1.784, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 103, "tokens_per_second_per_gpu": 13.55 }, { "epoch": 0.08482871125611746, "grad_norm": 0.3794533908367157, "learning_rate": 4.204081632653061e-06, "loss": 1.8259, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 104, "tokens_per_second_per_gpu": 17.32 }, { "epoch": 0.08564437194127243, "grad_norm": 0.3870322108268738, "learning_rate": 4.244897959183674e-06, "loss": 1.6609, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 105, "tokens_per_second_per_gpu": 11.51 }, { "epoch": 0.0864600326264274, "grad_norm": 0.38991808891296387, "learning_rate": 4.285714285714286e-06, "loss": 1.761, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 106, "tokens_per_second_per_gpu": 15.63 }, { "epoch": 0.08727569331158239, "grad_norm": 1.304951786994934, "learning_rate": 4.326530612244898e-06, "loss": 1.8385, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 107, "tokens_per_second_per_gpu": 10.75 }, { "epoch": 0.08809135399673736, "grad_norm": 0.3592953681945801, "learning_rate": 4.367346938775511e-06, "loss": 1.8055, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 108, "tokens_per_second_per_gpu": 9.29 }, { "epoch": 0.08890701468189233, "grad_norm": 0.4183287024497986, "learning_rate": 4.408163265306122e-06, "loss": 1.7569, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 109, "tokens_per_second_per_gpu": 15.03 }, { "epoch": 0.08972267536704731, "grad_norm": 0.24654170870780945, "learning_rate": 4.448979591836735e-06, "loss": 1.7709, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 110, "tokens_per_second_per_gpu": 12.62 }, { "epoch": 0.09053833605220228, "grad_norm": 0.3184000253677368, "learning_rate": 4.489795918367347e-06, "loss": 1.9577, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 111, "tokens_per_second_per_gpu": 7.32 }, { "epoch": 0.09135399673735727, "grad_norm": 0.3506717383861542, "learning_rate": 4.530612244897959e-06, "loss": 1.6413, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 112, "tokens_per_second_per_gpu": 11.81 }, { "epoch": 0.09216965742251224, "grad_norm": 0.46599406003952026, "learning_rate": 4.571428571428572e-06, "loss": 1.6915, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 113, "tokens_per_second_per_gpu": 14.13 }, { "epoch": 0.0929853181076672, "grad_norm": 0.2948969602584839, "learning_rate": 4.612244897959184e-06, "loss": 1.7739, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 114, "tokens_per_second_per_gpu": 17.97 }, { "epoch": 0.09380097879282219, "grad_norm": 0.2474483698606491, "learning_rate": 4.653061224489796e-06, "loss": 1.8371, "memory/device_reserved (GiB)": 75.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 115, "tokens_per_second_per_gpu": 15.17 }, { "epoch": 0.09461663947797716, "grad_norm": 0.3324006497859955, "learning_rate": 4.693877551020408e-06, "loss": 1.5136, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 116, "tokens_per_second_per_gpu": 19.63 }, { "epoch": 0.09543230016313213, "grad_norm": 0.2667665183544159, "learning_rate": 4.734693877551021e-06, "loss": 1.6376, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 117, "tokens_per_second_per_gpu": 10.22 }, { "epoch": 0.09624796084828711, "grad_norm": 0.3425082266330719, "learning_rate": 4.775510204081632e-06, "loss": 1.7488, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 118, "tokens_per_second_per_gpu": 9.92 }, { "epoch": 0.09706362153344208, "grad_norm": 0.3169610798358917, "learning_rate": 4.816326530612245e-06, "loss": 1.6325, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 119, "tokens_per_second_per_gpu": 15.57 }, { "epoch": 0.09787928221859707, "grad_norm": 0.24414299428462982, "learning_rate": 4.857142857142858e-06, "loss": 1.6763, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 120, "tokens_per_second_per_gpu": 11.96 }, { "epoch": 0.09869494290375204, "grad_norm": 0.28090181946754456, "learning_rate": 4.897959183673469e-06, "loss": 1.6614, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 121, "tokens_per_second_per_gpu": 13.92 }, { "epoch": 0.09951060358890701, "grad_norm": 0.4507470428943634, "learning_rate": 4.938775510204082e-06, "loss": 1.7077, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 122, "tokens_per_second_per_gpu": 7.61 }, { "epoch": 0.100326264274062, "grad_norm": 0.39393696188926697, "learning_rate": 4.9795918367346945e-06, "loss": 1.7571, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 123, "tokens_per_second_per_gpu": 6.55 }, { "epoch": 0.10114192495921696, "grad_norm": 0.30896738171577454, "learning_rate": 5.020408163265306e-06, "loss": 1.6731, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 124, "tokens_per_second_per_gpu": 14.76 }, { "epoch": 0.10195758564437195, "grad_norm": 0.26113393902778625, "learning_rate": 5.061224489795919e-06, "loss": 1.7381, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 125, "tokens_per_second_per_gpu": 20.75 }, { "epoch": 0.10277324632952692, "grad_norm": 0.3467881679534912, "learning_rate": 5.102040816326531e-06, "loss": 1.7007, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 126, "tokens_per_second_per_gpu": 12.99 }, { "epoch": 0.10358890701468189, "grad_norm": 0.33074134588241577, "learning_rate": 5.142857142857143e-06, "loss": 1.7315, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 127, "tokens_per_second_per_gpu": 10.63 }, { "epoch": 0.10440456769983687, "grad_norm": 0.31264668703079224, "learning_rate": 5.183673469387755e-06, "loss": 1.6664, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 128, "tokens_per_second_per_gpu": 7.19 }, { "epoch": 0.10522022838499184, "grad_norm": 0.44500023126602173, "learning_rate": 5.224489795918368e-06, "loss": 1.698, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 129, "tokens_per_second_per_gpu": 12.71 }, { "epoch": 0.10603588907014681, "grad_norm": 0.3809625804424286, "learning_rate": 5.26530612244898e-06, "loss": 1.6826, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 130, "tokens_per_second_per_gpu": 8.39 }, { "epoch": 0.1068515497553018, "grad_norm": 0.35113289952278137, "learning_rate": 5.306122448979592e-06, "loss": 1.7119, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 131, "tokens_per_second_per_gpu": 10.92 }, { "epoch": 0.10766721044045677, "grad_norm": 0.32445791363716125, "learning_rate": 5.3469387755102045e-06, "loss": 1.6425, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 132, "tokens_per_second_per_gpu": 17.01 }, { "epoch": 0.10848287112561175, "grad_norm": 0.33810552954673767, "learning_rate": 5.387755102040816e-06, "loss": 1.6477, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 133, "tokens_per_second_per_gpu": 12.1 }, { "epoch": 0.10929853181076672, "grad_norm": 0.5386196374893188, "learning_rate": 5.428571428571429e-06, "loss": 1.5945, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 134, "tokens_per_second_per_gpu": 17.03 }, { "epoch": 0.11011419249592169, "grad_norm": 0.21357719600200653, "learning_rate": 5.469387755102041e-06, "loss": 1.6656, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 135, "tokens_per_second_per_gpu": 15.31 }, { "epoch": 0.11092985318107668, "grad_norm": 0.464016854763031, "learning_rate": 5.510204081632653e-06, "loss": 1.7119, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 136, "tokens_per_second_per_gpu": 8.79 }, { "epoch": 0.11174551386623165, "grad_norm": 0.30432239174842834, "learning_rate": 5.551020408163266e-06, "loss": 1.7467, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 137, "tokens_per_second_per_gpu": 14.53 }, { "epoch": 0.11256117455138662, "grad_norm": 0.33321642875671387, "learning_rate": 5.5918367346938776e-06, "loss": 1.6428, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 138, "tokens_per_second_per_gpu": 10.04 }, { "epoch": 0.1133768352365416, "grad_norm": 0.30344119668006897, "learning_rate": 5.63265306122449e-06, "loss": 1.6499, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 139, "tokens_per_second_per_gpu": 12.44 }, { "epoch": 0.11419249592169657, "grad_norm": 0.32138192653656006, "learning_rate": 5.673469387755103e-06, "loss": 1.7486, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 140, "tokens_per_second_per_gpu": 11.15 }, { "epoch": 0.11500815660685156, "grad_norm": 0.26749706268310547, "learning_rate": 5.7142857142857145e-06, "loss": 1.5062, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 141, "tokens_per_second_per_gpu": 5.36 }, { "epoch": 0.11582381729200653, "grad_norm": 0.2847835421562195, "learning_rate": 5.755102040816327e-06, "loss": 1.5893, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 142, "tokens_per_second_per_gpu": 17.7 }, { "epoch": 0.1166394779771615, "grad_norm": 0.5660161375999451, "learning_rate": 5.795918367346939e-06, "loss": 1.6985, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 143, "tokens_per_second_per_gpu": 16.96 }, { "epoch": 0.11745513866231648, "grad_norm": 0.22120146453380585, "learning_rate": 5.8367346938775515e-06, "loss": 1.6977, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 144, "tokens_per_second_per_gpu": 8.73 }, { "epoch": 0.11827079934747145, "grad_norm": 0.264000803232193, "learning_rate": 5.877551020408163e-06, "loss": 1.6313, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 145, "tokens_per_second_per_gpu": 14.59 }, { "epoch": 0.11908646003262642, "grad_norm": 0.37124985456466675, "learning_rate": 5.918367346938776e-06, "loss": 1.6288, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 146, "tokens_per_second_per_gpu": 19.56 }, { "epoch": 0.1199021207177814, "grad_norm": 0.23694360256195068, "learning_rate": 5.959183673469388e-06, "loss": 1.7389, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 147, "tokens_per_second_per_gpu": 13.09 }, { "epoch": 0.12071778140293637, "grad_norm": 0.28653818368911743, "learning_rate": 6e-06, "loss": 1.6576, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 148, "tokens_per_second_per_gpu": 15.05 }, { "epoch": 0.12153344208809136, "grad_norm": 0.34486496448516846, "learning_rate": 6.040816326530613e-06, "loss": 1.6156, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 149, "tokens_per_second_per_gpu": 6.43 }, { "epoch": 0.12234910277324633, "grad_norm": 0.24030032753944397, "learning_rate": 6.0816326530612245e-06, "loss": 1.6446, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 150, "tokens_per_second_per_gpu": 10.4 }, { "epoch": 0.1231647634584013, "grad_norm": 0.3838251233100891, "learning_rate": 6.122448979591837e-06, "loss": 1.6748, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 151, "tokens_per_second_per_gpu": 7.17 }, { "epoch": 0.12398042414355628, "grad_norm": 0.2585683763027191, "learning_rate": 6.163265306122449e-06, "loss": 1.6171, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 152, "tokens_per_second_per_gpu": 9.79 }, { "epoch": 0.12479608482871125, "grad_norm": 0.31367412209510803, "learning_rate": 6.2040816326530614e-06, "loss": 1.629, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 153, "tokens_per_second_per_gpu": 10.48 }, { "epoch": 0.12561174551386622, "grad_norm": 0.2558131217956543, "learning_rate": 6.244897959183674e-06, "loss": 1.6727, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 154, "tokens_per_second_per_gpu": 15.0 }, { "epoch": 0.1264274061990212, "grad_norm": 0.29049068689346313, "learning_rate": 6.285714285714287e-06, "loss": 1.6949, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 155, "tokens_per_second_per_gpu": 17.28 }, { "epoch": 0.1272430668841762, "grad_norm": 0.33925697207450867, "learning_rate": 6.3265306122448975e-06, "loss": 1.5485, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 156, "tokens_per_second_per_gpu": 10.82 }, { "epoch": 0.12805872756933115, "grad_norm": 0.41309934854507446, "learning_rate": 6.36734693877551e-06, "loss": 1.665, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 157, "tokens_per_second_per_gpu": 10.38 }, { "epoch": 0.12887438825448613, "grad_norm": 0.18366199731826782, "learning_rate": 6.408163265306124e-06, "loss": 1.5401, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 158, "tokens_per_second_per_gpu": 16.37 }, { "epoch": 0.12969004893964112, "grad_norm": 0.40772685408592224, "learning_rate": 6.4489795918367345e-06, "loss": 1.6404, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 159, "tokens_per_second_per_gpu": 15.91 }, { "epoch": 0.13050570962479607, "grad_norm": 0.3082660436630249, "learning_rate": 6.489795918367347e-06, "loss": 1.6602, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 160, "tokens_per_second_per_gpu": 12.39 }, { "epoch": 0.13132137030995106, "grad_norm": 0.17388151586055756, "learning_rate": 6.53061224489796e-06, "loss": 1.612, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 161, "tokens_per_second_per_gpu": 14.7 }, { "epoch": 0.13213703099510604, "grad_norm": 0.3215138912200928, "learning_rate": 6.5714285714285714e-06, "loss": 1.5761, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 162, "tokens_per_second_per_gpu": 17.09 }, { "epoch": 0.132952691680261, "grad_norm": 0.2463226318359375, "learning_rate": 6.612244897959184e-06, "loss": 1.6133, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 163, "tokens_per_second_per_gpu": 5.75 }, { "epoch": 0.13376835236541598, "grad_norm": 0.2863466143608093, "learning_rate": 6.653061224489797e-06, "loss": 1.555, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 164, "tokens_per_second_per_gpu": 11.89 }, { "epoch": 0.13458401305057097, "grad_norm": 0.301667183637619, "learning_rate": 6.693877551020408e-06, "loss": 1.5679, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 165, "tokens_per_second_per_gpu": 15.47 }, { "epoch": 0.13539967373572595, "grad_norm": 0.3648882508277893, "learning_rate": 6.734693877551021e-06, "loss": 1.546, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 166, "tokens_per_second_per_gpu": 13.08 }, { "epoch": 0.1362153344208809, "grad_norm": 0.207938551902771, "learning_rate": 6.775510204081633e-06, "loss": 1.5652, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 167, "tokens_per_second_per_gpu": 13.82 }, { "epoch": 0.1370309951060359, "grad_norm": 0.33382394909858704, "learning_rate": 6.816326530612245e-06, "loss": 1.6225, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 168, "tokens_per_second_per_gpu": 6.9 }, { "epoch": 0.13784665579119088, "grad_norm": 0.3136134743690491, "learning_rate": 6.857142857142858e-06, "loss": 1.5525, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 169, "tokens_per_second_per_gpu": 10.94 }, { "epoch": 0.13866231647634583, "grad_norm": 0.3100225031375885, "learning_rate": 6.897959183673469e-06, "loss": 1.6161, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 170, "tokens_per_second_per_gpu": 13.67 }, { "epoch": 0.13947797716150082, "grad_norm": 0.4159872829914093, "learning_rate": 6.938775510204082e-06, "loss": 1.4783, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 171, "tokens_per_second_per_gpu": 9.79 }, { "epoch": 0.1402936378466558, "grad_norm": 0.28888729214668274, "learning_rate": 6.979591836734695e-06, "loss": 1.6408, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 172, "tokens_per_second_per_gpu": 9.05 }, { "epoch": 0.14110929853181076, "grad_norm": 0.3259894549846649, "learning_rate": 7.020408163265306e-06, "loss": 1.6154, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 173, "tokens_per_second_per_gpu": 8.93 }, { "epoch": 0.14192495921696574, "grad_norm": 0.34797877073287964, "learning_rate": 7.061224489795918e-06, "loss": 1.5343, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 174, "tokens_per_second_per_gpu": 8.61 }, { "epoch": 0.14274061990212072, "grad_norm": 0.28687313199043274, "learning_rate": 7.102040816326532e-06, "loss": 1.611, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 175, "tokens_per_second_per_gpu": 14.52 }, { "epoch": 0.14355628058727568, "grad_norm": 0.2807822525501251, "learning_rate": 7.142857142857143e-06, "loss": 1.5567, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 176, "tokens_per_second_per_gpu": 13.13 }, { "epoch": 0.14437194127243066, "grad_norm": 0.2929311990737915, "learning_rate": 7.183673469387755e-06, "loss": 1.681, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 177, "tokens_per_second_per_gpu": 12.07 }, { "epoch": 0.14518760195758565, "grad_norm": 0.2921620309352875, "learning_rate": 7.224489795918368e-06, "loss": 1.4854, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 178, "tokens_per_second_per_gpu": 9.77 }, { "epoch": 0.14600326264274063, "grad_norm": 0.3116817772388458, "learning_rate": 7.26530612244898e-06, "loss": 1.5873, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 179, "tokens_per_second_per_gpu": 12.69 }, { "epoch": 0.1468189233278956, "grad_norm": 0.265588641166687, "learning_rate": 7.306122448979592e-06, "loss": 1.7307, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 180, "tokens_per_second_per_gpu": 6.67 }, { "epoch": 0.14763458401305057, "grad_norm": 0.38359183073043823, "learning_rate": 7.346938775510205e-06, "loss": 1.5306, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 181, "tokens_per_second_per_gpu": 15.28 }, { "epoch": 0.14845024469820556, "grad_norm": 0.3852451741695404, "learning_rate": 7.387755102040817e-06, "loss": 1.6024, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 182, "tokens_per_second_per_gpu": 6.74 }, { "epoch": 0.14926590538336051, "grad_norm": 0.28015273809432983, "learning_rate": 7.428571428571429e-06, "loss": 1.5927, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 183, "tokens_per_second_per_gpu": 20.25 }, { "epoch": 0.1500815660685155, "grad_norm": 0.8965319991111755, "learning_rate": 7.46938775510204e-06, "loss": 1.6301, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 184, "tokens_per_second_per_gpu": 9.36 }, { "epoch": 0.15089722675367048, "grad_norm": 0.4190131425857544, "learning_rate": 7.5102040816326536e-06, "loss": 1.5866, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 185, "tokens_per_second_per_gpu": 16.62 }, { "epoch": 0.15171288743882544, "grad_norm": 0.4055452346801758, "learning_rate": 7.551020408163266e-06, "loss": 1.6005, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 186, "tokens_per_second_per_gpu": 5.41 }, { "epoch": 0.15252854812398042, "grad_norm": 0.29650765657424927, "learning_rate": 7.591836734693877e-06, "loss": 1.5474, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 187, "tokens_per_second_per_gpu": 14.06 }, { "epoch": 0.1533442088091354, "grad_norm": 0.2841929793357849, "learning_rate": 7.63265306122449e-06, "loss": 1.5782, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 188, "tokens_per_second_per_gpu": 9.2 }, { "epoch": 0.15415986949429036, "grad_norm": 0.3095893859863281, "learning_rate": 7.673469387755102e-06, "loss": 1.5587, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 189, "tokens_per_second_per_gpu": 10.08 }, { "epoch": 0.15497553017944535, "grad_norm": 0.7705554962158203, "learning_rate": 7.714285714285714e-06, "loss": 1.7203, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 190, "tokens_per_second_per_gpu": 11.18 }, { "epoch": 0.15579119086460033, "grad_norm": 0.2898006737232208, "learning_rate": 7.755102040816327e-06, "loss": 1.5739, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 191, "tokens_per_second_per_gpu": 10.16 }, { "epoch": 0.1566068515497553, "grad_norm": 0.25893375277519226, "learning_rate": 7.79591836734694e-06, "loss": 1.5734, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 192, "tokens_per_second_per_gpu": 15.82 }, { "epoch": 0.15742251223491027, "grad_norm": 0.3126630485057831, "learning_rate": 7.836734693877551e-06, "loss": 1.6046, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 193, "tokens_per_second_per_gpu": 5.89 }, { "epoch": 0.15823817292006526, "grad_norm": 0.3616257905960083, "learning_rate": 7.877551020408164e-06, "loss": 1.6217, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 194, "tokens_per_second_per_gpu": 12.6 }, { "epoch": 0.15905383360522024, "grad_norm": 0.24247130751609802, "learning_rate": 7.918367346938776e-06, "loss": 1.599, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 195, "tokens_per_second_per_gpu": 9.76 }, { "epoch": 0.1598694942903752, "grad_norm": 0.29351916909217834, "learning_rate": 7.959183673469388e-06, "loss": 1.5428, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 196, "tokens_per_second_per_gpu": 9.46 }, { "epoch": 0.16068515497553018, "grad_norm": 0.27666643261909485, "learning_rate": 8.000000000000001e-06, "loss": 1.6079, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 197, "tokens_per_second_per_gpu": 9.26 }, { "epoch": 0.16150081566068517, "grad_norm": 0.44331464171409607, "learning_rate": 8.040816326530613e-06, "loss": 1.5421, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 198, "tokens_per_second_per_gpu": 9.26 }, { "epoch": 0.16231647634584012, "grad_norm": 0.6585918068885803, "learning_rate": 8.081632653061225e-06, "loss": 1.5476, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 199, "tokens_per_second_per_gpu": 10.3 }, { "epoch": 0.1631321370309951, "grad_norm": 1.1724046468734741, "learning_rate": 8.122448979591837e-06, "loss": 1.5848, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 200, "tokens_per_second_per_gpu": 13.99 }, { "epoch": 0.1639477977161501, "grad_norm": 0.369884192943573, "learning_rate": 8.163265306122448e-06, "loss": 1.4983, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 201, "tokens_per_second_per_gpu": 11.73 }, { "epoch": 0.16476345840130505, "grad_norm": 0.2741272449493408, "learning_rate": 8.204081632653062e-06, "loss": 1.6027, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 202, "tokens_per_second_per_gpu": 26.87 }, { "epoch": 0.16557911908646003, "grad_norm": 0.3122844099998474, "learning_rate": 8.244897959183674e-06, "loss": 1.5815, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 203, "tokens_per_second_per_gpu": 9.37 }, { "epoch": 0.16639477977161501, "grad_norm": 0.35319003462791443, "learning_rate": 8.285714285714285e-06, "loss": 1.6084, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 204, "tokens_per_second_per_gpu": 17.0 }, { "epoch": 0.16721044045676997, "grad_norm": 0.3192112445831299, "learning_rate": 8.326530612244899e-06, "loss": 1.515, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 205, "tokens_per_second_per_gpu": 12.3 }, { "epoch": 0.16802610114192496, "grad_norm": 0.3584693968296051, "learning_rate": 8.36734693877551e-06, "loss": 1.5695, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 206, "tokens_per_second_per_gpu": 7.06 }, { "epoch": 0.16884176182707994, "grad_norm": 0.2820286452770233, "learning_rate": 8.408163265306122e-06, "loss": 1.5211, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 207, "tokens_per_second_per_gpu": 15.69 }, { "epoch": 0.16965742251223492, "grad_norm": 0.2145698517560959, "learning_rate": 8.448979591836736e-06, "loss": 1.5605, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 208, "tokens_per_second_per_gpu": 14.09 }, { "epoch": 0.17047308319738988, "grad_norm": 0.3924921452999115, "learning_rate": 8.489795918367347e-06, "loss": 1.5046, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 209, "tokens_per_second_per_gpu": 6.68 }, { "epoch": 0.17128874388254486, "grad_norm": 0.26540106534957886, "learning_rate": 8.53061224489796e-06, "loss": 1.5704, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 210, "tokens_per_second_per_gpu": 12.63 }, { "epoch": 0.17210440456769985, "grad_norm": 0.28012341260910034, "learning_rate": 8.571428571428573e-06, "loss": 1.5622, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 211, "tokens_per_second_per_gpu": 11.87 }, { "epoch": 0.1729200652528548, "grad_norm": 0.3970394432544708, "learning_rate": 8.612244897959184e-06, "loss": 1.5495, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 212, "tokens_per_second_per_gpu": 9.77 }, { "epoch": 0.1737357259380098, "grad_norm": 0.27244076132774353, "learning_rate": 8.653061224489796e-06, "loss": 1.5304, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 213, "tokens_per_second_per_gpu": 17.72 }, { "epoch": 0.17455138662316477, "grad_norm": 0.3282474875450134, "learning_rate": 8.69387755102041e-06, "loss": 1.5398, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 214, "tokens_per_second_per_gpu": 7.33 }, { "epoch": 0.17536704730831973, "grad_norm": 0.33740946650505066, "learning_rate": 8.734693877551021e-06, "loss": 1.6209, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 215, "tokens_per_second_per_gpu": 10.17 }, { "epoch": 0.1761827079934747, "grad_norm": 0.2922510504722595, "learning_rate": 8.775510204081633e-06, "loss": 1.5085, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 216, "tokens_per_second_per_gpu": 19.9 }, { "epoch": 0.1769983686786297, "grad_norm": 0.3444550633430481, "learning_rate": 8.816326530612245e-06, "loss": 1.5319, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 217, "tokens_per_second_per_gpu": 13.23 }, { "epoch": 0.17781402936378465, "grad_norm": 0.33822599053382874, "learning_rate": 8.857142857142857e-06, "loss": 1.5974, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 218, "tokens_per_second_per_gpu": 9.71 }, { "epoch": 0.17862969004893964, "grad_norm": 0.3765545189380646, "learning_rate": 8.89795918367347e-06, "loss": 1.4956, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 219, "tokens_per_second_per_gpu": 4.0 }, { "epoch": 0.17944535073409462, "grad_norm": 0.41983312368392944, "learning_rate": 8.938775510204082e-06, "loss": 1.6075, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 220, "tokens_per_second_per_gpu": 14.99 }, { "epoch": 0.1802610114192496, "grad_norm": 0.2781478762626648, "learning_rate": 8.979591836734694e-06, "loss": 1.5367, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 221, "tokens_per_second_per_gpu": 14.41 }, { "epoch": 0.18107667210440456, "grad_norm": 0.4371727705001831, "learning_rate": 9.020408163265307e-06, "loss": 1.4141, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 222, "tokens_per_second_per_gpu": 10.58 }, { "epoch": 0.18189233278955955, "grad_norm": 0.44839075207710266, "learning_rate": 9.061224489795919e-06, "loss": 1.5599, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 223, "tokens_per_second_per_gpu": 14.15 }, { "epoch": 0.18270799347471453, "grad_norm": 0.38045358657836914, "learning_rate": 9.10204081632653e-06, "loss": 1.6026, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 224, "tokens_per_second_per_gpu": 3.78 }, { "epoch": 0.1835236541598695, "grad_norm": 0.34852662682533264, "learning_rate": 9.142857142857144e-06, "loss": 1.5311, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 225, "tokens_per_second_per_gpu": 2.68 }, { "epoch": 0.18433931484502447, "grad_norm": 0.28683966398239136, "learning_rate": 9.183673469387756e-06, "loss": 1.502, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 226, "tokens_per_second_per_gpu": 5.36 }, { "epoch": 0.18515497553017946, "grad_norm": 0.3060706555843353, "learning_rate": 9.224489795918367e-06, "loss": 1.5929, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 227, "tokens_per_second_per_gpu": 14.43 }, { "epoch": 0.1859706362153344, "grad_norm": 0.27326321601867676, "learning_rate": 9.26530612244898e-06, "loss": 1.5066, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 228, "tokens_per_second_per_gpu": 13.84 }, { "epoch": 0.1867862969004894, "grad_norm": 0.33330002427101135, "learning_rate": 9.306122448979593e-06, "loss": 1.4772, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 229, "tokens_per_second_per_gpu": 10.83 }, { "epoch": 0.18760195758564438, "grad_norm": 0.3110118806362152, "learning_rate": 9.346938775510204e-06, "loss": 1.4824, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 230, "tokens_per_second_per_gpu": 13.76 }, { "epoch": 0.18841761827079934, "grad_norm": 0.2643192410469055, "learning_rate": 9.387755102040816e-06, "loss": 1.4771, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 231, "tokens_per_second_per_gpu": 13.14 }, { "epoch": 0.18923327895595432, "grad_norm": 0.29222267866134644, "learning_rate": 9.42857142857143e-06, "loss": 1.5651, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 232, "tokens_per_second_per_gpu": 12.66 }, { "epoch": 0.1900489396411093, "grad_norm": 0.35375407338142395, "learning_rate": 9.469387755102041e-06, "loss": 1.4593, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 233, "tokens_per_second_per_gpu": 9.83 }, { "epoch": 0.19086460032626426, "grad_norm": 0.25826701521873474, "learning_rate": 9.510204081632653e-06, "loss": 1.4387, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 234, "tokens_per_second_per_gpu": 21.25 }, { "epoch": 0.19168026101141925, "grad_norm": 0.25453484058380127, "learning_rate": 9.551020408163265e-06, "loss": 1.5052, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 235, "tokens_per_second_per_gpu": 23.79 }, { "epoch": 0.19249592169657423, "grad_norm": 0.3481979966163635, "learning_rate": 9.591836734693878e-06, "loss": 1.4809, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 236, "tokens_per_second_per_gpu": 19.89 }, { "epoch": 0.1933115823817292, "grad_norm": 0.2821756899356842, "learning_rate": 9.63265306122449e-06, "loss": 1.4682, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 237, "tokens_per_second_per_gpu": 19.96 }, { "epoch": 0.19412724306688417, "grad_norm": 0.4794289469718933, "learning_rate": 9.673469387755102e-06, "loss": 1.5519, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 238, "tokens_per_second_per_gpu": 5.94 }, { "epoch": 0.19494290375203915, "grad_norm": 0.2830967605113983, "learning_rate": 9.714285714285715e-06, "loss": 1.5095, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 239, "tokens_per_second_per_gpu": 12.84 }, { "epoch": 0.19575856443719414, "grad_norm": 0.3756522536277771, "learning_rate": 9.755102040816327e-06, "loss": 1.5275, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 240, "tokens_per_second_per_gpu": 11.54 }, { "epoch": 0.1965742251223491, "grad_norm": 0.4045179486274719, "learning_rate": 9.795918367346939e-06, "loss": 1.4649, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 241, "tokens_per_second_per_gpu": 12.23 }, { "epoch": 0.19738988580750408, "grad_norm": 0.4029671251773834, "learning_rate": 9.836734693877552e-06, "loss": 1.5669, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 242, "tokens_per_second_per_gpu": 8.3 }, { "epoch": 0.19820554649265906, "grad_norm": 0.23653516173362732, "learning_rate": 9.877551020408164e-06, "loss": 1.6576, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 243, "tokens_per_second_per_gpu": 6.97 }, { "epoch": 0.19902120717781402, "grad_norm": 0.6236711144447327, "learning_rate": 9.918367346938776e-06, "loss": 1.4917, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 244, "tokens_per_second_per_gpu": 10.82 }, { "epoch": 0.199836867862969, "grad_norm": 0.3831644058227539, "learning_rate": 9.959183673469389e-06, "loss": 1.5023, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 245, "tokens_per_second_per_gpu": 10.87 }, { "epoch": 0.200652528548124, "grad_norm": 0.5063720345497131, "learning_rate": 1e-05, "loss": 1.5048, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 246, "tokens_per_second_per_gpu": 5.04 }, { "epoch": 0.20146818923327894, "grad_norm": 0.28379303216934204, "learning_rate": 1.0040816326530613e-05, "loss": 1.5941, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 247, "tokens_per_second_per_gpu": 10.07 }, { "epoch": 0.20228384991843393, "grad_norm": 0.22729305922985077, "learning_rate": 1.0081632653061224e-05, "loss": 1.4813, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 248, "tokens_per_second_per_gpu": 11.73 }, { "epoch": 0.2030995106035889, "grad_norm": 0.37438860535621643, "learning_rate": 1.0122448979591838e-05, "loss": 1.5412, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 249, "tokens_per_second_per_gpu": 8.66 }, { "epoch": 0.2039151712887439, "grad_norm": 0.9508554339408875, "learning_rate": 1.016326530612245e-05, "loss": 1.5866, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 250, "tokens_per_second_per_gpu": 10.81 }, { "epoch": 0.20473083197389885, "grad_norm": 0.3455393314361572, "learning_rate": 1.0204081632653061e-05, "loss": 1.4988, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 251, "tokens_per_second_per_gpu": 10.93 }, { "epoch": 0.20554649265905384, "grad_norm": 0.46677708625793457, "learning_rate": 1.0244897959183673e-05, "loss": 1.4909, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 252, "tokens_per_second_per_gpu": 3.89 }, { "epoch": 0.20636215334420882, "grad_norm": 0.3540324866771698, "learning_rate": 1.0285714285714286e-05, "loss": 1.5046, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 253, "tokens_per_second_per_gpu": 16.77 }, { "epoch": 0.20717781402936378, "grad_norm": 0.3343189060688019, "learning_rate": 1.0326530612244898e-05, "loss": 1.5763, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 254, "tokens_per_second_per_gpu": 10.46 }, { "epoch": 0.20799347471451876, "grad_norm": 0.6823508739471436, "learning_rate": 1.036734693877551e-05, "loss": 1.4718, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 255, "tokens_per_second_per_gpu": 14.01 }, { "epoch": 0.20880913539967375, "grad_norm": 0.36964482069015503, "learning_rate": 1.0408163265306123e-05, "loss": 1.4782, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 256, "tokens_per_second_per_gpu": 13.08 }, { "epoch": 0.2096247960848287, "grad_norm": 0.33643579483032227, "learning_rate": 1.0448979591836735e-05, "loss": 1.4444, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 257, "tokens_per_second_per_gpu": 16.94 }, { "epoch": 0.21044045676998369, "grad_norm": 0.3572589159011841, "learning_rate": 1.0489795918367347e-05, "loss": 1.4342, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 258, "tokens_per_second_per_gpu": 13.09 }, { "epoch": 0.21125611745513867, "grad_norm": 0.2646964490413666, "learning_rate": 1.053061224489796e-05, "loss": 1.4733, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 259, "tokens_per_second_per_gpu": 10.43 }, { "epoch": 0.21207177814029363, "grad_norm": 0.28068482875823975, "learning_rate": 1.0571428571428572e-05, "loss": 1.4783, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 260, "tokens_per_second_per_gpu": 8.72 }, { "epoch": 0.2128874388254486, "grad_norm": 0.4587586224079132, "learning_rate": 1.0612244897959184e-05, "loss": 1.5932, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 261, "tokens_per_second_per_gpu": 5.08 }, { "epoch": 0.2137030995106036, "grad_norm": 0.5011075139045715, "learning_rate": 1.0653061224489797e-05, "loss": 1.5691, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 262, "tokens_per_second_per_gpu": 11.18 }, { "epoch": 0.21451876019575855, "grad_norm": 0.31809067726135254, "learning_rate": 1.0693877551020409e-05, "loss": 1.5148, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 263, "tokens_per_second_per_gpu": 10.4 }, { "epoch": 0.21533442088091354, "grad_norm": 0.4068273901939392, "learning_rate": 1.073469387755102e-05, "loss": 1.5746, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 264, "tokens_per_second_per_gpu": 20.12 }, { "epoch": 0.21615008156606852, "grad_norm": 0.4858180284500122, "learning_rate": 1.0775510204081633e-05, "loss": 1.5104, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 265, "tokens_per_second_per_gpu": 6.89 }, { "epoch": 0.2169657422512235, "grad_norm": 0.40557336807250977, "learning_rate": 1.0816326530612246e-05, "loss": 1.5367, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 266, "tokens_per_second_per_gpu": 10.27 }, { "epoch": 0.21778140293637846, "grad_norm": 0.38423269987106323, "learning_rate": 1.0857142857142858e-05, "loss": 1.6255, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 267, "tokens_per_second_per_gpu": 7.66 }, { "epoch": 0.21859706362153344, "grad_norm": 0.31470102071762085, "learning_rate": 1.089795918367347e-05, "loss": 1.3953, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 268, "tokens_per_second_per_gpu": 13.33 }, { "epoch": 0.21941272430668843, "grad_norm": 0.3152869939804077, "learning_rate": 1.0938775510204081e-05, "loss": 1.4999, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 269, "tokens_per_second_per_gpu": 11.55 }, { "epoch": 0.22022838499184338, "grad_norm": 0.29454201459884644, "learning_rate": 1.0979591836734695e-05, "loss": 1.382, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 270, "tokens_per_second_per_gpu": 17.45 }, { "epoch": 0.22104404567699837, "grad_norm": 0.2620077133178711, "learning_rate": 1.1020408163265306e-05, "loss": 1.4639, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 271, "tokens_per_second_per_gpu": 15.71 }, { "epoch": 0.22185970636215335, "grad_norm": 0.3115851879119873, "learning_rate": 1.1061224489795918e-05, "loss": 1.456, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 272, "tokens_per_second_per_gpu": 7.65 }, { "epoch": 0.2226753670473083, "grad_norm": 0.3818085789680481, "learning_rate": 1.1102040816326532e-05, "loss": 1.5793, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 273, "tokens_per_second_per_gpu": 7.54 }, { "epoch": 0.2234910277324633, "grad_norm": 0.3348657786846161, "learning_rate": 1.1142857142857143e-05, "loss": 1.4849, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 274, "tokens_per_second_per_gpu": 13.26 }, { "epoch": 0.22430668841761828, "grad_norm": 0.34314364194869995, "learning_rate": 1.1183673469387755e-05, "loss": 1.4839, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 275, "tokens_per_second_per_gpu": 15.57 }, { "epoch": 0.22512234910277323, "grad_norm": 0.27445629239082336, "learning_rate": 1.1224489795918369e-05, "loss": 1.4787, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 276, "tokens_per_second_per_gpu": 12.08 }, { "epoch": 0.22593800978792822, "grad_norm": 0.38252323865890503, "learning_rate": 1.126530612244898e-05, "loss": 1.4601, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 277, "tokens_per_second_per_gpu": 11.5 }, { "epoch": 0.2267536704730832, "grad_norm": 0.3560408353805542, "learning_rate": 1.1306122448979592e-05, "loss": 1.5184, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 278, "tokens_per_second_per_gpu": 11.47 }, { "epoch": 0.2275693311582382, "grad_norm": 0.27676957845687866, "learning_rate": 1.1346938775510206e-05, "loss": 1.4324, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 279, "tokens_per_second_per_gpu": 10.24 }, { "epoch": 0.22838499184339314, "grad_norm": 0.3509454131126404, "learning_rate": 1.1387755102040817e-05, "loss": 1.502, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 280, "tokens_per_second_per_gpu": 10.73 }, { "epoch": 0.22920065252854813, "grad_norm": 0.32813915610313416, "learning_rate": 1.1428571428571429e-05, "loss": 1.4284, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 281, "tokens_per_second_per_gpu": 11.0 }, { "epoch": 0.2300163132137031, "grad_norm": 0.3885466456413269, "learning_rate": 1.146938775510204e-05, "loss": 1.515, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 282, "tokens_per_second_per_gpu": 16.26 }, { "epoch": 0.23083197389885807, "grad_norm": 0.29951179027557373, "learning_rate": 1.1510204081632654e-05, "loss": 1.2357, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 283, "tokens_per_second_per_gpu": 14.83 }, { "epoch": 0.23164763458401305, "grad_norm": 0.2931567430496216, "learning_rate": 1.1551020408163266e-05, "loss": 1.4157, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 284, "tokens_per_second_per_gpu": 22.6 }, { "epoch": 0.23246329526916804, "grad_norm": 0.3267894685268402, "learning_rate": 1.1591836734693878e-05, "loss": 1.4795, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 285, "tokens_per_second_per_gpu": 15.61 }, { "epoch": 0.233278955954323, "grad_norm": 0.5030086040496826, "learning_rate": 1.163265306122449e-05, "loss": 1.5606, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 286, "tokens_per_second_per_gpu": 11.11 }, { "epoch": 0.23409461663947798, "grad_norm": 1.3993918895721436, "learning_rate": 1.1673469387755103e-05, "loss": 1.5339, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 287, "tokens_per_second_per_gpu": 12.47 }, { "epoch": 0.23491027732463296, "grad_norm": 0.43690645694732666, "learning_rate": 1.1714285714285715e-05, "loss": 1.4066, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 288, "tokens_per_second_per_gpu": 8.35 }, { "epoch": 0.23572593800978792, "grad_norm": 0.3441956639289856, "learning_rate": 1.1755102040816326e-05, "loss": 1.4718, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 289, "tokens_per_second_per_gpu": 7.23 }, { "epoch": 0.2365415986949429, "grad_norm": 0.34400686621665955, "learning_rate": 1.179591836734694e-05, "loss": 1.5299, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 290, "tokens_per_second_per_gpu": 15.34 }, { "epoch": 0.23735725938009788, "grad_norm": 0.32894930243492126, "learning_rate": 1.1836734693877552e-05, "loss": 1.483, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 291, "tokens_per_second_per_gpu": 15.31 }, { "epoch": 0.23817292006525284, "grad_norm": 0.3937448263168335, "learning_rate": 1.1877551020408163e-05, "loss": 1.4522, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 292, "tokens_per_second_per_gpu": 14.14 }, { "epoch": 0.23898858075040783, "grad_norm": 1.1808565855026245, "learning_rate": 1.1918367346938777e-05, "loss": 1.4618, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 293, "tokens_per_second_per_gpu": 13.23 }, { "epoch": 0.2398042414355628, "grad_norm": 0.39133161306381226, "learning_rate": 1.1959183673469389e-05, "loss": 1.4927, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 294, "tokens_per_second_per_gpu": 11.63 }, { "epoch": 0.2406199021207178, "grad_norm": 0.3667590022087097, "learning_rate": 1.2e-05, "loss": 1.4593, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 295, "tokens_per_second_per_gpu": 8.87 }, { "epoch": 0.24143556280587275, "grad_norm": 0.36433354020118713, "learning_rate": 1.2040816326530612e-05, "loss": 1.4524, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 296, "tokens_per_second_per_gpu": 8.13 }, { "epoch": 0.24225122349102773, "grad_norm": 0.3044168949127197, "learning_rate": 1.2081632653061225e-05, "loss": 1.4536, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 297, "tokens_per_second_per_gpu": 10.86 }, { "epoch": 0.24306688417618272, "grad_norm": 0.44419875741004944, "learning_rate": 1.2122448979591837e-05, "loss": 1.4515, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 298, "tokens_per_second_per_gpu": 5.46 }, { "epoch": 0.24388254486133767, "grad_norm": 0.34809044003486633, "learning_rate": 1.2163265306122449e-05, "loss": 1.4254, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 299, "tokens_per_second_per_gpu": 8.1 }, { "epoch": 0.24469820554649266, "grad_norm": 0.42444565892219543, "learning_rate": 1.2204081632653062e-05, "loss": 1.3656, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 300, "tokens_per_second_per_gpu": 13.06 }, { "epoch": 0.24551386623164764, "grad_norm": 0.28471747040748596, "learning_rate": 1.2244897959183674e-05, "loss": 1.5121, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 301, "tokens_per_second_per_gpu": 17.71 }, { "epoch": 0.2463295269168026, "grad_norm": 0.30371835827827454, "learning_rate": 1.2285714285714286e-05, "loss": 1.5699, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 302, "tokens_per_second_per_gpu": 12.68 }, { "epoch": 0.24714518760195758, "grad_norm": 0.37631192803382874, "learning_rate": 1.2326530612244898e-05, "loss": 1.5276, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 303, "tokens_per_second_per_gpu": 10.34 }, { "epoch": 0.24796084828711257, "grad_norm": 0.3609955906867981, "learning_rate": 1.2367346938775511e-05, "loss": 1.5016, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 304, "tokens_per_second_per_gpu": 6.89 }, { "epoch": 0.24877650897226752, "grad_norm": 0.4898303151130676, "learning_rate": 1.2408163265306123e-05, "loss": 1.4614, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 305, "tokens_per_second_per_gpu": 10.91 }, { "epoch": 0.2495921696574225, "grad_norm": 0.30270737409591675, "learning_rate": 1.2448979591836735e-05, "loss": 1.38, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 306, "tokens_per_second_per_gpu": 11.35 }, { "epoch": 0.25040783034257746, "grad_norm": 0.3722691535949707, "learning_rate": 1.2489795918367348e-05, "loss": 1.4509, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 307, "tokens_per_second_per_gpu": 14.48 }, { "epoch": 0.25122349102773245, "grad_norm": 0.3500398099422455, "learning_rate": 1.253061224489796e-05, "loss": 1.4114, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 308, "tokens_per_second_per_gpu": 13.57 }, { "epoch": 0.25203915171288743, "grad_norm": 0.5204240679740906, "learning_rate": 1.2571428571428573e-05, "loss": 1.5094, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 309, "tokens_per_second_per_gpu": 4.42 }, { "epoch": 0.2528548123980424, "grad_norm": 0.29405635595321655, "learning_rate": 1.2612244897959185e-05, "loss": 1.4576, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 310, "tokens_per_second_per_gpu": 13.78 }, { "epoch": 0.2536704730831974, "grad_norm": 0.37764155864715576, "learning_rate": 1.2653061224489795e-05, "loss": 1.515, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 311, "tokens_per_second_per_gpu": 7.57 }, { "epoch": 0.2544861337683524, "grad_norm": 0.27151474356651306, "learning_rate": 1.2693877551020409e-05, "loss": 1.4311, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 312, "tokens_per_second_per_gpu": 13.37 }, { "epoch": 0.2553017944535073, "grad_norm": 0.41707414388656616, "learning_rate": 1.273469387755102e-05, "loss": 1.4002, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 313, "tokens_per_second_per_gpu": 11.53 }, { "epoch": 0.2561174551386623, "grad_norm": 0.287445604801178, "learning_rate": 1.2775510204081634e-05, "loss": 1.5581, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 314, "tokens_per_second_per_gpu": 19.58 }, { "epoch": 0.2569331158238173, "grad_norm": 0.450358510017395, "learning_rate": 1.2816326530612247e-05, "loss": 1.5158, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 315, "tokens_per_second_per_gpu": 12.19 }, { "epoch": 0.25774877650897227, "grad_norm": 0.3332822024822235, "learning_rate": 1.2857142857142857e-05, "loss": 1.3793, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 316, "tokens_per_second_per_gpu": 12.92 }, { "epoch": 0.25856443719412725, "grad_norm": 0.36968323588371277, "learning_rate": 1.2897959183673469e-05, "loss": 1.405, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 317, "tokens_per_second_per_gpu": 10.28 }, { "epoch": 0.25938009787928223, "grad_norm": 0.4066469371318817, "learning_rate": 1.2938775510204082e-05, "loss": 1.3726, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 318, "tokens_per_second_per_gpu": 7.53 }, { "epoch": 0.2601957585644372, "grad_norm": 0.3844182789325714, "learning_rate": 1.2979591836734694e-05, "loss": 1.535, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 319, "tokens_per_second_per_gpu": 15.45 }, { "epoch": 0.26101141924959215, "grad_norm": 0.31681913137435913, "learning_rate": 1.3020408163265308e-05, "loss": 1.429, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 320, "tokens_per_second_per_gpu": 11.95 }, { "epoch": 0.26182707993474713, "grad_norm": 0.3920951783657074, "learning_rate": 1.306122448979592e-05, "loss": 1.5179, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 321, "tokens_per_second_per_gpu": 19.64 }, { "epoch": 0.2626427406199021, "grad_norm": 0.4178957939147949, "learning_rate": 1.310204081632653e-05, "loss": 1.4101, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 322, "tokens_per_second_per_gpu": 12.31 }, { "epoch": 0.2634584013050571, "grad_norm": 0.3969653248786926, "learning_rate": 1.3142857142857143e-05, "loss": 1.4302, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 323, "tokens_per_second_per_gpu": 14.05 }, { "epoch": 0.2642740619902121, "grad_norm": 0.5443893074989319, "learning_rate": 1.3183673469387756e-05, "loss": 1.4572, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 324, "tokens_per_second_per_gpu": 9.52 }, { "epoch": 0.26508972267536707, "grad_norm": 0.43145936727523804, "learning_rate": 1.3224489795918368e-05, "loss": 1.3764, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 325, "tokens_per_second_per_gpu": 7.48 }, { "epoch": 0.265905383360522, "grad_norm": 0.33886289596557617, "learning_rate": 1.3265306122448982e-05, "loss": 1.5285, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 326, "tokens_per_second_per_gpu": 10.83 }, { "epoch": 0.266721044045677, "grad_norm": 0.18749524652957916, "learning_rate": 1.3306122448979593e-05, "loss": 1.4672, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 327, "tokens_per_second_per_gpu": 8.57 }, { "epoch": 0.26753670473083196, "grad_norm": 0.3995645046234131, "learning_rate": 1.3346938775510203e-05, "loss": 1.4616, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 328, "tokens_per_second_per_gpu": 17.02 }, { "epoch": 0.26835236541598695, "grad_norm": 0.43051937222480774, "learning_rate": 1.3387755102040817e-05, "loss": 1.5292, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 329, "tokens_per_second_per_gpu": 11.47 }, { "epoch": 0.26916802610114193, "grad_norm": 0.3365328013896942, "learning_rate": 1.3428571428571429e-05, "loss": 1.5201, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 330, "tokens_per_second_per_gpu": 9.7 }, { "epoch": 0.2699836867862969, "grad_norm": 0.3400661051273346, "learning_rate": 1.3469387755102042e-05, "loss": 1.3406, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 331, "tokens_per_second_per_gpu": 18.77 }, { "epoch": 0.2707993474714519, "grad_norm": 0.3255477845668793, "learning_rate": 1.3510204081632655e-05, "loss": 1.4372, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 332, "tokens_per_second_per_gpu": 15.02 }, { "epoch": 0.27161500815660683, "grad_norm": 0.22054067254066467, "learning_rate": 1.3551020408163265e-05, "loss": 1.4145, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 333, "tokens_per_second_per_gpu": 11.95 }, { "epoch": 0.2724306688417618, "grad_norm": 0.47307920455932617, "learning_rate": 1.3591836734693877e-05, "loss": 1.4376, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 334, "tokens_per_second_per_gpu": 14.12 }, { "epoch": 0.2732463295269168, "grad_norm": 0.3545341193675995, "learning_rate": 1.363265306122449e-05, "loss": 1.3474, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 335, "tokens_per_second_per_gpu": 8.56 }, { "epoch": 0.2740619902120718, "grad_norm": 0.3070593476295471, "learning_rate": 1.3673469387755102e-05, "loss": 1.3903, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 336, "tokens_per_second_per_gpu": 12.7 }, { "epoch": 0.27487765089722677, "grad_norm": 0.4865548312664032, "learning_rate": 1.3714285714285716e-05, "loss": 1.4104, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 337, "tokens_per_second_per_gpu": 15.0 }, { "epoch": 0.27569331158238175, "grad_norm": 0.39248645305633545, "learning_rate": 1.3755102040816328e-05, "loss": 1.5106, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 338, "tokens_per_second_per_gpu": 15.25 }, { "epoch": 0.2765089722675367, "grad_norm": 0.34209758043289185, "learning_rate": 1.3795918367346938e-05, "loss": 1.422, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 339, "tokens_per_second_per_gpu": 11.99 }, { "epoch": 0.27732463295269166, "grad_norm": 0.31904342770576477, "learning_rate": 1.3836734693877551e-05, "loss": 1.4492, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 340, "tokens_per_second_per_gpu": 10.43 }, { "epoch": 0.27814029363784665, "grad_norm": 0.3742981255054474, "learning_rate": 1.3877551020408165e-05, "loss": 1.4561, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 341, "tokens_per_second_per_gpu": 10.06 }, { "epoch": 0.27895595432300163, "grad_norm": 0.42905399203300476, "learning_rate": 1.3918367346938776e-05, "loss": 1.4838, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 342, "tokens_per_second_per_gpu": 7.74 }, { "epoch": 0.2797716150081566, "grad_norm": 0.3831437826156616, "learning_rate": 1.395918367346939e-05, "loss": 1.3241, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 343, "tokens_per_second_per_gpu": 13.22 }, { "epoch": 0.2805872756933116, "grad_norm": 0.4319358170032501, "learning_rate": 1.4000000000000001e-05, "loss": 1.5333, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 344, "tokens_per_second_per_gpu": 7.94 }, { "epoch": 0.2814029363784666, "grad_norm": 0.3988523781299591, "learning_rate": 1.4040816326530612e-05, "loss": 1.5097, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 345, "tokens_per_second_per_gpu": 9.57 }, { "epoch": 0.2822185970636215, "grad_norm": 0.39250531792640686, "learning_rate": 1.4081632653061225e-05, "loss": 1.4141, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 346, "tokens_per_second_per_gpu": 10.77 }, { "epoch": 0.2830342577487765, "grad_norm": 0.4384603500366211, "learning_rate": 1.4122448979591837e-05, "loss": 1.4719, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 347, "tokens_per_second_per_gpu": 13.4 }, { "epoch": 0.2838499184339315, "grad_norm": 0.3549862504005432, "learning_rate": 1.416326530612245e-05, "loss": 1.3758, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 348, "tokens_per_second_per_gpu": 11.82 }, { "epoch": 0.28466557911908646, "grad_norm": 0.3049587309360504, "learning_rate": 1.4204081632653064e-05, "loss": 1.4506, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 349, "tokens_per_second_per_gpu": 12.85 }, { "epoch": 0.28548123980424145, "grad_norm": 0.31760889291763306, "learning_rate": 1.4244897959183674e-05, "loss": 1.4763, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 350, "tokens_per_second_per_gpu": 10.05 }, { "epoch": 0.28629690048939643, "grad_norm": 0.35694992542266846, "learning_rate": 1.4285714285714285e-05, "loss": 1.3778, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 351, "tokens_per_second_per_gpu": 13.25 }, { "epoch": 0.28711256117455136, "grad_norm": 0.33934900164604187, "learning_rate": 1.4326530612244899e-05, "loss": 1.3838, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 352, "tokens_per_second_per_gpu": 15.99 }, { "epoch": 0.28792822185970635, "grad_norm": 0.38901546597480774, "learning_rate": 1.436734693877551e-05, "loss": 1.4604, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 353, "tokens_per_second_per_gpu": 10.28 }, { "epoch": 0.28874388254486133, "grad_norm": 0.3728622794151306, "learning_rate": 1.4408163265306124e-05, "loss": 1.4832, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 354, "tokens_per_second_per_gpu": 16.66 }, { "epoch": 0.2895595432300163, "grad_norm": 0.2984008491039276, "learning_rate": 1.4448979591836736e-05, "loss": 1.4186, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 355, "tokens_per_second_per_gpu": 12.14 }, { "epoch": 0.2903752039151713, "grad_norm": 0.2661781311035156, "learning_rate": 1.4489795918367346e-05, "loss": 1.4679, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 356, "tokens_per_second_per_gpu": 16.78 }, { "epoch": 0.2911908646003263, "grad_norm": 0.5129377841949463, "learning_rate": 1.453061224489796e-05, "loss": 1.3227, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 357, "tokens_per_second_per_gpu": 10.48 }, { "epoch": 0.29200652528548127, "grad_norm": 0.368767648935318, "learning_rate": 1.4571428571428573e-05, "loss": 1.3621, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 358, "tokens_per_second_per_gpu": 10.24 }, { "epoch": 0.2928221859706362, "grad_norm": 0.4996505081653595, "learning_rate": 1.4612244897959185e-05, "loss": 1.5783, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 359, "tokens_per_second_per_gpu": 18.27 }, { "epoch": 0.2936378466557912, "grad_norm": 0.42398399114608765, "learning_rate": 1.4653061224489798e-05, "loss": 1.4545, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 360, "tokens_per_second_per_gpu": 16.67 }, { "epoch": 0.29445350734094616, "grad_norm": 0.3602077066898346, "learning_rate": 1.469387755102041e-05, "loss": 1.4175, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 361, "tokens_per_second_per_gpu": 10.36 }, { "epoch": 0.29526916802610115, "grad_norm": 0.3771497905254364, "learning_rate": 1.473469387755102e-05, "loss": 1.4327, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 362, "tokens_per_second_per_gpu": 9.23 }, { "epoch": 0.29608482871125613, "grad_norm": 0.4750893712043762, "learning_rate": 1.4775510204081633e-05, "loss": 1.37, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 363, "tokens_per_second_per_gpu": 5.57 }, { "epoch": 0.2969004893964111, "grad_norm": 0.36283817887306213, "learning_rate": 1.4816326530612245e-05, "loss": 1.4566, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 364, "tokens_per_second_per_gpu": 9.98 }, { "epoch": 0.29771615008156604, "grad_norm": 0.5055189728736877, "learning_rate": 1.4857142857142858e-05, "loss": 1.3645, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 365, "tokens_per_second_per_gpu": 18.85 }, { "epoch": 0.29853181076672103, "grad_norm": 0.591924786567688, "learning_rate": 1.4897959183673472e-05, "loss": 1.4546, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 366, "tokens_per_second_per_gpu": 10.59 }, { "epoch": 0.299347471451876, "grad_norm": 0.40629565715789795, "learning_rate": 1.493877551020408e-05, "loss": 1.4978, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 367, "tokens_per_second_per_gpu": 10.21 }, { "epoch": 0.300163132137031, "grad_norm": 0.4322623312473297, "learning_rate": 1.4979591836734694e-05, "loss": 1.4541, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 368, "tokens_per_second_per_gpu": 12.34 }, { "epoch": 0.300978792822186, "grad_norm": 0.40342336893081665, "learning_rate": 1.5020408163265307e-05, "loss": 1.4369, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 369, "tokens_per_second_per_gpu": 8.4 }, { "epoch": 0.30179445350734097, "grad_norm": 0.4252215325832367, "learning_rate": 1.5061224489795919e-05, "loss": 1.4116, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 370, "tokens_per_second_per_gpu": 13.39 }, { "epoch": 0.30261011419249595, "grad_norm": 0.5428847670555115, "learning_rate": 1.5102040816326532e-05, "loss": 1.4404, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 371, "tokens_per_second_per_gpu": 8.47 }, { "epoch": 0.3034257748776509, "grad_norm": 0.34889474511146545, "learning_rate": 1.5142857142857144e-05, "loss": 1.4411, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 372, "tokens_per_second_per_gpu": 22.66 }, { "epoch": 0.30424143556280586, "grad_norm": 0.4423021674156189, "learning_rate": 1.5183673469387754e-05, "loss": 1.5, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 373, "tokens_per_second_per_gpu": 5.41 }, { "epoch": 0.30505709624796085, "grad_norm": 0.40791764855384827, "learning_rate": 1.5224489795918368e-05, "loss": 1.474, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 374, "tokens_per_second_per_gpu": 6.76 }, { "epoch": 0.30587275693311583, "grad_norm": 0.48413440585136414, "learning_rate": 1.526530612244898e-05, "loss": 1.3137, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 375, "tokens_per_second_per_gpu": 5.7 }, { "epoch": 0.3066884176182708, "grad_norm": 3.1528420448303223, "learning_rate": 1.5306122448979594e-05, "loss": 1.3601, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 376, "tokens_per_second_per_gpu": 12.33 }, { "epoch": 0.3075040783034258, "grad_norm": 0.390323668718338, "learning_rate": 1.5346938775510204e-05, "loss": 1.3406, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 377, "tokens_per_second_per_gpu": 8.53 }, { "epoch": 0.3083197389885807, "grad_norm": 0.25342753529548645, "learning_rate": 1.5387755102040818e-05, "loss": 1.425, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 378, "tokens_per_second_per_gpu": 8.02 }, { "epoch": 0.3091353996737357, "grad_norm": 0.2518925070762634, "learning_rate": 1.5428571428571428e-05, "loss": 1.368, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 379, "tokens_per_second_per_gpu": 11.5 }, { "epoch": 0.3099510603588907, "grad_norm": 0.40985390543937683, "learning_rate": 1.546938775510204e-05, "loss": 1.4627, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 380, "tokens_per_second_per_gpu": 12.79 }, { "epoch": 0.3107667210440457, "grad_norm": 0.43490055203437805, "learning_rate": 1.5510204081632655e-05, "loss": 1.4076, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 381, "tokens_per_second_per_gpu": 6.03 }, { "epoch": 0.31158238172920066, "grad_norm": 0.25065430998802185, "learning_rate": 1.5551020408163265e-05, "loss": 1.444, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 382, "tokens_per_second_per_gpu": 18.77 }, { "epoch": 0.31239804241435565, "grad_norm": 0.410766065120697, "learning_rate": 1.559183673469388e-05, "loss": 1.3623, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 383, "tokens_per_second_per_gpu": 9.4 }, { "epoch": 0.3132137030995106, "grad_norm": 0.4539681375026703, "learning_rate": 1.563265306122449e-05, "loss": 1.509, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 384, "tokens_per_second_per_gpu": 4.97 }, { "epoch": 0.31402936378466556, "grad_norm": 0.5180471539497375, "learning_rate": 1.5673469387755102e-05, "loss": 1.3244, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 385, "tokens_per_second_per_gpu": 7.59 }, { "epoch": 0.31484502446982054, "grad_norm": 0.2832096517086029, "learning_rate": 1.5714285714285715e-05, "loss": 1.4556, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 386, "tokens_per_second_per_gpu": 11.66 }, { "epoch": 0.31566068515497553, "grad_norm": 0.3864551782608032, "learning_rate": 1.575510204081633e-05, "loss": 1.3481, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 387, "tokens_per_second_per_gpu": 17.13 }, { "epoch": 0.3164763458401305, "grad_norm": 3.0974278450012207, "learning_rate": 1.579591836734694e-05, "loss": 1.4279, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 388, "tokens_per_second_per_gpu": 8.26 }, { "epoch": 0.3172920065252855, "grad_norm": 0.4072868824005127, "learning_rate": 1.5836734693877552e-05, "loss": 1.4619, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 389, "tokens_per_second_per_gpu": 15.47 }, { "epoch": 0.3181076672104405, "grad_norm": 0.5372384786605835, "learning_rate": 1.5877551020408162e-05, "loss": 1.3752, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 390, "tokens_per_second_per_gpu": 16.86 }, { "epoch": 0.3189233278955954, "grad_norm": 0.26297539472579956, "learning_rate": 1.5918367346938776e-05, "loss": 1.2797, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 391, "tokens_per_second_per_gpu": 13.24 }, { "epoch": 0.3197389885807504, "grad_norm": 0.30278274416923523, "learning_rate": 1.595918367346939e-05, "loss": 1.3882, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 392, "tokens_per_second_per_gpu": 25.09 }, { "epoch": 0.3205546492659054, "grad_norm": 0.38926735520362854, "learning_rate": 1.6000000000000003e-05, "loss": 1.4181, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 393, "tokens_per_second_per_gpu": 9.59 }, { "epoch": 0.32137030995106036, "grad_norm": 0.44510549306869507, "learning_rate": 1.6040816326530613e-05, "loss": 1.5282, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 394, "tokens_per_second_per_gpu": 5.62 }, { "epoch": 0.32218597063621535, "grad_norm": 0.34546762704849243, "learning_rate": 1.6081632653061226e-05, "loss": 1.4172, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 395, "tokens_per_second_per_gpu": 12.64 }, { "epoch": 0.32300163132137033, "grad_norm": 0.42362064123153687, "learning_rate": 1.6122448979591836e-05, "loss": 1.4053, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 396, "tokens_per_second_per_gpu": 10.06 }, { "epoch": 0.32381729200652526, "grad_norm": 0.4111805260181427, "learning_rate": 1.616326530612245e-05, "loss": 1.4163, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 397, "tokens_per_second_per_gpu": 4.54 }, { "epoch": 0.32463295269168024, "grad_norm": 0.423369437456131, "learning_rate": 1.6204081632653063e-05, "loss": 1.4575, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 398, "tokens_per_second_per_gpu": 14.91 }, { "epoch": 0.3254486133768352, "grad_norm": 0.45139840245246887, "learning_rate": 1.6244897959183673e-05, "loss": 1.3715, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 399, "tokens_per_second_per_gpu": 10.71 }, { "epoch": 0.3262642740619902, "grad_norm": 0.47898101806640625, "learning_rate": 1.6285714285714287e-05, "loss": 1.4994, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 400, "tokens_per_second_per_gpu": 16.56 }, { "epoch": 0.3270799347471452, "grad_norm": 0.3921525478363037, "learning_rate": 1.6326530612244897e-05, "loss": 1.4703, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 401, "tokens_per_second_per_gpu": 8.59 }, { "epoch": 0.3278955954323002, "grad_norm": 0.47009849548339844, "learning_rate": 1.636734693877551e-05, "loss": 1.4442, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 402, "tokens_per_second_per_gpu": 5.04 }, { "epoch": 0.32871125611745516, "grad_norm": 0.41200587153434753, "learning_rate": 1.6408163265306124e-05, "loss": 1.4444, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 403, "tokens_per_second_per_gpu": 7.79 }, { "epoch": 0.3295269168026101, "grad_norm": 0.40177062153816223, "learning_rate": 1.6448979591836737e-05, "loss": 1.4688, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 404, "tokens_per_second_per_gpu": 14.23 }, { "epoch": 0.3303425774877651, "grad_norm": 0.3921443819999695, "learning_rate": 1.6489795918367347e-05, "loss": 1.3963, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 405, "tokens_per_second_per_gpu": 16.6 }, { "epoch": 0.33115823817292006, "grad_norm": 0.38833385705947876, "learning_rate": 1.653061224489796e-05, "loss": 1.3837, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 406, "tokens_per_second_per_gpu": 15.45 }, { "epoch": 0.33197389885807504, "grad_norm": 0.43128108978271484, "learning_rate": 1.657142857142857e-05, "loss": 1.426, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 407, "tokens_per_second_per_gpu": 8.69 }, { "epoch": 0.33278955954323003, "grad_norm": 1.010020136833191, "learning_rate": 1.6612244897959184e-05, "loss": 1.3995, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 408, "tokens_per_second_per_gpu": 5.3 }, { "epoch": 0.333605220228385, "grad_norm": 0.3808646500110626, "learning_rate": 1.6653061224489797e-05, "loss": 1.4029, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 409, "tokens_per_second_per_gpu": 14.42 }, { "epoch": 0.33442088091353994, "grad_norm": 0.4391070008277893, "learning_rate": 1.669387755102041e-05, "loss": 1.3516, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 410, "tokens_per_second_per_gpu": 6.61 }, { "epoch": 0.3352365415986949, "grad_norm": 0.31592270731925964, "learning_rate": 1.673469387755102e-05, "loss": 1.3176, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 411, "tokens_per_second_per_gpu": 5.16 }, { "epoch": 0.3360522022838499, "grad_norm": 0.38476914167404175, "learning_rate": 1.6775510204081634e-05, "loss": 1.3849, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 412, "tokens_per_second_per_gpu": 10.59 }, { "epoch": 0.3368678629690049, "grad_norm": 0.3604581952095032, "learning_rate": 1.6816326530612244e-05, "loss": 1.4034, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 413, "tokens_per_second_per_gpu": 9.68 }, { "epoch": 0.3376835236541599, "grad_norm": 0.42885690927505493, "learning_rate": 1.6857142857142858e-05, "loss": 1.3709, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 414, "tokens_per_second_per_gpu": 11.24 }, { "epoch": 0.33849918433931486, "grad_norm": 0.3725053071975708, "learning_rate": 1.689795918367347e-05, "loss": 1.3642, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 415, "tokens_per_second_per_gpu": 12.24 }, { "epoch": 0.33931484502446985, "grad_norm": 0.5094704031944275, "learning_rate": 1.693877551020408e-05, "loss": 1.4202, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 416, "tokens_per_second_per_gpu": 15.79 }, { "epoch": 0.3401305057096248, "grad_norm": 0.633568286895752, "learning_rate": 1.6979591836734695e-05, "loss": 1.3994, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 417, "tokens_per_second_per_gpu": 12.87 }, { "epoch": 0.34094616639477976, "grad_norm": 0.3238550126552582, "learning_rate": 1.7020408163265305e-05, "loss": 1.413, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 418, "tokens_per_second_per_gpu": 10.81 }, { "epoch": 0.34176182707993474, "grad_norm": 0.4031946063041687, "learning_rate": 1.706122448979592e-05, "loss": 1.421, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 419, "tokens_per_second_per_gpu": 15.56 }, { "epoch": 0.3425774877650897, "grad_norm": 0.39531195163726807, "learning_rate": 1.7102040816326532e-05, "loss": 1.4036, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 420, "tokens_per_second_per_gpu": 9.77 }, { "epoch": 0.3433931484502447, "grad_norm": 0.36542779207229614, "learning_rate": 1.7142857142857145e-05, "loss": 1.2756, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 421, "tokens_per_second_per_gpu": 10.5 }, { "epoch": 0.3442088091353997, "grad_norm": 0.47611281275749207, "learning_rate": 1.7183673469387755e-05, "loss": 1.438, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 422, "tokens_per_second_per_gpu": 8.17 }, { "epoch": 0.3450244698205546, "grad_norm": 0.30392661690711975, "learning_rate": 1.722448979591837e-05, "loss": 1.4398, "memory/device_reserved (GiB)": 75.35, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 423, "tokens_per_second_per_gpu": 12.84 }, { "epoch": 0.3458401305057096, "grad_norm": 0.4050835072994232, "learning_rate": 1.726530612244898e-05, "loss": 1.3499, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 424, "tokens_per_second_per_gpu": 17.2 }, { "epoch": 0.3466557911908646, "grad_norm": 0.5706076622009277, "learning_rate": 1.7306122448979592e-05, "loss": 1.4849, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 425, "tokens_per_second_per_gpu": 8.7 }, { "epoch": 0.3474714518760196, "grad_norm": 0.4582195580005646, "learning_rate": 1.7346938775510206e-05, "loss": 1.3567, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 426, "tokens_per_second_per_gpu": 7.8 }, { "epoch": 0.34828711256117456, "grad_norm": 0.5157740116119385, "learning_rate": 1.738775510204082e-05, "loss": 1.3768, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 427, "tokens_per_second_per_gpu": 6.45 }, { "epoch": 0.34910277324632955, "grad_norm": 0.39053112268447876, "learning_rate": 1.742857142857143e-05, "loss": 1.3614, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 428, "tokens_per_second_per_gpu": 6.98 }, { "epoch": 0.34991843393148453, "grad_norm": 0.49635013937950134, "learning_rate": 1.7469387755102043e-05, "loss": 1.3168, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 429, "tokens_per_second_per_gpu": 7.95 }, { "epoch": 0.35073409461663946, "grad_norm": 0.41582977771759033, "learning_rate": 1.7510204081632653e-05, "loss": 1.4241, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 430, "tokens_per_second_per_gpu": 9.98 }, { "epoch": 0.35154975530179444, "grad_norm": 0.5570454001426697, "learning_rate": 1.7551020408163266e-05, "loss": 1.3707, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 431, "tokens_per_second_per_gpu": 16.55 }, { "epoch": 0.3523654159869494, "grad_norm": 0.3282792866230011, "learning_rate": 1.759183673469388e-05, "loss": 1.3344, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 432, "tokens_per_second_per_gpu": 12.64 }, { "epoch": 0.3531810766721044, "grad_norm": 0.5488823056221008, "learning_rate": 1.763265306122449e-05, "loss": 1.3784, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 433, "tokens_per_second_per_gpu": 14.36 }, { "epoch": 0.3539967373572594, "grad_norm": 0.584414541721344, "learning_rate": 1.7673469387755103e-05, "loss": 1.4003, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 434, "tokens_per_second_per_gpu": 12.74 }, { "epoch": 0.3548123980424144, "grad_norm": 0.41352713108062744, "learning_rate": 1.7714285714285713e-05, "loss": 1.4361, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 435, "tokens_per_second_per_gpu": 8.86 }, { "epoch": 0.3556280587275693, "grad_norm": 0.3178623616695404, "learning_rate": 1.7755102040816327e-05, "loss": 1.4125, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 436, "tokens_per_second_per_gpu": 13.92 }, { "epoch": 0.3564437194127243, "grad_norm": 0.47775039076805115, "learning_rate": 1.779591836734694e-05, "loss": 1.4453, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 437, "tokens_per_second_per_gpu": 17.11 }, { "epoch": 0.3572593800978793, "grad_norm": 0.485061913728714, "learning_rate": 1.7836734693877553e-05, "loss": 1.3194, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 438, "tokens_per_second_per_gpu": 7.08 }, { "epoch": 0.35807504078303426, "grad_norm": 0.4576400816440582, "learning_rate": 1.7877551020408164e-05, "loss": 1.4044, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 439, "tokens_per_second_per_gpu": 10.0 }, { "epoch": 0.35889070146818924, "grad_norm": 0.4648911952972412, "learning_rate": 1.7918367346938777e-05, "loss": 1.304, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 440, "tokens_per_second_per_gpu": 14.88 }, { "epoch": 0.35970636215334423, "grad_norm": 0.3557177186012268, "learning_rate": 1.7959183673469387e-05, "loss": 1.4114, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 441, "tokens_per_second_per_gpu": 9.61 }, { "epoch": 0.3605220228384992, "grad_norm": 0.6361094117164612, "learning_rate": 1.8e-05, "loss": 1.3767, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 442, "tokens_per_second_per_gpu": 8.14 }, { "epoch": 0.36133768352365414, "grad_norm": 0.6456100344657898, "learning_rate": 1.8040816326530614e-05, "loss": 1.4344, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 443, "tokens_per_second_per_gpu": 16.75 }, { "epoch": 0.3621533442088091, "grad_norm": 0.5669094324111938, "learning_rate": 1.8081632653061227e-05, "loss": 1.4618, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 444, "tokens_per_second_per_gpu": 14.49 }, { "epoch": 0.3629690048939641, "grad_norm": 0.6152269244194031, "learning_rate": 1.8122448979591837e-05, "loss": 1.483, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 445, "tokens_per_second_per_gpu": 5.54 }, { "epoch": 0.3637846655791191, "grad_norm": 0.4512878954410553, "learning_rate": 1.816326530612245e-05, "loss": 1.3256, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 446, "tokens_per_second_per_gpu": 15.76 }, { "epoch": 0.3646003262642741, "grad_norm": 0.3525928556919098, "learning_rate": 1.820408163265306e-05, "loss": 1.3754, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 447, "tokens_per_second_per_gpu": 9.6 }, { "epoch": 0.36541598694942906, "grad_norm": 0.482322633266449, "learning_rate": 1.8244897959183674e-05, "loss": 1.4046, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 448, "tokens_per_second_per_gpu": 15.39 }, { "epoch": 0.366231647634584, "grad_norm": 0.46756869554519653, "learning_rate": 1.8285714285714288e-05, "loss": 1.4544, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 449, "tokens_per_second_per_gpu": 5.82 }, { "epoch": 0.367047308319739, "grad_norm": 0.4769602417945862, "learning_rate": 1.8326530612244898e-05, "loss": 1.4506, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 450, "tokens_per_second_per_gpu": 8.48 }, { "epoch": 0.36786296900489396, "grad_norm": 0.5040116906166077, "learning_rate": 1.836734693877551e-05, "loss": 1.4584, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 451, "tokens_per_second_per_gpu": 19.55 }, { "epoch": 0.36867862969004894, "grad_norm": 0.5957096815109253, "learning_rate": 1.840816326530612e-05, "loss": 1.4167, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 452, "tokens_per_second_per_gpu": 7.3 }, { "epoch": 0.3694942903752039, "grad_norm": 0.4840245842933655, "learning_rate": 1.8448979591836735e-05, "loss": 1.2131, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 453, "tokens_per_second_per_gpu": 5.55 }, { "epoch": 0.3703099510603589, "grad_norm": 0.43628785014152527, "learning_rate": 1.8489795918367348e-05, "loss": 1.4571, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 454, "tokens_per_second_per_gpu": 12.78 }, { "epoch": 0.37112561174551384, "grad_norm": 0.34155741333961487, "learning_rate": 1.853061224489796e-05, "loss": 1.3785, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 455, "tokens_per_second_per_gpu": 8.63 }, { "epoch": 0.3719412724306688, "grad_norm": 0.4318888485431671, "learning_rate": 1.8571428571428572e-05, "loss": 1.4605, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 456, "tokens_per_second_per_gpu": 14.4 }, { "epoch": 0.3727569331158238, "grad_norm": 0.5510685443878174, "learning_rate": 1.8612244897959185e-05, "loss": 1.362, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 457, "tokens_per_second_per_gpu": 14.62 }, { "epoch": 0.3735725938009788, "grad_norm": 0.48757562041282654, "learning_rate": 1.8653061224489795e-05, "loss": 1.3122, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 458, "tokens_per_second_per_gpu": 7.96 }, { "epoch": 0.3743882544861338, "grad_norm": 0.5833005905151367, "learning_rate": 1.869387755102041e-05, "loss": 1.366, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 459, "tokens_per_second_per_gpu": 13.03 }, { "epoch": 0.37520391517128876, "grad_norm": 0.646443784236908, "learning_rate": 1.8734693877551022e-05, "loss": 1.257, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 460, "tokens_per_second_per_gpu": 5.7 }, { "epoch": 0.37601957585644374, "grad_norm": 0.49216002225875854, "learning_rate": 1.8775510204081632e-05, "loss": 1.3221, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 461, "tokens_per_second_per_gpu": 8.49 }, { "epoch": 0.3768352365415987, "grad_norm": 0.5369203686714172, "learning_rate": 1.8816326530612246e-05, "loss": 1.3656, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 462, "tokens_per_second_per_gpu": 14.68 }, { "epoch": 0.37765089722675366, "grad_norm": 0.4284152388572693, "learning_rate": 1.885714285714286e-05, "loss": 1.3883, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 463, "tokens_per_second_per_gpu": 10.22 }, { "epoch": 0.37846655791190864, "grad_norm": 0.3624604642391205, "learning_rate": 1.889795918367347e-05, "loss": 1.366, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 464, "tokens_per_second_per_gpu": 11.12 }, { "epoch": 0.3792822185970636, "grad_norm": 0.42302119731903076, "learning_rate": 1.8938775510204083e-05, "loss": 1.4565, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 465, "tokens_per_second_per_gpu": 7.83 }, { "epoch": 0.3800978792822186, "grad_norm": 0.7876184582710266, "learning_rate": 1.8979591836734696e-05, "loss": 1.3704, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 466, "tokens_per_second_per_gpu": 7.01 }, { "epoch": 0.3809135399673736, "grad_norm": 0.4301254451274872, "learning_rate": 1.9020408163265306e-05, "loss": 1.4174, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 467, "tokens_per_second_per_gpu": 9.25 }, { "epoch": 0.3817292006525285, "grad_norm": 0.3738052546977997, "learning_rate": 1.906122448979592e-05, "loss": 1.3662, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 468, "tokens_per_second_per_gpu": 21.95 }, { "epoch": 0.3825448613376835, "grad_norm": 0.5467523336410522, "learning_rate": 1.910204081632653e-05, "loss": 1.3789, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 469, "tokens_per_second_per_gpu": 14.44 }, { "epoch": 0.3833605220228385, "grad_norm": 0.6453961133956909, "learning_rate": 1.9142857142857143e-05, "loss": 1.4673, "memory/device_reserved (GiB)": 75.37, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 470, "tokens_per_second_per_gpu": 5.56 }, { "epoch": 0.3841761827079935, "grad_norm": 0.47982168197631836, "learning_rate": 1.9183673469387756e-05, "loss": 1.4449, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 471, "tokens_per_second_per_gpu": 7.19 }, { "epoch": 0.38499184339314846, "grad_norm": 0.36117255687713623, "learning_rate": 1.922448979591837e-05, "loss": 1.3653, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 472, "tokens_per_second_per_gpu": 7.69 }, { "epoch": 0.38580750407830344, "grad_norm": 0.48742619156837463, "learning_rate": 1.926530612244898e-05, "loss": 1.4875, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 473, "tokens_per_second_per_gpu": 6.57 }, { "epoch": 0.3866231647634584, "grad_norm": 0.44965726137161255, "learning_rate": 1.9306122448979593e-05, "loss": 1.4034, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 474, "tokens_per_second_per_gpu": 6.16 }, { "epoch": 0.38743882544861336, "grad_norm": 0.735654354095459, "learning_rate": 1.9346938775510203e-05, "loss": 1.2739, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 475, "tokens_per_second_per_gpu": 14.42 }, { "epoch": 0.38825448613376834, "grad_norm": 0.4942214787006378, "learning_rate": 1.9387755102040817e-05, "loss": 1.4266, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 476, "tokens_per_second_per_gpu": 8.04 }, { "epoch": 0.3890701468189233, "grad_norm": 0.4431212544441223, "learning_rate": 1.942857142857143e-05, "loss": 1.3539, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 477, "tokens_per_second_per_gpu": 12.36 }, { "epoch": 0.3898858075040783, "grad_norm": 0.32705721259117126, "learning_rate": 1.946938775510204e-05, "loss": 1.3592, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 478, "tokens_per_second_per_gpu": 9.93 }, { "epoch": 0.3907014681892333, "grad_norm": 0.35277867317199707, "learning_rate": 1.9510204081632654e-05, "loss": 1.4538, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 479, "tokens_per_second_per_gpu": 5.88 }, { "epoch": 0.3915171288743883, "grad_norm": 0.7304933071136475, "learning_rate": 1.9551020408163267e-05, "loss": 1.4068, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 480, "tokens_per_second_per_gpu": 3.58 }, { "epoch": 0.3923327895595432, "grad_norm": 0.5168952941894531, "learning_rate": 1.9591836734693877e-05, "loss": 1.3275, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 481, "tokens_per_second_per_gpu": 6.69 }, { "epoch": 0.3931484502446982, "grad_norm": 0.42828571796417236, "learning_rate": 1.963265306122449e-05, "loss": 1.2983, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 482, "tokens_per_second_per_gpu": 13.57 }, { "epoch": 0.3939641109298532, "grad_norm": 0.3684225082397461, "learning_rate": 1.9673469387755104e-05, "loss": 1.427, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 483, "tokens_per_second_per_gpu": 10.7 }, { "epoch": 0.39477977161500816, "grad_norm": 0.4505085349082947, "learning_rate": 1.9714285714285714e-05, "loss": 1.5221, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 484, "tokens_per_second_per_gpu": 9.97 }, { "epoch": 0.39559543230016314, "grad_norm": 0.4381003975868225, "learning_rate": 1.9755102040816328e-05, "loss": 1.2209, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 485, "tokens_per_second_per_gpu": 12.81 }, { "epoch": 0.3964110929853181, "grad_norm": 0.5136219263076782, "learning_rate": 1.9795918367346938e-05, "loss": 1.3498, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 486, "tokens_per_second_per_gpu": 12.4 }, { "epoch": 0.3972267536704731, "grad_norm": 0.5591856837272644, "learning_rate": 1.983673469387755e-05, "loss": 1.31, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 487, "tokens_per_second_per_gpu": 4.96 }, { "epoch": 0.39804241435562804, "grad_norm": 0.4713617265224457, "learning_rate": 1.9877551020408165e-05, "loss": 1.3304, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 488, "tokens_per_second_per_gpu": 12.62 }, { "epoch": 0.398858075040783, "grad_norm": 0.5552292466163635, "learning_rate": 1.9918367346938778e-05, "loss": 1.3525, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 489, "tokens_per_second_per_gpu": 16.94 }, { "epoch": 0.399673735725938, "grad_norm": 0.6341143846511841, "learning_rate": 1.9959183673469388e-05, "loss": 1.3621, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 490, "tokens_per_second_per_gpu": 13.49 }, { "epoch": 0.400489396411093, "grad_norm": 0.5194542407989502, "learning_rate": 2e-05, "loss": 1.3731, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 491, "tokens_per_second_per_gpu": 10.84 }, { "epoch": 0.401305057096248, "grad_norm": 0.5235454440116882, "learning_rate": 2.004081632653061e-05, "loss": 1.2566, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 492, "tokens_per_second_per_gpu": 4.03 }, { "epoch": 0.40212071778140296, "grad_norm": 0.372213751077652, "learning_rate": 2.0081632653061225e-05, "loss": 1.4597, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 493, "tokens_per_second_per_gpu": 12.81 }, { "epoch": 0.4029363784665579, "grad_norm": 0.3822343945503235, "learning_rate": 2.012244897959184e-05, "loss": 1.3694, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 494, "tokens_per_second_per_gpu": 9.12 }, { "epoch": 0.40375203915171287, "grad_norm": 0.6715487241744995, "learning_rate": 2.016326530612245e-05, "loss": 1.3434, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 495, "tokens_per_second_per_gpu": 22.31 }, { "epoch": 0.40456769983686786, "grad_norm": 0.40767258405685425, "learning_rate": 2.0204081632653062e-05, "loss": 1.2818, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 496, "tokens_per_second_per_gpu": 15.37 }, { "epoch": 0.40538336052202284, "grad_norm": 0.5102381110191345, "learning_rate": 2.0244897959183676e-05, "loss": 1.3541, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 497, "tokens_per_second_per_gpu": 14.41 }, { "epoch": 0.4061990212071778, "grad_norm": 0.4838424623012543, "learning_rate": 2.0285714285714286e-05, "loss": 1.3846, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 498, "tokens_per_second_per_gpu": 8.39 }, { "epoch": 0.4070146818923328, "grad_norm": 0.39775845408439636, "learning_rate": 2.03265306122449e-05, "loss": 1.4496, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 499, "tokens_per_second_per_gpu": 13.34 }, { "epoch": 0.4078303425774878, "grad_norm": 0.5263304114341736, "learning_rate": 2.0367346938775512e-05, "loss": 1.3647, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 500, "tokens_per_second_per_gpu": 5.05 }, { "epoch": 0.4086460032626427, "grad_norm": 0.4161772131919861, "learning_rate": 2.0408163265306123e-05, "loss": 1.339, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 501, "tokens_per_second_per_gpu": 11.56 }, { "epoch": 0.4094616639477977, "grad_norm": 0.3743976056575775, "learning_rate": 2.0448979591836736e-05, "loss": 1.3384, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.32, "memory/max_allocated (GiB)": 72.32, "step": 502, "tokens_per_second_per_gpu": 13.73 }, { "epoch": 0.4102773246329527, "grad_norm": 0.49267423152923584, "learning_rate": 2.0489795918367346e-05, "loss": 1.342, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 503, "tokens_per_second_per_gpu": 9.01 }, { "epoch": 0.4110929853181077, "grad_norm": 0.5394039750099182, "learning_rate": 2.053061224489796e-05, "loss": 1.3533, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 504, "tokens_per_second_per_gpu": 15.73 }, { "epoch": 0.41190864600326266, "grad_norm": 1.2523163557052612, "learning_rate": 2.0571428571428573e-05, "loss": 1.4624, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 505, "tokens_per_second_per_gpu": 14.61 }, { "epoch": 0.41272430668841764, "grad_norm": 0.42106398940086365, "learning_rate": 2.0612244897959186e-05, "loss": 1.2461, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 506, "tokens_per_second_per_gpu": 13.02 }, { "epoch": 0.41353996737357257, "grad_norm": 0.4883043169975281, "learning_rate": 2.0653061224489796e-05, "loss": 1.2723, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 507, "tokens_per_second_per_gpu": 12.42 }, { "epoch": 0.41435562805872755, "grad_norm": 0.5015066266059875, "learning_rate": 2.069387755102041e-05, "loss": 1.4543, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 508, "tokens_per_second_per_gpu": 7.73 }, { "epoch": 0.41517128874388254, "grad_norm": 0.387544721364975, "learning_rate": 2.073469387755102e-05, "loss": 1.3521, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 509, "tokens_per_second_per_gpu": 20.34 }, { "epoch": 0.4159869494290375, "grad_norm": 0.603112518787384, "learning_rate": 2.0775510204081633e-05, "loss": 1.3422, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 510, "tokens_per_second_per_gpu": 9.85 }, { "epoch": 0.4168026101141925, "grad_norm": 0.42428529262542725, "learning_rate": 2.0816326530612247e-05, "loss": 1.3167, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 511, "tokens_per_second_per_gpu": 14.89 }, { "epoch": 0.4176182707993475, "grad_norm": 0.5130533576011658, "learning_rate": 2.0857142857142857e-05, "loss": 1.3921, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 512, "tokens_per_second_per_gpu": 7.85 }, { "epoch": 0.4184339314845024, "grad_norm": 0.4605494439601898, "learning_rate": 2.089795918367347e-05, "loss": 1.2873, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 513, "tokens_per_second_per_gpu": 7.23 }, { "epoch": 0.4192495921696574, "grad_norm": 0.6813512444496155, "learning_rate": 2.0938775510204084e-05, "loss": 1.3664, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 514, "tokens_per_second_per_gpu": 16.76 }, { "epoch": 0.4200652528548124, "grad_norm": 0.56657475233078, "learning_rate": 2.0979591836734694e-05, "loss": 1.3818, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 515, "tokens_per_second_per_gpu": 9.04 }, { "epoch": 0.42088091353996737, "grad_norm": 0.4459725618362427, "learning_rate": 2.1020408163265307e-05, "loss": 1.501, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 516, "tokens_per_second_per_gpu": 12.51 }, { "epoch": 0.42169657422512236, "grad_norm": 0.472546249628067, "learning_rate": 2.106122448979592e-05, "loss": 1.3506, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 517, "tokens_per_second_per_gpu": 14.24 }, { "epoch": 0.42251223491027734, "grad_norm": 0.3695371747016907, "learning_rate": 2.110204081632653e-05, "loss": 1.3416, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 518, "tokens_per_second_per_gpu": 7.92 }, { "epoch": 0.4233278955954323, "grad_norm": 0.4768540561199188, "learning_rate": 2.1142857142857144e-05, "loss": 1.2978, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 519, "tokens_per_second_per_gpu": 10.36 }, { "epoch": 0.42414355628058725, "grad_norm": 0.6186034679412842, "learning_rate": 2.1183673469387754e-05, "loss": 1.4216, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 520, "tokens_per_second_per_gpu": 3.81 }, { "epoch": 0.42495921696574224, "grad_norm": 0.5437656044960022, "learning_rate": 2.1224489795918368e-05, "loss": 1.3307, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 521, "tokens_per_second_per_gpu": 8.82 }, { "epoch": 0.4257748776508972, "grad_norm": 0.4503440260887146, "learning_rate": 2.126530612244898e-05, "loss": 1.3457, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 522, "tokens_per_second_per_gpu": 10.3 }, { "epoch": 0.4265905383360522, "grad_norm": 0.5693266987800598, "learning_rate": 2.1306122448979595e-05, "loss": 1.4051, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 523, "tokens_per_second_per_gpu": 4.51 }, { "epoch": 0.4274061990212072, "grad_norm": 0.3893054723739624, "learning_rate": 2.1346938775510205e-05, "loss": 1.3242, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 524, "tokens_per_second_per_gpu": 15.14 }, { "epoch": 0.4282218597063622, "grad_norm": 0.4304143786430359, "learning_rate": 2.1387755102040818e-05, "loss": 1.3767, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 525, "tokens_per_second_per_gpu": 5.91 }, { "epoch": 0.4290375203915171, "grad_norm": 0.6662694215774536, "learning_rate": 2.1428571428571428e-05, "loss": 1.222, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 526, "tokens_per_second_per_gpu": 9.19 }, { "epoch": 0.4298531810766721, "grad_norm": 0.4507121741771698, "learning_rate": 2.146938775510204e-05, "loss": 1.3767, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 527, "tokens_per_second_per_gpu": 11.42 }, { "epoch": 0.43066884176182707, "grad_norm": 0.5725674629211426, "learning_rate": 2.1510204081632655e-05, "loss": 1.4143, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 528, "tokens_per_second_per_gpu": 11.15 }, { "epoch": 0.43148450244698205, "grad_norm": 0.5755972266197205, "learning_rate": 2.1551020408163265e-05, "loss": 1.4456, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 529, "tokens_per_second_per_gpu": 10.17 }, { "epoch": 0.43230016313213704, "grad_norm": 0.5427212715148926, "learning_rate": 2.159183673469388e-05, "loss": 1.416, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 530, "tokens_per_second_per_gpu": 12.92 }, { "epoch": 0.433115823817292, "grad_norm": 0.5035621523857117, "learning_rate": 2.1632653061224492e-05, "loss": 1.3277, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 531, "tokens_per_second_per_gpu": 4.59 }, { "epoch": 0.433931484502447, "grad_norm": 0.5715669393539429, "learning_rate": 2.1673469387755102e-05, "loss": 1.358, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 532, "tokens_per_second_per_gpu": 9.4 }, { "epoch": 0.43474714518760194, "grad_norm": 0.400736540555954, "learning_rate": 2.1714285714285715e-05, "loss": 1.4589, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 533, "tokens_per_second_per_gpu": 9.27 }, { "epoch": 0.4355628058727569, "grad_norm": 0.9243930578231812, "learning_rate": 2.175510204081633e-05, "loss": 1.3344, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 534, "tokens_per_second_per_gpu": 13.16 }, { "epoch": 0.4363784665579119, "grad_norm": 0.6148401498794556, "learning_rate": 2.179591836734694e-05, "loss": 1.3739, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 535, "tokens_per_second_per_gpu": 5.59 }, { "epoch": 0.4371941272430669, "grad_norm": 0.4880688786506653, "learning_rate": 2.1836734693877552e-05, "loss": 1.3204, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 536, "tokens_per_second_per_gpu": 8.7 }, { "epoch": 0.43800978792822187, "grad_norm": 0.4517725110054016, "learning_rate": 2.1877551020408162e-05, "loss": 1.3424, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 537, "tokens_per_second_per_gpu": 4.62 }, { "epoch": 0.43882544861337686, "grad_norm": 0.6549472808837891, "learning_rate": 2.1918367346938776e-05, "loss": 1.4969, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 538, "tokens_per_second_per_gpu": 10.15 }, { "epoch": 0.4396411092985318, "grad_norm": 0.5670276880264282, "learning_rate": 2.195918367346939e-05, "loss": 1.3936, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 539, "tokens_per_second_per_gpu": 6.08 }, { "epoch": 0.44045676998368677, "grad_norm": 0.44426849484443665, "learning_rate": 2.2000000000000003e-05, "loss": 1.3925, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 540, "tokens_per_second_per_gpu": 10.12 }, { "epoch": 0.44127243066884175, "grad_norm": 0.408425509929657, "learning_rate": 2.2040816326530613e-05, "loss": 1.4039, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 541, "tokens_per_second_per_gpu": 6.44 }, { "epoch": 0.44208809135399674, "grad_norm": 0.48913857340812683, "learning_rate": 2.2081632653061226e-05, "loss": 1.2493, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 542, "tokens_per_second_per_gpu": 8.4 }, { "epoch": 0.4429037520391517, "grad_norm": 0.4516531229019165, "learning_rate": 2.2122448979591836e-05, "loss": 1.3823, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 543, "tokens_per_second_per_gpu": 12.22 }, { "epoch": 0.4437194127243067, "grad_norm": 0.5109584331512451, "learning_rate": 2.216326530612245e-05, "loss": 1.3533, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 544, "tokens_per_second_per_gpu": 16.2 }, { "epoch": 0.4445350734094617, "grad_norm": 0.4095652401447296, "learning_rate": 2.2204081632653063e-05, "loss": 1.3816, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 545, "tokens_per_second_per_gpu": 10.92 }, { "epoch": 0.4453507340946166, "grad_norm": 0.5629735589027405, "learning_rate": 2.2244897959183673e-05, "loss": 1.4017, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 546, "tokens_per_second_per_gpu": 6.28 }, { "epoch": 0.4461663947797716, "grad_norm": 0.4274446964263916, "learning_rate": 2.2285714285714287e-05, "loss": 1.3597, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 547, "tokens_per_second_per_gpu": 11.26 }, { "epoch": 0.4469820554649266, "grad_norm": 0.32353395223617554, "learning_rate": 2.23265306122449e-05, "loss": 1.412, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 548, "tokens_per_second_per_gpu": 17.41 }, { "epoch": 0.44779771615008157, "grad_norm": 0.5364708304405212, "learning_rate": 2.236734693877551e-05, "loss": 1.2914, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 549, "tokens_per_second_per_gpu": 10.24 }, { "epoch": 0.44861337683523655, "grad_norm": 0.4795685410499573, "learning_rate": 2.2408163265306124e-05, "loss": 1.2502, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 550, "tokens_per_second_per_gpu": 11.63 }, { "epoch": 0.44942903752039154, "grad_norm": 0.48704734444618225, "learning_rate": 2.2448979591836737e-05, "loss": 1.3795, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 551, "tokens_per_second_per_gpu": 7.56 }, { "epoch": 0.45024469820554647, "grad_norm": 0.6294910311698914, "learning_rate": 2.2489795918367347e-05, "loss": 1.3589, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 552, "tokens_per_second_per_gpu": 7.24 }, { "epoch": 0.45106035889070145, "grad_norm": 0.6321533918380737, "learning_rate": 2.253061224489796e-05, "loss": 1.2998, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 553, "tokens_per_second_per_gpu": 8.08 }, { "epoch": 0.45187601957585644, "grad_norm": 0.6531383395195007, "learning_rate": 2.257142857142857e-05, "loss": 1.3063, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 554, "tokens_per_second_per_gpu": 14.88 }, { "epoch": 0.4526916802610114, "grad_norm": 0.6993620991706848, "learning_rate": 2.2612244897959184e-05, "loss": 1.3413, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 555, "tokens_per_second_per_gpu": 9.25 }, { "epoch": 0.4535073409461664, "grad_norm": 0.4883325397968292, "learning_rate": 2.2653061224489798e-05, "loss": 1.2676, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 556, "tokens_per_second_per_gpu": 13.79 }, { "epoch": 0.4543230016313214, "grad_norm": 0.4338706433773041, "learning_rate": 2.269387755102041e-05, "loss": 1.4292, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 557, "tokens_per_second_per_gpu": 13.42 }, { "epoch": 0.4551386623164764, "grad_norm": 0.8919666409492493, "learning_rate": 2.273469387755102e-05, "loss": 1.3115, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 558, "tokens_per_second_per_gpu": 4.19 }, { "epoch": 0.4559543230016313, "grad_norm": 0.45362377166748047, "learning_rate": 2.2775510204081635e-05, "loss": 1.3614, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 559, "tokens_per_second_per_gpu": 15.39 }, { "epoch": 0.4567699836867863, "grad_norm": 0.5201386213302612, "learning_rate": 2.2816326530612245e-05, "loss": 1.3733, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 560, "tokens_per_second_per_gpu": 8.66 }, { "epoch": 0.45758564437194127, "grad_norm": 0.2890126407146454, "learning_rate": 2.2857142857142858e-05, "loss": 1.3702, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 561, "tokens_per_second_per_gpu": 9.32 }, { "epoch": 0.45840130505709625, "grad_norm": 0.5578068494796753, "learning_rate": 2.289795918367347e-05, "loss": 1.4149, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 562, "tokens_per_second_per_gpu": 14.05 }, { "epoch": 0.45921696574225124, "grad_norm": 0.46415966749191284, "learning_rate": 2.293877551020408e-05, "loss": 1.3887, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 563, "tokens_per_second_per_gpu": 4.6 }, { "epoch": 0.4600326264274062, "grad_norm": 0.48381176590919495, "learning_rate": 2.2979591836734695e-05, "loss": 1.3007, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 564, "tokens_per_second_per_gpu": 10.4 }, { "epoch": 0.46084828711256115, "grad_norm": 0.49280208349227905, "learning_rate": 2.302040816326531e-05, "loss": 1.3578, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 565, "tokens_per_second_per_gpu": 12.78 }, { "epoch": 0.46166394779771613, "grad_norm": 0.6228516101837158, "learning_rate": 2.306122448979592e-05, "loss": 1.4715, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 566, "tokens_per_second_per_gpu": 8.75 }, { "epoch": 0.4624796084828711, "grad_norm": 0.4977130591869354, "learning_rate": 2.3102040816326532e-05, "loss": 1.4082, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 567, "tokens_per_second_per_gpu": 8.76 }, { "epoch": 0.4632952691680261, "grad_norm": 0.3920220136642456, "learning_rate": 2.3142857142857145e-05, "loss": 1.4087, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 568, "tokens_per_second_per_gpu": 12.14 }, { "epoch": 0.4641109298531811, "grad_norm": 0.7380698323249817, "learning_rate": 2.3183673469387755e-05, "loss": 1.3012, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 569, "tokens_per_second_per_gpu": 1.61 }, { "epoch": 0.46492659053833607, "grad_norm": 0.49136924743652344, "learning_rate": 2.322448979591837e-05, "loss": 1.4936, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 570, "tokens_per_second_per_gpu": 6.12 }, { "epoch": 0.46574225122349105, "grad_norm": 0.41024839878082275, "learning_rate": 2.326530612244898e-05, "loss": 1.3511, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 571, "tokens_per_second_per_gpu": 7.01 }, { "epoch": 0.466557911908646, "grad_norm": 0.5516024827957153, "learning_rate": 2.3306122448979592e-05, "loss": 1.2973, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 572, "tokens_per_second_per_gpu": 9.39 }, { "epoch": 0.46737357259380097, "grad_norm": 0.6393985748291016, "learning_rate": 2.3346938775510206e-05, "loss": 1.3973, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 573, "tokens_per_second_per_gpu": 10.97 }, { "epoch": 0.46818923327895595, "grad_norm": 0.5276475548744202, "learning_rate": 2.3387755102040816e-05, "loss": 1.3918, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 574, "tokens_per_second_per_gpu": 7.9 }, { "epoch": 0.46900489396411094, "grad_norm": 0.5948678255081177, "learning_rate": 2.342857142857143e-05, "loss": 1.3677, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 575, "tokens_per_second_per_gpu": 6.43 }, { "epoch": 0.4698205546492659, "grad_norm": 0.41483303904533386, "learning_rate": 2.3469387755102043e-05, "loss": 1.3805, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 576, "tokens_per_second_per_gpu": 13.56 }, { "epoch": 0.4706362153344209, "grad_norm": 0.6245461702346802, "learning_rate": 2.3510204081632653e-05, "loss": 1.3318, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 577, "tokens_per_second_per_gpu": 8.71 }, { "epoch": 0.47145187601957583, "grad_norm": 0.4153541624546051, "learning_rate": 2.3551020408163266e-05, "loss": 1.2117, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 578, "tokens_per_second_per_gpu": 4.39 }, { "epoch": 0.4722675367047308, "grad_norm": 0.49249541759490967, "learning_rate": 2.359183673469388e-05, "loss": 1.3609, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 579, "tokens_per_second_per_gpu": 18.76 }, { "epoch": 0.4730831973898858, "grad_norm": 0.6066439151763916, "learning_rate": 2.363265306122449e-05, "loss": 1.3877, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 580, "tokens_per_second_per_gpu": 5.52 }, { "epoch": 0.4738988580750408, "grad_norm": 0.4947279393672943, "learning_rate": 2.3673469387755103e-05, "loss": 1.3268, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 581, "tokens_per_second_per_gpu": 8.47 }, { "epoch": 0.47471451876019577, "grad_norm": 0.8465497493743896, "learning_rate": 2.3714285714285717e-05, "loss": 1.4061, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 582, "tokens_per_second_per_gpu": 7.46 }, { "epoch": 0.47553017944535075, "grad_norm": 0.5482879877090454, "learning_rate": 2.3755102040816327e-05, "loss": 1.3885, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 583, "tokens_per_second_per_gpu": 7.84 }, { "epoch": 0.4763458401305057, "grad_norm": 0.4621152877807617, "learning_rate": 2.379591836734694e-05, "loss": 1.319, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 584, "tokens_per_second_per_gpu": 13.42 }, { "epoch": 0.47716150081566067, "grad_norm": 0.665191113948822, "learning_rate": 2.3836734693877554e-05, "loss": 1.4007, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 585, "tokens_per_second_per_gpu": 2.13 }, { "epoch": 0.47797716150081565, "grad_norm": 0.47784629464149475, "learning_rate": 2.3877551020408164e-05, "loss": 1.3456, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 586, "tokens_per_second_per_gpu": 9.9 }, { "epoch": 0.47879282218597063, "grad_norm": 0.682706892490387, "learning_rate": 2.3918367346938777e-05, "loss": 1.4336, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 587, "tokens_per_second_per_gpu": 3.7 }, { "epoch": 0.4796084828711256, "grad_norm": 0.47959089279174805, "learning_rate": 2.3959183673469387e-05, "loss": 1.362, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 588, "tokens_per_second_per_gpu": 9.11 }, { "epoch": 0.4804241435562806, "grad_norm": 0.366928368806839, "learning_rate": 2.4e-05, "loss": 1.3816, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 589, "tokens_per_second_per_gpu": 13.05 }, { "epoch": 0.4812398042414356, "grad_norm": 0.4617217183113098, "learning_rate": 2.4040816326530614e-05, "loss": 1.3985, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 590, "tokens_per_second_per_gpu": 14.53 }, { "epoch": 0.4820554649265905, "grad_norm": 0.5298788547515869, "learning_rate": 2.4081632653061224e-05, "loss": 1.2622, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 591, "tokens_per_second_per_gpu": 7.13 }, { "epoch": 0.4828711256117455, "grad_norm": 0.39253613352775574, "learning_rate": 2.4122448979591838e-05, "loss": 1.3807, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 592, "tokens_per_second_per_gpu": 7.97 }, { "epoch": 0.4836867862969005, "grad_norm": 0.4826742112636566, "learning_rate": 2.416326530612245e-05, "loss": 1.3927, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 593, "tokens_per_second_per_gpu": 7.26 }, { "epoch": 0.48450244698205547, "grad_norm": 0.6061569452285767, "learning_rate": 2.420408163265306e-05, "loss": 1.4159, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 594, "tokens_per_second_per_gpu": 11.63 }, { "epoch": 0.48531810766721045, "grad_norm": 0.4523598253726959, "learning_rate": 2.4244897959183674e-05, "loss": 1.3444, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 595, "tokens_per_second_per_gpu": 8.51 }, { "epoch": 0.48613376835236544, "grad_norm": 0.4383697807788849, "learning_rate": 2.4285714285714288e-05, "loss": 1.4087, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 596, "tokens_per_second_per_gpu": 9.39 }, { "epoch": 0.48694942903752036, "grad_norm": 0.41315048933029175, "learning_rate": 2.4326530612244898e-05, "loss": 1.3097, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 597, "tokens_per_second_per_gpu": 10.14 }, { "epoch": 0.48776508972267535, "grad_norm": 0.5637692809104919, "learning_rate": 2.436734693877551e-05, "loss": 1.3693, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 598, "tokens_per_second_per_gpu": 6.06 }, { "epoch": 0.48858075040783033, "grad_norm": 0.6410973072052002, "learning_rate": 2.4408163265306125e-05, "loss": 1.3601, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 599, "tokens_per_second_per_gpu": 8.61 }, { "epoch": 0.4893964110929853, "grad_norm": 0.5013763308525085, "learning_rate": 2.4448979591836735e-05, "loss": 1.3152, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 600, "tokens_per_second_per_gpu": 9.07 }, { "epoch": 0.4902120717781403, "grad_norm": 0.5204721093177795, "learning_rate": 2.448979591836735e-05, "loss": 1.4084, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 601, "tokens_per_second_per_gpu": 9.88 }, { "epoch": 0.4910277324632953, "grad_norm": 0.6549529433250427, "learning_rate": 2.4530612244897962e-05, "loss": 1.3731, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 602, "tokens_per_second_per_gpu": 12.16 }, { "epoch": 0.49184339314845027, "grad_norm": 0.46892115473747253, "learning_rate": 2.4571428571428572e-05, "loss": 1.2639, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 603, "tokens_per_second_per_gpu": 11.15 }, { "epoch": 0.4926590538336052, "grad_norm": 0.5621423721313477, "learning_rate": 2.4612244897959185e-05, "loss": 1.2783, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 604, "tokens_per_second_per_gpu": 7.22 }, { "epoch": 0.4934747145187602, "grad_norm": 4.734508514404297, "learning_rate": 2.4653061224489795e-05, "loss": 1.3519, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 605, "tokens_per_second_per_gpu": 11.09 }, { "epoch": 0.49429037520391517, "grad_norm": 0.3123314678668976, "learning_rate": 2.469387755102041e-05, "loss": 1.4001, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 606, "tokens_per_second_per_gpu": 7.26 }, { "epoch": 0.49510603588907015, "grad_norm": 0.6794952750205994, "learning_rate": 2.4734693877551022e-05, "loss": 1.2744, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 607, "tokens_per_second_per_gpu": 17.25 }, { "epoch": 0.49592169657422513, "grad_norm": 0.563356339931488, "learning_rate": 2.4775510204081632e-05, "loss": 1.3805, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 608, "tokens_per_second_per_gpu": 11.36 }, { "epoch": 0.4967373572593801, "grad_norm": 0.5056926012039185, "learning_rate": 2.4816326530612246e-05, "loss": 1.3509, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 609, "tokens_per_second_per_gpu": 9.7 }, { "epoch": 0.49755301794453505, "grad_norm": 0.5787825584411621, "learning_rate": 2.485714285714286e-05, "loss": 1.4436, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 610, "tokens_per_second_per_gpu": 15.98 }, { "epoch": 0.49836867862969003, "grad_norm": 0.47602665424346924, "learning_rate": 2.489795918367347e-05, "loss": 1.355, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 611, "tokens_per_second_per_gpu": 15.24 }, { "epoch": 0.499184339314845, "grad_norm": 0.6177883148193359, "learning_rate": 2.4938775510204083e-05, "loss": 1.3487, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 612, "tokens_per_second_per_gpu": 8.58 }, { "epoch": 0.5, "grad_norm": 0.5888877511024475, "learning_rate": 2.4979591836734696e-05, "loss": 1.3747, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 613, "tokens_per_second_per_gpu": 5.37 }, { "epoch": 0.5008156606851549, "grad_norm": 0.5418539047241211, "learning_rate": 2.5020408163265306e-05, "loss": 1.37, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 614, "tokens_per_second_per_gpu": 17.64 }, { "epoch": 0.50163132137031, "grad_norm": 0.45799821615219116, "learning_rate": 2.506122448979592e-05, "loss": 1.3223, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 615, "tokens_per_second_per_gpu": 14.1 }, { "epoch": 0.5024469820554649, "grad_norm": 0.48642534017562866, "learning_rate": 2.5102040816326533e-05, "loss": 1.3397, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 616, "tokens_per_second_per_gpu": 9.31 }, { "epoch": 0.5032626427406199, "grad_norm": 0.5414573550224304, "learning_rate": 2.5142857142857147e-05, "loss": 1.4105, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 617, "tokens_per_second_per_gpu": 6.08 }, { "epoch": 0.5040783034257749, "grad_norm": 0.3169739842414856, "learning_rate": 2.518367346938776e-05, "loss": 1.4239, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 618, "tokens_per_second_per_gpu": 12.85 }, { "epoch": 0.5048939641109299, "grad_norm": 0.38471996784210205, "learning_rate": 2.522448979591837e-05, "loss": 1.3115, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 619, "tokens_per_second_per_gpu": 13.7 }, { "epoch": 0.5057096247960848, "grad_norm": 0.6026085615158081, "learning_rate": 2.526530612244898e-05, "loss": 1.3108, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 620, "tokens_per_second_per_gpu": 17.15 }, { "epoch": 0.5065252854812398, "grad_norm": 0.4492347836494446, "learning_rate": 2.530612244897959e-05, "loss": 1.302, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 621, "tokens_per_second_per_gpu": 10.49 }, { "epoch": 0.5073409461663948, "grad_norm": 0.37457531690597534, "learning_rate": 2.5346938775510204e-05, "loss": 1.3477, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 622, "tokens_per_second_per_gpu": 5.04 }, { "epoch": 0.5081566068515497, "grad_norm": 0.6660877466201782, "learning_rate": 2.5387755102040817e-05, "loss": 1.3914, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 623, "tokens_per_second_per_gpu": 6.68 }, { "epoch": 0.5089722675367048, "grad_norm": 0.4168795347213745, "learning_rate": 2.542857142857143e-05, "loss": 1.3438, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 624, "tokens_per_second_per_gpu": 9.34 }, { "epoch": 0.5097879282218597, "grad_norm": 0.559728741645813, "learning_rate": 2.546938775510204e-05, "loss": 1.299, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 625, "tokens_per_second_per_gpu": 4.3 }, { "epoch": 0.5106035889070146, "grad_norm": 0.567116916179657, "learning_rate": 2.5510204081632654e-05, "loss": 1.4569, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 626, "tokens_per_second_per_gpu": 7.59 }, { "epoch": 0.5114192495921697, "grad_norm": 0.3792031407356262, "learning_rate": 2.5551020408163267e-05, "loss": 1.3423, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 627, "tokens_per_second_per_gpu": 14.54 }, { "epoch": 0.5122349102773246, "grad_norm": 0.5733457207679749, "learning_rate": 2.559183673469388e-05, "loss": 1.2909, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 628, "tokens_per_second_per_gpu": 19.22 }, { "epoch": 0.5130505709624796, "grad_norm": 0.38350874185562134, "learning_rate": 2.5632653061224494e-05, "loss": 1.2867, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 629, "tokens_per_second_per_gpu": 10.45 }, { "epoch": 0.5138662316476346, "grad_norm": 0.46697503328323364, "learning_rate": 2.5673469387755104e-05, "loss": 1.3232, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 630, "tokens_per_second_per_gpu": 8.12 }, { "epoch": 0.5146818923327896, "grad_norm": 0.3588668704032898, "learning_rate": 2.5714285714285714e-05, "loss": 1.4809, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 631, "tokens_per_second_per_gpu": 9.56 }, { "epoch": 0.5154975530179445, "grad_norm": 0.3083694875240326, "learning_rate": 2.5755102040816325e-05, "loss": 1.3333, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 632, "tokens_per_second_per_gpu": 5.65 }, { "epoch": 0.5163132137030995, "grad_norm": 0.588860809803009, "learning_rate": 2.5795918367346938e-05, "loss": 1.2848, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 633, "tokens_per_second_per_gpu": 4.99 }, { "epoch": 0.5171288743882545, "grad_norm": 0.6858779788017273, "learning_rate": 2.583673469387755e-05, "loss": 1.3099, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 634, "tokens_per_second_per_gpu": 16.02 }, { "epoch": 0.5179445350734094, "grad_norm": 0.5935061573982239, "learning_rate": 2.5877551020408165e-05, "loss": 1.3508, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 635, "tokens_per_second_per_gpu": 3.05 }, { "epoch": 0.5187601957585645, "grad_norm": 0.3670724034309387, "learning_rate": 2.5918367346938778e-05, "loss": 1.3211, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 636, "tokens_per_second_per_gpu": 13.89 }, { "epoch": 0.5195758564437194, "grad_norm": 0.5775123238563538, "learning_rate": 2.595918367346939e-05, "loss": 1.3296, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 637, "tokens_per_second_per_gpu": 7.25 }, { "epoch": 0.5203915171288744, "grad_norm": 0.5732021331787109, "learning_rate": 2.6000000000000002e-05, "loss": 1.364, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 638, "tokens_per_second_per_gpu": 9.44 }, { "epoch": 0.5212071778140294, "grad_norm": 0.5329490900039673, "learning_rate": 2.6040816326530615e-05, "loss": 1.4228, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 639, "tokens_per_second_per_gpu": 6.15 }, { "epoch": 0.5220228384991843, "grad_norm": 0.5557101368904114, "learning_rate": 2.608163265306123e-05, "loss": 1.326, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 640, "tokens_per_second_per_gpu": 4.49 }, { "epoch": 0.5228384991843393, "grad_norm": 0.8022142052650452, "learning_rate": 2.612244897959184e-05, "loss": 1.4339, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 641, "tokens_per_second_per_gpu": 2.64 }, { "epoch": 0.5236541598694943, "grad_norm": 0.42484956979751587, "learning_rate": 2.616326530612245e-05, "loss": 1.3982, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 642, "tokens_per_second_per_gpu": 10.18 }, { "epoch": 0.5244698205546493, "grad_norm": 0.4716418981552124, "learning_rate": 2.620408163265306e-05, "loss": 1.2709, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 643, "tokens_per_second_per_gpu": 3.0 }, { "epoch": 0.5252854812398042, "grad_norm": 0.6492699980735779, "learning_rate": 2.6244897959183672e-05, "loss": 1.386, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 644, "tokens_per_second_per_gpu": 9.79 }, { "epoch": 0.5261011419249593, "grad_norm": 0.41983988881111145, "learning_rate": 2.6285714285714286e-05, "loss": 1.3118, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 645, "tokens_per_second_per_gpu": 11.32 }, { "epoch": 0.5269168026101142, "grad_norm": 0.535580575466156, "learning_rate": 2.63265306122449e-05, "loss": 1.3126, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 646, "tokens_per_second_per_gpu": 8.08 }, { "epoch": 0.5277324632952691, "grad_norm": 0.48669347167015076, "learning_rate": 2.6367346938775513e-05, "loss": 1.3384, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 647, "tokens_per_second_per_gpu": 13.41 }, { "epoch": 0.5285481239804242, "grad_norm": 0.3661770522594452, "learning_rate": 2.6408163265306123e-05, "loss": 1.3615, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 648, "tokens_per_second_per_gpu": 13.91 }, { "epoch": 0.5293637846655791, "grad_norm": 0.4193444550037384, "learning_rate": 2.6448979591836736e-05, "loss": 1.3209, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 649, "tokens_per_second_per_gpu": 8.59 }, { "epoch": 0.5301794453507341, "grad_norm": 0.7263711094856262, "learning_rate": 2.648979591836735e-05, "loss": 1.4529, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 650, "tokens_per_second_per_gpu": 3.01 }, { "epoch": 0.5309951060358891, "grad_norm": 0.4018748700618744, "learning_rate": 2.6530612244897963e-05, "loss": 1.3726, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 651, "tokens_per_second_per_gpu": 10.42 }, { "epoch": 0.531810766721044, "grad_norm": 0.7953455448150635, "learning_rate": 2.6571428571428576e-05, "loss": 1.3472, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 652, "tokens_per_second_per_gpu": 10.15 }, { "epoch": 0.532626427406199, "grad_norm": 0.292983740568161, "learning_rate": 2.6612244897959187e-05, "loss": 1.3211, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 653, "tokens_per_second_per_gpu": 9.4 }, { "epoch": 0.533442088091354, "grad_norm": 0.5852302312850952, "learning_rate": 2.6653061224489793e-05, "loss": 1.4259, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 654, "tokens_per_second_per_gpu": 10.42 }, { "epoch": 0.534257748776509, "grad_norm": 0.6558208465576172, "learning_rate": 2.6693877551020407e-05, "loss": 1.3419, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 655, "tokens_per_second_per_gpu": 8.15 }, { "epoch": 0.5350734094616639, "grad_norm": 0.5328148603439331, "learning_rate": 2.673469387755102e-05, "loss": 1.3667, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 656, "tokens_per_second_per_gpu": 5.41 }, { "epoch": 0.535889070146819, "grad_norm": 0.35301709175109863, "learning_rate": 2.6775510204081634e-05, "loss": 1.4109, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 657, "tokens_per_second_per_gpu": 9.43 }, { "epoch": 0.5367047308319739, "grad_norm": 0.5524665117263794, "learning_rate": 2.6816326530612247e-05, "loss": 1.4104, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 658, "tokens_per_second_per_gpu": 5.03 }, { "epoch": 0.5375203915171288, "grad_norm": 0.5728408694267273, "learning_rate": 2.6857142857142857e-05, "loss": 1.4647, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 659, "tokens_per_second_per_gpu": 5.7 }, { "epoch": 0.5383360522022839, "grad_norm": 0.5058271884918213, "learning_rate": 2.689795918367347e-05, "loss": 1.3412, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 660, "tokens_per_second_per_gpu": 10.31 }, { "epoch": 0.5391517128874388, "grad_norm": 0.5869598388671875, "learning_rate": 2.6938775510204084e-05, "loss": 1.3317, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 661, "tokens_per_second_per_gpu": 7.23 }, { "epoch": 0.5399673735725938, "grad_norm": 0.6473169922828674, "learning_rate": 2.6979591836734697e-05, "loss": 1.4479, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 662, "tokens_per_second_per_gpu": 8.02 }, { "epoch": 0.5407830342577488, "grad_norm": 0.40653514862060547, "learning_rate": 2.702040816326531e-05, "loss": 1.3538, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 663, "tokens_per_second_per_gpu": 5.13 }, { "epoch": 0.5415986949429038, "grad_norm": 0.429622083902359, "learning_rate": 2.706122448979592e-05, "loss": 1.3423, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 664, "tokens_per_second_per_gpu": 11.49 }, { "epoch": 0.5424143556280587, "grad_norm": 0.35518285632133484, "learning_rate": 2.710204081632653e-05, "loss": 1.382, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 665, "tokens_per_second_per_gpu": 10.38 }, { "epoch": 0.5432300163132137, "grad_norm": 0.5070427656173706, "learning_rate": 2.714285714285714e-05, "loss": 1.2756, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 666, "tokens_per_second_per_gpu": 9.92 }, { "epoch": 0.5440456769983687, "grad_norm": 0.48749467730522156, "learning_rate": 2.7183673469387754e-05, "loss": 1.4059, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 667, "tokens_per_second_per_gpu": 6.49 }, { "epoch": 0.5448613376835236, "grad_norm": 0.4480941891670227, "learning_rate": 2.7224489795918368e-05, "loss": 1.2131, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 668, "tokens_per_second_per_gpu": 7.52 }, { "epoch": 0.5456769983686787, "grad_norm": 0.559497594833374, "learning_rate": 2.726530612244898e-05, "loss": 1.3392, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 669, "tokens_per_second_per_gpu": 7.69 }, { "epoch": 0.5464926590538336, "grad_norm": 0.4177839159965515, "learning_rate": 2.730612244897959e-05, "loss": 1.3401, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 670, "tokens_per_second_per_gpu": 9.0 }, { "epoch": 0.5473083197389886, "grad_norm": 0.4199763536453247, "learning_rate": 2.7346938775510205e-05, "loss": 1.3878, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 671, "tokens_per_second_per_gpu": 10.69 }, { "epoch": 0.5481239804241436, "grad_norm": 0.38790106773376465, "learning_rate": 2.7387755102040818e-05, "loss": 1.3422, "memory/device_reserved (GiB)": 75.39, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 672, "tokens_per_second_per_gpu": 13.12 }, { "epoch": 0.5489396411092985, "grad_norm": 0.4396006166934967, "learning_rate": 2.742857142857143e-05, "loss": 1.3553, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 673, "tokens_per_second_per_gpu": 4.36 }, { "epoch": 0.5497553017944535, "grad_norm": 0.5439503192901611, "learning_rate": 2.7469387755102045e-05, "loss": 1.3498, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 674, "tokens_per_second_per_gpu": 15.13 }, { "epoch": 0.5505709624796085, "grad_norm": 0.48078468441963196, "learning_rate": 2.7510204081632655e-05, "loss": 1.34, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 675, "tokens_per_second_per_gpu": 7.09 }, { "epoch": 0.5513866231647635, "grad_norm": 0.626756489276886, "learning_rate": 2.7551020408163265e-05, "loss": 1.2385, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 676, "tokens_per_second_per_gpu": 7.92 }, { "epoch": 0.5522022838499184, "grad_norm": 0.5230774879455566, "learning_rate": 2.7591836734693875e-05, "loss": 1.4256, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 677, "tokens_per_second_per_gpu": 4.21 }, { "epoch": 0.5530179445350734, "grad_norm": 0.41005516052246094, "learning_rate": 2.763265306122449e-05, "loss": 1.3144, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 678, "tokens_per_second_per_gpu": 7.8 }, { "epoch": 0.5538336052202284, "grad_norm": 0.6286873817443848, "learning_rate": 2.7673469387755102e-05, "loss": 1.4293, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 679, "tokens_per_second_per_gpu": 4.23 }, { "epoch": 0.5546492659053833, "grad_norm": 0.38161972165107727, "learning_rate": 2.7714285714285716e-05, "loss": 1.3699, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 680, "tokens_per_second_per_gpu": 5.22 }, { "epoch": 0.5554649265905384, "grad_norm": 0.39938631653785706, "learning_rate": 2.775510204081633e-05, "loss": 1.3279, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 681, "tokens_per_second_per_gpu": 16.92 }, { "epoch": 0.5562805872756933, "grad_norm": 0.46594834327697754, "learning_rate": 2.779591836734694e-05, "loss": 1.305, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 682, "tokens_per_second_per_gpu": 9.26 }, { "epoch": 0.5570962479608483, "grad_norm": 0.38688042759895325, "learning_rate": 2.7836734693877553e-05, "loss": 1.4321, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 683, "tokens_per_second_per_gpu": 9.4 }, { "epoch": 0.5579119086460033, "grad_norm": 0.7292085886001587, "learning_rate": 2.7877551020408166e-05, "loss": 1.2434, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 684, "tokens_per_second_per_gpu": 9.52 }, { "epoch": 0.5587275693311582, "grad_norm": 0.391528844833374, "learning_rate": 2.791836734693878e-05, "loss": 1.2754, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 685, "tokens_per_second_per_gpu": 6.05 }, { "epoch": 0.5595432300163132, "grad_norm": 0.6943433880805969, "learning_rate": 2.7959183673469393e-05, "loss": 1.2968, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 686, "tokens_per_second_per_gpu": 5.3 }, { "epoch": 0.5603588907014682, "grad_norm": 0.6839162111282349, "learning_rate": 2.8000000000000003e-05, "loss": 1.4019, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 687, "tokens_per_second_per_gpu": 4.7 }, { "epoch": 0.5611745513866232, "grad_norm": 0.5362688899040222, "learning_rate": 2.804081632653061e-05, "loss": 1.3099, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 688, "tokens_per_second_per_gpu": 14.08 }, { "epoch": 0.5619902120717781, "grad_norm": 0.26453375816345215, "learning_rate": 2.8081632653061223e-05, "loss": 1.4207, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 689, "tokens_per_second_per_gpu": 13.28 }, { "epoch": 0.5628058727569332, "grad_norm": 0.6618549823760986, "learning_rate": 2.8122448979591837e-05, "loss": 1.4284, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 690, "tokens_per_second_per_gpu": 1.95 }, { "epoch": 0.5636215334420881, "grad_norm": 0.4500647783279419, "learning_rate": 2.816326530612245e-05, "loss": 1.3174, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 691, "tokens_per_second_per_gpu": 13.48 }, { "epoch": 0.564437194127243, "grad_norm": 0.3870519995689392, "learning_rate": 2.8204081632653063e-05, "loss": 1.2399, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 692, "tokens_per_second_per_gpu": 8.51 }, { "epoch": 0.5652528548123981, "grad_norm": 0.7589889764785767, "learning_rate": 2.8244897959183673e-05, "loss": 1.2889, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 693, "tokens_per_second_per_gpu": 5.98 }, { "epoch": 0.566068515497553, "grad_norm": 0.46478667855262756, "learning_rate": 2.8285714285714287e-05, "loss": 1.3784, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 694, "tokens_per_second_per_gpu": 7.66 }, { "epoch": 0.566884176182708, "grad_norm": 0.44254523515701294, "learning_rate": 2.83265306122449e-05, "loss": 1.314, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 695, "tokens_per_second_per_gpu": 12.73 }, { "epoch": 0.567699836867863, "grad_norm": 0.5748714208602905, "learning_rate": 2.8367346938775514e-05, "loss": 1.3777, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 696, "tokens_per_second_per_gpu": 1.72 }, { "epoch": 0.5685154975530179, "grad_norm": 0.5533417463302612, "learning_rate": 2.8408163265306127e-05, "loss": 1.3363, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 697, "tokens_per_second_per_gpu": 11.5 }, { "epoch": 0.5693311582381729, "grad_norm": 0.40582939982414246, "learning_rate": 2.8448979591836737e-05, "loss": 1.3707, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 698, "tokens_per_second_per_gpu": 4.43 }, { "epoch": 0.5701468189233279, "grad_norm": 0.43507158756256104, "learning_rate": 2.8489795918367347e-05, "loss": 1.2478, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 699, "tokens_per_second_per_gpu": 2.99 }, { "epoch": 0.5709624796084829, "grad_norm": 0.6379005908966064, "learning_rate": 2.8530612244897957e-05, "loss": 1.3496, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 700, "tokens_per_second_per_gpu": 3.87 }, { "epoch": 0.5717781402936378, "grad_norm": 0.5367976427078247, "learning_rate": 2.857142857142857e-05, "loss": 1.439, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 701, "tokens_per_second_per_gpu": 5.0 }, { "epoch": 0.5725938009787929, "grad_norm": 0.4133274555206299, "learning_rate": 2.8612244897959184e-05, "loss": 1.3931, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 702, "tokens_per_second_per_gpu": 10.07 }, { "epoch": 0.5734094616639478, "grad_norm": 0.24529893696308136, "learning_rate": 2.8653061224489798e-05, "loss": 1.3161, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 703, "tokens_per_second_per_gpu": 17.07 }, { "epoch": 0.5742251223491027, "grad_norm": 0.42475661635398865, "learning_rate": 2.8693877551020408e-05, "loss": 1.4441, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 704, "tokens_per_second_per_gpu": 14.21 }, { "epoch": 0.5750407830342578, "grad_norm": 0.27682387828826904, "learning_rate": 2.873469387755102e-05, "loss": 1.2157, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 705, "tokens_per_second_per_gpu": 12.58 }, { "epoch": 0.5758564437194127, "grad_norm": 0.6383972764015198, "learning_rate": 2.8775510204081635e-05, "loss": 1.3599, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 706, "tokens_per_second_per_gpu": 4.29 }, { "epoch": 0.5766721044045677, "grad_norm": 0.4300104081630707, "learning_rate": 2.8816326530612248e-05, "loss": 1.3684, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 707, "tokens_per_second_per_gpu": 13.71 }, { "epoch": 0.5774877650897227, "grad_norm": 0.5621813535690308, "learning_rate": 2.885714285714286e-05, "loss": 1.3994, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 708, "tokens_per_second_per_gpu": 9.74 }, { "epoch": 0.5783034257748777, "grad_norm": 0.45237836241722107, "learning_rate": 2.889795918367347e-05, "loss": 1.3175, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 709, "tokens_per_second_per_gpu": 7.56 }, { "epoch": 0.5791190864600326, "grad_norm": 0.5194471478462219, "learning_rate": 2.8938775510204082e-05, "loss": 1.3863, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 710, "tokens_per_second_per_gpu": 4.32 }, { "epoch": 0.5799347471451876, "grad_norm": 0.3001406788825989, "learning_rate": 2.8979591836734692e-05, "loss": 1.2711, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 711, "tokens_per_second_per_gpu": 11.9 }, { "epoch": 0.5807504078303426, "grad_norm": 0.5264477133750916, "learning_rate": 2.9020408163265305e-05, "loss": 1.2549, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 712, "tokens_per_second_per_gpu": 6.08 }, { "epoch": 0.5815660685154975, "grad_norm": 0.4894733428955078, "learning_rate": 2.906122448979592e-05, "loss": 1.287, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 713, "tokens_per_second_per_gpu": 3.87 }, { "epoch": 0.5823817292006526, "grad_norm": 0.39275678992271423, "learning_rate": 2.9102040816326532e-05, "loss": 1.3353, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 714, "tokens_per_second_per_gpu": 10.92 }, { "epoch": 0.5831973898858075, "grad_norm": 0.6444816589355469, "learning_rate": 2.9142857142857146e-05, "loss": 1.3169, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 715, "tokens_per_second_per_gpu": 12.43 }, { "epoch": 0.5840130505709625, "grad_norm": 0.3352125585079193, "learning_rate": 2.9183673469387756e-05, "loss": 1.383, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 716, "tokens_per_second_per_gpu": 6.88 }, { "epoch": 0.5848287112561175, "grad_norm": 0.5855259299278259, "learning_rate": 2.922448979591837e-05, "loss": 1.3397, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 717, "tokens_per_second_per_gpu": 9.32 }, { "epoch": 0.5856443719412724, "grad_norm": 1.8574632406234741, "learning_rate": 2.9265306122448982e-05, "loss": 1.3805, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 718, "tokens_per_second_per_gpu": 7.08 }, { "epoch": 0.5864600326264274, "grad_norm": 0.5146313309669495, "learning_rate": 2.9306122448979596e-05, "loss": 1.3054, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 719, "tokens_per_second_per_gpu": 11.35 }, { "epoch": 0.5872756933115824, "grad_norm": 0.5770206451416016, "learning_rate": 2.9346938775510206e-05, "loss": 1.4036, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 720, "tokens_per_second_per_gpu": 4.47 }, { "epoch": 0.5880913539967374, "grad_norm": 0.6004086136817932, "learning_rate": 2.938775510204082e-05, "loss": 1.3655, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 721, "tokens_per_second_per_gpu": 6.05 }, { "epoch": 0.5889070146818923, "grad_norm": 0.845537543296814, "learning_rate": 2.9428571428571426e-05, "loss": 1.3899, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 722, "tokens_per_second_per_gpu": 6.91 }, { "epoch": 0.5897226753670473, "grad_norm": 0.3231058716773987, "learning_rate": 2.946938775510204e-05, "loss": 1.3181, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 723, "tokens_per_second_per_gpu": 9.95 }, { "epoch": 0.5905383360522023, "grad_norm": 0.3838690221309662, "learning_rate": 2.9510204081632653e-05, "loss": 1.3325, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 724, "tokens_per_second_per_gpu": 6.46 }, { "epoch": 0.5913539967373572, "grad_norm": 0.5227867960929871, "learning_rate": 2.9551020408163266e-05, "loss": 1.3211, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 725, "tokens_per_second_per_gpu": 12.84 }, { "epoch": 0.5921696574225123, "grad_norm": 0.6033756136894226, "learning_rate": 2.959183673469388e-05, "loss": 1.3579, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 726, "tokens_per_second_per_gpu": 10.0 }, { "epoch": 0.5929853181076672, "grad_norm": 0.4959680140018463, "learning_rate": 2.963265306122449e-05, "loss": 1.3437, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 727, "tokens_per_second_per_gpu": 8.35 }, { "epoch": 0.5938009787928222, "grad_norm": 0.5851319432258606, "learning_rate": 2.9673469387755103e-05, "loss": 1.4252, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 728, "tokens_per_second_per_gpu": 5.8 }, { "epoch": 0.5946166394779772, "grad_norm": 0.43418529629707336, "learning_rate": 2.9714285714285717e-05, "loss": 1.3886, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 729, "tokens_per_second_per_gpu": 10.97 }, { "epoch": 0.5954323001631321, "grad_norm": 0.5129976272583008, "learning_rate": 2.975510204081633e-05, "loss": 1.4142, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 730, "tokens_per_second_per_gpu": 11.45 }, { "epoch": 0.5962479608482871, "grad_norm": 0.6235158443450928, "learning_rate": 2.9795918367346944e-05, "loss": 1.3516, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 731, "tokens_per_second_per_gpu": 4.87 }, { "epoch": 0.5970636215334421, "grad_norm": 0.6781034469604492, "learning_rate": 2.9836734693877554e-05, "loss": 1.2171, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 732, "tokens_per_second_per_gpu": 12.61 }, { "epoch": 0.5978792822185971, "grad_norm": 0.6993443369865417, "learning_rate": 2.987755102040816e-05, "loss": 1.5267, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 733, "tokens_per_second_per_gpu": 4.91 }, { "epoch": 0.598694942903752, "grad_norm": 0.5211313962936401, "learning_rate": 2.9918367346938774e-05, "loss": 1.3305, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 734, "tokens_per_second_per_gpu": 15.57 }, { "epoch": 0.5995106035889071, "grad_norm": 0.7690359354019165, "learning_rate": 2.9959183673469387e-05, "loss": 1.3036, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 735, "tokens_per_second_per_gpu": 6.59 }, { "epoch": 0.600326264274062, "grad_norm": 0.4698643684387207, "learning_rate": 3e-05, "loss": 1.3996, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 736, "tokens_per_second_per_gpu": 14.54 }, { "epoch": 0.6011419249592169, "grad_norm": 0.43188780546188354, "learning_rate": 3.0040816326530614e-05, "loss": 1.4304, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 737, "tokens_per_second_per_gpu": 6.61 }, { "epoch": 0.601957585644372, "grad_norm": 0.5587193369865417, "learning_rate": 3.0081632653061224e-05, "loss": 1.2551, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 738, "tokens_per_second_per_gpu": 8.24 }, { "epoch": 0.6027732463295269, "grad_norm": 0.9568424820899963, "learning_rate": 3.0122448979591838e-05, "loss": 1.369, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 739, "tokens_per_second_per_gpu": 10.35 }, { "epoch": 0.6035889070146819, "grad_norm": 0.4554611146450043, "learning_rate": 3.016326530612245e-05, "loss": 1.3274, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 740, "tokens_per_second_per_gpu": 9.44 }, { "epoch": 0.6044045676998369, "grad_norm": 0.4750651717185974, "learning_rate": 3.0204081632653065e-05, "loss": 1.3025, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 741, "tokens_per_second_per_gpu": 7.24 }, { "epoch": 0.6052202283849919, "grad_norm": 0.6417492628097534, "learning_rate": 3.0244897959183678e-05, "loss": 1.3655, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 742, "tokens_per_second_per_gpu": 2.33 }, { "epoch": 0.6060358890701468, "grad_norm": 0.38050809502601624, "learning_rate": 3.0285714285714288e-05, "loss": 1.3337, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 743, "tokens_per_second_per_gpu": 12.35 }, { "epoch": 0.6068515497553018, "grad_norm": 0.45447278022766113, "learning_rate": 3.0326530612244898e-05, "loss": 1.2764, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 744, "tokens_per_second_per_gpu": 14.17 }, { "epoch": 0.6076672104404568, "grad_norm": 0.537318766117096, "learning_rate": 3.0367346938775508e-05, "loss": 1.2911, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 745, "tokens_per_second_per_gpu": 18.93 }, { "epoch": 0.6084828711256117, "grad_norm": 0.6125952005386353, "learning_rate": 3.040816326530612e-05, "loss": 1.4578, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 746, "tokens_per_second_per_gpu": 4.87 }, { "epoch": 0.6092985318107668, "grad_norm": 0.4685211479663849, "learning_rate": 3.0448979591836735e-05, "loss": 1.3234, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 747, "tokens_per_second_per_gpu": 17.31 }, { "epoch": 0.6101141924959217, "grad_norm": 0.5809246301651001, "learning_rate": 3.048979591836735e-05, "loss": 1.3267, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 748, "tokens_per_second_per_gpu": 4.59 }, { "epoch": 0.6109298531810766, "grad_norm": 0.5616806149482727, "learning_rate": 3.053061224489796e-05, "loss": 1.3713, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 749, "tokens_per_second_per_gpu": 13.29 }, { "epoch": 0.6117455138662317, "grad_norm": 0.40785810351371765, "learning_rate": 3.057142857142857e-05, "loss": 1.3466, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 750, "tokens_per_second_per_gpu": 13.76 }, { "epoch": 0.6125611745513866, "grad_norm": 0.5907167196273804, "learning_rate": 3.061224489795919e-05, "loss": 1.3418, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 751, "tokens_per_second_per_gpu": 11.84 }, { "epoch": 0.6133768352365416, "grad_norm": 0.5732439756393433, "learning_rate": 3.06530612244898e-05, "loss": 1.4186, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 752, "tokens_per_second_per_gpu": 6.61 }, { "epoch": 0.6141924959216966, "grad_norm": 0.44164371490478516, "learning_rate": 3.069387755102041e-05, "loss": 1.3313, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 753, "tokens_per_second_per_gpu": 4.97 }, { "epoch": 0.6150081566068516, "grad_norm": 0.3994731903076172, "learning_rate": 3.0734693877551026e-05, "loss": 1.4365, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 754, "tokens_per_second_per_gpu": 12.12 }, { "epoch": 0.6158238172920065, "grad_norm": 0.4144476652145386, "learning_rate": 3.0775510204081636e-05, "loss": 1.4055, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 755, "tokens_per_second_per_gpu": 17.1 }, { "epoch": 0.6166394779771615, "grad_norm": 0.46010127663612366, "learning_rate": 3.0816326530612246e-05, "loss": 1.3963, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 756, "tokens_per_second_per_gpu": 10.46 }, { "epoch": 0.6174551386623165, "grad_norm": 0.3978196680545807, "learning_rate": 3.0857142857142856e-05, "loss": 1.2565, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 757, "tokens_per_second_per_gpu": 14.83 }, { "epoch": 0.6182707993474714, "grad_norm": 0.5044733881950378, "learning_rate": 3.0897959183673466e-05, "loss": 1.4614, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 758, "tokens_per_second_per_gpu": 4.66 }, { "epoch": 0.6190864600326265, "grad_norm": 0.534279465675354, "learning_rate": 3.093877551020408e-05, "loss": 1.3259, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 759, "tokens_per_second_per_gpu": 9.83 }, { "epoch": 0.6199021207177814, "grad_norm": 0.3236335515975952, "learning_rate": 3.097959183673469e-05, "loss": 1.3294, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 760, "tokens_per_second_per_gpu": 9.91 }, { "epoch": 0.6207177814029364, "grad_norm": 0.5932289361953735, "learning_rate": 3.102040816326531e-05, "loss": 1.3911, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 761, "tokens_per_second_per_gpu": 12.59 }, { "epoch": 0.6215334420880914, "grad_norm": 0.3719445466995239, "learning_rate": 3.106122448979592e-05, "loss": 1.447, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 762, "tokens_per_second_per_gpu": 12.67 }, { "epoch": 0.6223491027732463, "grad_norm": 0.49305304884910583, "learning_rate": 3.110204081632653e-05, "loss": 1.2838, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 763, "tokens_per_second_per_gpu": 7.15 }, { "epoch": 0.6231647634584013, "grad_norm": 0.5940515995025635, "learning_rate": 3.114285714285715e-05, "loss": 1.3151, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 764, "tokens_per_second_per_gpu": 9.27 }, { "epoch": 0.6239804241435563, "grad_norm": 0.4119418263435364, "learning_rate": 3.118367346938776e-05, "loss": 1.2212, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 765, "tokens_per_second_per_gpu": 11.85 }, { "epoch": 0.6247960848287113, "grad_norm": 0.5385092496871948, "learning_rate": 3.1224489795918374e-05, "loss": 1.3987, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 766, "tokens_per_second_per_gpu": 9.54 }, { "epoch": 0.6256117455138662, "grad_norm": 0.4678514301776886, "learning_rate": 3.126530612244898e-05, "loss": 1.3366, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 767, "tokens_per_second_per_gpu": 10.96 }, { "epoch": 0.6264274061990212, "grad_norm": 0.4030945897102356, "learning_rate": 3.1306122448979594e-05, "loss": 1.3344, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 768, "tokens_per_second_per_gpu": 9.79 }, { "epoch": 0.6272430668841762, "grad_norm": 0.7005003094673157, "learning_rate": 3.1346938775510204e-05, "loss": 1.4047, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 769, "tokens_per_second_per_gpu": 5.44 }, { "epoch": 0.6280587275693311, "grad_norm": 0.4531595706939697, "learning_rate": 3.1387755102040814e-05, "loss": 1.3103, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 770, "tokens_per_second_per_gpu": 7.76 }, { "epoch": 0.6288743882544862, "grad_norm": 0.5198403000831604, "learning_rate": 3.142857142857143e-05, "loss": 1.4308, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 771, "tokens_per_second_per_gpu": 7.93 }, { "epoch": 0.6296900489396411, "grad_norm": 0.5160313844680786, "learning_rate": 3.146938775510204e-05, "loss": 1.2981, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 772, "tokens_per_second_per_gpu": 9.85 }, { "epoch": 0.6305057096247961, "grad_norm": 0.3997355103492737, "learning_rate": 3.151020408163266e-05, "loss": 1.3135, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 773, "tokens_per_second_per_gpu": 17.32 }, { "epoch": 0.6313213703099511, "grad_norm": 0.5834109783172607, "learning_rate": 3.155102040816327e-05, "loss": 1.2447, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 774, "tokens_per_second_per_gpu": 14.0 }, { "epoch": 0.632137030995106, "grad_norm": 0.3795055150985718, "learning_rate": 3.159183673469388e-05, "loss": 1.3989, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 775, "tokens_per_second_per_gpu": 14.83 }, { "epoch": 0.632952691680261, "grad_norm": 0.7176637053489685, "learning_rate": 3.1632653061224494e-05, "loss": 1.244, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 776, "tokens_per_second_per_gpu": 5.21 }, { "epoch": 0.633768352365416, "grad_norm": 0.49444082379341125, "learning_rate": 3.1673469387755105e-05, "loss": 1.3913, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 777, "tokens_per_second_per_gpu": 7.47 }, { "epoch": 0.634584013050571, "grad_norm": 0.5121574997901917, "learning_rate": 3.1714285714285715e-05, "loss": 1.3328, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 778, "tokens_per_second_per_gpu": 8.0 }, { "epoch": 0.6353996737357259, "grad_norm": 0.6201639175415039, "learning_rate": 3.1755102040816325e-05, "loss": 1.2262, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 779, "tokens_per_second_per_gpu": 11.45 }, { "epoch": 0.636215334420881, "grad_norm": 0.39004719257354736, "learning_rate": 3.179591836734694e-05, "loss": 1.3787, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 780, "tokens_per_second_per_gpu": 17.56 }, { "epoch": 0.6370309951060359, "grad_norm": 0.42907750606536865, "learning_rate": 3.183673469387755e-05, "loss": 1.3021, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 781, "tokens_per_second_per_gpu": 8.69 }, { "epoch": 0.6378466557911908, "grad_norm": 0.41234907507896423, "learning_rate": 3.187755102040816e-05, "loss": 1.3732, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 782, "tokens_per_second_per_gpu": 9.03 }, { "epoch": 0.6386623164763459, "grad_norm": 0.6401681900024414, "learning_rate": 3.191836734693878e-05, "loss": 1.4133, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 783, "tokens_per_second_per_gpu": 9.05 }, { "epoch": 0.6394779771615008, "grad_norm": 0.8623344898223877, "learning_rate": 3.195918367346939e-05, "loss": 1.2263, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 784, "tokens_per_second_per_gpu": 10.47 }, { "epoch": 0.6402936378466558, "grad_norm": 0.5777239799499512, "learning_rate": 3.2000000000000005e-05, "loss": 1.4035, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 785, "tokens_per_second_per_gpu": 10.06 }, { "epoch": 0.6411092985318108, "grad_norm": 0.48366740345954895, "learning_rate": 3.2040816326530615e-05, "loss": 1.2892, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 786, "tokens_per_second_per_gpu": 4.36 }, { "epoch": 0.6419249592169658, "grad_norm": 0.6662882566452026, "learning_rate": 3.2081632653061225e-05, "loss": 1.3027, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 787, "tokens_per_second_per_gpu": 6.88 }, { "epoch": 0.6427406199021207, "grad_norm": 0.6265098452568054, "learning_rate": 3.212244897959184e-05, "loss": 1.3435, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 788, "tokens_per_second_per_gpu": 13.95 }, { "epoch": 0.6435562805872757, "grad_norm": 0.651458203792572, "learning_rate": 3.216326530612245e-05, "loss": 1.4074, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 789, "tokens_per_second_per_gpu": 12.73 }, { "epoch": 0.6443719412724307, "grad_norm": 0.46413683891296387, "learning_rate": 3.220408163265306e-05, "loss": 1.4439, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 790, "tokens_per_second_per_gpu": 11.52 }, { "epoch": 0.6451876019575856, "grad_norm": 0.6439623832702637, "learning_rate": 3.224489795918367e-05, "loss": 1.371, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 791, "tokens_per_second_per_gpu": 7.46 }, { "epoch": 0.6460032626427407, "grad_norm": 0.469061940908432, "learning_rate": 3.228571428571428e-05, "loss": 1.3779, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 792, "tokens_per_second_per_gpu": 13.06 }, { "epoch": 0.6468189233278956, "grad_norm": 0.39909204840660095, "learning_rate": 3.23265306122449e-05, "loss": 1.3128, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 793, "tokens_per_second_per_gpu": 13.06 }, { "epoch": 0.6476345840130505, "grad_norm": 0.5753593444824219, "learning_rate": 3.236734693877551e-05, "loss": 1.3614, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 794, "tokens_per_second_per_gpu": 4.86 }, { "epoch": 0.6484502446982056, "grad_norm": 0.27514269948005676, "learning_rate": 3.2408163265306126e-05, "loss": 1.2641, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 795, "tokens_per_second_per_gpu": 4.63 }, { "epoch": 0.6492659053833605, "grad_norm": 0.43858593702316284, "learning_rate": 3.2448979591836736e-05, "loss": 1.3528, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 796, "tokens_per_second_per_gpu": 13.62 }, { "epoch": 0.6500815660685155, "grad_norm": 0.33681634068489075, "learning_rate": 3.2489795918367346e-05, "loss": 1.3888, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 797, "tokens_per_second_per_gpu": 18.63 }, { "epoch": 0.6508972267536705, "grad_norm": 0.4185873866081238, "learning_rate": 3.253061224489796e-05, "loss": 1.3837, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 798, "tokens_per_second_per_gpu": 9.58 }, { "epoch": 0.6517128874388255, "grad_norm": 0.6606593728065491, "learning_rate": 3.257142857142857e-05, "loss": 1.4409, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 799, "tokens_per_second_per_gpu": 7.68 }, { "epoch": 0.6525285481239804, "grad_norm": 0.645282506942749, "learning_rate": 3.261224489795919e-05, "loss": 1.5335, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 800, "tokens_per_second_per_gpu": 1.8 }, { "epoch": 0.6533442088091354, "grad_norm": 0.34779593348503113, "learning_rate": 3.265306122448979e-05, "loss": 1.3156, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 801, "tokens_per_second_per_gpu": 8.29 }, { "epoch": 0.6541598694942904, "grad_norm": 0.6265929937362671, "learning_rate": 3.269387755102041e-05, "loss": 1.3522, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 802, "tokens_per_second_per_gpu": 4.68 }, { "epoch": 0.6549755301794453, "grad_norm": 0.3549352288246155, "learning_rate": 3.273469387755102e-05, "loss": 1.3089, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 803, "tokens_per_second_per_gpu": 7.17 }, { "epoch": 0.6557911908646004, "grad_norm": 0.27069467306137085, "learning_rate": 3.277551020408163e-05, "loss": 1.3096, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 804, "tokens_per_second_per_gpu": 9.7 }, { "epoch": 0.6566068515497553, "grad_norm": 0.5482134819030762, "learning_rate": 3.281632653061225e-05, "loss": 1.4444, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 805, "tokens_per_second_per_gpu": 4.51 }, { "epoch": 0.6574225122349103, "grad_norm": 0.3811684548854828, "learning_rate": 3.285714285714286e-05, "loss": 1.2748, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 806, "tokens_per_second_per_gpu": 8.04 }, { "epoch": 0.6582381729200653, "grad_norm": 0.5666738748550415, "learning_rate": 3.2897959183673474e-05, "loss": 1.3157, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 807, "tokens_per_second_per_gpu": 6.96 }, { "epoch": 0.6590538336052202, "grad_norm": 0.47155871987342834, "learning_rate": 3.2938775510204084e-05, "loss": 1.3126, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 808, "tokens_per_second_per_gpu": 15.11 }, { "epoch": 0.6598694942903752, "grad_norm": 0.6468315720558167, "learning_rate": 3.2979591836734694e-05, "loss": 1.2621, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 809, "tokens_per_second_per_gpu": 4.22 }, { "epoch": 0.6606851549755302, "grad_norm": 0.43057864904403687, "learning_rate": 3.302040816326531e-05, "loss": 1.1816, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 810, "tokens_per_second_per_gpu": 5.88 }, { "epoch": 0.6615008156606852, "grad_norm": 0.4684527516365051, "learning_rate": 3.306122448979592e-05, "loss": 1.3426, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 811, "tokens_per_second_per_gpu": 4.81 }, { "epoch": 0.6623164763458401, "grad_norm": 0.7018337845802307, "learning_rate": 3.310204081632653e-05, "loss": 1.3225, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 812, "tokens_per_second_per_gpu": 12.18 }, { "epoch": 0.6631321370309952, "grad_norm": 0.4303826093673706, "learning_rate": 3.314285714285714e-05, "loss": 1.4278, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 813, "tokens_per_second_per_gpu": 20.42 }, { "epoch": 0.6639477977161501, "grad_norm": 0.3324776589870453, "learning_rate": 3.318367346938776e-05, "loss": 1.2696, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 814, "tokens_per_second_per_gpu": 4.44 }, { "epoch": 0.664763458401305, "grad_norm": 0.5425388216972351, "learning_rate": 3.322448979591837e-05, "loss": 1.2198, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 815, "tokens_per_second_per_gpu": 12.37 }, { "epoch": 0.6655791190864601, "grad_norm": 0.43363532423973083, "learning_rate": 3.326530612244898e-05, "loss": 1.2089, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 816, "tokens_per_second_per_gpu": 10.19 }, { "epoch": 0.666394779771615, "grad_norm": 0.7394403219223022, "learning_rate": 3.3306122448979595e-05, "loss": 1.2638, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 817, "tokens_per_second_per_gpu": 6.63 }, { "epoch": 0.66721044045677, "grad_norm": 0.5427343845367432, "learning_rate": 3.3346938775510205e-05, "loss": 1.3262, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 818, "tokens_per_second_per_gpu": 5.62 }, { "epoch": 0.668026101141925, "grad_norm": 0.5513439774513245, "learning_rate": 3.338775510204082e-05, "loss": 1.2745, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 819, "tokens_per_second_per_gpu": 2.11 }, { "epoch": 0.6688417618270799, "grad_norm": 0.478743314743042, "learning_rate": 3.342857142857143e-05, "loss": 1.3696, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 820, "tokens_per_second_per_gpu": 8.57 }, { "epoch": 0.6696574225122349, "grad_norm": 0.3927666246891022, "learning_rate": 3.346938775510204e-05, "loss": 1.3567, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 821, "tokens_per_second_per_gpu": 10.48 }, { "epoch": 0.6704730831973899, "grad_norm": 0.4408244788646698, "learning_rate": 3.351020408163266e-05, "loss": 1.2645, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 822, "tokens_per_second_per_gpu": 16.75 }, { "epoch": 0.6712887438825449, "grad_norm": 0.41693928837776184, "learning_rate": 3.355102040816327e-05, "loss": 1.3032, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 823, "tokens_per_second_per_gpu": 10.07 }, { "epoch": 0.6721044045676998, "grad_norm": 0.4946994483470917, "learning_rate": 3.359183673469388e-05, "loss": 1.3254, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 824, "tokens_per_second_per_gpu": 7.01 }, { "epoch": 0.6729200652528549, "grad_norm": 0.5838682055473328, "learning_rate": 3.363265306122449e-05, "loss": 1.2843, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 825, "tokens_per_second_per_gpu": 13.03 }, { "epoch": 0.6737357259380098, "grad_norm": 0.4230104386806488, "learning_rate": 3.36734693877551e-05, "loss": 1.2633, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 826, "tokens_per_second_per_gpu": 4.74 }, { "epoch": 0.6745513866231647, "grad_norm": 0.49891456961631775, "learning_rate": 3.3714285714285716e-05, "loss": 1.3284, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 827, "tokens_per_second_per_gpu": 16.54 }, { "epoch": 0.6753670473083198, "grad_norm": 0.6883929967880249, "learning_rate": 3.3755102040816326e-05, "loss": 1.4418, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 828, "tokens_per_second_per_gpu": 9.07 }, { "epoch": 0.6761827079934747, "grad_norm": 0.48337942361831665, "learning_rate": 3.379591836734694e-05, "loss": 1.3712, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 829, "tokens_per_second_per_gpu": 6.7 }, { "epoch": 0.6769983686786297, "grad_norm": 0.5966946482658386, "learning_rate": 3.383673469387755e-05, "loss": 1.289, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 830, "tokens_per_second_per_gpu": 6.14 }, { "epoch": 0.6778140293637847, "grad_norm": 0.45917853713035583, "learning_rate": 3.387755102040816e-05, "loss": 1.3145, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 831, "tokens_per_second_per_gpu": 3.38 }, { "epoch": 0.6786296900489397, "grad_norm": 0.5946701169013977, "learning_rate": 3.391836734693878e-05, "loss": 1.3387, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 832, "tokens_per_second_per_gpu": 5.04 }, { "epoch": 0.6794453507340946, "grad_norm": 0.7094697952270508, "learning_rate": 3.395918367346939e-05, "loss": 1.2674, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 833, "tokens_per_second_per_gpu": 7.09 }, { "epoch": 0.6802610114192496, "grad_norm": 0.512756884098053, "learning_rate": 3.4000000000000007e-05, "loss": 1.3935, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 834, "tokens_per_second_per_gpu": 9.49 }, { "epoch": 0.6810766721044046, "grad_norm": 0.4943753480911255, "learning_rate": 3.404081632653061e-05, "loss": 1.266, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 835, "tokens_per_second_per_gpu": 10.69 }, { "epoch": 0.6818923327895595, "grad_norm": 0.44899982213974, "learning_rate": 3.408163265306123e-05, "loss": 1.2977, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 836, "tokens_per_second_per_gpu": 8.99 }, { "epoch": 0.6827079934747146, "grad_norm": 0.651210606098175, "learning_rate": 3.412244897959184e-05, "loss": 1.4604, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 837, "tokens_per_second_per_gpu": 2.35 }, { "epoch": 0.6835236541598695, "grad_norm": 0.44280731678009033, "learning_rate": 3.416326530612245e-05, "loss": 1.2227, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 838, "tokens_per_second_per_gpu": 9.64 }, { "epoch": 0.6843393148450244, "grad_norm": 0.5581017732620239, "learning_rate": 3.4204081632653064e-05, "loss": 1.3179, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 839, "tokens_per_second_per_gpu": 10.62 }, { "epoch": 0.6851549755301795, "grad_norm": 0.49037325382232666, "learning_rate": 3.4244897959183674e-05, "loss": 1.335, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 840, "tokens_per_second_per_gpu": 5.81 }, { "epoch": 0.6859706362153344, "grad_norm": 0.4312785863876343, "learning_rate": 3.428571428571429e-05, "loss": 1.2869, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 841, "tokens_per_second_per_gpu": 6.25 }, { "epoch": 0.6867862969004894, "grad_norm": 0.5511866211891174, "learning_rate": 3.43265306122449e-05, "loss": 1.4158, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 842, "tokens_per_second_per_gpu": 5.86 }, { "epoch": 0.6876019575856444, "grad_norm": 0.4785636067390442, "learning_rate": 3.436734693877551e-05, "loss": 1.3189, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 843, "tokens_per_second_per_gpu": 8.89 }, { "epoch": 0.6884176182707994, "grad_norm": 0.594661295413971, "learning_rate": 3.440816326530613e-05, "loss": 1.3025, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 844, "tokens_per_second_per_gpu": 16.04 }, { "epoch": 0.6892332789559543, "grad_norm": 0.5739701986312866, "learning_rate": 3.444897959183674e-05, "loss": 1.4191, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 845, "tokens_per_second_per_gpu": 5.72 }, { "epoch": 0.6900489396411092, "grad_norm": 0.5129398703575134, "learning_rate": 3.4489795918367354e-05, "loss": 1.299, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 846, "tokens_per_second_per_gpu": 6.32 }, { "epoch": 0.6908646003262643, "grad_norm": 0.4429929256439209, "learning_rate": 3.453061224489796e-05, "loss": 1.2908, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 847, "tokens_per_second_per_gpu": 9.46 }, { "epoch": 0.6916802610114192, "grad_norm": 0.37880799174308777, "learning_rate": 3.4571428571428574e-05, "loss": 1.3911, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 848, "tokens_per_second_per_gpu": 8.05 }, { "epoch": 0.6924959216965743, "grad_norm": 0.4822133481502533, "learning_rate": 3.4612244897959184e-05, "loss": 1.2674, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 849, "tokens_per_second_per_gpu": 8.3 }, { "epoch": 0.6933115823817292, "grad_norm": 0.5693739056587219, "learning_rate": 3.4653061224489795e-05, "loss": 1.3648, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 850, "tokens_per_second_per_gpu": 4.48 }, { "epoch": 0.6941272430668842, "grad_norm": 0.503729522228241, "learning_rate": 3.469387755102041e-05, "loss": 1.1915, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 851, "tokens_per_second_per_gpu": 12.28 }, { "epoch": 0.6949429037520392, "grad_norm": 0.3393414616584778, "learning_rate": 3.473469387755102e-05, "loss": 1.1799, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 852, "tokens_per_second_per_gpu": 13.36 }, { "epoch": 0.6957585644371941, "grad_norm": 0.4509128928184509, "learning_rate": 3.477551020408164e-05, "loss": 1.2612, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 853, "tokens_per_second_per_gpu": 7.96 }, { "epoch": 0.6965742251223491, "grad_norm": 0.6510946154594421, "learning_rate": 3.481632653061225e-05, "loss": 1.281, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 854, "tokens_per_second_per_gpu": 3.32 }, { "epoch": 0.697389885807504, "grad_norm": 0.3593266010284424, "learning_rate": 3.485714285714286e-05, "loss": 1.2608, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 855, "tokens_per_second_per_gpu": 12.33 }, { "epoch": 0.6982055464926591, "grad_norm": 0.313568115234375, "learning_rate": 3.4897959183673475e-05, "loss": 1.3265, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 856, "tokens_per_second_per_gpu": 10.25 }, { "epoch": 0.699021207177814, "grad_norm": 0.4514428973197937, "learning_rate": 3.4938775510204085e-05, "loss": 1.3283, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 857, "tokens_per_second_per_gpu": 11.42 }, { "epoch": 0.6998368678629691, "grad_norm": 0.40112560987472534, "learning_rate": 3.4979591836734695e-05, "loss": 1.3563, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 858, "tokens_per_second_per_gpu": 16.24 }, { "epoch": 0.700652528548124, "grad_norm": 0.2901712656021118, "learning_rate": 3.5020408163265305e-05, "loss": 1.2969, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 859, "tokens_per_second_per_gpu": 6.13 }, { "epoch": 0.7014681892332789, "grad_norm": 0.6708291172981262, "learning_rate": 3.5061224489795915e-05, "loss": 1.4869, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 860, "tokens_per_second_per_gpu": 9.37 }, { "epoch": 0.702283849918434, "grad_norm": 0.5584856271743774, "learning_rate": 3.510204081632653e-05, "loss": 1.3321, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 861, "tokens_per_second_per_gpu": 10.99 }, { "epoch": 0.7030995106035889, "grad_norm": 0.5388907194137573, "learning_rate": 3.514285714285714e-05, "loss": 1.3706, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 862, "tokens_per_second_per_gpu": 4.85 }, { "epoch": 0.7039151712887439, "grad_norm": 0.5052027702331543, "learning_rate": 3.518367346938776e-05, "loss": 1.2713, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 863, "tokens_per_second_per_gpu": 5.16 }, { "epoch": 0.7047308319738989, "grad_norm": 0.3984014689922333, "learning_rate": 3.522448979591837e-05, "loss": 1.339, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 864, "tokens_per_second_per_gpu": 10.12 }, { "epoch": 0.7055464926590538, "grad_norm": 0.7061243057250977, "learning_rate": 3.526530612244898e-05, "loss": 1.4282, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 865, "tokens_per_second_per_gpu": 7.81 }, { "epoch": 0.7063621533442088, "grad_norm": 0.6075819730758667, "learning_rate": 3.5306122448979596e-05, "loss": 1.3458, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 866, "tokens_per_second_per_gpu": 5.63 }, { "epoch": 0.7071778140293637, "grad_norm": 0.4511563181877136, "learning_rate": 3.5346938775510206e-05, "loss": 1.3354, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 867, "tokens_per_second_per_gpu": 9.21 }, { "epoch": 0.7079934747145188, "grad_norm": 0.7093430757522583, "learning_rate": 3.538775510204082e-05, "loss": 1.3971, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 868, "tokens_per_second_per_gpu": 7.51 }, { "epoch": 0.7088091353996737, "grad_norm": 0.4191691279411316, "learning_rate": 3.5428571428571426e-05, "loss": 1.2687, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 869, "tokens_per_second_per_gpu": 8.17 }, { "epoch": 0.7096247960848288, "grad_norm": 0.41514644026756287, "learning_rate": 3.546938775510204e-05, "loss": 1.4937, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 870, "tokens_per_second_per_gpu": 17.64 }, { "epoch": 0.7104404567699837, "grad_norm": 0.5817702412605286, "learning_rate": 3.551020408163265e-05, "loss": 1.3688, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 871, "tokens_per_second_per_gpu": 4.77 }, { "epoch": 0.7112561174551386, "grad_norm": 0.5116757750511169, "learning_rate": 3.555102040816326e-05, "loss": 1.2722, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 872, "tokens_per_second_per_gpu": 6.35 }, { "epoch": 0.7120717781402937, "grad_norm": 0.44880375266075134, "learning_rate": 3.559183673469388e-05, "loss": 1.2972, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 873, "tokens_per_second_per_gpu": 5.76 }, { "epoch": 0.7128874388254486, "grad_norm": 0.5277480483055115, "learning_rate": 3.563265306122449e-05, "loss": 1.4012, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 874, "tokens_per_second_per_gpu": 11.19 }, { "epoch": 0.7137030995106036, "grad_norm": 0.4010769724845886, "learning_rate": 3.567346938775511e-05, "loss": 1.3668, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 875, "tokens_per_second_per_gpu": 13.34 }, { "epoch": 0.7145187601957586, "grad_norm": 0.6195709705352783, "learning_rate": 3.571428571428572e-05, "loss": 1.2433, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 876, "tokens_per_second_per_gpu": 11.64 }, { "epoch": 0.7153344208809136, "grad_norm": 0.5131180286407471, "learning_rate": 3.575510204081633e-05, "loss": 1.3098, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 877, "tokens_per_second_per_gpu": 14.28 }, { "epoch": 0.7161500815660685, "grad_norm": 0.5309876799583435, "learning_rate": 3.5795918367346944e-05, "loss": 1.3474, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 878, "tokens_per_second_per_gpu": 3.11 }, { "epoch": 0.7169657422512234, "grad_norm": 0.533043622970581, "learning_rate": 3.5836734693877554e-05, "loss": 1.3575, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 879, "tokens_per_second_per_gpu": 9.3 }, { "epoch": 0.7177814029363785, "grad_norm": 0.6497002840042114, "learning_rate": 3.587755102040817e-05, "loss": 1.3019, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 880, "tokens_per_second_per_gpu": 6.33 }, { "epoch": 0.7185970636215334, "grad_norm": 0.5856229066848755, "learning_rate": 3.5918367346938774e-05, "loss": 1.3962, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 881, "tokens_per_second_per_gpu": 5.54 }, { "epoch": 0.7194127243066885, "grad_norm": 0.47242605686187744, "learning_rate": 3.595918367346939e-05, "loss": 1.4109, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 882, "tokens_per_second_per_gpu": 9.64 }, { "epoch": 0.7202283849918434, "grad_norm": 0.4616156220436096, "learning_rate": 3.6e-05, "loss": 1.2044, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 883, "tokens_per_second_per_gpu": 8.1 }, { "epoch": 0.7210440456769984, "grad_norm": 0.38666078448295593, "learning_rate": 3.604081632653061e-05, "loss": 1.2667, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 884, "tokens_per_second_per_gpu": 8.87 }, { "epoch": 0.7218597063621534, "grad_norm": 0.7753950357437134, "learning_rate": 3.608163265306123e-05, "loss": 1.451, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 885, "tokens_per_second_per_gpu": 1.42 }, { "epoch": 0.7226753670473083, "grad_norm": 0.4268159866333008, "learning_rate": 3.612244897959184e-05, "loss": 1.3236, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 886, "tokens_per_second_per_gpu": 8.22 }, { "epoch": 0.7234910277324633, "grad_norm": 0.685920000076294, "learning_rate": 3.6163265306122455e-05, "loss": 1.3855, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 887, "tokens_per_second_per_gpu": 5.02 }, { "epoch": 0.7243066884176182, "grad_norm": 0.5022143721580505, "learning_rate": 3.6204081632653065e-05, "loss": 1.3329, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 888, "tokens_per_second_per_gpu": 17.74 }, { "epoch": 0.7251223491027733, "grad_norm": 0.6718111038208008, "learning_rate": 3.6244897959183675e-05, "loss": 1.3947, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 889, "tokens_per_second_per_gpu": 7.31 }, { "epoch": 0.7259380097879282, "grad_norm": 0.6105276346206665, "learning_rate": 3.628571428571429e-05, "loss": 1.2897, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 890, "tokens_per_second_per_gpu": 5.73 }, { "epoch": 0.7267536704730831, "grad_norm": 0.5921761989593506, "learning_rate": 3.63265306122449e-05, "loss": 1.2479, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 891, "tokens_per_second_per_gpu": 9.11 }, { "epoch": 0.7275693311582382, "grad_norm": 0.582705020904541, "learning_rate": 3.636734693877551e-05, "loss": 1.239, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 892, "tokens_per_second_per_gpu": 4.93 }, { "epoch": 0.7283849918433931, "grad_norm": 0.40777963399887085, "learning_rate": 3.640816326530612e-05, "loss": 1.2849, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 893, "tokens_per_second_per_gpu": 6.66 }, { "epoch": 0.7292006525285482, "grad_norm": 0.496194452047348, "learning_rate": 3.644897959183673e-05, "loss": 1.3645, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 894, "tokens_per_second_per_gpu": 13.3 }, { "epoch": 0.7300163132137031, "grad_norm": 0.5330689549446106, "learning_rate": 3.648979591836735e-05, "loss": 1.2672, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 895, "tokens_per_second_per_gpu": 13.42 }, { "epoch": 0.7308319738988581, "grad_norm": 0.5220062732696533, "learning_rate": 3.653061224489796e-05, "loss": 1.3378, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 896, "tokens_per_second_per_gpu": 11.22 }, { "epoch": 0.731647634584013, "grad_norm": 0.5318239331245422, "learning_rate": 3.6571428571428576e-05, "loss": 1.3233, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 897, "tokens_per_second_per_gpu": 7.84 }, { "epoch": 0.732463295269168, "grad_norm": 0.3729937970638275, "learning_rate": 3.6612244897959186e-05, "loss": 1.3637, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 898, "tokens_per_second_per_gpu": 12.07 }, { "epoch": 0.733278955954323, "grad_norm": 0.44320061802864075, "learning_rate": 3.6653061224489796e-05, "loss": 1.3371, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 899, "tokens_per_second_per_gpu": 10.66 }, { "epoch": 0.734094616639478, "grad_norm": 0.5697821378707886, "learning_rate": 3.669387755102041e-05, "loss": 1.4365, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 900, "tokens_per_second_per_gpu": 6.77 }, { "epoch": 0.734910277324633, "grad_norm": 0.5525500774383545, "learning_rate": 3.673469387755102e-05, "loss": 1.3253, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 901, "tokens_per_second_per_gpu": 6.08 }, { "epoch": 0.7357259380097879, "grad_norm": 0.49901241064071655, "learning_rate": 3.677551020408164e-05, "loss": 1.3749, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 902, "tokens_per_second_per_gpu": 2.54 }, { "epoch": 0.736541598694943, "grad_norm": 0.5969358086585999, "learning_rate": 3.681632653061224e-05, "loss": 1.3768, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 903, "tokens_per_second_per_gpu": 10.13 }, { "epoch": 0.7373572593800979, "grad_norm": 0.45161691308021545, "learning_rate": 3.685714285714286e-05, "loss": 1.3256, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 904, "tokens_per_second_per_gpu": 6.22 }, { "epoch": 0.7381729200652528, "grad_norm": 0.38519227504730225, "learning_rate": 3.689795918367347e-05, "loss": 1.3032, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 905, "tokens_per_second_per_gpu": 8.8 }, { "epoch": 0.7389885807504079, "grad_norm": 0.39090874791145325, "learning_rate": 3.693877551020408e-05, "loss": 1.348, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 906, "tokens_per_second_per_gpu": 11.98 }, { "epoch": 0.7398042414355628, "grad_norm": 0.3645041286945343, "learning_rate": 3.6979591836734696e-05, "loss": 1.2791, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 907, "tokens_per_second_per_gpu": 6.27 }, { "epoch": 0.7406199021207178, "grad_norm": 0.27714717388153076, "learning_rate": 3.7020408163265307e-05, "loss": 1.3287, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 908, "tokens_per_second_per_gpu": 8.82 }, { "epoch": 0.7414355628058727, "grad_norm": 0.532908022403717, "learning_rate": 3.706122448979592e-05, "loss": 1.3535, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 909, "tokens_per_second_per_gpu": 6.88 }, { "epoch": 0.7422512234910277, "grad_norm": 0.24954257905483246, "learning_rate": 3.7102040816326533e-05, "loss": 1.3489, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 910, "tokens_per_second_per_gpu": 21.58 }, { "epoch": 0.7430668841761827, "grad_norm": 0.43075287342071533, "learning_rate": 3.7142857142857143e-05, "loss": 1.3401, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 911, "tokens_per_second_per_gpu": 13.88 }, { "epoch": 0.7438825448613376, "grad_norm": 0.7245692014694214, "learning_rate": 3.718367346938776e-05, "loss": 1.3424, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 912, "tokens_per_second_per_gpu": 4.21 }, { "epoch": 0.7446982055464927, "grad_norm": 0.4177038371562958, "learning_rate": 3.722448979591837e-05, "loss": 1.2901, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 913, "tokens_per_second_per_gpu": 4.11 }, { "epoch": 0.7455138662316476, "grad_norm": 0.6875289082527161, "learning_rate": 3.726530612244899e-05, "loss": 1.3314, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 914, "tokens_per_second_per_gpu": 13.55 }, { "epoch": 0.7463295269168027, "grad_norm": 0.5166841149330139, "learning_rate": 3.730612244897959e-05, "loss": 1.271, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 915, "tokens_per_second_per_gpu": 8.2 }, { "epoch": 0.7471451876019576, "grad_norm": 0.38385629653930664, "learning_rate": 3.734693877551021e-05, "loss": 1.3894, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 916, "tokens_per_second_per_gpu": 11.19 }, { "epoch": 0.7479608482871125, "grad_norm": 0.3709266483783722, "learning_rate": 3.738775510204082e-05, "loss": 1.4157, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 917, "tokens_per_second_per_gpu": 12.06 }, { "epoch": 0.7487765089722676, "grad_norm": 0.47073864936828613, "learning_rate": 3.742857142857143e-05, "loss": 1.3362, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 918, "tokens_per_second_per_gpu": 8.64 }, { "epoch": 0.7495921696574225, "grad_norm": 0.44455674290657043, "learning_rate": 3.7469387755102044e-05, "loss": 1.2792, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 919, "tokens_per_second_per_gpu": 11.98 }, { "epoch": 0.7504078303425775, "grad_norm": 0.3512020707130432, "learning_rate": 3.7510204081632654e-05, "loss": 1.2964, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 920, "tokens_per_second_per_gpu": 19.14 }, { "epoch": 0.7512234910277324, "grad_norm": 0.3672789931297302, "learning_rate": 3.7551020408163264e-05, "loss": 1.4237, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 921, "tokens_per_second_per_gpu": 8.66 }, { "epoch": 0.7520391517128875, "grad_norm": 0.4065171480178833, "learning_rate": 3.759183673469388e-05, "loss": 1.224, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 922, "tokens_per_second_per_gpu": 7.91 }, { "epoch": 0.7528548123980424, "grad_norm": 0.445993572473526, "learning_rate": 3.763265306122449e-05, "loss": 1.3176, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 923, "tokens_per_second_per_gpu": 8.57 }, { "epoch": 0.7536704730831973, "grad_norm": 0.6825697422027588, "learning_rate": 3.767346938775511e-05, "loss": 1.2902, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 924, "tokens_per_second_per_gpu": 12.46 }, { "epoch": 0.7544861337683524, "grad_norm": 0.45979467034339905, "learning_rate": 3.771428571428572e-05, "loss": 1.2058, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 925, "tokens_per_second_per_gpu": 9.4 }, { "epoch": 0.7553017944535073, "grad_norm": 0.46382641792297363, "learning_rate": 3.775510204081633e-05, "loss": 1.3737, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 926, "tokens_per_second_per_gpu": 13.57 }, { "epoch": 0.7561174551386624, "grad_norm": 0.3603871166706085, "learning_rate": 3.779591836734694e-05, "loss": 1.3289, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 927, "tokens_per_second_per_gpu": 5.19 }, { "epoch": 0.7569331158238173, "grad_norm": 0.5202210545539856, "learning_rate": 3.783673469387755e-05, "loss": 1.2479, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 928, "tokens_per_second_per_gpu": 11.73 }, { "epoch": 0.7577487765089723, "grad_norm": 0.41681578755378723, "learning_rate": 3.7877551020408165e-05, "loss": 1.4345, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 929, "tokens_per_second_per_gpu": 10.07 }, { "epoch": 0.7585644371941273, "grad_norm": 0.32660868763923645, "learning_rate": 3.7918367346938775e-05, "loss": 1.2568, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 930, "tokens_per_second_per_gpu": 14.61 }, { "epoch": 0.7593800978792822, "grad_norm": 0.6408383250236511, "learning_rate": 3.795918367346939e-05, "loss": 1.3121, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 931, "tokens_per_second_per_gpu": 8.08 }, { "epoch": 0.7601957585644372, "grad_norm": 0.37195152044296265, "learning_rate": 3.8e-05, "loss": 1.3559, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 932, "tokens_per_second_per_gpu": 11.29 }, { "epoch": 0.7610114192495921, "grad_norm": 0.3907943367958069, "learning_rate": 3.804081632653061e-05, "loss": 1.3095, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 933, "tokens_per_second_per_gpu": 16.92 }, { "epoch": 0.7618270799347472, "grad_norm": 0.5139783024787903, "learning_rate": 3.808163265306123e-05, "loss": 1.3452, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 934, "tokens_per_second_per_gpu": 3.42 }, { "epoch": 0.7626427406199021, "grad_norm": 0.43047332763671875, "learning_rate": 3.812244897959184e-05, "loss": 1.254, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 935, "tokens_per_second_per_gpu": 11.35 }, { "epoch": 0.763458401305057, "grad_norm": 0.4045772850513458, "learning_rate": 3.8163265306122456e-05, "loss": 1.2503, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 936, "tokens_per_second_per_gpu": 9.58 }, { "epoch": 0.7642740619902121, "grad_norm": 0.4480266571044922, "learning_rate": 3.820408163265306e-05, "loss": 1.3274, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 937, "tokens_per_second_per_gpu": 9.65 }, { "epoch": 0.765089722675367, "grad_norm": 0.2969704270362854, "learning_rate": 3.8244897959183676e-05, "loss": 1.3696, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 938, "tokens_per_second_per_gpu": 7.38 }, { "epoch": 0.765905383360522, "grad_norm": 0.39906999468803406, "learning_rate": 3.8285714285714286e-05, "loss": 1.3683, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 939, "tokens_per_second_per_gpu": 8.68 }, { "epoch": 0.766721044045677, "grad_norm": 0.32980722188949585, "learning_rate": 3.8326530612244896e-05, "loss": 1.3671, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 940, "tokens_per_second_per_gpu": 7.32 }, { "epoch": 0.767536704730832, "grad_norm": 0.46920305490493774, "learning_rate": 3.836734693877551e-05, "loss": 1.388, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 941, "tokens_per_second_per_gpu": 5.67 }, { "epoch": 0.768352365415987, "grad_norm": 0.4927214980125427, "learning_rate": 3.840816326530612e-05, "loss": 1.3674, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 942, "tokens_per_second_per_gpu": 11.88 }, { "epoch": 0.7691680261011419, "grad_norm": 0.30959412455558777, "learning_rate": 3.844897959183674e-05, "loss": 1.2615, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 943, "tokens_per_second_per_gpu": 14.83 }, { "epoch": 0.7699836867862969, "grad_norm": 0.5171527862548828, "learning_rate": 3.848979591836735e-05, "loss": 1.2826, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 944, "tokens_per_second_per_gpu": 6.38 }, { "epoch": 0.7707993474714518, "grad_norm": 0.45581454038619995, "learning_rate": 3.853061224489796e-05, "loss": 1.3147, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 945, "tokens_per_second_per_gpu": 10.68 }, { "epoch": 0.7716150081566069, "grad_norm": 0.6385869383811951, "learning_rate": 3.857142857142858e-05, "loss": 1.3848, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 946, "tokens_per_second_per_gpu": 5.84 }, { "epoch": 0.7724306688417618, "grad_norm": 0.6016236543655396, "learning_rate": 3.861224489795919e-05, "loss": 1.3452, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 947, "tokens_per_second_per_gpu": 10.49 }, { "epoch": 0.7732463295269169, "grad_norm": 0.48190510272979736, "learning_rate": 3.8653061224489804e-05, "loss": 1.4087, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 948, "tokens_per_second_per_gpu": 5.54 }, { "epoch": 0.7740619902120718, "grad_norm": 0.4573826491832733, "learning_rate": 3.869387755102041e-05, "loss": 1.3038, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 949, "tokens_per_second_per_gpu": 6.74 }, { "epoch": 0.7748776508972267, "grad_norm": 0.37575748562812805, "learning_rate": 3.8734693877551024e-05, "loss": 1.2215, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 950, "tokens_per_second_per_gpu": 8.51 }, { "epoch": 0.7756933115823818, "grad_norm": 0.49602410197257996, "learning_rate": 3.8775510204081634e-05, "loss": 1.3716, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 951, "tokens_per_second_per_gpu": 10.17 }, { "epoch": 0.7765089722675367, "grad_norm": 0.4339481294155121, "learning_rate": 3.8816326530612244e-05, "loss": 1.312, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 952, "tokens_per_second_per_gpu": 7.41 }, { "epoch": 0.7773246329526917, "grad_norm": 0.3770082890987396, "learning_rate": 3.885714285714286e-05, "loss": 1.3432, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 953, "tokens_per_second_per_gpu": 4.15 }, { "epoch": 0.7781402936378466, "grad_norm": 0.314221054315567, "learning_rate": 3.889795918367347e-05, "loss": 1.301, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 954, "tokens_per_second_per_gpu": 8.17 }, { "epoch": 0.7789559543230016, "grad_norm": 0.5170644521713257, "learning_rate": 3.893877551020408e-05, "loss": 1.3209, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 955, "tokens_per_second_per_gpu": 9.25 }, { "epoch": 0.7797716150081566, "grad_norm": 0.4686349630355835, "learning_rate": 3.89795918367347e-05, "loss": 1.3064, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 956, "tokens_per_second_per_gpu": 3.61 }, { "epoch": 0.7805872756933115, "grad_norm": 0.4161233603954315, "learning_rate": 3.902040816326531e-05, "loss": 1.4371, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 957, "tokens_per_second_per_gpu": 7.09 }, { "epoch": 0.7814029363784666, "grad_norm": 0.5770133137702942, "learning_rate": 3.9061224489795925e-05, "loss": 1.2781, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 958, "tokens_per_second_per_gpu": 11.3 }, { "epoch": 0.7822185970636215, "grad_norm": 0.5877393484115601, "learning_rate": 3.9102040816326535e-05, "loss": 1.4161, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 959, "tokens_per_second_per_gpu": 5.89 }, { "epoch": 0.7830342577487766, "grad_norm": 0.47084978222846985, "learning_rate": 3.9142857142857145e-05, "loss": 1.3446, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 960, "tokens_per_second_per_gpu": 5.61 }, { "epoch": 0.7838499184339315, "grad_norm": 0.3926248550415039, "learning_rate": 3.9183673469387755e-05, "loss": 1.2251, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 961, "tokens_per_second_per_gpu": 14.59 }, { "epoch": 0.7846655791190864, "grad_norm": 0.587943971157074, "learning_rate": 3.9224489795918365e-05, "loss": 1.3385, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 962, "tokens_per_second_per_gpu": 5.54 }, { "epoch": 0.7854812398042414, "grad_norm": 0.5045062303543091, "learning_rate": 3.926530612244898e-05, "loss": 1.3499, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 963, "tokens_per_second_per_gpu": 11.03 }, { "epoch": 0.7862969004893964, "grad_norm": 0.6623690128326416, "learning_rate": 3.930612244897959e-05, "loss": 1.3792, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 964, "tokens_per_second_per_gpu": 8.06 }, { "epoch": 0.7871125611745514, "grad_norm": 0.5103788375854492, "learning_rate": 3.934693877551021e-05, "loss": 1.2812, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 965, "tokens_per_second_per_gpu": 8.51 }, { "epoch": 0.7879282218597063, "grad_norm": 1.330450177192688, "learning_rate": 3.938775510204082e-05, "loss": 1.1596, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 966, "tokens_per_second_per_gpu": 9.36 }, { "epoch": 0.7887438825448614, "grad_norm": 0.9444594383239746, "learning_rate": 3.942857142857143e-05, "loss": 1.325, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 967, "tokens_per_second_per_gpu": 6.25 }, { "epoch": 0.7895595432300163, "grad_norm": 0.43867093324661255, "learning_rate": 3.9469387755102045e-05, "loss": 1.3378, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 968, "tokens_per_second_per_gpu": 5.68 }, { "epoch": 0.7903752039151712, "grad_norm": 0.46777647733688354, "learning_rate": 3.9510204081632655e-05, "loss": 1.2685, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 969, "tokens_per_second_per_gpu": 10.1 }, { "epoch": 0.7911908646003263, "grad_norm": 0.5907220244407654, "learning_rate": 3.955102040816327e-05, "loss": 1.3252, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 970, "tokens_per_second_per_gpu": 6.99 }, { "epoch": 0.7920065252854812, "grad_norm": 0.3631710112094879, "learning_rate": 3.9591836734693876e-05, "loss": 1.3686, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 971, "tokens_per_second_per_gpu": 15.85 }, { "epoch": 0.7928221859706363, "grad_norm": 0.5786092877388, "learning_rate": 3.963265306122449e-05, "loss": 1.3091, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 972, "tokens_per_second_per_gpu": 7.73 }, { "epoch": 0.7936378466557912, "grad_norm": 0.5699970722198486, "learning_rate": 3.96734693877551e-05, "loss": 1.3216, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 973, "tokens_per_second_per_gpu": 7.48 }, { "epoch": 0.7944535073409462, "grad_norm": 0.6171156764030457, "learning_rate": 3.971428571428571e-05, "loss": 1.3286, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 974, "tokens_per_second_per_gpu": 3.89 }, { "epoch": 0.7952691680261011, "grad_norm": 0.39571794867515564, "learning_rate": 3.975510204081633e-05, "loss": 1.338, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 975, "tokens_per_second_per_gpu": 10.51 }, { "epoch": 0.7960848287112561, "grad_norm": 0.5129187703132629, "learning_rate": 3.979591836734694e-05, "loss": 1.2851, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 976, "tokens_per_second_per_gpu": 10.15 }, { "epoch": 0.7969004893964111, "grad_norm": 0.3267071545124054, "learning_rate": 3.9836734693877556e-05, "loss": 1.3858, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 977, "tokens_per_second_per_gpu": 9.69 }, { "epoch": 0.797716150081566, "grad_norm": 0.5432353019714355, "learning_rate": 3.9877551020408166e-05, "loss": 1.3497, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 978, "tokens_per_second_per_gpu": 8.0 }, { "epoch": 0.7985318107667211, "grad_norm": 0.7501490116119385, "learning_rate": 3.9918367346938776e-05, "loss": 1.3055, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 979, "tokens_per_second_per_gpu": 11.67 }, { "epoch": 0.799347471451876, "grad_norm": 0.28615516424179077, "learning_rate": 3.995918367346939e-05, "loss": 1.2291, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 980, "tokens_per_second_per_gpu": 6.31 }, { "epoch": 0.8001631321370309, "grad_norm": 0.6200999617576599, "learning_rate": 4e-05, "loss": 1.3766, "memory/device_reserved (GiB)": 76.32, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 981, "tokens_per_second_per_gpu": 4.84 }, { "epoch": 0.800978792822186, "grad_norm": 0.5084929466247559, "learning_rate": 4.004081632653062e-05, "loss": 1.4172, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 982, "tokens_per_second_per_gpu": 9.86 }, { "epoch": 0.8017944535073409, "grad_norm": 0.285970538854599, "learning_rate": 4.008163265306122e-05, "loss": 1.4124, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 983, "tokens_per_second_per_gpu": 16.94 }, { "epoch": 0.802610114192496, "grad_norm": 0.5237358808517456, "learning_rate": 4.0122448979591833e-05, "loss": 1.2884, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 984, "tokens_per_second_per_gpu": 6.24 }, { "epoch": 0.8034257748776509, "grad_norm": 0.5700567364692688, "learning_rate": 4.016326530612245e-05, "loss": 1.1956, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 985, "tokens_per_second_per_gpu": 10.73 }, { "epoch": 0.8042414355628059, "grad_norm": 0.42709881067276, "learning_rate": 4.020408163265306e-05, "loss": 1.2758, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 986, "tokens_per_second_per_gpu": 6.76 }, { "epoch": 0.8050570962479608, "grad_norm": 0.4939895272254944, "learning_rate": 4.024489795918368e-05, "loss": 1.358, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 987, "tokens_per_second_per_gpu": 9.35 }, { "epoch": 0.8058727569331158, "grad_norm": 0.4240751266479492, "learning_rate": 4.028571428571429e-05, "loss": 1.3034, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 988, "tokens_per_second_per_gpu": 11.42 }, { "epoch": 0.8066884176182708, "grad_norm": 0.4759792685508728, "learning_rate": 4.03265306122449e-05, "loss": 1.2567, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 989, "tokens_per_second_per_gpu": 8.88 }, { "epoch": 0.8075040783034257, "grad_norm": 0.7285467982292175, "learning_rate": 4.0367346938775514e-05, "loss": 1.305, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 990, "tokens_per_second_per_gpu": 3.54 }, { "epoch": 0.8083197389885808, "grad_norm": 0.5189993381500244, "learning_rate": 4.0408163265306124e-05, "loss": 1.3055, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 991, "tokens_per_second_per_gpu": 19.76 }, { "epoch": 0.8091353996737357, "grad_norm": 0.42742300033569336, "learning_rate": 4.044897959183674e-05, "loss": 1.2764, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 992, "tokens_per_second_per_gpu": 9.03 }, { "epoch": 0.8099510603588908, "grad_norm": 0.3898756802082062, "learning_rate": 4.048979591836735e-05, "loss": 1.1923, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 993, "tokens_per_second_per_gpu": 5.2 }, { "epoch": 0.8107667210440457, "grad_norm": 0.3883078694343567, "learning_rate": 4.053061224489796e-05, "loss": 1.2084, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 994, "tokens_per_second_per_gpu": 12.02 }, { "epoch": 0.8115823817292006, "grad_norm": 0.5316084027290344, "learning_rate": 4.057142857142857e-05, "loss": 1.2754, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 995, "tokens_per_second_per_gpu": 6.47 }, { "epoch": 0.8123980424143556, "grad_norm": 0.4593391716480255, "learning_rate": 4.061224489795918e-05, "loss": 1.3306, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 996, "tokens_per_second_per_gpu": 8.77 }, { "epoch": 0.8132137030995106, "grad_norm": 0.6294608116149902, "learning_rate": 4.06530612244898e-05, "loss": 1.3428, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 997, "tokens_per_second_per_gpu": 3.51 }, { "epoch": 0.8140293637846656, "grad_norm": 0.34409117698669434, "learning_rate": 4.069387755102041e-05, "loss": 1.3541, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 998, "tokens_per_second_per_gpu": 15.0 }, { "epoch": 0.8148450244698205, "grad_norm": 0.7564115524291992, "learning_rate": 4.0734693877551025e-05, "loss": 1.3703, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 999, "tokens_per_second_per_gpu": 11.84 }, { "epoch": 0.8156606851549756, "grad_norm": 0.5268298387527466, "learning_rate": 4.0775510204081635e-05, "loss": 1.286, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1000, "tokens_per_second_per_gpu": 13.92 }, { "epoch": 0.8164763458401305, "grad_norm": 0.4760100841522217, "learning_rate": 4.0816326530612245e-05, "loss": 1.3496, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1001, "tokens_per_second_per_gpu": 5.96 }, { "epoch": 0.8172920065252854, "grad_norm": 0.5650264024734497, "learning_rate": 4.085714285714286e-05, "loss": 1.2054, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1002, "tokens_per_second_per_gpu": 7.86 }, { "epoch": 0.8181076672104405, "grad_norm": 0.4145214557647705, "learning_rate": 4.089795918367347e-05, "loss": 1.3415, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1003, "tokens_per_second_per_gpu": 7.16 }, { "epoch": 0.8189233278955954, "grad_norm": 0.4873950779438019, "learning_rate": 4.093877551020409e-05, "loss": 1.344, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1004, "tokens_per_second_per_gpu": 9.05 }, { "epoch": 0.8197389885807504, "grad_norm": 0.5197415947914124, "learning_rate": 4.097959183673469e-05, "loss": 1.3283, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1005, "tokens_per_second_per_gpu": 3.54 }, { "epoch": 0.8205546492659054, "grad_norm": 0.41262155771255493, "learning_rate": 4.102040816326531e-05, "loss": 1.4072, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1006, "tokens_per_second_per_gpu": 14.7 }, { "epoch": 0.8213703099510603, "grad_norm": 0.3153681755065918, "learning_rate": 4.106122448979592e-05, "loss": 1.2536, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1007, "tokens_per_second_per_gpu": 4.17 }, { "epoch": 0.8221859706362153, "grad_norm": 0.5450765490531921, "learning_rate": 4.110204081632653e-05, "loss": 1.3391, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1008, "tokens_per_second_per_gpu": 14.13 }, { "epoch": 0.8230016313213703, "grad_norm": 0.340486615896225, "learning_rate": 4.1142857142857146e-05, "loss": 1.3282, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1009, "tokens_per_second_per_gpu": 10.05 }, { "epoch": 0.8238172920065253, "grad_norm": 0.506267786026001, "learning_rate": 4.1183673469387756e-05, "loss": 1.3872, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1010, "tokens_per_second_per_gpu": 8.03 }, { "epoch": 0.8246329526916802, "grad_norm": 0.47114408016204834, "learning_rate": 4.122448979591837e-05, "loss": 1.3216, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1011, "tokens_per_second_per_gpu": 6.18 }, { "epoch": 0.8254486133768353, "grad_norm": 0.4256442189216614, "learning_rate": 4.126530612244898e-05, "loss": 1.2817, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1012, "tokens_per_second_per_gpu": 8.18 }, { "epoch": 0.8262642740619902, "grad_norm": 0.40575671195983887, "learning_rate": 4.130612244897959e-05, "loss": 1.2816, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1013, "tokens_per_second_per_gpu": 8.54 }, { "epoch": 0.8270799347471451, "grad_norm": 0.6168654561042786, "learning_rate": 4.134693877551021e-05, "loss": 1.3255, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1014, "tokens_per_second_per_gpu": 6.59 }, { "epoch": 0.8278955954323002, "grad_norm": 0.43695923686027527, "learning_rate": 4.138775510204082e-05, "loss": 1.3041, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1015, "tokens_per_second_per_gpu": 6.0 }, { "epoch": 0.8287112561174551, "grad_norm": 0.39579689502716064, "learning_rate": 4.1428571428571437e-05, "loss": 1.2878, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1016, "tokens_per_second_per_gpu": 10.29 }, { "epoch": 0.8295269168026101, "grad_norm": 0.5479215979576111, "learning_rate": 4.146938775510204e-05, "loss": 1.3099, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1017, "tokens_per_second_per_gpu": 5.95 }, { "epoch": 0.8303425774877651, "grad_norm": 1.0281543731689453, "learning_rate": 4.151020408163265e-05, "loss": 1.264, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1018, "tokens_per_second_per_gpu": 4.61 }, { "epoch": 0.8311582381729201, "grad_norm": 0.5730100870132446, "learning_rate": 4.155102040816327e-05, "loss": 1.2632, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1019, "tokens_per_second_per_gpu": 2.12 }, { "epoch": 0.831973898858075, "grad_norm": 0.37109407782554626, "learning_rate": 4.159183673469388e-05, "loss": 1.2665, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1020, "tokens_per_second_per_gpu": 6.77 }, { "epoch": 0.83278955954323, "grad_norm": 0.430551677942276, "learning_rate": 4.1632653061224494e-05, "loss": 1.356, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 1021, "tokens_per_second_per_gpu": 4.46 }, { "epoch": 0.833605220228385, "grad_norm": 0.43125811219215393, "learning_rate": 4.1673469387755104e-05, "loss": 1.4375, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1022, "tokens_per_second_per_gpu": 6.6 }, { "epoch": 0.8344208809135399, "grad_norm": 0.38244137167930603, "learning_rate": 4.1714285714285714e-05, "loss": 1.353, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1023, "tokens_per_second_per_gpu": 13.7 }, { "epoch": 0.835236541598695, "grad_norm": 0.5562241077423096, "learning_rate": 4.175510204081633e-05, "loss": 1.2957, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1024, "tokens_per_second_per_gpu": 6.7 }, { "epoch": 0.8360522022838499, "grad_norm": 0.5760193467140198, "learning_rate": 4.179591836734694e-05, "loss": 1.2866, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1025, "tokens_per_second_per_gpu": 4.78 }, { "epoch": 0.8368678629690048, "grad_norm": 0.38146284222602844, "learning_rate": 4.183673469387756e-05, "loss": 1.336, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1026, "tokens_per_second_per_gpu": 7.87 }, { "epoch": 0.8376835236541599, "grad_norm": 0.3902829885482788, "learning_rate": 4.187755102040817e-05, "loss": 1.3998, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1027, "tokens_per_second_per_gpu": 3.82 }, { "epoch": 0.8384991843393148, "grad_norm": 0.3940015435218811, "learning_rate": 4.191836734693878e-05, "loss": 1.279, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1028, "tokens_per_second_per_gpu": 9.89 }, { "epoch": 0.8393148450244698, "grad_norm": 0.3657895028591156, "learning_rate": 4.195918367346939e-05, "loss": 1.3776, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1029, "tokens_per_second_per_gpu": 3.89 }, { "epoch": 0.8401305057096248, "grad_norm": 0.4838431775569916, "learning_rate": 4.2e-05, "loss": 1.3168, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1030, "tokens_per_second_per_gpu": 7.96 }, { "epoch": 0.8409461663947798, "grad_norm": 0.3462434709072113, "learning_rate": 4.2040816326530615e-05, "loss": 1.321, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1031, "tokens_per_second_per_gpu": 19.93 }, { "epoch": 0.8417618270799347, "grad_norm": 0.5766960978507996, "learning_rate": 4.2081632653061225e-05, "loss": 1.3366, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 1032, "tokens_per_second_per_gpu": 11.0 }, { "epoch": 0.8425774877650897, "grad_norm": 0.32347503304481506, "learning_rate": 4.212244897959184e-05, "loss": 1.3353, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1033, "tokens_per_second_per_gpu": 5.96 }, { "epoch": 0.8433931484502447, "grad_norm": 0.4079866409301758, "learning_rate": 4.216326530612245e-05, "loss": 1.3652, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1034, "tokens_per_second_per_gpu": 10.39 }, { "epoch": 0.8442088091353996, "grad_norm": 0.48587819933891296, "learning_rate": 4.220408163265306e-05, "loss": 1.2356, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1035, "tokens_per_second_per_gpu": 4.91 }, { "epoch": 0.8450244698205547, "grad_norm": 0.5691571831703186, "learning_rate": 4.224489795918368e-05, "loss": 1.3142, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1036, "tokens_per_second_per_gpu": 8.09 }, { "epoch": 0.8458401305057096, "grad_norm": 0.39193639159202576, "learning_rate": 4.228571428571429e-05, "loss": 1.3398, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1037, "tokens_per_second_per_gpu": 5.47 }, { "epoch": 0.8466557911908646, "grad_norm": 0.5714961886405945, "learning_rate": 4.2326530612244905e-05, "loss": 1.3922, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1038, "tokens_per_second_per_gpu": 3.57 }, { "epoch": 0.8474714518760196, "grad_norm": 0.7239679098129272, "learning_rate": 4.236734693877551e-05, "loss": 1.3174, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1039, "tokens_per_second_per_gpu": 6.84 }, { "epoch": 0.8482871125611745, "grad_norm": 0.48588961362838745, "learning_rate": 4.2408163265306125e-05, "loss": 1.3273, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1040, "tokens_per_second_per_gpu": 6.98 }, { "epoch": 0.8491027732463295, "grad_norm": 0.43014103174209595, "learning_rate": 4.2448979591836735e-05, "loss": 1.2843, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1041, "tokens_per_second_per_gpu": 13.16 }, { "epoch": 0.8499184339314845, "grad_norm": 0.39624840021133423, "learning_rate": 4.2489795918367345e-05, "loss": 1.3317, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1042, "tokens_per_second_per_gpu": 9.99 }, { "epoch": 0.8507340946166395, "grad_norm": 0.4508655071258545, "learning_rate": 4.253061224489796e-05, "loss": 1.376, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1043, "tokens_per_second_per_gpu": 3.84 }, { "epoch": 0.8515497553017944, "grad_norm": 0.4512145221233368, "learning_rate": 4.257142857142857e-05, "loss": 1.2832, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1044, "tokens_per_second_per_gpu": 8.46 }, { "epoch": 0.8523654159869495, "grad_norm": 0.5072230696678162, "learning_rate": 4.261224489795919e-05, "loss": 1.2907, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1045, "tokens_per_second_per_gpu": 9.99 }, { "epoch": 0.8531810766721044, "grad_norm": 0.7547808885574341, "learning_rate": 4.26530612244898e-05, "loss": 1.2447, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1046, "tokens_per_second_per_gpu": 5.39 }, { "epoch": 0.8539967373572593, "grad_norm": 0.5557453632354736, "learning_rate": 4.269387755102041e-05, "loss": 1.3018, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1047, "tokens_per_second_per_gpu": 13.87 }, { "epoch": 0.8548123980424144, "grad_norm": 0.48283571004867554, "learning_rate": 4.2734693877551026e-05, "loss": 1.2794, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 1048, "tokens_per_second_per_gpu": 6.04 }, { "epoch": 0.8556280587275693, "grad_norm": 0.4384002983570099, "learning_rate": 4.2775510204081636e-05, "loss": 1.3353, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1049, "tokens_per_second_per_gpu": 9.81 }, { "epoch": 0.8564437194127243, "grad_norm": 0.42990362644195557, "learning_rate": 4.281632653061225e-05, "loss": 1.3382, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1050, "tokens_per_second_per_gpu": 12.54 }, { "epoch": 0.8572593800978793, "grad_norm": 0.8159483671188354, "learning_rate": 4.2857142857142856e-05, "loss": 1.4071, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1051, "tokens_per_second_per_gpu": 9.93 }, { "epoch": 0.8580750407830342, "grad_norm": 0.3914187550544739, "learning_rate": 4.2897959183673466e-05, "loss": 1.2962, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1052, "tokens_per_second_per_gpu": 8.62 }, { "epoch": 0.8588907014681892, "grad_norm": 0.6576930284500122, "learning_rate": 4.293877551020408e-05, "loss": 1.2818, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1053, "tokens_per_second_per_gpu": 3.82 }, { "epoch": 0.8597063621533442, "grad_norm": 0.4391036927700043, "learning_rate": 4.297959183673469e-05, "loss": 1.2125, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1054, "tokens_per_second_per_gpu": 7.22 }, { "epoch": 0.8605220228384992, "grad_norm": 0.388612300157547, "learning_rate": 4.302040816326531e-05, "loss": 1.2454, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1055, "tokens_per_second_per_gpu": 12.28 }, { "epoch": 0.8613376835236541, "grad_norm": 0.556125283241272, "learning_rate": 4.306122448979592e-05, "loss": 1.3655, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 1056, "tokens_per_second_per_gpu": 14.29 }, { "epoch": 0.8621533442088092, "grad_norm": 0.38084378838539124, "learning_rate": 4.310204081632653e-05, "loss": 1.3196, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 1057, "tokens_per_second_per_gpu": 18.19 }, { "epoch": 0.8629690048939641, "grad_norm": 0.5020302534103394, "learning_rate": 4.314285714285715e-05, "loss": 1.2531, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1058, "tokens_per_second_per_gpu": 8.26 }, { "epoch": 0.863784665579119, "grad_norm": 0.542355477809906, "learning_rate": 4.318367346938776e-05, "loss": 1.3138, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1059, "tokens_per_second_per_gpu": 14.1 }, { "epoch": 0.8646003262642741, "grad_norm": 0.3780638873577118, "learning_rate": 4.3224489795918374e-05, "loss": 1.1816, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1060, "tokens_per_second_per_gpu": 12.05 }, { "epoch": 0.865415986949429, "grad_norm": 0.44213151931762695, "learning_rate": 4.3265306122448984e-05, "loss": 1.3575, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1061, "tokens_per_second_per_gpu": 9.08 }, { "epoch": 0.866231647634584, "grad_norm": 0.37552398443222046, "learning_rate": 4.3306122448979594e-05, "loss": 1.3594, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1062, "tokens_per_second_per_gpu": 14.93 }, { "epoch": 0.867047308319739, "grad_norm": 0.35384249687194824, "learning_rate": 4.3346938775510204e-05, "loss": 1.3571, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1063, "tokens_per_second_per_gpu": 7.67 }, { "epoch": 0.867862969004894, "grad_norm": 0.4753800630569458, "learning_rate": 4.3387755102040814e-05, "loss": 1.2825, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1064, "tokens_per_second_per_gpu": 9.99 }, { "epoch": 0.8686786296900489, "grad_norm": 0.3003656566143036, "learning_rate": 4.342857142857143e-05, "loss": 1.2362, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1065, "tokens_per_second_per_gpu": 4.38 }, { "epoch": 0.8694942903752039, "grad_norm": 0.4250010848045349, "learning_rate": 4.346938775510204e-05, "loss": 1.3443, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1066, "tokens_per_second_per_gpu": 6.95 }, { "epoch": 0.8703099510603589, "grad_norm": 0.5738376975059509, "learning_rate": 4.351020408163266e-05, "loss": 1.2364, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1067, "tokens_per_second_per_gpu": 9.82 }, { "epoch": 0.8711256117455138, "grad_norm": 0.4871150851249695, "learning_rate": 4.355102040816327e-05, "loss": 1.3149, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1068, "tokens_per_second_per_gpu": 14.72 }, { "epoch": 0.8719412724306689, "grad_norm": 0.4603250026702881, "learning_rate": 4.359183673469388e-05, "loss": 1.4116, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1069, "tokens_per_second_per_gpu": 8.14 }, { "epoch": 0.8727569331158238, "grad_norm": 0.6057417392730713, "learning_rate": 4.3632653061224495e-05, "loss": 1.3591, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1070, "tokens_per_second_per_gpu": 15.96 }, { "epoch": 0.8735725938009788, "grad_norm": 0.34318047761917114, "learning_rate": 4.3673469387755105e-05, "loss": 1.2452, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1071, "tokens_per_second_per_gpu": 15.41 }, { "epoch": 0.8743882544861338, "grad_norm": 0.4649723768234253, "learning_rate": 4.371428571428572e-05, "loss": 1.3109, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1072, "tokens_per_second_per_gpu": 6.53 }, { "epoch": 0.8752039151712887, "grad_norm": 0.4578392803668976, "learning_rate": 4.3755102040816325e-05, "loss": 1.3161, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1073, "tokens_per_second_per_gpu": 8.47 }, { "epoch": 0.8760195758564437, "grad_norm": 0.3798984885215759, "learning_rate": 4.379591836734694e-05, "loss": 1.3074, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1074, "tokens_per_second_per_gpu": 7.6 }, { "epoch": 0.8768352365415987, "grad_norm": 0.3398929536342621, "learning_rate": 4.383673469387755e-05, "loss": 1.2308, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1075, "tokens_per_second_per_gpu": 13.47 }, { "epoch": 0.8776508972267537, "grad_norm": 0.6895826458930969, "learning_rate": 4.387755102040816e-05, "loss": 1.3348, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1076, "tokens_per_second_per_gpu": 6.39 }, { "epoch": 0.8784665579119086, "grad_norm": 0.27006521821022034, "learning_rate": 4.391836734693878e-05, "loss": 1.3061, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1077, "tokens_per_second_per_gpu": 20.4 }, { "epoch": 0.8792822185970636, "grad_norm": 0.5188771486282349, "learning_rate": 4.395918367346939e-05, "loss": 1.3364, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1078, "tokens_per_second_per_gpu": 11.1 }, { "epoch": 0.8800978792822186, "grad_norm": 0.39349329471588135, "learning_rate": 4.4000000000000006e-05, "loss": 1.3703, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1079, "tokens_per_second_per_gpu": 5.54 }, { "epoch": 0.8809135399673735, "grad_norm": 0.3790401220321655, "learning_rate": 4.4040816326530616e-05, "loss": 1.196, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1080, "tokens_per_second_per_gpu": 9.82 }, { "epoch": 0.8817292006525286, "grad_norm": 0.611820638179779, "learning_rate": 4.4081632653061226e-05, "loss": 1.2368, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1081, "tokens_per_second_per_gpu": 7.57 }, { "epoch": 0.8825448613376835, "grad_norm": 0.5779531598091125, "learning_rate": 4.412244897959184e-05, "loss": 1.3924, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1082, "tokens_per_second_per_gpu": 2.18 }, { "epoch": 0.8833605220228385, "grad_norm": 0.4981412887573242, "learning_rate": 4.416326530612245e-05, "loss": 1.2606, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1083, "tokens_per_second_per_gpu": 6.53 }, { "epoch": 0.8841761827079935, "grad_norm": 0.3874598741531372, "learning_rate": 4.420408163265306e-05, "loss": 1.2995, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1084, "tokens_per_second_per_gpu": 5.1 }, { "epoch": 0.8849918433931484, "grad_norm": 0.35845983028411865, "learning_rate": 4.424489795918367e-05, "loss": 1.2994, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1085, "tokens_per_second_per_gpu": 5.21 }, { "epoch": 0.8858075040783034, "grad_norm": 0.4354231357574463, "learning_rate": 4.428571428571428e-05, "loss": 1.4178, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1086, "tokens_per_second_per_gpu": 8.78 }, { "epoch": 0.8866231647634584, "grad_norm": 0.7199257612228394, "learning_rate": 4.43265306122449e-05, "loss": 1.3284, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1087, "tokens_per_second_per_gpu": 3.58 }, { "epoch": 0.8874388254486134, "grad_norm": 0.5678147077560425, "learning_rate": 4.436734693877551e-05, "loss": 1.286, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1088, "tokens_per_second_per_gpu": 4.8 }, { "epoch": 0.8882544861337683, "grad_norm": 0.47715088725090027, "learning_rate": 4.4408163265306127e-05, "loss": 1.2736, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1089, "tokens_per_second_per_gpu": 5.4 }, { "epoch": 0.8890701468189234, "grad_norm": 0.3845560848712921, "learning_rate": 4.4448979591836737e-05, "loss": 1.2702, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1090, "tokens_per_second_per_gpu": 9.95 }, { "epoch": 0.8898858075040783, "grad_norm": 0.5711014866828918, "learning_rate": 4.448979591836735e-05, "loss": 1.3309, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 1091, "tokens_per_second_per_gpu": 5.14 }, { "epoch": 0.8907014681892332, "grad_norm": 0.60215163230896, "learning_rate": 4.4530612244897963e-05, "loss": 1.2365, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1092, "tokens_per_second_per_gpu": 2.8 }, { "epoch": 0.8915171288743883, "grad_norm": 7.685591697692871, "learning_rate": 4.4571428571428574e-05, "loss": 1.3495, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1093, "tokens_per_second_per_gpu": 11.58 }, { "epoch": 0.8923327895595432, "grad_norm": 0.7159709930419922, "learning_rate": 4.461224489795919e-05, "loss": 1.3728, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1094, "tokens_per_second_per_gpu": 1.2 }, { "epoch": 0.8931484502446982, "grad_norm": 0.3748353123664856, "learning_rate": 4.46530612244898e-05, "loss": 1.2593, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1095, "tokens_per_second_per_gpu": 10.47 }, { "epoch": 0.8939641109298532, "grad_norm": 0.6031535267829895, "learning_rate": 4.469387755102041e-05, "loss": 1.3885, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1096, "tokens_per_second_per_gpu": 13.65 }, { "epoch": 0.8947797716150081, "grad_norm": 0.7521376013755798, "learning_rate": 4.473469387755102e-05, "loss": 1.341, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1097, "tokens_per_second_per_gpu": 6.36 }, { "epoch": 0.8955954323001631, "grad_norm": 0.4228546619415283, "learning_rate": 4.477551020408163e-05, "loss": 1.2819, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1098, "tokens_per_second_per_gpu": 6.88 }, { "epoch": 0.8964110929853181, "grad_norm": 0.32853129506111145, "learning_rate": 4.481632653061225e-05, "loss": 1.3743, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1099, "tokens_per_second_per_gpu": 5.87 }, { "epoch": 0.8972267536704731, "grad_norm": 0.6164416074752808, "learning_rate": 4.485714285714286e-05, "loss": 1.2504, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1100, "tokens_per_second_per_gpu": 1.87 }, { "epoch": 0.898042414355628, "grad_norm": 0.40029454231262207, "learning_rate": 4.4897959183673474e-05, "loss": 1.3728, "memory/device_reserved (GiB)": 76.33, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1101, "tokens_per_second_per_gpu": 11.55 }, { "epoch": 0.8988580750407831, "grad_norm": 0.36920610070228577, "learning_rate": 4.4938775510204084e-05, "loss": 1.358, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1102, "tokens_per_second_per_gpu": 11.86 }, { "epoch": 0.899673735725938, "grad_norm": 0.24916811287403107, "learning_rate": 4.4979591836734694e-05, "loss": 1.2994, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1103, "tokens_per_second_per_gpu": 9.07 }, { "epoch": 0.9004893964110929, "grad_norm": 0.8018535375595093, "learning_rate": 4.502040816326531e-05, "loss": 1.3825, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1104, "tokens_per_second_per_gpu": 7.5 }, { "epoch": 0.901305057096248, "grad_norm": 0.4488465487957001, "learning_rate": 4.506122448979592e-05, "loss": 1.2897, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1105, "tokens_per_second_per_gpu": 10.29 }, { "epoch": 0.9021207177814029, "grad_norm": 0.8278684616088867, "learning_rate": 4.510204081632654e-05, "loss": 1.1767, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1106, "tokens_per_second_per_gpu": 8.31 }, { "epoch": 0.9029363784665579, "grad_norm": 0.365783154964447, "learning_rate": 4.514285714285714e-05, "loss": 1.3577, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1107, "tokens_per_second_per_gpu": 10.15 }, { "epoch": 0.9037520391517129, "grad_norm": 0.5361992120742798, "learning_rate": 4.518367346938776e-05, "loss": 1.3809, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1108, "tokens_per_second_per_gpu": 5.74 }, { "epoch": 0.9045676998368679, "grad_norm": 0.4637894928455353, "learning_rate": 4.522448979591837e-05, "loss": 1.2779, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1109, "tokens_per_second_per_gpu": 12.23 }, { "epoch": 0.9053833605220228, "grad_norm": 0.655604898929596, "learning_rate": 4.526530612244898e-05, "loss": 1.2426, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1110, "tokens_per_second_per_gpu": 16.97 }, { "epoch": 0.9061990212071778, "grad_norm": 0.3523111343383789, "learning_rate": 4.5306122448979595e-05, "loss": 1.3186, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1111, "tokens_per_second_per_gpu": 10.35 }, { "epoch": 0.9070146818923328, "grad_norm": 0.47860756516456604, "learning_rate": 4.5346938775510205e-05, "loss": 1.2544, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1112, "tokens_per_second_per_gpu": 4.81 }, { "epoch": 0.9078303425774877, "grad_norm": 0.6440806984901428, "learning_rate": 4.538775510204082e-05, "loss": 1.2541, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1113, "tokens_per_second_per_gpu": 5.73 }, { "epoch": 0.9086460032626428, "grad_norm": 0.37379932403564453, "learning_rate": 4.542857142857143e-05, "loss": 1.2552, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1114, "tokens_per_second_per_gpu": 7.0 }, { "epoch": 0.9094616639477977, "grad_norm": 0.3786577582359314, "learning_rate": 4.546938775510204e-05, "loss": 1.3924, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1115, "tokens_per_second_per_gpu": 4.64 }, { "epoch": 0.9102773246329527, "grad_norm": 0.40136513113975525, "learning_rate": 4.551020408163266e-05, "loss": 1.3733, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1116, "tokens_per_second_per_gpu": 10.43 }, { "epoch": 0.9110929853181077, "grad_norm": 0.36410951614379883, "learning_rate": 4.555102040816327e-05, "loss": 1.1914, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1117, "tokens_per_second_per_gpu": 9.88 }, { "epoch": 0.9119086460032626, "grad_norm": 0.5245070457458496, "learning_rate": 4.559183673469388e-05, "loss": 1.2788, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1118, "tokens_per_second_per_gpu": 8.86 }, { "epoch": 0.9127243066884176, "grad_norm": 0.5090014934539795, "learning_rate": 4.563265306122449e-05, "loss": 1.4094, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1119, "tokens_per_second_per_gpu": 8.63 }, { "epoch": 0.9135399673735726, "grad_norm": 0.4999030530452728, "learning_rate": 4.56734693877551e-05, "loss": 1.3388, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.26, "memory/max_allocated (GiB)": 72.26, "step": 1120, "tokens_per_second_per_gpu": 6.62 }, { "epoch": 0.9143556280587276, "grad_norm": 0.5901906490325928, "learning_rate": 4.5714285714285716e-05, "loss": 1.2465, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1121, "tokens_per_second_per_gpu": 2.46 }, { "epoch": 0.9151712887438825, "grad_norm": 0.40685513615608215, "learning_rate": 4.5755102040816326e-05, "loss": 1.2769, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1122, "tokens_per_second_per_gpu": 9.41 }, { "epoch": 0.9159869494290375, "grad_norm": 0.6379269361495972, "learning_rate": 4.579591836734694e-05, "loss": 1.2526, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1123, "tokens_per_second_per_gpu": 2.95 }, { "epoch": 0.9168026101141925, "grad_norm": 0.45612746477127075, "learning_rate": 4.583673469387755e-05, "loss": 1.2684, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1124, "tokens_per_second_per_gpu": 8.47 }, { "epoch": 0.9176182707993474, "grad_norm": 0.5010482668876648, "learning_rate": 4.587755102040816e-05, "loss": 1.364, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1125, "tokens_per_second_per_gpu": 6.51 }, { "epoch": 0.9184339314845025, "grad_norm": 0.6479950547218323, "learning_rate": 4.591836734693878e-05, "loss": 1.3467, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1126, "tokens_per_second_per_gpu": 8.11 }, { "epoch": 0.9192495921696574, "grad_norm": 0.38675108551979065, "learning_rate": 4.595918367346939e-05, "loss": 1.2775, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1127, "tokens_per_second_per_gpu": 5.88 }, { "epoch": 0.9200652528548124, "grad_norm": 0.5974453687667847, "learning_rate": 4.600000000000001e-05, "loss": 1.3756, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1128, "tokens_per_second_per_gpu": 7.58 }, { "epoch": 0.9208809135399674, "grad_norm": 0.5130681991577148, "learning_rate": 4.604081632653062e-05, "loss": 1.3563, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1129, "tokens_per_second_per_gpu": 5.19 }, { "epoch": 0.9216965742251223, "grad_norm": 0.32518240809440613, "learning_rate": 4.608163265306123e-05, "loss": 1.3907, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1130, "tokens_per_second_per_gpu": 6.98 }, { "epoch": 0.9225122349102773, "grad_norm": 0.49000614881515503, "learning_rate": 4.612244897959184e-05, "loss": 1.3344, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1131, "tokens_per_second_per_gpu": 11.14 }, { "epoch": 0.9233278955954323, "grad_norm": 0.4624132215976715, "learning_rate": 4.616326530612245e-05, "loss": 1.4113, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1132, "tokens_per_second_per_gpu": 3.34 }, { "epoch": 0.9241435562805873, "grad_norm": 0.48015737533569336, "learning_rate": 4.6204081632653064e-05, "loss": 1.2822, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1133, "tokens_per_second_per_gpu": 4.65 }, { "epoch": 0.9249592169657422, "grad_norm": 0.4235599637031555, "learning_rate": 4.6244897959183674e-05, "loss": 1.2975, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1134, "tokens_per_second_per_gpu": 5.98 }, { "epoch": 0.9257748776508973, "grad_norm": 0.6076764464378357, "learning_rate": 4.628571428571429e-05, "loss": 1.3331, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1135, "tokens_per_second_per_gpu": 2.24 }, { "epoch": 0.9265905383360522, "grad_norm": 0.6243035793304443, "learning_rate": 4.63265306122449e-05, "loss": 1.329, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1136, "tokens_per_second_per_gpu": 10.75 }, { "epoch": 0.9274061990212071, "grad_norm": 0.3529213070869446, "learning_rate": 4.636734693877551e-05, "loss": 1.2729, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1137, "tokens_per_second_per_gpu": 7.45 }, { "epoch": 0.9282218597063622, "grad_norm": 0.3894992768764496, "learning_rate": 4.640816326530613e-05, "loss": 1.3969, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1138, "tokens_per_second_per_gpu": 9.14 }, { "epoch": 0.9290375203915171, "grad_norm": 0.5815197229385376, "learning_rate": 4.644897959183674e-05, "loss": 1.2904, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1139, "tokens_per_second_per_gpu": 4.59 }, { "epoch": 0.9298531810766721, "grad_norm": 0.5705400705337524, "learning_rate": 4.6489795918367355e-05, "loss": 1.4123, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1140, "tokens_per_second_per_gpu": 9.41 }, { "epoch": 0.9306688417618271, "grad_norm": 0.4439522325992584, "learning_rate": 4.653061224489796e-05, "loss": 1.3295, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1141, "tokens_per_second_per_gpu": 15.2 }, { "epoch": 0.9314845024469821, "grad_norm": 0.32019245624542236, "learning_rate": 4.6571428571428575e-05, "loss": 1.3432, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1142, "tokens_per_second_per_gpu": 10.65 }, { "epoch": 0.932300163132137, "grad_norm": 0.34472987055778503, "learning_rate": 4.6612244897959185e-05, "loss": 1.3561, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1143, "tokens_per_second_per_gpu": 15.22 }, { "epoch": 0.933115823817292, "grad_norm": 0.40536803007125854, "learning_rate": 4.6653061224489795e-05, "loss": 1.233, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1144, "tokens_per_second_per_gpu": 9.65 }, { "epoch": 0.933931484502447, "grad_norm": 0.5962960124015808, "learning_rate": 4.669387755102041e-05, "loss": 1.3529, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1145, "tokens_per_second_per_gpu": 13.56 }, { "epoch": 0.9347471451876019, "grad_norm": 0.3231654763221741, "learning_rate": 4.673469387755102e-05, "loss": 1.2267, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1146, "tokens_per_second_per_gpu": 3.08 }, { "epoch": 0.935562805872757, "grad_norm": 0.4227011799812317, "learning_rate": 4.677551020408163e-05, "loss": 1.4218, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1147, "tokens_per_second_per_gpu": 11.2 }, { "epoch": 0.9363784665579119, "grad_norm": 0.4715719521045685, "learning_rate": 4.681632653061225e-05, "loss": 1.3482, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1148, "tokens_per_second_per_gpu": 6.06 }, { "epoch": 0.9371941272430668, "grad_norm": 0.4200468361377716, "learning_rate": 4.685714285714286e-05, "loss": 1.2589, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1149, "tokens_per_second_per_gpu": 6.52 }, { "epoch": 0.9380097879282219, "grad_norm": 0.43880757689476013, "learning_rate": 4.6897959183673475e-05, "loss": 1.1841, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1150, "tokens_per_second_per_gpu": 14.29 }, { "epoch": 0.9388254486133768, "grad_norm": 0.5573946833610535, "learning_rate": 4.6938775510204086e-05, "loss": 1.2692, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1151, "tokens_per_second_per_gpu": 8.38 }, { "epoch": 0.9396411092985318, "grad_norm": 0.47034022212028503, "learning_rate": 4.6979591836734696e-05, "loss": 1.3859, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1152, "tokens_per_second_per_gpu": 7.18 }, { "epoch": 0.9404567699836868, "grad_norm": 0.37012237310409546, "learning_rate": 4.7020408163265306e-05, "loss": 1.431, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1153, "tokens_per_second_per_gpu": 8.89 }, { "epoch": 0.9412724306688418, "grad_norm": 0.5028272271156311, "learning_rate": 4.7061224489795916e-05, "loss": 1.2695, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1154, "tokens_per_second_per_gpu": 14.54 }, { "epoch": 0.9420880913539967, "grad_norm": 0.6255446672439575, "learning_rate": 4.710204081632653e-05, "loss": 1.3307, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1155, "tokens_per_second_per_gpu": 4.61 }, { "epoch": 0.9429037520391517, "grad_norm": 0.3949779272079468, "learning_rate": 4.714285714285714e-05, "loss": 1.2652, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1156, "tokens_per_second_per_gpu": 8.35 }, { "epoch": 0.9437194127243067, "grad_norm": 0.4853973090648651, "learning_rate": 4.718367346938776e-05, "loss": 1.4506, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1157, "tokens_per_second_per_gpu": 7.17 }, { "epoch": 0.9445350734094616, "grad_norm": 0.3840438425540924, "learning_rate": 4.722448979591837e-05, "loss": 1.3648, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 1158, "tokens_per_second_per_gpu": 15.6 }, { "epoch": 0.9453507340946167, "grad_norm": 0.44253045320510864, "learning_rate": 4.726530612244898e-05, "loss": 1.2522, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1159, "tokens_per_second_per_gpu": 4.93 }, { "epoch": 0.9461663947797716, "grad_norm": 0.4289775490760803, "learning_rate": 4.7306122448979596e-05, "loss": 1.2941, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1160, "tokens_per_second_per_gpu": 13.01 }, { "epoch": 0.9469820554649266, "grad_norm": 0.4984445571899414, "learning_rate": 4.7346938775510206e-05, "loss": 1.2072, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1161, "tokens_per_second_per_gpu": 4.41 }, { "epoch": 0.9477977161500816, "grad_norm": 0.3533351421356201, "learning_rate": 4.738775510204082e-05, "loss": 1.3237, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1162, "tokens_per_second_per_gpu": 7.84 }, { "epoch": 0.9486133768352365, "grad_norm": 0.42262932658195496, "learning_rate": 4.742857142857143e-05, "loss": 1.3568, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1163, "tokens_per_second_per_gpu": 7.74 }, { "epoch": 0.9494290375203915, "grad_norm": 0.4230494201183319, "learning_rate": 4.746938775510204e-05, "loss": 1.2756, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1164, "tokens_per_second_per_gpu": 9.33 }, { "epoch": 0.9502446982055465, "grad_norm": 0.48817190527915955, "learning_rate": 4.7510204081632653e-05, "loss": 1.235, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1165, "tokens_per_second_per_gpu": 5.44 }, { "epoch": 0.9510603588907015, "grad_norm": 0.40539461374282837, "learning_rate": 4.7551020408163263e-05, "loss": 1.3063, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1166, "tokens_per_second_per_gpu": 5.01 }, { "epoch": 0.9518760195758564, "grad_norm": 0.27489614486694336, "learning_rate": 4.759183673469388e-05, "loss": 1.2322, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1167, "tokens_per_second_per_gpu": 8.15 }, { "epoch": 0.9526916802610114, "grad_norm": 0.547562301158905, "learning_rate": 4.763265306122449e-05, "loss": 1.3257, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1168, "tokens_per_second_per_gpu": 7.42 }, { "epoch": 0.9535073409461664, "grad_norm": 0.4995080232620239, "learning_rate": 4.767346938775511e-05, "loss": 1.3786, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1169, "tokens_per_second_per_gpu": 5.27 }, { "epoch": 0.9543230016313213, "grad_norm": 0.6357192397117615, "learning_rate": 4.771428571428572e-05, "loss": 1.3625, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1170, "tokens_per_second_per_gpu": 11.41 }, { "epoch": 0.9551386623164764, "grad_norm": 0.45863306522369385, "learning_rate": 4.775510204081633e-05, "loss": 1.4596, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1171, "tokens_per_second_per_gpu": 7.3 }, { "epoch": 0.9559543230016313, "grad_norm": 0.34837499260902405, "learning_rate": 4.7795918367346944e-05, "loss": 1.3964, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1172, "tokens_per_second_per_gpu": 10.99 }, { "epoch": 0.9567699836867863, "grad_norm": 0.5038471221923828, "learning_rate": 4.7836734693877554e-05, "loss": 1.3094, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1173, "tokens_per_second_per_gpu": 5.89 }, { "epoch": 0.9575856443719413, "grad_norm": 0.3896941542625427, "learning_rate": 4.787755102040817e-05, "loss": 1.3016, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1174, "tokens_per_second_per_gpu": 8.47 }, { "epoch": 0.9584013050570962, "grad_norm": 0.45611822605133057, "learning_rate": 4.7918367346938774e-05, "loss": 1.2895, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1175, "tokens_per_second_per_gpu": 6.38 }, { "epoch": 0.9592169657422512, "grad_norm": 0.5494994521141052, "learning_rate": 4.795918367346939e-05, "loss": 1.2521, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1176, "tokens_per_second_per_gpu": 9.59 }, { "epoch": 0.9600326264274062, "grad_norm": 0.2670064866542816, "learning_rate": 4.8e-05, "loss": 1.1895, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1177, "tokens_per_second_per_gpu": 10.35 }, { "epoch": 0.9608482871125612, "grad_norm": 0.45179662108421326, "learning_rate": 4.804081632653061e-05, "loss": 1.3308, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1178, "tokens_per_second_per_gpu": 3.83 }, { "epoch": 0.9616639477977161, "grad_norm": 0.5642263889312744, "learning_rate": 4.808163265306123e-05, "loss": 1.3273, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1179, "tokens_per_second_per_gpu": 7.82 }, { "epoch": 0.9624796084828712, "grad_norm": 0.4388781785964966, "learning_rate": 4.812244897959184e-05, "loss": 1.3017, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1180, "tokens_per_second_per_gpu": 7.24 }, { "epoch": 0.9632952691680261, "grad_norm": 0.45621219277381897, "learning_rate": 4.816326530612245e-05, "loss": 1.3069, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1181, "tokens_per_second_per_gpu": 7.77 }, { "epoch": 0.964110929853181, "grad_norm": 0.28024399280548096, "learning_rate": 4.8204081632653065e-05, "loss": 1.2926, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1182, "tokens_per_second_per_gpu": 7.96 }, { "epoch": 0.9649265905383361, "grad_norm": 0.41220977902412415, "learning_rate": 4.8244897959183675e-05, "loss": 1.2598, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1183, "tokens_per_second_per_gpu": 12.23 }, { "epoch": 0.965742251223491, "grad_norm": 0.5750587582588196, "learning_rate": 4.828571428571429e-05, "loss": 1.4155, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1184, "tokens_per_second_per_gpu": 5.91 }, { "epoch": 0.966557911908646, "grad_norm": 0.3446974456310272, "learning_rate": 4.83265306122449e-05, "loss": 1.1875, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1185, "tokens_per_second_per_gpu": 8.98 }, { "epoch": 0.967373572593801, "grad_norm": 0.4283144176006317, "learning_rate": 4.836734693877551e-05, "loss": 1.2404, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1186, "tokens_per_second_per_gpu": 7.41 }, { "epoch": 0.968189233278956, "grad_norm": 0.34101441502571106, "learning_rate": 4.840816326530612e-05, "loss": 1.2582, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1187, "tokens_per_second_per_gpu": 6.95 }, { "epoch": 0.9690048939641109, "grad_norm": 0.5429081916809082, "learning_rate": 4.844897959183673e-05, "loss": 1.3498, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1188, "tokens_per_second_per_gpu": 11.31 }, { "epoch": 0.9698205546492659, "grad_norm": 0.36490005254745483, "learning_rate": 4.848979591836735e-05, "loss": 1.2937, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1189, "tokens_per_second_per_gpu": 11.64 }, { "epoch": 0.9706362153344209, "grad_norm": 0.37057173252105713, "learning_rate": 4.853061224489796e-05, "loss": 1.1944, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1190, "tokens_per_second_per_gpu": 11.68 }, { "epoch": 0.9714518760195758, "grad_norm": 0.3562639057636261, "learning_rate": 4.8571428571428576e-05, "loss": 1.2514, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.31, "memory/max_allocated (GiB)": 72.31, "step": 1191, "tokens_per_second_per_gpu": 16.54 }, { "epoch": 0.9722675367047309, "grad_norm": 0.49075958132743835, "learning_rate": 4.8612244897959186e-05, "loss": 1.3449, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1192, "tokens_per_second_per_gpu": 14.03 }, { "epoch": 0.9730831973898858, "grad_norm": 0.5436855554580688, "learning_rate": 4.8653061224489796e-05, "loss": 1.3744, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1193, "tokens_per_second_per_gpu": 4.35 }, { "epoch": 0.9738988580750407, "grad_norm": 0.6498190760612488, "learning_rate": 4.869387755102041e-05, "loss": 1.236, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1194, "tokens_per_second_per_gpu": 7.09 }, { "epoch": 0.9747145187601958, "grad_norm": 0.451345294713974, "learning_rate": 4.873469387755102e-05, "loss": 1.3074, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1195, "tokens_per_second_per_gpu": 7.45 }, { "epoch": 0.9755301794453507, "grad_norm": 0.3254075050354004, "learning_rate": 4.877551020408164e-05, "loss": 1.3449, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1196, "tokens_per_second_per_gpu": 7.54 }, { "epoch": 0.9763458401305057, "grad_norm": 0.40079668164253235, "learning_rate": 4.881632653061225e-05, "loss": 1.3405, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1197, "tokens_per_second_per_gpu": 8.76 }, { "epoch": 0.9771615008156607, "grad_norm": 0.7156198024749756, "learning_rate": 4.885714285714286e-05, "loss": 1.1788, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1198, "tokens_per_second_per_gpu": 7.93 }, { "epoch": 0.9779771615008157, "grad_norm": 0.5835684537887573, "learning_rate": 4.889795918367347e-05, "loss": 1.2702, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1199, "tokens_per_second_per_gpu": 12.05 }, { "epoch": 0.9787928221859706, "grad_norm": 0.4627419710159302, "learning_rate": 4.893877551020408e-05, "loss": 1.2317, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1200, "tokens_per_second_per_gpu": 7.45 }, { "epoch": 0.9796084828711256, "grad_norm": 0.6123296022415161, "learning_rate": 4.89795918367347e-05, "loss": 1.297, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1201, "tokens_per_second_per_gpu": 3.01 }, { "epoch": 0.9804241435562806, "grad_norm": 0.46367084980010986, "learning_rate": 4.902040816326531e-05, "loss": 1.3607, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1202, "tokens_per_second_per_gpu": 11.44 }, { "epoch": 0.9812398042414355, "grad_norm": 0.45110562443733215, "learning_rate": 4.9061224489795924e-05, "loss": 1.2799, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1203, "tokens_per_second_per_gpu": 11.36 }, { "epoch": 0.9820554649265906, "grad_norm": 0.4775084853172302, "learning_rate": 4.9102040816326534e-05, "loss": 1.3764, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1204, "tokens_per_second_per_gpu": 4.87 }, { "epoch": 0.9828711256117455, "grad_norm": 0.31291982531547546, "learning_rate": 4.9142857142857144e-05, "loss": 1.2364, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1205, "tokens_per_second_per_gpu": 13.19 }, { "epoch": 0.9836867862969005, "grad_norm": 0.423784077167511, "learning_rate": 4.918367346938776e-05, "loss": 1.2231, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1206, "tokens_per_second_per_gpu": 7.63 }, { "epoch": 0.9845024469820555, "grad_norm": 0.3906443119049072, "learning_rate": 4.922448979591837e-05, "loss": 1.3287, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1207, "tokens_per_second_per_gpu": 10.95 }, { "epoch": 0.9853181076672104, "grad_norm": 0.43709951639175415, "learning_rate": 4.926530612244899e-05, "loss": 1.3852, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1208, "tokens_per_second_per_gpu": 15.63 }, { "epoch": 0.9861337683523654, "grad_norm": 0.49447256326675415, "learning_rate": 4.930612244897959e-05, "loss": 1.3021, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1209, "tokens_per_second_per_gpu": 9.93 }, { "epoch": 0.9869494290375204, "grad_norm": 0.44613441824913025, "learning_rate": 4.93469387755102e-05, "loss": 1.3214, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1210, "tokens_per_second_per_gpu": 2.9 }, { "epoch": 0.9877650897226754, "grad_norm": 0.5062761306762695, "learning_rate": 4.938775510204082e-05, "loss": 1.316, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1211, "tokens_per_second_per_gpu": 14.04 }, { "epoch": 0.9885807504078303, "grad_norm": 0.5743575692176819, "learning_rate": 4.942857142857143e-05, "loss": 1.2422, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1212, "tokens_per_second_per_gpu": 4.06 }, { "epoch": 0.9893964110929854, "grad_norm": 0.30643418431282043, "learning_rate": 4.9469387755102045e-05, "loss": 1.2827, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1213, "tokens_per_second_per_gpu": 11.85 }, { "epoch": 0.9902120717781403, "grad_norm": 0.40725111961364746, "learning_rate": 4.9510204081632655e-05, "loss": 1.2434, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1214, "tokens_per_second_per_gpu": 12.16 }, { "epoch": 0.9910277324632952, "grad_norm": 0.3481321930885315, "learning_rate": 4.9551020408163265e-05, "loss": 1.3642, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1215, "tokens_per_second_per_gpu": 12.8 }, { "epoch": 0.9918433931484503, "grad_norm": 0.5487518310546875, "learning_rate": 4.959183673469388e-05, "loss": 1.3116, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1216, "tokens_per_second_per_gpu": 6.93 }, { "epoch": 0.9926590538336052, "grad_norm": 0.3920314311981201, "learning_rate": 4.963265306122449e-05, "loss": 1.3131, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1217, "tokens_per_second_per_gpu": 8.81 }, { "epoch": 0.9934747145187602, "grad_norm": 0.45315971970558167, "learning_rate": 4.967346938775511e-05, "loss": 1.4255, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1218, "tokens_per_second_per_gpu": 6.93 }, { "epoch": 0.9942903752039152, "grad_norm": 0.5107300877571106, "learning_rate": 4.971428571428572e-05, "loss": 1.2159, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1219, "tokens_per_second_per_gpu": 8.79 }, { "epoch": 0.9951060358890701, "grad_norm": 0.5051597952842712, "learning_rate": 4.975510204081633e-05, "loss": 1.2973, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1220, "tokens_per_second_per_gpu": 3.21 }, { "epoch": 0.9959216965742251, "grad_norm": 0.31220704317092896, "learning_rate": 4.979591836734694e-05, "loss": 1.2839, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.28, "memory/max_allocated (GiB)": 72.28, "step": 1221, "tokens_per_second_per_gpu": 6.94 }, { "epoch": 0.9967373572593801, "grad_norm": 0.3443696200847626, "learning_rate": 4.983673469387755e-05, "loss": 1.2926, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1222, "tokens_per_second_per_gpu": 10.93 }, { "epoch": 0.9975530179445351, "grad_norm": 0.6091650128364563, "learning_rate": 4.9877551020408165e-05, "loss": 1.2856, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.27, "memory/max_allocated (GiB)": 72.27, "step": 1223, "tokens_per_second_per_gpu": 4.15 }, { "epoch": 0.99836867862969, "grad_norm": 0.3919083774089813, "learning_rate": 4.9918367346938776e-05, "loss": 1.2155, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.3, "memory/max_allocated (GiB)": 72.3, "step": 1224, "tokens_per_second_per_gpu": 6.96 }, { "epoch": 0.9991843393148451, "grad_norm": 0.4975505471229553, "learning_rate": 4.995918367346939e-05, "loss": 1.3243, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 72.29, "memory/max_allocated (GiB)": 72.29, "step": 1225, "tokens_per_second_per_gpu": 8.34 }, { "epoch": 0.9991843393148451, "eval_loss": 1.3015837669372559, "eval_runtime": 3445.1657, "eval_samples_per_second": 0.435, "eval_steps_per_second": 0.145, "memory/device_reserved (GiB)": 76.34, "memory/max_active (GiB)": 58.02, "memory/max_allocated (GiB)": 58.02, "step": 1225 } ], "logging_steps": 1, "max_steps": 24500, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1225, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2413304278351872e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }