| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9989281886387995, |
| "eval_steps": 59, |
| "global_step": 467, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0, |
| "eval_loss": 2.934493064880371, |
| "eval_ppl": 18.81196, |
| "eval_runtime": 181.0816, |
| "eval_samples_per_second": 1.149, |
| "eval_steps_per_second": 1.149, |
| "memory/device_reserved (GiB)": 18.64, |
| "memory/max_active (GiB)": 18.28, |
| "memory/max_allocated (GiB)": 18.28, |
| "step": 0 |
| }, |
| { |
| "epoch": 0.004287245444801715, |
| "grad_norm": 0.5241990685462952, |
| "learning_rate": 0.0025, |
| "loss": 3.2761871814727783, |
| "memory/device_reserved (GiB)": 18.85, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 26.47464, |
| "step": 1, |
| "tokens/total": 2816, |
| "tokens/train_per_sec_per_gpu": 1.6, |
| "tokens/trainable": 1916 |
| }, |
| { |
| "epoch": 0.00857449088960343, |
| "grad_norm": 0.2679648697376251, |
| "learning_rate": 0.0025, |
| "loss": 2.743480920791626, |
| "memory/device_reserved (GiB)": 18.94, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 15.54099, |
| "step": 2, |
| "tokens/total": 4416, |
| "tokens/train_per_sec_per_gpu": 26.79, |
| "tokens/trainable": 2568 |
| }, |
| { |
| "epoch": 0.012861736334405145, |
| "grad_norm": 2.105276346206665, |
| "learning_rate": 0.0025, |
| "loss": 4.918459415435791, |
| "memory/device_reserved (GiB)": 18.94, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 136.79171, |
| "step": 3, |
| "tokens/total": 6272, |
| "tokens/train_per_sec_per_gpu": 3.76, |
| "tokens/trainable": 3571 |
| }, |
| { |
| "epoch": 0.01714898177920686, |
| "grad_norm": 1.7258063554763794, |
| "learning_rate": 0.0025, |
| "loss": 4.470706462860107, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 19.42, |
| "memory/max_allocated (GiB)": 19.42, |
| "ppl": 87.41846, |
| "step": 4, |
| "tokens/total": 9664, |
| "tokens/train_per_sec_per_gpu": 4.21, |
| "tokens/trainable": 6033 |
| }, |
| { |
| "epoch": 0.021436227224008574, |
| "grad_norm": 0.9528670907020569, |
| "learning_rate": 0.0025, |
| "loss": 3.6473782062530518, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 38.37393, |
| "step": 5, |
| "tokens/total": 11392, |
| "tokens/train_per_sec_per_gpu": 22.21, |
| "tokens/trainable": 6851 |
| }, |
| { |
| "epoch": 0.02572347266881029, |
| "grad_norm": 1.7168548107147217, |
| "learning_rate": 0.0025, |
| "loss": 3.641300678253174, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.63, |
| "memory/max_allocated (GiB)": 18.63, |
| "ppl": 38.14141, |
| "step": 6, |
| "tokens/total": 14464, |
| "tokens/train_per_sec_per_gpu": 25.23, |
| "tokens/trainable": 8951 |
| }, |
| { |
| "epoch": 0.030010718113612004, |
| "grad_norm": 9.308958053588867, |
| "learning_rate": 0.0025, |
| "loss": 11.468059539794922, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 95612.56656, |
| "step": 7, |
| "tokens/total": 16512, |
| "tokens/train_per_sec_per_gpu": 43.04, |
| "tokens/trainable": 9981 |
| }, |
| { |
| "epoch": 0.03429796355841372, |
| "grad_norm": 4.779436111450195, |
| "learning_rate": 0.0025, |
| "loss": 12.928940773010254, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.73, |
| "memory/max_allocated (GiB)": 18.73, |
| "ppl": 412066.80902, |
| "step": 8, |
| "tokens/total": 20160, |
| "tokens/train_per_sec_per_gpu": 206.05, |
| "tokens/trainable": 12656 |
| }, |
| { |
| "epoch": 0.03858520900321544, |
| "grad_norm": 11.870071411132812, |
| "learning_rate": 0.0025, |
| "loss": 26.585786819458008, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 351609428878.3472, |
| "step": 9, |
| "tokens/total": 22272, |
| "tokens/train_per_sec_per_gpu": 21.53, |
| "tokens/trainable": 13844 |
| }, |
| { |
| "epoch": 0.04287245444801715, |
| "grad_norm": 3.3879966735839844, |
| "learning_rate": 0.0025, |
| "loss": 21.875102996826172, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 3164000347.95887, |
| "step": 10, |
| "tokens/total": 24256, |
| "tokens/train_per_sec_per_gpu": 53.92, |
| "tokens/trainable": 15016 |
| }, |
| { |
| "epoch": 0.04715969989281887, |
| "grad_norm": 0.25865933299064636, |
| "learning_rate": 0.0025, |
| "loss": 14.399214744567871, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 1792666.51864, |
| "step": 11, |
| "tokens/total": 27008, |
| "tokens/train_per_sec_per_gpu": 67.68, |
| "tokens/trainable": 16945 |
| }, |
| { |
| "epoch": 0.05144694533762058, |
| "grad_norm": 3.67203950881958, |
| "learning_rate": 0.0025, |
| "loss": 12.23837947845459, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 206566.87286, |
| "step": 12, |
| "tokens/total": 29568, |
| "tokens/train_per_sec_per_gpu": 20.56, |
| "tokens/trainable": 18583 |
| }, |
| { |
| "epoch": 0.055734190782422297, |
| "grad_norm": 2.215568780899048, |
| "learning_rate": 0.0025, |
| "loss": 15.4072847366333, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 4912456.77743, |
| "step": 13, |
| "tokens/total": 31936, |
| "tokens/train_per_sec_per_gpu": 30.37, |
| "tokens/trainable": 19999 |
| }, |
| { |
| "epoch": 0.06002143622722401, |
| "grad_norm": 1.5136427879333496, |
| "learning_rate": 0.0025, |
| "loss": 11.96999454498291, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.37, |
| "memory/max_allocated (GiB)": 18.37, |
| "ppl": 157943.79881, |
| "step": 14, |
| "tokens/total": 33856, |
| "tokens/train_per_sec_per_gpu": 4.74, |
| "tokens/trainable": 21034 |
| }, |
| { |
| "epoch": 0.06430868167202572, |
| "grad_norm": 12.923323631286621, |
| "learning_rate": 0.0025, |
| "loss": 11.865056991577148, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.87, |
| "memory/max_allocated (GiB)": 18.87, |
| "ppl": 142209.55548, |
| "step": 15, |
| "tokens/total": 37632, |
| "tokens/train_per_sec_per_gpu": 4.06, |
| "tokens/trainable": 23846 |
| }, |
| { |
| "epoch": 0.06859592711682744, |
| "grad_norm": 1.347749948501587, |
| "learning_rate": 0.0025, |
| "loss": 9.3814115524292, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 11865.75207, |
| "step": 16, |
| "tokens/total": 39744, |
| "tokens/train_per_sec_per_gpu": 78.77, |
| "tokens/trainable": 24979 |
| }, |
| { |
| "epoch": 0.07288317256162916, |
| "grad_norm": 0.6687317490577698, |
| "learning_rate": 0.0025, |
| "loss": 8.9339599609375, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 7585.24325, |
| "step": 17, |
| "tokens/total": 42368, |
| "tokens/train_per_sec_per_gpu": 30.07, |
| "tokens/trainable": 26674 |
| }, |
| { |
| "epoch": 0.07717041800643087, |
| "grad_norm": 1.8986364603042603, |
| "learning_rate": 0.0025, |
| "loss": 10.20353889465332, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 26998.56228, |
| "step": 18, |
| "tokens/total": 44480, |
| "tokens/train_per_sec_per_gpu": 10.04, |
| "tokens/trainable": 27850 |
| }, |
| { |
| "epoch": 0.08145766345123258, |
| "grad_norm": 0.3103311359882355, |
| "learning_rate": 0.0025, |
| "loss": 8.43118953704834, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 4587.95439, |
| "step": 19, |
| "tokens/total": 45952, |
| "tokens/train_per_sec_per_gpu": 4.36, |
| "tokens/trainable": 28429 |
| }, |
| { |
| "epoch": 0.0857449088960343, |
| "grad_norm": 0.33680954575538635, |
| "learning_rate": 0.0025, |
| "loss": 8.632229804992676, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 5609.57257, |
| "step": 20, |
| "tokens/total": 48512, |
| "tokens/train_per_sec_per_gpu": 157.2, |
| "tokens/trainable": 29973 |
| }, |
| { |
| "epoch": 0.09003215434083602, |
| "grad_norm": 0.3231711983680725, |
| "learning_rate": 0.0025, |
| "loss": 8.600848197937012, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.86, |
| "memory/max_allocated (GiB)": 18.86, |
| "ppl": 5436.26867, |
| "step": 21, |
| "tokens/total": 51264, |
| "tokens/train_per_sec_per_gpu": 2.82, |
| "tokens/trainable": 31831 |
| }, |
| { |
| "epoch": 0.09431939978563773, |
| "grad_norm": 0.28503984212875366, |
| "learning_rate": 0.0025, |
| "loss": 8.879640579223633, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.38, |
| "memory/max_allocated (GiB)": 18.38, |
| "ppl": 7184.20812, |
| "step": 22, |
| "tokens/total": 52736, |
| "tokens/train_per_sec_per_gpu": 52.3, |
| "tokens/trainable": 32257 |
| }, |
| { |
| "epoch": 0.09860664523043944, |
| "grad_norm": 0.17652413249015808, |
| "learning_rate": 0.0025, |
| "loss": 8.288322448730469, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.66, |
| "memory/max_allocated (GiB)": 18.66, |
| "ppl": 3977.15671, |
| "step": 23, |
| "tokens/total": 55936, |
| "tokens/train_per_sec_per_gpu": 2.04, |
| "tokens/trainable": 34507 |
| }, |
| { |
| "epoch": 0.10289389067524116, |
| "grad_norm": 0.17270459234714508, |
| "learning_rate": 0.0025, |
| "loss": 7.758934020996094, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 2342.40632, |
| "step": 24, |
| "tokens/total": 57856, |
| "tokens/train_per_sec_per_gpu": 52.59, |
| "tokens/trainable": 35539 |
| }, |
| { |
| "epoch": 0.10718113612004287, |
| "grad_norm": 0.14134642481803894, |
| "learning_rate": 0.0025, |
| "loss": 7.277863502502441, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 1447.8913, |
| "step": 25, |
| "tokens/total": 60288, |
| "tokens/train_per_sec_per_gpu": 175.7, |
| "tokens/trainable": 37092 |
| }, |
| { |
| "epoch": 0.11146838156484459, |
| "grad_norm": 0.3017260730266571, |
| "learning_rate": 0.0025, |
| "loss": 7.528397083282471, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 1860.12149, |
| "step": 26, |
| "tokens/total": 62784, |
| "tokens/train_per_sec_per_gpu": 67.24, |
| "tokens/trainable": 38599 |
| }, |
| { |
| "epoch": 0.1157556270096463, |
| "grad_norm": 0.1806621253490448, |
| "learning_rate": 0.0025, |
| "loss": 7.274528503417969, |
| "memory/device_reserved (GiB)": 19.79, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 1443.07063, |
| "step": 27, |
| "tokens/total": 65024, |
| "tokens/train_per_sec_per_gpu": 22.31, |
| "tokens/trainable": 39852 |
| }, |
| { |
| "epoch": 0.12004287245444802, |
| "grad_norm": 0.4423042833805084, |
| "learning_rate": 0.0025, |
| "loss": 6.991304874420166, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 19.54, |
| "memory/max_allocated (GiB)": 19.54, |
| "ppl": 1087.13913, |
| "step": 28, |
| "tokens/total": 70784, |
| "tokens/train_per_sec_per_gpu": 70.63, |
| "tokens/trainable": 44731 |
| }, |
| { |
| "epoch": 0.12433011789924973, |
| "grad_norm": 0.22539205849170685, |
| "learning_rate": 0.0025, |
| "loss": 6.762757778167725, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 865.02445, |
| "step": 29, |
| "tokens/total": 73024, |
| "tokens/train_per_sec_per_gpu": 73.89, |
| "tokens/trainable": 46095 |
| }, |
| { |
| "epoch": 0.12861736334405144, |
| "grad_norm": 0.585552990436554, |
| "learning_rate": 0.0025, |
| "loss": 6.865115165710449, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.64, |
| "memory/max_allocated (GiB)": 18.64, |
| "ppl": 958.25619, |
| "step": 30, |
| "tokens/total": 75456, |
| "tokens/train_per_sec_per_gpu": 67.76, |
| "tokens/trainable": 47570 |
| }, |
| { |
| "epoch": 0.13290460878885316, |
| "grad_norm": 0.9950224161148071, |
| "learning_rate": 0.0025, |
| "loss": 7.070884704589844, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 1177.18904, |
| "step": 31, |
| "tokens/total": 77824, |
| "tokens/train_per_sec_per_gpu": 29.22, |
| "tokens/trainable": 48877 |
| }, |
| { |
| "epoch": 0.13719185423365488, |
| "grad_norm": 1.1349307298660278, |
| "learning_rate": 0.0025, |
| "loss": 6.7641096115112305, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 866.19461, |
| "step": 32, |
| "tokens/total": 79680, |
| "tokens/train_per_sec_per_gpu": 12.6, |
| "tokens/trainable": 49878 |
| }, |
| { |
| "epoch": 0.1414790996784566, |
| "grad_norm": 0.19686993956565857, |
| "learning_rate": 0.0025, |
| "loss": 6.472617149353027, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 647.17527, |
| "step": 33, |
| "tokens/total": 81792, |
| "tokens/train_per_sec_per_gpu": 110.67, |
| "tokens/trainable": 51049 |
| }, |
| { |
| "epoch": 0.1457663451232583, |
| "grad_norm": 0.4460529685020447, |
| "learning_rate": 0.0025, |
| "loss": 6.628453254699707, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 756.31144, |
| "step": 34, |
| "tokens/total": 84032, |
| "tokens/train_per_sec_per_gpu": 15.12, |
| "tokens/trainable": 52250 |
| }, |
| { |
| "epoch": 0.15005359056806003, |
| "grad_norm": 0.09157463908195496, |
| "learning_rate": 0.0025, |
| "loss": 6.950263977050781, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 1043.42513, |
| "step": 35, |
| "tokens/total": 86656, |
| "tokens/train_per_sec_per_gpu": 25.29, |
| "tokens/trainable": 53911 |
| }, |
| { |
| "epoch": 0.15434083601286175, |
| "grad_norm": 1.2685779333114624, |
| "learning_rate": 0.0025, |
| "loss": 6.143320083618164, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 465.59683, |
| "step": 36, |
| "tokens/total": 88640, |
| "tokens/train_per_sec_per_gpu": 15.77, |
| "tokens/trainable": 54904 |
| }, |
| { |
| "epoch": 0.15862808145766344, |
| "grad_norm": 0.31893390417099, |
| "learning_rate": 0.0025, |
| "loss": 6.968283653259277, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.38, |
| "memory/max_allocated (GiB)": 18.38, |
| "ppl": 1062.39774, |
| "step": 37, |
| "tokens/total": 90496, |
| "tokens/train_per_sec_per_gpu": 58.59, |
| "tokens/trainable": 55743 |
| }, |
| { |
| "epoch": 0.16291532690246516, |
| "grad_norm": 1.0469295978546143, |
| "learning_rate": 0.0025, |
| "loss": 7.234709739685059, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 1386.73832, |
| "step": 38, |
| "tokens/total": 92352, |
| "tokens/train_per_sec_per_gpu": 0.83, |
| "tokens/trainable": 56555 |
| }, |
| { |
| "epoch": 0.16720257234726688, |
| "grad_norm": 0.2749118506908417, |
| "learning_rate": 0.0025, |
| "loss": 6.587377071380615, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 725.87445, |
| "step": 39, |
| "tokens/total": 94400, |
| "tokens/train_per_sec_per_gpu": 33.93, |
| "tokens/trainable": 57680 |
| }, |
| { |
| "epoch": 0.1714898177920686, |
| "grad_norm": 0.18221265077590942, |
| "learning_rate": 0.0025, |
| "loss": 6.830031871795654, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 925.2203, |
| "step": 40, |
| "tokens/total": 96512, |
| "tokens/train_per_sec_per_gpu": 45.38, |
| "tokens/trainable": 58955 |
| }, |
| { |
| "epoch": 0.1757770632368703, |
| "grad_norm": 0.24708712100982666, |
| "learning_rate": 0.0025, |
| "loss": 6.151037693023682, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 469.20402, |
| "step": 41, |
| "tokens/total": 98752, |
| "tokens/train_per_sec_per_gpu": 74.94, |
| "tokens/trainable": 60277 |
| }, |
| { |
| "epoch": 0.18006430868167203, |
| "grad_norm": 0.17541086673736572, |
| "learning_rate": 0.0025, |
| "loss": 6.18589973449707, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.63, |
| "memory/max_allocated (GiB)": 18.63, |
| "ppl": 485.8499, |
| "step": 42, |
| "tokens/total": 101440, |
| "tokens/train_per_sec_per_gpu": 19.12, |
| "tokens/trainable": 61928 |
| }, |
| { |
| "epoch": 0.18435155412647375, |
| "grad_norm": 0.5008364319801331, |
| "learning_rate": 0.0025, |
| "loss": 6.555770397186279, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 703.29075, |
| "step": 43, |
| "tokens/total": 103104, |
| "tokens/train_per_sec_per_gpu": 8.28, |
| "tokens/trainable": 62757 |
| }, |
| { |
| "epoch": 0.18863879957127547, |
| "grad_norm": 0.8753749132156372, |
| "learning_rate": 0.0025, |
| "loss": 6.545720100402832, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 696.25787, |
| "step": 44, |
| "tokens/total": 105536, |
| "tokens/train_per_sec_per_gpu": 4.81, |
| "tokens/trainable": 64177 |
| }, |
| { |
| "epoch": 0.19292604501607716, |
| "grad_norm": 0.22259370982646942, |
| "learning_rate": 0.0025, |
| "loss": 6.528131008148193, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 684.1184, |
| "step": 45, |
| "tokens/total": 107584, |
| "tokens/train_per_sec_per_gpu": 14.82, |
| "tokens/trainable": 65253 |
| }, |
| { |
| "epoch": 0.19721329046087888, |
| "grad_norm": 0.10767526179552078, |
| "learning_rate": 0.0025, |
| "loss": 6.292204856872559, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 540.3434, |
| "step": 46, |
| "tokens/total": 109632, |
| "tokens/train_per_sec_per_gpu": 19.33, |
| "tokens/trainable": 66464 |
| }, |
| { |
| "epoch": 0.2015005359056806, |
| "grad_norm": 0.12123644351959229, |
| "learning_rate": 0.0025, |
| "loss": 6.8791117668151855, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 971.76282, |
| "step": 47, |
| "tokens/total": 112192, |
| "tokens/train_per_sec_per_gpu": 223.37, |
| "tokens/trainable": 68117 |
| }, |
| { |
| "epoch": 0.2057877813504823, |
| "grad_norm": 0.1776631772518158, |
| "learning_rate": 0.0025, |
| "loss": 6.613104343414307, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.38, |
| "memory/max_allocated (GiB)": 18.38, |
| "ppl": 744.79152, |
| "step": 48, |
| "tokens/total": 113856, |
| "tokens/train_per_sec_per_gpu": 54.59, |
| "tokens/trainable": 68805 |
| }, |
| { |
| "epoch": 0.21007502679528403, |
| "grad_norm": 0.16078130900859833, |
| "learning_rate": 0.0025, |
| "loss": 6.84950065612793, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 943.4097, |
| "step": 49, |
| "tokens/total": 116032, |
| "tokens/train_per_sec_per_gpu": 75.86, |
| "tokens/trainable": 70058 |
| }, |
| { |
| "epoch": 0.21436227224008575, |
| "grad_norm": 0.10362584888935089, |
| "learning_rate": 0.0025, |
| "loss": 7.038058280944824, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 1139.1735, |
| "step": 50, |
| "tokens/total": 118336, |
| "tokens/train_per_sec_per_gpu": 60.13, |
| "tokens/trainable": 71390 |
| }, |
| { |
| "epoch": 0.21864951768488747, |
| "grad_norm": 0.14731702208518982, |
| "learning_rate": 0.0025, |
| "loss": 5.994701385498047, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 401.29683, |
| "step": 51, |
| "tokens/total": 120384, |
| "tokens/train_per_sec_per_gpu": 3.02, |
| "tokens/trainable": 72578 |
| }, |
| { |
| "epoch": 0.22293676312968919, |
| "grad_norm": 0.15322738885879517, |
| "learning_rate": 0.0025, |
| "loss": 7.148180961608887, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 1271.79041, |
| "step": 52, |
| "tokens/total": 122432, |
| "tokens/train_per_sec_per_gpu": 49.03, |
| "tokens/trainable": 73654 |
| }, |
| { |
| "epoch": 0.22722400857449088, |
| "grad_norm": 0.2865282893180847, |
| "learning_rate": 0.0025, |
| "loss": 6.537259578704834, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 690.39202, |
| "step": 53, |
| "tokens/total": 125056, |
| "tokens/train_per_sec_per_gpu": 37.17, |
| "tokens/trainable": 75448 |
| }, |
| { |
| "epoch": 0.2315112540192926, |
| "grad_norm": 0.19199238717556, |
| "learning_rate": 0.0025, |
| "loss": 6.6607537269592285, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 781.13948, |
| "step": 54, |
| "tokens/total": 127360, |
| "tokens/train_per_sec_per_gpu": 153.39, |
| "tokens/trainable": 76842 |
| }, |
| { |
| "epoch": 0.2357984994640943, |
| "grad_norm": 0.07870891690254211, |
| "learning_rate": 0.0025, |
| "loss": 6.618167400360107, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.88, |
| "memory/max_allocated (GiB)": 18.88, |
| "ppl": 748.57201, |
| "step": 55, |
| "tokens/total": 131008, |
| "tokens/train_per_sec_per_gpu": 12.42, |
| "tokens/trainable": 79539 |
| }, |
| { |
| "epoch": 0.24008574490889603, |
| "grad_norm": 0.10853379964828491, |
| "learning_rate": 0.0025, |
| "loss": 6.207864761352539, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 496.63967, |
| "step": 56, |
| "tokens/total": 132800, |
| "tokens/train_per_sec_per_gpu": 9.35, |
| "tokens/trainable": 80444 |
| }, |
| { |
| "epoch": 0.24437299035369775, |
| "grad_norm": 0.06515457481145859, |
| "learning_rate": 0.0025, |
| "loss": 6.508807182312012, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 671.02553, |
| "step": 57, |
| "tokens/total": 135040, |
| "tokens/train_per_sec_per_gpu": 112.03, |
| "tokens/trainable": 81812 |
| }, |
| { |
| "epoch": 0.24866023579849947, |
| "grad_norm": 0.12576992809772491, |
| "learning_rate": 0.0025, |
| "loss": 6.428314208984375, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 619.12935, |
| "step": 58, |
| "tokens/total": 137472, |
| "tokens/train_per_sec_per_gpu": 117.23, |
| "tokens/trainable": 83345 |
| }, |
| { |
| "epoch": 0.2529474812433012, |
| "grad_norm": 0.11791636049747467, |
| "learning_rate": 0.0025, |
| "loss": 6.194557189941406, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.38, |
| "memory/max_allocated (GiB)": 18.38, |
| "ppl": 490.07439, |
| "step": 59, |
| "tokens/total": 139136, |
| "tokens/train_per_sec_per_gpu": 54.16, |
| "tokens/trainable": 84003 |
| }, |
| { |
| "epoch": 0.2529474812433012, |
| "eval_loss": 6.30873441696167, |
| "eval_ppl": 549.34926, |
| "eval_runtime": 17.198, |
| "eval_samples_per_second": 12.094, |
| "eval_steps_per_second": 12.094, |
| "memory/device_reserved (GiB)": 19.96, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.2572347266881029, |
| "grad_norm": 0.20866750180721283, |
| "learning_rate": 0.0025, |
| "loss": 6.447090148925781, |
| "memory/device_reserved (GiB)": 18.69, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 630.8639, |
| "step": 60, |
| "tokens/total": 142208, |
| "tokens/train_per_sec_per_gpu": 37.47, |
| "tokens/trainable": 86018 |
| }, |
| { |
| "epoch": 0.2615219721329046, |
| "grad_norm": 0.08183833956718445, |
| "learning_rate": 0.0025, |
| "loss": 6.008518218994141, |
| "memory/device_reserved (GiB)": 18.73, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 406.87997, |
| "step": 61, |
| "tokens/total": 144000, |
| "tokens/train_per_sec_per_gpu": 22.8, |
| "tokens/trainable": 86897 |
| }, |
| { |
| "epoch": 0.2658092175777063, |
| "grad_norm": 0.21841929852962494, |
| "learning_rate": 0.0025, |
| "loss": 6.573157787322998, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.82, |
| "memory/max_allocated (GiB)": 18.82, |
| "ppl": 715.62607, |
| "step": 62, |
| "tokens/total": 147520, |
| "tokens/train_per_sec_per_gpu": 19.02, |
| "tokens/trainable": 89504 |
| }, |
| { |
| "epoch": 0.27009646302250806, |
| "grad_norm": 0.09511567652225494, |
| "learning_rate": 0.0025, |
| "loss": 6.555922985076904, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 703.39808, |
| "step": 63, |
| "tokens/total": 150016, |
| "tokens/train_per_sec_per_gpu": 183.78, |
| "tokens/trainable": 91081 |
| }, |
| { |
| "epoch": 0.27438370846730975, |
| "grad_norm": 0.07103318721055984, |
| "learning_rate": 0.0025, |
| "loss": 6.265772342681885, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 526.24787, |
| "step": 64, |
| "tokens/total": 152384, |
| "tokens/train_per_sec_per_gpu": 50.64, |
| "tokens/trainable": 92465 |
| }, |
| { |
| "epoch": 0.27867095391211144, |
| "grad_norm": 0.14229358732700348, |
| "learning_rate": 0.0025, |
| "loss": 6.940990924835205, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.38, |
| "memory/max_allocated (GiB)": 18.38, |
| "ppl": 1033.79412, |
| "step": 65, |
| "tokens/total": 153792, |
| "tokens/train_per_sec_per_gpu": 2.2, |
| "tokens/trainable": 93043 |
| }, |
| { |
| "epoch": 0.2829581993569132, |
| "grad_norm": 0.148279070854187, |
| "learning_rate": 0.0025, |
| "loss": 6.338968276977539, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 566.21184, |
| "step": 66, |
| "tokens/total": 156352, |
| "tokens/train_per_sec_per_gpu": 56.15, |
| "tokens/trainable": 94644 |
| }, |
| { |
| "epoch": 0.2872454448017149, |
| "grad_norm": 0.18289832770824432, |
| "learning_rate": 0.0025, |
| "loss": 6.198648929595947, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 492.08375, |
| "step": 67, |
| "tokens/total": 158144, |
| "tokens/train_per_sec_per_gpu": 72.42, |
| "tokens/trainable": 95506 |
| }, |
| { |
| "epoch": 0.2915326902465166, |
| "grad_norm": 0.11502089351415634, |
| "learning_rate": 0.0025, |
| "loss": 6.177389621734619, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 481.73281, |
| "step": 68, |
| "tokens/total": 159808, |
| "tokens/train_per_sec_per_gpu": 13.1, |
| "tokens/trainable": 96311 |
| }, |
| { |
| "epoch": 0.2958199356913183, |
| "grad_norm": 0.766052782535553, |
| "learning_rate": 0.0025, |
| "loss": 6.210302829742432, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 497.85199, |
| "step": 69, |
| "tokens/total": 162240, |
| "tokens/train_per_sec_per_gpu": 15.54, |
| "tokens/trainable": 97821 |
| }, |
| { |
| "epoch": 0.30010718113612006, |
| "grad_norm": 0.16312995553016663, |
| "learning_rate": 0.0025, |
| "loss": 6.322083950042725, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 556.73199, |
| "step": 70, |
| "tokens/total": 163968, |
| "tokens/train_per_sec_per_gpu": 35.21, |
| "tokens/trainable": 98557 |
| }, |
| { |
| "epoch": 0.30439442658092175, |
| "grad_norm": 0.09915313869714737, |
| "learning_rate": 0.0025, |
| "loss": 5.732677459716797, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 308.79495, |
| "step": 71, |
| "tokens/total": 165952, |
| "tokens/train_per_sec_per_gpu": 126.46, |
| "tokens/trainable": 99716 |
| }, |
| { |
| "epoch": 0.3086816720257235, |
| "grad_norm": 0.3138703405857086, |
| "learning_rate": 0.0025, |
| "loss": 5.753807067871094, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 315.38909, |
| "step": 72, |
| "tokens/total": 167872, |
| "tokens/train_per_sec_per_gpu": 27.76, |
| "tokens/trainable": 100641 |
| }, |
| { |
| "epoch": 0.3129689174705252, |
| "grad_norm": 0.2080921083688736, |
| "learning_rate": 0.0025, |
| "loss": 6.6848883628845215, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.6, |
| "memory/max_allocated (GiB)": 18.6, |
| "ppl": 800.22134, |
| "step": 73, |
| "tokens/total": 170304, |
| "tokens/train_per_sec_per_gpu": 275.0, |
| "tokens/trainable": 102108 |
| }, |
| { |
| "epoch": 0.3172561629153269, |
| "grad_norm": 0.09075198322534561, |
| "learning_rate": 0.0025, |
| "loss": 5.682041168212891, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 293.548, |
| "step": 74, |
| "tokens/total": 171840, |
| "tokens/train_per_sec_per_gpu": 24.72, |
| "tokens/trainable": 102828 |
| }, |
| { |
| "epoch": 0.3215434083601286, |
| "grad_norm": 0.19326482713222504, |
| "learning_rate": 0.0025, |
| "loss": 6.265585899353027, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 526.14977, |
| "step": 75, |
| "tokens/total": 173632, |
| "tokens/train_per_sec_per_gpu": 9.51, |
| "tokens/trainable": 103667 |
| }, |
| { |
| "epoch": 0.3258306538049303, |
| "grad_norm": 0.09544038772583008, |
| "learning_rate": 0.0025, |
| "loss": 5.699280261993408, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.42, |
| "memory/max_allocated (GiB)": 18.42, |
| "ppl": 298.65237, |
| "step": 76, |
| "tokens/total": 175808, |
| "tokens/train_per_sec_per_gpu": 2.17, |
| "tokens/trainable": 104870 |
| }, |
| { |
| "epoch": 0.33011789924973206, |
| "grad_norm": 0.7390451431274414, |
| "learning_rate": 0.0025, |
| "loss": 6.410941123962402, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 608.46605, |
| "step": 77, |
| "tokens/total": 177664, |
| "tokens/train_per_sec_per_gpu": 10.88, |
| "tokens/trainable": 105944 |
| }, |
| { |
| "epoch": 0.33440514469453375, |
| "grad_norm": 0.14168062806129456, |
| "learning_rate": 0.0025, |
| "loss": 7.039776802062988, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 1141.13288, |
| "step": 78, |
| "tokens/total": 179584, |
| "tokens/train_per_sec_per_gpu": 103.93, |
| "tokens/trainable": 106970 |
| }, |
| { |
| "epoch": 0.3386923901393355, |
| "grad_norm": 0.8850395679473877, |
| "learning_rate": 0.0025, |
| "loss": 6.027164459228516, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 414.53792, |
| "step": 79, |
| "tokens/total": 181312, |
| "tokens/train_per_sec_per_gpu": 97.76, |
| "tokens/trainable": 107821 |
| }, |
| { |
| "epoch": 0.3429796355841372, |
| "grad_norm": 0.17418481409549713, |
| "learning_rate": 0.0025, |
| "loss": 7.046483993530273, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.57, |
| "memory/max_allocated (GiB)": 18.57, |
| "ppl": 1148.8124, |
| "step": 80, |
| "tokens/total": 183936, |
| "tokens/train_per_sec_per_gpu": 123.29, |
| "tokens/trainable": 109492 |
| }, |
| { |
| "epoch": 0.34726688102893893, |
| "grad_norm": 0.09937312453985214, |
| "learning_rate": 0.0025, |
| "loss": 5.586414337158203, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 266.77733, |
| "step": 81, |
| "tokens/total": 185600, |
| "tokens/train_per_sec_per_gpu": 23.19, |
| "tokens/trainable": 110255 |
| }, |
| { |
| "epoch": 0.3515541264737406, |
| "grad_norm": 0.3117140233516693, |
| "learning_rate": 0.0025, |
| "loss": 6.475000381469727, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 648.71948, |
| "step": 82, |
| "tokens/total": 187776, |
| "tokens/train_per_sec_per_gpu": 44.82, |
| "tokens/trainable": 111472 |
| }, |
| { |
| "epoch": 0.3558413719185423, |
| "grad_norm": 0.3783544600009918, |
| "learning_rate": 0.0025, |
| "loss": 6.57240104675293, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 715.08474, |
| "step": 83, |
| "tokens/total": 189952, |
| "tokens/train_per_sec_per_gpu": 106.76, |
| "tokens/trainable": 112729 |
| }, |
| { |
| "epoch": 0.36012861736334406, |
| "grad_norm": 0.0920150876045227, |
| "learning_rate": 0.0025, |
| "loss": 6.143086910247803, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 465.48828, |
| "step": 84, |
| "tokens/total": 192320, |
| "tokens/train_per_sec_per_gpu": 49.38, |
| "tokens/trainable": 114215 |
| }, |
| { |
| "epoch": 0.36441586280814575, |
| "grad_norm": 0.09526661038398743, |
| "learning_rate": 0.0025, |
| "loss": 5.70522403717041, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 300.43278, |
| "step": 85, |
| "tokens/total": 194048, |
| "tokens/train_per_sec_per_gpu": 31.47, |
| "tokens/trainable": 114924 |
| }, |
| { |
| "epoch": 0.3687031082529475, |
| "grad_norm": 0.15185925364494324, |
| "learning_rate": 0.0025, |
| "loss": 6.073705673217773, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 434.28703, |
| "step": 86, |
| "tokens/total": 195392, |
| "tokens/train_per_sec_per_gpu": 10.82, |
| "tokens/trainable": 115406 |
| }, |
| { |
| "epoch": 0.3729903536977492, |
| "grad_norm": 0.11615428328514099, |
| "learning_rate": 0.0025, |
| "loss": 6.724306106567383, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 832.39418, |
| "step": 87, |
| "tokens/total": 197760, |
| "tokens/train_per_sec_per_gpu": 32.79, |
| "tokens/trainable": 116773 |
| }, |
| { |
| "epoch": 0.37727759914255093, |
| "grad_norm": 0.602443277835846, |
| "learning_rate": 0.0025, |
| "loss": 6.486382007598877, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 656.14514, |
| "step": 88, |
| "tokens/total": 200192, |
| "tokens/train_per_sec_per_gpu": 45.96, |
| "tokens/trainable": 118226 |
| }, |
| { |
| "epoch": 0.3815648445873526, |
| "grad_norm": 0.09665144979953766, |
| "learning_rate": 0.0025, |
| "loss": 6.018401622772217, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 410.92126, |
| "step": 89, |
| "tokens/total": 202816, |
| "tokens/train_per_sec_per_gpu": 39.89, |
| "tokens/trainable": 119912 |
| }, |
| { |
| "epoch": 0.3858520900321543, |
| "grad_norm": 0.11786024272441864, |
| "learning_rate": 0.0025, |
| "loss": 6.514355659484863, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 674.75905, |
| "step": 90, |
| "tokens/total": 204544, |
| "tokens/train_per_sec_per_gpu": 16.48, |
| "tokens/trainable": 120771 |
| }, |
| { |
| "epoch": 0.39013933547695606, |
| "grad_norm": 0.07515699416399002, |
| "learning_rate": 0.0025, |
| "loss": 5.9910969734191895, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 399.853, |
| "step": 91, |
| "tokens/total": 206464, |
| "tokens/train_per_sec_per_gpu": 37.79, |
| "tokens/trainable": 121829 |
| }, |
| { |
| "epoch": 0.39442658092175775, |
| "grad_norm": 0.23832163214683533, |
| "learning_rate": 0.0025, |
| "loss": 6.451230525970459, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 633.48133, |
| "step": 92, |
| "tokens/total": 208384, |
| "tokens/train_per_sec_per_gpu": 6.91, |
| "tokens/trainable": 122828 |
| }, |
| { |
| "epoch": 0.3987138263665595, |
| "grad_norm": 0.06281202286481857, |
| "learning_rate": 0.0025, |
| "loss": 5.861863613128662, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 351.37837, |
| "step": 93, |
| "tokens/total": 210688, |
| "tokens/train_per_sec_per_gpu": 96.78, |
| "tokens/trainable": 124211 |
| }, |
| { |
| "epoch": 0.4030010718113612, |
| "grad_norm": 0.1325235366821289, |
| "learning_rate": 0.0025, |
| "loss": 6.324355125427246, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.36, |
| "memory/max_allocated (GiB)": 18.36, |
| "ppl": 557.99786, |
| "step": 94, |
| "tokens/total": 212096, |
| "tokens/train_per_sec_per_gpu": 32.79, |
| "tokens/trainable": 124710 |
| }, |
| { |
| "epoch": 0.40728831725616294, |
| "grad_norm": 0.159224733710289, |
| "learning_rate": 0.0025, |
| "loss": 5.801607131958008, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 330.83082, |
| "step": 95, |
| "tokens/total": 213696, |
| "tokens/train_per_sec_per_gpu": 21.94, |
| "tokens/trainable": 125363 |
| }, |
| { |
| "epoch": 0.4115755627009646, |
| "grad_norm": 0.1464780569076538, |
| "learning_rate": 0.0025, |
| "loss": 6.5112481117248535, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 672.66546, |
| "step": 96, |
| "tokens/total": 216320, |
| "tokens/train_per_sec_per_gpu": 106.96, |
| "tokens/trainable": 127129 |
| }, |
| { |
| "epoch": 0.41586280814576637, |
| "grad_norm": 0.10386360436677933, |
| "learning_rate": 0.0025, |
| "loss": 6.267631530761719, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 527.22718, |
| "step": 97, |
| "tokens/total": 218368, |
| "tokens/train_per_sec_per_gpu": 92.64, |
| "tokens/trainable": 128258 |
| }, |
| { |
| "epoch": 0.42015005359056806, |
| "grad_norm": 0.08260782063007355, |
| "learning_rate": 0.0025, |
| "loss": 6.162295341491699, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 474.516, |
| "step": 98, |
| "tokens/total": 220288, |
| "tokens/train_per_sec_per_gpu": 228.13, |
| "tokens/trainable": 129259 |
| }, |
| { |
| "epoch": 0.42443729903536975, |
| "grad_norm": 0.15949538350105286, |
| "learning_rate": 0.0025, |
| "loss": 6.426211833953857, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 617.82907, |
| "step": 99, |
| "tokens/total": 222528, |
| "tokens/train_per_sec_per_gpu": 27.38, |
| "tokens/trainable": 130563 |
| }, |
| { |
| "epoch": 0.4287245444801715, |
| "grad_norm": 0.13608896732330322, |
| "learning_rate": 0.0025, |
| "loss": 5.752037048339844, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 314.83133, |
| "step": 100, |
| "tokens/total": 224320, |
| "tokens/train_per_sec_per_gpu": 4.44, |
| "tokens/trainable": 131392 |
| }, |
| { |
| "epoch": 0.4330117899249732, |
| "grad_norm": 0.0612945631146431, |
| "learning_rate": 0.0025, |
| "loss": 6.353605270385742, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 574.56043, |
| "step": 101, |
| "tokens/total": 226496, |
| "tokens/train_per_sec_per_gpu": 64.63, |
| "tokens/trainable": 132634 |
| }, |
| { |
| "epoch": 0.43729903536977494, |
| "grad_norm": 0.3771326243877411, |
| "learning_rate": 0.0025, |
| "loss": 5.794309616088867, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 328.42537, |
| "step": 102, |
| "tokens/total": 228736, |
| "tokens/train_per_sec_per_gpu": 23.41, |
| "tokens/trainable": 134004 |
| }, |
| { |
| "epoch": 0.4415862808145766, |
| "grad_norm": 0.05130897834897041, |
| "learning_rate": 0.0025, |
| "loss": 5.724153518676758, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.57, |
| "memory/max_allocated (GiB)": 18.57, |
| "ppl": 306.17398, |
| "step": 103, |
| "tokens/total": 231360, |
| "tokens/train_per_sec_per_gpu": 36.23, |
| "tokens/trainable": 135730 |
| }, |
| { |
| "epoch": 0.44587352625937837, |
| "grad_norm": 0.05757886916399002, |
| "learning_rate": 0.0025, |
| "loss": 5.929745674133301, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 376.05886, |
| "step": 104, |
| "tokens/total": 233664, |
| "tokens/train_per_sec_per_gpu": 35.62, |
| "tokens/trainable": 137092 |
| }, |
| { |
| "epoch": 0.45016077170418006, |
| "grad_norm": 0.17747093737125397, |
| "learning_rate": 0.0025, |
| "loss": 5.519756317138672, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 249.57421, |
| "step": 105, |
| "tokens/total": 235456, |
| "tokens/train_per_sec_per_gpu": 6.16, |
| "tokens/trainable": 138050 |
| }, |
| { |
| "epoch": 0.45444801714898175, |
| "grad_norm": 0.9483416080474854, |
| "learning_rate": 0.0025, |
| "loss": 6.132290363311768, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 460.48964, |
| "step": 106, |
| "tokens/total": 238016, |
| "tokens/train_per_sec_per_gpu": 8.58, |
| "tokens/trainable": 139739 |
| }, |
| { |
| "epoch": 0.4587352625937835, |
| "grad_norm": 0.0577179454267025, |
| "learning_rate": 0.0025, |
| "loss": 6.135179042816162, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 461.82177, |
| "step": 107, |
| "tokens/total": 240768, |
| "tokens/train_per_sec_per_gpu": 197.15, |
| "tokens/trainable": 141578 |
| }, |
| { |
| "epoch": 0.4630225080385852, |
| "grad_norm": 0.13632942736148834, |
| "learning_rate": 0.0025, |
| "loss": 5.787478923797607, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 326.18964, |
| "step": 108, |
| "tokens/total": 242624, |
| "tokens/train_per_sec_per_gpu": 6.1, |
| "tokens/trainable": 142512 |
| }, |
| { |
| "epoch": 0.46730975348338694, |
| "grad_norm": 0.06379847973585129, |
| "learning_rate": 0.0025, |
| "loss": 5.883008003234863, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.81, |
| "memory/max_allocated (GiB)": 18.81, |
| "ppl": 358.88715, |
| "step": 109, |
| "tokens/total": 246208, |
| "tokens/train_per_sec_per_gpu": 80.44, |
| "tokens/trainable": 145158 |
| }, |
| { |
| "epoch": 0.4715969989281886, |
| "grad_norm": 0.06732220202684402, |
| "learning_rate": 0.0025, |
| "loss": 6.3092498779296875, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 549.6325, |
| "step": 110, |
| "tokens/total": 248448, |
| "tokens/train_per_sec_per_gpu": 111.87, |
| "tokens/trainable": 146499 |
| }, |
| { |
| "epoch": 0.4758842443729904, |
| "grad_norm": 0.06807619333267212, |
| "learning_rate": 0.0025, |
| "loss": 5.3626203536987305, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.61, |
| "memory/max_allocated (GiB)": 18.61, |
| "ppl": 213.28309, |
| "step": 111, |
| "tokens/total": 251264, |
| "tokens/train_per_sec_per_gpu": 30.5, |
| "tokens/trainable": 148381 |
| }, |
| { |
| "epoch": 0.48017148981779206, |
| "grad_norm": 0.07463269680738449, |
| "learning_rate": 0.0025, |
| "loss": 6.311191558837891, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 550.70075, |
| "step": 112, |
| "tokens/total": 252928, |
| "tokens/train_per_sec_per_gpu": 25.34, |
| "tokens/trainable": 149181 |
| }, |
| { |
| "epoch": 0.4844587352625938, |
| "grad_norm": 0.08292581140995026, |
| "learning_rate": 0.0025, |
| "loss": 5.2816314697265625, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 196.69051, |
| "step": 113, |
| "tokens/total": 254464, |
| "tokens/train_per_sec_per_gpu": 136.32, |
| "tokens/trainable": 149924 |
| }, |
| { |
| "epoch": 0.4887459807073955, |
| "grad_norm": 0.061865709722042084, |
| "learning_rate": 0.0025, |
| "loss": 6.011983394622803, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 408.29232, |
| "step": 114, |
| "tokens/total": 256768, |
| "tokens/train_per_sec_per_gpu": 10.89, |
| "tokens/trainable": 151286 |
| }, |
| { |
| "epoch": 0.4930332261521972, |
| "grad_norm": 0.10802624374628067, |
| "learning_rate": 0.0025, |
| "loss": 5.940651893615723, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 380.18269, |
| "step": 115, |
| "tokens/total": 258688, |
| "tokens/train_per_sec_per_gpu": 90.71, |
| "tokens/trainable": 152264 |
| }, |
| { |
| "epoch": 0.49732047159699894, |
| "grad_norm": 0.06800372898578644, |
| "learning_rate": 0.0025, |
| "loss": 5.748905181884766, |
| "memory/device_reserved (GiB)": 19.35, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 313.84687, |
| "step": 116, |
| "tokens/total": 260800, |
| "tokens/train_per_sec_per_gpu": 10.94, |
| "tokens/trainable": 153396 |
| }, |
| { |
| "epoch": 0.5016077170418006, |
| "grad_norm": 0.05444978550076485, |
| "learning_rate": 0.0025, |
| "loss": 5.818991661071777, |
| "memory/device_reserved (GiB)": 19.39, |
| "memory/max_active (GiB)": 19.1, |
| "memory/max_allocated (GiB)": 19.1, |
| "ppl": 336.63244, |
| "step": 117, |
| "tokens/total": 264320, |
| "tokens/train_per_sec_per_gpu": 603.9, |
| "tokens/trainable": 156011 |
| }, |
| { |
| "epoch": 0.5058949624866024, |
| "grad_norm": 0.05440564081072807, |
| "learning_rate": 0.0025, |
| "loss": 5.707052230834961, |
| "memory/device_reserved (GiB)": 19.39, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 300.98253, |
| "step": 118, |
| "tokens/total": 266176, |
| "tokens/train_per_sec_per_gpu": 31.98, |
| "tokens/trainable": 156975 |
| }, |
| { |
| "epoch": 0.5058949624866024, |
| "eval_loss": 5.828575611114502, |
| "eval_ppl": 339.87422, |
| "eval_runtime": 17.0818, |
| "eval_samples_per_second": 12.177, |
| "eval_steps_per_second": 12.177, |
| "memory/device_reserved (GiB)": 19.39, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.5101822079314041, |
| "grad_norm": 0.10290460288524628, |
| "learning_rate": 0.0025, |
| "loss": 6.018277168273926, |
| "memory/device_reserved (GiB)": 18.51, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 410.87013, |
| "step": 119, |
| "tokens/total": 267776, |
| "tokens/train_per_sec_per_gpu": 11.96, |
| "tokens/trainable": 157706 |
| }, |
| { |
| "epoch": 0.5144694533762058, |
| "grad_norm": 0.06775107234716415, |
| "learning_rate": 0.0025, |
| "loss": 6.1761064529418945, |
| "memory/device_reserved (GiB)": 18.72, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 481.11506, |
| "step": 120, |
| "tokens/total": 270464, |
| "tokens/train_per_sec_per_gpu": 63.79, |
| "tokens/trainable": 159440 |
| }, |
| { |
| "epoch": 0.5187566988210075, |
| "grad_norm": 0.09368869662284851, |
| "learning_rate": 0.0025, |
| "loss": 5.502309799194336, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.69, |
| "memory/max_allocated (GiB)": 18.69, |
| "ppl": 245.25777, |
| "step": 121, |
| "tokens/total": 273024, |
| "tokens/train_per_sec_per_gpu": 0.87, |
| "tokens/trainable": 161081 |
| }, |
| { |
| "epoch": 0.5230439442658092, |
| "grad_norm": 0.05680066719651222, |
| "learning_rate": 0.0025, |
| "loss": 5.426424503326416, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 227.33496, |
| "step": 122, |
| "tokens/total": 274944, |
| "tokens/train_per_sec_per_gpu": 7.41, |
| "tokens/trainable": 162094 |
| }, |
| { |
| "epoch": 0.5273311897106109, |
| "grad_norm": 0.05151861906051636, |
| "learning_rate": 0.0025, |
| "loss": 5.80197811126709, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 330.95358, |
| "step": 123, |
| "tokens/total": 276864, |
| "tokens/train_per_sec_per_gpu": 49.9, |
| "tokens/trainable": 163090 |
| }, |
| { |
| "epoch": 0.5316184351554126, |
| "grad_norm": 0.08232463896274567, |
| "learning_rate": 0.0025, |
| "loss": 5.85071325302124, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.63, |
| "memory/max_allocated (GiB)": 18.63, |
| "ppl": 347.48213, |
| "step": 124, |
| "tokens/total": 279360, |
| "tokens/train_per_sec_per_gpu": 1.75, |
| "tokens/trainable": 164564 |
| }, |
| { |
| "epoch": 0.5359056806002144, |
| "grad_norm": 0.08689901232719421, |
| "learning_rate": 0.0025, |
| "loss": 5.9564080238342285, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 386.22034, |
| "step": 125, |
| "tokens/total": 281984, |
| "tokens/train_per_sec_per_gpu": 192.74, |
| "tokens/trainable": 166194 |
| }, |
| { |
| "epoch": 0.5401929260450161, |
| "grad_norm": 0.06799639761447906, |
| "learning_rate": 0.0025, |
| "loss": 5.424775123596191, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 226.9603, |
| "step": 126, |
| "tokens/total": 283584, |
| "tokens/train_per_sec_per_gpu": 20.32, |
| "tokens/trainable": 166912 |
| }, |
| { |
| "epoch": 0.5444801714898178, |
| "grad_norm": 0.06623026728630066, |
| "learning_rate": 0.0025, |
| "loss": 5.536446571350098, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 253.77463, |
| "step": 127, |
| "tokens/total": 286400, |
| "tokens/train_per_sec_per_gpu": 112.92, |
| "tokens/trainable": 168763 |
| }, |
| { |
| "epoch": 0.5487674169346195, |
| "grad_norm": 0.07652036845684052, |
| "learning_rate": 0.0025, |
| "loss": 4.905512809753418, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 135.03214, |
| "step": 128, |
| "tokens/total": 288448, |
| "tokens/train_per_sec_per_gpu": 213.64, |
| "tokens/trainable": 169869 |
| }, |
| { |
| "epoch": 0.5530546623794212, |
| "grad_norm": 0.07232151925563812, |
| "learning_rate": 0.0025, |
| "loss": 5.730243682861328, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 308.04432, |
| "step": 129, |
| "tokens/total": 290560, |
| "tokens/train_per_sec_per_gpu": 8.69, |
| "tokens/trainable": 171082 |
| }, |
| { |
| "epoch": 0.5573419078242229, |
| "grad_norm": 0.1090153232216835, |
| "learning_rate": 0.0025, |
| "loss": 4.770911693572998, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 118.0268, |
| "step": 130, |
| "tokens/total": 292096, |
| "tokens/train_per_sec_per_gpu": 2.14, |
| "tokens/trainable": 171701 |
| }, |
| { |
| "epoch": 0.5616291532690246, |
| "grad_norm": 0.07109065353870392, |
| "learning_rate": 0.0025, |
| "loss": 6.074782371520996, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 434.75488, |
| "step": 131, |
| "tokens/total": 293952, |
| "tokens/train_per_sec_per_gpu": 11.1, |
| "tokens/trainable": 172572 |
| }, |
| { |
| "epoch": 0.5659163987138264, |
| "grad_norm": 0.06394513696432114, |
| "learning_rate": 0.0025, |
| "loss": 5.415735244750977, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 224.91785, |
| "step": 132, |
| "tokens/total": 295424, |
| "tokens/train_per_sec_per_gpu": 25.67, |
| "tokens/trainable": 173255 |
| }, |
| { |
| "epoch": 0.5702036441586281, |
| "grad_norm": 0.07912840694189072, |
| "learning_rate": 0.0025, |
| "loss": 5.249301910400391, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.67, |
| "memory/max_allocated (GiB)": 18.67, |
| "ppl": 190.43328, |
| "step": 133, |
| "tokens/total": 298112, |
| "tokens/train_per_sec_per_gpu": 6.48, |
| "tokens/trainable": 175036 |
| }, |
| { |
| "epoch": 0.5744908896034298, |
| "grad_norm": 0.05524107813835144, |
| "learning_rate": 0.0025, |
| "loss": 6.232509613037109, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 509.03135, |
| "step": 134, |
| "tokens/total": 300288, |
| "tokens/train_per_sec_per_gpu": 114.55, |
| "tokens/trainable": 176200 |
| }, |
| { |
| "epoch": 0.5787781350482315, |
| "grad_norm": 0.08867005258798599, |
| "learning_rate": 0.0025, |
| "loss": 5.501221656799316, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 244.99104, |
| "step": 135, |
| "tokens/total": 302016, |
| "tokens/train_per_sec_per_gpu": 8.77, |
| "tokens/trainable": 176997 |
| }, |
| { |
| "epoch": 0.5830653804930332, |
| "grad_norm": 0.0581279955804348, |
| "learning_rate": 0.0025, |
| "loss": 6.093716144561768, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 443.06485, |
| "step": 136, |
| "tokens/total": 303936, |
| "tokens/train_per_sec_per_gpu": 49.42, |
| "tokens/trainable": 177895 |
| }, |
| { |
| "epoch": 0.587352625937835, |
| "grad_norm": 0.24256502091884613, |
| "learning_rate": 0.0025, |
| "loss": 5.912446975708008, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 369.60948, |
| "step": 137, |
| "tokens/total": 306240, |
| "tokens/train_per_sec_per_gpu": 117.87, |
| "tokens/trainable": 179285 |
| }, |
| { |
| "epoch": 0.5916398713826366, |
| "grad_norm": 0.08909733593463898, |
| "learning_rate": 0.0025, |
| "loss": 7.02211332321167, |
| "memory/device_reserved (GiB)": 19.02, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 1121.15348, |
| "step": 138, |
| "tokens/total": 308672, |
| "tokens/train_per_sec_per_gpu": 145.17, |
| "tokens/trainable": 180732 |
| }, |
| { |
| "epoch": 0.5959271168274384, |
| "grad_norm": 0.11382216215133667, |
| "learning_rate": 0.0025, |
| "loss": 6.5203537940979, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.92, |
| "memory/max_allocated (GiB)": 18.92, |
| "ppl": 678.8185, |
| "step": 139, |
| "tokens/total": 311552, |
| "tokens/train_per_sec_per_gpu": 17.16, |
| "tokens/trainable": 182719 |
| }, |
| { |
| "epoch": 0.6002143622722401, |
| "grad_norm": 0.09745891392230988, |
| "learning_rate": 0.0025, |
| "loss": 5.971193790435791, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 391.97333, |
| "step": 140, |
| "tokens/total": 313856, |
| "tokens/train_per_sec_per_gpu": 150.24, |
| "tokens/trainable": 184142 |
| }, |
| { |
| "epoch": 0.6045016077170418, |
| "grad_norm": 0.0861353650689125, |
| "learning_rate": 0.0025, |
| "loss": 5.941656112670898, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 380.56467, |
| "step": 141, |
| "tokens/total": 315392, |
| "tokens/train_per_sec_per_gpu": 68.11, |
| "tokens/trainable": 184842 |
| }, |
| { |
| "epoch": 0.6087888531618435, |
| "grad_norm": 0.07818640768527985, |
| "learning_rate": 0.0025, |
| "loss": 5.195644378662109, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 180.48441, |
| "step": 142, |
| "tokens/total": 317248, |
| "tokens/train_per_sec_per_gpu": 24.45, |
| "tokens/trainable": 185810 |
| }, |
| { |
| "epoch": 0.6130760986066452, |
| "grad_norm": 0.06124793365597725, |
| "learning_rate": 0.0025, |
| "loss": 5.502886772155762, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 245.39932, |
| "step": 143, |
| "tokens/total": 319232, |
| "tokens/train_per_sec_per_gpu": 22.21, |
| "tokens/trainable": 186800 |
| }, |
| { |
| "epoch": 0.617363344051447, |
| "grad_norm": 0.0705777034163475, |
| "learning_rate": 0.0025, |
| "loss": 6.341245651245117, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.72, |
| "memory/max_allocated (GiB)": 18.72, |
| "ppl": 567.50278, |
| "step": 144, |
| "tokens/total": 322304, |
| "tokens/train_per_sec_per_gpu": 19.8, |
| "tokens/trainable": 189022 |
| }, |
| { |
| "epoch": 0.6216505894962486, |
| "grad_norm": 0.07551176846027374, |
| "learning_rate": 0.0025, |
| "loss": 5.537265777587891, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 253.9826, |
| "step": 145, |
| "tokens/total": 324608, |
| "tokens/train_per_sec_per_gpu": 14.15, |
| "tokens/trainable": 190415 |
| }, |
| { |
| "epoch": 0.6259378349410504, |
| "grad_norm": 0.04925059527158737, |
| "learning_rate": 0.0025, |
| "loss": 5.8168511390686035, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 335.91264, |
| "step": 146, |
| "tokens/total": 327168, |
| "tokens/train_per_sec_per_gpu": 28.95, |
| "tokens/trainable": 192079 |
| }, |
| { |
| "epoch": 0.6302250803858521, |
| "grad_norm": 0.06655045598745346, |
| "learning_rate": 0.0025, |
| "loss": 5.837461471557617, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 342.90775, |
| "step": 147, |
| "tokens/total": 328640, |
| "tokens/train_per_sec_per_gpu": 33.92, |
| "tokens/trainable": 192665 |
| }, |
| { |
| "epoch": 0.6345123258306538, |
| "grad_norm": 0.07563883811235428, |
| "learning_rate": 0.0025, |
| "loss": 5.163289546966553, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 174.73832, |
| "step": 148, |
| "tokens/total": 330432, |
| "tokens/train_per_sec_per_gpu": 112.65, |
| "tokens/trainable": 193531 |
| }, |
| { |
| "epoch": 0.6387995712754555, |
| "grad_norm": 0.05340481176972389, |
| "learning_rate": 0.0025, |
| "loss": 5.348882675170898, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 210.37311, |
| "step": 149, |
| "tokens/total": 332032, |
| "tokens/train_per_sec_per_gpu": 35.17, |
| "tokens/trainable": 194286 |
| }, |
| { |
| "epoch": 0.6430868167202572, |
| "grad_norm": 0.07053768634796143, |
| "learning_rate": 0.0025, |
| "loss": 6.0246500968933105, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 413.49693, |
| "step": 150, |
| "tokens/total": 334592, |
| "tokens/train_per_sec_per_gpu": 53.06, |
| "tokens/trainable": 195894 |
| }, |
| { |
| "epoch": 0.647374062165059, |
| "grad_norm": 0.10346148163080215, |
| "learning_rate": 0.0025, |
| "loss": 6.111691474914551, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 451.1011, |
| "step": 151, |
| "tokens/total": 336896, |
| "tokens/train_per_sec_per_gpu": 84.27, |
| "tokens/trainable": 197140 |
| }, |
| { |
| "epoch": 0.6516613076098606, |
| "grad_norm": 0.05668232962489128, |
| "learning_rate": 0.0025, |
| "loss": 5.328207015991211, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 206.06817, |
| "step": 152, |
| "tokens/total": 339392, |
| "tokens/train_per_sec_per_gpu": 68.85, |
| "tokens/trainable": 198641 |
| }, |
| { |
| "epoch": 0.6559485530546624, |
| "grad_norm": 0.07267311960458755, |
| "learning_rate": 0.0025, |
| "loss": 5.421064853668213, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 226.11978, |
| "step": 153, |
| "tokens/total": 341312, |
| "tokens/train_per_sec_per_gpu": 108.38, |
| "tokens/trainable": 199572 |
| }, |
| { |
| "epoch": 0.6602357984994641, |
| "grad_norm": 0.05049528181552887, |
| "learning_rate": 0.0025, |
| "loss": 5.701568603515625, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.63, |
| "memory/max_allocated (GiB)": 18.63, |
| "ppl": 299.33657, |
| "step": 154, |
| "tokens/total": 344064, |
| "tokens/train_per_sec_per_gpu": 300.52, |
| "tokens/trainable": 201242 |
| }, |
| { |
| "epoch": 0.6645230439442658, |
| "grad_norm": 0.056077949702739716, |
| "learning_rate": 0.0025, |
| "loss": 5.401850700378418, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 221.81655, |
| "step": 155, |
| "tokens/total": 346432, |
| "tokens/train_per_sec_per_gpu": 258.94, |
| "tokens/trainable": 202788 |
| }, |
| { |
| "epoch": 0.6688102893890675, |
| "grad_norm": 0.11511314660310745, |
| "learning_rate": 0.0025, |
| "loss": 5.723288536071777, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 305.90926, |
| "step": 156, |
| "tokens/total": 348160, |
| "tokens/train_per_sec_per_gpu": 129.3, |
| "tokens/trainable": 203548 |
| }, |
| { |
| "epoch": 0.6730975348338692, |
| "grad_norm": 0.04815061762928963, |
| "learning_rate": 0.0025, |
| "loss": 5.29791259765625, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 199.91906, |
| "step": 157, |
| "tokens/total": 349888, |
| "tokens/train_per_sec_per_gpu": 61.9, |
| "tokens/trainable": 204408 |
| }, |
| { |
| "epoch": 0.677384780278671, |
| "grad_norm": 0.0801524966955185, |
| "learning_rate": 0.0025, |
| "loss": 6.227181911468506, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 506.3266, |
| "step": 158, |
| "tokens/total": 351744, |
| "tokens/train_per_sec_per_gpu": 102.56, |
| "tokens/trainable": 205262 |
| }, |
| { |
| "epoch": 0.6816720257234726, |
| "grad_norm": 0.059293024241924286, |
| "learning_rate": 0.0025, |
| "loss": 5.615620136260986, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 274.68367, |
| "step": 159, |
| "tokens/total": 354176, |
| "tokens/train_per_sec_per_gpu": 31.25, |
| "tokens/trainable": 206828 |
| }, |
| { |
| "epoch": 0.6859592711682744, |
| "grad_norm": 0.07457486540079117, |
| "learning_rate": 0.0025, |
| "loss": 5.283931255340576, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 197.14337, |
| "step": 160, |
| "tokens/total": 356416, |
| "tokens/train_per_sec_per_gpu": 51.94, |
| "tokens/trainable": 208192 |
| }, |
| { |
| "epoch": 0.6902465166130761, |
| "grad_norm": 0.06552717089653015, |
| "learning_rate": 0.0025, |
| "loss": 5.763904571533203, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 318.58986, |
| "step": 161, |
| "tokens/total": 358336, |
| "tokens/train_per_sec_per_gpu": 169.49, |
| "tokens/trainable": 209238 |
| }, |
| { |
| "epoch": 0.6945337620578779, |
| "grad_norm": 0.07754746079444885, |
| "learning_rate": 0.0025, |
| "loss": 5.163079261779785, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 174.70158, |
| "step": 162, |
| "tokens/total": 360576, |
| "tokens/train_per_sec_per_gpu": 17.68, |
| "tokens/trainable": 210560 |
| }, |
| { |
| "epoch": 0.6988210075026795, |
| "grad_norm": 0.11694307625293732, |
| "learning_rate": 0.0025, |
| "loss": 6.16943359375, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 477.91534, |
| "step": 163, |
| "tokens/total": 362560, |
| "tokens/train_per_sec_per_gpu": 6.89, |
| "tokens/trainable": 211646 |
| }, |
| { |
| "epoch": 0.7031082529474812, |
| "grad_norm": 0.3541814684867859, |
| "learning_rate": 0.0025, |
| "loss": 6.192336082458496, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 488.98709, |
| "step": 164, |
| "tokens/total": 364032, |
| "tokens/train_per_sec_per_gpu": 68.77, |
| "tokens/trainable": 212233 |
| }, |
| { |
| "epoch": 0.707395498392283, |
| "grad_norm": 0.0691906213760376, |
| "learning_rate": 0.0025, |
| "loss": 5.444247722625732, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 231.42312, |
| "step": 165, |
| "tokens/total": 366208, |
| "tokens/train_per_sec_per_gpu": 74.88, |
| "tokens/trainable": 213482 |
| }, |
| { |
| "epoch": 0.7116827438370846, |
| "grad_norm": 0.06896678358316422, |
| "learning_rate": 0.0025, |
| "loss": 5.45513391494751, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 233.9562, |
| "step": 166, |
| "tokens/total": 368192, |
| "tokens/train_per_sec_per_gpu": 150.94, |
| "tokens/trainable": 214570 |
| }, |
| { |
| "epoch": 0.7159699892818864, |
| "grad_norm": 0.07068932056427002, |
| "learning_rate": 0.0025, |
| "loss": 5.088237762451172, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 162.10394, |
| "step": 167, |
| "tokens/total": 370432, |
| "tokens/train_per_sec_per_gpu": 37.23, |
| "tokens/trainable": 215958 |
| }, |
| { |
| "epoch": 0.7202572347266881, |
| "grad_norm": 0.07037021219730377, |
| "learning_rate": 0.0025, |
| "loss": 5.780329704284668, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 323.86595, |
| "step": 168, |
| "tokens/total": 373248, |
| "tokens/train_per_sec_per_gpu": 48.79, |
| "tokens/trainable": 217782 |
| }, |
| { |
| "epoch": 0.7245444801714899, |
| "grad_norm": 0.09113272279500961, |
| "learning_rate": 0.0025, |
| "loss": 5.445977210998535, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 231.82371, |
| "step": 169, |
| "tokens/total": 375296, |
| "tokens/train_per_sec_per_gpu": 135.42, |
| "tokens/trainable": 218943 |
| }, |
| { |
| "epoch": 0.7288317256162915, |
| "grad_norm": 0.04421088844537735, |
| "learning_rate": 0.0025, |
| "loss": 5.593143463134766, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 268.57856, |
| "step": 170, |
| "tokens/total": 377280, |
| "tokens/train_per_sec_per_gpu": 32.25, |
| "tokens/trainable": 220097 |
| }, |
| { |
| "epoch": 0.7331189710610932, |
| "grad_norm": 0.07146560400724411, |
| "learning_rate": 0.0025, |
| "loss": 5.347398281097412, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 210.06107, |
| "step": 171, |
| "tokens/total": 379392, |
| "tokens/train_per_sec_per_gpu": 15.36, |
| "tokens/trainable": 221250 |
| }, |
| { |
| "epoch": 0.737406216505895, |
| "grad_norm": 0.058737654238939285, |
| "learning_rate": 0.0025, |
| "loss": 5.430908203125, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.58, |
| "memory/max_allocated (GiB)": 18.58, |
| "ppl": 228.35655, |
| "step": 172, |
| "tokens/total": 382144, |
| "tokens/train_per_sec_per_gpu": 17.26, |
| "tokens/trainable": 223081 |
| }, |
| { |
| "epoch": 0.7416934619506966, |
| "grad_norm": 0.08348660171031952, |
| "learning_rate": 0.0025, |
| "loss": 5.785150527954102, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 325.43102, |
| "step": 173, |
| "tokens/total": 384768, |
| "tokens/train_per_sec_per_gpu": 56.26, |
| "tokens/trainable": 224769 |
| }, |
| { |
| "epoch": 0.7459807073954984, |
| "grad_norm": 0.04298631101846695, |
| "learning_rate": 0.0025, |
| "loss": 5.393362998962402, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 219.94181, |
| "step": 174, |
| "tokens/total": 387136, |
| "tokens/train_per_sec_per_gpu": 37.67, |
| "tokens/trainable": 226216 |
| }, |
| { |
| "epoch": 0.7502679528403001, |
| "grad_norm": 0.08122890442609787, |
| "learning_rate": 0.0025, |
| "loss": 5.101113796234131, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 164.2047, |
| "step": 175, |
| "tokens/total": 388608, |
| "tokens/train_per_sec_per_gpu": 44.01, |
| "tokens/trainable": 226770 |
| }, |
| { |
| "epoch": 0.7545551982851019, |
| "grad_norm": 0.11935891956090927, |
| "learning_rate": 0.0025, |
| "loss": 5.519252300262451, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 249.44845, |
| "step": 176, |
| "tokens/total": 390784, |
| "tokens/train_per_sec_per_gpu": 39.54, |
| "tokens/trainable": 228098 |
| }, |
| { |
| "epoch": 0.7588424437299035, |
| "grad_norm": 0.15656660497188568, |
| "learning_rate": 0.0025, |
| "loss": 4.941908836364746, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 140.0373, |
| "step": 177, |
| "tokens/total": 392512, |
| "tokens/train_per_sec_per_gpu": 36.08, |
| "tokens/trainable": 228809 |
| }, |
| { |
| "epoch": 0.7588424437299035, |
| "eval_loss": 5.460564136505127, |
| "eval_ppl": 235.23009, |
| "eval_runtime": 17.2516, |
| "eval_samples_per_second": 12.057, |
| "eval_steps_per_second": 12.057, |
| "memory/device_reserved (GiB)": 19.56, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.7631296891747053, |
| "grad_norm": 0.05968443304300308, |
| "learning_rate": 0.0025, |
| "loss": 5.653349876403809, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 19.12, |
| "memory/max_allocated (GiB)": 19.12, |
| "ppl": 285.2454, |
| "step": 178, |
| "tokens/total": 396416, |
| "tokens/train_per_sec_per_gpu": 110.71, |
| "tokens/trainable": 231745 |
| }, |
| { |
| "epoch": 0.767416934619507, |
| "grad_norm": 0.11675012111663818, |
| "learning_rate": 0.0025, |
| "loss": 5.658262252807617, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 286.65008, |
| "step": 179, |
| "tokens/total": 399040, |
| "tokens/train_per_sec_per_gpu": 101.59, |
| "tokens/trainable": 233370 |
| }, |
| { |
| "epoch": 0.7717041800643086, |
| "grad_norm": 0.07265754044055939, |
| "learning_rate": 0.0025, |
| "loss": 5.7116546630859375, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 302.37098, |
| "step": 180, |
| "tokens/total": 400896, |
| "tokens/train_per_sec_per_gpu": 5.2, |
| "tokens/trainable": 234357 |
| }, |
| { |
| "epoch": 0.7759914255091104, |
| "grad_norm": 0.06884697079658508, |
| "learning_rate": 0.0025, |
| "loss": 5.63464879989624, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 279.96058, |
| "step": 181, |
| "tokens/total": 403072, |
| "tokens/train_per_sec_per_gpu": 105.0, |
| "tokens/trainable": 235702 |
| }, |
| { |
| "epoch": 0.7802786709539121, |
| "grad_norm": 0.12419719249010086, |
| "learning_rate": 0.0025, |
| "loss": 5.239850997924805, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 188.64199, |
| "step": 182, |
| "tokens/total": 404928, |
| "tokens/train_per_sec_per_gpu": 19.58, |
| "tokens/trainable": 236615 |
| }, |
| { |
| "epoch": 0.7845659163987139, |
| "grad_norm": 0.10391955822706223, |
| "learning_rate": 0.0025, |
| "loss": 4.728728294372559, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 113.15158, |
| "step": 183, |
| "tokens/total": 406272, |
| "tokens/train_per_sec_per_gpu": 24.51, |
| "tokens/trainable": 237082 |
| }, |
| { |
| "epoch": 0.7888531618435155, |
| "grad_norm": 0.07022784650325775, |
| "learning_rate": 0.0025, |
| "loss": 5.611477375030518, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 273.54807, |
| "step": 184, |
| "tokens/total": 408384, |
| "tokens/train_per_sec_per_gpu": 110.02, |
| "tokens/trainable": 238396 |
| }, |
| { |
| "epoch": 0.7931404072883173, |
| "grad_norm": 0.11678767204284668, |
| "learning_rate": 0.0025, |
| "loss": 6.098667144775391, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 445.2639, |
| "step": 185, |
| "tokens/total": 410560, |
| "tokens/train_per_sec_per_gpu": 2.18, |
| "tokens/trainable": 239649 |
| }, |
| { |
| "epoch": 0.797427652733119, |
| "grad_norm": 0.06710375845432281, |
| "learning_rate": 0.0025, |
| "loss": 5.643878936767578, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.65, |
| "memory/max_allocated (GiB)": 18.65, |
| "ppl": 282.55661, |
| "step": 186, |
| "tokens/total": 412800, |
| "tokens/train_per_sec_per_gpu": 12.9, |
| "tokens/trainable": 240972 |
| }, |
| { |
| "epoch": 0.8017148981779206, |
| "grad_norm": 0.09496990591287613, |
| "learning_rate": 0.0025, |
| "loss": 5.7106122970581055, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 302.05596, |
| "step": 187, |
| "tokens/total": 415296, |
| "tokens/train_per_sec_per_gpu": 67.78, |
| "tokens/trainable": 242450 |
| }, |
| { |
| "epoch": 0.8060021436227224, |
| "grad_norm": 0.06658744066953659, |
| "learning_rate": 0.0025, |
| "loss": 5.754177093505859, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 315.50581, |
| "step": 188, |
| "tokens/total": 417856, |
| "tokens/train_per_sec_per_gpu": 14.18, |
| "tokens/trainable": 244076 |
| }, |
| { |
| "epoch": 0.8102893890675241, |
| "grad_norm": 0.054122067987918854, |
| "learning_rate": 0.0025, |
| "loss": 5.310729026794434, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 202.4978, |
| "step": 189, |
| "tokens/total": 419584, |
| "tokens/train_per_sec_per_gpu": 11.57, |
| "tokens/trainable": 244832 |
| }, |
| { |
| "epoch": 0.8145766345123259, |
| "grad_norm": 0.11224393546581268, |
| "learning_rate": 0.0025, |
| "loss": 5.699178695678711, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 298.62204, |
| "step": 190, |
| "tokens/total": 420928, |
| "tokens/train_per_sec_per_gpu": 8.17, |
| "tokens/trainable": 245280 |
| }, |
| { |
| "epoch": 0.8188638799571275, |
| "grad_norm": 0.05273159593343735, |
| "learning_rate": 0.0025, |
| "loss": 5.184141159057617, |
| "memory/device_reserved (GiB)": 19.59, |
| "memory/max_active (GiB)": 18.73, |
| "memory/max_allocated (GiB)": 18.73, |
| "ppl": 178.42015, |
| "step": 191, |
| "tokens/total": 424000, |
| "tokens/train_per_sec_per_gpu": 53.69, |
| "tokens/trainable": 247590 |
| }, |
| { |
| "epoch": 0.8231511254019293, |
| "grad_norm": 0.03733557090163231, |
| "learning_rate": 0.0025, |
| "loss": 5.854153633117676, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 19.12, |
| "memory/max_allocated (GiB)": 19.12, |
| "ppl": 348.67966, |
| "step": 192, |
| "tokens/total": 428480, |
| "tokens/train_per_sec_per_gpu": 52.36, |
| "tokens/trainable": 251024 |
| }, |
| { |
| "epoch": 0.827438370846731, |
| "grad_norm": 0.06321356445550919, |
| "learning_rate": 0.0025, |
| "loss": 5.20097017288208, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 181.44819, |
| "step": 193, |
| "tokens/total": 430656, |
| "tokens/train_per_sec_per_gpu": 1.74, |
| "tokens/trainable": 252277 |
| }, |
| { |
| "epoch": 0.8317256162915327, |
| "grad_norm": 0.1052091047167778, |
| "learning_rate": 0.0025, |
| "loss": 4.629343509674072, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 102.44679, |
| "step": 194, |
| "tokens/total": 432064, |
| "tokens/train_per_sec_per_gpu": 21.14, |
| "tokens/trainable": 252834 |
| }, |
| { |
| "epoch": 0.8360128617363344, |
| "grad_norm": 0.06142156571149826, |
| "learning_rate": 0.0025, |
| "loss": 5.491484642028809, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 242.61714, |
| "step": 195, |
| "tokens/total": 434624, |
| "tokens/train_per_sec_per_gpu": 23.29, |
| "tokens/trainable": 254420 |
| }, |
| { |
| "epoch": 0.8403001071811361, |
| "grad_norm": 0.05292431265115738, |
| "learning_rate": 0.0025, |
| "loss": 5.481139183044434, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.62, |
| "memory/max_allocated (GiB)": 18.62, |
| "ppl": 240.12009, |
| "step": 196, |
| "tokens/total": 437440, |
| "tokens/train_per_sec_per_gpu": 52.86, |
| "tokens/trainable": 256271 |
| }, |
| { |
| "epoch": 0.8445873526259379, |
| "grad_norm": 0.0795208215713501, |
| "learning_rate": 0.0025, |
| "loss": 5.259885787963867, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 192.45951, |
| "step": 197, |
| "tokens/total": 439744, |
| "tokens/train_per_sec_per_gpu": 64.52, |
| "tokens/trainable": 257576 |
| }, |
| { |
| "epoch": 0.8488745980707395, |
| "grad_norm": 0.08473947644233704, |
| "learning_rate": 0.0025, |
| "loss": 5.696974754333496, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 297.96462, |
| "step": 198, |
| "tokens/total": 442816, |
| "tokens/train_per_sec_per_gpu": 36.19, |
| "tokens/trainable": 259567 |
| }, |
| { |
| "epoch": 0.8531618435155413, |
| "grad_norm": 0.057201892137527466, |
| "learning_rate": 0.0025, |
| "loss": 5.780117034912109, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.62, |
| "memory/max_allocated (GiB)": 18.62, |
| "ppl": 323.79708, |
| "step": 199, |
| "tokens/total": 445888, |
| "tokens/train_per_sec_per_gpu": 81.99, |
| "tokens/trainable": 261637 |
| }, |
| { |
| "epoch": 0.857449088960343, |
| "grad_norm": 0.09185982495546341, |
| "learning_rate": 0.0025, |
| "loss": 6.029691219329834, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 415.58668, |
| "step": 200, |
| "tokens/total": 449408, |
| "tokens/train_per_sec_per_gpu": 163.44, |
| "tokens/trainable": 264161 |
| }, |
| { |
| "epoch": 0.8617363344051447, |
| "grad_norm": 0.055811040103435516, |
| "learning_rate": 0.0025, |
| "loss": 5.37666654586792, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 216.30005, |
| "step": 201, |
| "tokens/total": 452032, |
| "tokens/train_per_sec_per_gpu": 14.76, |
| "tokens/trainable": 265933 |
| }, |
| { |
| "epoch": 0.8660235798499464, |
| "grad_norm": 0.06049516424536705, |
| "learning_rate": 0.0025, |
| "loss": 5.930117130279541, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 376.19858, |
| "step": 202, |
| "tokens/total": 455232, |
| "tokens/train_per_sec_per_gpu": 34.59, |
| "tokens/trainable": 268176 |
| }, |
| { |
| "epoch": 0.8703108252947481, |
| "grad_norm": 0.04760754853487015, |
| "learning_rate": 0.0025, |
| "loss": 5.988245487213135, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 398.71445, |
| "step": 203, |
| "tokens/total": 457792, |
| "tokens/train_per_sec_per_gpu": 23.31, |
| "tokens/trainable": 269826 |
| }, |
| { |
| "epoch": 0.8745980707395499, |
| "grad_norm": 0.3524979054927826, |
| "learning_rate": 0.0025, |
| "loss": 5.5907745361328125, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 267.94307, |
| "step": 204, |
| "tokens/total": 460224, |
| "tokens/train_per_sec_per_gpu": 112.76, |
| "tokens/trainable": 271226 |
| }, |
| { |
| "epoch": 0.8788853161843515, |
| "grad_norm": 0.07816269993782043, |
| "learning_rate": 0.0025, |
| "loss": 5.5978217124938965, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 269.83798, |
| "step": 205, |
| "tokens/total": 463232, |
| "tokens/train_per_sec_per_gpu": 83.11, |
| "tokens/trainable": 273301 |
| }, |
| { |
| "epoch": 0.8831725616291533, |
| "grad_norm": 0.06009744852781296, |
| "learning_rate": 0.0025, |
| "loss": 5.7178802490234375, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 304.25928, |
| "step": 206, |
| "tokens/total": 464960, |
| "tokens/train_per_sec_per_gpu": 16.0, |
| "tokens/trainable": 274259 |
| }, |
| { |
| "epoch": 0.887459807073955, |
| "grad_norm": 0.09694831818342209, |
| "learning_rate": 0.0025, |
| "loss": 5.388084888458252, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 218.78399, |
| "step": 207, |
| "tokens/total": 467008, |
| "tokens/train_per_sec_per_gpu": 34.61, |
| "tokens/trainable": 275392 |
| }, |
| { |
| "epoch": 0.8917470525187567, |
| "grad_norm": 0.06148134917020798, |
| "learning_rate": 0.0025, |
| "loss": 5.084897041320801, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 161.5633, |
| "step": 208, |
| "tokens/total": 468608, |
| "tokens/train_per_sec_per_gpu": 9.12, |
| "tokens/trainable": 276045 |
| }, |
| { |
| "epoch": 0.8960342979635584, |
| "grad_norm": 0.09476284682750702, |
| "learning_rate": 0.0025, |
| "loss": 5.941885471343994, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.6, |
| "memory/max_allocated (GiB)": 18.6, |
| "ppl": 380.65196, |
| "step": 209, |
| "tokens/total": 470912, |
| "tokens/train_per_sec_per_gpu": 31.97, |
| "tokens/trainable": 277484 |
| }, |
| { |
| "epoch": 0.9003215434083601, |
| "grad_norm": 0.07026304304599762, |
| "learning_rate": 0.0025, |
| "loss": 5.341467380523682, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.7, |
| "memory/max_allocated (GiB)": 18.7, |
| "ppl": 208.8189, |
| "step": 210, |
| "tokens/total": 473920, |
| "tokens/train_per_sec_per_gpu": 114.73, |
| "tokens/trainable": 279470 |
| }, |
| { |
| "epoch": 0.9046087888531619, |
| "grad_norm": 0.046190474182367325, |
| "learning_rate": 0.0025, |
| "loss": 5.205893516540527, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 182.34373, |
| "step": 211, |
| "tokens/total": 476480, |
| "tokens/train_per_sec_per_gpu": 112.77, |
| "tokens/trainable": 280947 |
| }, |
| { |
| "epoch": 0.9088960342979635, |
| "grad_norm": 0.06688795238733292, |
| "learning_rate": 0.0025, |
| "loss": 4.937982559204102, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 139.48856, |
| "step": 212, |
| "tokens/total": 478016, |
| "tokens/train_per_sec_per_gpu": 101.68, |
| "tokens/trainable": 281612 |
| }, |
| { |
| "epoch": 0.9131832797427653, |
| "grad_norm": 0.07849342375993729, |
| "learning_rate": 0.0025, |
| "loss": 5.182241439819336, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 178.08152, |
| "step": 213, |
| "tokens/total": 479680, |
| "tokens/train_per_sec_per_gpu": 150.05, |
| "tokens/trainable": 282478 |
| }, |
| { |
| "epoch": 0.917470525187567, |
| "grad_norm": 0.05929256230592728, |
| "learning_rate": 0.0025, |
| "loss": 4.967282295227051, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.42, |
| "memory/max_allocated (GiB)": 18.42, |
| "ppl": 143.636, |
| "step": 214, |
| "tokens/total": 481728, |
| "tokens/train_per_sec_per_gpu": 128.72, |
| "tokens/trainable": 283641 |
| }, |
| { |
| "epoch": 0.9217577706323687, |
| "grad_norm": 0.07487839460372925, |
| "learning_rate": 0.0025, |
| "loss": 5.071913719177246, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 159.47923, |
| "step": 215, |
| "tokens/total": 483200, |
| "tokens/train_per_sec_per_gpu": 13.64, |
| "tokens/trainable": 284253 |
| }, |
| { |
| "epoch": 0.9260450160771704, |
| "grad_norm": 0.08760891854763031, |
| "learning_rate": 0.0025, |
| "loss": 5.086419582366943, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 161.80948, |
| "step": 216, |
| "tokens/total": 485184, |
| "tokens/train_per_sec_per_gpu": 33.77, |
| "tokens/trainable": 285350 |
| }, |
| { |
| "epoch": 0.9303322615219721, |
| "grad_norm": 0.058040693402290344, |
| "learning_rate": 0.0025, |
| "loss": 5.140557289123535, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 170.81093, |
| "step": 217, |
| "tokens/total": 487488, |
| "tokens/train_per_sec_per_gpu": 148.26, |
| "tokens/trainable": 286725 |
| }, |
| { |
| "epoch": 0.9346195069667739, |
| "grad_norm": 0.051069747656583786, |
| "learning_rate": 0.0025, |
| "loss": 5.112553119659424, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 166.09387, |
| "step": 218, |
| "tokens/total": 490176, |
| "tokens/train_per_sec_per_gpu": 7.43, |
| "tokens/trainable": 288433 |
| }, |
| { |
| "epoch": 0.9389067524115756, |
| "grad_norm": 0.04658494517207146, |
| "learning_rate": 0.0025, |
| "loss": 5.061060428619385, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 157.75772, |
| "step": 219, |
| "tokens/total": 492288, |
| "tokens/train_per_sec_per_gpu": 18.83, |
| "tokens/trainable": 289637 |
| }, |
| { |
| "epoch": 0.9431939978563773, |
| "grad_norm": 0.058834441006183624, |
| "learning_rate": 0.0025, |
| "loss": 5.130789756774902, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 169.15065, |
| "step": 220, |
| "tokens/total": 494848, |
| "tokens/train_per_sec_per_gpu": 137.69, |
| "tokens/trainable": 291132 |
| }, |
| { |
| "epoch": 0.947481243301179, |
| "grad_norm": 0.10351614654064178, |
| "learning_rate": 0.0025, |
| "loss": 5.497127532958984, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 243.99007, |
| "step": 221, |
| "tokens/total": 497088, |
| "tokens/train_per_sec_per_gpu": 195.46, |
| "tokens/trainable": 292518 |
| }, |
| { |
| "epoch": 0.9517684887459807, |
| "grad_norm": 0.14364013075828552, |
| "learning_rate": 0.0025, |
| "loss": 5.386436939239502, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 218.42374, |
| "step": 222, |
| "tokens/total": 499520, |
| "tokens/train_per_sec_per_gpu": 4.31, |
| "tokens/trainable": 293866 |
| }, |
| { |
| "epoch": 0.9560557341907824, |
| "grad_norm": 0.06514472514390945, |
| "learning_rate": 0.0025, |
| "loss": 5.5871148109436035, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 266.96427, |
| "step": 223, |
| "tokens/total": 502208, |
| "tokens/train_per_sec_per_gpu": 19.56, |
| "tokens/trainable": 295684 |
| }, |
| { |
| "epoch": 0.9603429796355841, |
| "grad_norm": 0.05746331810951233, |
| "learning_rate": 0.0025, |
| "loss": 5.019771099090576, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.57, |
| "memory/max_allocated (GiB)": 18.57, |
| "ppl": 151.37665, |
| "step": 224, |
| "tokens/total": 504576, |
| "tokens/train_per_sec_per_gpu": 133.41, |
| "tokens/trainable": 297180 |
| }, |
| { |
| "epoch": 0.9646302250803859, |
| "grad_norm": 0.06428291648626328, |
| "learning_rate": 0.0025, |
| "loss": 5.374805450439453, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.77, |
| "memory/max_allocated (GiB)": 18.77, |
| "ppl": 215.89787, |
| "step": 225, |
| "tokens/total": 508224, |
| "tokens/train_per_sec_per_gpu": 174.74, |
| "tokens/trainable": 299827 |
| }, |
| { |
| "epoch": 0.9689174705251876, |
| "grad_norm": 0.06595566868782043, |
| "learning_rate": 0.0025, |
| "loss": 5.579074859619141, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 264.82649, |
| "step": 226, |
| "tokens/total": 509824, |
| "tokens/train_per_sec_per_gpu": 6.15, |
| "tokens/trainable": 300582 |
| }, |
| { |
| "epoch": 0.9732047159699893, |
| "grad_norm": 0.06129618361592293, |
| "learning_rate": 0.0025, |
| "loss": 4.87337064743042, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.67, |
| "memory/max_allocated (GiB)": 18.67, |
| "ppl": 130.76092, |
| "step": 227, |
| "tokens/total": 512768, |
| "tokens/train_per_sec_per_gpu": 14.39, |
| "tokens/trainable": 302643 |
| }, |
| { |
| "epoch": 0.977491961414791, |
| "grad_norm": 0.07292018085718155, |
| "learning_rate": 0.0025, |
| "loss": 5.493099212646484, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.55, |
| "memory/max_allocated (GiB)": 18.55, |
| "ppl": 243.00918, |
| "step": 228, |
| "tokens/total": 515328, |
| "tokens/train_per_sec_per_gpu": 8.6, |
| "tokens/trainable": 304358 |
| }, |
| { |
| "epoch": 0.9817792068595927, |
| "grad_norm": 0.04551401734352112, |
| "learning_rate": 0.0025, |
| "loss": 5.522790908813477, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.54, |
| "memory/max_allocated (GiB)": 18.54, |
| "ppl": 250.33272, |
| "step": 229, |
| "tokens/total": 518144, |
| "tokens/train_per_sec_per_gpu": 36.31, |
| "tokens/trainable": 306242 |
| }, |
| { |
| "epoch": 0.9860664523043944, |
| "grad_norm": 0.08372899889945984, |
| "learning_rate": 0.0025, |
| "loss": 5.140377998352051, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 170.78031, |
| "step": 230, |
| "tokens/total": 520000, |
| "tokens/train_per_sec_per_gpu": 20.62, |
| "tokens/trainable": 307298 |
| }, |
| { |
| "epoch": 0.9903536977491961, |
| "grad_norm": 0.0876646488904953, |
| "learning_rate": 0.0025, |
| "loss": 5.655140399932861, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.96, |
| "memory/max_allocated (GiB)": 18.96, |
| "ppl": 285.7566, |
| "step": 231, |
| "tokens/total": 523456, |
| "tokens/train_per_sec_per_gpu": 176.78, |
| "tokens/trainable": 309825 |
| }, |
| { |
| "epoch": 0.9946409431939979, |
| "grad_norm": 0.11478639394044876, |
| "learning_rate": 0.0025, |
| "loss": 5.731156826019287, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 308.32574, |
| "step": 232, |
| "tokens/total": 524928, |
| "tokens/train_per_sec_per_gpu": 73.31, |
| "tokens/trainable": 310402 |
| }, |
| { |
| "epoch": 0.9989281886387996, |
| "grad_norm": 0.05332854762673378, |
| "learning_rate": 0.0025, |
| "loss": 4.958339214324951, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 142.35717, |
| "step": 233, |
| "tokens/total": 527040, |
| "tokens/train_per_sec_per_gpu": 168.24, |
| "tokens/trainable": 311695 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.09713005274534225, |
| "learning_rate": 0.0025, |
| "loss": 4.436938285827637, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 84.51578, |
| "step": 234, |
| "tokens/total": 527680, |
| "tokens/train_per_sec_per_gpu": 682.75, |
| "tokens/trainable": 312130 |
| }, |
| { |
| "epoch": 1.0042872454448017, |
| "grad_norm": 0.05925685912370682, |
| "learning_rate": 0.0025, |
| "loss": 4.939643859863281, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 139.72048, |
| "step": 235, |
| "tokens/total": 529920, |
| "tokens/train_per_sec_per_gpu": 77.24, |
| "tokens/trainable": 313388 |
| }, |
| { |
| "epoch": 1.0085744908896035, |
| "grad_norm": 0.09591115266084671, |
| "learning_rate": 0.0025, |
| "loss": 5.589860916137695, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 267.69838, |
| "step": 236, |
| "tokens/total": 531648, |
| "tokens/train_per_sec_per_gpu": 11.46, |
| "tokens/trainable": 314241 |
| }, |
| { |
| "epoch": 1.0085744908896035, |
| "eval_loss": 5.21028470993042, |
| "eval_ppl": 183.14619, |
| "eval_runtime": 17.0244, |
| "eval_samples_per_second": 12.218, |
| "eval_steps_per_second": 12.218, |
| "memory/device_reserved (GiB)": 19.84, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.0128617363344052, |
| "grad_norm": 0.07797813415527344, |
| "learning_rate": 0.0025, |
| "loss": 5.490115165710449, |
| "memory/device_reserved (GiB)": 18.62, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 242.28511, |
| "step": 237, |
| "tokens/total": 533760, |
| "tokens/train_per_sec_per_gpu": 66.73, |
| "tokens/trainable": 315528 |
| }, |
| { |
| "epoch": 1.0171489817792068, |
| "grad_norm": 0.058185361325740814, |
| "learning_rate": 0.0025, |
| "loss": 5.693612098693848, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.75, |
| "memory/max_allocated (GiB)": 18.75, |
| "ppl": 296.96435, |
| "step": 238, |
| "tokens/total": 536832, |
| "tokens/train_per_sec_per_gpu": 6.22, |
| "tokens/trainable": 317658 |
| }, |
| { |
| "epoch": 1.0214362272240085, |
| "grad_norm": 0.06836100667715073, |
| "learning_rate": 0.0025, |
| "loss": 5.170307159423828, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 175.96888, |
| "step": 239, |
| "tokens/total": 539648, |
| "tokens/train_per_sec_per_gpu": 124.58, |
| "tokens/trainable": 319553 |
| }, |
| { |
| "epoch": 1.0257234726688103, |
| "grad_norm": 0.063715860247612, |
| "learning_rate": 0.0025, |
| "loss": 5.572962760925293, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 263.21278, |
| "step": 240, |
| "tokens/total": 542144, |
| "tokens/train_per_sec_per_gpu": 16.68, |
| "tokens/trainable": 321099 |
| }, |
| { |
| "epoch": 1.030010718113612, |
| "grad_norm": 0.04595501348376274, |
| "learning_rate": 0.0025, |
| "loss": 5.575761795043945, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 263.95056, |
| "step": 241, |
| "tokens/total": 544768, |
| "tokens/train_per_sec_per_gpu": 42.37, |
| "tokens/trainable": 322797 |
| }, |
| { |
| "epoch": 1.0342979635584137, |
| "grad_norm": 0.043023187667131424, |
| "learning_rate": 0.0025, |
| "loss": 5.4010138511657715, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 221.631, |
| "step": 242, |
| "tokens/total": 547392, |
| "tokens/train_per_sec_per_gpu": 64.52, |
| "tokens/trainable": 324497 |
| }, |
| { |
| "epoch": 1.0385852090032155, |
| "grad_norm": 0.10789299011230469, |
| "learning_rate": 0.0025, |
| "loss": 5.370054244995117, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 214.87452, |
| "step": 243, |
| "tokens/total": 548736, |
| "tokens/train_per_sec_per_gpu": 3.01, |
| "tokens/trainable": 324950 |
| }, |
| { |
| "epoch": 1.0428724544480172, |
| "grad_norm": 0.043591853231191635, |
| "learning_rate": 0.0025, |
| "loss": 5.297659397125244, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 199.86845, |
| "step": 244, |
| "tokens/total": 551872, |
| "tokens/train_per_sec_per_gpu": 99.52, |
| "tokens/trainable": 327123 |
| }, |
| { |
| "epoch": 1.0471596998928188, |
| "grad_norm": 0.07090502977371216, |
| "learning_rate": 0.0025, |
| "loss": 5.015192985534668, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 150.68521, |
| "step": 245, |
| "tokens/total": 554048, |
| "tokens/train_per_sec_per_gpu": 4.32, |
| "tokens/trainable": 328432 |
| }, |
| { |
| "epoch": 1.0514469453376205, |
| "grad_norm": 0.06152981519699097, |
| "learning_rate": 0.0025, |
| "loss": 4.667083740234375, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 106.38704, |
| "step": 246, |
| "tokens/total": 555712, |
| "tokens/train_per_sec_per_gpu": 45.68, |
| "tokens/trainable": 329309 |
| }, |
| { |
| "epoch": 1.0557341907824223, |
| "grad_norm": 0.06200568005442619, |
| "learning_rate": 0.0025, |
| "loss": 5.22236967086792, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 185.37294, |
| "step": 247, |
| "tokens/total": 557632, |
| "tokens/train_per_sec_per_gpu": 6.58, |
| "tokens/trainable": 330316 |
| }, |
| { |
| "epoch": 1.060021436227224, |
| "grad_norm": 0.0687415823340416, |
| "learning_rate": 0.0025, |
| "loss": 4.802757740020752, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 121.84597, |
| "step": 248, |
| "tokens/total": 560320, |
| "tokens/train_per_sec_per_gpu": 18.48, |
| "tokens/trainable": 331985 |
| }, |
| { |
| "epoch": 1.0643086816720257, |
| "grad_norm": 0.07006296515464783, |
| "learning_rate": 0.0025, |
| "loss": 5.4001383781433105, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 221.43706, |
| "step": 249, |
| "tokens/total": 562304, |
| "tokens/train_per_sec_per_gpu": 15.87, |
| "tokens/trainable": 333042 |
| }, |
| { |
| "epoch": 1.0685959271168275, |
| "grad_norm": 0.057536471635103226, |
| "learning_rate": 0.0025, |
| "loss": 4.920461654663086, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 137.06588, |
| "step": 250, |
| "tokens/total": 564096, |
| "tokens/train_per_sec_per_gpu": 97.41, |
| "tokens/trainable": 334009 |
| }, |
| { |
| "epoch": 1.0728831725616292, |
| "grad_norm": 0.2622619867324829, |
| "learning_rate": 0.0025, |
| "loss": 6.004157543182373, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.38, |
| "memory/max_allocated (GiB)": 18.38, |
| "ppl": 405.10956, |
| "step": 251, |
| "tokens/total": 565376, |
| "tokens/train_per_sec_per_gpu": 11.03, |
| "tokens/trainable": 334450 |
| }, |
| { |
| "epoch": 1.077170418006431, |
| "grad_norm": 0.08943215012550354, |
| "learning_rate": 0.0025, |
| "loss": 5.477893829345703, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.62, |
| "memory/max_allocated (GiB)": 18.62, |
| "ppl": 239.34208, |
| "step": 252, |
| "tokens/total": 567744, |
| "tokens/train_per_sec_per_gpu": 28.99, |
| "tokens/trainable": 335890 |
| }, |
| { |
| "epoch": 1.0814576634512325, |
| "grad_norm": 0.05865178629755974, |
| "learning_rate": 0.0025, |
| "loss": 4.780120849609375, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 119.11874, |
| "step": 253, |
| "tokens/total": 570240, |
| "tokens/train_per_sec_per_gpu": 32.08, |
| "tokens/trainable": 337404 |
| }, |
| { |
| "epoch": 1.0857449088960343, |
| "grad_norm": 0.08746153116226196, |
| "learning_rate": 0.0025, |
| "loss": 5.729345798492432, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 307.76786, |
| "step": 254, |
| "tokens/total": 572416, |
| "tokens/train_per_sec_per_gpu": 37.32, |
| "tokens/trainable": 338644 |
| }, |
| { |
| "epoch": 1.090032154340836, |
| "grad_norm": 0.0820365622639656, |
| "learning_rate": 0.0025, |
| "loss": 4.818498611450195, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 123.77911, |
| "step": 255, |
| "tokens/total": 574144, |
| "tokens/train_per_sec_per_gpu": 204.86, |
| "tokens/trainable": 339546 |
| }, |
| { |
| "epoch": 1.0943193997856377, |
| "grad_norm": 0.05401737242937088, |
| "learning_rate": 0.0025, |
| "loss": 4.729460716247559, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 113.23448, |
| "step": 256, |
| "tokens/total": 576448, |
| "tokens/train_per_sec_per_gpu": 62.54, |
| "tokens/trainable": 340927 |
| }, |
| { |
| "epoch": 1.0986066452304395, |
| "grad_norm": 0.08213179558515549, |
| "learning_rate": 0.0025, |
| "loss": 4.536296844482422, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 93.34449, |
| "step": 257, |
| "tokens/total": 577856, |
| "tokens/train_per_sec_per_gpu": 25.55, |
| "tokens/trainable": 341479 |
| }, |
| { |
| "epoch": 1.1028938906752412, |
| "grad_norm": 0.09325698018074036, |
| "learning_rate": 0.0025, |
| "loss": 6.08807897567749, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.58, |
| "memory/max_allocated (GiB)": 18.58, |
| "ppl": 440.57424, |
| "step": 258, |
| "tokens/total": 580928, |
| "tokens/train_per_sec_per_gpu": 30.01, |
| "tokens/trainable": 343497 |
| }, |
| { |
| "epoch": 1.107181136120043, |
| "grad_norm": 0.06686203926801682, |
| "learning_rate": 0.0025, |
| "loss": 5.119194984436035, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 167.20072, |
| "step": 259, |
| "tokens/total": 583040, |
| "tokens/train_per_sec_per_gpu": 112.68, |
| "tokens/trainable": 344661 |
| }, |
| { |
| "epoch": 1.1114683815648445, |
| "grad_norm": 0.08321559429168701, |
| "learning_rate": 0.0025, |
| "loss": 5.304489612579346, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 201.23827, |
| "step": 260, |
| "tokens/total": 585472, |
| "tokens/train_per_sec_per_gpu": 2.12, |
| "tokens/trainable": 346087 |
| }, |
| { |
| "epoch": 1.1157556270096463, |
| "grad_norm": 0.05973471328616142, |
| "learning_rate": 0.0025, |
| "loss": 5.243081092834473, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.42, |
| "memory/max_allocated (GiB)": 18.42, |
| "ppl": 189.25231, |
| "step": 261, |
| "tokens/total": 587520, |
| "tokens/train_per_sec_per_gpu": 118.72, |
| "tokens/trainable": 347244 |
| }, |
| { |
| "epoch": 1.120042872454448, |
| "grad_norm": 0.06643401831388474, |
| "learning_rate": 0.0025, |
| "loss": 4.716557025909424, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.6, |
| "memory/max_allocated (GiB)": 18.6, |
| "ppl": 111.78272, |
| "step": 262, |
| "tokens/total": 589504, |
| "tokens/train_per_sec_per_gpu": 7.82, |
| "tokens/trainable": 348354 |
| }, |
| { |
| "epoch": 1.1243301178992497, |
| "grad_norm": 0.05456831306219101, |
| "learning_rate": 0.0025, |
| "loss": 5.202316761016846, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 181.69269, |
| "step": 263, |
| "tokens/total": 592448, |
| "tokens/train_per_sec_per_gpu": 247.8, |
| "tokens/trainable": 350338 |
| }, |
| { |
| "epoch": 1.1286173633440515, |
| "grad_norm": 0.05614905431866646, |
| "learning_rate": 0.0025, |
| "loss": 4.99215030670166, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 147.25272, |
| "step": 264, |
| "tokens/total": 594432, |
| "tokens/train_per_sec_per_gpu": 2.18, |
| "tokens/trainable": 351396 |
| }, |
| { |
| "epoch": 1.1329046087888532, |
| "grad_norm": 0.06128396466374397, |
| "learning_rate": 0.0025, |
| "loss": 4.919600009918213, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 136.94782, |
| "step": 265, |
| "tokens/total": 596864, |
| "tokens/train_per_sec_per_gpu": 143.33, |
| "tokens/trainable": 352905 |
| }, |
| { |
| "epoch": 1.137191854233655, |
| "grad_norm": 0.09635547548532486, |
| "learning_rate": 0.0025, |
| "loss": 4.9075703620910645, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 135.31026, |
| "step": 266, |
| "tokens/total": 598784, |
| "tokens/train_per_sec_per_gpu": 95.18, |
| "tokens/trainable": 353980 |
| }, |
| { |
| "epoch": 1.1414790996784565, |
| "grad_norm": 0.07255875319242477, |
| "learning_rate": 0.0025, |
| "loss": 5.526827812194824, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 251.34533, |
| "step": 267, |
| "tokens/total": 600704, |
| "tokens/train_per_sec_per_gpu": 208.93, |
| "tokens/trainable": 355034 |
| }, |
| { |
| "epoch": 1.1457663451232583, |
| "grad_norm": 0.07017937302589417, |
| "learning_rate": 0.0025, |
| "loss": 5.108213424682617, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 165.37464, |
| "step": 268, |
| "tokens/total": 603136, |
| "tokens/train_per_sec_per_gpu": 169.95, |
| "tokens/trainable": 356502 |
| }, |
| { |
| "epoch": 1.15005359056806, |
| "grad_norm": 0.05392616242170334, |
| "learning_rate": 0.0025, |
| "loss": 4.911001205444336, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 135.77529, |
| "step": 269, |
| "tokens/total": 605696, |
| "tokens/train_per_sec_per_gpu": 98.87, |
| "tokens/trainable": 358164 |
| }, |
| { |
| "epoch": 1.1543408360128617, |
| "grad_norm": 0.06459183990955353, |
| "learning_rate": 0.0025, |
| "loss": 5.657564640045166, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.67, |
| "memory/max_allocated (GiB)": 18.67, |
| "ppl": 286.45018, |
| "step": 270, |
| "tokens/total": 608256, |
| "tokens/train_per_sec_per_gpu": 34.31, |
| "tokens/trainable": 359715 |
| }, |
| { |
| "epoch": 1.1586280814576635, |
| "grad_norm": 0.05815054103732109, |
| "learning_rate": 0.0025, |
| "loss": 5.023680210113525, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 151.96956, |
| "step": 271, |
| "tokens/total": 610752, |
| "tokens/train_per_sec_per_gpu": 256.08, |
| "tokens/trainable": 361290 |
| }, |
| { |
| "epoch": 1.1629153269024652, |
| "grad_norm": 0.08935742825269699, |
| "learning_rate": 0.0025, |
| "loss": 5.135861396789551, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 170.0107, |
| "step": 272, |
| "tokens/total": 612480, |
| "tokens/train_per_sec_per_gpu": 3.04, |
| "tokens/trainable": 362007 |
| }, |
| { |
| "epoch": 1.167202572347267, |
| "grad_norm": 0.07162267714738846, |
| "learning_rate": 0.0025, |
| "loss": 4.85862922668457, |
| "memory/device_reserved (GiB)": 19.05, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 128.84746, |
| "step": 273, |
| "tokens/total": 614144, |
| "tokens/train_per_sec_per_gpu": 10.47, |
| "tokens/trainable": 362762 |
| }, |
| { |
| "epoch": 1.1714898177920685, |
| "grad_norm": 0.048264019191265106, |
| "learning_rate": 0.0025, |
| "loss": 5.584687232971191, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.85, |
| "memory/max_allocated (GiB)": 18.85, |
| "ppl": 266.31697, |
| "step": 274, |
| "tokens/total": 617792, |
| "tokens/train_per_sec_per_gpu": 37.41, |
| "tokens/trainable": 365433 |
| }, |
| { |
| "epoch": 1.1757770632368703, |
| "grad_norm": 0.05624736472964287, |
| "learning_rate": 0.0025, |
| "loss": 5.056115627288818, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 156.97956, |
| "step": 275, |
| "tokens/total": 619712, |
| "tokens/train_per_sec_per_gpu": 177.31, |
| "tokens/trainable": 366496 |
| }, |
| { |
| "epoch": 1.180064308681672, |
| "grad_norm": 0.0889284759759903, |
| "learning_rate": 0.0025, |
| "loss": 5.753776550292969, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 315.37946, |
| "step": 276, |
| "tokens/total": 621248, |
| "tokens/train_per_sec_per_gpu": 30.13, |
| "tokens/trainable": 367083 |
| }, |
| { |
| "epoch": 1.1843515541264737, |
| "grad_norm": 0.07002771645784378, |
| "learning_rate": 0.0025, |
| "loss": 5.0761637687683105, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 160.15847, |
| "step": 277, |
| "tokens/total": 623552, |
| "tokens/train_per_sec_per_gpu": 124.59, |
| "tokens/trainable": 368541 |
| }, |
| { |
| "epoch": 1.1886387995712755, |
| "grad_norm": 0.04624473676085472, |
| "learning_rate": 0.0025, |
| "loss": 5.2077860832214355, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 182.68915, |
| "step": 278, |
| "tokens/total": 625600, |
| "tokens/train_per_sec_per_gpu": 20.49, |
| "tokens/trainable": 369666 |
| }, |
| { |
| "epoch": 1.1929260450160772, |
| "grad_norm": 0.08523814380168915, |
| "learning_rate": 0.0025, |
| "loss": 4.794674873352051, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 120.86508, |
| "step": 279, |
| "tokens/total": 627008, |
| "tokens/train_per_sec_per_gpu": 6.56, |
| "tokens/trainable": 370220 |
| }, |
| { |
| "epoch": 1.197213290460879, |
| "grad_norm": 0.04043371230363846, |
| "learning_rate": 0.0025, |
| "loss": 4.8553667068481445, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.6, |
| "memory/max_allocated (GiB)": 18.6, |
| "ppl": 128.42778, |
| "step": 280, |
| "tokens/total": 629824, |
| "tokens/train_per_sec_per_gpu": 277.58, |
| "tokens/trainable": 372088 |
| }, |
| { |
| "epoch": 1.2015005359056805, |
| "grad_norm": 0.05826675891876221, |
| "learning_rate": 0.0025, |
| "loss": 5.2880730628967285, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 197.9616, |
| "step": 281, |
| "tokens/total": 632128, |
| "tokens/train_per_sec_per_gpu": 58.45, |
| "tokens/trainable": 373418 |
| }, |
| { |
| "epoch": 1.2057877813504823, |
| "grad_norm": 0.055210962891578674, |
| "learning_rate": 0.0025, |
| "loss": 5.282061576843262, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 196.77512, |
| "step": 282, |
| "tokens/total": 634880, |
| "tokens/train_per_sec_per_gpu": 43.45, |
| "tokens/trainable": 375160 |
| }, |
| { |
| "epoch": 1.210075026795284, |
| "grad_norm": 0.05953294038772583, |
| "learning_rate": 0.0025, |
| "loss": 4.843015193939209, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 126.85126, |
| "step": 283, |
| "tokens/total": 637120, |
| "tokens/train_per_sec_per_gpu": 11.03, |
| "tokens/trainable": 376400 |
| }, |
| { |
| "epoch": 1.2143622722400857, |
| "grad_norm": 0.09921937435865402, |
| "learning_rate": 0.0025, |
| "loss": 5.3062543869018555, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.68, |
| "memory/max_allocated (GiB)": 18.68, |
| "ppl": 201.59372, |
| "step": 284, |
| "tokens/total": 640192, |
| "tokens/train_per_sec_per_gpu": 26.81, |
| "tokens/trainable": 378440 |
| }, |
| { |
| "epoch": 1.2186495176848875, |
| "grad_norm": 0.07062297314405441, |
| "learning_rate": 0.0025, |
| "loss": 5.3862786293029785, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 218.38916, |
| "step": 285, |
| "tokens/total": 642432, |
| "tokens/train_per_sec_per_gpu": 106.98, |
| "tokens/trainable": 379692 |
| }, |
| { |
| "epoch": 1.2229367631296892, |
| "grad_norm": 0.061749882996082306, |
| "learning_rate": 0.0025, |
| "loss": 4.912004470825195, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 135.91157, |
| "step": 286, |
| "tokens/total": 644224, |
| "tokens/train_per_sec_per_gpu": 97.43, |
| "tokens/trainable": 380455 |
| }, |
| { |
| "epoch": 1.227224008574491, |
| "grad_norm": 0.08968321979045868, |
| "learning_rate": 0.0025, |
| "loss": 5.269050598144531, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.77, |
| "memory/max_allocated (GiB)": 18.77, |
| "ppl": 194.23147, |
| "step": 287, |
| "tokens/total": 647488, |
| "tokens/train_per_sec_per_gpu": 11.93, |
| "tokens/trainable": 382717 |
| }, |
| { |
| "epoch": 1.2315112540192925, |
| "grad_norm": 0.06253078579902649, |
| "learning_rate": 0.0025, |
| "loss": 4.804765701293945, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 122.09088, |
| "step": 288, |
| "tokens/total": 649216, |
| "tokens/train_per_sec_per_gpu": 112.0, |
| "tokens/trainable": 383607 |
| }, |
| { |
| "epoch": 1.2357984994640943, |
| "grad_norm": 0.07760690897703171, |
| "learning_rate": 0.0025, |
| "loss": 4.299499034881592, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.66, |
| "memory/max_allocated (GiB)": 18.66, |
| "ppl": 73.66288, |
| "step": 289, |
| "tokens/total": 652160, |
| "tokens/train_per_sec_per_gpu": 115.01, |
| "tokens/trainable": 385609 |
| }, |
| { |
| "epoch": 1.240085744908896, |
| "grad_norm": 0.05732857063412666, |
| "learning_rate": 0.0025, |
| "loss": 4.712733268737793, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 111.35611, |
| "step": 290, |
| "tokens/total": 655104, |
| "tokens/train_per_sec_per_gpu": 26.12, |
| "tokens/trainable": 387600 |
| }, |
| { |
| "epoch": 1.2443729903536977, |
| "grad_norm": 0.08104580640792847, |
| "learning_rate": 0.0025, |
| "loss": 5.645717620849609, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 283.07663, |
| "step": 291, |
| "tokens/total": 657600, |
| "tokens/train_per_sec_per_gpu": 134.65, |
| "tokens/trainable": 389160 |
| }, |
| { |
| "epoch": 1.2486602357984995, |
| "grad_norm": 0.07328224182128906, |
| "learning_rate": 0.0025, |
| "loss": 4.8165693283081055, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 123.54054, |
| "step": 292, |
| "tokens/total": 659520, |
| "tokens/train_per_sec_per_gpu": 29.84, |
| "tokens/trainable": 389993 |
| }, |
| { |
| "epoch": 1.2529474812433012, |
| "grad_norm": 0.05570969730615616, |
| "learning_rate": 0.0025, |
| "loss": 4.818185806274414, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 123.7404, |
| "step": 293, |
| "tokens/total": 661824, |
| "tokens/train_per_sec_per_gpu": 58.03, |
| "tokens/trainable": 391387 |
| }, |
| { |
| "epoch": 1.257234726688103, |
| "grad_norm": 0.06483778357505798, |
| "learning_rate": 0.0025, |
| "loss": 4.867541313171387, |
| "memory/device_reserved (GiB)": 19.21, |
| "memory/max_active (GiB)": 18.86, |
| "memory/max_allocated (GiB)": 18.86, |
| "ppl": 130.00089, |
| "step": 294, |
| "tokens/total": 664384, |
| "tokens/train_per_sec_per_gpu": 58.74, |
| "tokens/trainable": 393119 |
| }, |
| { |
| "epoch": 1.2615219721329045, |
| "grad_norm": 0.06346052885055542, |
| "learning_rate": 0.0025, |
| "loss": 4.831150054931641, |
| "memory/device_reserved (GiB)": 19.34, |
| "memory/max_active (GiB)": 18.91, |
| "memory/max_allocated (GiB)": 18.91, |
| "ppl": 125.35504, |
| "step": 295, |
| "tokens/total": 667456, |
| "tokens/train_per_sec_per_gpu": 105.27, |
| "tokens/trainable": 395350 |
| }, |
| { |
| "epoch": 1.2615219721329045, |
| "eval_loss": 4.9676361083984375, |
| "eval_ppl": 143.68683, |
| "eval_runtime": 17.3088, |
| "eval_samples_per_second": 12.017, |
| "eval_steps_per_second": 12.017, |
| "memory/device_reserved (GiB)": 19.34, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.2658092175777063, |
| "grad_norm": 0.06215736269950867, |
| "learning_rate": 0.0025, |
| "loss": 5.038169860839844, |
| "memory/device_reserved (GiB)": 18.81, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 154.18757, |
| "step": 296, |
| "tokens/total": 669568, |
| "tokens/train_per_sec_per_gpu": 85.6, |
| "tokens/trainable": 396670 |
| }, |
| { |
| "epoch": 1.270096463022508, |
| "grad_norm": 0.07183429598808289, |
| "learning_rate": 0.0025, |
| "loss": 5.635323524475098, |
| "memory/device_reserved (GiB)": 18.81, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 280.14954, |
| "step": 297, |
| "tokens/total": 671360, |
| "tokens/train_per_sec_per_gpu": 3.91, |
| "tokens/trainable": 397597 |
| }, |
| { |
| "epoch": 1.2743837084673098, |
| "grad_norm": 0.08423589169979095, |
| "learning_rate": 0.0025, |
| "loss": 4.927524089813232, |
| "memory/device_reserved (GiB)": 18.81, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 138.03732, |
| "step": 298, |
| "tokens/total": 672896, |
| "tokens/train_per_sec_per_gpu": 21.59, |
| "tokens/trainable": 398137 |
| }, |
| { |
| "epoch": 1.2786709539121115, |
| "grad_norm": 0.050887443125247955, |
| "learning_rate": 0.0025, |
| "loss": 4.658658981323242, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.67, |
| "memory/max_allocated (GiB)": 18.67, |
| "ppl": 105.49452, |
| "step": 299, |
| "tokens/total": 675968, |
| "tokens/train_per_sec_per_gpu": 343.04, |
| "tokens/trainable": 400317 |
| }, |
| { |
| "epoch": 1.2829581993569132, |
| "grad_norm": 0.0761413648724556, |
| "learning_rate": 0.0025, |
| "loss": 5.080306529998779, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 160.82335, |
| "step": 300, |
| "tokens/total": 678400, |
| "tokens/train_per_sec_per_gpu": 112.46, |
| "tokens/trainable": 401891 |
| }, |
| { |
| "epoch": 1.287245444801715, |
| "grad_norm": 0.08556882292032242, |
| "learning_rate": 0.0025, |
| "loss": 5.198375225067139, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 180.97795, |
| "step": 301, |
| "tokens/total": 680576, |
| "tokens/train_per_sec_per_gpu": 57.45, |
| "tokens/trainable": 403197 |
| }, |
| { |
| "epoch": 1.2915326902465165, |
| "grad_norm": 0.05796833708882332, |
| "learning_rate": 0.0025, |
| "loss": 5.415544509887695, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 224.87496, |
| "step": 302, |
| "tokens/total": 682688, |
| "tokens/train_per_sec_per_gpu": 111.88, |
| "tokens/trainable": 404402 |
| }, |
| { |
| "epoch": 1.2958199356913183, |
| "grad_norm": 0.054447028785943985, |
| "learning_rate": 0.0025, |
| "loss": 5.261867523193359, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 192.84129, |
| "step": 303, |
| "tokens/total": 684864, |
| "tokens/train_per_sec_per_gpu": 103.18, |
| "tokens/trainable": 405715 |
| }, |
| { |
| "epoch": 1.30010718113612, |
| "grad_norm": 0.060531970113515854, |
| "learning_rate": 0.0025, |
| "loss": 5.523534297943115, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 250.51888, |
| "step": 304, |
| "tokens/total": 687616, |
| "tokens/train_per_sec_per_gpu": 106.29, |
| "tokens/trainable": 407456 |
| }, |
| { |
| "epoch": 1.3043944265809218, |
| "grad_norm": 0.050943512469530106, |
| "learning_rate": 0.0025, |
| "loss": 5.201202392578125, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 181.49033, |
| "step": 305, |
| "tokens/total": 689984, |
| "tokens/train_per_sec_per_gpu": 146.93, |
| "tokens/trainable": 408956 |
| }, |
| { |
| "epoch": 1.3086816720257235, |
| "grad_norm": 0.0500502735376358, |
| "learning_rate": 0.0025, |
| "loss": 4.794355392456055, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.63, |
| "memory/max_allocated (GiB)": 18.63, |
| "ppl": 120.82647, |
| "step": 306, |
| "tokens/total": 693440, |
| "tokens/train_per_sec_per_gpu": 298.58, |
| "tokens/trainable": 411585 |
| }, |
| { |
| "epoch": 1.3129689174705252, |
| "grad_norm": 0.055616557598114014, |
| "learning_rate": 0.0025, |
| "loss": 5.319107532501221, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 204.20156, |
| "step": 307, |
| "tokens/total": 695872, |
| "tokens/train_per_sec_per_gpu": 88.09, |
| "tokens/trainable": 413098 |
| }, |
| { |
| "epoch": 1.317256162915327, |
| "grad_norm": 0.1226491928100586, |
| "learning_rate": 0.0025, |
| "loss": 4.878961563110352, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 131.49404, |
| "step": 308, |
| "tokens/total": 697664, |
| "tokens/train_per_sec_per_gpu": 14.26, |
| "tokens/trainable": 414049 |
| }, |
| { |
| "epoch": 1.3215434083601285, |
| "grad_norm": 0.044849809259176254, |
| "learning_rate": 0.0025, |
| "loss": 4.39945650100708, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 81.40661, |
| "step": 309, |
| "tokens/total": 699456, |
| "tokens/train_per_sec_per_gpu": 35.77, |
| "tokens/trainable": 414939 |
| }, |
| { |
| "epoch": 1.3258306538049303, |
| "grad_norm": 0.10974457114934921, |
| "learning_rate": 0.0025, |
| "loss": 4.777448654174805, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 118.80086, |
| "step": 310, |
| "tokens/total": 701312, |
| "tokens/train_per_sec_per_gpu": 35.79, |
| "tokens/trainable": 415891 |
| }, |
| { |
| "epoch": 1.330117899249732, |
| "grad_norm": 0.06229991093277931, |
| "learning_rate": 0.0025, |
| "loss": 4.938532829284668, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 139.56533, |
| "step": 311, |
| "tokens/total": 703744, |
| "tokens/train_per_sec_per_gpu": 57.07, |
| "tokens/trainable": 417446 |
| }, |
| { |
| "epoch": 1.3344051446945338, |
| "grad_norm": 0.06946682929992676, |
| "learning_rate": 0.0025, |
| "loss": 5.452759265899658, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 233.40129, |
| "step": 312, |
| "tokens/total": 706112, |
| "tokens/train_per_sec_per_gpu": 213.08, |
| "tokens/trainable": 418846 |
| }, |
| { |
| "epoch": 1.3386923901393355, |
| "grad_norm": 0.0820111483335495, |
| "learning_rate": 0.0025, |
| "loss": 4.500866413116455, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.42, |
| "memory/max_allocated (GiB)": 18.42, |
| "ppl": 90.09516, |
| "step": 313, |
| "tokens/total": 707520, |
| "tokens/train_per_sec_per_gpu": 99.78, |
| "tokens/trainable": 419352 |
| }, |
| { |
| "epoch": 1.3429796355841372, |
| "grad_norm": 0.16722634434700012, |
| "learning_rate": 0.0025, |
| "loss": 4.281378269195557, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 72.34008, |
| "step": 314, |
| "tokens/total": 709440, |
| "tokens/train_per_sec_per_gpu": 15.86, |
| "tokens/trainable": 420321 |
| }, |
| { |
| "epoch": 1.347266881028939, |
| "grad_norm": 0.06021692231297493, |
| "learning_rate": 0.0025, |
| "loss": 5.971747875213623, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 392.19057, |
| "step": 315, |
| "tokens/total": 711936, |
| "tokens/train_per_sec_per_gpu": 77.5, |
| "tokens/trainable": 421997 |
| }, |
| { |
| "epoch": 1.3515541264737405, |
| "grad_norm": 0.048991329967975616, |
| "learning_rate": 0.0025, |
| "loss": 5.36166524887085, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 213.07948, |
| "step": 316, |
| "tokens/total": 714432, |
| "tokens/train_per_sec_per_gpu": 144.42, |
| "tokens/trainable": 423421 |
| }, |
| { |
| "epoch": 1.3558413719185423, |
| "grad_norm": 0.11430433392524719, |
| "learning_rate": 0.0025, |
| "loss": 4.600022792816162, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.36, |
| "memory/max_allocated (GiB)": 18.36, |
| "ppl": 99.48658, |
| "step": 317, |
| "tokens/total": 715648, |
| "tokens/train_per_sec_per_gpu": 0.87, |
| "tokens/trainable": 423767 |
| }, |
| { |
| "epoch": 1.360128617363344, |
| "grad_norm": 0.08438309282064438, |
| "learning_rate": 0.0025, |
| "loss": 5.151257038116455, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 172.64838, |
| "step": 318, |
| "tokens/total": 718272, |
| "tokens/train_per_sec_per_gpu": 259.31, |
| "tokens/trainable": 425467 |
| }, |
| { |
| "epoch": 1.3644158628081458, |
| "grad_norm": 0.056045304983854294, |
| "learning_rate": 0.0025, |
| "loss": 4.737667560577393, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 114.1676, |
| "step": 319, |
| "tokens/total": 720384, |
| "tokens/train_per_sec_per_gpu": 30.87, |
| "tokens/trainable": 426694 |
| }, |
| { |
| "epoch": 1.3687031082529475, |
| "grad_norm": 0.06773889809846878, |
| "learning_rate": 0.0025, |
| "loss": 5.229560852050781, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.57, |
| "memory/max_allocated (GiB)": 18.57, |
| "ppl": 186.71079, |
| "step": 320, |
| "tokens/total": 722688, |
| "tokens/train_per_sec_per_gpu": 10.25, |
| "tokens/trainable": 428099 |
| }, |
| { |
| "epoch": 1.3729903536977492, |
| "grad_norm": 0.06447850167751312, |
| "learning_rate": 0.0025, |
| "loss": 5.048516750335693, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.42, |
| "memory/max_allocated (GiB)": 18.42, |
| "ppl": 155.79122, |
| "step": 321, |
| "tokens/total": 724736, |
| "tokens/train_per_sec_per_gpu": 14.38, |
| "tokens/trainable": 429136 |
| }, |
| { |
| "epoch": 1.377277599142551, |
| "grad_norm": 0.07881579548120499, |
| "learning_rate": 0.0025, |
| "loss": 5.187434673309326, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.57, |
| "memory/max_allocated (GiB)": 18.57, |
| "ppl": 179.00875, |
| "step": 322, |
| "tokens/total": 726784, |
| "tokens/train_per_sec_per_gpu": 25.14, |
| "tokens/trainable": 430417 |
| }, |
| { |
| "epoch": 1.3815648445873525, |
| "grad_norm": 0.06841576844453812, |
| "learning_rate": 0.0025, |
| "loss": 5.470486640930176, |
| "memory/device_reserved (GiB)": 19.0, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 237.57578, |
| "step": 323, |
| "tokens/total": 728512, |
| "tokens/train_per_sec_per_gpu": 156.65, |
| "tokens/trainable": 431369 |
| }, |
| { |
| "epoch": 1.3858520900321543, |
| "grad_norm": 0.11785981059074402, |
| "learning_rate": 0.0025, |
| "loss": 4.845800399780273, |
| "memory/device_reserved (GiB)": 19.14, |
| "memory/max_active (GiB)": 18.69, |
| "memory/max_allocated (GiB)": 18.69, |
| "ppl": 127.20506, |
| "step": 324, |
| "tokens/total": 731200, |
| "tokens/train_per_sec_per_gpu": 6.44, |
| "tokens/trainable": 433078 |
| }, |
| { |
| "epoch": 1.390139335476956, |
| "grad_norm": 0.07563474774360657, |
| "learning_rate": 0.0025, |
| "loss": 4.149139404296875, |
| "memory/device_reserved (GiB)": 19.14, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 63.37943, |
| "step": 325, |
| "tokens/total": 733504, |
| "tokens/train_per_sec_per_gpu": 77.55, |
| "tokens/trainable": 434444 |
| }, |
| { |
| "epoch": 1.3944265809217578, |
| "grad_norm": 0.07862015813589096, |
| "learning_rate": 0.0025, |
| "loss": 4.405404090881348, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 19.09, |
| "memory/max_allocated (GiB)": 19.09, |
| "ppl": 81.89223, |
| "step": 326, |
| "tokens/total": 737088, |
| "tokens/train_per_sec_per_gpu": 155.85, |
| "tokens/trainable": 437043 |
| }, |
| { |
| "epoch": 1.3987138263665595, |
| "grad_norm": 0.07842207700014114, |
| "learning_rate": 0.0025, |
| "loss": 5.559260368347168, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 259.63073, |
| "step": 327, |
| "tokens/total": 738880, |
| "tokens/train_per_sec_per_gpu": 24.68, |
| "tokens/trainable": 437857 |
| }, |
| { |
| "epoch": 1.4030010718113612, |
| "grad_norm": 0.0685216560959816, |
| "learning_rate": 0.0025, |
| "loss": 5.114203453063965, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 166.36821, |
| "step": 328, |
| "tokens/total": 740928, |
| "tokens/train_per_sec_per_gpu": 8.77, |
| "tokens/trainable": 439031 |
| }, |
| { |
| "epoch": 1.407288317256163, |
| "grad_norm": 0.07267401367425919, |
| "learning_rate": 0.0025, |
| "loss": 4.98257303237915, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 145.84917, |
| "step": 329, |
| "tokens/total": 742848, |
| "tokens/train_per_sec_per_gpu": 6.98, |
| "tokens/trainable": 440007 |
| }, |
| { |
| "epoch": 1.4115755627009645, |
| "grad_norm": 0.0542726069688797, |
| "learning_rate": 0.0025, |
| "loss": 4.922747611999512, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 137.37956, |
| "step": 330, |
| "tokens/total": 744960, |
| "tokens/train_per_sec_per_gpu": 109.51, |
| "tokens/trainable": 441203 |
| }, |
| { |
| "epoch": 1.4158628081457665, |
| "grad_norm": 0.06696043908596039, |
| "learning_rate": 0.0025, |
| "loss": 4.691984176635742, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 109.06938, |
| "step": 331, |
| "tokens/total": 747136, |
| "tokens/train_per_sec_per_gpu": 129.1, |
| "tokens/trainable": 442561 |
| }, |
| { |
| "epoch": 1.420150053590568, |
| "grad_norm": 0.06947220861911774, |
| "learning_rate": 0.0025, |
| "loss": 4.651721000671387, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 104.76513, |
| "step": 332, |
| "tokens/total": 748800, |
| "tokens/train_per_sec_per_gpu": 85.45, |
| "tokens/trainable": 443290 |
| }, |
| { |
| "epoch": 1.4244372990353698, |
| "grad_norm": 0.06656062602996826, |
| "learning_rate": 0.0025, |
| "loss": 5.165863990783691, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 175.18875, |
| "step": 333, |
| "tokens/total": 751296, |
| "tokens/train_per_sec_per_gpu": 130.0, |
| "tokens/trainable": 444808 |
| }, |
| { |
| "epoch": 1.4287245444801715, |
| "grad_norm": 0.0503920316696167, |
| "learning_rate": 0.0025, |
| "loss": 5.2461981773376465, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.9, |
| "memory/max_allocated (GiB)": 18.9, |
| "ppl": 189.84314, |
| "step": 334, |
| "tokens/total": 755072, |
| "tokens/train_per_sec_per_gpu": 87.86, |
| "tokens/trainable": 447741 |
| }, |
| { |
| "epoch": 1.4330117899249732, |
| "grad_norm": 0.06683284789323807, |
| "learning_rate": 0.0025, |
| "loss": 4.544827461242676, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 94.14418, |
| "step": 335, |
| "tokens/total": 756544, |
| "tokens/train_per_sec_per_gpu": 5.23, |
| "tokens/trainable": 448356 |
| }, |
| { |
| "epoch": 1.437299035369775, |
| "grad_norm": 0.06916282325983047, |
| "learning_rate": 0.0025, |
| "loss": 5.005459308624268, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 149.22561, |
| "step": 336, |
| "tokens/total": 759424, |
| "tokens/train_per_sec_per_gpu": 82.24, |
| "tokens/trainable": 450248 |
| }, |
| { |
| "epoch": 1.4415862808145765, |
| "grad_norm": 0.0864240899682045, |
| "learning_rate": 0.0025, |
| "loss": 4.898914813995361, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.42, |
| "memory/max_allocated (GiB)": 18.42, |
| "ppl": 134.14413, |
| "step": 337, |
| "tokens/total": 761408, |
| "tokens/train_per_sec_per_gpu": 25.71, |
| "tokens/trainable": 451300 |
| }, |
| { |
| "epoch": 1.4458735262593785, |
| "grad_norm": 0.05566547438502312, |
| "learning_rate": 0.0025, |
| "loss": 5.002068996429443, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 148.72054, |
| "step": 338, |
| "tokens/total": 763904, |
| "tokens/train_per_sec_per_gpu": 2.62, |
| "tokens/trainable": 452916 |
| }, |
| { |
| "epoch": 1.45016077170418, |
| "grad_norm": 0.05582151934504509, |
| "learning_rate": 0.0025, |
| "loss": 5.314091682434082, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 203.17988, |
| "step": 339, |
| "tokens/total": 766208, |
| "tokens/train_per_sec_per_gpu": 107.08, |
| "tokens/trainable": 454272 |
| }, |
| { |
| "epoch": 1.4544480171489818, |
| "grad_norm": 0.05226564779877663, |
| "learning_rate": 0.0025, |
| "loss": 4.944754600524902, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.42, |
| "memory/max_allocated (GiB)": 18.42, |
| "ppl": 140.43638, |
| "step": 340, |
| "tokens/total": 768064, |
| "tokens/train_per_sec_per_gpu": 112.77, |
| "tokens/trainable": 455346 |
| }, |
| { |
| "epoch": 1.4587352625937835, |
| "grad_norm": 0.06044873222708702, |
| "learning_rate": 0.0025, |
| "loss": 4.841938018798828, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 126.71469, |
| "step": 341, |
| "tokens/total": 769664, |
| "tokens/train_per_sec_per_gpu": 62.41, |
| "tokens/trainable": 456009 |
| }, |
| { |
| "epoch": 1.4630225080385852, |
| "grad_norm": 0.04582054540514946, |
| "learning_rate": 0.0025, |
| "loss": 5.606667518615723, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 272.2355, |
| "step": 342, |
| "tokens/total": 772224, |
| "tokens/train_per_sec_per_gpu": 167.3, |
| "tokens/trainable": 457672 |
| }, |
| { |
| "epoch": 1.467309753483387, |
| "grad_norm": 0.052784983068704605, |
| "learning_rate": 0.0025, |
| "loss": 4.380001068115234, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 79.83812, |
| "step": 343, |
| "tokens/total": 774592, |
| "tokens/train_per_sec_per_gpu": 64.87, |
| "tokens/trainable": 459004 |
| }, |
| { |
| "epoch": 1.4715969989281885, |
| "grad_norm": 0.04827815666794777, |
| "learning_rate": 0.0025, |
| "loss": 4.817045211791992, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 123.59934, |
| "step": 344, |
| "tokens/total": 777280, |
| "tokens/train_per_sec_per_gpu": 39.6, |
| "tokens/trainable": 460657 |
| }, |
| { |
| "epoch": 1.4758842443729905, |
| "grad_norm": 0.07826294749975204, |
| "learning_rate": 0.0025, |
| "loss": 3.8754422664642334, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 48.20401, |
| "step": 345, |
| "tokens/total": 778496, |
| "tokens/train_per_sec_per_gpu": 23.47, |
| "tokens/trainable": 461041 |
| }, |
| { |
| "epoch": 1.480171489817792, |
| "grad_norm": 0.04768767207860947, |
| "learning_rate": 0.0025, |
| "loss": 5.038158893585205, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 154.18588, |
| "step": 346, |
| "tokens/total": 780992, |
| "tokens/train_per_sec_per_gpu": 78.54, |
| "tokens/trainable": 462567 |
| }, |
| { |
| "epoch": 1.4844587352625938, |
| "grad_norm": 0.2351859211921692, |
| "learning_rate": 0.0025, |
| "loss": 5.444571018218994, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 231.49795, |
| "step": 347, |
| "tokens/total": 783616, |
| "tokens/train_per_sec_per_gpu": 168.35, |
| "tokens/trainable": 464210 |
| }, |
| { |
| "epoch": 1.4887459807073955, |
| "grad_norm": 0.04593876376748085, |
| "learning_rate": 0.0025, |
| "loss": 5.326495170593262, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 205.71571, |
| "step": 348, |
| "tokens/total": 786304, |
| "tokens/train_per_sec_per_gpu": 140.75, |
| "tokens/trainable": 466015 |
| }, |
| { |
| "epoch": 1.4930332261521972, |
| "grad_norm": 0.06556063890457153, |
| "learning_rate": 0.0025, |
| "loss": 5.152454376220703, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 172.85522, |
| "step": 349, |
| "tokens/total": 788480, |
| "tokens/train_per_sec_per_gpu": 224.08, |
| "tokens/trainable": 467198 |
| }, |
| { |
| "epoch": 1.497320471596999, |
| "grad_norm": 0.06161191314458847, |
| "learning_rate": 0.0025, |
| "loss": 5.141845226287842, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 171.03107, |
| "step": 350, |
| "tokens/total": 790336, |
| "tokens/train_per_sec_per_gpu": 74.32, |
| "tokens/trainable": 468199 |
| }, |
| { |
| "epoch": 1.5016077170418005, |
| "grad_norm": 0.28069961071014404, |
| "learning_rate": 0.0025, |
| "loss": 5.9613165855407715, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 388.12078, |
| "step": 351, |
| "tokens/total": 792320, |
| "tokens/train_per_sec_per_gpu": 10.1, |
| "tokens/trainable": 469192 |
| }, |
| { |
| "epoch": 1.5058949624866025, |
| "grad_norm": 0.09382814168930054, |
| "learning_rate": 0.0025, |
| "loss": 5.591994762420654, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 268.27022, |
| "step": 352, |
| "tokens/total": 794816, |
| "tokens/train_per_sec_per_gpu": 35.14, |
| "tokens/trainable": 470744 |
| }, |
| { |
| "epoch": 1.510182207931404, |
| "grad_norm": 0.06651383638381958, |
| "learning_rate": 0.0025, |
| "loss": 4.758882522583008, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 116.61554, |
| "step": 353, |
| "tokens/total": 796800, |
| "tokens/train_per_sec_per_gpu": 46.01, |
| "tokens/trainable": 471734 |
| }, |
| { |
| "epoch": 1.5144694533762058, |
| "grad_norm": 0.07342278957366943, |
| "learning_rate": 0.0025, |
| "loss": 5.06705379486084, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 158.70606, |
| "step": 354, |
| "tokens/total": 798400, |
| "tokens/train_per_sec_per_gpu": 46.96, |
| "tokens/trainable": 472368 |
| }, |
| { |
| "epoch": 1.5144694533762058, |
| "eval_loss": 4.888035297393799, |
| "eval_ppl": 132.69262, |
| "eval_runtime": 17.094, |
| "eval_samples_per_second": 12.168, |
| "eval_steps_per_second": 12.168, |
| "memory/device_reserved (GiB)": 19.64, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.5187566988210075, |
| "grad_norm": 0.05180167779326439, |
| "learning_rate": 0.0025, |
| "loss": 4.341151714324951, |
| "memory/device_reserved (GiB)": 18.71, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 76.79594, |
| "step": 355, |
| "tokens/total": 801024, |
| "tokens/train_per_sec_per_gpu": 18.76, |
| "tokens/trainable": 474170 |
| }, |
| { |
| "epoch": 1.5230439442658092, |
| "grad_norm": 0.05290725454688072, |
| "learning_rate": 0.0025, |
| "loss": 4.733070373535156, |
| "memory/device_reserved (GiB)": 18.74, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 113.64396, |
| "step": 356, |
| "tokens/total": 803072, |
| "tokens/train_per_sec_per_gpu": 10.8, |
| "tokens/trainable": 475255 |
| }, |
| { |
| "epoch": 1.527331189710611, |
| "grad_norm": 0.08694328367710114, |
| "learning_rate": 0.0025, |
| "loss": 4.449097156524658, |
| "memory/device_reserved (GiB)": 18.74, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 85.54967, |
| "step": 357, |
| "tokens/total": 804864, |
| "tokens/train_per_sec_per_gpu": 36.39, |
| "tokens/trainable": 476172 |
| }, |
| { |
| "epoch": 1.5316184351554125, |
| "grad_norm": 0.059812407940626144, |
| "learning_rate": 0.0025, |
| "loss": 5.0324225425720215, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 19.55, |
| "memory/max_allocated (GiB)": 19.55, |
| "ppl": 153.30395, |
| "step": 358, |
| "tokens/total": 809344, |
| "tokens/train_per_sec_per_gpu": 62.57, |
| "tokens/trainable": 479719 |
| }, |
| { |
| "epoch": 1.5359056806002145, |
| "grad_norm": 0.05654964968562126, |
| "learning_rate": 0.0025, |
| "loss": 5.156862735748291, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 19.12, |
| "memory/max_allocated (GiB)": 19.12, |
| "ppl": 173.61891, |
| "step": 359, |
| "tokens/total": 813440, |
| "tokens/train_per_sec_per_gpu": 30.77, |
| "tokens/trainable": 482873 |
| }, |
| { |
| "epoch": 1.540192926045016, |
| "grad_norm": 0.04540662467479706, |
| "learning_rate": 0.0025, |
| "loss": 4.702476501464844, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 110.21979, |
| "step": 360, |
| "tokens/total": 815424, |
| "tokens/train_per_sec_per_gpu": 10.68, |
| "tokens/trainable": 483879 |
| }, |
| { |
| "epoch": 1.5444801714898178, |
| "grad_norm": 0.08061626553535461, |
| "learning_rate": 0.0025, |
| "loss": 4.724915504455566, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 19.5, |
| "memory/max_allocated (GiB)": 19.5, |
| "ppl": 112.72097, |
| "step": 361, |
| "tokens/total": 819520, |
| "tokens/train_per_sec_per_gpu": 7.14, |
| "tokens/trainable": 486988 |
| }, |
| { |
| "epoch": 1.5487674169346195, |
| "grad_norm": 0.047675181180238724, |
| "learning_rate": 0.0025, |
| "loss": 4.952810764312744, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 141.57233, |
| "step": 362, |
| "tokens/total": 822528, |
| "tokens/train_per_sec_per_gpu": 188.99, |
| "tokens/trainable": 489093 |
| }, |
| { |
| "epoch": 1.5530546623794212, |
| "grad_norm": 0.0705411285161972, |
| "learning_rate": 0.0025, |
| "loss": 4.235616683959961, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.38, |
| "memory/max_allocated (GiB)": 18.38, |
| "ppl": 69.10428, |
| "step": 363, |
| "tokens/total": 824000, |
| "tokens/train_per_sec_per_gpu": 40.32, |
| "tokens/trainable": 489788 |
| }, |
| { |
| "epoch": 1.557341907824223, |
| "grad_norm": 0.0532984621822834, |
| "learning_rate": 0.0025, |
| "loss": 4.592817306518555, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 98.77231, |
| "step": 364, |
| "tokens/total": 825984, |
| "tokens/train_per_sec_per_gpu": 126.04, |
| "tokens/trainable": 490775 |
| }, |
| { |
| "epoch": 1.5616291532690245, |
| "grad_norm": 0.05280710384249687, |
| "learning_rate": 0.0025, |
| "loss": 5.201740264892578, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 181.58798, |
| "step": 365, |
| "tokens/total": 828096, |
| "tokens/train_per_sec_per_gpu": 54.56, |
| "tokens/trainable": 492005 |
| }, |
| { |
| "epoch": 1.5659163987138265, |
| "grad_norm": 0.051445771008729935, |
| "learning_rate": 0.0025, |
| "loss": 4.719823360443115, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 112.14844, |
| "step": 366, |
| "tokens/total": 830720, |
| "tokens/train_per_sec_per_gpu": 197.49, |
| "tokens/trainable": 493565 |
| }, |
| { |
| "epoch": 1.570203644158628, |
| "grad_norm": 0.05346055328845978, |
| "learning_rate": 0.0025, |
| "loss": 4.971454620361328, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 144.23654, |
| "step": 367, |
| "tokens/total": 833216, |
| "tokens/train_per_sec_per_gpu": 12.08, |
| "tokens/trainable": 495164 |
| }, |
| { |
| "epoch": 1.5744908896034298, |
| "grad_norm": 0.0699196457862854, |
| "learning_rate": 0.0025, |
| "loss": 4.878042221069336, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 131.37321, |
| "step": 368, |
| "tokens/total": 835520, |
| "tokens/train_per_sec_per_gpu": 10.86, |
| "tokens/trainable": 496515 |
| }, |
| { |
| "epoch": 1.5787781350482315, |
| "grad_norm": 0.06464764475822449, |
| "learning_rate": 0.0025, |
| "loss": 4.946186542510986, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 140.63762, |
| "step": 369, |
| "tokens/total": 837696, |
| "tokens/train_per_sec_per_gpu": 199.16, |
| "tokens/trainable": 497713 |
| }, |
| { |
| "epoch": 1.5830653804930332, |
| "grad_norm": 0.08636437356472015, |
| "learning_rate": 0.0025, |
| "loss": 4.280492305755615, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 72.27601, |
| "step": 370, |
| "tokens/total": 839168, |
| "tokens/train_per_sec_per_gpu": 71.64, |
| "tokens/trainable": 498229 |
| }, |
| { |
| "epoch": 1.587352625937835, |
| "grad_norm": 0.047297775745391846, |
| "learning_rate": 0.0025, |
| "loss": 5.235410213470459, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 187.80613, |
| "step": 371, |
| "tokens/total": 842112, |
| "tokens/train_per_sec_per_gpu": 170.73, |
| "tokens/trainable": 500237 |
| }, |
| { |
| "epoch": 1.5916398713826365, |
| "grad_norm": 0.07992982119321823, |
| "learning_rate": 0.0025, |
| "loss": 4.507941246032715, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 90.73483, |
| "step": 372, |
| "tokens/total": 843584, |
| "tokens/train_per_sec_per_gpu": 11.01, |
| "tokens/trainable": 500839 |
| }, |
| { |
| "epoch": 1.5959271168274385, |
| "grad_norm": 0.06259352713823318, |
| "learning_rate": 0.0025, |
| "loss": 4.919366836547852, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 136.9159, |
| "step": 373, |
| "tokens/total": 845504, |
| "tokens/train_per_sec_per_gpu": 41.51, |
| "tokens/trainable": 501881 |
| }, |
| { |
| "epoch": 1.60021436227224, |
| "grad_norm": 0.08243437856435776, |
| "learning_rate": 0.0025, |
| "loss": 5.151687145233154, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.58, |
| "memory/max_allocated (GiB)": 18.58, |
| "ppl": 172.72265, |
| "step": 374, |
| "tokens/total": 848768, |
| "tokens/train_per_sec_per_gpu": 261.87, |
| "tokens/trainable": 504139 |
| }, |
| { |
| "epoch": 1.6045016077170418, |
| "grad_norm": 0.08473316580057144, |
| "learning_rate": 0.0025, |
| "loss": 5.018991470336914, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 151.25868, |
| "step": 375, |
| "tokens/total": 851136, |
| "tokens/train_per_sec_per_gpu": 4.3, |
| "tokens/trainable": 505536 |
| }, |
| { |
| "epoch": 1.6087888531618435, |
| "grad_norm": 0.04689257964491844, |
| "learning_rate": 0.0025, |
| "loss": 4.897285461425781, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 133.92574, |
| "step": 376, |
| "tokens/total": 853504, |
| "tokens/train_per_sec_per_gpu": 29.13, |
| "tokens/trainable": 507033 |
| }, |
| { |
| "epoch": 1.6130760986066452, |
| "grad_norm": 0.058138296008110046, |
| "learning_rate": 0.0025, |
| "loss": 4.437278747558594, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 84.54456, |
| "step": 377, |
| "tokens/total": 855040, |
| "tokens/train_per_sec_per_gpu": 60.77, |
| "tokens/trainable": 507780 |
| }, |
| { |
| "epoch": 1.617363344051447, |
| "grad_norm": 0.07955910265445709, |
| "learning_rate": 0.0025, |
| "loss": 4.341729640960693, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 76.84033, |
| "step": 378, |
| "tokens/total": 857024, |
| "tokens/train_per_sec_per_gpu": 1.3, |
| "tokens/trainable": 508911 |
| }, |
| { |
| "epoch": 1.6216505894962485, |
| "grad_norm": 0.057746682316064835, |
| "learning_rate": 0.0025, |
| "loss": 3.7874722480773926, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 44.14467, |
| "step": 379, |
| "tokens/total": 858752, |
| "tokens/train_per_sec_per_gpu": 27.9, |
| "tokens/trainable": 509683 |
| }, |
| { |
| "epoch": 1.6259378349410505, |
| "grad_norm": 0.07005178928375244, |
| "learning_rate": 0.0025, |
| "loss": 5.148179531097412, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 172.11787, |
| "step": 380, |
| "tokens/total": 860928, |
| "tokens/train_per_sec_per_gpu": 56.8, |
| "tokens/trainable": 510959 |
| }, |
| { |
| "epoch": 1.630225080385852, |
| "grad_norm": 0.04911843314766884, |
| "learning_rate": 0.0025, |
| "loss": 4.848702430725098, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 127.57475, |
| "step": 381, |
| "tokens/total": 862976, |
| "tokens/train_per_sec_per_gpu": 93.02, |
| "tokens/trainable": 512078 |
| }, |
| { |
| "epoch": 1.6345123258306538, |
| "grad_norm": 0.07803714275360107, |
| "learning_rate": 0.0025, |
| "loss": 4.658711910247803, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 105.5001, |
| "step": 382, |
| "tokens/total": 865088, |
| "tokens/train_per_sec_per_gpu": 67.71, |
| "tokens/trainable": 513209 |
| }, |
| { |
| "epoch": 1.6387995712754555, |
| "grad_norm": 0.039557769894599915, |
| "learning_rate": 0.0025, |
| "loss": 4.664157867431641, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 106.07622, |
| "step": 383, |
| "tokens/total": 867520, |
| "tokens/train_per_sec_per_gpu": 144.15, |
| "tokens/trainable": 514700 |
| }, |
| { |
| "epoch": 1.6430868167202572, |
| "grad_norm": 0.047754038125276566, |
| "learning_rate": 0.0025, |
| "loss": 4.329286575317383, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.64, |
| "memory/max_allocated (GiB)": 18.64, |
| "ppl": 75.89013, |
| "step": 384, |
| "tokens/total": 869632, |
| "tokens/train_per_sec_per_gpu": 32.67, |
| "tokens/trainable": 516045 |
| }, |
| { |
| "epoch": 1.647374062165059, |
| "grad_norm": 0.0593026764690876, |
| "learning_rate": 0.0025, |
| "loss": 5.5122809410095215, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.55, |
| "memory/max_allocated (GiB)": 18.55, |
| "ppl": 247.71551, |
| "step": 385, |
| "tokens/total": 872128, |
| "tokens/train_per_sec_per_gpu": 6.48, |
| "tokens/trainable": 517536 |
| }, |
| { |
| "epoch": 1.6516613076098605, |
| "grad_norm": 0.0731717124581337, |
| "learning_rate": 0.0025, |
| "loss": 4.586673736572266, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 98.16736, |
| "step": 386, |
| "tokens/total": 874240, |
| "tokens/train_per_sec_per_gpu": 15.25, |
| "tokens/trainable": 518738 |
| }, |
| { |
| "epoch": 1.6559485530546625, |
| "grad_norm": 0.049605630338191986, |
| "learning_rate": 0.0025, |
| "loss": 4.679656982421875, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 107.73311, |
| "step": 387, |
| "tokens/total": 876288, |
| "tokens/train_per_sec_per_gpu": 135.43, |
| "tokens/trainable": 519876 |
| }, |
| { |
| "epoch": 1.660235798499464, |
| "grad_norm": 0.05317911505699158, |
| "learning_rate": 0.0025, |
| "loss": 4.681480407714844, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.73, |
| "memory/max_allocated (GiB)": 18.73, |
| "ppl": 107.92973, |
| "step": 388, |
| "tokens/total": 879360, |
| "tokens/train_per_sec_per_gpu": 368.87, |
| "tokens/trainable": 521966 |
| }, |
| { |
| "epoch": 1.6645230439442658, |
| "grad_norm": 0.060831792652606964, |
| "learning_rate": 0.0025, |
| "loss": 5.22551155090332, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 185.95627, |
| "step": 389, |
| "tokens/total": 881664, |
| "tokens/train_per_sec_per_gpu": 7.84, |
| "tokens/trainable": 523262 |
| }, |
| { |
| "epoch": 1.6688102893890675, |
| "grad_norm": 0.10634256899356842, |
| "learning_rate": 0.0025, |
| "loss": 5.569226264953613, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 262.23112, |
| "step": 390, |
| "tokens/total": 883200, |
| "tokens/train_per_sec_per_gpu": 7.0, |
| "tokens/trainable": 523927 |
| }, |
| { |
| "epoch": 1.6730975348338692, |
| "grad_norm": 0.04912353307008743, |
| "learning_rate": 0.0025, |
| "loss": 4.896402359008789, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 133.80752, |
| "step": 391, |
| "tokens/total": 885376, |
| "tokens/train_per_sec_per_gpu": 131.57, |
| "tokens/trainable": 525245 |
| }, |
| { |
| "epoch": 1.677384780278671, |
| "grad_norm": 0.051567140966653824, |
| "learning_rate": 0.0025, |
| "loss": 4.537640571594238, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 93.47, |
| "step": 392, |
| "tokens/total": 887680, |
| "tokens/train_per_sec_per_gpu": 23.66, |
| "tokens/trainable": 526583 |
| }, |
| { |
| "epoch": 1.6816720257234725, |
| "grad_norm": 0.05488206073641777, |
| "learning_rate": 0.0025, |
| "loss": 4.715126037597656, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 111.62288, |
| "step": 393, |
| "tokens/total": 890304, |
| "tokens/train_per_sec_per_gpu": 140.1, |
| "tokens/trainable": 528304 |
| }, |
| { |
| "epoch": 1.6859592711682745, |
| "grad_norm": 0.04531604424118996, |
| "learning_rate": 0.0025, |
| "loss": 4.594015121459961, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 98.89069, |
| "step": 394, |
| "tokens/total": 892416, |
| "tokens/train_per_sec_per_gpu": 63.67, |
| "tokens/trainable": 529449 |
| }, |
| { |
| "epoch": 1.690246516613076, |
| "grad_norm": 0.054829467087984085, |
| "learning_rate": 0.0025, |
| "loss": 4.549041271209717, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 94.54172, |
| "step": 395, |
| "tokens/total": 894144, |
| "tokens/train_per_sec_per_gpu": 30.68, |
| "tokens/trainable": 530299 |
| }, |
| { |
| "epoch": 1.694533762057878, |
| "grad_norm": 0.053975410759449005, |
| "learning_rate": 0.0025, |
| "loss": 4.356207370758057, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.59, |
| "memory/max_allocated (GiB)": 18.59, |
| "ppl": 77.9609, |
| "step": 396, |
| "tokens/total": 896192, |
| "tokens/train_per_sec_per_gpu": 23.37, |
| "tokens/trainable": 531383 |
| }, |
| { |
| "epoch": 1.6988210075026795, |
| "grad_norm": 0.06466397643089294, |
| "learning_rate": 0.0025, |
| "loss": 4.832691192626953, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 125.54838, |
| "step": 397, |
| "tokens/total": 898368, |
| "tokens/train_per_sec_per_gpu": 28.95, |
| "tokens/trainable": 532533 |
| }, |
| { |
| "epoch": 1.7031082529474812, |
| "grad_norm": 0.0506359338760376, |
| "learning_rate": 0.0025, |
| "loss": 4.359593868255615, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 78.22536, |
| "step": 398, |
| "tokens/total": 900736, |
| "tokens/train_per_sec_per_gpu": 7.39, |
| "tokens/trainable": 533957 |
| }, |
| { |
| "epoch": 1.707395498392283, |
| "grad_norm": 0.06138148903846741, |
| "learning_rate": 0.0025, |
| "loss": 4.625148296356201, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 102.0179, |
| "step": 399, |
| "tokens/total": 902592, |
| "tokens/train_per_sec_per_gpu": 20.91, |
| "tokens/trainable": 534928 |
| }, |
| { |
| "epoch": 1.7116827438370845, |
| "grad_norm": 0.047848962247371674, |
| "learning_rate": 0.0025, |
| "loss": 4.433683395385742, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 84.24114, |
| "step": 400, |
| "tokens/total": 904448, |
| "tokens/train_per_sec_per_gpu": 18.74, |
| "tokens/trainable": 535890 |
| }, |
| { |
| "epoch": 1.7159699892818865, |
| "grad_norm": 0.06819909065961838, |
| "learning_rate": 0.0025, |
| "loss": 4.891788482666016, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 133.19157, |
| "step": 401, |
| "tokens/total": 906048, |
| "tokens/train_per_sec_per_gpu": 52.8, |
| "tokens/trainable": 536499 |
| }, |
| { |
| "epoch": 1.720257234726688, |
| "grad_norm": 0.04256964474916458, |
| "learning_rate": 0.0025, |
| "loss": 4.25316858291626, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 70.3279, |
| "step": 402, |
| "tokens/total": 907968, |
| "tokens/train_per_sec_per_gpu": 121.28, |
| "tokens/trainable": 537519 |
| }, |
| { |
| "epoch": 1.72454448017149, |
| "grad_norm": 0.04542100802063942, |
| "learning_rate": 0.0025, |
| "loss": 5.214837551116943, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.52, |
| "memory/max_allocated (GiB)": 18.52, |
| "ppl": 183.98193, |
| "step": 403, |
| "tokens/total": 910848, |
| "tokens/train_per_sec_per_gpu": 233.51, |
| "tokens/trainable": 539428 |
| }, |
| { |
| "epoch": 1.7288317256162915, |
| "grad_norm": 0.04585760459303856, |
| "learning_rate": 0.0025, |
| "loss": 4.585484027862549, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 98.05063, |
| "step": 404, |
| "tokens/total": 913536, |
| "tokens/train_per_sec_per_gpu": 183.23, |
| "tokens/trainable": 541209 |
| }, |
| { |
| "epoch": 1.7331189710610932, |
| "grad_norm": 0.0482996366918087, |
| "learning_rate": 0.0025, |
| "loss": 4.8759870529174805, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.63, |
| "memory/max_allocated (GiB)": 18.63, |
| "ppl": 131.1035, |
| "step": 405, |
| "tokens/total": 916032, |
| "tokens/train_per_sec_per_gpu": 65.34, |
| "tokens/trainable": 542856 |
| }, |
| { |
| "epoch": 1.737406216505895, |
| "grad_norm": 0.04029145836830139, |
| "learning_rate": 0.0025, |
| "loss": 4.391783714294434, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.4, |
| "memory/max_allocated (GiB)": 18.4, |
| "ppl": 80.78439, |
| "step": 406, |
| "tokens/total": 917824, |
| "tokens/train_per_sec_per_gpu": 6.99, |
| "tokens/trainable": 543743 |
| }, |
| { |
| "epoch": 1.7416934619506965, |
| "grad_norm": 0.03549795225262642, |
| "learning_rate": 0.0025, |
| "loss": 4.487819671630859, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 88.92734, |
| "step": 407, |
| "tokens/total": 920192, |
| "tokens/train_per_sec_per_gpu": 142.0, |
| "tokens/trainable": 545102 |
| }, |
| { |
| "epoch": 1.7459807073954985, |
| "grad_norm": 0.05987889692187309, |
| "learning_rate": 0.0025, |
| "loss": 4.304838180541992, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 74.05723, |
| "step": 408, |
| "tokens/total": 921920, |
| "tokens/train_per_sec_per_gpu": 80.61, |
| "tokens/trainable": 545864 |
| }, |
| { |
| "epoch": 1.7502679528403, |
| "grad_norm": 0.03903573006391525, |
| "learning_rate": 0.0025, |
| "loss": 4.431785583496094, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 84.08142, |
| "step": 409, |
| "tokens/total": 924160, |
| "tokens/train_per_sec_per_gpu": 56.37, |
| "tokens/trainable": 547197 |
| }, |
| { |
| "epoch": 1.754555198285102, |
| "grad_norm": 0.08176471292972565, |
| "learning_rate": 0.0025, |
| "loss": 5.05267333984375, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 156.44012, |
| "step": 410, |
| "tokens/total": 926400, |
| "tokens/train_per_sec_per_gpu": 78.56, |
| "tokens/trainable": 548428 |
| }, |
| { |
| "epoch": 1.7588424437299035, |
| "grad_norm": 0.06569211184978485, |
| "learning_rate": 0.0025, |
| "loss": 4.557641506195068, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 95.35831, |
| "step": 411, |
| "tokens/total": 928256, |
| "tokens/train_per_sec_per_gpu": 69.11, |
| "tokens/trainable": 549334 |
| }, |
| { |
| "epoch": 1.7631296891747053, |
| "grad_norm": 0.0812261626124382, |
| "learning_rate": 0.0025, |
| "loss": 4.661388397216797, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.72, |
| "memory/max_allocated (GiB)": 18.72, |
| "ppl": 105.78285, |
| "step": 412, |
| "tokens/total": 930432, |
| "tokens/train_per_sec_per_gpu": 30.49, |
| "tokens/trainable": 550597 |
| }, |
| { |
| "epoch": 1.767416934619507, |
| "grad_norm": 0.06816331297159195, |
| "learning_rate": 0.0025, |
| "loss": 4.940434455871582, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 139.83099, |
| "step": 413, |
| "tokens/total": 932672, |
| "tokens/train_per_sec_per_gpu": 46.78, |
| "tokens/trainable": 551971 |
| }, |
| { |
| "epoch": 1.767416934619507, |
| "eval_loss": 4.7477898597717285, |
| "eval_ppl": 115.32911, |
| "eval_runtime": 16.9681, |
| "eval_samples_per_second": 12.258, |
| "eval_steps_per_second": 12.258, |
| "memory/device_reserved (GiB)": 19.92, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.7717041800643085, |
| "grad_norm": 0.04632922261953354, |
| "learning_rate": 0.0025, |
| "loss": 4.674541473388672, |
| "memory/device_reserved (GiB)": 18.75, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 107.18341, |
| "step": 414, |
| "tokens/total": 934720, |
| "tokens/train_per_sec_per_gpu": 21.87, |
| "tokens/trainable": 553178 |
| }, |
| { |
| "epoch": 1.7759914255091105, |
| "grad_norm": 0.0418037474155426, |
| "learning_rate": 0.0025, |
| "loss": 4.476470947265625, |
| "memory/device_reserved (GiB)": 18.75, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 87.92384, |
| "step": 415, |
| "tokens/total": 936768, |
| "tokens/train_per_sec_per_gpu": 44.4, |
| "tokens/trainable": 554330 |
| }, |
| { |
| "epoch": 1.780278670953912, |
| "grad_norm": 0.0720926821231842, |
| "learning_rate": 0.0025, |
| "loss": 4.901095390319824, |
| "memory/device_reserved (GiB)": 18.75, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 134.43696, |
| "step": 416, |
| "tokens/total": 938752, |
| "tokens/train_per_sec_per_gpu": 58.68, |
| "tokens/trainable": 555375 |
| }, |
| { |
| "epoch": 1.784565916398714, |
| "grad_norm": 0.0682898610830307, |
| "learning_rate": 0.0025, |
| "loss": 4.806495189666748, |
| "memory/device_reserved (GiB)": 18.75, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 122.30222, |
| "step": 417, |
| "tokens/total": 940736, |
| "tokens/train_per_sec_per_gpu": 60.27, |
| "tokens/trainable": 556410 |
| }, |
| { |
| "epoch": 1.7888531618435155, |
| "grad_norm": 0.05859844386577606, |
| "learning_rate": 0.0025, |
| "loss": 4.619528293609619, |
| "memory/device_reserved (GiB)": 18.75, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 101.44617, |
| "step": 418, |
| "tokens/total": 942720, |
| "tokens/train_per_sec_per_gpu": 119.79, |
| "tokens/trainable": 557481 |
| }, |
| { |
| "epoch": 1.7931404072883173, |
| "grad_norm": 0.0587584562599659, |
| "learning_rate": 0.0025, |
| "loss": 4.6873064041137695, |
| "memory/device_reserved (GiB)": 18.75, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 108.56037, |
| "step": 419, |
| "tokens/total": 944576, |
| "tokens/train_per_sec_per_gpu": 103.57, |
| "tokens/trainable": 558411 |
| }, |
| { |
| "epoch": 1.797427652733119, |
| "grad_norm": 0.048332419246435165, |
| "learning_rate": 0.0025, |
| "loss": 4.776644706726074, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 19.42, |
| "memory/max_allocated (GiB)": 19.42, |
| "ppl": 118.70539, |
| "step": 420, |
| "tokens/total": 948160, |
| "tokens/train_per_sec_per_gpu": 18.63, |
| "tokens/trainable": 561055 |
| }, |
| { |
| "epoch": 1.8017148981779205, |
| "grad_norm": 0.045766137540340424, |
| "learning_rate": 0.0025, |
| "loss": 4.8820977210998535, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 131.90708, |
| "step": 421, |
| "tokens/total": 950784, |
| "tokens/train_per_sec_per_gpu": 58.05, |
| "tokens/trainable": 562707 |
| }, |
| { |
| "epoch": 1.8060021436227225, |
| "grad_norm": 0.07353589683771133, |
| "learning_rate": 0.0025, |
| "loss": 4.372645854949951, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 79.25305, |
| "step": 422, |
| "tokens/total": 952384, |
| "tokens/train_per_sec_per_gpu": 94.96, |
| "tokens/trainable": 563370 |
| }, |
| { |
| "epoch": 1.810289389067524, |
| "grad_norm": 0.057513438165187836, |
| "learning_rate": 0.0025, |
| "loss": 4.444611549377441, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 85.16679, |
| "step": 423, |
| "tokens/total": 954560, |
| "tokens/train_per_sec_per_gpu": 14.88, |
| "tokens/trainable": 564680 |
| }, |
| { |
| "epoch": 1.814576634512326, |
| "grad_norm": 0.054809462279081345, |
| "learning_rate": 0.0025, |
| "loss": 4.237483501434326, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 69.23341, |
| "step": 424, |
| "tokens/total": 957696, |
| "tokens/train_per_sec_per_gpu": 52.44, |
| "tokens/trainable": 566858 |
| }, |
| { |
| "epoch": 1.8188638799571275, |
| "grad_norm": 0.042437877506017685, |
| "learning_rate": 0.0025, |
| "loss": 4.65795373916626, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 105.42014, |
| "step": 425, |
| "tokens/total": 960064, |
| "tokens/train_per_sec_per_gpu": 58.49, |
| "tokens/trainable": 568277 |
| }, |
| { |
| "epoch": 1.8231511254019293, |
| "grad_norm": 0.0690232664346695, |
| "learning_rate": 0.0025, |
| "loss": 4.57647180557251, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 19.1, |
| "memory/max_allocated (GiB)": 19.1, |
| "ppl": 97.17095, |
| "step": 426, |
| "tokens/total": 963264, |
| "tokens/train_per_sec_per_gpu": 604.98, |
| "tokens/trainable": 570607 |
| }, |
| { |
| "epoch": 1.827438370846731, |
| "grad_norm": 0.05784786492586136, |
| "learning_rate": 0.0025, |
| "loss": 5.262682914733887, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 192.9986, |
| "step": 427, |
| "tokens/total": 965824, |
| "tokens/train_per_sec_per_gpu": 120.41, |
| "tokens/trainable": 572251 |
| }, |
| { |
| "epoch": 1.8317256162915327, |
| "grad_norm": 0.07479379326105118, |
| "learning_rate": 0.0025, |
| "loss": 5.224900722503662, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 185.84272, |
| "step": 428, |
| "tokens/total": 968064, |
| "tokens/train_per_sec_per_gpu": 7.22, |
| "tokens/trainable": 573467 |
| }, |
| { |
| "epoch": 1.8360128617363345, |
| "grad_norm": 0.048712894320487976, |
| "learning_rate": 0.0025, |
| "loss": 5.22140645980835, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 185.19447, |
| "step": 429, |
| "tokens/total": 970432, |
| "tokens/train_per_sec_per_gpu": 17.21, |
| "tokens/trainable": 574765 |
| }, |
| { |
| "epoch": 1.840300107181136, |
| "grad_norm": 0.0432349257171154, |
| "learning_rate": 0.0025, |
| "loss": 4.933577060699463, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 138.87539, |
| "step": 430, |
| "tokens/total": 972928, |
| "tokens/train_per_sec_per_gpu": 126.94, |
| "tokens/trainable": 576351 |
| }, |
| { |
| "epoch": 1.844587352625938, |
| "grad_norm": 0.035638924688100815, |
| "learning_rate": 0.0025, |
| "loss": 5.010141372680664, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.5, |
| "memory/max_allocated (GiB)": 18.5, |
| "ppl": 149.92593, |
| "step": 431, |
| "tokens/total": 975616, |
| "tokens/train_per_sec_per_gpu": 55.33, |
| "tokens/trainable": 578016 |
| }, |
| { |
| "epoch": 1.8488745980707395, |
| "grad_norm": 0.05865227058529854, |
| "learning_rate": 0.0025, |
| "loss": 4.759288787841797, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.47, |
| "memory/max_allocated (GiB)": 18.47, |
| "ppl": 116.66292, |
| "step": 432, |
| "tokens/total": 977344, |
| "tokens/train_per_sec_per_gpu": 17.53, |
| "tokens/trainable": 578907 |
| }, |
| { |
| "epoch": 1.8531618435155413, |
| "grad_norm": 0.037786636501550674, |
| "learning_rate": 0.0025, |
| "loss": 4.746092796325684, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.56, |
| "memory/max_allocated (GiB)": 18.56, |
| "ppl": 115.13355, |
| "step": 433, |
| "tokens/total": 980800, |
| "tokens/train_per_sec_per_gpu": 37.5, |
| "tokens/trainable": 581317 |
| }, |
| { |
| "epoch": 1.857449088960343, |
| "grad_norm": 0.036064039915800095, |
| "learning_rate": 0.0025, |
| "loss": 4.497668743133545, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 89.80752, |
| "step": 434, |
| "tokens/total": 982976, |
| "tokens/train_per_sec_per_gpu": 41.65, |
| "tokens/trainable": 582503 |
| }, |
| { |
| "epoch": 1.8617363344051447, |
| "grad_norm": 0.059433262795209885, |
| "learning_rate": 0.0025, |
| "loss": 5.0172624588012695, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.55, |
| "memory/max_allocated (GiB)": 18.55, |
| "ppl": 150.99738, |
| "step": 435, |
| "tokens/total": 985600, |
| "tokens/train_per_sec_per_gpu": 18.07, |
| "tokens/trainable": 584121 |
| }, |
| { |
| "epoch": 1.8660235798499465, |
| "grad_norm": 0.05385487526655197, |
| "learning_rate": 0.0025, |
| "loss": 5.047094821929932, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.71, |
| "memory/max_allocated (GiB)": 18.71, |
| "ppl": 155.56985, |
| "step": 436, |
| "tokens/total": 988352, |
| "tokens/train_per_sec_per_gpu": 21.47, |
| "tokens/trainable": 585931 |
| }, |
| { |
| "epoch": 1.870310825294748, |
| "grad_norm": 0.07723033428192139, |
| "learning_rate": 0.0025, |
| "loss": 5.156320571899414, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 173.52481, |
| "step": 437, |
| "tokens/total": 991104, |
| "tokens/train_per_sec_per_gpu": 210.83, |
| "tokens/trainable": 587715 |
| }, |
| { |
| "epoch": 1.87459807073955, |
| "grad_norm": 0.054785728454589844, |
| "learning_rate": 0.0025, |
| "loss": 4.405440330505371, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.36, |
| "memory/max_allocated (GiB)": 18.36, |
| "ppl": 81.8952, |
| "step": 438, |
| "tokens/total": 992448, |
| "tokens/train_per_sec_per_gpu": 34.58, |
| "tokens/trainable": 588195 |
| }, |
| { |
| "epoch": 1.8788853161843515, |
| "grad_norm": 0.044970739632844925, |
| "learning_rate": 0.0025, |
| "loss": 4.495140075683594, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 89.58072, |
| "step": 439, |
| "tokens/total": 995008, |
| "tokens/train_per_sec_per_gpu": 10.07, |
| "tokens/trainable": 589904 |
| }, |
| { |
| "epoch": 1.8831725616291533, |
| "grad_norm": 0.05420251190662384, |
| "learning_rate": 0.0025, |
| "loss": 4.917167663574219, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.63, |
| "memory/max_allocated (GiB)": 18.63, |
| "ppl": 136.61512, |
| "step": 440, |
| "tokens/total": 997568, |
| "tokens/train_per_sec_per_gpu": 58.61, |
| "tokens/trainable": 591429 |
| }, |
| { |
| "epoch": 1.887459807073955, |
| "grad_norm": 0.06885336339473724, |
| "learning_rate": 0.0025, |
| "loss": 4.854959011077881, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.64, |
| "memory/max_allocated (GiB)": 18.64, |
| "ppl": 128.37543, |
| "step": 441, |
| "tokens/total": 1000000, |
| "tokens/train_per_sec_per_gpu": 2.19, |
| "tokens/trainable": 592888 |
| }, |
| { |
| "epoch": 1.8917470525187567, |
| "grad_norm": 0.04857528582215309, |
| "learning_rate": 0.0025, |
| "loss": 4.567473888397217, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.67, |
| "memory/max_allocated (GiB)": 18.67, |
| "ppl": 96.30054, |
| "step": 442, |
| "tokens/total": 1002496, |
| "tokens/train_per_sec_per_gpu": 28.11, |
| "tokens/trainable": 594412 |
| }, |
| { |
| "epoch": 1.8960342979635585, |
| "grad_norm": 0.05679011344909668, |
| "learning_rate": 0.0025, |
| "loss": 4.383760452270508, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 80.13883, |
| "step": 443, |
| "tokens/total": 1004032, |
| "tokens/train_per_sec_per_gpu": 7.89, |
| "tokens/trainable": 595042 |
| }, |
| { |
| "epoch": 1.90032154340836, |
| "grad_norm": 0.08521363139152527, |
| "learning_rate": 0.0025, |
| "loss": 5.441523551940918, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 230.79354, |
| "step": 444, |
| "tokens/total": 1005824, |
| "tokens/train_per_sec_per_gpu": 182.26, |
| "tokens/trainable": 595958 |
| }, |
| { |
| "epoch": 1.904608788853162, |
| "grad_norm": 0.04015873000025749, |
| "learning_rate": 0.0025, |
| "loss": 4.99971866607666, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.57, |
| "memory/max_allocated (GiB)": 18.57, |
| "ppl": 148.37141, |
| "step": 445, |
| "tokens/total": 1008640, |
| "tokens/train_per_sec_per_gpu": 16.68, |
| "tokens/trainable": 597780 |
| }, |
| { |
| "epoch": 1.9088960342979635, |
| "grad_norm": 0.05560390651226044, |
| "learning_rate": 0.0025, |
| "loss": 4.3576273918151855, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 78.07168, |
| "step": 446, |
| "tokens/total": 1010752, |
| "tokens/train_per_sec_per_gpu": 100.98, |
| "tokens/trainable": 598947 |
| }, |
| { |
| "epoch": 1.9131832797427653, |
| "grad_norm": 0.07791434973478317, |
| "learning_rate": 0.0025, |
| "loss": 4.918298721313477, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 136.76973, |
| "step": 447, |
| "tokens/total": 1012352, |
| "tokens/train_per_sec_per_gpu": 78.47, |
| "tokens/trainable": 599685 |
| }, |
| { |
| "epoch": 1.917470525187567, |
| "grad_norm": 0.05341208353638649, |
| "learning_rate": 0.0025, |
| "loss": 5.289507865905762, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.49, |
| "memory/max_allocated (GiB)": 18.49, |
| "ppl": 198.24584, |
| "step": 448, |
| "tokens/total": 1014592, |
| "tokens/train_per_sec_per_gpu": 6.23, |
| "tokens/trainable": 601074 |
| }, |
| { |
| "epoch": 1.9217577706323687, |
| "grad_norm": 0.06588708609342575, |
| "learning_rate": 0.0025, |
| "loss": 4.922712802886963, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 137.37478, |
| "step": 449, |
| "tokens/total": 1016832, |
| "tokens/train_per_sec_per_gpu": 6.2, |
| "tokens/trainable": 602419 |
| }, |
| { |
| "epoch": 1.9260450160771705, |
| "grad_norm": 0.08113836497068405, |
| "learning_rate": 0.0025, |
| "loss": 4.302712440490723, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 73.89997, |
| "step": 450, |
| "tokens/total": 1018880, |
| "tokens/train_per_sec_per_gpu": 54.48, |
| "tokens/trainable": 603525 |
| }, |
| { |
| "epoch": 1.930332261521972, |
| "grad_norm": 0.038718972355127335, |
| "learning_rate": 0.0025, |
| "loss": 5.016265869140625, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 150.84697, |
| "step": 451, |
| "tokens/total": 1021440, |
| "tokens/train_per_sec_per_gpu": 27.21, |
| "tokens/trainable": 605190 |
| }, |
| { |
| "epoch": 1.934619506966774, |
| "grad_norm": 0.06918424367904663, |
| "learning_rate": 0.0025, |
| "loss": 4.20094633102417, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.39, |
| "memory/max_allocated (GiB)": 18.39, |
| "ppl": 66.74947, |
| "step": 452, |
| "tokens/total": 1022976, |
| "tokens/train_per_sec_per_gpu": 59.21, |
| "tokens/trainable": 605785 |
| }, |
| { |
| "epoch": 1.9389067524115755, |
| "grad_norm": 0.05727904662489891, |
| "learning_rate": 0.0025, |
| "loss": 4.495724201202393, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 89.63306, |
| "step": 453, |
| "tokens/total": 1024832, |
| "tokens/train_per_sec_per_gpu": 15.8, |
| "tokens/trainable": 606673 |
| }, |
| { |
| "epoch": 1.9431939978563773, |
| "grad_norm": 0.050397999584674835, |
| "learning_rate": 0.0025, |
| "loss": 4.682392120361328, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.45, |
| "memory/max_allocated (GiB)": 18.45, |
| "ppl": 108.02818, |
| "step": 454, |
| "tokens/total": 1026880, |
| "tokens/train_per_sec_per_gpu": 14.36, |
| "tokens/trainable": 607794 |
| }, |
| { |
| "epoch": 1.947481243301179, |
| "grad_norm": 0.05501880869269371, |
| "learning_rate": 0.0025, |
| "loss": 4.579135894775391, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.94, |
| "memory/max_allocated (GiB)": 18.94, |
| "ppl": 97.43017, |
| "step": 455, |
| "tokens/total": 1030144, |
| "tokens/train_per_sec_per_gpu": 522.12, |
| "tokens/trainable": 610019 |
| }, |
| { |
| "epoch": 1.9517684887459807, |
| "grad_norm": 0.04997771605849266, |
| "learning_rate": 0.0025, |
| "loss": 4.384857177734375, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 80.22676, |
| "step": 456, |
| "tokens/total": 1032128, |
| "tokens/train_per_sec_per_gpu": 14.95, |
| "tokens/trainable": 611215 |
| }, |
| { |
| "epoch": 1.9560557341907825, |
| "grad_norm": 0.036863308399915695, |
| "learning_rate": 0.0025, |
| "loss": 4.938703536987305, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 139.58916, |
| "step": 457, |
| "tokens/total": 1034240, |
| "tokens/train_per_sec_per_gpu": 39.48, |
| "tokens/trainable": 612358 |
| }, |
| { |
| "epoch": 1.960342979635584, |
| "grad_norm": 0.0519835501909256, |
| "learning_rate": 0.0025, |
| "loss": 4.352743148803711, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 77.69129, |
| "step": 458, |
| "tokens/total": 1035712, |
| "tokens/train_per_sec_per_gpu": 86.87, |
| "tokens/trainable": 612911 |
| }, |
| { |
| "epoch": 1.964630225080386, |
| "grad_norm": 0.06522325426340103, |
| "learning_rate": 0.0025, |
| "loss": 4.363661766052246, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.44, |
| "memory/max_allocated (GiB)": 18.44, |
| "ppl": 78.54422, |
| "step": 459, |
| "tokens/total": 1037376, |
| "tokens/train_per_sec_per_gpu": 48.59, |
| "tokens/trainable": 613674 |
| }, |
| { |
| "epoch": 1.9689174705251875, |
| "grad_norm": 0.04710303246974945, |
| "learning_rate": 0.0025, |
| "loss": 4.830078601837158, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.51, |
| "memory/max_allocated (GiB)": 18.51, |
| "ppl": 125.2208, |
| "step": 460, |
| "tokens/total": 1039296, |
| "tokens/train_per_sec_per_gpu": 21.74, |
| "tokens/trainable": 614688 |
| }, |
| { |
| "epoch": 1.9732047159699893, |
| "grad_norm": 0.05254080519080162, |
| "learning_rate": 0.0025, |
| "loss": 4.752572536468506, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 115.88201, |
| "step": 461, |
| "tokens/total": 1041280, |
| "tokens/train_per_sec_per_gpu": 28.76, |
| "tokens/trainable": 615727 |
| }, |
| { |
| "epoch": 1.977491961414791, |
| "grad_norm": 0.04451625421643257, |
| "learning_rate": 0.0025, |
| "loss": 4.789048671722412, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.77, |
| "memory/max_allocated (GiB)": 18.77, |
| "ppl": 120.18698, |
| "step": 462, |
| "tokens/total": 1044224, |
| "tokens/train_per_sec_per_gpu": 95.08, |
| "tokens/trainable": 617841 |
| }, |
| { |
| "epoch": 1.9817792068595927, |
| "grad_norm": 0.07913687825202942, |
| "learning_rate": 0.0025, |
| "loss": 4.311519622802734, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.43, |
| "memory/max_allocated (GiB)": 18.43, |
| "ppl": 74.5537, |
| "step": 463, |
| "tokens/total": 1046208, |
| "tokens/train_per_sec_per_gpu": 8.84, |
| "tokens/trainable": 618874 |
| }, |
| { |
| "epoch": 1.9860664523043945, |
| "grad_norm": 0.047682974487543106, |
| "learning_rate": 0.0025, |
| "loss": 5.018848419189453, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.48, |
| "memory/max_allocated (GiB)": 18.48, |
| "ppl": 151.23704, |
| "step": 464, |
| "tokens/total": 1048704, |
| "tokens/train_per_sec_per_gpu": 10.86, |
| "tokens/trainable": 620535 |
| }, |
| { |
| "epoch": 1.990353697749196, |
| "grad_norm": 0.06579267233610153, |
| "learning_rate": 0.0025, |
| "loss": 4.598994255065918, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.41, |
| "memory/max_allocated (GiB)": 18.41, |
| "ppl": 99.38431, |
| "step": 465, |
| "tokens/total": 1050496, |
| "tokens/train_per_sec_per_gpu": 18.04, |
| "tokens/trainable": 621414 |
| }, |
| { |
| "epoch": 1.994640943193998, |
| "grad_norm": 0.049845289438962936, |
| "learning_rate": 0.0025, |
| "loss": 4.27248477935791, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.46, |
| "memory/max_allocated (GiB)": 18.46, |
| "ppl": 71.69957, |
| "step": 466, |
| "tokens/total": 1052480, |
| "tokens/train_per_sec_per_gpu": 112.51, |
| "tokens/trainable": 622451 |
| }, |
| { |
| "epoch": 1.9989281886387995, |
| "grad_norm": 0.04582377150654793, |
| "learning_rate": 0.0025, |
| "loss": 4.850298881530762, |
| "memory/device_reserved (GiB)": 19.85, |
| "memory/max_active (GiB)": 18.53, |
| "memory/max_allocated (GiB)": 18.53, |
| "ppl": 127.77857, |
| "step": 467, |
| "tokens/total": 1054528, |
| "tokens/train_per_sec_per_gpu": 210.97, |
| "tokens/trainable": 623603 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 467, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 234, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.347829287307059e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|