diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22146 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 157, + "global_step": 1570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 2.523393154144287, + "eval_ppl": 12.47084, + "eval_runtime": 43.864, + "eval_samples_per_second": 61.577, + "eval_steps_per_second": 3.853, + "memory/device_reserved (GiB)": 60.88, + "memory/max_active (GiB)": 50.21, + "memory/max_allocated (GiB)": 50.21, + "step": 0 + }, + { + "epoch": 0.0031847133757961785, + "grad_norm": 26.125, + "learning_rate": 0.0, + "loss": 2.513824939727783, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 60.25, + "memory/max_allocated (GiB)": 60.25, + "ppl": 12.35209, + "step": 1, + "tokens/total": 131072, + "tokens/train_per_sec_per_gpu": 2648.66, + "tokens/trainable": 14388 + }, + { + "epoch": 0.006369426751592357, + "grad_norm": 26.5, + "learning_rate": 3.1847133757961787e-07, + "loss": 2.5059545040130615, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 12.25525, + "step": 2, + "tokens/total": 262144, + "tokens/train_per_sec_per_gpu": 3269.04, + "tokens/trainable": 27845 + }, + { + "epoch": 0.009554140127388535, + "grad_norm": 25.625, + "learning_rate": 6.369426751592357e-07, + "loss": 2.4954071044921875, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 12.12667, + "step": 3, + "tokens/total": 393216, + "tokens/train_per_sec_per_gpu": 3166.68, + "tokens/trainable": 40998 + }, + { + "epoch": 0.012738853503184714, + "grad_norm": 26.25, + "learning_rate": 9.554140127388535e-07, + "loss": 2.526397943496704, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 12.50837, + "step": 4, + "tokens/total": 524288, + "tokens/train_per_sec_per_gpu": 3343.09, + "tokens/trainable": 54878 + }, + { + "epoch": 0.01592356687898089, + "grad_norm": 26.0, + "learning_rate": 1.2738853503184715e-06, + "loss": 2.480510711669922, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 11.94736, + "step": 5, + "tokens/total": 655360, + "tokens/train_per_sec_per_gpu": 3076.8, + "tokens/trainable": 67675 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 26.25, + "learning_rate": 1.5923566878980892e-06, + "loss": 2.5267443656921387, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 12.5127, + "step": 6, + "tokens/total": 786432, + "tokens/train_per_sec_per_gpu": 3657.85, + "tokens/trainable": 82725 + }, + { + "epoch": 0.022292993630573247, + "grad_norm": 25.625, + "learning_rate": 1.910828025477707e-06, + "loss": 2.505220651626587, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 12.24626, + "step": 7, + "tokens/total": 917504, + "tokens/train_per_sec_per_gpu": 3615.3, + "tokens/trainable": 97609 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 26.25, + "learning_rate": 2.229299363057325e-06, + "loss": 2.47495174407959, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 11.88113, + "step": 8, + "tokens/total": 1048576, + "tokens/train_per_sec_per_gpu": 3341.37, + "tokens/trainable": 111360 + }, + { + "epoch": 0.028662420382165606, + "grad_norm": 25.375, + "learning_rate": 2.547770700636943e-06, + "loss": 2.464661121368408, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 11.7595, + "step": 9, + "tokens/total": 1179648, + "tokens/train_per_sec_per_gpu": 3391.98, + "tokens/trainable": 125377 + }, + { + "epoch": 0.03184713375796178, + "grad_norm": 25.125, + "learning_rate": 2.8662420382165605e-06, + "loss": 2.4051315784454346, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 11.07989, + "step": 10, + "tokens/total": 1310720, + "tokens/train_per_sec_per_gpu": 3538.93, + "tokens/trainable": 139941 + }, + { + "epoch": 0.03503184713375796, + "grad_norm": 24.25, + "learning_rate": 3.1847133757961785e-06, + "loss": 2.3649113178253174, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 10.64309, + "step": 11, + "tokens/total": 1441792, + "tokens/train_per_sec_per_gpu": 3190.84, + "tokens/trainable": 153243 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 23.75, + "learning_rate": 3.5031847133757964e-06, + "loss": 2.2840771675109863, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 9.81662, + "step": 12, + "tokens/total": 1572864, + "tokens/train_per_sec_per_gpu": 3122.46, + "tokens/trainable": 166236 + }, + { + "epoch": 0.041401273885350316, + "grad_norm": 23.5, + "learning_rate": 3.821656050955414e-06, + "loss": 2.2835350036621094, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 9.8113, + "step": 13, + "tokens/total": 1703936, + "tokens/train_per_sec_per_gpu": 3508.01, + "tokens/trainable": 180765 + }, + { + "epoch": 0.044585987261146494, + "grad_norm": 22.625, + "learning_rate": 4.140127388535032e-06, + "loss": 2.178839921951294, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 8.83605, + "step": 14, + "tokens/total": 1835008, + "tokens/train_per_sec_per_gpu": 3391.4, + "tokens/trainable": 194814 + }, + { + "epoch": 0.04777070063694268, + "grad_norm": 20.625, + "learning_rate": 4.45859872611465e-06, + "loss": 2.029291868209839, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 7.6087, + "step": 15, + "tokens/total": 1966080, + "tokens/train_per_sec_per_gpu": 2894.11, + "tokens/trainable": 206939 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 19.125, + "learning_rate": 4.777070063694268e-06, + "loss": 1.9433990716934204, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 6.98244, + "step": 16, + "tokens/total": 2097152, + "tokens/train_per_sec_per_gpu": 3260.95, + "tokens/trainable": 220459 + }, + { + "epoch": 0.054140127388535034, + "grad_norm": 17.0, + "learning_rate": 5.095541401273886e-06, + "loss": 1.825382113456726, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 6.20517, + "step": 17, + "tokens/total": 2228224, + "tokens/train_per_sec_per_gpu": 3108.44, + "tokens/trainable": 233450 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 15.8125, + "learning_rate": 5.414012738853504e-06, + "loss": 1.7230491638183594, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 5.60158, + "step": 18, + "tokens/total": 2359296, + "tokens/train_per_sec_per_gpu": 3341.04, + "tokens/trainable": 247328 + }, + { + "epoch": 0.06050955414012739, + "grad_norm": 14.8125, + "learning_rate": 5.732484076433121e-06, + "loss": 1.6547000408172607, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 5.23151, + "step": 19, + "tokens/total": 2490368, + "tokens/train_per_sec_per_gpu": 3383.25, + "tokens/trainable": 261435 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 13.5625, + "learning_rate": 6.050955414012739e-06, + "loss": 1.544914960861206, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 4.68757, + "step": 20, + "tokens/total": 2621440, + "tokens/train_per_sec_per_gpu": 3349.84, + "tokens/trainable": 275370 + }, + { + "epoch": 0.06687898089171974, + "grad_norm": 12.6875, + "learning_rate": 6.369426751592357e-06, + "loss": 1.4839664697647095, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 4.4104, + "step": 21, + "tokens/total": 2752512, + "tokens/train_per_sec_per_gpu": 3158.43, + "tokens/trainable": 288580 + }, + { + "epoch": 0.07006369426751592, + "grad_norm": 12.0625, + "learning_rate": 6.687898089171975e-06, + "loss": 1.3859291076660156, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 3.99854, + "step": 22, + "tokens/total": 2883584, + "tokens/train_per_sec_per_gpu": 3623.28, + "tokens/trainable": 303623 + }, + { + "epoch": 0.0732484076433121, + "grad_norm": 11.1875, + "learning_rate": 7.006369426751593e-06, + "loss": 1.2559714317321777, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 3.51125, + "step": 23, + "tokens/total": 3014656, + "tokens/train_per_sec_per_gpu": 3333.96, + "tokens/trainable": 317478 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 10.1875, + "learning_rate": 7.32484076433121e-06, + "loss": 1.1163444519042969, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 3.05367, + "step": 24, + "tokens/total": 3145728, + "tokens/train_per_sec_per_gpu": 3273.07, + "tokens/trainable": 331087 + }, + { + "epoch": 0.07961783439490445, + "grad_norm": 9.625, + "learning_rate": 7.643312101910828e-06, + "loss": 0.9755889177322388, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 2.65273, + "step": 25, + "tokens/total": 3276800, + "tokens/train_per_sec_per_gpu": 3686.54, + "tokens/trainable": 346421 + }, + { + "epoch": 0.08280254777070063, + "grad_norm": 8.5625, + "learning_rate": 7.961783439490445e-06, + "loss": 0.8369104266166687, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 2.30922, + "step": 26, + "tokens/total": 3407872, + "tokens/train_per_sec_per_gpu": 3225.45, + "tokens/trainable": 359891 + }, + { + "epoch": 0.08598726114649681, + "grad_norm": 7.65625, + "learning_rate": 8.280254777070064e-06, + "loss": 0.7086498737335205, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 2.03125, + "step": 27, + "tokens/total": 3538944, + "tokens/train_per_sec_per_gpu": 3049.77, + "tokens/trainable": 372710 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 7.03125, + "learning_rate": 8.598726114649681e-06, + "loss": 0.6029537320137024, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.82751, + "step": 28, + "tokens/total": 3670016, + "tokens/train_per_sec_per_gpu": 3413.19, + "tokens/trainable": 386972 + }, + { + "epoch": 0.09235668789808917, + "grad_norm": 6.59375, + "learning_rate": 8.9171974522293e-06, + "loss": 0.5023248195648193, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.65256, + "step": 29, + "tokens/total": 3801088, + "tokens/train_per_sec_per_gpu": 2978.06, + "tokens/trainable": 399448 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 5.96875, + "learning_rate": 9.235668789808917e-06, + "loss": 0.4153555631637573, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.51491, + "step": 30, + "tokens/total": 3932160, + "tokens/train_per_sec_per_gpu": 3448.36, + "tokens/trainable": 413796 + }, + { + "epoch": 0.09872611464968153, + "grad_norm": 5.3125, + "learning_rate": 9.554140127388536e-06, + "loss": 0.329733669757843, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.3906, + "step": 31, + "tokens/total": 4063232, + "tokens/train_per_sec_per_gpu": 3050.66, + "tokens/trainable": 426585 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 4.65625, + "learning_rate": 9.872611464968155e-06, + "loss": 0.2749524414539337, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.31647, + "step": 32, + "tokens/total": 4194304, + "tokens/train_per_sec_per_gpu": 3412.69, + "tokens/trainable": 440864 + }, + { + "epoch": 0.10509554140127389, + "grad_norm": 3.8125, + "learning_rate": 1.0191082802547772e-05, + "loss": 0.2164468914270401, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.24166, + "step": 33, + "tokens/total": 4325376, + "tokens/train_per_sec_per_gpu": 3101.83, + "tokens/trainable": 453864 + }, + { + "epoch": 0.10828025477707007, + "grad_norm": 3.125, + "learning_rate": 1.0509554140127389e-05, + "loss": 0.16533951461315155, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.17979, + "step": 34, + "tokens/total": 4456448, + "tokens/train_per_sec_per_gpu": 2919.92, + "tokens/trainable": 466189 + }, + { + "epoch": 0.11146496815286625, + "grad_norm": 2.3125, + "learning_rate": 1.0828025477707008e-05, + "loss": 0.13319599628448486, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.14247, + "step": 35, + "tokens/total": 4587520, + "tokens/train_per_sec_per_gpu": 3395.27, + "tokens/trainable": 480345 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 1.734375, + "learning_rate": 1.1146496815286625e-05, + "loss": 0.11769881844520569, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.12491, + "step": 36, + "tokens/total": 4718592, + "tokens/train_per_sec_per_gpu": 3283.23, + "tokens/trainable": 494113 + }, + { + "epoch": 0.1178343949044586, + "grad_norm": 1.2734375, + "learning_rate": 1.1464968152866242e-05, + "loss": 0.09715006500482559, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.10203, + "step": 37, + "tokens/total": 4849664, + "tokens/train_per_sec_per_gpu": 3440.9, + "tokens/trainable": 508490 + }, + { + "epoch": 0.12101910828025478, + "grad_norm": 1.3828125, + "learning_rate": 1.178343949044586e-05, + "loss": 0.08853279799222946, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.09257, + "step": 38, + "tokens/total": 4980736, + "tokens/train_per_sec_per_gpu": 3324.91, + "tokens/trainable": 522428 + }, + { + "epoch": 0.12420382165605096, + "grad_norm": 1.0625, + "learning_rate": 1.2101910828025478e-05, + "loss": 0.07282212376594543, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.07554, + "step": 39, + "tokens/total": 5111808, + "tokens/train_per_sec_per_gpu": 3291.66, + "tokens/trainable": 536220 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 0.921875, + "learning_rate": 1.2420382165605097e-05, + "loss": 0.07131636142730713, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.07392, + "step": 40, + "tokens/total": 5242880, + "tokens/train_per_sec_per_gpu": 3067.47, + "tokens/trainable": 549148 + }, + { + "epoch": 0.1305732484076433, + "grad_norm": 0.91015625, + "learning_rate": 1.2738853503184714e-05, + "loss": 0.07583475857973099, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.07878, + "step": 41, + "tokens/total": 5373952, + "tokens/train_per_sec_per_gpu": 3078.11, + "tokens/trainable": 562021 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 1.015625, + "learning_rate": 1.3057324840764331e-05, + "loss": 0.05423282831907272, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.05573, + "step": 42, + "tokens/total": 5505024, + "tokens/train_per_sec_per_gpu": 3152.3, + "tokens/trainable": 575214 + }, + { + "epoch": 0.13694267515923567, + "grad_norm": 1.0703125, + "learning_rate": 1.337579617834395e-05, + "loss": 0.05849003419280052, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.06023, + "step": 43, + "tokens/total": 5636096, + "tokens/train_per_sec_per_gpu": 3026.82, + "tokens/trainable": 587989 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 0.671875, + "learning_rate": 1.3694267515923567e-05, + "loss": 0.047232724726200104, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04837, + "step": 44, + "tokens/total": 5767168, + "tokens/train_per_sec_per_gpu": 3186.14, + "tokens/trainable": 601337 + }, + { + "epoch": 0.14331210191082802, + "grad_norm": 0.8203125, + "learning_rate": 1.4012738853503186e-05, + "loss": 0.0633855015039444, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.06544, + "step": 45, + "tokens/total": 5898240, + "tokens/train_per_sec_per_gpu": 3243.91, + "tokens/trainable": 614903 + }, + { + "epoch": 0.1464968152866242, + "grad_norm": 0.7734375, + "learning_rate": 1.4331210191082803e-05, + "loss": 0.057890165597200394, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0596, + "step": 46, + "tokens/total": 6029312, + "tokens/train_per_sec_per_gpu": 3235.78, + "tokens/trainable": 628512 + }, + { + "epoch": 0.14968152866242038, + "grad_norm": 0.62890625, + "learning_rate": 1.464968152866242e-05, + "loss": 0.057463180273771286, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.05915, + "step": 47, + "tokens/total": 6160384, + "tokens/train_per_sec_per_gpu": 3663.91, + "tokens/trainable": 643746 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 0.55859375, + "learning_rate": 1.4968152866242039e-05, + "loss": 0.047860756516456604, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04902, + "step": 48, + "tokens/total": 6291456, + "tokens/train_per_sec_per_gpu": 3663.1, + "tokens/trainable": 659004 + }, + { + "epoch": 0.15605095541401273, + "grad_norm": 0.69140625, + "learning_rate": 1.5286624203821656e-05, + "loss": 0.04775935783982277, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04892, + "step": 49, + "tokens/total": 6422528, + "tokens/train_per_sec_per_gpu": 3484.38, + "tokens/trainable": 673537 + }, + { + "epoch": 0.1592356687898089, + "grad_norm": 0.65234375, + "learning_rate": 1.5605095541401275e-05, + "loss": 0.041205767542123795, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04207, + "step": 50, + "tokens/total": 6553600, + "tokens/train_per_sec_per_gpu": 3230.47, + "tokens/trainable": 687060 + }, + { + "epoch": 0.1624203821656051, + "grad_norm": 0.5625, + "learning_rate": 1.592356687898089e-05, + "loss": 0.04386754706501961, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04484, + "step": 51, + "tokens/total": 6684672, + "tokens/train_per_sec_per_gpu": 3268.41, + "tokens/trainable": 700730 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 0.44140625, + "learning_rate": 1.624203821656051e-05, + "loss": 0.041807860136032104, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04269, + "step": 52, + "tokens/total": 6815744, + "tokens/train_per_sec_per_gpu": 3368.11, + "tokens/trainable": 714773 + }, + { + "epoch": 0.16878980891719744, + "grad_norm": 0.54296875, + "learning_rate": 1.6560509554140128e-05, + "loss": 0.04267745837569237, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0436, + "step": 53, + "tokens/total": 6946816, + "tokens/train_per_sec_per_gpu": 3215.88, + "tokens/trainable": 728248 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 0.54296875, + "learning_rate": 1.6878980891719747e-05, + "loss": 0.04988788813352585, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.05115, + "step": 54, + "tokens/total": 7077888, + "tokens/train_per_sec_per_gpu": 3378.45, + "tokens/trainable": 742393 + }, + { + "epoch": 0.1751592356687898, + "grad_norm": 0.60546875, + "learning_rate": 1.7197452229299362e-05, + "loss": 0.03681975603103638, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03751, + "step": 55, + "tokens/total": 7208960, + "tokens/train_per_sec_per_gpu": 3317.61, + "tokens/trainable": 756289 + }, + { + "epoch": 0.17834394904458598, + "grad_norm": 0.54296875, + "learning_rate": 1.751592356687898e-05, + "loss": 0.03921874612569809, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04, + "step": 56, + "tokens/total": 7340032, + "tokens/train_per_sec_per_gpu": 3135.92, + "tokens/trainable": 769413 + }, + { + "epoch": 0.18152866242038215, + "grad_norm": 0.498046875, + "learning_rate": 1.78343949044586e-05, + "loss": 0.03980698809027672, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04061, + "step": 57, + "tokens/total": 7471104, + "tokens/train_per_sec_per_gpu": 3113.74, + "tokens/trainable": 782484 + }, + { + "epoch": 0.18471337579617833, + "grad_norm": 0.62109375, + "learning_rate": 1.8152866242038215e-05, + "loss": 0.03426855802536011, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03486, + "step": 58, + "tokens/total": 7602176, + "tokens/train_per_sec_per_gpu": 3252.75, + "tokens/trainable": 796083 + }, + { + "epoch": 0.18789808917197454, + "grad_norm": 0.51953125, + "learning_rate": 1.8471337579617834e-05, + "loss": 0.03522620350122452, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03585, + "step": 59, + "tokens/total": 7733248, + "tokens/train_per_sec_per_gpu": 3557.53, + "tokens/trainable": 810976 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 0.609375, + "learning_rate": 1.8789808917197453e-05, + "loss": 0.03881306201219559, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03958, + "step": 60, + "tokens/total": 7864320, + "tokens/train_per_sec_per_gpu": 3437.92, + "tokens/trainable": 825388 + }, + { + "epoch": 0.1942675159235669, + "grad_norm": 0.7890625, + "learning_rate": 1.910828025477707e-05, + "loss": 0.04205251485109329, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04295, + "step": 61, + "tokens/total": 7995392, + "tokens/train_per_sec_per_gpu": 2932.75, + "tokens/trainable": 837817 + }, + { + "epoch": 0.19745222929936307, + "grad_norm": 0.58203125, + "learning_rate": 1.942675159235669e-05, + "loss": 0.03300648555159569, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03356, + "step": 62, + "tokens/total": 8126464, + "tokens/train_per_sec_per_gpu": 3125.85, + "tokens/trainable": 850991 + }, + { + "epoch": 0.20063694267515925, + "grad_norm": 0.87109375, + "learning_rate": 1.974522292993631e-05, + "loss": 0.03468535467982292, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03529, + "step": 63, + "tokens/total": 8257536, + "tokens/train_per_sec_per_gpu": 3543.61, + "tokens/trainable": 865759 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 0.6171875, + "learning_rate": 2.0063694267515925e-05, + "loss": 0.035250235348939896, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03588, + "step": 64, + "tokens/total": 8388608, + "tokens/train_per_sec_per_gpu": 3393.88, + "tokens/trainable": 879904 + }, + { + "epoch": 0.2070063694267516, + "grad_norm": 0.63671875, + "learning_rate": 2.0382165605095544e-05, + "loss": 0.03242558240890503, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03296, + "step": 65, + "tokens/total": 8519680, + "tokens/train_per_sec_per_gpu": 2965.68, + "tokens/trainable": 892375 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 0.765625, + "learning_rate": 2.0700636942675162e-05, + "loss": 0.04080452769994736, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.04165, + "step": 66, + "tokens/total": 8650752, + "tokens/train_per_sec_per_gpu": 3513.42, + "tokens/trainable": 907090 + }, + { + "epoch": 0.21337579617834396, + "grad_norm": 0.40625, + "learning_rate": 2.1019108280254778e-05, + "loss": 0.02815978415310383, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02856, + "step": 67, + "tokens/total": 8781824, + "tokens/train_per_sec_per_gpu": 3257.68, + "tokens/trainable": 920761 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 0.53125, + "learning_rate": 2.1337579617834397e-05, + "loss": 0.034378018230199814, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03498, + "step": 68, + "tokens/total": 8912896, + "tokens/train_per_sec_per_gpu": 3612.11, + "tokens/trainable": 935785 + }, + { + "epoch": 0.2197452229299363, + "grad_norm": 0.65234375, + "learning_rate": 2.1656050955414015e-05, + "loss": 0.03373882547020912, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03431, + "step": 69, + "tokens/total": 9043968, + "tokens/train_per_sec_per_gpu": 3727.74, + "tokens/trainable": 951259 + }, + { + "epoch": 0.2229299363057325, + "grad_norm": 0.458984375, + "learning_rate": 2.197452229299363e-05, + "loss": 0.03272494301199913, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03327, + "step": 70, + "tokens/total": 9175040, + "tokens/train_per_sec_per_gpu": 3482.14, + "tokens/trainable": 965829 + }, + { + "epoch": 0.22611464968152867, + "grad_norm": 0.55078125, + "learning_rate": 2.229299363057325e-05, + "loss": 0.02994038723409176, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03039, + "step": 71, + "tokens/total": 9306112, + "tokens/train_per_sec_per_gpu": 3238.41, + "tokens/trainable": 979395 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 0.75390625, + "learning_rate": 2.261146496815287e-05, + "loss": 0.033101145178079605, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03366, + "step": 72, + "tokens/total": 9437184, + "tokens/train_per_sec_per_gpu": 3700.58, + "tokens/trainable": 994803 + }, + { + "epoch": 0.23248407643312102, + "grad_norm": 0.396484375, + "learning_rate": 2.2929936305732484e-05, + "loss": 0.03042842261493206, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0309, + "step": 73, + "tokens/total": 9568256, + "tokens/train_per_sec_per_gpu": 3386.78, + "tokens/trainable": 1008996 + }, + { + "epoch": 0.2356687898089172, + "grad_norm": 0.53515625, + "learning_rate": 2.3248407643312103e-05, + "loss": 0.02688576839864254, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02725, + "step": 74, + "tokens/total": 9699328, + "tokens/train_per_sec_per_gpu": 3353.01, + "tokens/trainable": 1023021 + }, + { + "epoch": 0.23885350318471338, + "grad_norm": 0.51953125, + "learning_rate": 2.356687898089172e-05, + "loss": 0.028813578188419342, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02923, + "step": 75, + "tokens/total": 9830400, + "tokens/train_per_sec_per_gpu": 3035.97, + "tokens/trainable": 1035757 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 0.546875, + "learning_rate": 2.388535031847134e-05, + "loss": 0.035763900727033615, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03641, + "step": 76, + "tokens/total": 9961472, + "tokens/train_per_sec_per_gpu": 2971.17, + "tokens/trainable": 1048202 + }, + { + "epoch": 0.24522292993630573, + "grad_norm": 0.61328125, + "learning_rate": 2.4203821656050956e-05, + "loss": 0.026223331689834595, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02657, + "step": 77, + "tokens/total": 10092544, + "tokens/train_per_sec_per_gpu": 3195.37, + "tokens/trainable": 1061576 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 0.451171875, + "learning_rate": 2.4522292993630575e-05, + "loss": 0.037136998027563095, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.03784, + "step": 78, + "tokens/total": 10223616, + "tokens/train_per_sec_per_gpu": 3185.64, + "tokens/trainable": 1074924 + }, + { + "epoch": 0.2515923566878981, + "grad_norm": 0.44140625, + "learning_rate": 2.4840764331210193e-05, + "loss": 0.02757476083934307, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02796, + "step": 79, + "tokens/total": 10354688, + "tokens/train_per_sec_per_gpu": 3141.94, + "tokens/trainable": 1088089 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 0.60546875, + "learning_rate": 2.515923566878981e-05, + "loss": 0.026085954159498215, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02643, + "step": 80, + "tokens/total": 10485760, + "tokens/train_per_sec_per_gpu": 3340.19, + "tokens/trainable": 1102070 + }, + { + "epoch": 0.25796178343949044, + "grad_norm": 0.41015625, + "learning_rate": 2.5477707006369428e-05, + "loss": 0.027341356500983238, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02772, + "step": 81, + "tokens/total": 10616832, + "tokens/train_per_sec_per_gpu": 3294.79, + "tokens/trainable": 1115858 + }, + { + "epoch": 0.2611464968152866, + "grad_norm": 0.431640625, + "learning_rate": 2.5796178343949047e-05, + "loss": 0.028896335512399673, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02932, + "step": 82, + "tokens/total": 10747904, + "tokens/train_per_sec_per_gpu": 3433.89, + "tokens/trainable": 1130226 + }, + { + "epoch": 0.2643312101910828, + "grad_norm": 0.466796875, + "learning_rate": 2.6114649681528662e-05, + "loss": 0.026260778307914734, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02661, + "step": 83, + "tokens/total": 10878976, + "tokens/train_per_sec_per_gpu": 3711.62, + "tokens/trainable": 1145755 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 0.53125, + "learning_rate": 2.643312101910828e-05, + "loss": 0.027284812182188034, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02766, + "step": 84, + "tokens/total": 11010048, + "tokens/train_per_sec_per_gpu": 3309.83, + "tokens/trainable": 1159641 + }, + { + "epoch": 0.27070063694267515, + "grad_norm": 0.376953125, + "learning_rate": 2.67515923566879e-05, + "loss": 0.02594919502735138, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02629, + "step": 85, + "tokens/total": 11141120, + "tokens/train_per_sec_per_gpu": 3424.34, + "tokens/trainable": 1173967 + }, + { + "epoch": 0.27388535031847133, + "grad_norm": 0.50390625, + "learning_rate": 2.707006369426752e-05, + "loss": 0.025507405400276184, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02584, + "step": 86, + "tokens/total": 11272192, + "tokens/train_per_sec_per_gpu": 2757.24, + "tokens/trainable": 1185544 + }, + { + "epoch": 0.2770700636942675, + "grad_norm": 0.4765625, + "learning_rate": 2.7388535031847134e-05, + "loss": 0.024133453145623207, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02443, + "step": 87, + "tokens/total": 11403264, + "tokens/train_per_sec_per_gpu": 3215.45, + "tokens/trainable": 1199051 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 0.45703125, + "learning_rate": 2.7707006369426753e-05, + "loss": 0.026854459196329117, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02722, + "step": 88, + "tokens/total": 11534336, + "tokens/train_per_sec_per_gpu": 3550.27, + "tokens/trainable": 1213857 + }, + { + "epoch": 0.28343949044585987, + "grad_norm": 0.45703125, + "learning_rate": 2.802547770700637e-05, + "loss": 0.02602829411625862, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02637, + "step": 89, + "tokens/total": 11665408, + "tokens/train_per_sec_per_gpu": 3183.98, + "tokens/trainable": 1227192 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 0.337890625, + "learning_rate": 2.8343949044585987e-05, + "loss": 0.020508471876382828, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02072, + "step": 90, + "tokens/total": 11796480, + "tokens/train_per_sec_per_gpu": 3402.64, + "tokens/trainable": 1241432 + }, + { + "epoch": 0.2898089171974522, + "grad_norm": 0.408203125, + "learning_rate": 2.8662420382165606e-05, + "loss": 0.017694037407636642, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01785, + "step": 91, + "tokens/total": 11927552, + "tokens/train_per_sec_per_gpu": 3333.79, + "tokens/trainable": 1255396 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 0.4140625, + "learning_rate": 2.8980891719745225e-05, + "loss": 0.027573810890316963, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02796, + "step": 92, + "tokens/total": 12058624, + "tokens/train_per_sec_per_gpu": 2994.34, + "tokens/trainable": 1268041 + }, + { + "epoch": 0.2961783439490446, + "grad_norm": 0.486328125, + "learning_rate": 2.929936305732484e-05, + "loss": 0.028143662959337234, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02854, + "step": 93, + "tokens/total": 12189696, + "tokens/train_per_sec_per_gpu": 3516.43, + "tokens/trainable": 1282765 + }, + { + "epoch": 0.29936305732484075, + "grad_norm": 0.4765625, + "learning_rate": 2.961783439490446e-05, + "loss": 0.026264818385243416, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02661, + "step": 94, + "tokens/total": 12320768, + "tokens/train_per_sec_per_gpu": 3304.3, + "tokens/trainable": 1296613 + }, + { + "epoch": 0.30254777070063693, + "grad_norm": 0.462890625, + "learning_rate": 2.9936305732484078e-05, + "loss": 0.026661768555641174, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02702, + "step": 95, + "tokens/total": 12451840, + "tokens/train_per_sec_per_gpu": 3563.3, + "tokens/trainable": 1311465 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 0.306640625, + "learning_rate": 3.0254777070063693e-05, + "loss": 0.017260678112506866, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01741, + "step": 96, + "tokens/total": 12582912, + "tokens/train_per_sec_per_gpu": 3428.76, + "tokens/trainable": 1325753 + }, + { + "epoch": 0.3089171974522293, + "grad_norm": 0.5703125, + "learning_rate": 3.057324840764331e-05, + "loss": 0.022419072687625885, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02267, + "step": 97, + "tokens/total": 12713984, + "tokens/train_per_sec_per_gpu": 3443.07, + "tokens/trainable": 1340109 + }, + { + "epoch": 0.31210191082802546, + "grad_norm": 0.50390625, + "learning_rate": 3.089171974522293e-05, + "loss": 0.023397397249937057, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02367, + "step": 98, + "tokens/total": 12845056, + "tokens/train_per_sec_per_gpu": 3420.73, + "tokens/trainable": 1354398 + }, + { + "epoch": 0.31528662420382164, + "grad_norm": 0.43359375, + "learning_rate": 3.121019108280255e-05, + "loss": 0.024743150919675827, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02505, + "step": 99, + "tokens/total": 12976128, + "tokens/train_per_sec_per_gpu": 3420.87, + "tokens/trainable": 1368740 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 0.3984375, + "learning_rate": 3.1528662420382165e-05, + "loss": 0.023541904985904694, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02382, + "step": 100, + "tokens/total": 13107200, + "tokens/train_per_sec_per_gpu": 3192.28, + "tokens/trainable": 1382180 + }, + { + "epoch": 0.321656050955414, + "grad_norm": 0.66015625, + "learning_rate": 3.184713375796178e-05, + "loss": 0.023172177374362946, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02344, + "step": 101, + "tokens/total": 13238272, + "tokens/train_per_sec_per_gpu": 3177.98, + "tokens/trainable": 1395593 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 0.48828125, + "learning_rate": 3.21656050955414e-05, + "loss": 0.025406980887055397, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02573, + "step": 102, + "tokens/total": 13369344, + "tokens/train_per_sec_per_gpu": 3638.95, + "tokens/trainable": 1410783 + }, + { + "epoch": 0.32802547770700635, + "grad_norm": 0.69921875, + "learning_rate": 3.248407643312102e-05, + "loss": 0.02435356006026268, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02465, + "step": 103, + "tokens/total": 13500416, + "tokens/train_per_sec_per_gpu": 3263.52, + "tokens/trainable": 1424464 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 0.404296875, + "learning_rate": 3.2802547770700634e-05, + "loss": 0.02753208577632904, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02791, + "step": 104, + "tokens/total": 13631488, + "tokens/train_per_sec_per_gpu": 3426.67, + "tokens/trainable": 1438808 + }, + { + "epoch": 0.3343949044585987, + "grad_norm": 0.404296875, + "learning_rate": 3.3121019108280256e-05, + "loss": 0.0209305789321661, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02115, + "step": 105, + "tokens/total": 13762560, + "tokens/train_per_sec_per_gpu": 3761.41, + "tokens/trainable": 1454488 + }, + { + "epoch": 0.3375796178343949, + "grad_norm": 0.5859375, + "learning_rate": 3.343949044585987e-05, + "loss": 0.023175280541181564, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02345, + "step": 106, + "tokens/total": 13893632, + "tokens/train_per_sec_per_gpu": 3065.06, + "tokens/trainable": 1467330 + }, + { + "epoch": 0.34076433121019106, + "grad_norm": 0.443359375, + "learning_rate": 3.375796178343949e-05, + "loss": 0.022064058110117912, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02231, + "step": 107, + "tokens/total": 14024704, + "tokens/train_per_sec_per_gpu": 3306.24, + "tokens/trainable": 1481142 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 0.490234375, + "learning_rate": 3.407643312101911e-05, + "loss": 0.0202829297631979, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02049, + "step": 108, + "tokens/total": 14155776, + "tokens/train_per_sec_per_gpu": 3532.79, + "tokens/trainable": 1495947 + }, + { + "epoch": 0.3471337579617834, + "grad_norm": 0.4453125, + "learning_rate": 3.4394904458598724e-05, + "loss": 0.01804858073592186, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01821, + "step": 109, + "tokens/total": 14286848, + "tokens/train_per_sec_per_gpu": 3518.95, + "tokens/trainable": 1510694 + }, + { + "epoch": 0.3503184713375796, + "grad_norm": 0.42578125, + "learning_rate": 3.4713375796178346e-05, + "loss": 0.0210330281406641, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02126, + "step": 110, + "tokens/total": 14417920, + "tokens/train_per_sec_per_gpu": 2978.43, + "tokens/trainable": 1523251 + }, + { + "epoch": 0.3535031847133758, + "grad_norm": 0.427734375, + "learning_rate": 3.503184713375796e-05, + "loss": 0.026296302676200867, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02665, + "step": 111, + "tokens/total": 14548992, + "tokens/train_per_sec_per_gpu": 3167.03, + "tokens/trainable": 1536545 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 0.5234375, + "learning_rate": 3.535031847133758e-05, + "loss": 0.020682599395513535, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0209, + "step": 112, + "tokens/total": 14680064, + "tokens/train_per_sec_per_gpu": 3285.72, + "tokens/trainable": 1550307 + }, + { + "epoch": 0.35987261146496813, + "grad_norm": 0.53125, + "learning_rate": 3.56687898089172e-05, + "loss": 0.018929051235318184, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01911, + "step": 113, + "tokens/total": 14811136, + "tokens/train_per_sec_per_gpu": 3527.93, + "tokens/trainable": 1565056 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 0.453125, + "learning_rate": 3.5987261146496815e-05, + "loss": 0.02578428015112877, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02612, + "step": 114, + "tokens/total": 14942208, + "tokens/train_per_sec_per_gpu": 3195.27, + "tokens/trainable": 1578471 + }, + { + "epoch": 0.3662420382165605, + "grad_norm": 0.54296875, + "learning_rate": 3.630573248407643e-05, + "loss": 0.02062690444290638, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02084, + "step": 115, + "tokens/total": 15073280, + "tokens/train_per_sec_per_gpu": 3476.31, + "tokens/trainable": 1593028 + }, + { + "epoch": 0.36942675159235666, + "grad_norm": 0.5546875, + "learning_rate": 3.662420382165605e-05, + "loss": 0.018274614587426186, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01844, + "step": 116, + "tokens/total": 15204352, + "tokens/train_per_sec_per_gpu": 3437.34, + "tokens/trainable": 1607412 + }, + { + "epoch": 0.37261146496815284, + "grad_norm": 0.3359375, + "learning_rate": 3.694267515923567e-05, + "loss": 0.02159012109041214, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02182, + "step": 117, + "tokens/total": 15335424, + "tokens/train_per_sec_per_gpu": 3467.82, + "tokens/trainable": 1621934 + }, + { + "epoch": 0.37579617834394907, + "grad_norm": 0.4609375, + "learning_rate": 3.7261146496815283e-05, + "loss": 0.0239134319126606, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0242, + "step": 118, + "tokens/total": 15466496, + "tokens/train_per_sec_per_gpu": 3526.82, + "tokens/trainable": 1636693 + }, + { + "epoch": 0.37898089171974525, + "grad_norm": 0.546875, + "learning_rate": 3.7579617834394906e-05, + "loss": 0.021818162873387337, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02206, + "step": 119, + "tokens/total": 15597568, + "tokens/train_per_sec_per_gpu": 3233.2, + "tokens/trainable": 1650256 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 0.3671875, + "learning_rate": 3.789808917197453e-05, + "loss": 0.023171117529273033, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02344, + "step": 120, + "tokens/total": 15728640, + "tokens/train_per_sec_per_gpu": 3502.77, + "tokens/trainable": 1664915 + }, + { + "epoch": 0.3853503184713376, + "grad_norm": 0.408203125, + "learning_rate": 3.821656050955414e-05, + "loss": 0.019905205816030502, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0201, + "step": 121, + "tokens/total": 15859712, + "tokens/train_per_sec_per_gpu": 3495.55, + "tokens/trainable": 1679527 + }, + { + "epoch": 0.3885350318471338, + "grad_norm": 0.4765625, + "learning_rate": 3.8535031847133766e-05, + "loss": 0.01511327363550663, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01523, + "step": 122, + "tokens/total": 15990784, + "tokens/train_per_sec_per_gpu": 3507.46, + "tokens/trainable": 1694159 + }, + { + "epoch": 0.39171974522292996, + "grad_norm": 0.44921875, + "learning_rate": 3.885350318471338e-05, + "loss": 0.02048143371939659, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02069, + "step": 123, + "tokens/total": 16121856, + "tokens/train_per_sec_per_gpu": 3490.03, + "tokens/trainable": 1708712 + }, + { + "epoch": 0.39490445859872614, + "grad_norm": 0.392578125, + "learning_rate": 3.9171974522292996e-05, + "loss": 0.02280033566057682, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02306, + "step": 124, + "tokens/total": 16252928, + "tokens/train_per_sec_per_gpu": 3443.73, + "tokens/trainable": 1723059 + }, + { + "epoch": 0.3980891719745223, + "grad_norm": 0.322265625, + "learning_rate": 3.949044585987262e-05, + "loss": 0.01703651435673237, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01718, + "step": 125, + "tokens/total": 16384000, + "tokens/train_per_sec_per_gpu": 3396.46, + "tokens/trainable": 1737268 + }, + { + "epoch": 0.4012738853503185, + "grad_norm": 0.37109375, + "learning_rate": 3.9808917197452234e-05, + "loss": 0.019548913463950157, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01974, + "step": 126, + "tokens/total": 16515072, + "tokens/train_per_sec_per_gpu": 3220.09, + "tokens/trainable": 1750779 + }, + { + "epoch": 0.40445859872611467, + "grad_norm": 0.4609375, + "learning_rate": 4.012738853503185e-05, + "loss": 0.021433480083942413, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02166, + "step": 127, + "tokens/total": 16646144, + "tokens/train_per_sec_per_gpu": 3135.99, + "tokens/trainable": 1763916 + }, + { + "epoch": 0.40764331210191085, + "grad_norm": 0.36328125, + "learning_rate": 4.044585987261147e-05, + "loss": 0.01608860120177269, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01622, + "step": 128, + "tokens/total": 16777216, + "tokens/train_per_sec_per_gpu": 3294.85, + "tokens/trainable": 1777688 + }, + { + "epoch": 0.410828025477707, + "grad_norm": 0.384765625, + "learning_rate": 4.076433121019109e-05, + "loss": 0.02616111747920513, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02651, + "step": 129, + "tokens/total": 16908288, + "tokens/train_per_sec_per_gpu": 3542.44, + "tokens/trainable": 1792526 + }, + { + "epoch": 0.4140127388535032, + "grad_norm": 0.359375, + "learning_rate": 4.10828025477707e-05, + "loss": 0.023339644074440002, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02361, + "step": 130, + "tokens/total": 17039360, + "tokens/train_per_sec_per_gpu": 3579.44, + "tokens/trainable": 1807456 + }, + { + "epoch": 0.4171974522292994, + "grad_norm": 0.396484375, + "learning_rate": 4.1401273885350325e-05, + "loss": 0.01703963428735733, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01719, + "step": 131, + "tokens/total": 17170432, + "tokens/train_per_sec_per_gpu": 3374.03, + "tokens/trainable": 1821617 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 0.322265625, + "learning_rate": 4.171974522292994e-05, + "loss": 0.018855011090636253, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01903, + "step": 132, + "tokens/total": 17301504, + "tokens/train_per_sec_per_gpu": 3358.78, + "tokens/trainable": 1835657 + }, + { + "epoch": 0.42356687898089174, + "grad_norm": 0.32421875, + "learning_rate": 4.2038216560509556e-05, + "loss": 0.018383294343948364, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01855, + "step": 133, + "tokens/total": 17432576, + "tokens/train_per_sec_per_gpu": 3288.93, + "tokens/trainable": 1849363 + }, + { + "epoch": 0.4267515923566879, + "grad_norm": 0.341796875, + "learning_rate": 4.235668789808918e-05, + "loss": 0.018167613074183464, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01833, + "step": 134, + "tokens/total": 17563648, + "tokens/train_per_sec_per_gpu": 3327.75, + "tokens/trainable": 1863304 + }, + { + "epoch": 0.4299363057324841, + "grad_norm": 0.263671875, + "learning_rate": 4.267515923566879e-05, + "loss": 0.016551347449421883, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01669, + "step": 135, + "tokens/total": 17694720, + "tokens/train_per_sec_per_gpu": 3278.66, + "tokens/trainable": 1877019 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 0.3359375, + "learning_rate": 4.299363057324841e-05, + "loss": 0.02233925275504589, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02259, + "step": 136, + "tokens/total": 17825792, + "tokens/train_per_sec_per_gpu": 3065.32, + "tokens/trainable": 1889991 + }, + { + "epoch": 0.43630573248407645, + "grad_norm": 0.35546875, + "learning_rate": 4.331210191082803e-05, + "loss": 0.01874961145222187, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01893, + "step": 137, + "tokens/total": 17956864, + "tokens/train_per_sec_per_gpu": 3421.98, + "tokens/trainable": 1904258 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 0.35546875, + "learning_rate": 4.3630573248407646e-05, + "loss": 0.016853082925081253, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.017, + "step": 138, + "tokens/total": 18087936, + "tokens/train_per_sec_per_gpu": 3173.54, + "tokens/trainable": 1917589 + }, + { + "epoch": 0.4426751592356688, + "grad_norm": 0.373046875, + "learning_rate": 4.394904458598726e-05, + "loss": 0.015192901715636253, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01531, + "step": 139, + "tokens/total": 18219008, + "tokens/train_per_sec_per_gpu": 2954.41, + "tokens/trainable": 1930014 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 0.302734375, + "learning_rate": 4.4267515923566884e-05, + "loss": 0.01463925652205944, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01475, + "step": 140, + "tokens/total": 18350080, + "tokens/train_per_sec_per_gpu": 3666.98, + "tokens/trainable": 1945307 + }, + { + "epoch": 0.44904458598726116, + "grad_norm": 0.390625, + "learning_rate": 4.45859872611465e-05, + "loss": 0.020933344960212708, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.02115, + "step": 141, + "tokens/total": 18481152, + "tokens/train_per_sec_per_gpu": 3580.73, + "tokens/trainable": 1960244 + }, + { + "epoch": 0.45222929936305734, + "grad_norm": 0.345703125, + "learning_rate": 4.4904458598726115e-05, + "loss": 0.016706032678484917, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01685, + "step": 142, + "tokens/total": 18612224, + "tokens/train_per_sec_per_gpu": 3692.46, + "tokens/trainable": 1975680 + }, + { + "epoch": 0.4554140127388535, + "grad_norm": 0.271484375, + "learning_rate": 4.522292993630574e-05, + "loss": 0.0143811646848917, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01449, + "step": 143, + "tokens/total": 18743296, + "tokens/train_per_sec_per_gpu": 3610.19, + "tokens/trainable": 1990745 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 0.333984375, + "learning_rate": 4.554140127388535e-05, + "loss": 0.015790347009897232, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01592, + "step": 144, + "tokens/total": 18874368, + "tokens/train_per_sec_per_gpu": 3290.56, + "tokens/trainable": 2004531 + }, + { + "epoch": 0.46178343949044587, + "grad_norm": 0.251953125, + "learning_rate": 4.585987261146497e-05, + "loss": 0.013354619033634663, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01344, + "step": 145, + "tokens/total": 19005440, + "tokens/train_per_sec_per_gpu": 3241.55, + "tokens/trainable": 2018101 + }, + { + "epoch": 0.46496815286624205, + "grad_norm": 0.376953125, + "learning_rate": 4.617834394904459e-05, + "loss": 0.01745392382144928, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01761, + "step": 146, + "tokens/total": 19136512, + "tokens/train_per_sec_per_gpu": 3409.89, + "tokens/trainable": 2032310 + }, + { + "epoch": 0.4681528662420382, + "grad_norm": 0.38671875, + "learning_rate": 4.6496815286624206e-05, + "loss": 0.015100197866559029, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01521, + "step": 147, + "tokens/total": 19267584, + "tokens/train_per_sec_per_gpu": 3269.82, + "tokens/trainable": 2045999 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 0.310546875, + "learning_rate": 4.681528662420383e-05, + "loss": 0.01744706742465496, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0176, + "step": 148, + "tokens/total": 19398656, + "tokens/train_per_sec_per_gpu": 3709.08, + "tokens/trainable": 2061453 + }, + { + "epoch": 0.4745222929936306, + "grad_norm": 0.283203125, + "learning_rate": 4.713375796178344e-05, + "loss": 0.013093837536871433, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01318, + "step": 149, + "tokens/total": 19529728, + "tokens/train_per_sec_per_gpu": 3292.43, + "tokens/trainable": 2075180 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 0.275390625, + "learning_rate": 4.745222929936306e-05, + "loss": 0.01639549434185028, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01653, + "step": 150, + "tokens/total": 19660800, + "tokens/train_per_sec_per_gpu": 3175.73, + "tokens/trainable": 2088491 + }, + { + "epoch": 0.48089171974522293, + "grad_norm": 0.31640625, + "learning_rate": 4.777070063694268e-05, + "loss": 0.015184286050498486, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0153, + "step": 151, + "tokens/total": 19791872, + "tokens/train_per_sec_per_gpu": 3611.48, + "tokens/trainable": 2103581 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 0.318359375, + "learning_rate": 4.8089171974522296e-05, + "loss": 0.015232382342219353, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01535, + "step": 152, + "tokens/total": 19922944, + "tokens/train_per_sec_per_gpu": 3138.84, + "tokens/trainable": 2116743 + }, + { + "epoch": 0.4872611464968153, + "grad_norm": 0.4140625, + "learning_rate": 4.840764331210191e-05, + "loss": 0.018071118742227554, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01824, + "step": 153, + "tokens/total": 20054016, + "tokens/train_per_sec_per_gpu": 2935.94, + "tokens/trainable": 2129049 + }, + { + "epoch": 0.49044585987261147, + "grad_norm": 0.26953125, + "learning_rate": 4.8726114649681534e-05, + "loss": 0.015034169889986515, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01515, + "step": 154, + "tokens/total": 20185088, + "tokens/train_per_sec_per_gpu": 3956.78, + "tokens/trainable": 2145499 + }, + { + "epoch": 0.49363057324840764, + "grad_norm": 0.2734375, + "learning_rate": 4.904458598726115e-05, + "loss": 0.013894051313400269, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01399, + "step": 155, + "tokens/total": 20316160, + "tokens/train_per_sec_per_gpu": 3559.16, + "tokens/trainable": 2160294 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 0.29296875, + "learning_rate": 4.9363057324840765e-05, + "loss": 0.01629924215376377, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01643, + "step": 156, + "tokens/total": 20447232, + "tokens/train_per_sec_per_gpu": 3108.59, + "tokens/trainable": 2173313 + }, + { + "epoch": 0.5, + "grad_norm": 0.34375, + "learning_rate": 4.968152866242039e-05, + "loss": 0.014140879735350609, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01424, + "step": 157, + "tokens/total": 20578304, + "tokens/train_per_sec_per_gpu": 3311.55, + "tokens/trainable": 2187121 + }, + { + "epoch": 0.5, + "eval_loss": 0.0162150077521801, + "eval_ppl": 1.01635, + "eval_runtime": 42.1529, + "eval_samples_per_second": 64.076, + "eval_steps_per_second": 4.009, + "memory/device_reserved (GiB)": 68.88, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 157 + }, + { + "epoch": 0.5031847133757962, + "grad_norm": 0.255859375, + "learning_rate": 5e-05, + "loss": 0.012421849183738232, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0125, + "step": 158, + "tokens/total": 20709376, + "tokens/train_per_sec_per_gpu": 3796.22, + "tokens/trainable": 2202882 + }, + { + "epoch": 0.5063694267515924, + "grad_norm": 0.298828125, + "learning_rate": 4.999993820899543e-05, + "loss": 0.014737301506102085, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01485, + "step": 159, + "tokens/total": 20840448, + "tokens/train_per_sec_per_gpu": 2912.87, + "tokens/trainable": 2215142 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 0.3828125, + "learning_rate": 4.999975283628719e-05, + "loss": 0.017280632629990578, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01743, + "step": 160, + "tokens/total": 20971520, + "tokens/train_per_sec_per_gpu": 2864.73, + "tokens/trainable": 2227241 + }, + { + "epoch": 0.5127388535031847, + "grad_norm": 0.30078125, + "learning_rate": 4.999944388279162e-05, + "loss": 0.014671262353658676, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01478, + "step": 161, + "tokens/total": 21102592, + "tokens/train_per_sec_per_gpu": 3598.96, + "tokens/trainable": 2242266 + }, + { + "epoch": 0.5159235668789809, + "grad_norm": 0.357421875, + "learning_rate": 4.999901135003596e-05, + "loss": 0.01328805461525917, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01338, + "step": 162, + "tokens/total": 21233664, + "tokens/train_per_sec_per_gpu": 3491.44, + "tokens/trainable": 2256820 + }, + { + "epoch": 0.5191082802547771, + "grad_norm": 0.294921875, + "learning_rate": 4.9998455240158346e-05, + "loss": 0.015039588324725628, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01515, + "step": 163, + "tokens/total": 21364736, + "tokens/train_per_sec_per_gpu": 2929.21, + "tokens/trainable": 2269119 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 0.3203125, + "learning_rate": 4.999777555590779e-05, + "loss": 0.014336930587887764, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01444, + "step": 164, + "tokens/total": 21495808, + "tokens/train_per_sec_per_gpu": 3728.34, + "tokens/trainable": 2284700 + }, + { + "epoch": 0.5254777070063694, + "grad_norm": 0.279296875, + "learning_rate": 4.999697230064414e-05, + "loss": 0.01668444462120533, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01682, + "step": 165, + "tokens/total": 21626880, + "tokens/train_per_sec_per_gpu": 3542.38, + "tokens/trainable": 2299523 + }, + { + "epoch": 0.5286624203821656, + "grad_norm": 0.275390625, + "learning_rate": 4.999604547833814e-05, + "loss": 0.01559534203261137, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01572, + "step": 166, + "tokens/total": 21757952, + "tokens/train_per_sec_per_gpu": 3348.3, + "tokens/trainable": 2313539 + }, + { + "epoch": 0.5318471337579618, + "grad_norm": 0.251953125, + "learning_rate": 4.9994995093571314e-05, + "loss": 0.01181457843631506, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01188, + "step": 167, + "tokens/total": 21889024, + "tokens/train_per_sec_per_gpu": 3193.91, + "tokens/trainable": 2326972 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 0.326171875, + "learning_rate": 4.9993821151536024e-05, + "loss": 0.014408236369490623, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01451, + "step": 168, + "tokens/total": 22020096, + "tokens/train_per_sec_per_gpu": 3172.43, + "tokens/trainable": 2340305 + }, + { + "epoch": 0.5382165605095541, + "grad_norm": 0.259765625, + "learning_rate": 4.9992523658035376e-05, + "loss": 0.010526357218623161, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01058, + "step": 169, + "tokens/total": 22151168, + "tokens/train_per_sec_per_gpu": 3477.3, + "tokens/trainable": 2354865 + }, + { + "epoch": 0.5414012738853503, + "grad_norm": 0.2734375, + "learning_rate": 4.9991102619483254e-05, + "loss": 0.015866123139858246, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01599, + "step": 170, + "tokens/total": 22282240, + "tokens/train_per_sec_per_gpu": 3352.2, + "tokens/trainable": 2368942 + }, + { + "epoch": 0.5445859872611465, + "grad_norm": 0.34765625, + "learning_rate": 4.998955804290425e-05, + "loss": 0.015990689396858215, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01612, + "step": 171, + "tokens/total": 22413312, + "tokens/train_per_sec_per_gpu": 3329.47, + "tokens/trainable": 2382903 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 0.294921875, + "learning_rate": 4.998788993593364e-05, + "loss": 0.012892219237983227, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01298, + "step": 172, + "tokens/total": 22544384, + "tokens/train_per_sec_per_gpu": 3491.15, + "tokens/trainable": 2397472 + }, + { + "epoch": 0.5509554140127388, + "grad_norm": 0.326171875, + "learning_rate": 4.998609830681734e-05, + "loss": 0.016418559476733208, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01655, + "step": 173, + "tokens/total": 22675456, + "tokens/train_per_sec_per_gpu": 3177.44, + "tokens/trainable": 2410837 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 0.275390625, + "learning_rate": 4.998418316441188e-05, + "loss": 0.0159194003790617, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01605, + "step": 174, + "tokens/total": 22806528, + "tokens/train_per_sec_per_gpu": 3252.23, + "tokens/trainable": 2424499 + }, + { + "epoch": 0.5573248407643312, + "grad_norm": 0.255859375, + "learning_rate": 4.998214451818434e-05, + "loss": 0.017272397875785828, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01742, + "step": 175, + "tokens/total": 22937600, + "tokens/train_per_sec_per_gpu": 3335.6, + "tokens/trainable": 2438525 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 0.3671875, + "learning_rate": 4.997998237821233e-05, + "loss": 0.018668157979846, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01884, + "step": 176, + "tokens/total": 23068672, + "tokens/train_per_sec_per_gpu": 3037.8, + "tokens/trainable": 2451344 + }, + { + "epoch": 0.5636942675159236, + "grad_norm": 0.275390625, + "learning_rate": 4.99776967551839e-05, + "loss": 0.013892064802348614, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01399, + "step": 177, + "tokens/total": 23199744, + "tokens/train_per_sec_per_gpu": 3608.18, + "tokens/trainable": 2466462 + }, + { + "epoch": 0.5668789808917197, + "grad_norm": 0.318359375, + "learning_rate": 4.997528766039754e-05, + "loss": 0.018128130584955215, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01829, + "step": 178, + "tokens/total": 23330816, + "tokens/train_per_sec_per_gpu": 3418.17, + "tokens/trainable": 2480794 + }, + { + "epoch": 0.5700636942675159, + "grad_norm": 0.279296875, + "learning_rate": 4.997275510576207e-05, + "loss": 0.015599234029650688, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01572, + "step": 179, + "tokens/total": 23461888, + "tokens/train_per_sec_per_gpu": 3348.28, + "tokens/trainable": 2494826 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 0.263671875, + "learning_rate": 4.9970099103796625e-05, + "loss": 0.01772911660373211, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01789, + "step": 180, + "tokens/total": 23592960, + "tokens/train_per_sec_per_gpu": 3350.41, + "tokens/trainable": 2508825 + }, + { + "epoch": 0.5764331210191083, + "grad_norm": 0.3046875, + "learning_rate": 4.9967319667630567e-05, + "loss": 0.017531519755721092, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01769, + "step": 181, + "tokens/total": 23724032, + "tokens/train_per_sec_per_gpu": 3416.59, + "tokens/trainable": 2523152 + }, + { + "epoch": 0.5796178343949044, + "grad_norm": 0.25390625, + "learning_rate": 4.9964416811003414e-05, + "loss": 0.01645725592970848, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01659, + "step": 182, + "tokens/total": 23855104, + "tokens/train_per_sec_per_gpu": 3286.33, + "tokens/trainable": 2536956 + }, + { + "epoch": 0.5828025477707006, + "grad_norm": 0.298828125, + "learning_rate": 4.996139054826482e-05, + "loss": 0.017507638782262802, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01766, + "step": 183, + "tokens/total": 23986176, + "tokens/train_per_sec_per_gpu": 3802.12, + "tokens/trainable": 2552813 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 0.2333984375, + "learning_rate": 4.9958240894374433e-05, + "loss": 0.015289016999304295, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01541, + "step": 184, + "tokens/total": 24117248, + "tokens/train_per_sec_per_gpu": 3166.13, + "tokens/trainable": 2566093 + }, + { + "epoch": 0.589171974522293, + "grad_norm": 0.2275390625, + "learning_rate": 4.995496786490189e-05, + "loss": 0.01385944988578558, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01396, + "step": 185, + "tokens/total": 24248320, + "tokens/train_per_sec_per_gpu": 3395.23, + "tokens/trainable": 2580324 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 0.28515625, + "learning_rate": 4.995157147602669e-05, + "loss": 0.01804269105195999, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01821, + "step": 186, + "tokens/total": 24379392, + "tokens/train_per_sec_per_gpu": 3278.99, + "tokens/trainable": 2594113 + }, + { + "epoch": 0.5955414012738853, + "grad_norm": 0.3359375, + "learning_rate": 4.994805174453813e-05, + "loss": 0.01675378903746605, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01689, + "step": 187, + "tokens/total": 24510464, + "tokens/train_per_sec_per_gpu": 3247.25, + "tokens/trainable": 2607778 + }, + { + "epoch": 0.5987261146496815, + "grad_norm": 0.2578125, + "learning_rate": 4.994440868783522e-05, + "loss": 0.014898994006216526, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01501, + "step": 188, + "tokens/total": 24641536, + "tokens/train_per_sec_per_gpu": 3439.86, + "tokens/trainable": 2622165 + }, + { + "epoch": 0.6019108280254777, + "grad_norm": 0.236328125, + "learning_rate": 4.994064232392664e-05, + "loss": 0.012711770832538605, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01279, + "step": 189, + "tokens/total": 24772608, + "tokens/train_per_sec_per_gpu": 3250.18, + "tokens/trainable": 2635842 + }, + { + "epoch": 0.6050955414012739, + "grad_norm": 0.201171875, + "learning_rate": 4.993675267143056e-05, + "loss": 0.0118938647210598, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01196, + "step": 190, + "tokens/total": 24903680, + "tokens/train_per_sec_per_gpu": 3684.73, + "tokens/trainable": 2651259 + }, + { + "epoch": 0.60828025477707, + "grad_norm": 0.265625, + "learning_rate": 4.993273974957463e-05, + "loss": 0.011486702598631382, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01155, + "step": 191, + "tokens/total": 25034752, + "tokens/train_per_sec_per_gpu": 3177.1, + "tokens/trainable": 2664566 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 0.23046875, + "learning_rate": 4.992860357819584e-05, + "loss": 0.012811151333153248, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01289, + "step": 192, + "tokens/total": 25165824, + "tokens/train_per_sec_per_gpu": 3414.45, + "tokens/trainable": 2678871 + }, + { + "epoch": 0.6146496815286624, + "grad_norm": 0.30078125, + "learning_rate": 4.992434417774045e-05, + "loss": 0.011826693080365658, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0119, + "step": 193, + "tokens/total": 25296896, + "tokens/train_per_sec_per_gpu": 3298.7, + "tokens/trainable": 2692737 + }, + { + "epoch": 0.6178343949044586, + "grad_norm": 0.2353515625, + "learning_rate": 4.991996156926387e-05, + "loss": 0.01326029933989048, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01335, + "step": 194, + "tokens/total": 25427968, + "tokens/train_per_sec_per_gpu": 3122.96, + "tokens/trainable": 2705928 + }, + { + "epoch": 0.6210191082802548, + "grad_norm": 0.2890625, + "learning_rate": 4.991545577443057e-05, + "loss": 0.012153583578765392, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01223, + "step": 195, + "tokens/total": 25559040, + "tokens/train_per_sec_per_gpu": 3089.07, + "tokens/trainable": 2718915 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 0.296875, + "learning_rate": 4.991082681551396e-05, + "loss": 0.014371933415532112, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01448, + "step": 196, + "tokens/total": 25690112, + "tokens/train_per_sec_per_gpu": 3197.51, + "tokens/trainable": 2732376 + }, + { + "epoch": 0.6273885350318471, + "grad_norm": 0.26953125, + "learning_rate": 4.990607471539626e-05, + "loss": 0.012046409770846367, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01212, + "step": 197, + "tokens/total": 25821184, + "tokens/train_per_sec_per_gpu": 3374.92, + "tokens/trainable": 2746546 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 0.2392578125, + "learning_rate": 4.990119949756845e-05, + "loss": 0.009664296172559261, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00971, + "step": 198, + "tokens/total": 25952256, + "tokens/train_per_sec_per_gpu": 3569.8, + "tokens/trainable": 2761477 + }, + { + "epoch": 0.6337579617834395, + "grad_norm": 0.279296875, + "learning_rate": 4.989620118613009e-05, + "loss": 0.00950827170163393, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00955, + "step": 199, + "tokens/total": 26083328, + "tokens/train_per_sec_per_gpu": 3265.27, + "tokens/trainable": 2775167 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 0.310546875, + "learning_rate": 4.989107980578924e-05, + "loss": 0.01698843576014042, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01713, + "step": 200, + "tokens/total": 26214400, + "tokens/train_per_sec_per_gpu": 3262.25, + "tokens/trainable": 2788865 + }, + { + "epoch": 0.6401273885350318, + "grad_norm": 0.248046875, + "learning_rate": 4.9885835381862326e-05, + "loss": 0.009720825590193272, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00977, + "step": 201, + "tokens/total": 26345472, + "tokens/train_per_sec_per_gpu": 3459.38, + "tokens/trainable": 2803380 + }, + { + "epoch": 0.643312101910828, + "grad_norm": 0.30859375, + "learning_rate": 4.988046794027399e-05, + "loss": 0.01347583532333374, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01357, + "step": 202, + "tokens/total": 26476544, + "tokens/train_per_sec_per_gpu": 3450.44, + "tokens/trainable": 2817829 + }, + { + "epoch": 0.6464968152866242, + "grad_norm": 0.2890625, + "learning_rate": 4.987497750755702e-05, + "loss": 0.014860209077596664, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01497, + "step": 203, + "tokens/total": 26607616, + "tokens/train_per_sec_per_gpu": 3450.98, + "tokens/trainable": 2832277 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 0.31640625, + "learning_rate": 4.986936411085214e-05, + "loss": 0.016120830550789833, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01625, + "step": 204, + "tokens/total": 26738688, + "tokens/train_per_sec_per_gpu": 3184.35, + "tokens/trainable": 2845614 + }, + { + "epoch": 0.6528662420382165, + "grad_norm": 0.2578125, + "learning_rate": 4.986362777790796e-05, + "loss": 0.01890011504292488, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01908, + "step": 205, + "tokens/total": 26869760, + "tokens/train_per_sec_per_gpu": 3386.54, + "tokens/trainable": 2859717 + }, + { + "epoch": 0.6560509554140127, + "grad_norm": 0.333984375, + "learning_rate": 4.9857768537080784e-05, + "loss": 0.014317265711724758, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01442, + "step": 206, + "tokens/total": 27000832, + "tokens/train_per_sec_per_gpu": 3426.69, + "tokens/trainable": 2874068 + }, + { + "epoch": 0.6592356687898089, + "grad_norm": 0.31640625, + "learning_rate": 4.9851786417334466e-05, + "loss": 0.013661851175129414, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01376, + "step": 207, + "tokens/total": 27131904, + "tokens/train_per_sec_per_gpu": 3324.85, + "tokens/trainable": 2887963 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 0.251953125, + "learning_rate": 4.984568144824032e-05, + "loss": 0.01245003379881382, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01253, + "step": 208, + "tokens/total": 27262976, + "tokens/train_per_sec_per_gpu": 3335.64, + "tokens/trainable": 2901885 + }, + { + "epoch": 0.6656050955414012, + "grad_norm": 0.265625, + "learning_rate": 4.983945365997691e-05, + "loss": 0.010308452881872654, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01036, + "step": 209, + "tokens/total": 27394048, + "tokens/train_per_sec_per_gpu": 2771.97, + "tokens/trainable": 2913512 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 0.234375, + "learning_rate": 4.9833103083329947e-05, + "loss": 0.013119550421833992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01321, + "step": 210, + "tokens/total": 27525120, + "tokens/train_per_sec_per_gpu": 3729.48, + "tokens/trainable": 2929046 + }, + { + "epoch": 0.6719745222929936, + "grad_norm": 0.259765625, + "learning_rate": 4.98266297496921e-05, + "loss": 0.01352207362651825, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01361, + "step": 211, + "tokens/total": 27656192, + "tokens/train_per_sec_per_gpu": 3277.56, + "tokens/trainable": 2942780 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 0.34765625, + "learning_rate": 4.982003369106287e-05, + "loss": 0.017431171610951424, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01758, + "step": 212, + "tokens/total": 27787264, + "tokens/train_per_sec_per_gpu": 3344.98, + "tokens/trainable": 2956783 + }, + { + "epoch": 0.678343949044586, + "grad_norm": 0.255859375, + "learning_rate": 4.981331494004845e-05, + "loss": 0.01397764589637518, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01408, + "step": 213, + "tokens/total": 27918336, + "tokens/train_per_sec_per_gpu": 3185.6, + "tokens/trainable": 2970117 + }, + { + "epoch": 0.6815286624203821, + "grad_norm": 0.30859375, + "learning_rate": 4.980647352986148e-05, + "loss": 0.014616122469305992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01472, + "step": 214, + "tokens/total": 28049408, + "tokens/train_per_sec_per_gpu": 3594.29, + "tokens/trainable": 2985083 + }, + { + "epoch": 0.6847133757961783, + "grad_norm": 0.34375, + "learning_rate": 4.979950949432098e-05, + "loss": 0.012630216777324677, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01271, + "step": 215, + "tokens/total": 28180480, + "tokens/train_per_sec_per_gpu": 3114.53, + "tokens/trainable": 2998164 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 0.369140625, + "learning_rate": 4.979242286785214e-05, + "loss": 0.01619878038764, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01633, + "step": 216, + "tokens/total": 28311552, + "tokens/train_per_sec_per_gpu": 3343.4, + "tokens/trainable": 3012168 + }, + { + "epoch": 0.6910828025477707, + "grad_norm": 0.1923828125, + "learning_rate": 4.978521368548612e-05, + "loss": 0.00897720456123352, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00902, + "step": 217, + "tokens/total": 28442624, + "tokens/train_per_sec_per_gpu": 3292.3, + "tokens/trainable": 3025888 + }, + { + "epoch": 0.6942675159235668, + "grad_norm": 0.232421875, + "learning_rate": 4.977788198285995e-05, + "loss": 0.010021158494055271, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01007, + "step": 218, + "tokens/total": 28573696, + "tokens/train_per_sec_per_gpu": 3319.6, + "tokens/trainable": 3039763 + }, + { + "epoch": 0.697452229299363, + "grad_norm": 0.23828125, + "learning_rate": 4.9770427796216284e-05, + "loss": 0.01425202563405037, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01435, + "step": 219, + "tokens/total": 28704768, + "tokens/train_per_sec_per_gpu": 2847.77, + "tokens/trainable": 3051731 + }, + { + "epoch": 0.7006369426751592, + "grad_norm": 0.322265625, + "learning_rate": 4.976285116240326e-05, + "loss": 0.014778842218220234, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01489, + "step": 220, + "tokens/total": 28835840, + "tokens/train_per_sec_per_gpu": 3280.14, + "tokens/trainable": 3065475 + }, + { + "epoch": 0.7038216560509554, + "grad_norm": 0.2275390625, + "learning_rate": 4.9755152118874294e-05, + "loss": 0.011257003992795944, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01132, + "step": 221, + "tokens/total": 28966912, + "tokens/train_per_sec_per_gpu": 3367.48, + "tokens/trainable": 3079510 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 0.2021484375, + "learning_rate": 4.9747330703687914e-05, + "loss": 0.013675577938556671, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01377, + "step": 222, + "tokens/total": 29097984, + "tokens/train_per_sec_per_gpu": 3844.8, + "tokens/trainable": 3095524 + }, + { + "epoch": 0.7101910828025477, + "grad_norm": 0.2294921875, + "learning_rate": 4.9739386955507587e-05, + "loss": 0.01433156430721283, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01443, + "step": 223, + "tokens/total": 29229056, + "tokens/train_per_sec_per_gpu": 3346.69, + "tokens/trainable": 3109543 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 0.2177734375, + "learning_rate": 4.9731320913601474e-05, + "loss": 0.010345865972340107, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0104, + "step": 224, + "tokens/total": 29360128, + "tokens/train_per_sec_per_gpu": 3025.76, + "tokens/trainable": 3122229 + }, + { + "epoch": 0.7165605095541401, + "grad_norm": 0.2109375, + "learning_rate": 4.9723132617842284e-05, + "loss": 0.014529074542224407, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01464, + "step": 225, + "tokens/total": 29491200, + "tokens/train_per_sec_per_gpu": 3346.66, + "tokens/trainable": 3136235 + }, + { + "epoch": 0.7197452229299363, + "grad_norm": 0.263671875, + "learning_rate": 4.971482210870706e-05, + "loss": 0.017442386597394943, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0176, + "step": 226, + "tokens/total": 29622272, + "tokens/train_per_sec_per_gpu": 3192.22, + "tokens/trainable": 3149606 + }, + { + "epoch": 0.7229299363057324, + "grad_norm": 0.1875, + "learning_rate": 4.970638942727698e-05, + "loss": 0.00844226311892271, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00848, + "step": 227, + "tokens/total": 29753344, + "tokens/train_per_sec_per_gpu": 3247.88, + "tokens/trainable": 3163147 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 0.1748046875, + "learning_rate": 4.969783461523714e-05, + "loss": 0.010366439819335938, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01042, + "step": 228, + "tokens/total": 29884416, + "tokens/train_per_sec_per_gpu": 3545.1, + "tokens/trainable": 3177891 + }, + { + "epoch": 0.7292993630573248, + "grad_norm": 0.259765625, + "learning_rate": 4.968915771487639e-05, + "loss": 0.011432585306465626, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0115, + "step": 229, + "tokens/total": 30015488, + "tokens/train_per_sec_per_gpu": 3336.0, + "tokens/trainable": 3191819 + }, + { + "epoch": 0.732484076433121, + "grad_norm": 0.2412109375, + "learning_rate": 4.9680358769087076e-05, + "loss": 0.012058578431606293, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01213, + "step": 230, + "tokens/total": 30146560, + "tokens/train_per_sec_per_gpu": 3245.98, + "tokens/trainable": 3205431 + }, + { + "epoch": 0.7356687898089171, + "grad_norm": 0.216796875, + "learning_rate": 4.9671437821364855e-05, + "loss": 0.013203555718064308, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01329, + "step": 231, + "tokens/total": 30277632, + "tokens/train_per_sec_per_gpu": 2895.23, + "tokens/trainable": 3217538 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 0.2109375, + "learning_rate": 4.966239491580847e-05, + "loss": 0.011110116727650166, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01117, + "step": 232, + "tokens/total": 30408704, + "tokens/train_per_sec_per_gpu": 3255.67, + "tokens/trainable": 3231099 + }, + { + "epoch": 0.7420382165605095, + "grad_norm": 0.19921875, + "learning_rate": 4.965323009711954e-05, + "loss": 0.01235074270516634, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01243, + "step": 233, + "tokens/total": 30539776, + "tokens/train_per_sec_per_gpu": 3738.25, + "tokens/trainable": 3246613 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 0.2119140625, + "learning_rate": 4.964394341060233e-05, + "loss": 0.014128293842077255, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01423, + "step": 234, + "tokens/total": 30670848, + "tokens/train_per_sec_per_gpu": 3075.78, + "tokens/trainable": 3259483 + }, + { + "epoch": 0.7484076433121019, + "grad_norm": 0.2041015625, + "learning_rate": 4.9634534902163544e-05, + "loss": 0.011594554409384727, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01166, + "step": 235, + "tokens/total": 30801920, + "tokens/train_per_sec_per_gpu": 3397.95, + "tokens/trainable": 3273641 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 0.34375, + "learning_rate": 4.962500461831207e-05, + "loss": 0.015983082354068756, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01611, + "step": 236, + "tokens/total": 30932992, + "tokens/train_per_sec_per_gpu": 3322.87, + "tokens/trainable": 3287575 + }, + { + "epoch": 0.7547770700636943, + "grad_norm": 0.2333984375, + "learning_rate": 4.961535260615876e-05, + "loss": 0.01292226929217577, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01301, + "step": 237, + "tokens/total": 31064064, + "tokens/train_per_sec_per_gpu": 3320.22, + "tokens/trainable": 3301421 + }, + { + "epoch": 0.7579617834394905, + "grad_norm": 0.2197265625, + "learning_rate": 4.9605578913416245e-05, + "loss": 0.014275891706347466, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01438, + "step": 238, + "tokens/total": 31195136, + "tokens/train_per_sec_per_gpu": 3614.8, + "tokens/trainable": 3316404 + }, + { + "epoch": 0.7611464968152867, + "grad_norm": 0.267578125, + "learning_rate": 4.959568358839861e-05, + "loss": 0.01322453934699297, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01331, + "step": 239, + "tokens/total": 31326208, + "tokens/train_per_sec_per_gpu": 3704.99, + "tokens/trainable": 3331869 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 0.240234375, + "learning_rate": 4.958566668002123e-05, + "loss": 0.01428250689059496, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01438, + "step": 240, + "tokens/total": 31457280, + "tokens/train_per_sec_per_gpu": 3217.37, + "tokens/trainable": 3345254 + }, + { + "epoch": 0.767515923566879, + "grad_norm": 0.248046875, + "learning_rate": 4.957552823780047e-05, + "loss": 0.011499980464577675, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01157, + "step": 241, + "tokens/total": 31588352, + "tokens/train_per_sec_per_gpu": 3332.37, + "tokens/trainable": 3359111 + }, + { + "epoch": 0.7707006369426752, + "grad_norm": 0.25, + "learning_rate": 4.956526831185353e-05, + "loss": 0.014339377172291279, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01444, + "step": 242, + "tokens/total": 31719424, + "tokens/train_per_sec_per_gpu": 3461.36, + "tokens/trainable": 3373551 + }, + { + "epoch": 0.7738853503184714, + "grad_norm": 0.1787109375, + "learning_rate": 4.955488695289806e-05, + "loss": 0.009887355379760265, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00994, + "step": 243, + "tokens/total": 31850496, + "tokens/train_per_sec_per_gpu": 3502.72, + "tokens/trainable": 3388151 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 0.236328125, + "learning_rate": 4.954438421225206e-05, + "loss": 0.013017972931265831, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0131, + "step": 244, + "tokens/total": 31981568, + "tokens/train_per_sec_per_gpu": 3313.42, + "tokens/trainable": 3401935 + }, + { + "epoch": 0.7802547770700637, + "grad_norm": 0.22265625, + "learning_rate": 4.9533760141833506e-05, + "loss": 0.012434033676981926, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01251, + "step": 245, + "tokens/total": 32112640, + "tokens/train_per_sec_per_gpu": 3363.79, + "tokens/trainable": 3415979 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 0.19140625, + "learning_rate": 4.952301479416015e-05, + "loss": 0.011714441701769829, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01178, + "step": 246, + "tokens/total": 32243712, + "tokens/train_per_sec_per_gpu": 3236.03, + "tokens/trainable": 3429486 + }, + { + "epoch": 0.7866242038216561, + "grad_norm": 0.2294921875, + "learning_rate": 4.9512148222349274e-05, + "loss": 0.01364858727902174, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01374, + "step": 247, + "tokens/total": 32374784, + "tokens/train_per_sec_per_gpu": 3117.68, + "tokens/trainable": 3442584 + }, + { + "epoch": 0.7898089171974523, + "grad_norm": 0.185546875, + "learning_rate": 4.950116048011739e-05, + "loss": 0.00907064788043499, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00911, + "step": 248, + "tokens/total": 32505856, + "tokens/train_per_sec_per_gpu": 3310.3, + "tokens/trainable": 3456412 + }, + { + "epoch": 0.7929936305732485, + "grad_norm": 0.185546875, + "learning_rate": 4.949005162177997e-05, + "loss": 0.011760072782635689, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01183, + "step": 249, + "tokens/total": 32636928, + "tokens/train_per_sec_per_gpu": 3404.86, + "tokens/trainable": 3470647 + }, + { + "epoch": 0.7961783439490446, + "grad_norm": 0.2294921875, + "learning_rate": 4.9478821702251234e-05, + "loss": 0.014284678734838963, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01439, + "step": 250, + "tokens/total": 32768000, + "tokens/train_per_sec_per_gpu": 3377.89, + "tokens/trainable": 3484748 + }, + { + "epoch": 0.7993630573248408, + "grad_norm": 0.18359375, + "learning_rate": 4.9467470777043806e-05, + "loss": 0.011529207229614258, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0116, + "step": 251, + "tokens/total": 32899072, + "tokens/train_per_sec_per_gpu": 3574.68, + "tokens/trainable": 3499669 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 0.2314453125, + "learning_rate": 4.9455998902268504e-05, + "loss": 0.01309981569647789, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01319, + "step": 252, + "tokens/total": 33030144, + "tokens/train_per_sec_per_gpu": 3255.28, + "tokens/trainable": 3513312 + }, + { + "epoch": 0.8057324840764332, + "grad_norm": 0.1748046875, + "learning_rate": 4.944440613463402e-05, + "loss": 0.007244420703500509, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00727, + "step": 253, + "tokens/total": 33161216, + "tokens/train_per_sec_per_gpu": 3061.67, + "tokens/trainable": 3526131 + }, + { + "epoch": 0.8089171974522293, + "grad_norm": 0.1982421875, + "learning_rate": 4.943269253144664e-05, + "loss": 0.012152907438576221, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01223, + "step": 254, + "tokens/total": 33292288, + "tokens/train_per_sec_per_gpu": 3129.1, + "tokens/trainable": 3539258 + }, + { + "epoch": 0.8121019108280255, + "grad_norm": 0.1923828125, + "learning_rate": 4.9420858150610025e-05, + "loss": 0.009945802390575409, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01, + "step": 255, + "tokens/total": 33423360, + "tokens/train_per_sec_per_gpu": 3101.37, + "tokens/trainable": 3552212 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 0.1826171875, + "learning_rate": 4.9408903050624796e-05, + "loss": 0.00950522068887949, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00955, + "step": 256, + "tokens/total": 33554432, + "tokens/train_per_sec_per_gpu": 3437.25, + "tokens/trainable": 3566622 + }, + { + "epoch": 0.8184713375796179, + "grad_norm": 0.265625, + "learning_rate": 4.939682729058839e-05, + "loss": 0.012676852755248547, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01276, + "step": 257, + "tokens/total": 33685504, + "tokens/train_per_sec_per_gpu": 3405.34, + "tokens/trainable": 3580857 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 0.2392578125, + "learning_rate": 4.938463093019466e-05, + "loss": 0.012163055129349232, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01224, + "step": 258, + "tokens/total": 33816576, + "tokens/train_per_sec_per_gpu": 3175.85, + "tokens/trainable": 3594180 + }, + { + "epoch": 0.8248407643312102, + "grad_norm": 0.220703125, + "learning_rate": 4.937231402973365e-05, + "loss": 0.011768801137804985, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01184, + "step": 259, + "tokens/total": 33947648, + "tokens/train_per_sec_per_gpu": 3036.11, + "tokens/trainable": 3606954 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 0.2333984375, + "learning_rate": 4.935987665009123e-05, + "loss": 0.01067468523979187, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01073, + "step": 260, + "tokens/total": 34078720, + "tokens/train_per_sec_per_gpu": 3332.71, + "tokens/trainable": 3620834 + }, + { + "epoch": 0.8312101910828026, + "grad_norm": 0.208984375, + "learning_rate": 4.934731885274887e-05, + "loss": 0.008789247833192348, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00883, + "step": 261, + "tokens/total": 34209792, + "tokens/train_per_sec_per_gpu": 3139.67, + "tokens/trainable": 3633998 + }, + { + "epoch": 0.8343949044585988, + "grad_norm": 0.2119140625, + "learning_rate": 4.9334640699783286e-05, + "loss": 0.011909011751413345, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01198, + "step": 262, + "tokens/total": 34340864, + "tokens/train_per_sec_per_gpu": 3340.74, + "tokens/trainable": 3647974 + }, + { + "epoch": 0.8375796178343949, + "grad_norm": 0.265625, + "learning_rate": 4.9321842253866136e-05, + "loss": 0.013996127992868423, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01409, + "step": 263, + "tokens/total": 34471936, + "tokens/train_per_sec_per_gpu": 3762.99, + "tokens/trainable": 3663593 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 0.228515625, + "learning_rate": 4.930892357826373e-05, + "loss": 0.014773533679544926, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01488, + "step": 264, + "tokens/total": 34603008, + "tokens/train_per_sec_per_gpu": 3474.74, + "tokens/trainable": 3678065 + }, + { + "epoch": 0.8439490445859873, + "grad_norm": 0.2138671875, + "learning_rate": 4.92958847368367e-05, + "loss": 0.01498363260179758, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0151, + "step": 265, + "tokens/total": 34734080, + "tokens/train_per_sec_per_gpu": 3050.93, + "tokens/trainable": 3690846 + }, + { + "epoch": 0.8471337579617835, + "grad_norm": 0.1884765625, + "learning_rate": 4.928272579403969e-05, + "loss": 0.009248088113963604, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00929, + "step": 266, + "tokens/total": 34865152, + "tokens/train_per_sec_per_gpu": 3185.95, + "tokens/trainable": 3704117 + }, + { + "epoch": 0.8503184713375797, + "grad_norm": 0.2138671875, + "learning_rate": 4.926944681492106e-05, + "loss": 0.012684832327067852, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01277, + "step": 267, + "tokens/total": 34996224, + "tokens/train_per_sec_per_gpu": 3411.13, + "tokens/trainable": 3718339 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 0.2099609375, + "learning_rate": 4.925604786512251e-05, + "loss": 0.0118259247392416, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0119, + "step": 268, + "tokens/total": 35127296, + "tokens/train_per_sec_per_gpu": 3032.33, + "tokens/trainable": 3731032 + }, + { + "epoch": 0.856687898089172, + "grad_norm": 0.1953125, + "learning_rate": 4.924252901087881e-05, + "loss": 0.009350091218948364, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00939, + "step": 269, + "tokens/total": 35258368, + "tokens/train_per_sec_per_gpu": 3595.2, + "tokens/trainable": 3746006 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 0.2275390625, + "learning_rate": 4.922889031901745e-05, + "loss": 0.01463128998875618, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01474, + "step": 270, + "tokens/total": 35389440, + "tokens/train_per_sec_per_gpu": 3514.64, + "tokens/trainable": 3760731 + }, + { + "epoch": 0.8630573248407644, + "grad_norm": 0.185546875, + "learning_rate": 4.921513185695831e-05, + "loss": 0.009343666024506092, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00939, + "step": 271, + "tokens/total": 35520512, + "tokens/train_per_sec_per_gpu": 3137.39, + "tokens/trainable": 3773865 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 0.1806640625, + "learning_rate": 4.920125369271332e-05, + "loss": 0.011359314434230328, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01142, + "step": 272, + "tokens/total": 35651584, + "tokens/train_per_sec_per_gpu": 3710.71, + "tokens/trainable": 3789305 + }, + { + "epoch": 0.8694267515923567, + "grad_norm": 0.173828125, + "learning_rate": 4.9187255894886134e-05, + "loss": 0.011224365793168545, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01129, + "step": 273, + "tokens/total": 35782656, + "tokens/train_per_sec_per_gpu": 3673.45, + "tokens/trainable": 3804528 + }, + { + "epoch": 0.8726114649681529, + "grad_norm": 0.2353515625, + "learning_rate": 4.9173138532671796e-05, + "loss": 0.012716785073280334, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0128, + "step": 274, + "tokens/total": 35913728, + "tokens/train_per_sec_per_gpu": 3495.34, + "tokens/trainable": 3819131 + }, + { + "epoch": 0.8757961783439491, + "grad_norm": 0.193359375, + "learning_rate": 4.9158901675856395e-05, + "loss": 0.008782695978879929, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00882, + "step": 275, + "tokens/total": 36044800, + "tokens/train_per_sec_per_gpu": 3305.01, + "tokens/trainable": 3832973 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 0.169921875, + "learning_rate": 4.9144545394816687e-05, + "loss": 0.008706534281373024, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00874, + "step": 276, + "tokens/total": 36175872, + "tokens/train_per_sec_per_gpu": 3043.21, + "tokens/trainable": 3845728 + }, + { + "epoch": 0.8821656050955414, + "grad_norm": 0.27734375, + "learning_rate": 4.91300697605198e-05, + "loss": 0.01517584826797247, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01529, + "step": 277, + "tokens/total": 36306944, + "tokens/train_per_sec_per_gpu": 3664.41, + "tokens/trainable": 3860973 + }, + { + "epoch": 0.8853503184713376, + "grad_norm": 0.2099609375, + "learning_rate": 4.911547484452286e-05, + "loss": 0.009684903547167778, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00973, + "step": 278, + "tokens/total": 36438016, + "tokens/train_per_sec_per_gpu": 3416.95, + "tokens/trainable": 3875221 + }, + { + "epoch": 0.8885350318471338, + "grad_norm": 0.201171875, + "learning_rate": 4.9100760718972624e-05, + "loss": 0.011975611560046673, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01205, + "step": 279, + "tokens/total": 36569088, + "tokens/train_per_sec_per_gpu": 3231.7, + "tokens/trainable": 3888737 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 0.171875, + "learning_rate": 4.908592745660514e-05, + "loss": 0.009973946958780289, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01002, + "step": 280, + "tokens/total": 36700160, + "tokens/train_per_sec_per_gpu": 3510.18, + "tokens/trainable": 3903383 + }, + { + "epoch": 0.8949044585987261, + "grad_norm": 0.189453125, + "learning_rate": 4.9070975130745387e-05, + "loss": 0.009210948832333088, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00925, + "step": 281, + "tokens/total": 36831232, + "tokens/train_per_sec_per_gpu": 3276.53, + "tokens/trainable": 3917095 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 0.216796875, + "learning_rate": 4.905590381530689e-05, + "loss": 0.010272481478750706, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01033, + "step": 282, + "tokens/total": 36962304, + "tokens/train_per_sec_per_gpu": 3515.84, + "tokens/trainable": 3931741 + }, + { + "epoch": 0.9012738853503185, + "grad_norm": 0.203125, + "learning_rate": 4.9040713584791406e-05, + "loss": 0.009833472780883312, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00988, + "step": 283, + "tokens/total": 37093376, + "tokens/train_per_sec_per_gpu": 2930.03, + "tokens/trainable": 3944068 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 0.173828125, + "learning_rate": 4.902540451428849e-05, + "loss": 0.008189358748495579, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00822, + "step": 284, + "tokens/total": 37224448, + "tokens/train_per_sec_per_gpu": 3765.41, + "tokens/trainable": 3959725 + }, + { + "epoch": 0.9076433121019108, + "grad_norm": 0.2216796875, + "learning_rate": 4.900997667947518e-05, + "loss": 0.013849266804754734, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01395, + "step": 285, + "tokens/total": 37355520, + "tokens/train_per_sec_per_gpu": 3186.67, + "tokens/trainable": 3973038 + }, + { + "epoch": 0.910828025477707, + "grad_norm": 0.2373046875, + "learning_rate": 4.899443015661557e-05, + "loss": 0.008526762947440147, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00856, + "step": 286, + "tokens/total": 37486592, + "tokens/train_per_sec_per_gpu": 3056.98, + "tokens/trainable": 3985851 + }, + { + "epoch": 0.9140127388535032, + "grad_norm": 0.1650390625, + "learning_rate": 4.89787650225605e-05, + "loss": 0.008836560882627964, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00888, + "step": 287, + "tokens/total": 37617664, + "tokens/train_per_sec_per_gpu": 3316.33, + "tokens/trainable": 3999725 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 0.263671875, + "learning_rate": 4.896298135474711e-05, + "loss": 0.01038228627294302, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01044, + "step": 288, + "tokens/total": 37748736, + "tokens/train_per_sec_per_gpu": 3125.36, + "tokens/trainable": 4012867 + }, + { + "epoch": 0.9203821656050956, + "grad_norm": 0.21875, + "learning_rate": 4.8947079231198504e-05, + "loss": 0.012707007117569447, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01279, + "step": 289, + "tokens/total": 37879808, + "tokens/train_per_sec_per_gpu": 3307.2, + "tokens/trainable": 4026670 + }, + { + "epoch": 0.9235668789808917, + "grad_norm": 0.2060546875, + "learning_rate": 4.893105873052333e-05, + "loss": 0.010869958437979221, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01093, + "step": 290, + "tokens/total": 38010880, + "tokens/train_per_sec_per_gpu": 3449.15, + "tokens/trainable": 4041053 + }, + { + "epoch": 0.9267515923566879, + "grad_norm": 0.2216796875, + "learning_rate": 4.8914919931915407e-05, + "loss": 0.010028751567006111, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01008, + "step": 291, + "tokens/total": 38141952, + "tokens/train_per_sec_per_gpu": 3442.24, + "tokens/trainable": 4055450 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 0.220703125, + "learning_rate": 4.889866291515336e-05, + "loss": 0.012203947640955448, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01228, + "step": 292, + "tokens/total": 38273024, + "tokens/train_per_sec_per_gpu": 2829.0, + "tokens/trainable": 4067366 + }, + { + "epoch": 0.9331210191082803, + "grad_norm": 0.1884765625, + "learning_rate": 4.888228776060016e-05, + "loss": 0.010833281092345715, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01089, + "step": 293, + "tokens/total": 38404096, + "tokens/train_per_sec_per_gpu": 3495.99, + "tokens/trainable": 4081929 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 0.181640625, + "learning_rate": 4.886579454920281e-05, + "loss": 0.012121611274778843, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0122, + "step": 294, + "tokens/total": 38535168, + "tokens/train_per_sec_per_gpu": 3777.6, + "tokens/trainable": 4097707 + }, + { + "epoch": 0.9394904458598726, + "grad_norm": 0.1826171875, + "learning_rate": 4.884918336249186e-05, + "loss": 0.009699760004878044, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00975, + "step": 295, + "tokens/total": 38666240, + "tokens/train_per_sec_per_gpu": 3588.34, + "tokens/trainable": 4112623 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 0.2138671875, + "learning_rate": 4.883245428258107e-05, + "loss": 0.011465213261544704, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01153, + "step": 296, + "tokens/total": 38797312, + "tokens/train_per_sec_per_gpu": 3411.03, + "tokens/trainable": 4126849 + }, + { + "epoch": 0.945859872611465, + "grad_norm": 0.1904296875, + "learning_rate": 4.881560739216697e-05, + "loss": 0.009318836033344269, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00936, + "step": 297, + "tokens/total": 38928384, + "tokens/train_per_sec_per_gpu": 3338.53, + "tokens/trainable": 4140757 + }, + { + "epoch": 0.9490445859872612, + "grad_norm": 0.2216796875, + "learning_rate": 4.879864277452847e-05, + "loss": 0.012642276473343372, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01272, + "step": 298, + "tokens/total": 39059456, + "tokens/train_per_sec_per_gpu": 3555.91, + "tokens/trainable": 4155522 + }, + { + "epoch": 0.9522292993630573, + "grad_norm": 0.20703125, + "learning_rate": 4.8781560513526414e-05, + "loss": 0.013654773123562336, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01375, + "step": 299, + "tokens/total": 39190528, + "tokens/train_per_sec_per_gpu": 3459.38, + "tokens/trainable": 4169921 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 0.1787109375, + "learning_rate": 4.876436069360323e-05, + "loss": 0.006959032732993364, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00698, + "step": 300, + "tokens/total": 39321600, + "tokens/train_per_sec_per_gpu": 3298.43, + "tokens/trainable": 4183671 + }, + { + "epoch": 0.9585987261146497, + "grad_norm": 0.2109375, + "learning_rate": 4.8747043399782424e-05, + "loss": 0.01015427801758051, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01021, + "step": 301, + "tokens/total": 39452672, + "tokens/train_per_sec_per_gpu": 3056.79, + "tokens/trainable": 4196527 + }, + { + "epoch": 0.9617834394904459, + "grad_norm": 0.189453125, + "learning_rate": 4.8729608717668265e-05, + "loss": 0.015600456856191158, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01572, + "step": 302, + "tokens/total": 39583744, + "tokens/train_per_sec_per_gpu": 3500.83, + "tokens/trainable": 4211124 + }, + { + "epoch": 0.964968152866242, + "grad_norm": 0.275390625, + "learning_rate": 4.871205673344525e-05, + "loss": 0.014728494919836521, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01484, + "step": 303, + "tokens/total": 39714816, + "tokens/train_per_sec_per_gpu": 3241.93, + "tokens/trainable": 4224632 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 0.185546875, + "learning_rate": 4.869438753387777e-05, + "loss": 0.008857826702296734, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0089, + "step": 304, + "tokens/total": 39845888, + "tokens/train_per_sec_per_gpu": 3447.73, + "tokens/trainable": 4239052 + }, + { + "epoch": 0.9713375796178344, + "grad_norm": 0.1572265625, + "learning_rate": 4.867660120630962e-05, + "loss": 0.006837591528892517, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00686, + "step": 305, + "tokens/total": 39976960, + "tokens/train_per_sec_per_gpu": 3652.81, + "tokens/trainable": 4254227 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 0.21484375, + "learning_rate": 4.8658697838663625e-05, + "loss": 0.01278127171099186, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01286, + "step": 306, + "tokens/total": 40108032, + "tokens/train_per_sec_per_gpu": 3363.52, + "tokens/trainable": 4268312 + }, + { + "epoch": 0.9777070063694268, + "grad_norm": 0.19140625, + "learning_rate": 4.864067751944113e-05, + "loss": 0.010625463910400867, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01068, + "step": 307, + "tokens/total": 40239104, + "tokens/train_per_sec_per_gpu": 3301.6, + "tokens/trainable": 4282394 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 0.19140625, + "learning_rate": 4.862254033772164e-05, + "loss": 0.010408475063741207, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01046, + "step": 308, + "tokens/total": 40370176, + "tokens/train_per_sec_per_gpu": 3139.89, + "tokens/trainable": 4295549 + }, + { + "epoch": 0.9840764331210191, + "grad_norm": 0.15625, + "learning_rate": 4.8604286383162326e-05, + "loss": 0.00865277647972107, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00869, + "step": 309, + "tokens/total": 40501248, + "tokens/train_per_sec_per_gpu": 3451.88, + "tokens/trainable": 4309931 + }, + { + "epoch": 0.9872611464968153, + "grad_norm": 0.173828125, + "learning_rate": 4.858591574599759e-05, + "loss": 0.010455441661179066, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01051, + "step": 310, + "tokens/total": 40632320, + "tokens/train_per_sec_per_gpu": 3652.99, + "tokens/trainable": 4325145 + }, + { + "epoch": 0.9904458598726115, + "grad_norm": 0.1806640625, + "learning_rate": 4.856742851703866e-05, + "loss": 0.009725190699100494, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00977, + "step": 311, + "tokens/total": 40763392, + "tokens/train_per_sec_per_gpu": 3095.9, + "tokens/trainable": 4338115 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 0.189453125, + "learning_rate": 4.854882478767308e-05, + "loss": 0.0067247929982841015, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00675, + "step": 312, + "tokens/total": 40894464, + "tokens/train_per_sec_per_gpu": 3608.14, + "tokens/trainable": 4353094 + }, + { + "epoch": 0.9968152866242038, + "grad_norm": 0.177734375, + "learning_rate": 4.8530104649864306e-05, + "loss": 0.008235358633100986, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00827, + "step": 313, + "tokens/total": 41025536, + "tokens/train_per_sec_per_gpu": 3438.93, + "tokens/trainable": 4367439 + }, + { + "epoch": 1.0, + "grad_norm": 0.31640625, + "learning_rate": 4.8511268196151224e-05, + "loss": 0.013931503519415855, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 39.25, + "memory/max_allocated (GiB)": 39.25, + "ppl": 1.01403, + "step": 314, + "tokens/total": 41099264, + "tokens/train_per_sec_per_gpu": 2079.74, + "tokens/trainable": 4374676 + }, + { + "epoch": 1.0, + "eval_loss": 0.010794572532176971, + "eval_ppl": 1.01085, + "eval_runtime": 42.176, + "eval_samples_per_second": 64.041, + "eval_steps_per_second": 4.007, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 314 + }, + { + "epoch": 1.0031847133757963, + "grad_norm": 0.19921875, + "learning_rate": 4.849231551964771e-05, + "loss": 0.01005562860518694, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01011, + "step": 315, + "tokens/total": 41230336, + "tokens/train_per_sec_per_gpu": 3300.4, + "tokens/trainable": 4388312 + }, + { + "epoch": 1.0063694267515924, + "grad_norm": 0.1962890625, + "learning_rate": 4.8473246714042155e-05, + "loss": 0.009829830378293991, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00988, + "step": 316, + "tokens/total": 41361408, + "tokens/train_per_sec_per_gpu": 2786.13, + "tokens/trainable": 4400052 + }, + { + "epoch": 1.0095541401273886, + "grad_norm": 0.2119140625, + "learning_rate": 4.845406187359701e-05, + "loss": 0.009766732342541218, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00981, + "step": 317, + "tokens/total": 41492480, + "tokens/train_per_sec_per_gpu": 3444.53, + "tokens/trainable": 4414268 + }, + { + "epoch": 1.0127388535031847, + "grad_norm": 0.17578125, + "learning_rate": 4.843476109314833e-05, + "loss": 0.009223168715834618, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00927, + "step": 318, + "tokens/total": 41623552, + "tokens/train_per_sec_per_gpu": 3515.7, + "tokens/trainable": 4428804 + }, + { + "epoch": 1.015923566878981, + "grad_norm": 0.1611328125, + "learning_rate": 4.841534446810527e-05, + "loss": 0.008030703291296959, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00806, + "step": 319, + "tokens/total": 41754624, + "tokens/train_per_sec_per_gpu": 3297.15, + "tokens/trainable": 4442458 + }, + { + "epoch": 1.019108280254777, + "grad_norm": 0.1669921875, + "learning_rate": 4.839581209444966e-05, + "loss": 0.008971852250397205, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00901, + "step": 320, + "tokens/total": 41885696, + "tokens/train_per_sec_per_gpu": 3348.3, + "tokens/trainable": 4456319 + }, + { + "epoch": 1.0222929936305734, + "grad_norm": 0.189453125, + "learning_rate": 4.8376164068735485e-05, + "loss": 0.011034002527594566, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0111, + "step": 321, + "tokens/total": 42016768, + "tokens/train_per_sec_per_gpu": 3467.0, + "tokens/trainable": 4470692 + }, + { + "epoch": 1.0254777070063694, + "grad_norm": 0.21484375, + "learning_rate": 4.835640048808847e-05, + "loss": 0.008709516376256943, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00875, + "step": 322, + "tokens/total": 42147840, + "tokens/train_per_sec_per_gpu": 3335.02, + "tokens/trainable": 4484563 + }, + { + "epoch": 1.0286624203821657, + "grad_norm": 0.166015625, + "learning_rate": 4.833652145020551e-05, + "loss": 0.006180301308631897, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0062, + "step": 323, + "tokens/total": 42278912, + "tokens/train_per_sec_per_gpu": 3293.93, + "tokens/trainable": 4498340 + }, + { + "epoch": 1.0318471337579618, + "grad_norm": 0.15234375, + "learning_rate": 4.831652705335428e-05, + "loss": 0.007071372587233782, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0071, + "step": 324, + "tokens/total": 42409984, + "tokens/train_per_sec_per_gpu": 3496.34, + "tokens/trainable": 4512959 + }, + { + "epoch": 1.035031847133758, + "grad_norm": 0.2216796875, + "learning_rate": 4.829641739637269e-05, + "loss": 0.010390223003923893, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01044, + "step": 325, + "tokens/total": 42541056, + "tokens/train_per_sec_per_gpu": 3109.54, + "tokens/trainable": 4525947 + }, + { + "epoch": 1.0382165605095541, + "grad_norm": 0.19140625, + "learning_rate": 4.827619257866839e-05, + "loss": 0.010280653834342957, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01033, + "step": 326, + "tokens/total": 42672128, + "tokens/train_per_sec_per_gpu": 3494.82, + "tokens/trainable": 4540559 + }, + { + "epoch": 1.0414012738853504, + "grad_norm": 0.291015625, + "learning_rate": 4.825585270021835e-05, + "loss": 0.009634558111429214, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00968, + "step": 327, + "tokens/total": 42803200, + "tokens/train_per_sec_per_gpu": 3081.6, + "tokens/trainable": 4553474 + }, + { + "epoch": 1.0445859872611465, + "grad_norm": 0.21875, + "learning_rate": 4.823539786156828e-05, + "loss": 0.012012935243546963, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01209, + "step": 328, + "tokens/total": 42934272, + "tokens/train_per_sec_per_gpu": 3405.54, + "tokens/trainable": 4567721 + }, + { + "epoch": 1.0477707006369428, + "grad_norm": 0.1552734375, + "learning_rate": 4.821482816383218e-05, + "loss": 0.005780364852398634, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0058, + "step": 329, + "tokens/total": 43065344, + "tokens/train_per_sec_per_gpu": 3703.56, + "tokens/trainable": 4583144 + }, + { + "epoch": 1.0509554140127388, + "grad_norm": 0.1787109375, + "learning_rate": 4.8194143708691844e-05, + "loss": 0.010735648684203625, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01079, + "step": 330, + "tokens/total": 43196416, + "tokens/train_per_sec_per_gpu": 3454.77, + "tokens/trainable": 4597528 + }, + { + "epoch": 1.0541401273885351, + "grad_norm": 0.2119140625, + "learning_rate": 4.817334459839633e-05, + "loss": 0.009996584616601467, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01005, + "step": 331, + "tokens/total": 43327488, + "tokens/train_per_sec_per_gpu": 3088.63, + "tokens/trainable": 4610506 + }, + { + "epoch": 1.0573248407643312, + "grad_norm": 0.1513671875, + "learning_rate": 4.8152430935761456e-05, + "loss": 0.007421544287353754, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00745, + "step": 332, + "tokens/total": 43458560, + "tokens/train_per_sec_per_gpu": 3395.75, + "tokens/trainable": 4624715 + }, + { + "epoch": 1.0605095541401275, + "grad_norm": 0.12255859375, + "learning_rate": 4.8131402824169336e-05, + "loss": 0.004339924082159996, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00435, + "step": 333, + "tokens/total": 43589632, + "tokens/train_per_sec_per_gpu": 2923.1, + "tokens/trainable": 4636991 + }, + { + "epoch": 1.0636942675159236, + "grad_norm": 0.2109375, + "learning_rate": 4.8110260367567816e-05, + "loss": 0.007030356675386429, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00706, + "step": 334, + "tokens/total": 43720704, + "tokens/train_per_sec_per_gpu": 3278.5, + "tokens/trainable": 4650745 + }, + { + "epoch": 1.0668789808917198, + "grad_norm": 0.2373046875, + "learning_rate": 4.808900367046999e-05, + "loss": 0.00917564332485199, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00922, + "step": 335, + "tokens/total": 43851776, + "tokens/train_per_sec_per_gpu": 3402.45, + "tokens/trainable": 4664997 + }, + { + "epoch": 1.070063694267516, + "grad_norm": 0.158203125, + "learning_rate": 4.806763283795366e-05, + "loss": 0.0065734670497477055, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0066, + "step": 336, + "tokens/total": 43982848, + "tokens/train_per_sec_per_gpu": 2932.03, + "tokens/trainable": 4677280 + }, + { + "epoch": 1.0732484076433122, + "grad_norm": 0.154296875, + "learning_rate": 4.804614797566086e-05, + "loss": 0.00853950995951891, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00858, + "step": 337, + "tokens/total": 44113920, + "tokens/train_per_sec_per_gpu": 3499.45, + "tokens/trainable": 4691898 + }, + { + "epoch": 1.0764331210191083, + "grad_norm": 0.271484375, + "learning_rate": 4.8024549189797276e-05, + "loss": 0.012293344363570213, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01237, + "step": 338, + "tokens/total": 44244992, + "tokens/train_per_sec_per_gpu": 3312.19, + "tokens/trainable": 4705870 + }, + { + "epoch": 1.0796178343949046, + "grad_norm": 0.1728515625, + "learning_rate": 4.800283658713177e-05, + "loss": 0.010073346085846424, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01012, + "step": 339, + "tokens/total": 44376064, + "tokens/train_per_sec_per_gpu": 3473.54, + "tokens/trainable": 4720409 + }, + { + "epoch": 1.0828025477707006, + "grad_norm": 0.1962890625, + "learning_rate": 4.798101027499581e-05, + "loss": 0.010279987938702106, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01033, + "step": 340, + "tokens/total": 44507136, + "tokens/train_per_sec_per_gpu": 3370.76, + "tokens/trainable": 4734524 + }, + { + "epoch": 1.085987261146497, + "grad_norm": 0.2041015625, + "learning_rate": 4.795907036128299e-05, + "loss": 0.009196259081363678, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00924, + "step": 341, + "tokens/total": 44638208, + "tokens/train_per_sec_per_gpu": 3347.17, + "tokens/trainable": 4748535 + }, + { + "epoch": 1.089171974522293, + "grad_norm": 0.2080078125, + "learning_rate": 4.793701695444846e-05, + "loss": 0.009703228250145912, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00975, + "step": 342, + "tokens/total": 44769280, + "tokens/train_per_sec_per_gpu": 3220.71, + "tokens/trainable": 4762018 + }, + { + "epoch": 1.0923566878980893, + "grad_norm": 0.18359375, + "learning_rate": 4.791485016350837e-05, + "loss": 0.010180710814893246, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01023, + "step": 343, + "tokens/total": 44900352, + "tokens/train_per_sec_per_gpu": 3726.69, + "tokens/trainable": 4777568 + }, + { + "epoch": 1.0955414012738853, + "grad_norm": 0.1826171875, + "learning_rate": 4.78925700980394e-05, + "loss": 0.007739739958196878, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00777, + "step": 344, + "tokens/total": 45031424, + "tokens/train_per_sec_per_gpu": 3151.58, + "tokens/trainable": 4790766 + }, + { + "epoch": 1.0987261146496816, + "grad_norm": 0.265625, + "learning_rate": 4.787017686817816e-05, + "loss": 0.013002859428524971, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01309, + "step": 345, + "tokens/total": 45162496, + "tokens/train_per_sec_per_gpu": 3615.54, + "tokens/trainable": 4805850 + }, + { + "epoch": 1.1019108280254777, + "grad_norm": 0.1669921875, + "learning_rate": 4.7847670584620653e-05, + "loss": 0.008513463661074638, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00855, + "step": 346, + "tokens/total": 45293568, + "tokens/train_per_sec_per_gpu": 3554.41, + "tokens/trainable": 4820707 + }, + { + "epoch": 1.105095541401274, + "grad_norm": 0.2041015625, + "learning_rate": 4.782505135862176e-05, + "loss": 0.012663084082305431, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01274, + "step": 347, + "tokens/total": 45424640, + "tokens/train_per_sec_per_gpu": 3406.8, + "tokens/trainable": 4834965 + }, + { + "epoch": 1.10828025477707, + "grad_norm": 0.1650390625, + "learning_rate": 4.780231930199465e-05, + "loss": 0.006982079707086086, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00701, + "step": 348, + "tokens/total": 45555712, + "tokens/train_per_sec_per_gpu": 3420.63, + "tokens/trainable": 4849306 + }, + { + "epoch": 1.1114649681528663, + "grad_norm": 0.150390625, + "learning_rate": 4.777947452711026e-05, + "loss": 0.007746942341327667, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00778, + "step": 349, + "tokens/total": 45686784, + "tokens/train_per_sec_per_gpu": 3182.73, + "tokens/trainable": 4862654 + }, + { + "epoch": 1.1146496815286624, + "grad_norm": 0.2021484375, + "learning_rate": 4.77565171468967e-05, + "loss": 0.008427651599049568, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00846, + "step": 350, + "tokens/total": 45817856, + "tokens/train_per_sec_per_gpu": 3011.28, + "tokens/trainable": 4875396 + }, + { + "epoch": 1.1178343949044587, + "grad_norm": 0.150390625, + "learning_rate": 4.773344727483876e-05, + "loss": 0.007029036991298199, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00705, + "step": 351, + "tokens/total": 45948928, + "tokens/train_per_sec_per_gpu": 2910.12, + "tokens/trainable": 4887648 + }, + { + "epoch": 1.1210191082802548, + "grad_norm": 0.203125, + "learning_rate": 4.771026502497726e-05, + "loss": 0.009960726834833622, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01001, + "step": 352, + "tokens/total": 46080000, + "tokens/train_per_sec_per_gpu": 3171.62, + "tokens/trainable": 4900946 + }, + { + "epoch": 1.124203821656051, + "grad_norm": 0.2109375, + "learning_rate": 4.7686970511908594e-05, + "loss": 0.010911881923675537, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01097, + "step": 353, + "tokens/total": 46211072, + "tokens/train_per_sec_per_gpu": 3471.86, + "tokens/trainable": 4915383 + }, + { + "epoch": 1.127388535031847, + "grad_norm": 0.19921875, + "learning_rate": 4.766356385078403e-05, + "loss": 0.01082072127610445, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01088, + "step": 354, + "tokens/total": 46342144, + "tokens/train_per_sec_per_gpu": 3528.47, + "tokens/trainable": 4930118 + }, + { + "epoch": 1.1305732484076434, + "grad_norm": 0.189453125, + "learning_rate": 4.7640045157309286e-05, + "loss": 0.00796705111861229, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.008, + "step": 355, + "tokens/total": 46473216, + "tokens/train_per_sec_per_gpu": 3675.06, + "tokens/trainable": 4945407 + }, + { + "epoch": 1.1337579617834395, + "grad_norm": 0.1650390625, + "learning_rate": 4.761641454774386e-05, + "loss": 0.009853512980043888, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0099, + "step": 356, + "tokens/total": 46604288, + "tokens/train_per_sec_per_gpu": 3426.1, + "tokens/trainable": 4959713 + }, + { + "epoch": 1.1369426751592357, + "grad_norm": 0.1728515625, + "learning_rate": 4.759267213890046e-05, + "loss": 0.008251532912254333, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00829, + "step": 357, + "tokens/total": 46735360, + "tokens/train_per_sec_per_gpu": 3370.12, + "tokens/trainable": 4973803 + }, + { + "epoch": 1.1401273885350318, + "grad_norm": 0.171875, + "learning_rate": 4.756881804814448e-05, + "loss": 0.007583227939903736, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00761, + "step": 358, + "tokens/total": 46866432, + "tokens/train_per_sec_per_gpu": 3085.13, + "tokens/trainable": 4986783 + }, + { + "epoch": 1.143312101910828, + "grad_norm": 0.1171875, + "learning_rate": 4.7544852393393375e-05, + "loss": 0.005565401166677475, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00558, + "step": 359, + "tokens/total": 46997504, + "tokens/train_per_sec_per_gpu": 3283.64, + "tokens/trainable": 5000464 + }, + { + "epoch": 1.1464968152866242, + "grad_norm": 0.158203125, + "learning_rate": 4.7520775293116096e-05, + "loss": 0.007274336647242308, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0073, + "step": 360, + "tokens/total": 47128576, + "tokens/train_per_sec_per_gpu": 3219.09, + "tokens/trainable": 5013941 + }, + { + "epoch": 1.1496815286624205, + "grad_norm": 0.173828125, + "learning_rate": 4.749658686633251e-05, + "loss": 0.007295841351151466, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00732, + "step": 361, + "tokens/total": 47259648, + "tokens/train_per_sec_per_gpu": 3222.39, + "tokens/trainable": 5027460 + }, + { + "epoch": 1.1528662420382165, + "grad_norm": 0.126953125, + "learning_rate": 4.747228723261278e-05, + "loss": 0.004342417698353529, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00435, + "step": 362, + "tokens/total": 47390720, + "tokens/train_per_sec_per_gpu": 3121.81, + "tokens/trainable": 5040541 + }, + { + "epoch": 1.1560509554140128, + "grad_norm": 0.197265625, + "learning_rate": 4.7447876512076815e-05, + "loss": 0.00851562898606062, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00855, + "step": 363, + "tokens/total": 47521792, + "tokens/train_per_sec_per_gpu": 3480.8, + "tokens/trainable": 5055042 + }, + { + "epoch": 1.1592356687898089, + "grad_norm": 0.1923828125, + "learning_rate": 4.7423354825393646e-05, + "loss": 0.011735991574823856, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01181, + "step": 364, + "tokens/total": 47652864, + "tokens/train_per_sec_per_gpu": 3454.35, + "tokens/trainable": 5069432 + }, + { + "epoch": 1.1624203821656052, + "grad_norm": 0.203125, + "learning_rate": 4.739872229378085e-05, + "loss": 0.009628934785723686, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00968, + "step": 365, + "tokens/total": 47783936, + "tokens/train_per_sec_per_gpu": 3056.5, + "tokens/trainable": 5082238 + }, + { + "epoch": 1.1656050955414012, + "grad_norm": 0.181640625, + "learning_rate": 4.737397903900393e-05, + "loss": 0.008178248070180416, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00821, + "step": 366, + "tokens/total": 47915008, + "tokens/train_per_sec_per_gpu": 3187.45, + "tokens/trainable": 5095582 + }, + { + "epoch": 1.1687898089171975, + "grad_norm": 0.2109375, + "learning_rate": 4.734912518337574e-05, + "loss": 0.010145166888833046, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0102, + "step": 367, + "tokens/total": 48046080, + "tokens/train_per_sec_per_gpu": 3535.81, + "tokens/trainable": 5110321 + }, + { + "epoch": 1.1719745222929936, + "grad_norm": 0.158203125, + "learning_rate": 4.732416084975585e-05, + "loss": 0.008553897961974144, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00859, + "step": 368, + "tokens/total": 48177152, + "tokens/train_per_sec_per_gpu": 3223.93, + "tokens/trainable": 5123813 + }, + { + "epoch": 1.1751592356687899, + "grad_norm": 0.146484375, + "learning_rate": 4.729908616154996e-05, + "loss": 0.007267483975738287, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00729, + "step": 369, + "tokens/total": 48308224, + "tokens/train_per_sec_per_gpu": 3596.57, + "tokens/trainable": 5138875 + }, + { + "epoch": 1.178343949044586, + "grad_norm": 0.20703125, + "learning_rate": 4.727390124270929e-05, + "loss": 0.010045611299574375, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0101, + "step": 370, + "tokens/total": 48439296, + "tokens/train_per_sec_per_gpu": 3361.05, + "tokens/trainable": 5152957 + }, + { + "epoch": 1.1815286624203822, + "grad_norm": 0.166015625, + "learning_rate": 4.724860621772995e-05, + "loss": 0.006381361745297909, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0064, + "step": 371, + "tokens/total": 48570368, + "tokens/train_per_sec_per_gpu": 3270.56, + "tokens/trainable": 5166655 + }, + { + "epoch": 1.1847133757961783, + "grad_norm": 0.1259765625, + "learning_rate": 4.7223201211652346e-05, + "loss": 0.0061474088579416275, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00617, + "step": 372, + "tokens/total": 48701440, + "tokens/train_per_sec_per_gpu": 3413.31, + "tokens/trainable": 5180889 + }, + { + "epoch": 1.1878980891719746, + "grad_norm": 0.205078125, + "learning_rate": 4.7197686350060535e-05, + "loss": 0.013294153846800327, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01338, + "step": 373, + "tokens/total": 48832512, + "tokens/train_per_sec_per_gpu": 3307.53, + "tokens/trainable": 5194736 + }, + { + "epoch": 1.1910828025477707, + "grad_norm": 0.1669921875, + "learning_rate": 4.717206175908164e-05, + "loss": 0.009227165952324867, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00927, + "step": 374, + "tokens/total": 48963584, + "tokens/train_per_sec_per_gpu": 3407.27, + "tokens/trainable": 5208974 + }, + { + "epoch": 1.194267515923567, + "grad_norm": 0.2421875, + "learning_rate": 4.7146327565385195e-05, + "loss": 0.009992158971726894, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01004, + "step": 375, + "tokens/total": 49094656, + "tokens/train_per_sec_per_gpu": 3078.36, + "tokens/trainable": 5221898 + }, + { + "epoch": 1.197452229299363, + "grad_norm": 0.1630859375, + "learning_rate": 4.712048389618254e-05, + "loss": 0.0076246620155870914, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00765, + "step": 376, + "tokens/total": 49225728, + "tokens/train_per_sec_per_gpu": 3454.62, + "tokens/trainable": 5236300 + }, + { + "epoch": 1.2006369426751593, + "grad_norm": 0.2119140625, + "learning_rate": 4.7094530879226166e-05, + "loss": 0.010849738493561745, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01091, + "step": 377, + "tokens/total": 49356800, + "tokens/train_per_sec_per_gpu": 3211.73, + "tokens/trainable": 5249796 + }, + { + "epoch": 1.2038216560509554, + "grad_norm": 0.1669921875, + "learning_rate": 4.706846864280913e-05, + "loss": 0.00665281992405653, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00667, + "step": 378, + "tokens/total": 49487872, + "tokens/train_per_sec_per_gpu": 3615.73, + "tokens/trainable": 5264940 + }, + { + "epoch": 1.2070063694267517, + "grad_norm": 0.1689453125, + "learning_rate": 4.704229731576435e-05, + "loss": 0.009321301244199276, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00936, + "step": 379, + "tokens/total": 49618944, + "tokens/train_per_sec_per_gpu": 3521.87, + "tokens/trainable": 5279679 + }, + { + "epoch": 1.2101910828025477, + "grad_norm": 0.1728515625, + "learning_rate": 4.701601702746405e-05, + "loss": 0.009726524353027344, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00977, + "step": 380, + "tokens/total": 49750016, + "tokens/train_per_sec_per_gpu": 3758.86, + "tokens/trainable": 5295322 + }, + { + "epoch": 1.213375796178344, + "grad_norm": 0.138671875, + "learning_rate": 4.698962790781906e-05, + "loss": 0.00720211723819375, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00723, + "step": 381, + "tokens/total": 49881088, + "tokens/train_per_sec_per_gpu": 3392.46, + "tokens/trainable": 5309524 + }, + { + "epoch": 1.21656050955414, + "grad_norm": 0.1943359375, + "learning_rate": 4.696313008727819e-05, + "loss": 0.009434825740754604, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00948, + "step": 382, + "tokens/total": 50012160, + "tokens/train_per_sec_per_gpu": 3237.9, + "tokens/trainable": 5323073 + }, + { + "epoch": 1.2197452229299364, + "grad_norm": 0.203125, + "learning_rate": 4.6936523696827615e-05, + "loss": 0.013360480777919292, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01345, + "step": 383, + "tokens/total": 50143232, + "tokens/train_per_sec_per_gpu": 3386.03, + "tokens/trainable": 5337238 + }, + { + "epoch": 1.2229299363057324, + "grad_norm": 0.1748046875, + "learning_rate": 4.690980886799016e-05, + "loss": 0.009163031354546547, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00921, + "step": 384, + "tokens/total": 50274304, + "tokens/train_per_sec_per_gpu": 3800.38, + "tokens/trainable": 5353034 + }, + { + "epoch": 1.2261146496815287, + "grad_norm": 0.142578125, + "learning_rate": 4.688298573282473e-05, + "loss": 0.006065514404326677, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00608, + "step": 385, + "tokens/total": 50405376, + "tokens/train_per_sec_per_gpu": 3325.28, + "tokens/trainable": 5366994 + }, + { + "epoch": 1.2292993630573248, + "grad_norm": 0.1611328125, + "learning_rate": 4.685605442392559e-05, + "loss": 0.007522703614085913, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00755, + "step": 386, + "tokens/total": 50536448, + "tokens/train_per_sec_per_gpu": 3244.1, + "tokens/trainable": 5380585 + }, + { + "epoch": 1.232484076433121, + "grad_norm": 0.158203125, + "learning_rate": 4.6829015074421754e-05, + "loss": 0.008297629654407501, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00833, + "step": 387, + "tokens/total": 50667520, + "tokens/train_per_sec_per_gpu": 3675.24, + "tokens/trainable": 5395883 + }, + { + "epoch": 1.2356687898089171, + "grad_norm": 0.1923828125, + "learning_rate": 4.680186781797632e-05, + "loss": 0.008283684030175209, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00832, + "step": 388, + "tokens/total": 50798592, + "tokens/train_per_sec_per_gpu": 3323.41, + "tokens/trainable": 5409819 + }, + { + "epoch": 1.2388535031847134, + "grad_norm": 0.1669921875, + "learning_rate": 4.677461278878577e-05, + "loss": 0.009029434062540531, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00907, + "step": 389, + "tokens/total": 50929664, + "tokens/train_per_sec_per_gpu": 2967.79, + "tokens/trainable": 5422282 + }, + { + "epoch": 1.2420382165605095, + "grad_norm": 0.1298828125, + "learning_rate": 4.674725012157936e-05, + "loss": 0.0059669832699000835, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00598, + "step": 390, + "tokens/total": 51060736, + "tokens/train_per_sec_per_gpu": 3230.78, + "tokens/trainable": 5435820 + }, + { + "epoch": 1.2452229299363058, + "grad_norm": 0.14453125, + "learning_rate": 4.671977995161843e-05, + "loss": 0.005600204225629568, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00562, + "step": 391, + "tokens/total": 51191808, + "tokens/train_per_sec_per_gpu": 3398.04, + "tokens/trainable": 5450055 + }, + { + "epoch": 1.2484076433121019, + "grad_norm": 0.166015625, + "learning_rate": 4.669220241469573e-05, + "loss": 0.007735088467597961, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00777, + "step": 392, + "tokens/total": 51322880, + "tokens/train_per_sec_per_gpu": 3315.69, + "tokens/trainable": 5463943 + }, + { + "epoch": 1.2515923566878981, + "grad_norm": 0.2138671875, + "learning_rate": 4.666451764713475e-05, + "loss": 0.010222709737718105, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01028, + "step": 393, + "tokens/total": 51453952, + "tokens/train_per_sec_per_gpu": 3438.45, + "tokens/trainable": 5478266 + }, + { + "epoch": 1.2547770700636942, + "grad_norm": 0.154296875, + "learning_rate": 4.663672578578908e-05, + "loss": 0.007789981085807085, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00782, + "step": 394, + "tokens/total": 51585024, + "tokens/train_per_sec_per_gpu": 3144.21, + "tokens/trainable": 5491440 + }, + { + "epoch": 1.2579617834394905, + "grad_norm": 0.1982421875, + "learning_rate": 4.660882696804165e-05, + "loss": 0.01257528830319643, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01265, + "step": 395, + "tokens/total": 51716096, + "tokens/train_per_sec_per_gpu": 3704.11, + "tokens/trainable": 5506947 + }, + { + "epoch": 1.2611464968152866, + "grad_norm": 0.1669921875, + "learning_rate": 4.658082133180416e-05, + "loss": 0.007808534894138575, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00784, + "step": 396, + "tokens/total": 51847168, + "tokens/train_per_sec_per_gpu": 3138.89, + "tokens/trainable": 5520102 + }, + { + "epoch": 1.2643312101910829, + "grad_norm": 0.1787109375, + "learning_rate": 4.655270901551632e-05, + "loss": 0.008749695494771004, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00879, + "step": 397, + "tokens/total": 51978240, + "tokens/train_per_sec_per_gpu": 3068.52, + "tokens/trainable": 5532992 + }, + { + "epoch": 1.267515923566879, + "grad_norm": 0.193359375, + "learning_rate": 4.652449015814518e-05, + "loss": 0.010582723654806614, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01064, + "step": 398, + "tokens/total": 52109312, + "tokens/train_per_sec_per_gpu": 3477.89, + "tokens/trainable": 5547568 + }, + { + "epoch": 1.2707006369426752, + "grad_norm": 0.177734375, + "learning_rate": 4.649616489918448e-05, + "loss": 0.007580795791000128, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00761, + "step": 399, + "tokens/total": 52240384, + "tokens/train_per_sec_per_gpu": 3136.71, + "tokens/trainable": 5560738 + }, + { + "epoch": 1.2738853503184713, + "grad_norm": 0.177734375, + "learning_rate": 4.646773337865391e-05, + "loss": 0.00638965331017971, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00641, + "step": 400, + "tokens/total": 52371456, + "tokens/train_per_sec_per_gpu": 3189.44, + "tokens/trainable": 5574146 + }, + { + "epoch": 1.2770700636942676, + "grad_norm": 0.185546875, + "learning_rate": 4.643919573709843e-05, + "loss": 0.007701355963945389, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00773, + "step": 401, + "tokens/total": 52502528, + "tokens/train_per_sec_per_gpu": 3217.16, + "tokens/trainable": 5587632 + }, + { + "epoch": 1.2802547770700636, + "grad_norm": 0.1962890625, + "learning_rate": 4.641055211558762e-05, + "loss": 0.009735530242323875, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00978, + "step": 402, + "tokens/total": 52633600, + "tokens/train_per_sec_per_gpu": 3104.94, + "tokens/trainable": 5600617 + }, + { + "epoch": 1.28343949044586, + "grad_norm": 0.193359375, + "learning_rate": 4.6381802655714946e-05, + "loss": 0.009511996060609818, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00956, + "step": 403, + "tokens/total": 52764672, + "tokens/train_per_sec_per_gpu": 3181.89, + "tokens/trainable": 5613940 + }, + { + "epoch": 1.286624203821656, + "grad_norm": 0.1669921875, + "learning_rate": 4.6352947499597024e-05, + "loss": 0.008532877080142498, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00857, + "step": 404, + "tokens/total": 52895744, + "tokens/train_per_sec_per_gpu": 3220.05, + "tokens/trainable": 5627419 + }, + { + "epoch": 1.2898089171974523, + "grad_norm": 0.1787109375, + "learning_rate": 4.632398678987298e-05, + "loss": 0.007435362320393324, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00746, + "step": 405, + "tokens/total": 53026816, + "tokens/train_per_sec_per_gpu": 3293.31, + "tokens/trainable": 5641255 + }, + { + "epoch": 1.2929936305732483, + "grad_norm": 0.185546875, + "learning_rate": 4.629492066970373e-05, + "loss": 0.009640632197260857, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00969, + "step": 406, + "tokens/total": 53157888, + "tokens/train_per_sec_per_gpu": 3502.03, + "tokens/trainable": 5655889 + }, + { + "epoch": 1.2961783439490446, + "grad_norm": 0.1865234375, + "learning_rate": 4.626574928277127e-05, + "loss": 0.00989444274455309, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00994, + "step": 407, + "tokens/total": 53288960, + "tokens/train_per_sec_per_gpu": 3544.59, + "tokens/trainable": 5670642 + }, + { + "epoch": 1.2993630573248407, + "grad_norm": 0.23828125, + "learning_rate": 4.623647277327792e-05, + "loss": 0.009198141284286976, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00924, + "step": 408, + "tokens/total": 53420032, + "tokens/train_per_sec_per_gpu": 3306.22, + "tokens/trainable": 5684524 + }, + { + "epoch": 1.302547770700637, + "grad_norm": 0.216796875, + "learning_rate": 4.6207091285945694e-05, + "loss": 0.010384837165474892, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01044, + "step": 409, + "tokens/total": 53551104, + "tokens/train_per_sec_per_gpu": 3444.91, + "tokens/trainable": 5698889 + }, + { + "epoch": 1.305732484076433, + "grad_norm": 0.1640625, + "learning_rate": 4.61776049660155e-05, + "loss": 0.0068597206845879555, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00688, + "step": 410, + "tokens/total": 53682176, + "tokens/train_per_sec_per_gpu": 3083.21, + "tokens/trainable": 5711820 + }, + { + "epoch": 1.3089171974522293, + "grad_norm": 0.125, + "learning_rate": 4.614801395924649e-05, + "loss": 0.005090971477329731, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0051, + "step": 411, + "tokens/total": 53813248, + "tokens/train_per_sec_per_gpu": 3042.48, + "tokens/trainable": 5724613 + }, + { + "epoch": 1.3121019108280254, + "grad_norm": 0.142578125, + "learning_rate": 4.611831841191533e-05, + "loss": 0.005095964763313532, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00511, + "step": 412, + "tokens/total": 53944320, + "tokens/train_per_sec_per_gpu": 3189.46, + "tokens/trainable": 5737985 + }, + { + "epoch": 1.3152866242038217, + "grad_norm": 0.177734375, + "learning_rate": 4.608851847081542e-05, + "loss": 0.009599323384463787, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00965, + "step": 413, + "tokens/total": 54075392, + "tokens/train_per_sec_per_gpu": 3425.1, + "tokens/trainable": 5752257 + }, + { + "epoch": 1.3184713375796178, + "grad_norm": 0.1533203125, + "learning_rate": 4.6058614283256205e-05, + "loss": 0.007107466459274292, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00713, + "step": 414, + "tokens/total": 54206464, + "tokens/train_per_sec_per_gpu": 3284.05, + "tokens/trainable": 5766000 + }, + { + "epoch": 1.321656050955414, + "grad_norm": 0.2021484375, + "learning_rate": 4.60286059970625e-05, + "loss": 0.009428326040506363, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00947, + "step": 415, + "tokens/total": 54337536, + "tokens/train_per_sec_per_gpu": 3375.64, + "tokens/trainable": 5780141 + }, + { + "epoch": 1.3248407643312101, + "grad_norm": 0.1484375, + "learning_rate": 4.599849376057366e-05, + "loss": 0.006207283120602369, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00623, + "step": 416, + "tokens/total": 54468608, + "tokens/train_per_sec_per_gpu": 3150.96, + "tokens/trainable": 5793324 + }, + { + "epoch": 1.3280254777070064, + "grad_norm": 0.193359375, + "learning_rate": 4.5968277722642915e-05, + "loss": 0.011342452839016914, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01141, + "step": 417, + "tokens/total": 54599680, + "tokens/train_per_sec_per_gpu": 3068.96, + "tokens/trainable": 5806288 + }, + { + "epoch": 1.3312101910828025, + "grad_norm": 0.2197265625, + "learning_rate": 4.593795803263661e-05, + "loss": 0.0096285380423069, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00968, + "step": 418, + "tokens/total": 54730752, + "tokens/train_per_sec_per_gpu": 3414.96, + "tokens/trainable": 5820535 + }, + { + "epoch": 1.3343949044585988, + "grad_norm": 0.1787109375, + "learning_rate": 4.590753484043348e-05, + "loss": 0.008351242169737816, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00839, + "step": 419, + "tokens/total": 54861824, + "tokens/train_per_sec_per_gpu": 3383.61, + "tokens/trainable": 5834705 + }, + { + "epoch": 1.3375796178343948, + "grad_norm": 0.20703125, + "learning_rate": 4.5877008296423886e-05, + "loss": 0.010140678845345974, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01019, + "step": 420, + "tokens/total": 54992896, + "tokens/train_per_sec_per_gpu": 3557.31, + "tokens/trainable": 5849593 + }, + { + "epoch": 1.3407643312101911, + "grad_norm": 0.1005859375, + "learning_rate": 4.5846378551509097e-05, + "loss": 0.003956064116209745, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00396, + "step": 421, + "tokens/total": 55123968, + "tokens/train_per_sec_per_gpu": 3127.02, + "tokens/trainable": 5862715 + }, + { + "epoch": 1.3439490445859872, + "grad_norm": 0.19921875, + "learning_rate": 4.581564575710053e-05, + "loss": 0.011450878344476223, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01152, + "step": 422, + "tokens/total": 55255040, + "tokens/train_per_sec_per_gpu": 3045.36, + "tokens/trainable": 5875602 + }, + { + "epoch": 1.3471337579617835, + "grad_norm": 0.1689453125, + "learning_rate": 4.5784810065119e-05, + "loss": 0.008104214444756508, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00814, + "step": 423, + "tokens/total": 55386112, + "tokens/train_per_sec_per_gpu": 3284.72, + "tokens/trainable": 5889428 + }, + { + "epoch": 1.3503184713375795, + "grad_norm": 0.14453125, + "learning_rate": 4.575387162799399e-05, + "loss": 0.006891798693686724, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00692, + "step": 424, + "tokens/total": 55517184, + "tokens/train_per_sec_per_gpu": 3722.92, + "tokens/trainable": 5904973 + }, + { + "epoch": 1.3535031847133758, + "grad_norm": 0.1669921875, + "learning_rate": 4.5722830598662854e-05, + "loss": 0.009776144288480282, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00982, + "step": 425, + "tokens/total": 55648256, + "tokens/train_per_sec_per_gpu": 3412.77, + "tokens/trainable": 5919245 + }, + { + "epoch": 1.356687898089172, + "grad_norm": 0.166015625, + "learning_rate": 4.56916871305701e-05, + "loss": 0.007931388914585114, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00796, + "step": 426, + "tokens/total": 55779328, + "tokens/train_per_sec_per_gpu": 3407.46, + "tokens/trainable": 5933535 + }, + { + "epoch": 1.3598726114649682, + "grad_norm": 0.1904296875, + "learning_rate": 4.5660441377666654e-05, + "loss": 0.008083492517471313, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00812, + "step": 427, + "tokens/total": 55910400, + "tokens/train_per_sec_per_gpu": 3599.03, + "tokens/trainable": 5948481 + }, + { + "epoch": 1.3630573248407643, + "grad_norm": 0.1484375, + "learning_rate": 4.562909349440899e-05, + "loss": 0.006925994995981455, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00695, + "step": 428, + "tokens/total": 56041472, + "tokens/train_per_sec_per_gpu": 3517.79, + "tokens/trainable": 5963175 + }, + { + "epoch": 1.3662420382165605, + "grad_norm": 0.1484375, + "learning_rate": 4.559764363575851e-05, + "loss": 0.008385020308196545, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00842, + "step": 429, + "tokens/total": 56172544, + "tokens/train_per_sec_per_gpu": 3404.08, + "tokens/trainable": 5977423 + }, + { + "epoch": 1.3694267515923566, + "grad_norm": 0.1669921875, + "learning_rate": 4.556609195718068e-05, + "loss": 0.005221434403210878, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00524, + "step": 430, + "tokens/total": 56303616, + "tokens/train_per_sec_per_gpu": 3212.05, + "tokens/trainable": 5990835 + }, + { + "epoch": 1.372611464968153, + "grad_norm": 0.193359375, + "learning_rate": 4.5534438614644294e-05, + "loss": 0.009253652766346931, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0093, + "step": 431, + "tokens/total": 56434688, + "tokens/train_per_sec_per_gpu": 3584.45, + "tokens/trainable": 6005749 + }, + { + "epoch": 1.3757961783439492, + "grad_norm": 0.2021484375, + "learning_rate": 4.550268376462068e-05, + "loss": 0.009988540783524513, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01004, + "step": 432, + "tokens/total": 56565760, + "tokens/train_per_sec_per_gpu": 3148.92, + "tokens/trainable": 6018952 + }, + { + "epoch": 1.3789808917197452, + "grad_norm": 0.166015625, + "learning_rate": 4.547082756408299e-05, + "loss": 0.007521233521401882, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00755, + "step": 433, + "tokens/total": 56696832, + "tokens/train_per_sec_per_gpu": 3309.63, + "tokens/trainable": 6032837 + }, + { + "epoch": 1.3821656050955413, + "grad_norm": 0.1337890625, + "learning_rate": 4.543887017050534e-05, + "loss": 0.005825295113027096, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00584, + "step": 434, + "tokens/total": 56827904, + "tokens/train_per_sec_per_gpu": 3375.36, + "tokens/trainable": 6046929 + }, + { + "epoch": 1.3853503184713376, + "grad_norm": 0.2265625, + "learning_rate": 4.540681174186209e-05, + "loss": 0.011601070873439312, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01167, + "step": 435, + "tokens/total": 56958976, + "tokens/train_per_sec_per_gpu": 3215.41, + "tokens/trainable": 6060425 + }, + { + "epoch": 1.388535031847134, + "grad_norm": 0.1884765625, + "learning_rate": 4.537465243662704e-05, + "loss": 0.008219108916819096, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00825, + "step": 436, + "tokens/total": 57090048, + "tokens/train_per_sec_per_gpu": 3133.32, + "tokens/trainable": 6073533 + }, + { + "epoch": 1.39171974522293, + "grad_norm": 0.140625, + "learning_rate": 4.534239241377266e-05, + "loss": 0.007054620422422886, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00708, + "step": 437, + "tokens/total": 57221120, + "tokens/train_per_sec_per_gpu": 3767.46, + "tokens/trainable": 6089174 + }, + { + "epoch": 1.394904458598726, + "grad_norm": 0.1455078125, + "learning_rate": 4.5310031832769275e-05, + "loss": 0.007198185659945011, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00722, + "step": 438, + "tokens/total": 57352192, + "tokens/train_per_sec_per_gpu": 3417.69, + "tokens/trainable": 6103402 + }, + { + "epoch": 1.3980891719745223, + "grad_norm": 0.1474609375, + "learning_rate": 4.527757085358431e-05, + "loss": 0.007888494990766048, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00792, + "step": 439, + "tokens/total": 57483264, + "tokens/train_per_sec_per_gpu": 3744.21, + "tokens/trainable": 6119012 + }, + { + "epoch": 1.4012738853503186, + "grad_norm": 0.19140625, + "learning_rate": 4.52450096366815e-05, + "loss": 0.010496556758880615, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01055, + "step": 440, + "tokens/total": 57614336, + "tokens/train_per_sec_per_gpu": 3474.71, + "tokens/trainable": 6133429 + }, + { + "epoch": 1.4044585987261147, + "grad_norm": 0.1572265625, + "learning_rate": 4.521234834302006e-05, + "loss": 0.008718312717974186, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00876, + "step": 441, + "tokens/total": 57745408, + "tokens/train_per_sec_per_gpu": 3481.7, + "tokens/trainable": 6147945 + }, + { + "epoch": 1.4076433121019107, + "grad_norm": 0.2041015625, + "learning_rate": 4.5179587134053916e-05, + "loss": 0.01150327455252409, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01157, + "step": 442, + "tokens/total": 57876480, + "tokens/train_per_sec_per_gpu": 3229.57, + "tokens/trainable": 6161469 + }, + { + "epoch": 1.410828025477707, + "grad_norm": 0.216796875, + "learning_rate": 4.514672617173091e-05, + "loss": 0.011761811561882496, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01183, + "step": 443, + "tokens/total": 58007552, + "tokens/train_per_sec_per_gpu": 3416.37, + "tokens/trainable": 6175738 + }, + { + "epoch": 1.4140127388535033, + "grad_norm": 0.177734375, + "learning_rate": 4.511376561849201e-05, + "loss": 0.008984040468931198, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00902, + "step": 444, + "tokens/total": 58138624, + "tokens/train_per_sec_per_gpu": 3352.83, + "tokens/trainable": 6189737 + }, + { + "epoch": 1.4171974522292994, + "grad_norm": 0.1748046875, + "learning_rate": 4.5080705637270446e-05, + "loss": 0.006133932154625654, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00615, + "step": 445, + "tokens/total": 58269696, + "tokens/train_per_sec_per_gpu": 3183.46, + "tokens/trainable": 6203050 + }, + { + "epoch": 1.4203821656050954, + "grad_norm": 0.169921875, + "learning_rate": 4.5047546391491e-05, + "loss": 0.008717117831110954, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00876, + "step": 446, + "tokens/total": 58400768, + "tokens/train_per_sec_per_gpu": 3819.2, + "tokens/trainable": 6218900 + }, + { + "epoch": 1.4235668789808917, + "grad_norm": 0.14453125, + "learning_rate": 4.50142880450691e-05, + "loss": 0.006517563946545124, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00654, + "step": 447, + "tokens/total": 58531840, + "tokens/train_per_sec_per_gpu": 3083.03, + "tokens/trainable": 6231819 + }, + { + "epoch": 1.426751592356688, + "grad_norm": 0.1591796875, + "learning_rate": 4.4980930762410084e-05, + "loss": 0.010371977463364601, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01043, + "step": 448, + "tokens/total": 58662912, + "tokens/train_per_sec_per_gpu": 3608.97, + "tokens/trainable": 6246842 + }, + { + "epoch": 1.429936305732484, + "grad_norm": 0.1787109375, + "learning_rate": 4.4947474708408353e-05, + "loss": 0.00814439170062542, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00818, + "step": 449, + "tokens/total": 58793984, + "tokens/train_per_sec_per_gpu": 3576.32, + "tokens/trainable": 6261750 + }, + { + "epoch": 1.4331210191082802, + "grad_norm": 0.181640625, + "learning_rate": 4.491392004844656e-05, + "loss": 0.00930082332342863, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00934, + "step": 450, + "tokens/total": 58925056, + "tokens/train_per_sec_per_gpu": 3149.76, + "tokens/trainable": 6274962 + }, + { + "epoch": 1.4363057324840764, + "grad_norm": 0.1875, + "learning_rate": 4.48802669483948e-05, + "loss": 0.01012382097542286, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01018, + "step": 451, + "tokens/total": 59056128, + "tokens/train_per_sec_per_gpu": 3375.6, + "tokens/trainable": 6289094 + }, + { + "epoch": 1.4394904458598727, + "grad_norm": 0.13671875, + "learning_rate": 4.484651557460978e-05, + "loss": 0.007823411375284195, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00785, + "step": 452, + "tokens/total": 59187200, + "tokens/train_per_sec_per_gpu": 3541.02, + "tokens/trainable": 6303818 + }, + { + "epoch": 1.4426751592356688, + "grad_norm": 0.1767578125, + "learning_rate": 4.4812666093934e-05, + "loss": 0.010683316737413406, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01074, + "step": 453, + "tokens/total": 59318272, + "tokens/train_per_sec_per_gpu": 3656.59, + "tokens/trainable": 6319060 + }, + { + "epoch": 1.4458598726114649, + "grad_norm": 0.1650390625, + "learning_rate": 4.477871867369494e-05, + "loss": 0.01043397095054388, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01049, + "step": 454, + "tokens/total": 59449344, + "tokens/train_per_sec_per_gpu": 3638.87, + "tokens/trainable": 6334323 + }, + { + "epoch": 1.4490445859872612, + "grad_norm": 0.16015625, + "learning_rate": 4.474467348170421e-05, + "loss": 0.008449015207588673, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00848, + "step": 455, + "tokens/total": 59580416, + "tokens/train_per_sec_per_gpu": 3481.97, + "tokens/trainable": 6348832 + }, + { + "epoch": 1.4522292993630574, + "grad_norm": 0.1494140625, + "learning_rate": 4.471053068625674e-05, + "loss": 0.008155008777976036, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00819, + "step": 456, + "tokens/total": 59711488, + "tokens/train_per_sec_per_gpu": 3541.25, + "tokens/trainable": 6363586 + }, + { + "epoch": 1.4554140127388535, + "grad_norm": 0.15625, + "learning_rate": 4.467629045612994e-05, + "loss": 0.008736428804695606, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00877, + "step": 457, + "tokens/total": 59842560, + "tokens/train_per_sec_per_gpu": 3481.13, + "tokens/trainable": 6378173 + }, + { + "epoch": 1.4585987261146496, + "grad_norm": 0.185546875, + "learning_rate": 4.4641952960582877e-05, + "loss": 0.013414832763373852, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01351, + "step": 458, + "tokens/total": 59973632, + "tokens/train_per_sec_per_gpu": 3571.41, + "tokens/trainable": 6393061 + }, + { + "epoch": 1.4617834394904459, + "grad_norm": 0.208984375, + "learning_rate": 4.4607518369355403e-05, + "loss": 0.008803540840744972, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00884, + "step": 459, + "tokens/total": 60104704, + "tokens/train_per_sec_per_gpu": 3270.87, + "tokens/trainable": 6406746 + }, + { + "epoch": 1.4649681528662422, + "grad_norm": 0.1806640625, + "learning_rate": 4.457298685266737e-05, + "loss": 0.008787565864622593, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00883, + "step": 460, + "tokens/total": 60235776, + "tokens/train_per_sec_per_gpu": 3181.98, + "tokens/trainable": 6420083 + }, + { + "epoch": 1.4681528662420382, + "grad_norm": 0.1943359375, + "learning_rate": 4.453835858121773e-05, + "loss": 0.008562528528273106, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0086, + "step": 461, + "tokens/total": 60366848, + "tokens/train_per_sec_per_gpu": 3258.95, + "tokens/trainable": 6433715 + }, + { + "epoch": 1.4713375796178343, + "grad_norm": 0.162109375, + "learning_rate": 4.450363372618376e-05, + "loss": 0.0074198306538164616, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00745, + "step": 462, + "tokens/total": 60497920, + "tokens/train_per_sec_per_gpu": 3544.34, + "tokens/trainable": 6448466 + }, + { + "epoch": 1.4745222929936306, + "grad_norm": 0.1484375, + "learning_rate": 4.4468812459220135e-05, + "loss": 0.006448620930314064, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00647, + "step": 463, + "tokens/total": 60628992, + "tokens/train_per_sec_per_gpu": 3102.02, + "tokens/trainable": 6461464 + }, + { + "epoch": 1.4777070063694269, + "grad_norm": 0.1572265625, + "learning_rate": 4.4433894952458156e-05, + "loss": 0.008648392744362354, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00869, + "step": 464, + "tokens/total": 60760064, + "tokens/train_per_sec_per_gpu": 3168.95, + "tokens/trainable": 6475263 + }, + { + "epoch": 1.480891719745223, + "grad_norm": 0.15234375, + "learning_rate": 4.439888137850483e-05, + "loss": 0.008528076112270355, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00856, + "step": 465, + "tokens/total": 60891136, + "tokens/train_per_sec_per_gpu": 3278.68, + "tokens/trainable": 6488927 + }, + { + "epoch": 1.484076433121019, + "grad_norm": 0.1806640625, + "learning_rate": 4.436377191044208e-05, + "loss": 0.009064987301826477, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00911, + "step": 466, + "tokens/total": 61022208, + "tokens/train_per_sec_per_gpu": 3401.94, + "tokens/trainable": 6503171 + }, + { + "epoch": 1.4872611464968153, + "grad_norm": 0.1552734375, + "learning_rate": 4.4328566721825846e-05, + "loss": 0.009180644527077675, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00922, + "step": 467, + "tokens/total": 61153280, + "tokens/train_per_sec_per_gpu": 3399.29, + "tokens/trainable": 6517402 + }, + { + "epoch": 1.4904458598726116, + "grad_norm": 0.2158203125, + "learning_rate": 4.4293265986685264e-05, + "loss": 0.00970767717808485, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00975, + "step": 468, + "tokens/total": 61284352, + "tokens/train_per_sec_per_gpu": 2969.25, + "tokens/trainable": 6529847 + }, + { + "epoch": 1.4936305732484076, + "grad_norm": 0.154296875, + "learning_rate": 4.425786987952174e-05, + "loss": 0.009157263673841953, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0092, + "step": 469, + "tokens/total": 61415424, + "tokens/train_per_sec_per_gpu": 3645.56, + "tokens/trainable": 6545001 + }, + { + "epoch": 1.4968152866242037, + "grad_norm": 0.1396484375, + "learning_rate": 4.4222378575308164e-05, + "loss": 0.0058856685645878315, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0059, + "step": 470, + "tokens/total": 61546496, + "tokens/train_per_sec_per_gpu": 3065.44, + "tokens/trainable": 6557875 + }, + { + "epoch": 1.5, + "grad_norm": 0.1552734375, + "learning_rate": 4.4186792249488005e-05, + "loss": 0.006844916380941868, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00687, + "step": 471, + "tokens/total": 61677568, + "tokens/train_per_sec_per_gpu": 3406.5, + "tokens/trainable": 6572069 + }, + { + "epoch": 1.5, + "eval_loss": 0.009513070806860924, + "eval_ppl": 1.00956, + "eval_runtime": 41.9975, + "eval_samples_per_second": 64.313, + "eval_steps_per_second": 4.024, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 471 + }, + { + "epoch": 1.5031847133757963, + "grad_norm": 0.18359375, + "learning_rate": 4.415111107797445e-05, + "loss": 0.007119299378246069, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00714, + "step": 472, + "tokens/total": 61808640, + "tokens/train_per_sec_per_gpu": 3291.47, + "tokens/trainable": 6585775 + }, + { + "epoch": 1.5063694267515924, + "grad_norm": 0.1689453125, + "learning_rate": 4.411533523714954e-05, + "loss": 0.007842868566513062, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00787, + "step": 473, + "tokens/total": 61939712, + "tokens/train_per_sec_per_gpu": 3180.93, + "tokens/trainable": 6599115 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 0.181640625, + "learning_rate": 4.4079464903863266e-05, + "loss": 0.008342721499502659, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00838, + "step": 474, + "tokens/total": 62070784, + "tokens/train_per_sec_per_gpu": 3367.39, + "tokens/trainable": 6613147 + }, + { + "epoch": 1.5127388535031847, + "grad_norm": 0.171875, + "learning_rate": 4.404350025543276e-05, + "loss": 0.010307609103620052, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01036, + "step": 475, + "tokens/total": 62201856, + "tokens/train_per_sec_per_gpu": 3430.72, + "tokens/trainable": 6627509 + }, + { + "epoch": 1.515923566878981, + "grad_norm": 0.1787109375, + "learning_rate": 4.400744146964136e-05, + "loss": 0.008362861350178719, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0084, + "step": 476, + "tokens/total": 62332928, + "tokens/train_per_sec_per_gpu": 3051.31, + "tokens/trainable": 6640317 + }, + { + "epoch": 1.519108280254777, + "grad_norm": 0.232421875, + "learning_rate": 4.3971288724737745e-05, + "loss": 0.009740196168422699, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00979, + "step": 477, + "tokens/total": 62464000, + "tokens/train_per_sec_per_gpu": 2966.55, + "tokens/trainable": 6652748 + }, + { + "epoch": 1.5222929936305731, + "grad_norm": 0.1328125, + "learning_rate": 4.393504219943509e-05, + "loss": 0.004925255198031664, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00494, + "step": 478, + "tokens/total": 62595072, + "tokens/train_per_sec_per_gpu": 3160.26, + "tokens/trainable": 6666019 + }, + { + "epoch": 1.5254777070063694, + "grad_norm": 0.1748046875, + "learning_rate": 4.3898702072910095e-05, + "loss": 0.008841407485306263, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00888, + "step": 479, + "tokens/total": 62726144, + "tokens/train_per_sec_per_gpu": 3085.05, + "tokens/trainable": 6679004 + }, + { + "epoch": 1.5286624203821657, + "grad_norm": 0.15625, + "learning_rate": 4.386226852480223e-05, + "loss": 0.007529627997428179, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00756, + "step": 480, + "tokens/total": 62857216, + "tokens/train_per_sec_per_gpu": 3263.46, + "tokens/trainable": 6692672 + }, + { + "epoch": 1.5318471337579618, + "grad_norm": 0.1376953125, + "learning_rate": 4.382574173521272e-05, + "loss": 0.006781514268368483, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0068, + "step": 481, + "tokens/total": 62988288, + "tokens/train_per_sec_per_gpu": 3583.46, + "tokens/trainable": 6707600 + }, + { + "epoch": 1.5350318471337578, + "grad_norm": 0.16015625, + "learning_rate": 4.378912188470373e-05, + "loss": 0.0076340967789292336, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00766, + "step": 482, + "tokens/total": 63119360, + "tokens/train_per_sec_per_gpu": 3108.97, + "tokens/trainable": 6720612 + }, + { + "epoch": 1.5382165605095541, + "grad_norm": 0.212890625, + "learning_rate": 4.375240915429745e-05, + "loss": 0.009363564662635326, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00941, + "step": 483, + "tokens/total": 63250432, + "tokens/train_per_sec_per_gpu": 3151.23, + "tokens/trainable": 6733897 + }, + { + "epoch": 1.5414012738853504, + "grad_norm": 0.1474609375, + "learning_rate": 4.3715603725475195e-05, + "loss": 0.008497594855725765, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00853, + "step": 484, + "tokens/total": 63381504, + "tokens/train_per_sec_per_gpu": 3630.74, + "tokens/trainable": 6749020 + }, + { + "epoch": 1.5445859872611465, + "grad_norm": 0.1162109375, + "learning_rate": 4.367870578017653e-05, + "loss": 0.004754690453410149, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00477, + "step": 485, + "tokens/total": 63512576, + "tokens/train_per_sec_per_gpu": 3401.99, + "tokens/trainable": 6763237 + }, + { + "epoch": 1.5477707006369426, + "grad_norm": 0.1748046875, + "learning_rate": 4.364171550079833e-05, + "loss": 0.010673021897673607, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01073, + "step": 486, + "tokens/total": 63643648, + "tokens/train_per_sec_per_gpu": 3289.49, + "tokens/trainable": 6777048 + }, + { + "epoch": 1.5509554140127388, + "grad_norm": 0.1748046875, + "learning_rate": 4.3604633070193915e-05, + "loss": 0.009158292785286903, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0092, + "step": 487, + "tokens/total": 63774720, + "tokens/train_per_sec_per_gpu": 3323.4, + "tokens/trainable": 6790934 + }, + { + "epoch": 1.5541401273885351, + "grad_norm": 0.1298828125, + "learning_rate": 4.3567458671672154e-05, + "loss": 0.007650249172002077, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00768, + "step": 488, + "tokens/total": 63905792, + "tokens/train_per_sec_per_gpu": 3747.82, + "tokens/trainable": 6806546 + }, + { + "epoch": 1.5573248407643312, + "grad_norm": 0.142578125, + "learning_rate": 4.35301924889965e-05, + "loss": 0.006640854757279158, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00666, + "step": 489, + "tokens/total": 64036864, + "tokens/train_per_sec_per_gpu": 3390.74, + "tokens/trainable": 6820768 + }, + { + "epoch": 1.5605095541401273, + "grad_norm": 0.1611328125, + "learning_rate": 4.3492834706384154e-05, + "loss": 0.008299214765429497, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00833, + "step": 490, + "tokens/total": 64167936, + "tokens/train_per_sec_per_gpu": 3305.77, + "tokens/trainable": 6834601 + }, + { + "epoch": 1.5636942675159236, + "grad_norm": 0.1416015625, + "learning_rate": 4.345538550850512e-05, + "loss": 0.0071832421235740185, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00721, + "step": 491, + "tokens/total": 64299008, + "tokens/train_per_sec_per_gpu": 3301.85, + "tokens/trainable": 6848433 + }, + { + "epoch": 1.5668789808917198, + "grad_norm": 0.16796875, + "learning_rate": 4.3417845080481255e-05, + "loss": 0.008073330856859684, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00811, + "step": 492, + "tokens/total": 64430080, + "tokens/train_per_sec_per_gpu": 3378.84, + "tokens/trainable": 6862587 + }, + { + "epoch": 1.570063694267516, + "grad_norm": 0.1611328125, + "learning_rate": 4.3380213607885443e-05, + "loss": 0.009880865924060345, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00993, + "step": 493, + "tokens/total": 64561152, + "tokens/train_per_sec_per_gpu": 3303.22, + "tokens/trainable": 6876421 + }, + { + "epoch": 1.573248407643312, + "grad_norm": 0.1806640625, + "learning_rate": 4.3342491276740595e-05, + "loss": 0.008753279224038124, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00879, + "step": 494, + "tokens/total": 64692224, + "tokens/train_per_sec_per_gpu": 3323.89, + "tokens/trainable": 6890346 + }, + { + "epoch": 1.5764331210191083, + "grad_norm": 0.15234375, + "learning_rate": 4.3304678273518776e-05, + "loss": 0.009203528985381126, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00925, + "step": 495, + "tokens/total": 64823296, + "tokens/train_per_sec_per_gpu": 3358.21, + "tokens/trainable": 6904412 + }, + { + "epoch": 1.5796178343949046, + "grad_norm": 0.1689453125, + "learning_rate": 4.326677478514024e-05, + "loss": 0.00659502949565649, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00662, + "step": 496, + "tokens/total": 64954368, + "tokens/train_per_sec_per_gpu": 3221.86, + "tokens/trainable": 6917910 + }, + { + "epoch": 1.5828025477707006, + "grad_norm": 0.1591796875, + "learning_rate": 4.322878099897259e-05, + "loss": 0.009297506883740425, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00934, + "step": 497, + "tokens/total": 65085440, + "tokens/train_per_sec_per_gpu": 3425.35, + "tokens/trainable": 6932231 + }, + { + "epoch": 1.5859872611464967, + "grad_norm": 0.134765625, + "learning_rate": 4.319069710282974e-05, + "loss": 0.006143941078335047, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00616, + "step": 498, + "tokens/total": 65216512, + "tokens/train_per_sec_per_gpu": 3594.69, + "tokens/trainable": 6947330 + }, + { + "epoch": 1.589171974522293, + "grad_norm": 0.1689453125, + "learning_rate": 4.315252328497107e-05, + "loss": 0.006281242705881596, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0063, + "step": 499, + "tokens/total": 65347584, + "tokens/train_per_sec_per_gpu": 3393.37, + "tokens/trainable": 6961586 + }, + { + "epoch": 1.5923566878980893, + "grad_norm": 0.1572265625, + "learning_rate": 4.311425973410047e-05, + "loss": 0.007922859862446785, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00795, + "step": 500, + "tokens/total": 65478656, + "tokens/train_per_sec_per_gpu": 3263.1, + "tokens/trainable": 6975331 + }, + { + "epoch": 1.5955414012738853, + "grad_norm": 0.23046875, + "learning_rate": 4.307590663936541e-05, + "loss": 0.009491047821938992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00954, + "step": 501, + "tokens/total": 65609728, + "tokens/train_per_sec_per_gpu": 3050.63, + "tokens/trainable": 6988184 + }, + { + "epoch": 1.5987261146496814, + "grad_norm": 0.1591796875, + "learning_rate": 4.3037464190355955e-05, + "loss": 0.007340395823121071, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00737, + "step": 502, + "tokens/total": 65740800, + "tokens/train_per_sec_per_gpu": 3185.7, + "tokens/trainable": 7001560 + }, + { + "epoch": 1.6019108280254777, + "grad_norm": 0.13671875, + "learning_rate": 4.299893257710394e-05, + "loss": 0.006943684071302414, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00697, + "step": 503, + "tokens/total": 65871872, + "tokens/train_per_sec_per_gpu": 3219.92, + "tokens/trainable": 7015042 + }, + { + "epoch": 1.605095541401274, + "grad_norm": 0.185546875, + "learning_rate": 4.2960311990081924e-05, + "loss": 0.009585048072040081, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00963, + "step": 504, + "tokens/total": 66002944, + "tokens/train_per_sec_per_gpu": 3349.25, + "tokens/trainable": 7029069 + }, + { + "epoch": 1.60828025477707, + "grad_norm": 0.1748046875, + "learning_rate": 4.292160262020229e-05, + "loss": 0.007607592269778252, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00764, + "step": 505, + "tokens/total": 66134016, + "tokens/train_per_sec_per_gpu": 3369.3, + "tokens/trainable": 7043148 + }, + { + "epoch": 1.611464968152866, + "grad_norm": 0.16015625, + "learning_rate": 4.288280465881632e-05, + "loss": 0.009396728128194809, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00944, + "step": 506, + "tokens/total": 66265088, + "tokens/train_per_sec_per_gpu": 3676.06, + "tokens/trainable": 7058458 + }, + { + "epoch": 1.6146496815286624, + "grad_norm": 0.1474609375, + "learning_rate": 4.2843918297713196e-05, + "loss": 0.007050440181046724, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00708, + "step": 507, + "tokens/total": 66396160, + "tokens/train_per_sec_per_gpu": 3145.55, + "tokens/trainable": 7071711 + }, + { + "epoch": 1.6178343949044587, + "grad_norm": 0.126953125, + "learning_rate": 4.2804943729119115e-05, + "loss": 0.007194128353148699, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00722, + "step": 508, + "tokens/total": 66527232, + "tokens/train_per_sec_per_gpu": 3462.21, + "tokens/trainable": 7086201 + }, + { + "epoch": 1.6210191082802548, + "grad_norm": 0.17578125, + "learning_rate": 4.2765881145696306e-05, + "loss": 0.00787313375622034, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0079, + "step": 509, + "tokens/total": 66658304, + "tokens/train_per_sec_per_gpu": 3054.79, + "tokens/trainable": 7099037 + }, + { + "epoch": 1.6242038216560508, + "grad_norm": 0.1572265625, + "learning_rate": 4.272673074054205e-05, + "loss": 0.006892327219247818, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00692, + "step": 510, + "tokens/total": 66789376, + "tokens/train_per_sec_per_gpu": 3439.83, + "tokens/trainable": 7113420 + }, + { + "epoch": 1.627388535031847, + "grad_norm": 0.13671875, + "learning_rate": 4.268749270718778e-05, + "loss": 0.006877953186631203, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0069, + "step": 511, + "tokens/total": 66920448, + "tokens/train_per_sec_per_gpu": 3209.06, + "tokens/trainable": 7126871 + }, + { + "epoch": 1.6305732484076434, + "grad_norm": 0.1474609375, + "learning_rate": 4.2648167239598115e-05, + "loss": 0.00894979014992714, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00899, + "step": 512, + "tokens/total": 67051520, + "tokens/train_per_sec_per_gpu": 3488.79, + "tokens/trainable": 7141439 + }, + { + "epoch": 1.6337579617834395, + "grad_norm": 0.2021484375, + "learning_rate": 4.260875453216985e-05, + "loss": 0.011133270338177681, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0112, + "step": 513, + "tokens/total": 67182592, + "tokens/train_per_sec_per_gpu": 3127.41, + "tokens/trainable": 7154645 + }, + { + "epoch": 1.6369426751592355, + "grad_norm": 0.1826171875, + "learning_rate": 4.256925477973105e-05, + "loss": 0.00897931307554245, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00902, + "step": 514, + "tokens/total": 67313664, + "tokens/train_per_sec_per_gpu": 3531.85, + "tokens/trainable": 7169429 + }, + { + "epoch": 1.6401273885350318, + "grad_norm": 0.1689453125, + "learning_rate": 4.2529668177540064e-05, + "loss": 0.007193025201559067, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00722, + "step": 515, + "tokens/total": 67444736, + "tokens/train_per_sec_per_gpu": 3300.04, + "tokens/trainable": 7183294 + }, + { + "epoch": 1.643312101910828, + "grad_norm": 0.1953125, + "learning_rate": 4.248999492128456e-05, + "loss": 0.008410904556512833, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00845, + "step": 516, + "tokens/total": 67575808, + "tokens/train_per_sec_per_gpu": 3053.28, + "tokens/trainable": 7196109 + }, + { + "epoch": 1.6464968152866242, + "grad_norm": 0.169921875, + "learning_rate": 4.2450235207080594e-05, + "loss": 0.007929853163659573, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00796, + "step": 517, + "tokens/total": 67706880, + "tokens/train_per_sec_per_gpu": 3368.48, + "tokens/trainable": 7210198 + }, + { + "epoch": 1.6496815286624202, + "grad_norm": 0.166015625, + "learning_rate": 4.241038923147154e-05, + "loss": 0.011742248199880123, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01181, + "step": 518, + "tokens/total": 67837952, + "tokens/train_per_sec_per_gpu": 3747.84, + "tokens/trainable": 7225870 + }, + { + "epoch": 1.6528662420382165, + "grad_norm": 0.150390625, + "learning_rate": 4.237045719142726e-05, + "loss": 0.007296052295714617, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00732, + "step": 519, + "tokens/total": 67969024, + "tokens/train_per_sec_per_gpu": 3095.92, + "tokens/trainable": 7238841 + }, + { + "epoch": 1.6560509554140128, + "grad_norm": 0.15234375, + "learning_rate": 4.2330439284343015e-05, + "loss": 0.006907866336405277, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00693, + "step": 520, + "tokens/total": 68100096, + "tokens/train_per_sec_per_gpu": 3589.32, + "tokens/trainable": 7253801 + }, + { + "epoch": 1.6592356687898089, + "grad_norm": 0.15625, + "learning_rate": 4.229033570803853e-05, + "loss": 0.0074706668965518475, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0075, + "step": 521, + "tokens/total": 68231168, + "tokens/train_per_sec_per_gpu": 3802.59, + "tokens/trainable": 7269629 + }, + { + "epoch": 1.662420382165605, + "grad_norm": 0.1513671875, + "learning_rate": 4.2250146660757036e-05, + "loss": 0.009104968048632145, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00915, + "step": 522, + "tokens/total": 68362240, + "tokens/train_per_sec_per_gpu": 3755.79, + "tokens/trainable": 7285363 + }, + { + "epoch": 1.6656050955414012, + "grad_norm": 0.1484375, + "learning_rate": 4.220987234116426e-05, + "loss": 0.005891850218176842, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00591, + "step": 523, + "tokens/total": 68493312, + "tokens/train_per_sec_per_gpu": 3446.53, + "tokens/trainable": 7299790 + }, + { + "epoch": 1.6687898089171975, + "grad_norm": 0.162109375, + "learning_rate": 4.216951294834744e-05, + "loss": 0.006473960820585489, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00649, + "step": 524, + "tokens/total": 68624384, + "tokens/train_per_sec_per_gpu": 3751.7, + "tokens/trainable": 7315516 + }, + { + "epoch": 1.6719745222929936, + "grad_norm": 0.1337890625, + "learning_rate": 4.2129068681814396e-05, + "loss": 0.0052047837525606155, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00522, + "step": 525, + "tokens/total": 68755456, + "tokens/train_per_sec_per_gpu": 3241.92, + "tokens/trainable": 7329146 + }, + { + "epoch": 1.6751592356687897, + "grad_norm": 0.2490234375, + "learning_rate": 4.208853974149246e-05, + "loss": 0.01116788387298584, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01123, + "step": 526, + "tokens/total": 68886528, + "tokens/train_per_sec_per_gpu": 3005.85, + "tokens/trainable": 7341894 + }, + { + "epoch": 1.678343949044586, + "grad_norm": 0.2490234375, + "learning_rate": 4.204792632772754e-05, + "loss": 0.01081200409680605, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01087, + "step": 527, + "tokens/total": 69017600, + "tokens/train_per_sec_per_gpu": 3072.53, + "tokens/trainable": 7354819 + }, + { + "epoch": 1.6815286624203822, + "grad_norm": 0.181640625, + "learning_rate": 4.200722864128315e-05, + "loss": 0.007884484715759754, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00792, + "step": 528, + "tokens/total": 69148672, + "tokens/train_per_sec_per_gpu": 3481.52, + "tokens/trainable": 7369372 + }, + { + "epoch": 1.6847133757961783, + "grad_norm": 0.146484375, + "learning_rate": 4.196644688333935e-05, + "loss": 0.006211051717400551, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00623, + "step": 529, + "tokens/total": 69279744, + "tokens/train_per_sec_per_gpu": 3436.61, + "tokens/trainable": 7383760 + }, + { + "epoch": 1.6878980891719744, + "grad_norm": 0.1689453125, + "learning_rate": 4.19255812554918e-05, + "loss": 0.007918811403214931, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00795, + "step": 530, + "tokens/total": 69410816, + "tokens/train_per_sec_per_gpu": 3498.93, + "tokens/trainable": 7398384 + }, + { + "epoch": 1.6910828025477707, + "grad_norm": 0.2177734375, + "learning_rate": 4.1884631959750766e-05, + "loss": 0.007444203365594149, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00747, + "step": 531, + "tokens/total": 69541888, + "tokens/train_per_sec_per_gpu": 3052.82, + "tokens/trainable": 7411204 + }, + { + "epoch": 1.694267515923567, + "grad_norm": 0.2392578125, + "learning_rate": 4.1843599198540095e-05, + "loss": 0.006427375599741936, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00645, + "step": 532, + "tokens/total": 69672960, + "tokens/train_per_sec_per_gpu": 2976.57, + "tokens/trainable": 7423690 + }, + { + "epoch": 1.697452229299363, + "grad_norm": 0.169921875, + "learning_rate": 4.1802483174696214e-05, + "loss": 0.007701891474425793, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00773, + "step": 533, + "tokens/total": 69804032, + "tokens/train_per_sec_per_gpu": 2933.51, + "tokens/trainable": 7436112 + }, + { + "epoch": 1.700636942675159, + "grad_norm": 0.13671875, + "learning_rate": 4.176128409146718e-05, + "loss": 0.006673748139292002, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0067, + "step": 534, + "tokens/total": 69935104, + "tokens/train_per_sec_per_gpu": 3182.67, + "tokens/trainable": 7449477 + }, + { + "epoch": 1.7038216560509554, + "grad_norm": 0.1328125, + "learning_rate": 4.172000215251161e-05, + "loss": 0.008220399729907513, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00825, + "step": 535, + "tokens/total": 70066176, + "tokens/train_per_sec_per_gpu": 3196.8, + "tokens/trainable": 7462890 + }, + { + "epoch": 1.7070063694267517, + "grad_norm": 0.1982421875, + "learning_rate": 4.167863756189767e-05, + "loss": 0.008523629046976566, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00856, + "step": 536, + "tokens/total": 70197248, + "tokens/train_per_sec_per_gpu": 3266.89, + "tokens/trainable": 7476579 + }, + { + "epoch": 1.7101910828025477, + "grad_norm": 0.1630859375, + "learning_rate": 4.163719052410217e-05, + "loss": 0.008510093204677105, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00855, + "step": 537, + "tokens/total": 70328320, + "tokens/train_per_sec_per_gpu": 3648.67, + "tokens/trainable": 7491858 + }, + { + "epoch": 1.7133757961783438, + "grad_norm": 0.16796875, + "learning_rate": 4.159566124400942e-05, + "loss": 0.00962991826236248, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00968, + "step": 538, + "tokens/total": 70459392, + "tokens/train_per_sec_per_gpu": 3612.85, + "tokens/trainable": 7507000 + }, + { + "epoch": 1.71656050955414, + "grad_norm": 0.1611328125, + "learning_rate": 4.1554049926910285e-05, + "loss": 0.006633860524743795, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00666, + "step": 539, + "tokens/total": 70590464, + "tokens/train_per_sec_per_gpu": 3414.03, + "tokens/trainable": 7521257 + }, + { + "epoch": 1.7197452229299364, + "grad_norm": 0.1611328125, + "learning_rate": 4.151235677850119e-05, + "loss": 0.007898521609604359, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00793, + "step": 540, + "tokens/total": 70721536, + "tokens/train_per_sec_per_gpu": 3441.64, + "tokens/trainable": 7535621 + }, + { + "epoch": 1.7229299363057324, + "grad_norm": 0.154296875, + "learning_rate": 4.147058200488305e-05, + "loss": 0.009673213586211205, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00972, + "step": 541, + "tokens/total": 70852608, + "tokens/train_per_sec_per_gpu": 3247.07, + "tokens/trainable": 7549162 + }, + { + "epoch": 1.7261146496815285, + "grad_norm": 0.1416015625, + "learning_rate": 4.142872581256028e-05, + "loss": 0.007840042002499104, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00787, + "step": 542, + "tokens/total": 70983680, + "tokens/train_per_sec_per_gpu": 3313.54, + "tokens/trainable": 7563047 + }, + { + "epoch": 1.7292993630573248, + "grad_norm": 0.158203125, + "learning_rate": 4.1386788408439784e-05, + "loss": 0.005681775975972414, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0057, + "step": 543, + "tokens/total": 71114752, + "tokens/train_per_sec_per_gpu": 3239.22, + "tokens/trainable": 7576603 + }, + { + "epoch": 1.732484076433121, + "grad_norm": 0.140625, + "learning_rate": 4.134476999982989e-05, + "loss": 0.005047548562288284, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00506, + "step": 544, + "tokens/total": 71245824, + "tokens/train_per_sec_per_gpu": 3265.49, + "tokens/trainable": 7590285 + }, + { + "epoch": 1.7356687898089171, + "grad_norm": 0.1572265625, + "learning_rate": 4.130267079443938e-05, + "loss": 0.0074127367697656155, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00744, + "step": 545, + "tokens/total": 71376896, + "tokens/train_per_sec_per_gpu": 3477.95, + "tokens/trainable": 7604842 + }, + { + "epoch": 1.7388535031847132, + "grad_norm": 0.197265625, + "learning_rate": 4.1260491000376446e-05, + "loss": 0.007608677726238966, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00764, + "step": 546, + "tokens/total": 71507968, + "tokens/train_per_sec_per_gpu": 3118.58, + "tokens/trainable": 7617958 + }, + { + "epoch": 1.7420382165605095, + "grad_norm": 0.21484375, + "learning_rate": 4.1218230826147615e-05, + "loss": 0.01108642015606165, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01115, + "step": 547, + "tokens/total": 71639040, + "tokens/train_per_sec_per_gpu": 3443.01, + "tokens/trainable": 7632387 + }, + { + "epoch": 1.7452229299363058, + "grad_norm": 0.1552734375, + "learning_rate": 4.117589048065677e-05, + "loss": 0.006157029885798693, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00618, + "step": 548, + "tokens/total": 71770112, + "tokens/train_per_sec_per_gpu": 3439.27, + "tokens/trainable": 7646780 + }, + { + "epoch": 1.7484076433121019, + "grad_norm": 0.138671875, + "learning_rate": 4.113347017320414e-05, + "loss": 0.005342322401702404, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00536, + "step": 549, + "tokens/total": 71901184, + "tokens/train_per_sec_per_gpu": 3001.79, + "tokens/trainable": 7659368 + }, + { + "epoch": 1.7515923566878981, + "grad_norm": 0.09423828125, + "learning_rate": 4.1090970113485184e-05, + "loss": 0.0040708379819989204, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00408, + "step": 550, + "tokens/total": 72032256, + "tokens/train_per_sec_per_gpu": 3546.66, + "tokens/trainable": 7674195 + }, + { + "epoch": 1.7547770700636942, + "grad_norm": 0.19921875, + "learning_rate": 4.1048390511589595e-05, + "loss": 0.01067125890403986, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01073, + "step": 551, + "tokens/total": 72163328, + "tokens/train_per_sec_per_gpu": 3415.06, + "tokens/trainable": 7688507 + }, + { + "epoch": 1.7579617834394905, + "grad_norm": 0.171875, + "learning_rate": 4.1005731578000305e-05, + "loss": 0.008569694124162197, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00861, + "step": 552, + "tokens/total": 72294400, + "tokens/train_per_sec_per_gpu": 3627.75, + "tokens/trainable": 7703620 + }, + { + "epoch": 1.7611464968152868, + "grad_norm": 0.1767578125, + "learning_rate": 4.0962993523592374e-05, + "loss": 0.009042307734489441, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00908, + "step": 553, + "tokens/total": 72425472, + "tokens/train_per_sec_per_gpu": 3269.54, + "tokens/trainable": 7717318 + }, + { + "epoch": 1.7643312101910829, + "grad_norm": 0.1943359375, + "learning_rate": 4.092017655963198e-05, + "loss": 0.007899527437984943, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00793, + "step": 554, + "tokens/total": 72556544, + "tokens/train_per_sec_per_gpu": 3212.13, + "tokens/trainable": 7730777 + }, + { + "epoch": 1.767515923566879, + "grad_norm": 0.1787109375, + "learning_rate": 4.0877280897775406e-05, + "loss": 0.010296393185853958, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01035, + "step": 555, + "tokens/total": 72687616, + "tokens/train_per_sec_per_gpu": 3337.93, + "tokens/trainable": 7744761 + }, + { + "epoch": 1.7707006369426752, + "grad_norm": 0.146484375, + "learning_rate": 4.083430675006791e-05, + "loss": 0.009942286647856236, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00999, + "step": 556, + "tokens/total": 72818688, + "tokens/train_per_sec_per_gpu": 3400.69, + "tokens/trainable": 7759000 + }, + { + "epoch": 1.7738853503184715, + "grad_norm": 0.197265625, + "learning_rate": 4.0791254328942756e-05, + "loss": 0.00717775197699666, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0072, + "step": 557, + "tokens/total": 72949760, + "tokens/train_per_sec_per_gpu": 3122.89, + "tokens/trainable": 7772100 + }, + { + "epoch": 1.7770700636942676, + "grad_norm": 0.1435546875, + "learning_rate": 4.074812384722014e-05, + "loss": 0.008067919872701168, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0081, + "step": 558, + "tokens/total": 73080832, + "tokens/train_per_sec_per_gpu": 3519.68, + "tokens/trainable": 7786740 + }, + { + "epoch": 1.7802547770700636, + "grad_norm": 0.169921875, + "learning_rate": 4.0704915518106125e-05, + "loss": 0.007346912752836943, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00737, + "step": 559, + "tokens/total": 73211904, + "tokens/train_per_sec_per_gpu": 3104.83, + "tokens/trainable": 7799749 + }, + { + "epoch": 1.78343949044586, + "grad_norm": 0.1298828125, + "learning_rate": 4.066162955519159e-05, + "loss": 0.0073562380857765675, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00738, + "step": 560, + "tokens/total": 73342976, + "tokens/train_per_sec_per_gpu": 3360.32, + "tokens/trainable": 7813821 + }, + { + "epoch": 1.7866242038216562, + "grad_norm": 0.19921875, + "learning_rate": 4.061826617245119e-05, + "loss": 0.007865460589528084, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0079, + "step": 561, + "tokens/total": 73474048, + "tokens/train_per_sec_per_gpu": 2859.48, + "tokens/trainable": 7825883 + }, + { + "epoch": 1.7898089171974523, + "grad_norm": 0.189453125, + "learning_rate": 4.0574825584242275e-05, + "loss": 0.008709411136806011, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00875, + "step": 562, + "tokens/total": 73605120, + "tokens/train_per_sec_per_gpu": 3470.03, + "tokens/trainable": 7840437 + }, + { + "epoch": 1.7929936305732483, + "grad_norm": 0.1728515625, + "learning_rate": 4.053130800530386e-05, + "loss": 0.010312874801456928, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01037, + "step": 563, + "tokens/total": 73736192, + "tokens/train_per_sec_per_gpu": 3416.83, + "tokens/trainable": 7854748 + }, + { + "epoch": 1.7961783439490446, + "grad_norm": 0.1787109375, + "learning_rate": 4.048771365075554e-05, + "loss": 0.006712635047733784, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00674, + "step": 564, + "tokens/total": 73867264, + "tokens/train_per_sec_per_gpu": 3008.08, + "tokens/trainable": 7867356 + }, + { + "epoch": 1.799363057324841, + "grad_norm": 0.216796875, + "learning_rate": 4.0444042736096435e-05, + "loss": 0.012959079816937447, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01304, + "step": 565, + "tokens/total": 73998336, + "tokens/train_per_sec_per_gpu": 3455.78, + "tokens/trainable": 7881813 + }, + { + "epoch": 1.802547770700637, + "grad_norm": 0.142578125, + "learning_rate": 4.0400295477204105e-05, + "loss": 0.006475909147411585, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0065, + "step": 566, + "tokens/total": 74129408, + "tokens/train_per_sec_per_gpu": 3215.58, + "tokens/trainable": 7895287 + }, + { + "epoch": 1.805732484076433, + "grad_norm": 0.1748046875, + "learning_rate": 4.035647209033353e-05, + "loss": 0.009855620563030243, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0099, + "step": 567, + "tokens/total": 74260480, + "tokens/train_per_sec_per_gpu": 3762.07, + "tokens/trainable": 7910983 + }, + { + "epoch": 1.8089171974522293, + "grad_norm": 0.169921875, + "learning_rate": 4.031257279211599e-05, + "loss": 0.007472330704331398, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0075, + "step": 568, + "tokens/total": 74391552, + "tokens/train_per_sec_per_gpu": 3163.77, + "tokens/trainable": 7924299 + }, + { + "epoch": 1.8121019108280256, + "grad_norm": 0.16796875, + "learning_rate": 4.026859779955802e-05, + "loss": 0.008227458223700523, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00826, + "step": 569, + "tokens/total": 74522624, + "tokens/train_per_sec_per_gpu": 3363.95, + "tokens/trainable": 7938293 + }, + { + "epoch": 1.8152866242038217, + "grad_norm": 0.1591796875, + "learning_rate": 4.022454733004035e-05, + "loss": 0.0075818696059286594, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00761, + "step": 570, + "tokens/total": 74653696, + "tokens/train_per_sec_per_gpu": 2956.76, + "tokens/trainable": 7950764 + }, + { + "epoch": 1.8184713375796178, + "grad_norm": 0.1865234375, + "learning_rate": 4.01804216013168e-05, + "loss": 0.009609042666852474, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00966, + "step": 571, + "tokens/total": 74784768, + "tokens/train_per_sec_per_gpu": 3564.99, + "tokens/trainable": 7965654 + }, + { + "epoch": 1.821656050955414, + "grad_norm": 0.2041015625, + "learning_rate": 4.013622083151321e-05, + "loss": 0.011200753971934319, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01126, + "step": 572, + "tokens/total": 74915840, + "tokens/train_per_sec_per_gpu": 3175.53, + "tokens/trainable": 7978976 + }, + { + "epoch": 1.8248407643312103, + "grad_norm": 0.1943359375, + "learning_rate": 4.009194523912638e-05, + "loss": 0.011081540025770664, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01114, + "step": 573, + "tokens/total": 75046912, + "tokens/train_per_sec_per_gpu": 3512.32, + "tokens/trainable": 7993605 + }, + { + "epoch": 1.8280254777070064, + "grad_norm": 0.150390625, + "learning_rate": 4.004759504302297e-05, + "loss": 0.007977863773703575, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00801, + "step": 574, + "tokens/total": 75177984, + "tokens/train_per_sec_per_gpu": 3555.25, + "tokens/trainable": 8008422 + }, + { + "epoch": 1.8312101910828025, + "grad_norm": 0.1474609375, + "learning_rate": 4.000317046243845e-05, + "loss": 0.005992071703076363, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00601, + "step": 575, + "tokens/total": 75309056, + "tokens/train_per_sec_per_gpu": 2628.73, + "tokens/trainable": 8019575 + }, + { + "epoch": 1.8343949044585988, + "grad_norm": 0.134765625, + "learning_rate": 3.9958671716975966e-05, + "loss": 0.005763609427958727, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00578, + "step": 576, + "tokens/total": 75440128, + "tokens/train_per_sec_per_gpu": 3099.03, + "tokens/trainable": 8032607 + }, + { + "epoch": 1.837579617834395, + "grad_norm": 0.18359375, + "learning_rate": 3.9914099026605286e-05, + "loss": 0.00910909567028284, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00915, + "step": 577, + "tokens/total": 75571200, + "tokens/train_per_sec_per_gpu": 3311.4, + "tokens/trainable": 8046493 + }, + { + "epoch": 1.8407643312101911, + "grad_norm": 0.1630859375, + "learning_rate": 3.986945261166174e-05, + "loss": 0.0058432393707334995, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00586, + "step": 578, + "tokens/total": 75702272, + "tokens/train_per_sec_per_gpu": 2863.58, + "tokens/trainable": 8058562 + }, + { + "epoch": 1.8439490445859872, + "grad_norm": 0.1494140625, + "learning_rate": 3.9824732692845045e-05, + "loss": 0.006885102018713951, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00691, + "step": 579, + "tokens/total": 75833344, + "tokens/train_per_sec_per_gpu": 3605.27, + "tokens/trainable": 8073625 + }, + { + "epoch": 1.8471337579617835, + "grad_norm": 0.150390625, + "learning_rate": 3.977993949121831e-05, + "loss": 0.007448772434145212, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00748, + "step": 580, + "tokens/total": 75964416, + "tokens/train_per_sec_per_gpu": 3178.81, + "tokens/trainable": 8086952 + }, + { + "epoch": 1.8503184713375798, + "grad_norm": 0.1787109375, + "learning_rate": 3.9735073228206896e-05, + "loss": 0.01037865225225687, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01043, + "step": 581, + "tokens/total": 76095488, + "tokens/train_per_sec_per_gpu": 3771.77, + "tokens/trainable": 8102605 + }, + { + "epoch": 1.8535031847133758, + "grad_norm": 0.1728515625, + "learning_rate": 3.9690134125597315e-05, + "loss": 0.005139034241437912, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00515, + "step": 582, + "tokens/total": 76226560, + "tokens/train_per_sec_per_gpu": 3043.71, + "tokens/trainable": 8115368 + }, + { + "epoch": 1.856687898089172, + "grad_norm": 0.1376953125, + "learning_rate": 3.9645122405536144e-05, + "loss": 0.006013063248246908, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00603, + "step": 583, + "tokens/total": 76357632, + "tokens/train_per_sec_per_gpu": 3251.38, + "tokens/trainable": 8129041 + }, + { + "epoch": 1.8598726114649682, + "grad_norm": 0.1728515625, + "learning_rate": 3.9600038290528944e-05, + "loss": 0.00799723993986845, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00803, + "step": 584, + "tokens/total": 76488704, + "tokens/train_per_sec_per_gpu": 3320.59, + "tokens/trainable": 8142910 + }, + { + "epoch": 1.8630573248407645, + "grad_norm": 0.142578125, + "learning_rate": 3.955488200343913e-05, + "loss": 0.00701179401949048, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00704, + "step": 585, + "tokens/total": 76619776, + "tokens/train_per_sec_per_gpu": 3561.59, + "tokens/trainable": 8157807 + }, + { + "epoch": 1.8662420382165605, + "grad_norm": 0.1533203125, + "learning_rate": 3.950965376748689e-05, + "loss": 0.00536251999437809, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00538, + "step": 586, + "tokens/total": 76750848, + "tokens/train_per_sec_per_gpu": 3253.01, + "tokens/trainable": 8171483 + }, + { + "epoch": 1.8694267515923566, + "grad_norm": 0.1474609375, + "learning_rate": 3.946435380624808e-05, + "loss": 0.005477463360875845, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00549, + "step": 587, + "tokens/total": 76881920, + "tokens/train_per_sec_per_gpu": 2986.43, + "tokens/trainable": 8184075 + }, + { + "epoch": 1.872611464968153, + "grad_norm": 0.154296875, + "learning_rate": 3.94189823436531e-05, + "loss": 0.007740751840174198, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00777, + "step": 588, + "tokens/total": 77012992, + "tokens/train_per_sec_per_gpu": 3302.5, + "tokens/trainable": 8197922 + }, + { + "epoch": 1.8757961783439492, + "grad_norm": 0.1669921875, + "learning_rate": 3.937353960398581e-05, + "loss": 0.007541216444224119, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00757, + "step": 589, + "tokens/total": 77144064, + "tokens/train_per_sec_per_gpu": 3178.73, + "tokens/trainable": 8211342 + }, + { + "epoch": 1.8789808917197452, + "grad_norm": 0.1435546875, + "learning_rate": 3.932802581188243e-05, + "loss": 0.006363678723573685, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00638, + "step": 590, + "tokens/total": 77275136, + "tokens/train_per_sec_per_gpu": 3260.49, + "tokens/trainable": 8225038 + }, + { + "epoch": 1.8821656050955413, + "grad_norm": 0.2060546875, + "learning_rate": 3.928244119233038e-05, + "loss": 0.010623229667544365, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01068, + "step": 591, + "tokens/total": 77406208, + "tokens/train_per_sec_per_gpu": 3108.47, + "tokens/trainable": 8238178 + }, + { + "epoch": 1.8853503184713376, + "grad_norm": 0.205078125, + "learning_rate": 3.9236785970667214e-05, + "loss": 0.008010565303266048, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00804, + "step": 592, + "tokens/total": 77537280, + "tokens/train_per_sec_per_gpu": 3537.95, + "tokens/trainable": 8252970 + }, + { + "epoch": 1.888535031847134, + "grad_norm": 0.130859375, + "learning_rate": 3.91910603725795e-05, + "loss": 0.006147422362118959, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00617, + "step": 593, + "tokens/total": 77668352, + "tokens/train_per_sec_per_gpu": 3429.45, + "tokens/trainable": 8267246 + }, + { + "epoch": 1.89171974522293, + "grad_norm": 0.162109375, + "learning_rate": 3.9145264624101676e-05, + "loss": 0.007066651247441769, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00709, + "step": 594, + "tokens/total": 77799424, + "tokens/train_per_sec_per_gpu": 3344.2, + "tokens/trainable": 8281242 + }, + { + "epoch": 1.894904458598726, + "grad_norm": 0.169921875, + "learning_rate": 3.909939895161498e-05, + "loss": 0.007343544624745846, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00737, + "step": 595, + "tokens/total": 77930496, + "tokens/train_per_sec_per_gpu": 3615.15, + "tokens/trainable": 8296272 + }, + { + "epoch": 1.8980891719745223, + "grad_norm": 0.1669921875, + "learning_rate": 3.905346358184629e-05, + "loss": 0.006192365661263466, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00621, + "step": 596, + "tokens/total": 78061568, + "tokens/train_per_sec_per_gpu": 3043.7, + "tokens/trainable": 8309000 + }, + { + "epoch": 1.9012738853503186, + "grad_norm": 0.201171875, + "learning_rate": 3.900745874186701e-05, + "loss": 0.008313626050949097, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00835, + "step": 597, + "tokens/total": 78192640, + "tokens/train_per_sec_per_gpu": 3567.72, + "tokens/trainable": 8323863 + }, + { + "epoch": 1.9044585987261147, + "grad_norm": 0.1298828125, + "learning_rate": 3.896138465909196e-05, + "loss": 0.006214370485395193, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00623, + "step": 598, + "tokens/total": 78323712, + "tokens/train_per_sec_per_gpu": 3365.54, + "tokens/trainable": 8337954 + }, + { + "epoch": 1.9076433121019107, + "grad_norm": 0.1591796875, + "learning_rate": 3.8915241561278266e-05, + "loss": 0.007558876648545265, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00759, + "step": 599, + "tokens/total": 78454784, + "tokens/train_per_sec_per_gpu": 3425.9, + "tokens/trainable": 8352302 + }, + { + "epoch": 1.910828025477707, + "grad_norm": 0.166015625, + "learning_rate": 3.8869029676524174e-05, + "loss": 0.005686955992132425, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0057, + "step": 600, + "tokens/total": 78585856, + "tokens/train_per_sec_per_gpu": 3357.73, + "tokens/trainable": 8366310 + }, + { + "epoch": 1.9140127388535033, + "grad_norm": 0.1630859375, + "learning_rate": 3.8822749233268006e-05, + "loss": 0.0077353366650640965, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00777, + "step": 601, + "tokens/total": 78716928, + "tokens/train_per_sec_per_gpu": 3165.47, + "tokens/trainable": 8379573 + }, + { + "epoch": 1.9171974522292994, + "grad_norm": 0.1279296875, + "learning_rate": 3.877640046028696e-05, + "loss": 0.0062081338837742805, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00623, + "step": 602, + "tokens/total": 78848000, + "tokens/train_per_sec_per_gpu": 3562.58, + "tokens/trainable": 8394385 + }, + { + "epoch": 1.9203821656050954, + "grad_norm": 0.154296875, + "learning_rate": 3.872998358669601e-05, + "loss": 0.006809039041399956, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00683, + "step": 603, + "tokens/total": 78979072, + "tokens/train_per_sec_per_gpu": 3598.12, + "tokens/trainable": 8409414 + }, + { + "epoch": 1.9235668789808917, + "grad_norm": 0.1513671875, + "learning_rate": 3.868349884194678e-05, + "loss": 0.004747939296066761, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00476, + "step": 604, + "tokens/total": 79110144, + "tokens/train_per_sec_per_gpu": 3006.34, + "tokens/trainable": 8422068 + }, + { + "epoch": 1.926751592356688, + "grad_norm": 0.1806640625, + "learning_rate": 3.863694645582642e-05, + "loss": 0.007029777858406305, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00705, + "step": 605, + "tokens/total": 79241216, + "tokens/train_per_sec_per_gpu": 3244.04, + "tokens/trainable": 8435649 + }, + { + "epoch": 1.929936305732484, + "grad_norm": 0.1650390625, + "learning_rate": 3.8590326658456376e-05, + "loss": 0.006050920579582453, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00607, + "step": 606, + "tokens/total": 79372288, + "tokens/train_per_sec_per_gpu": 3305.31, + "tokens/trainable": 8449489 + }, + { + "epoch": 1.9331210191082802, + "grad_norm": 0.17578125, + "learning_rate": 3.854363968029142e-05, + "loss": 0.0075315129943192005, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00756, + "step": 607, + "tokens/total": 79503360, + "tokens/train_per_sec_per_gpu": 3186.77, + "tokens/trainable": 8462838 + }, + { + "epoch": 1.9363057324840764, + "grad_norm": 0.158203125, + "learning_rate": 3.849688575211836e-05, + "loss": 0.006646899972110987, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00667, + "step": 608, + "tokens/total": 79634432, + "tokens/train_per_sec_per_gpu": 3410.5, + "tokens/trainable": 8477093 + }, + { + "epoch": 1.9394904458598727, + "grad_norm": 0.189453125, + "learning_rate": 3.8450065105054966e-05, + "loss": 0.00727155851200223, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0073, + "step": 609, + "tokens/total": 79765504, + "tokens/train_per_sec_per_gpu": 3308.9, + "tokens/trainable": 8490972 + }, + { + "epoch": 1.9426751592356688, + "grad_norm": 0.1875, + "learning_rate": 3.840317797054882e-05, + "loss": 0.009210377931594849, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00925, + "step": 610, + "tokens/total": 79896576, + "tokens/train_per_sec_per_gpu": 3480.61, + "tokens/trainable": 8505449 + }, + { + "epoch": 1.9458598726114649, + "grad_norm": 0.1650390625, + "learning_rate": 3.83562245803762e-05, + "loss": 0.008728603832423687, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00877, + "step": 611, + "tokens/total": 80027648, + "tokens/train_per_sec_per_gpu": 3432.92, + "tokens/trainable": 8519873 + }, + { + "epoch": 1.9490445859872612, + "grad_norm": 0.2119140625, + "learning_rate": 3.830920516664085e-05, + "loss": 0.00592332798987627, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00594, + "step": 612, + "tokens/total": 80158720, + "tokens/train_per_sec_per_gpu": 2869.74, + "tokens/trainable": 8531911 + }, + { + "epoch": 1.9522292993630574, + "grad_norm": 0.162109375, + "learning_rate": 3.826211996177291e-05, + "loss": 0.00876440480351448, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0088, + "step": 613, + "tokens/total": 80289792, + "tokens/train_per_sec_per_gpu": 3232.65, + "tokens/trainable": 8545475 + }, + { + "epoch": 1.9554140127388535, + "grad_norm": 0.173828125, + "learning_rate": 3.8214969198527787e-05, + "loss": 0.010759076103568077, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.01082, + "step": 614, + "tokens/total": 80420864, + "tokens/train_per_sec_per_gpu": 3370.92, + "tokens/trainable": 8559559 + }, + { + "epoch": 1.9585987261146496, + "grad_norm": 0.158203125, + "learning_rate": 3.8167753109984886e-05, + "loss": 0.007340329699218273, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00737, + "step": 615, + "tokens/total": 80551936, + "tokens/train_per_sec_per_gpu": 3167.2, + "tokens/trainable": 8572860 + }, + { + "epoch": 1.9617834394904459, + "grad_norm": 0.1923828125, + "learning_rate": 3.8120471929546576e-05, + "loss": 0.009786421433091164, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00983, + "step": 616, + "tokens/total": 80683008, + "tokens/train_per_sec_per_gpu": 3398.38, + "tokens/trainable": 8587062 + }, + { + "epoch": 1.9649681528662422, + "grad_norm": 0.1591796875, + "learning_rate": 3.807312589093701e-05, + "loss": 0.007565875072032213, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00759, + "step": 617, + "tokens/total": 80814080, + "tokens/train_per_sec_per_gpu": 3589.95, + "tokens/trainable": 8602027 + }, + { + "epoch": 1.9681528662420382, + "grad_norm": 0.158203125, + "learning_rate": 3.802571522820091e-05, + "loss": 0.005681060254573822, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0057, + "step": 618, + "tokens/total": 80945152, + "tokens/train_per_sec_per_gpu": 3097.22, + "tokens/trainable": 8615057 + }, + { + "epoch": 1.9713375796178343, + "grad_norm": 0.140625, + "learning_rate": 3.7978240175702475e-05, + "loss": 0.007764302659779787, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00779, + "step": 619, + "tokens/total": 81076224, + "tokens/train_per_sec_per_gpu": 3585.75, + "tokens/trainable": 8630164 + }, + { + "epoch": 1.9745222929936306, + "grad_norm": 0.1787109375, + "learning_rate": 3.7930700968124214e-05, + "loss": 0.007851460948586464, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00788, + "step": 620, + "tokens/total": 81207296, + "tokens/train_per_sec_per_gpu": 3407.67, + "tokens/trainable": 8644417 + }, + { + "epoch": 1.9777070063694269, + "grad_norm": 0.158203125, + "learning_rate": 3.788309784046574e-05, + "loss": 0.007941392250359058, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00797, + "step": 621, + "tokens/total": 81338368, + "tokens/train_per_sec_per_gpu": 3178.19, + "tokens/trainable": 8657791 + }, + { + "epoch": 1.980891719745223, + "grad_norm": 0.19921875, + "learning_rate": 3.7835431028042664e-05, + "loss": 0.008540787734091282, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00858, + "step": 622, + "tokens/total": 81469440, + "tokens/train_per_sec_per_gpu": 3471.65, + "tokens/trainable": 8672330 + }, + { + "epoch": 1.984076433121019, + "grad_norm": 0.1611328125, + "learning_rate": 3.778770076648543e-05, + "loss": 0.008716538548469543, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00875, + "step": 623, + "tokens/total": 81600512, + "tokens/train_per_sec_per_gpu": 3219.98, + "tokens/trainable": 8685828 + }, + { + "epoch": 1.9872611464968153, + "grad_norm": 0.158203125, + "learning_rate": 3.773990729173807e-05, + "loss": 0.007769486866891384, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0078, + "step": 624, + "tokens/total": 81731584, + "tokens/train_per_sec_per_gpu": 3473.48, + "tokens/trainable": 8700372 + }, + { + "epoch": 1.9904458598726116, + "grad_norm": 0.1484375, + "learning_rate": 3.769205084005714e-05, + "loss": 0.008443665690720081, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00848, + "step": 625, + "tokens/total": 81862656, + "tokens/train_per_sec_per_gpu": 3400.54, + "tokens/trainable": 8714943 + }, + { + "epoch": 1.9936305732484076, + "grad_norm": 0.201171875, + "learning_rate": 3.7644131648010494e-05, + "loss": 0.009850014001131058, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0099, + "step": 626, + "tokens/total": 81993728, + "tokens/train_per_sec_per_gpu": 3193.43, + "tokens/trainable": 8728328 + }, + { + "epoch": 1.9968152866242037, + "grad_norm": 0.1416015625, + "learning_rate": 3.759614995247615e-05, + "loss": 0.0070216236636042595, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00705, + "step": 627, + "tokens/total": 82124800, + "tokens/train_per_sec_per_gpu": 3280.91, + "tokens/trainable": 8742135 + }, + { + "epoch": 2.0, + "grad_norm": 0.2431640625, + "learning_rate": 3.7548105990641055e-05, + "loss": 0.008461863733828068, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 39.25, + "memory/max_allocated (GiB)": 39.25, + "ppl": 1.0085, + "step": 628, + "tokens/total": 82198528, + "tokens/train_per_sec_per_gpu": 3127.15, + "tokens/trainable": 8749352 + }, + { + "epoch": 2.0, + "eval_loss": 0.00880392361432314, + "eval_ppl": 1.00884, + "eval_runtime": 41.5789, + "eval_samples_per_second": 64.961, + "eval_steps_per_second": 4.065, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 628 + }, + { + "epoch": 2.0031847133757963, + "grad_norm": 0.1103515625, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.00489779282361269, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00491, + "step": 629, + "tokens/total": 82329600, + "tokens/train_per_sec_per_gpu": 3227.88, + "tokens/trainable": 8762717 + }, + { + "epoch": 2.0063694267515926, + "grad_norm": 0.140625, + "learning_rate": 3.745183221835439e-05, + "loss": 0.0062369704246521, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00626, + "step": 630, + "tokens/total": 82460672, + "tokens/train_per_sec_per_gpu": 3063.9, + "tokens/trainable": 8775458 + }, + { + "epoch": 2.0095541401273884, + "grad_norm": 0.1015625, + "learning_rate": 3.740360288381105e-05, + "loss": 0.004345161374658346, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00435, + "step": 631, + "tokens/total": 82591744, + "tokens/train_per_sec_per_gpu": 3775.58, + "tokens/trainable": 8791109 + }, + { + "epoch": 2.0127388535031847, + "grad_norm": 0.09326171875, + "learning_rate": 3.735531223478113e-05, + "loss": 0.003698494518175721, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00371, + "step": 632, + "tokens/total": 82722816, + "tokens/train_per_sec_per_gpu": 3064.96, + "tokens/trainable": 8803927 + }, + { + "epoch": 2.015923566878981, + "grad_norm": 0.1416015625, + "learning_rate": 3.730696050997883e-05, + "loss": 0.006370588671416044, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00639, + "step": 633, + "tokens/total": 82853888, + "tokens/train_per_sec_per_gpu": 3757.38, + "tokens/trainable": 8819539 + }, + { + "epoch": 2.0191082802547773, + "grad_norm": 0.12255859375, + "learning_rate": 3.725854794842028e-05, + "loss": 0.004867184441536665, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00488, + "step": 634, + "tokens/total": 82984960, + "tokens/train_per_sec_per_gpu": 3367.17, + "tokens/trainable": 8833596 + }, + { + "epoch": 2.022292993630573, + "grad_norm": 0.11669921875, + "learning_rate": 3.721007478942236e-05, + "loss": 0.005630412604659796, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00565, + "step": 635, + "tokens/total": 83116032, + "tokens/train_per_sec_per_gpu": 3394.97, + "tokens/trainable": 8847823 + }, + { + "epoch": 2.0254777070063694, + "grad_norm": 0.158203125, + "learning_rate": 3.716154127260147e-05, + "loss": 0.0077174571342766285, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00775, + "step": 636, + "tokens/total": 83247104, + "tokens/train_per_sec_per_gpu": 3334.91, + "tokens/trainable": 8861774 + }, + { + "epoch": 2.0286624203821657, + "grad_norm": 0.11376953125, + "learning_rate": 3.7112947637872395e-05, + "loss": 0.0045103938318789005, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00452, + "step": 637, + "tokens/total": 83378176, + "tokens/train_per_sec_per_gpu": 3076.34, + "tokens/trainable": 8874743 + }, + { + "epoch": 2.031847133757962, + "grad_norm": 0.126953125, + "learning_rate": 3.706429412544711e-05, + "loss": 0.005497838370501995, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00551, + "step": 638, + "tokens/total": 83509248, + "tokens/train_per_sec_per_gpu": 3103.99, + "tokens/trainable": 8887731 + }, + { + "epoch": 2.035031847133758, + "grad_norm": 0.1279296875, + "learning_rate": 3.701558097583355e-05, + "loss": 0.004869392607361078, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00488, + "step": 639, + "tokens/total": 83640320, + "tokens/train_per_sec_per_gpu": 3107.44, + "tokens/trainable": 8900746 + }, + { + "epoch": 2.038216560509554, + "grad_norm": 0.13671875, + "learning_rate": 3.696680842983447e-05, + "loss": 0.006643592845648527, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00667, + "step": 640, + "tokens/total": 83771392, + "tokens/train_per_sec_per_gpu": 3319.13, + "tokens/trainable": 8914631 + }, + { + "epoch": 2.0414012738853504, + "grad_norm": 0.1376953125, + "learning_rate": 3.691797672854625e-05, + "loss": 0.005362721625715494, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00538, + "step": 641, + "tokens/total": 83902464, + "tokens/train_per_sec_per_gpu": 3013.92, + "tokens/trainable": 8927265 + }, + { + "epoch": 2.0445859872611467, + "grad_norm": 0.1357421875, + "learning_rate": 3.686908611335768e-05, + "loss": 0.005462102126330137, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00548, + "step": 642, + "tokens/total": 84033536, + "tokens/train_per_sec_per_gpu": 3114.98, + "tokens/trainable": 8940288 + }, + { + "epoch": 2.0477707006369426, + "grad_norm": 0.125, + "learning_rate": 3.682013682594876e-05, + "loss": 0.0043016825802624226, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00431, + "step": 643, + "tokens/total": 84164608, + "tokens/train_per_sec_per_gpu": 3431.73, + "tokens/trainable": 8954653 + }, + { + "epoch": 2.050955414012739, + "grad_norm": 0.19140625, + "learning_rate": 3.677112910828957e-05, + "loss": 0.0066072107292711735, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00663, + "step": 644, + "tokens/total": 84295680, + "tokens/train_per_sec_per_gpu": 2967.06, + "tokens/trainable": 8967100 + }, + { + "epoch": 2.054140127388535, + "grad_norm": 0.1455078125, + "learning_rate": 3.672206320263897e-05, + "loss": 0.0054827104322612286, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0055, + "step": 645, + "tokens/total": 84426752, + "tokens/train_per_sec_per_gpu": 3194.77, + "tokens/trainable": 8980536 + }, + { + "epoch": 2.0573248407643314, + "grad_norm": 0.1513671875, + "learning_rate": 3.66729393515435e-05, + "loss": 0.005452790763229132, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00547, + "step": 646, + "tokens/total": 84557824, + "tokens/train_per_sec_per_gpu": 3109.68, + "tokens/trainable": 8993576 + }, + { + "epoch": 2.0605095541401273, + "grad_norm": 0.181640625, + "learning_rate": 3.662375779783614e-05, + "loss": 0.0072727687656879425, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0073, + "step": 647, + "tokens/total": 84688896, + "tokens/train_per_sec_per_gpu": 3147.5, + "tokens/trainable": 9006855 + }, + { + "epoch": 2.0636942675159236, + "grad_norm": 0.1064453125, + "learning_rate": 3.657451878463508e-05, + "loss": 0.003491069655865431, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0035, + "step": 648, + "tokens/total": 84819968, + "tokens/train_per_sec_per_gpu": 3224.99, + "tokens/trainable": 9020369 + }, + { + "epoch": 2.06687898089172, + "grad_norm": 0.1455078125, + "learning_rate": 3.652522255534258e-05, + "loss": 0.005467304494231939, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00548, + "step": 649, + "tokens/total": 84951040, + "tokens/train_per_sec_per_gpu": 3640.83, + "tokens/trainable": 9035605 + }, + { + "epoch": 2.070063694267516, + "grad_norm": 0.1337890625, + "learning_rate": 3.647586935364372e-05, + "loss": 0.004504749551415443, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00451, + "step": 650, + "tokens/total": 85082112, + "tokens/train_per_sec_per_gpu": 3394.33, + "tokens/trainable": 9049828 + }, + { + "epoch": 2.073248407643312, + "grad_norm": 0.1787109375, + "learning_rate": 3.6426459423505214e-05, + "loss": 0.007018570322543383, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00704, + "step": 651, + "tokens/total": 85213184, + "tokens/train_per_sec_per_gpu": 2787.07, + "tokens/trainable": 9061509 + }, + { + "epoch": 2.0764331210191083, + "grad_norm": 0.12890625, + "learning_rate": 3.637699300917418e-05, + "loss": 0.005671660415828228, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00569, + "step": 652, + "tokens/total": 85344256, + "tokens/train_per_sec_per_gpu": 3667.49, + "tokens/trainable": 9076828 + }, + { + "epoch": 2.0796178343949046, + "grad_norm": 0.1455078125, + "learning_rate": 3.632747035517701e-05, + "loss": 0.005398279055953026, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00541, + "step": 653, + "tokens/total": 85475328, + "tokens/train_per_sec_per_gpu": 3551.7, + "tokens/trainable": 9091646 + }, + { + "epoch": 2.082802547770701, + "grad_norm": 0.2099609375, + "learning_rate": 3.6277891706318036e-05, + "loss": 0.007613079622387886, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00764, + "step": 654, + "tokens/total": 85606400, + "tokens/train_per_sec_per_gpu": 3571.21, + "tokens/trainable": 9106545 + }, + { + "epoch": 2.0859872611464967, + "grad_norm": 0.1640625, + "learning_rate": 3.622825730767842e-05, + "loss": 0.0069300332106649876, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00695, + "step": 655, + "tokens/total": 85737472, + "tokens/train_per_sec_per_gpu": 3786.4, + "tokens/trainable": 9122295 + }, + { + "epoch": 2.089171974522293, + "grad_norm": 0.19140625, + "learning_rate": 3.6178567404614936e-05, + "loss": 0.006750217638909817, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00677, + "step": 656, + "tokens/total": 85868544, + "tokens/train_per_sec_per_gpu": 3589.29, + "tokens/trainable": 9137329 + }, + { + "epoch": 2.0923566878980893, + "grad_norm": 0.162109375, + "learning_rate": 3.6128822242758686e-05, + "loss": 0.0060827480629086494, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0061, + "step": 657, + "tokens/total": 85999616, + "tokens/train_per_sec_per_gpu": 3096.84, + "tokens/trainable": 9150353 + }, + { + "epoch": 2.0955414012738856, + "grad_norm": 0.1337890625, + "learning_rate": 3.6079022068013945e-05, + "loss": 0.006425363477319479, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00645, + "step": 658, + "tokens/total": 86130688, + "tokens/train_per_sec_per_gpu": 3687.66, + "tokens/trainable": 9165791 + }, + { + "epoch": 2.0987261146496814, + "grad_norm": 0.13671875, + "learning_rate": 3.602916712655697e-05, + "loss": 0.004524726886302233, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00453, + "step": 659, + "tokens/total": 86261760, + "tokens/train_per_sec_per_gpu": 3224.45, + "tokens/trainable": 9179333 + }, + { + "epoch": 2.1019108280254777, + "grad_norm": 0.1806640625, + "learning_rate": 3.597925766483468e-05, + "loss": 0.008739529177546501, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00878, + "step": 660, + "tokens/total": 86392832, + "tokens/train_per_sec_per_gpu": 3380.82, + "tokens/trainable": 9193503 + }, + { + "epoch": 2.105095541401274, + "grad_norm": 0.125, + "learning_rate": 3.592929392956355e-05, + "loss": 0.003972796723246574, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00398, + "step": 661, + "tokens/total": 86523904, + "tokens/train_per_sec_per_gpu": 3337.39, + "tokens/trainable": 9207523 + }, + { + "epoch": 2.1082802547770703, + "grad_norm": 0.1435546875, + "learning_rate": 3.587927616772834e-05, + "loss": 0.00485801137983799, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00487, + "step": 662, + "tokens/total": 86654976, + "tokens/train_per_sec_per_gpu": 3418.39, + "tokens/trainable": 9221801 + }, + { + "epoch": 2.111464968152866, + "grad_norm": 0.14453125, + "learning_rate": 3.5829204626580856e-05, + "loss": 0.005488412454724312, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0055, + "step": 663, + "tokens/total": 86786048, + "tokens/train_per_sec_per_gpu": 3308.78, + "tokens/trainable": 9235658 + }, + { + "epoch": 2.1146496815286624, + "grad_norm": 0.1708984375, + "learning_rate": 3.577907955363877e-05, + "loss": 0.007495546247810125, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00752, + "step": 664, + "tokens/total": 86917120, + "tokens/train_per_sec_per_gpu": 3508.8, + "tokens/trainable": 9250377 + }, + { + "epoch": 2.1178343949044587, + "grad_norm": 0.185546875, + "learning_rate": 3.572890119668439e-05, + "loss": 0.007228251546621323, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00725, + "step": 665, + "tokens/total": 87048192, + "tokens/train_per_sec_per_gpu": 3480.87, + "tokens/trainable": 9264939 + }, + { + "epoch": 2.121019108280255, + "grad_norm": 0.1396484375, + "learning_rate": 3.567866980376337e-05, + "loss": 0.005014233291149139, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00503, + "step": 666, + "tokens/total": 87179264, + "tokens/train_per_sec_per_gpu": 3039.74, + "tokens/trainable": 9277680 + }, + { + "epoch": 2.124203821656051, + "grad_norm": 0.16015625, + "learning_rate": 3.562838562318358e-05, + "loss": 0.004775107838213444, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00479, + "step": 667, + "tokens/total": 87310336, + "tokens/train_per_sec_per_gpu": 3171.11, + "tokens/trainable": 9291025 + }, + { + "epoch": 2.127388535031847, + "grad_norm": 0.146484375, + "learning_rate": 3.557804890351383e-05, + "loss": 0.006139194592833519, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00616, + "step": 668, + "tokens/total": 87441408, + "tokens/train_per_sec_per_gpu": 3193.18, + "tokens/trainable": 9304405 + }, + { + "epoch": 2.1305732484076434, + "grad_norm": 0.1279296875, + "learning_rate": 3.5527659893582635e-05, + "loss": 0.004298456013202667, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00431, + "step": 669, + "tokens/total": 87572480, + "tokens/train_per_sec_per_gpu": 3227.18, + "tokens/trainable": 9317913 + }, + { + "epoch": 2.1337579617834397, + "grad_norm": 0.162109375, + "learning_rate": 3.547721884247699e-05, + "loss": 0.005037225782871246, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00505, + "step": 670, + "tokens/total": 87703552, + "tokens/train_per_sec_per_gpu": 3102.83, + "tokens/trainable": 9331033 + }, + { + "epoch": 2.1369426751592355, + "grad_norm": 0.1640625, + "learning_rate": 3.5426725999541174e-05, + "loss": 0.005763325374573469, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00578, + "step": 671, + "tokens/total": 87834624, + "tokens/train_per_sec_per_gpu": 3489.71, + "tokens/trainable": 9345625 + }, + { + "epoch": 2.140127388535032, + "grad_norm": 0.1591796875, + "learning_rate": 3.5376181614375436e-05, + "loss": 0.005982933100312948, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.006, + "step": 672, + "tokens/total": 87965696, + "tokens/train_per_sec_per_gpu": 3141.99, + "tokens/trainable": 9358787 + }, + { + "epoch": 2.143312101910828, + "grad_norm": 0.140625, + "learning_rate": 3.532558593683486e-05, + "loss": 0.005526629742234945, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00554, + "step": 673, + "tokens/total": 88096768, + "tokens/train_per_sec_per_gpu": 3602.99, + "tokens/trainable": 9373882 + }, + { + "epoch": 2.1464968152866244, + "grad_norm": 0.11767578125, + "learning_rate": 3.527493921702807e-05, + "loss": 0.0037272945046424866, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00373, + "step": 674, + "tokens/total": 88227840, + "tokens/train_per_sec_per_gpu": 3350.15, + "tokens/trainable": 9387904 + }, + { + "epoch": 2.1496815286624202, + "grad_norm": 0.158203125, + "learning_rate": 3.5224241705316e-05, + "loss": 0.006022762041538954, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00604, + "step": 675, + "tokens/total": 88358912, + "tokens/train_per_sec_per_gpu": 3348.71, + "tokens/trainable": 9401921 + }, + { + "epoch": 2.1528662420382165, + "grad_norm": 0.138671875, + "learning_rate": 3.517349365231065e-05, + "loss": 0.005744612775743008, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00576, + "step": 676, + "tokens/total": 88489984, + "tokens/train_per_sec_per_gpu": 3430.41, + "tokens/trainable": 9416291 + }, + { + "epoch": 2.156050955414013, + "grad_norm": 0.1484375, + "learning_rate": 3.5122695308873886e-05, + "loss": 0.005131675861775875, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00514, + "step": 677, + "tokens/total": 88621056, + "tokens/train_per_sec_per_gpu": 3279.37, + "tokens/trainable": 9430037 + }, + { + "epoch": 2.159235668789809, + "grad_norm": 0.1806640625, + "learning_rate": 3.5071846926116156e-05, + "loss": 0.007699973881244659, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00773, + "step": 678, + "tokens/total": 88752128, + "tokens/train_per_sec_per_gpu": 3222.97, + "tokens/trainable": 9443541 + }, + { + "epoch": 2.162420382165605, + "grad_norm": 0.142578125, + "learning_rate": 3.502094875539528e-05, + "loss": 0.004470378626137972, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00448, + "step": 679, + "tokens/total": 88883200, + "tokens/train_per_sec_per_gpu": 3648.92, + "tokens/trainable": 9458725 + }, + { + "epoch": 2.1656050955414012, + "grad_norm": 0.1982421875, + "learning_rate": 3.497000104831518e-05, + "loss": 0.00871230848133564, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00875, + "step": 680, + "tokens/total": 89014272, + "tokens/train_per_sec_per_gpu": 3137.55, + "tokens/trainable": 9471880 + }, + { + "epoch": 2.1687898089171975, + "grad_norm": 0.130859375, + "learning_rate": 3.491900405672466e-05, + "loss": 0.0037058612797409296, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00371, + "step": 681, + "tokens/total": 89145344, + "tokens/train_per_sec_per_gpu": 3191.78, + "tokens/trainable": 9485245 + }, + { + "epoch": 2.171974522292994, + "grad_norm": 0.1318359375, + "learning_rate": 3.486795803271614e-05, + "loss": 0.004613788798451424, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00462, + "step": 682, + "tokens/total": 89276416, + "tokens/train_per_sec_per_gpu": 3499.78, + "tokens/trainable": 9499844 + }, + { + "epoch": 2.1751592356687897, + "grad_norm": 0.12109375, + "learning_rate": 3.481686322862443e-05, + "loss": 0.003956732805818319, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00396, + "step": 683, + "tokens/total": 89407488, + "tokens/train_per_sec_per_gpu": 3088.9, + "tokens/trainable": 9512840 + }, + { + "epoch": 2.178343949044586, + "grad_norm": 0.1611328125, + "learning_rate": 3.476571989702548e-05, + "loss": 0.006073053926229477, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00609, + "step": 684, + "tokens/total": 89538560, + "tokens/train_per_sec_per_gpu": 3425.79, + "tokens/trainable": 9527160 + }, + { + "epoch": 2.1815286624203822, + "grad_norm": 0.2412109375, + "learning_rate": 3.4714528290735105e-05, + "loss": 0.005430576391518116, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00545, + "step": 685, + "tokens/total": 89669632, + "tokens/train_per_sec_per_gpu": 3295.01, + "tokens/trainable": 9540964 + }, + { + "epoch": 2.1847133757961785, + "grad_norm": 0.10205078125, + "learning_rate": 3.466328866280778e-05, + "loss": 0.003143883775919676, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00315, + "step": 686, + "tokens/total": 89800704, + "tokens/train_per_sec_per_gpu": 3469.02, + "tokens/trainable": 9555485 + }, + { + "epoch": 2.1878980891719744, + "grad_norm": 0.1328125, + "learning_rate": 3.4612001266535345e-05, + "loss": 0.005530213471502066, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00555, + "step": 687, + "tokens/total": 89931776, + "tokens/train_per_sec_per_gpu": 3563.1, + "tokens/trainable": 9570400 + }, + { + "epoch": 2.1910828025477707, + "grad_norm": 0.1357421875, + "learning_rate": 3.456066635544577e-05, + "loss": 0.004905232228338718, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00492, + "step": 688, + "tokens/total": 90062848, + "tokens/train_per_sec_per_gpu": 3391.03, + "tokens/trainable": 9584608 + }, + { + "epoch": 2.194267515923567, + "grad_norm": 0.146484375, + "learning_rate": 3.450928418330193e-05, + "loss": 0.006313517689704895, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00633, + "step": 689, + "tokens/total": 90193920, + "tokens/train_per_sec_per_gpu": 3288.42, + "tokens/trainable": 9598448 + }, + { + "epoch": 2.1974522292993632, + "grad_norm": 0.1201171875, + "learning_rate": 3.44578550041003e-05, + "loss": 0.0040069082751870155, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00401, + "step": 690, + "tokens/total": 90324992, + "tokens/train_per_sec_per_gpu": 3679.5, + "tokens/trainable": 9613777 + }, + { + "epoch": 2.200636942675159, + "grad_norm": 0.1396484375, + "learning_rate": 3.440637907206973e-05, + "loss": 0.0068097589537501335, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00683, + "step": 691, + "tokens/total": 90456064, + "tokens/train_per_sec_per_gpu": 3546.91, + "tokens/trainable": 9628632 + }, + { + "epoch": 2.2038216560509554, + "grad_norm": 0.1357421875, + "learning_rate": 3.435485664167019e-05, + "loss": 0.004060130566358566, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00407, + "step": 692, + "tokens/total": 90587136, + "tokens/train_per_sec_per_gpu": 3218.86, + "tokens/trainable": 9642131 + }, + { + "epoch": 2.2070063694267517, + "grad_norm": 0.1923828125, + "learning_rate": 3.4303287967591484e-05, + "loss": 0.008195128291845322, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00823, + "step": 693, + "tokens/total": 90718208, + "tokens/train_per_sec_per_gpu": 3409.39, + "tokens/trainable": 9656400 + }, + { + "epoch": 2.210191082802548, + "grad_norm": 0.1796875, + "learning_rate": 3.425167330475205e-05, + "loss": 0.0061119189485907555, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00613, + "step": 694, + "tokens/total": 90849280, + "tokens/train_per_sec_per_gpu": 3495.48, + "tokens/trainable": 9670962 + }, + { + "epoch": 2.213375796178344, + "grad_norm": 0.13671875, + "learning_rate": 3.420001290829761e-05, + "loss": 0.004308244213461876, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00432, + "step": 695, + "tokens/total": 90980352, + "tokens/train_per_sec_per_gpu": 3283.37, + "tokens/trainable": 9684728 + }, + { + "epoch": 2.21656050955414, + "grad_norm": 0.1494140625, + "learning_rate": 3.4148307033600014e-05, + "loss": 0.006343189161270857, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00636, + "step": 696, + "tokens/total": 91111424, + "tokens/train_per_sec_per_gpu": 3576.41, + "tokens/trainable": 9699704 + }, + { + "epoch": 2.2197452229299364, + "grad_norm": 0.1533203125, + "learning_rate": 3.409655593625587e-05, + "loss": 0.006463784724473953, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00648, + "step": 697, + "tokens/total": 91242496, + "tokens/train_per_sec_per_gpu": 3253.16, + "tokens/trainable": 9713318 + }, + { + "epoch": 2.2229299363057327, + "grad_norm": 0.12451171875, + "learning_rate": 3.404475987208539e-05, + "loss": 0.0030284496024250984, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00303, + "step": 698, + "tokens/total": 91373568, + "tokens/train_per_sec_per_gpu": 3353.96, + "tokens/trainable": 9727366 + }, + { + "epoch": 2.2261146496815285, + "grad_norm": 0.1357421875, + "learning_rate": 3.399291909713101e-05, + "loss": 0.004884797614067793, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0049, + "step": 699, + "tokens/total": 91504640, + "tokens/train_per_sec_per_gpu": 3792.66, + "tokens/trainable": 9743134 + }, + { + "epoch": 2.229299363057325, + "grad_norm": 0.1875, + "learning_rate": 3.394103386765625e-05, + "loss": 0.005894185043871403, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00591, + "step": 700, + "tokens/total": 91635712, + "tokens/train_per_sec_per_gpu": 2956.26, + "tokens/trainable": 9755576 + }, + { + "epoch": 2.232484076433121, + "grad_norm": 0.1533203125, + "learning_rate": 3.388910444014432e-05, + "loss": 0.0050967601127922535, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00511, + "step": 701, + "tokens/total": 91766784, + "tokens/train_per_sec_per_gpu": 3181.89, + "tokens/trainable": 9768924 + }, + { + "epoch": 2.2356687898089174, + "grad_norm": 0.1357421875, + "learning_rate": 3.3837131071296945e-05, + "loss": 0.004923132713884115, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00494, + "step": 702, + "tokens/total": 91897856, + "tokens/train_per_sec_per_gpu": 3211.71, + "tokens/trainable": 9782384 + }, + { + "epoch": 2.238853503184713, + "grad_norm": 0.1416015625, + "learning_rate": 3.378511401803307e-05, + "loss": 0.005397360771894455, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00541, + "step": 703, + "tokens/total": 92028928, + "tokens/train_per_sec_per_gpu": 3335.08, + "tokens/trainable": 9796325 + }, + { + "epoch": 2.2420382165605095, + "grad_norm": 0.1376953125, + "learning_rate": 3.373305353748755e-05, + "loss": 0.004327027127146721, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00434, + "step": 704, + "tokens/total": 92160000, + "tokens/train_per_sec_per_gpu": 3315.25, + "tokens/trainable": 9810212 + }, + { + "epoch": 2.245222929936306, + "grad_norm": 0.15625, + "learning_rate": 3.368094988700996e-05, + "loss": 0.007469909265637398, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0075, + "step": 705, + "tokens/total": 92291072, + "tokens/train_per_sec_per_gpu": 3489.88, + "tokens/trainable": 9824826 + }, + { + "epoch": 2.248407643312102, + "grad_norm": 0.1513671875, + "learning_rate": 3.3628803324163236e-05, + "loss": 0.005583882797509432, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0056, + "step": 706, + "tokens/total": 92422144, + "tokens/train_per_sec_per_gpu": 3391.08, + "tokens/trainable": 9839023 + }, + { + "epoch": 2.251592356687898, + "grad_norm": 0.1298828125, + "learning_rate": 3.357661410672247e-05, + "loss": 0.004044718574732542, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00405, + "step": 707, + "tokens/total": 92553216, + "tokens/train_per_sec_per_gpu": 3547.4, + "tokens/trainable": 9853877 + }, + { + "epoch": 2.254777070063694, + "grad_norm": 0.1552734375, + "learning_rate": 3.352438249267359e-05, + "loss": 0.005919166840612888, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00594, + "step": 708, + "tokens/total": 92684288, + "tokens/train_per_sec_per_gpu": 3416.57, + "tokens/trainable": 9868162 + }, + { + "epoch": 2.2579617834394905, + "grad_norm": 0.150390625, + "learning_rate": 3.347210874021211e-05, + "loss": 0.005268896464258432, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00528, + "step": 709, + "tokens/total": 92815360, + "tokens/train_per_sec_per_gpu": 3315.44, + "tokens/trainable": 9882010 + }, + { + "epoch": 2.261146496815287, + "grad_norm": 0.1630859375, + "learning_rate": 3.3419793107741834e-05, + "loss": 0.0063535538502037525, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00637, + "step": 710, + "tokens/total": 92946432, + "tokens/train_per_sec_per_gpu": 3215.77, + "tokens/trainable": 9895483 + }, + { + "epoch": 2.2643312101910826, + "grad_norm": 0.11865234375, + "learning_rate": 3.336743585387362e-05, + "loss": 0.0036360113881528378, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00364, + "step": 711, + "tokens/total": 93077504, + "tokens/train_per_sec_per_gpu": 3574.66, + "tokens/trainable": 9910386 + }, + { + "epoch": 2.267515923566879, + "grad_norm": 0.1552734375, + "learning_rate": 3.3315037237424036e-05, + "loss": 0.0054854946210980415, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0055, + "step": 712, + "tokens/total": 93208576, + "tokens/train_per_sec_per_gpu": 3491.98, + "tokens/trainable": 9924935 + }, + { + "epoch": 2.270700636942675, + "grad_norm": 0.1884765625, + "learning_rate": 3.326259751741414e-05, + "loss": 0.0039428528398275375, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00395, + "step": 713, + "tokens/total": 93339648, + "tokens/train_per_sec_per_gpu": 3219.92, + "tokens/trainable": 9938434 + }, + { + "epoch": 2.2738853503184715, + "grad_norm": 0.169921875, + "learning_rate": 3.321011695306818e-05, + "loss": 0.007426953874528408, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00745, + "step": 714, + "tokens/total": 93470720, + "tokens/train_per_sec_per_gpu": 3424.44, + "tokens/trainable": 9952758 + }, + { + "epoch": 2.2770700636942673, + "grad_norm": 0.1787109375, + "learning_rate": 3.315759580381228e-05, + "loss": 0.006136072333902121, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00615, + "step": 715, + "tokens/total": 93601792, + "tokens/train_per_sec_per_gpu": 3058.02, + "tokens/trainable": 9965569 + }, + { + "epoch": 2.2802547770700636, + "grad_norm": 0.140625, + "learning_rate": 3.310503432927322e-05, + "loss": 0.004970425274223089, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00498, + "step": 716, + "tokens/total": 93732864, + "tokens/train_per_sec_per_gpu": 3320.74, + "tokens/trainable": 9979461 + }, + { + "epoch": 2.28343949044586, + "grad_norm": 0.201171875, + "learning_rate": 3.305243278927711e-05, + "loss": 0.006117875222116709, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00614, + "step": 717, + "tokens/total": 93863936, + "tokens/train_per_sec_per_gpu": 3419.81, + "tokens/trainable": 9993781 + }, + { + "epoch": 2.286624203821656, + "grad_norm": 0.1474609375, + "learning_rate": 3.299979144384808e-05, + "loss": 0.005094599910080433, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00511, + "step": 718, + "tokens/total": 93995008, + "tokens/train_per_sec_per_gpu": 3621.22, + "tokens/trainable": 10008873 + }, + { + "epoch": 2.289808917197452, + "grad_norm": 0.150390625, + "learning_rate": 3.29471105532071e-05, + "loss": 0.005003094207495451, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00502, + "step": 719, + "tokens/total": 94126080, + "tokens/train_per_sec_per_gpu": 3296.22, + "tokens/trainable": 10022678 + }, + { + "epoch": 2.2929936305732483, + "grad_norm": 0.16015625, + "learning_rate": 3.2894390377770556e-05, + "loss": 0.005475780460983515, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00549, + "step": 720, + "tokens/total": 94257152, + "tokens/train_per_sec_per_gpu": 3130.91, + "tokens/trainable": 10035849 + }, + { + "epoch": 2.2961783439490446, + "grad_norm": 0.1796875, + "learning_rate": 3.284163117814906e-05, + "loss": 0.005412337835878134, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00543, + "step": 721, + "tokens/total": 94388224, + "tokens/train_per_sec_per_gpu": 3388.52, + "tokens/trainable": 10050035 + }, + { + "epoch": 2.299363057324841, + "grad_norm": 0.15625, + "learning_rate": 3.278883321514613e-05, + "loss": 0.005983334966003895, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.006, + "step": 722, + "tokens/total": 94519296, + "tokens/train_per_sec_per_gpu": 3388.95, + "tokens/trainable": 10064219 + }, + { + "epoch": 2.3025477707006368, + "grad_norm": 0.1865234375, + "learning_rate": 3.27359967497569e-05, + "loss": 0.006622286047786474, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00664, + "step": 723, + "tokens/total": 94650368, + "tokens/train_per_sec_per_gpu": 3043.49, + "tokens/trainable": 10077049 + }, + { + "epoch": 2.305732484076433, + "grad_norm": 0.15234375, + "learning_rate": 3.268312204316684e-05, + "loss": 0.005963774397969246, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00598, + "step": 724, + "tokens/total": 94781440, + "tokens/train_per_sec_per_gpu": 3577.18, + "tokens/trainable": 10091953 + }, + { + "epoch": 2.3089171974522293, + "grad_norm": 0.15625, + "learning_rate": 3.263020935675043e-05, + "loss": 0.003999189008027315, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00401, + "step": 725, + "tokens/total": 94912512, + "tokens/train_per_sec_per_gpu": 3227.08, + "tokens/trainable": 10105451 + }, + { + "epoch": 2.3121019108280256, + "grad_norm": 0.1337890625, + "learning_rate": 3.2577258952069934e-05, + "loss": 0.0032455208711326122, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00325, + "step": 726, + "tokens/total": 95043584, + "tokens/train_per_sec_per_gpu": 3151.03, + "tokens/trainable": 10118668 + }, + { + "epoch": 2.3152866242038215, + "grad_norm": 0.1611328125, + "learning_rate": 3.252427109087403e-05, + "loss": 0.004745165351778269, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00476, + "step": 727, + "tokens/total": 95174656, + "tokens/train_per_sec_per_gpu": 3383.32, + "tokens/trainable": 10132813 + }, + { + "epoch": 2.3184713375796178, + "grad_norm": 0.1630859375, + "learning_rate": 3.247124603509659e-05, + "loss": 0.004897519946098328, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00491, + "step": 728, + "tokens/total": 95305728, + "tokens/train_per_sec_per_gpu": 3389.91, + "tokens/trainable": 10147011 + }, + { + "epoch": 2.321656050955414, + "grad_norm": 0.1396484375, + "learning_rate": 3.241818404685531e-05, + "loss": 0.0032559458632022142, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00326, + "step": 729, + "tokens/total": 95436800, + "tokens/train_per_sec_per_gpu": 3355.7, + "tokens/trainable": 10161054 + }, + { + "epoch": 2.3248407643312103, + "grad_norm": 0.2001953125, + "learning_rate": 3.236508538845049e-05, + "loss": 0.007957718335092068, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00799, + "step": 730, + "tokens/total": 95567872, + "tokens/train_per_sec_per_gpu": 3374.92, + "tokens/trainable": 10175261 + }, + { + "epoch": 2.328025477707006, + "grad_norm": 0.1552734375, + "learning_rate": 3.2311950322363685e-05, + "loss": 0.004248796030879021, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00426, + "step": 731, + "tokens/total": 95698944, + "tokens/train_per_sec_per_gpu": 2920.63, + "tokens/trainable": 10187549 + }, + { + "epoch": 2.3312101910828025, + "grad_norm": 0.1689453125, + "learning_rate": 3.225877911125642e-05, + "loss": 0.0069992574863135815, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00702, + "step": 732, + "tokens/total": 95830016, + "tokens/train_per_sec_per_gpu": 3329.17, + "tokens/trainable": 10201476 + }, + { + "epoch": 2.3343949044585988, + "grad_norm": 0.142578125, + "learning_rate": 3.2205572017968895e-05, + "loss": 0.0038090457674115896, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00382, + "step": 733, + "tokens/total": 95961088, + "tokens/train_per_sec_per_gpu": 3853.91, + "tokens/trainable": 10217517 + }, + { + "epoch": 2.337579617834395, + "grad_norm": 0.2041015625, + "learning_rate": 3.21523293055187e-05, + "loss": 0.005244470667093992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00526, + "step": 734, + "tokens/total": 96092160, + "tokens/train_per_sec_per_gpu": 3372.34, + "tokens/trainable": 10231610 + }, + { + "epoch": 2.340764331210191, + "grad_norm": 0.224609375, + "learning_rate": 3.2099051237099475e-05, + "loss": 0.007509202696382999, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00754, + "step": 735, + "tokens/total": 96223232, + "tokens/train_per_sec_per_gpu": 3226.63, + "tokens/trainable": 10245132 + }, + { + "epoch": 2.343949044585987, + "grad_norm": 0.1396484375, + "learning_rate": 3.204573807607967e-05, + "loss": 0.004419627133756876, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00443, + "step": 736, + "tokens/total": 96354304, + "tokens/train_per_sec_per_gpu": 3439.56, + "tokens/trainable": 10259460 + }, + { + "epoch": 2.3471337579617835, + "grad_norm": 0.125, + "learning_rate": 3.199239008600117e-05, + "loss": 0.0039891735650599, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.004, + "step": 737, + "tokens/total": 96485376, + "tokens/train_per_sec_per_gpu": 3495.62, + "tokens/trainable": 10274090 + }, + { + "epoch": 2.3503184713375798, + "grad_norm": 0.1572265625, + "learning_rate": 3.193900753057805e-05, + "loss": 0.004535307642072439, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00455, + "step": 738, + "tokens/total": 96616448, + "tokens/train_per_sec_per_gpu": 3314.96, + "tokens/trainable": 10287987 + }, + { + "epoch": 2.3535031847133756, + "grad_norm": 0.1552734375, + "learning_rate": 3.188559067369525e-05, + "loss": 0.004258223343640566, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00427, + "step": 739, + "tokens/total": 96747520, + "tokens/train_per_sec_per_gpu": 3314.93, + "tokens/trainable": 10301858 + }, + { + "epoch": 2.356687898089172, + "grad_norm": 0.1611328125, + "learning_rate": 3.183213977940726e-05, + "loss": 0.00545046990737319, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00547, + "step": 740, + "tokens/total": 96878592, + "tokens/train_per_sec_per_gpu": 3349.75, + "tokens/trainable": 10315882 + }, + { + "epoch": 2.359872611464968, + "grad_norm": 0.255859375, + "learning_rate": 3.1778655111936866e-05, + "loss": 0.005119058303534985, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00513, + "step": 741, + "tokens/total": 97009664, + "tokens/train_per_sec_per_gpu": 3190.1, + "tokens/trainable": 10329249 + }, + { + "epoch": 2.3630573248407645, + "grad_norm": 0.150390625, + "learning_rate": 3.172513693567375e-05, + "loss": 0.004317954182624817, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00433, + "step": 742, + "tokens/total": 97140736, + "tokens/train_per_sec_per_gpu": 3570.82, + "tokens/trainable": 10344194 + }, + { + "epoch": 2.3662420382165603, + "grad_norm": 0.1494140625, + "learning_rate": 3.167158551517326e-05, + "loss": 0.004607304465025663, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00462, + "step": 743, + "tokens/total": 97271808, + "tokens/train_per_sec_per_gpu": 2783.4, + "tokens/trainable": 10355961 + }, + { + "epoch": 2.3694267515923566, + "grad_norm": 0.185546875, + "learning_rate": 3.1618001115155095e-05, + "loss": 0.00533033162355423, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00534, + "step": 744, + "tokens/total": 97402880, + "tokens/train_per_sec_per_gpu": 3466.49, + "tokens/trainable": 10370470 + }, + { + "epoch": 2.372611464968153, + "grad_norm": 0.154296875, + "learning_rate": 3.1564384000501954e-05, + "loss": 0.003959702793508768, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00397, + "step": 745, + "tokens/total": 97533952, + "tokens/train_per_sec_per_gpu": 3617.49, + "tokens/trainable": 10385521 + }, + { + "epoch": 2.375796178343949, + "grad_norm": 0.166015625, + "learning_rate": 3.151073443625828e-05, + "loss": 0.006154323928058147, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00617, + "step": 746, + "tokens/total": 97665024, + "tokens/train_per_sec_per_gpu": 3457.18, + "tokens/trainable": 10400021 + }, + { + "epoch": 2.3789808917197455, + "grad_norm": 0.1748046875, + "learning_rate": 3.1457052687628905e-05, + "loss": 0.0052504888735711575, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00526, + "step": 747, + "tokens/total": 97796096, + "tokens/train_per_sec_per_gpu": 2728.56, + "tokens/trainable": 10411548 + }, + { + "epoch": 2.3821656050955413, + "grad_norm": 0.1435546875, + "learning_rate": 3.140333901997776e-05, + "loss": 0.004432502668350935, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00444, + "step": 748, + "tokens/total": 97927168, + "tokens/train_per_sec_per_gpu": 3271.0, + "tokens/trainable": 10425216 + }, + { + "epoch": 2.3853503184713376, + "grad_norm": 0.1865234375, + "learning_rate": 3.1349593698826566e-05, + "loss": 0.006921032909303904, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00695, + "step": 749, + "tokens/total": 98058240, + "tokens/train_per_sec_per_gpu": 3261.31, + "tokens/trainable": 10438873 + }, + { + "epoch": 2.388535031847134, + "grad_norm": 0.216796875, + "learning_rate": 3.1295816989853514e-05, + "loss": 0.004738848190754652, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00475, + "step": 750, + "tokens/total": 98189312, + "tokens/train_per_sec_per_gpu": 3321.82, + "tokens/trainable": 10452755 + }, + { + "epoch": 2.3917197452229297, + "grad_norm": 0.1708984375, + "learning_rate": 3.124200915889195e-05, + "loss": 0.0069868722930550575, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00701, + "step": 751, + "tokens/total": 98320384, + "tokens/train_per_sec_per_gpu": 3436.55, + "tokens/trainable": 10467133 + }, + { + "epoch": 2.394904458598726, + "grad_norm": 0.1142578125, + "learning_rate": 3.118817047192907e-05, + "loss": 0.0037817361298948526, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00379, + "step": 752, + "tokens/total": 98451456, + "tokens/train_per_sec_per_gpu": 3295.93, + "tokens/trainable": 10480957 + }, + { + "epoch": 2.3980891719745223, + "grad_norm": 0.2109375, + "learning_rate": 3.11343011951046e-05, + "loss": 0.006747876293957233, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00677, + "step": 753, + "tokens/total": 98582528, + "tokens/train_per_sec_per_gpu": 3181.77, + "tokens/trainable": 10494276 + }, + { + "epoch": 2.4012738853503186, + "grad_norm": 0.1650390625, + "learning_rate": 3.108040159470949e-05, + "loss": 0.005729879718273878, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00575, + "step": 754, + "tokens/total": 98713600, + "tokens/train_per_sec_per_gpu": 3542.0, + "tokens/trainable": 10509041 + }, + { + "epoch": 2.404458598726115, + "grad_norm": 0.193359375, + "learning_rate": 3.1026471937184554e-05, + "loss": 0.005885195918381214, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0059, + "step": 755, + "tokens/total": 98844672, + "tokens/train_per_sec_per_gpu": 3159.44, + "tokens/trainable": 10522288 + }, + { + "epoch": 2.4076433121019107, + "grad_norm": 0.1630859375, + "learning_rate": 3.097251248911922e-05, + "loss": 0.005482829641550779, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0055, + "step": 756, + "tokens/total": 98975744, + "tokens/train_per_sec_per_gpu": 3443.41, + "tokens/trainable": 10536684 + }, + { + "epoch": 2.410828025477707, + "grad_norm": 0.150390625, + "learning_rate": 3.091852351725018e-05, + "loss": 0.003930831328034401, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00394, + "step": 757, + "tokens/total": 99106816, + "tokens/train_per_sec_per_gpu": 3398.83, + "tokens/trainable": 10550908 + }, + { + "epoch": 2.4140127388535033, + "grad_norm": 0.1708984375, + "learning_rate": 3.0864505288460034e-05, + "loss": 0.006072892341762781, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00609, + "step": 758, + "tokens/total": 99237888, + "tokens/train_per_sec_per_gpu": 3411.03, + "tokens/trainable": 10565215 + }, + { + "epoch": 2.417197452229299, + "grad_norm": 0.166015625, + "learning_rate": 3.0810458069776044e-05, + "loss": 0.0038501895032823086, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00386, + "step": 759, + "tokens/total": 99368960, + "tokens/train_per_sec_per_gpu": 3448.42, + "tokens/trainable": 10579654 + }, + { + "epoch": 2.4203821656050954, + "grad_norm": 0.1796875, + "learning_rate": 3.0756382128368765e-05, + "loss": 0.006182640325278044, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0062, + "step": 760, + "tokens/total": 99500032, + "tokens/train_per_sec_per_gpu": 3253.44, + "tokens/trainable": 10593291 + }, + { + "epoch": 2.4235668789808917, + "grad_norm": 0.1767578125, + "learning_rate": 3.070227773155074e-05, + "loss": 0.0059751239605247974, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00599, + "step": 761, + "tokens/total": 99631104, + "tokens/train_per_sec_per_gpu": 3587.67, + "tokens/trainable": 10608279 + }, + { + "epoch": 2.426751592356688, + "grad_norm": 0.1513671875, + "learning_rate": 3.064814514677517e-05, + "loss": 0.005476124584674835, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00549, + "step": 762, + "tokens/total": 99762176, + "tokens/train_per_sec_per_gpu": 3337.14, + "tokens/trainable": 10622276 + }, + { + "epoch": 2.4299363057324843, + "grad_norm": 0.2216796875, + "learning_rate": 3.0593984641634595e-05, + "loss": 0.007891716435551643, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00792, + "step": 763, + "tokens/total": 99893248, + "tokens/train_per_sec_per_gpu": 2999.6, + "tokens/trainable": 10634845 + }, + { + "epoch": 2.43312101910828, + "grad_norm": 0.130859375, + "learning_rate": 3.053979648385957e-05, + "loss": 0.004688839428126812, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0047, + "step": 764, + "tokens/total": 100024320, + "tokens/train_per_sec_per_gpu": 3484.41, + "tokens/trainable": 10649410 + }, + { + "epoch": 2.4363057324840764, + "grad_norm": 0.150390625, + "learning_rate": 3.048558094131737e-05, + "loss": 0.004935243632644415, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00495, + "step": 765, + "tokens/total": 100155392, + "tokens/train_per_sec_per_gpu": 3011.34, + "tokens/trainable": 10662117 + }, + { + "epoch": 2.4394904458598727, + "grad_norm": 0.1689453125, + "learning_rate": 3.0431338282010606e-05, + "loss": 0.004069786984473467, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00408, + "step": 766, + "tokens/total": 100286464, + "tokens/train_per_sec_per_gpu": 3252.38, + "tokens/trainable": 10675770 + }, + { + "epoch": 2.4426751592356686, + "grad_norm": 0.16796875, + "learning_rate": 3.0377068774075957e-05, + "loss": 0.005909848026931286, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00593, + "step": 767, + "tokens/total": 100417536, + "tokens/train_per_sec_per_gpu": 3084.65, + "tokens/trainable": 10688759 + }, + { + "epoch": 2.445859872611465, + "grad_norm": 0.1689453125, + "learning_rate": 3.0322772685782815e-05, + "loss": 0.005527772940695286, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00554, + "step": 768, + "tokens/total": 100548608, + "tokens/train_per_sec_per_gpu": 3143.83, + "tokens/trainable": 10701925 + }, + { + "epoch": 2.449044585987261, + "grad_norm": 0.1728515625, + "learning_rate": 3.0268450285531967e-05, + "loss": 0.005178853869438171, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00519, + "step": 769, + "tokens/total": 100679680, + "tokens/train_per_sec_per_gpu": 3510.23, + "tokens/trainable": 10716553 + }, + { + "epoch": 2.4522292993630574, + "grad_norm": 0.115234375, + "learning_rate": 3.021410184185427e-05, + "loss": 0.0034743379801511765, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00348, + "step": 770, + "tokens/total": 100810752, + "tokens/train_per_sec_per_gpu": 3316.12, + "tokens/trainable": 10730411 + }, + { + "epoch": 2.4554140127388537, + "grad_norm": 0.1650390625, + "learning_rate": 3.0159727623409313e-05, + "loss": 0.0041341050527989864, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00414, + "step": 771, + "tokens/total": 100941824, + "tokens/train_per_sec_per_gpu": 3027.88, + "tokens/trainable": 10743149 + }, + { + "epoch": 2.4585987261146496, + "grad_norm": 0.1513671875, + "learning_rate": 3.0105327898984102e-05, + "loss": 0.004606778733432293, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00462, + "step": 772, + "tokens/total": 101072896, + "tokens/train_per_sec_per_gpu": 3423.19, + "tokens/trainable": 10757437 + }, + { + "epoch": 2.461783439490446, + "grad_norm": 0.193359375, + "learning_rate": 3.005090293749174e-05, + "loss": 0.006537875160574913, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00656, + "step": 773, + "tokens/total": 101203968, + "tokens/train_per_sec_per_gpu": 3675.05, + "tokens/trainable": 10772736 + }, + { + "epoch": 2.464968152866242, + "grad_norm": 0.181640625, + "learning_rate": 2.9996453007970056e-05, + "loss": 0.006382662802934647, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0064, + "step": 774, + "tokens/total": 101335040, + "tokens/train_per_sec_per_gpu": 3821.84, + "tokens/trainable": 10788651 + }, + { + "epoch": 2.468152866242038, + "grad_norm": 0.16015625, + "learning_rate": 2.994197837958032e-05, + "loss": 0.005575335118919611, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00559, + "step": 775, + "tokens/total": 101466112, + "tokens/train_per_sec_per_gpu": 3358.0, + "tokens/trainable": 10802732 + }, + { + "epoch": 2.4713375796178343, + "grad_norm": 0.16015625, + "learning_rate": 2.9887479321605895e-05, + "loss": 0.005272061098366976, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00529, + "step": 776, + "tokens/total": 101597184, + "tokens/train_per_sec_per_gpu": 3377.54, + "tokens/trainable": 10816888 + }, + { + "epoch": 2.4745222929936306, + "grad_norm": 0.1376953125, + "learning_rate": 2.9832956103450905e-05, + "loss": 0.0034832200035452843, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00349, + "step": 777, + "tokens/total": 101728256, + "tokens/train_per_sec_per_gpu": 3313.99, + "tokens/trainable": 10830748 + }, + { + "epoch": 2.477707006369427, + "grad_norm": 0.166015625, + "learning_rate": 2.9778408994638906e-05, + "loss": 0.005426026880741119, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00544, + "step": 778, + "tokens/total": 101859328, + "tokens/train_per_sec_per_gpu": 3283.57, + "tokens/trainable": 10844538 + }, + { + "epoch": 2.480891719745223, + "grad_norm": 0.1748046875, + "learning_rate": 2.9723838264811545e-05, + "loss": 0.00458392733708024, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00459, + "step": 779, + "tokens/total": 101990400, + "tokens/train_per_sec_per_gpu": 3438.29, + "tokens/trainable": 10858901 + }, + { + "epoch": 2.484076433121019, + "grad_norm": 0.1875, + "learning_rate": 2.966924418372724e-05, + "loss": 0.006339904386550188, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00636, + "step": 780, + "tokens/total": 102121472, + "tokens/train_per_sec_per_gpu": 3321.39, + "tokens/trainable": 10873475 + }, + { + "epoch": 2.4872611464968153, + "grad_norm": 0.1826171875, + "learning_rate": 2.9614627021259846e-05, + "loss": 0.006326707080006599, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00635, + "step": 781, + "tokens/total": 102252544, + "tokens/train_per_sec_per_gpu": 3305.16, + "tokens/trainable": 10887308 + }, + { + "epoch": 2.4904458598726116, + "grad_norm": 0.1884765625, + "learning_rate": 2.9559987047397303e-05, + "loss": 0.006832771003246307, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00686, + "step": 782, + "tokens/total": 102383616, + "tokens/train_per_sec_per_gpu": 2932.75, + "tokens/trainable": 10899670 + }, + { + "epoch": 2.4936305732484074, + "grad_norm": 0.1357421875, + "learning_rate": 2.950532453224032e-05, + "loss": 0.003962225280702114, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00397, + "step": 783, + "tokens/total": 102514688, + "tokens/train_per_sec_per_gpu": 3082.59, + "tokens/trainable": 10912648 + }, + { + "epoch": 2.4968152866242037, + "grad_norm": 0.1533203125, + "learning_rate": 2.945063974600104e-05, + "loss": 0.005036994814872742, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00505, + "step": 784, + "tokens/total": 102645760, + "tokens/train_per_sec_per_gpu": 3643.04, + "tokens/trainable": 10927870 + }, + { + "epoch": 2.5, + "grad_norm": 0.173828125, + "learning_rate": 2.9395932959001692e-05, + "loss": 0.0055970605462789536, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00561, + "step": 785, + "tokens/total": 102776832, + "tokens/train_per_sec_per_gpu": 3510.01, + "tokens/trainable": 10942573 + }, + { + "epoch": 2.5, + "eval_loss": 0.00919391866773367, + "eval_ppl": 1.00924, + "eval_runtime": 41.9998, + "eval_samples_per_second": 64.31, + "eval_steps_per_second": 4.024, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 785 + }, + { + "epoch": 2.5031847133757963, + "grad_norm": 0.1689453125, + "learning_rate": 2.9341204441673266e-05, + "loss": 0.004385429434478283, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0044, + "step": 786, + "tokens/total": 102907904, + "tokens/train_per_sec_per_gpu": 3007.8, + "tokens/trainable": 10955164 + }, + { + "epoch": 2.5063694267515926, + "grad_norm": 0.2080078125, + "learning_rate": 2.9286454464554152e-05, + "loss": 0.006849427707493305, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00687, + "step": 787, + "tokens/total": 103038976, + "tokens/train_per_sec_per_gpu": 3371.47, + "tokens/trainable": 10969219 + }, + { + "epoch": 2.5095541401273884, + "grad_norm": 0.150390625, + "learning_rate": 2.9231683298288853e-05, + "loss": 0.005230756010860205, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00524, + "step": 788, + "tokens/total": 103170048, + "tokens/train_per_sec_per_gpu": 3590.97, + "tokens/trainable": 10984159 + }, + { + "epoch": 2.5127388535031847, + "grad_norm": 0.1533203125, + "learning_rate": 2.9176891213626595e-05, + "loss": 0.00515084620565176, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00516, + "step": 789, + "tokens/total": 103301120, + "tokens/train_per_sec_per_gpu": 3471.15, + "tokens/trainable": 10998703 + }, + { + "epoch": 2.515923566878981, + "grad_norm": 0.1513671875, + "learning_rate": 2.9122078481420012e-05, + "loss": 0.005567297339439392, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00558, + "step": 790, + "tokens/total": 103432192, + "tokens/train_per_sec_per_gpu": 3566.36, + "tokens/trainable": 11013580 + }, + { + "epoch": 2.519108280254777, + "grad_norm": 0.1650390625, + "learning_rate": 2.906724537262381e-05, + "loss": 0.005145716480910778, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00516, + "step": 791, + "tokens/total": 103563264, + "tokens/train_per_sec_per_gpu": 3399.88, + "tokens/trainable": 11027822 + }, + { + "epoch": 2.522292993630573, + "grad_norm": 0.1162109375, + "learning_rate": 2.901239215829341e-05, + "loss": 0.0032891561277210712, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00329, + "step": 792, + "tokens/total": 103694336, + "tokens/train_per_sec_per_gpu": 3050.99, + "tokens/trainable": 11040610 + }, + { + "epoch": 2.5254777070063694, + "grad_norm": 0.166015625, + "learning_rate": 2.895751910958364e-05, + "loss": 0.005250695627182722, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00526, + "step": 793, + "tokens/total": 103825408, + "tokens/train_per_sec_per_gpu": 3579.93, + "tokens/trainable": 11055502 + }, + { + "epoch": 2.5286624203821657, + "grad_norm": 0.1865234375, + "learning_rate": 2.8902626497747366e-05, + "loss": 0.005496595986187458, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00551, + "step": 794, + "tokens/total": 103956480, + "tokens/train_per_sec_per_gpu": 3718.84, + "tokens/trainable": 11070940 + }, + { + "epoch": 2.531847133757962, + "grad_norm": 0.1630859375, + "learning_rate": 2.8847714594134144e-05, + "loss": 0.006310721859335899, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00633, + "step": 795, + "tokens/total": 104087552, + "tokens/train_per_sec_per_gpu": 3917.97, + "tokens/trainable": 11087217 + }, + { + "epoch": 2.535031847133758, + "grad_norm": 0.1767578125, + "learning_rate": 2.8792783670188927e-05, + "loss": 0.005432881880551577, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00545, + "step": 796, + "tokens/total": 104218624, + "tokens/train_per_sec_per_gpu": 3115.17, + "tokens/trainable": 11100308 + }, + { + "epoch": 2.538216560509554, + "grad_norm": 0.1748046875, + "learning_rate": 2.873783399745066e-05, + "loss": 0.005197789054363966, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00521, + "step": 797, + "tokens/total": 104349696, + "tokens/train_per_sec_per_gpu": 3361.6, + "tokens/trainable": 11114376 + }, + { + "epoch": 2.5414012738853504, + "grad_norm": 0.1767578125, + "learning_rate": 2.868286584755099e-05, + "loss": 0.005434297490864992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00545, + "step": 798, + "tokens/total": 104480768, + "tokens/train_per_sec_per_gpu": 3144.12, + "tokens/trainable": 11127547 + }, + { + "epoch": 2.5445859872611463, + "grad_norm": 0.1005859375, + "learning_rate": 2.862787949221288e-05, + "loss": 0.0028167557902634144, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00282, + "step": 799, + "tokens/total": 104611840, + "tokens/train_per_sec_per_gpu": 3311.57, + "tokens/trainable": 11141342 + }, + { + "epoch": 2.5477707006369426, + "grad_norm": 0.1376953125, + "learning_rate": 2.857287520324931e-05, + "loss": 0.0033000826369971037, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00331, + "step": 800, + "tokens/total": 104742912, + "tokens/train_per_sec_per_gpu": 3334.18, + "tokens/trainable": 11155303 + }, + { + "epoch": 2.550955414012739, + "grad_norm": 0.1279296875, + "learning_rate": 2.8517853252561906e-05, + "loss": 0.004212173167616129, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00422, + "step": 801, + "tokens/total": 104873984, + "tokens/train_per_sec_per_gpu": 3504.45, + "tokens/trainable": 11169930 + }, + { + "epoch": 2.554140127388535, + "grad_norm": 0.166015625, + "learning_rate": 2.8462813912139586e-05, + "loss": 0.005329788196831942, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00534, + "step": 802, + "tokens/total": 105005056, + "tokens/train_per_sec_per_gpu": 3283.1, + "tokens/trainable": 11183661 + }, + { + "epoch": 2.5573248407643314, + "grad_norm": 0.134765625, + "learning_rate": 2.8407757454057248e-05, + "loss": 0.0038679102435708046, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00388, + "step": 803, + "tokens/total": 105136128, + "tokens/train_per_sec_per_gpu": 3540.39, + "tokens/trainable": 11198409 + }, + { + "epoch": 2.5605095541401273, + "grad_norm": 0.1787109375, + "learning_rate": 2.83526841504744e-05, + "loss": 0.004407938569784164, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00442, + "step": 804, + "tokens/total": 105267200, + "tokens/train_per_sec_per_gpu": 3017.63, + "tokens/trainable": 11211070 + }, + { + "epoch": 2.5636942675159236, + "grad_norm": 0.16015625, + "learning_rate": 2.8297594273633816e-05, + "loss": 0.004717926029115915, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00473, + "step": 805, + "tokens/total": 105398272, + "tokens/train_per_sec_per_gpu": 3319.15, + "tokens/trainable": 11224899 + }, + { + "epoch": 2.56687898089172, + "grad_norm": 0.1875, + "learning_rate": 2.824248809586021e-05, + "loss": 0.005949638783931732, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00597, + "step": 806, + "tokens/total": 105529344, + "tokens/train_per_sec_per_gpu": 3475.48, + "tokens/trainable": 11239401 + }, + { + "epoch": 2.5700636942675157, + "grad_norm": 0.1787109375, + "learning_rate": 2.8187365889558858e-05, + "loss": 0.004526551812887192, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00454, + "step": 807, + "tokens/total": 105660416, + "tokens/train_per_sec_per_gpu": 3481.42, + "tokens/trainable": 11253859 + }, + { + "epoch": 2.573248407643312, + "grad_norm": 0.1552734375, + "learning_rate": 2.81322279272143e-05, + "loss": 0.005305514670908451, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00532, + "step": 808, + "tokens/total": 105791488, + "tokens/train_per_sec_per_gpu": 3355.57, + "tokens/trainable": 11267936 + }, + { + "epoch": 2.5764331210191083, + "grad_norm": 0.189453125, + "learning_rate": 2.8077074481388927e-05, + "loss": 0.003922187723219395, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00393, + "step": 809, + "tokens/total": 105922560, + "tokens/train_per_sec_per_gpu": 3053.79, + "tokens/trainable": 11280737 + }, + { + "epoch": 2.5796178343949046, + "grad_norm": 0.1416015625, + "learning_rate": 2.802190582472168e-05, + "loss": 0.004988102242350578, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.005, + "step": 810, + "tokens/total": 106053632, + "tokens/train_per_sec_per_gpu": 3417.76, + "tokens/trainable": 11295020 + }, + { + "epoch": 2.582802547770701, + "grad_norm": 0.1494140625, + "learning_rate": 2.7966722229926712e-05, + "loss": 0.002851355355232954, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00286, + "step": 811, + "tokens/total": 106184704, + "tokens/train_per_sec_per_gpu": 3055.95, + "tokens/trainable": 11307828 + }, + { + "epoch": 2.5859872611464967, + "grad_norm": 0.16015625, + "learning_rate": 2.7911523969791997e-05, + "loss": 0.00479587959125638, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00481, + "step": 812, + "tokens/total": 106315776, + "tokens/train_per_sec_per_gpu": 3332.89, + "tokens/trainable": 11321718 + }, + { + "epoch": 2.589171974522293, + "grad_norm": 0.1533203125, + "learning_rate": 2.7856311317178002e-05, + "loss": 0.00479497155174613, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00481, + "step": 813, + "tokens/total": 106446848, + "tokens/train_per_sec_per_gpu": 3224.61, + "tokens/trainable": 11335234 + }, + { + "epoch": 2.5923566878980893, + "grad_norm": 0.1767578125, + "learning_rate": 2.7801084545016364e-05, + "loss": 0.005322256591171026, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00534, + "step": 814, + "tokens/total": 106577920, + "tokens/train_per_sec_per_gpu": 3067.32, + "tokens/trainable": 11348079 + }, + { + "epoch": 2.595541401273885, + "grad_norm": 0.162109375, + "learning_rate": 2.774584392630849e-05, + "loss": 0.004532738588750362, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00454, + "step": 815, + "tokens/total": 106708992, + "tokens/train_per_sec_per_gpu": 3239.47, + "tokens/trainable": 11361632 + }, + { + "epoch": 2.5987261146496814, + "grad_norm": 0.1748046875, + "learning_rate": 2.769058973412424e-05, + "loss": 0.005558821838349104, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00557, + "step": 816, + "tokens/total": 106840064, + "tokens/train_per_sec_per_gpu": 3432.78, + "tokens/trainable": 11376015 + }, + { + "epoch": 2.6019108280254777, + "grad_norm": 0.23046875, + "learning_rate": 2.7635322241600603e-05, + "loss": 0.008326980285346508, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00836, + "step": 817, + "tokens/total": 106971136, + "tokens/train_per_sec_per_gpu": 3064.13, + "tokens/trainable": 11388898 + }, + { + "epoch": 2.605095541401274, + "grad_norm": 0.1796875, + "learning_rate": 2.7580041721940264e-05, + "loss": 0.005567263346165419, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00558, + "step": 818, + "tokens/total": 107102208, + "tokens/train_per_sec_per_gpu": 3339.15, + "tokens/trainable": 11402886 + }, + { + "epoch": 2.6082802547770703, + "grad_norm": 0.10693359375, + "learning_rate": 2.7524748448410337e-05, + "loss": 0.0028434821870177984, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00285, + "step": 819, + "tokens/total": 107233280, + "tokens/train_per_sec_per_gpu": 3304.29, + "tokens/trainable": 11416722 + }, + { + "epoch": 2.611464968152866, + "grad_norm": 0.1748046875, + "learning_rate": 2.7469442694340984e-05, + "loss": 0.0058287507854402065, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00585, + "step": 820, + "tokens/total": 107364352, + "tokens/train_per_sec_per_gpu": 3313.02, + "tokens/trainable": 11430598 + }, + { + "epoch": 2.6146496815286624, + "grad_norm": 0.15625, + "learning_rate": 2.7414124733124046e-05, + "loss": 0.004522873554378748, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00453, + "step": 821, + "tokens/total": 107495424, + "tokens/train_per_sec_per_gpu": 3603.28, + "tokens/trainable": 11445614 + }, + { + "epoch": 2.6178343949044587, + "grad_norm": 0.1298828125, + "learning_rate": 2.735879483821171e-05, + "loss": 0.004399726167321205, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00441, + "step": 822, + "tokens/total": 107626496, + "tokens/train_per_sec_per_gpu": 3592.02, + "tokens/trainable": 11460570 + }, + { + "epoch": 2.6210191082802545, + "grad_norm": 0.13671875, + "learning_rate": 2.7303453283115177e-05, + "loss": 0.004378693178296089, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00439, + "step": 823, + "tokens/total": 107757568, + "tokens/train_per_sec_per_gpu": 3491.55, + "tokens/trainable": 11475144 + }, + { + "epoch": 2.624203821656051, + "grad_norm": 0.1650390625, + "learning_rate": 2.7248100341403247e-05, + "loss": 0.0058170161210000515, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00583, + "step": 824, + "tokens/total": 107888640, + "tokens/train_per_sec_per_gpu": 3439.8, + "tokens/trainable": 11489495 + }, + { + "epoch": 2.627388535031847, + "grad_norm": 0.13671875, + "learning_rate": 2.7192736286701042e-05, + "loss": 0.0035439918283373117, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00355, + "step": 825, + "tokens/total": 108019712, + "tokens/train_per_sec_per_gpu": 3559.22, + "tokens/trainable": 11504322 + }, + { + "epoch": 2.6305732484076434, + "grad_norm": 0.1435546875, + "learning_rate": 2.7137361392688613e-05, + "loss": 0.004517707973718643, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00453, + "step": 826, + "tokens/total": 108150784, + "tokens/train_per_sec_per_gpu": 3300.49, + "tokens/trainable": 11518136 + }, + { + "epoch": 2.6337579617834397, + "grad_norm": 0.13671875, + "learning_rate": 2.7081975933099573e-05, + "loss": 0.005291810724884272, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00531, + "step": 827, + "tokens/total": 108281856, + "tokens/train_per_sec_per_gpu": 3485.86, + "tokens/trainable": 11532728 + }, + { + "epoch": 2.6369426751592355, + "grad_norm": 0.123046875, + "learning_rate": 2.7026580181719774e-05, + "loss": 0.0031160882208496332, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00312, + "step": 828, + "tokens/total": 108412928, + "tokens/train_per_sec_per_gpu": 3088.88, + "tokens/trainable": 11545669 + }, + { + "epoch": 2.640127388535032, + "grad_norm": 0.1748046875, + "learning_rate": 2.697117441238597e-05, + "loss": 0.004703770391643047, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00471, + "step": 829, + "tokens/total": 108544000, + "tokens/train_per_sec_per_gpu": 3390.81, + "tokens/trainable": 11559794 + }, + { + "epoch": 2.643312101910828, + "grad_norm": 0.1875, + "learning_rate": 2.6915758898984384e-05, + "loss": 0.006808799225836992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00683, + "step": 830, + "tokens/total": 108675072, + "tokens/train_per_sec_per_gpu": 3340.4, + "tokens/trainable": 11573796 + }, + { + "epoch": 2.646496815286624, + "grad_norm": 0.212890625, + "learning_rate": 2.686033391544945e-05, + "loss": 0.005405929870903492, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00542, + "step": 831, + "tokens/total": 108806144, + "tokens/train_per_sec_per_gpu": 3397.97, + "tokens/trainable": 11588025 + }, + { + "epoch": 2.6496815286624202, + "grad_norm": 0.1611328125, + "learning_rate": 2.6804899735762405e-05, + "loss": 0.006530239712446928, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00655, + "step": 832, + "tokens/total": 108937216, + "tokens/train_per_sec_per_gpu": 3240.0, + "tokens/trainable": 11601588 + }, + { + "epoch": 2.6528662420382165, + "grad_norm": 0.142578125, + "learning_rate": 2.6749456633949932e-05, + "loss": 0.0037627576384693384, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00377, + "step": 833, + "tokens/total": 109068288, + "tokens/train_per_sec_per_gpu": 3152.61, + "tokens/trainable": 11614787 + }, + { + "epoch": 2.656050955414013, + "grad_norm": 0.158203125, + "learning_rate": 2.6694004884082825e-05, + "loss": 0.0034914424177259207, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0035, + "step": 834, + "tokens/total": 109199360, + "tokens/train_per_sec_per_gpu": 3518.76, + "tokens/trainable": 11629463 + }, + { + "epoch": 2.659235668789809, + "grad_norm": 0.1318359375, + "learning_rate": 2.663854476027465e-05, + "loss": 0.004583639558404684, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00459, + "step": 835, + "tokens/total": 109330432, + "tokens/train_per_sec_per_gpu": 3674.53, + "tokens/trainable": 11644760 + }, + { + "epoch": 2.662420382165605, + "grad_norm": 0.21875, + "learning_rate": 2.6583076536680323e-05, + "loss": 0.007365885656327009, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00739, + "step": 836, + "tokens/total": 109461504, + "tokens/train_per_sec_per_gpu": 3133.81, + "tokens/trainable": 11657934 + }, + { + "epoch": 2.6656050955414012, + "grad_norm": 0.1396484375, + "learning_rate": 2.652760048749483e-05, + "loss": 0.004122959915548563, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00413, + "step": 837, + "tokens/total": 109592576, + "tokens/train_per_sec_per_gpu": 3445.42, + "tokens/trainable": 11672312 + }, + { + "epoch": 2.6687898089171975, + "grad_norm": 0.150390625, + "learning_rate": 2.647211688695186e-05, + "loss": 0.005676808767020702, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00569, + "step": 838, + "tokens/total": 109723648, + "tokens/train_per_sec_per_gpu": 3626.52, + "tokens/trainable": 11687353 + }, + { + "epoch": 2.6719745222929934, + "grad_norm": 0.2138671875, + "learning_rate": 2.6416626009322375e-05, + "loss": 0.005739385262131691, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00576, + "step": 839, + "tokens/total": 109854720, + "tokens/train_per_sec_per_gpu": 3288.19, + "tokens/trainable": 11701118 + }, + { + "epoch": 2.6751592356687897, + "grad_norm": 0.1689453125, + "learning_rate": 2.6361128128913347e-05, + "loss": 0.00492321141064167, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00494, + "step": 840, + "tokens/total": 109985792, + "tokens/train_per_sec_per_gpu": 3438.3, + "tokens/trainable": 11715518 + }, + { + "epoch": 2.678343949044586, + "grad_norm": 0.1611328125, + "learning_rate": 2.6305623520066382e-05, + "loss": 0.0048889112658798695, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0049, + "step": 841, + "tokens/total": 110116864, + "tokens/train_per_sec_per_gpu": 3362.44, + "tokens/trainable": 11729606 + }, + { + "epoch": 2.6815286624203822, + "grad_norm": 0.189453125, + "learning_rate": 2.6250112457156296e-05, + "loss": 0.005592016503214836, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00561, + "step": 842, + "tokens/total": 110247936, + "tokens/train_per_sec_per_gpu": 2882.24, + "tokens/trainable": 11741702 + }, + { + "epoch": 2.6847133757961785, + "grad_norm": 0.1787109375, + "learning_rate": 2.619459521458984e-05, + "loss": 0.0058587053790688515, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00588, + "step": 843, + "tokens/total": 110379008, + "tokens/train_per_sec_per_gpu": 3463.6, + "tokens/trainable": 11756164 + }, + { + "epoch": 2.6878980891719744, + "grad_norm": 0.1796875, + "learning_rate": 2.6139072066804332e-05, + "loss": 0.004927500616759062, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00494, + "step": 844, + "tokens/total": 110510080, + "tokens/train_per_sec_per_gpu": 3446.83, + "tokens/trainable": 11770544 + }, + { + "epoch": 2.6910828025477707, + "grad_norm": 0.2109375, + "learning_rate": 2.6083543288266233e-05, + "loss": 0.007675695698708296, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00771, + "step": 845, + "tokens/total": 110641152, + "tokens/train_per_sec_per_gpu": 3146.49, + "tokens/trainable": 11783721 + }, + { + "epoch": 2.694267515923567, + "grad_norm": 0.2470703125, + "learning_rate": 2.602800915346986e-05, + "loss": 0.004762662574648857, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00477, + "step": 846, + "tokens/total": 110772224, + "tokens/train_per_sec_per_gpu": 3111.12, + "tokens/trainable": 11796753 + }, + { + "epoch": 2.697452229299363, + "grad_norm": 0.1865234375, + "learning_rate": 2.5972469936936046e-05, + "loss": 0.006559155881404877, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00658, + "step": 847, + "tokens/total": 110903296, + "tokens/train_per_sec_per_gpu": 3445.43, + "tokens/trainable": 11811161 + }, + { + "epoch": 2.700636942675159, + "grad_norm": 0.185546875, + "learning_rate": 2.5916925913210677e-05, + "loss": 0.005181832704693079, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0052, + "step": 848, + "tokens/total": 111034368, + "tokens/train_per_sec_per_gpu": 3072.78, + "tokens/trainable": 11824046 + }, + { + "epoch": 2.7038216560509554, + "grad_norm": 0.1640625, + "learning_rate": 2.5861377356863437e-05, + "loss": 0.005784741137176752, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0058, + "step": 849, + "tokens/total": 111165440, + "tokens/train_per_sec_per_gpu": 3386.17, + "tokens/trainable": 11838220 + }, + { + "epoch": 2.7070063694267517, + "grad_norm": 0.181640625, + "learning_rate": 2.5805824542486434e-05, + "loss": 0.006970499642193317, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00699, + "step": 850, + "tokens/total": 111296512, + "tokens/train_per_sec_per_gpu": 3653.3, + "tokens/trainable": 11853456 + }, + { + "epoch": 2.710191082802548, + "grad_norm": 0.1806640625, + "learning_rate": 2.5750267744692786e-05, + "loss": 0.005797088146209717, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00581, + "step": 851, + "tokens/total": 111427584, + "tokens/train_per_sec_per_gpu": 3852.62, + "tokens/trainable": 11869442 + }, + { + "epoch": 2.713375796178344, + "grad_norm": 0.138671875, + "learning_rate": 2.5694707238115323e-05, + "loss": 0.003937084693461657, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00394, + "step": 852, + "tokens/total": 111558656, + "tokens/train_per_sec_per_gpu": 3164.69, + "tokens/trainable": 11882674 + }, + { + "epoch": 2.71656050955414, + "grad_norm": 0.177734375, + "learning_rate": 2.5639143297405222e-05, + "loss": 0.004891657270491123, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0049, + "step": 853, + "tokens/total": 111689728, + "tokens/train_per_sec_per_gpu": 3223.75, + "tokens/trainable": 11896189 + }, + { + "epoch": 2.7197452229299364, + "grad_norm": 0.14453125, + "learning_rate": 2.5583576197230603e-05, + "loss": 0.003982385154813528, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00399, + "step": 854, + "tokens/total": 111820800, + "tokens/train_per_sec_per_gpu": 3223.56, + "tokens/trainable": 11909688 + }, + { + "epoch": 2.722929936305732, + "grad_norm": 0.1484375, + "learning_rate": 2.5528006212275218e-05, + "loss": 0.0039648148231208324, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00397, + "step": 855, + "tokens/total": 111951872, + "tokens/train_per_sec_per_gpu": 3301.63, + "tokens/trainable": 11923437 + }, + { + "epoch": 2.7261146496815285, + "grad_norm": 0.193359375, + "learning_rate": 2.5472433617237107e-05, + "loss": 0.006385331507772207, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00641, + "step": 856, + "tokens/total": 112082944, + "tokens/train_per_sec_per_gpu": 3430.81, + "tokens/trainable": 11937805 + }, + { + "epoch": 2.729299363057325, + "grad_norm": 0.162109375, + "learning_rate": 2.541685868682716e-05, + "loss": 0.005221599247306585, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00524, + "step": 857, + "tokens/total": 112214016, + "tokens/train_per_sec_per_gpu": 3535.81, + "tokens/trainable": 11952610 + }, + { + "epoch": 2.732484076433121, + "grad_norm": 0.1650390625, + "learning_rate": 2.5361281695767854e-05, + "loss": 0.004517777357250452, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00453, + "step": 858, + "tokens/total": 112345088, + "tokens/train_per_sec_per_gpu": 3097.23, + "tokens/trainable": 11965620 + }, + { + "epoch": 2.7356687898089174, + "grad_norm": 0.1328125, + "learning_rate": 2.530570291879184e-05, + "loss": 0.003382981289178133, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00339, + "step": 859, + "tokens/total": 112476160, + "tokens/train_per_sec_per_gpu": 3500.78, + "tokens/trainable": 11980302 + }, + { + "epoch": 2.738853503184713, + "grad_norm": 0.162109375, + "learning_rate": 2.5250122630640587e-05, + "loss": 0.005662713665515184, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00568, + "step": 860, + "tokens/total": 112607232, + "tokens/train_per_sec_per_gpu": 3446.65, + "tokens/trainable": 11994679 + }, + { + "epoch": 2.7420382165605095, + "grad_norm": 0.15625, + "learning_rate": 2.519454110606304e-05, + "loss": 0.004983518272638321, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.005, + "step": 861, + "tokens/total": 112738304, + "tokens/train_per_sec_per_gpu": 3673.71, + "tokens/trainable": 12009986 + }, + { + "epoch": 2.745222929936306, + "grad_norm": 0.146484375, + "learning_rate": 2.5138958619814275e-05, + "loss": 0.004905369598418474, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00492, + "step": 862, + "tokens/total": 112869376, + "tokens/train_per_sec_per_gpu": 3063.86, + "tokens/trainable": 12022816 + }, + { + "epoch": 2.7484076433121016, + "grad_norm": 0.16796875, + "learning_rate": 2.5083375446654083e-05, + "loss": 0.006565258372575045, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00659, + "step": 863, + "tokens/total": 113000448, + "tokens/train_per_sec_per_gpu": 3636.75, + "tokens/trainable": 12037957 + }, + { + "epoch": 2.7515923566878984, + "grad_norm": 0.142578125, + "learning_rate": 2.502779186134568e-05, + "loss": 0.004305466078221798, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00431, + "step": 864, + "tokens/total": 113131520, + "tokens/train_per_sec_per_gpu": 3370.27, + "tokens/trainable": 12052017 + }, + { + "epoch": 2.754777070063694, + "grad_norm": 0.130859375, + "learning_rate": 2.497220813865432e-05, + "loss": 0.0037764415610581636, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00378, + "step": 865, + "tokens/total": 113262592, + "tokens/train_per_sec_per_gpu": 3212.97, + "tokens/trainable": 12065431 + }, + { + "epoch": 2.7579617834394905, + "grad_norm": 0.1689453125, + "learning_rate": 2.491662455334592e-05, + "loss": 0.005136569030582905, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00515, + "step": 866, + "tokens/total": 113393664, + "tokens/train_per_sec_per_gpu": 3273.22, + "tokens/trainable": 12079122 + }, + { + "epoch": 2.761146496815287, + "grad_norm": 0.1513671875, + "learning_rate": 2.4861041380185738e-05, + "loss": 0.003261574311181903, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00327, + "step": 867, + "tokens/total": 113524736, + "tokens/train_per_sec_per_gpu": 3109.77, + "tokens/trainable": 12092147 + }, + { + "epoch": 2.7643312101910826, + "grad_norm": 0.2021484375, + "learning_rate": 2.4805458893936963e-05, + "loss": 0.0064933402463793755, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00651, + "step": 868, + "tokens/total": 113655808, + "tokens/train_per_sec_per_gpu": 3269.96, + "tokens/trainable": 12105826 + }, + { + "epoch": 2.767515923566879, + "grad_norm": 0.140625, + "learning_rate": 2.474987736935942e-05, + "loss": 0.004877043422311544, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00489, + "step": 869, + "tokens/total": 113786880, + "tokens/train_per_sec_per_gpu": 3459.54, + "tokens/trainable": 12120248 + }, + { + "epoch": 2.770700636942675, + "grad_norm": 0.15625, + "learning_rate": 2.469429708120817e-05, + "loss": 0.004386639688163996, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0044, + "step": 870, + "tokens/total": 113917952, + "tokens/train_per_sec_per_gpu": 3176.3, + "tokens/trainable": 12133552 + }, + { + "epoch": 2.7738853503184715, + "grad_norm": 0.166015625, + "learning_rate": 2.463871830423215e-05, + "loss": 0.00508409459143877, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0051, + "step": 871, + "tokens/total": 114049024, + "tokens/train_per_sec_per_gpu": 3403.46, + "tokens/trainable": 12147799 + }, + { + "epoch": 2.777070063694268, + "grad_norm": 0.17578125, + "learning_rate": 2.4583141313172842e-05, + "loss": 0.003352643456310034, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00336, + "step": 872, + "tokens/total": 114180096, + "tokens/train_per_sec_per_gpu": 3192.03, + "tokens/trainable": 12161167 + }, + { + "epoch": 2.7802547770700636, + "grad_norm": 0.1962890625, + "learning_rate": 2.4527566382762902e-05, + "loss": 0.005316773895174265, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00533, + "step": 873, + "tokens/total": 114311168, + "tokens/train_per_sec_per_gpu": 3192.46, + "tokens/trainable": 12174546 + }, + { + "epoch": 2.78343949044586, + "grad_norm": 0.1298828125, + "learning_rate": 2.4471993787724777e-05, + "loss": 0.00329143856652081, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0033, + "step": 874, + "tokens/total": 114442240, + "tokens/train_per_sec_per_gpu": 3208.11, + "tokens/trainable": 12187974 + }, + { + "epoch": 2.786624203821656, + "grad_norm": 0.162109375, + "learning_rate": 2.4416423802769403e-05, + "loss": 0.0036203130148351192, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00363, + "step": 875, + "tokens/total": 114573312, + "tokens/train_per_sec_per_gpu": 2915.58, + "tokens/trainable": 12200203 + }, + { + "epoch": 2.789808917197452, + "grad_norm": 0.1259765625, + "learning_rate": 2.436085670259479e-05, + "loss": 0.003102727932855487, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00311, + "step": 876, + "tokens/total": 114704384, + "tokens/train_per_sec_per_gpu": 3023.1, + "tokens/trainable": 12212847 + }, + { + "epoch": 2.7929936305732483, + "grad_norm": 0.2080078125, + "learning_rate": 2.4305292761884676e-05, + "loss": 0.005169394891709089, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00518, + "step": 877, + "tokens/total": 114835456, + "tokens/train_per_sec_per_gpu": 3140.34, + "tokens/trainable": 12226003 + }, + { + "epoch": 2.7961783439490446, + "grad_norm": 0.1728515625, + "learning_rate": 2.4249732255307216e-05, + "loss": 0.004676941316574812, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00469, + "step": 878, + "tokens/total": 114966528, + "tokens/train_per_sec_per_gpu": 2960.89, + "tokens/trainable": 12238405 + }, + { + "epoch": 2.799363057324841, + "grad_norm": 0.2001953125, + "learning_rate": 2.4194175457513575e-05, + "loss": 0.005923910532146692, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00594, + "step": 879, + "tokens/total": 115097600, + "tokens/train_per_sec_per_gpu": 3330.91, + "tokens/trainable": 12252333 + }, + { + "epoch": 2.802547770700637, + "grad_norm": 0.16015625, + "learning_rate": 2.4138622643136562e-05, + "loss": 0.004777503665536642, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00479, + "step": 880, + "tokens/total": 115228672, + "tokens/train_per_sec_per_gpu": 3471.55, + "tokens/trainable": 12266874 + }, + { + "epoch": 2.805732484076433, + "grad_norm": 0.15234375, + "learning_rate": 2.4083074086789332e-05, + "loss": 0.004388585686683655, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0044, + "step": 881, + "tokens/total": 115359744, + "tokens/train_per_sec_per_gpu": 3109.72, + "tokens/trainable": 12279904 + }, + { + "epoch": 2.8089171974522293, + "grad_norm": 0.2041015625, + "learning_rate": 2.4027530063063966e-05, + "loss": 0.00651566544547677, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00654, + "step": 882, + "tokens/total": 115490816, + "tokens/train_per_sec_per_gpu": 3355.68, + "tokens/trainable": 12293954 + }, + { + "epoch": 2.8121019108280256, + "grad_norm": 0.14453125, + "learning_rate": 2.3971990846530134e-05, + "loss": 0.0046853781677782536, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0047, + "step": 883, + "tokens/total": 115621888, + "tokens/train_per_sec_per_gpu": 3459.29, + "tokens/trainable": 12308453 + }, + { + "epoch": 2.8152866242038215, + "grad_norm": 0.1748046875, + "learning_rate": 2.3916456711733776e-05, + "loss": 0.004514369182288647, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00452, + "step": 884, + "tokens/total": 115752960, + "tokens/train_per_sec_per_gpu": 3622.31, + "tokens/trainable": 12323539 + }, + { + "epoch": 2.8184713375796178, + "grad_norm": 0.130859375, + "learning_rate": 2.386092793319568e-05, + "loss": 0.004971818067133427, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00498, + "step": 885, + "tokens/total": 115884032, + "tokens/train_per_sec_per_gpu": 3500.64, + "tokens/trainable": 12338133 + }, + { + "epoch": 2.821656050955414, + "grad_norm": 0.150390625, + "learning_rate": 2.3805404785410157e-05, + "loss": 0.004273276310414076, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00428, + "step": 886, + "tokens/total": 116015104, + "tokens/train_per_sec_per_gpu": 3731.5, + "tokens/trainable": 12353671 + }, + { + "epoch": 2.8248407643312103, + "grad_norm": 0.130859375, + "learning_rate": 2.374988754284371e-05, + "loss": 0.0031330641359090805, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00314, + "step": 887, + "tokens/total": 116146176, + "tokens/train_per_sec_per_gpu": 3214.83, + "tokens/trainable": 12367172 + }, + { + "epoch": 2.8280254777070066, + "grad_norm": 0.1708984375, + "learning_rate": 2.369437647993363e-05, + "loss": 0.007122378330677748, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00715, + "step": 888, + "tokens/total": 116277248, + "tokens/train_per_sec_per_gpu": 3830.69, + "tokens/trainable": 12383092 + }, + { + "epoch": 2.8312101910828025, + "grad_norm": 0.1435546875, + "learning_rate": 2.3638871871086652e-05, + "loss": 0.003396370681002736, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0034, + "step": 889, + "tokens/total": 116408320, + "tokens/train_per_sec_per_gpu": 3247.86, + "tokens/trainable": 12396628 + }, + { + "epoch": 2.8343949044585988, + "grad_norm": 0.1748046875, + "learning_rate": 2.358337399067763e-05, + "loss": 0.00505115557461977, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00506, + "step": 890, + "tokens/total": 116539392, + "tokens/train_per_sec_per_gpu": 3332.28, + "tokens/trainable": 12410665 + }, + { + "epoch": 2.837579617834395, + "grad_norm": 0.1259765625, + "learning_rate": 2.3527883113048154e-05, + "loss": 0.0035984639544039965, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0036, + "step": 891, + "tokens/total": 116670464, + "tokens/train_per_sec_per_gpu": 3242.03, + "tokens/trainable": 12424229 + }, + { + "epoch": 2.840764331210191, + "grad_norm": 0.2109375, + "learning_rate": 2.3472399512505165e-05, + "loss": 0.007709989324212074, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00774, + "step": 892, + "tokens/total": 116801536, + "tokens/train_per_sec_per_gpu": 3025.59, + "tokens/trainable": 12436996 + }, + { + "epoch": 2.843949044585987, + "grad_norm": 0.1669921875, + "learning_rate": 2.3416923463319686e-05, + "loss": 0.00600704038515687, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00603, + "step": 893, + "tokens/total": 116932608, + "tokens/train_per_sec_per_gpu": 3495.42, + "tokens/trainable": 12451599 + }, + { + "epoch": 2.8471337579617835, + "grad_norm": 0.1474609375, + "learning_rate": 2.3361455239725364e-05, + "loss": 0.0037581382784992456, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00377, + "step": 894, + "tokens/total": 117063680, + "tokens/train_per_sec_per_gpu": 3183.5, + "tokens/trainable": 12464960 + }, + { + "epoch": 2.8503184713375798, + "grad_norm": 0.177734375, + "learning_rate": 2.3305995115917177e-05, + "loss": 0.004449051804840565, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00446, + "step": 895, + "tokens/total": 117194752, + "tokens/train_per_sec_per_gpu": 3342.72, + "tokens/trainable": 12478964 + }, + { + "epoch": 2.853503184713376, + "grad_norm": 0.12890625, + "learning_rate": 2.3250543366050074e-05, + "loss": 0.004355857148766518, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00437, + "step": 896, + "tokens/total": 117325824, + "tokens/train_per_sec_per_gpu": 3500.36, + "tokens/trainable": 12493585 + }, + { + "epoch": 2.856687898089172, + "grad_norm": 0.138671875, + "learning_rate": 2.3195100264237607e-05, + "loss": 0.004324641078710556, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00433, + "step": 897, + "tokens/total": 117456896, + "tokens/train_per_sec_per_gpu": 3278.5, + "tokens/trainable": 12507318 + }, + { + "epoch": 2.859872611464968, + "grad_norm": 0.2392578125, + "learning_rate": 2.3139666084550553e-05, + "loss": 0.005408423021435738, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00542, + "step": 898, + "tokens/total": 117587968, + "tokens/train_per_sec_per_gpu": 3103.8, + "tokens/trainable": 12520317 + }, + { + "epoch": 2.8630573248407645, + "grad_norm": 0.1796875, + "learning_rate": 2.308424110101562e-05, + "loss": 0.005885708145797253, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0059, + "step": 899, + "tokens/total": 117719040, + "tokens/train_per_sec_per_gpu": 3937.35, + "tokens/trainable": 12536633 + }, + { + "epoch": 2.8662420382165603, + "grad_norm": 0.1904296875, + "learning_rate": 2.3028825587614044e-05, + "loss": 0.0059039052575826645, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00592, + "step": 900, + "tokens/total": 117850112, + "tokens/train_per_sec_per_gpu": 3269.26, + "tokens/trainable": 12550322 + }, + { + "epoch": 2.8694267515923566, + "grad_norm": 0.1630859375, + "learning_rate": 2.2973419818280225e-05, + "loss": 0.004266998264938593, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00428, + "step": 901, + "tokens/total": 117981184, + "tokens/train_per_sec_per_gpu": 2909.52, + "tokens/trainable": 12562584 + }, + { + "epoch": 2.872611464968153, + "grad_norm": 0.19140625, + "learning_rate": 2.2918024066900433e-05, + "loss": 0.005715237930417061, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00573, + "step": 902, + "tokens/total": 118112256, + "tokens/train_per_sec_per_gpu": 3359.95, + "tokens/trainable": 12576629 + }, + { + "epoch": 2.875796178343949, + "grad_norm": 0.12158203125, + "learning_rate": 2.28626386073114e-05, + "loss": 0.0025465991348028183, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00255, + "step": 903, + "tokens/total": 118243328, + "tokens/train_per_sec_per_gpu": 3119.57, + "tokens/trainable": 12589699 + }, + { + "epoch": 2.8789808917197455, + "grad_norm": 0.1328125, + "learning_rate": 2.2807263713298957e-05, + "loss": 0.003974359482526779, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00398, + "step": 904, + "tokens/total": 118374400, + "tokens/train_per_sec_per_gpu": 3276.02, + "tokens/trainable": 12603410 + }, + { + "epoch": 2.8821656050955413, + "grad_norm": 0.1328125, + "learning_rate": 2.2751899658596755e-05, + "loss": 0.004021751694381237, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00403, + "step": 905, + "tokens/total": 118505472, + "tokens/train_per_sec_per_gpu": 3674.77, + "tokens/trainable": 12618803 + }, + { + "epoch": 2.8853503184713376, + "grad_norm": 0.1435546875, + "learning_rate": 2.2696546716884835e-05, + "loss": 0.003338857088238001, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00334, + "step": 906, + "tokens/total": 118636544, + "tokens/train_per_sec_per_gpu": 2909.02, + "tokens/trainable": 12631025 + }, + { + "epoch": 2.888535031847134, + "grad_norm": 0.12451171875, + "learning_rate": 2.2641205161788287e-05, + "loss": 0.0033922025468200445, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0034, + "step": 907, + "tokens/total": 118767616, + "tokens/train_per_sec_per_gpu": 3269.87, + "tokens/trainable": 12644738 + }, + { + "epoch": 2.8917197452229297, + "grad_norm": 0.1484375, + "learning_rate": 2.2585875266875956e-05, + "loss": 0.005157338920980692, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00517, + "step": 908, + "tokens/total": 118898688, + "tokens/train_per_sec_per_gpu": 3300.06, + "tokens/trainable": 12658494 + }, + { + "epoch": 2.894904458598726, + "grad_norm": 0.1943359375, + "learning_rate": 2.253055730565902e-05, + "loss": 0.006811058614403009, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00683, + "step": 909, + "tokens/total": 119029760, + "tokens/train_per_sec_per_gpu": 3546.15, + "tokens/trainable": 12673250 + }, + { + "epoch": 2.8980891719745223, + "grad_norm": 0.14453125, + "learning_rate": 2.2475251551589662e-05, + "loss": 0.003177374368533492, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00318, + "step": 910, + "tokens/total": 119160832, + "tokens/train_per_sec_per_gpu": 3016.5, + "tokens/trainable": 12685906 + }, + { + "epoch": 2.9012738853503186, + "grad_norm": 0.1669921875, + "learning_rate": 2.241995827805974e-05, + "loss": 0.005059496965259314, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00507, + "step": 911, + "tokens/total": 119291904, + "tokens/train_per_sec_per_gpu": 3674.19, + "tokens/trainable": 12701221 + }, + { + "epoch": 2.904458598726115, + "grad_norm": 0.126953125, + "learning_rate": 2.2364677758399406e-05, + "loss": 0.0032712582033127546, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00328, + "step": 912, + "tokens/total": 119422976, + "tokens/train_per_sec_per_gpu": 3246.71, + "tokens/trainable": 12714734 + }, + { + "epoch": 2.9076433121019107, + "grad_norm": 0.212890625, + "learning_rate": 2.230941026587576e-05, + "loss": 0.007138803135603666, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00716, + "step": 913, + "tokens/total": 119554048, + "tokens/train_per_sec_per_gpu": 3265.13, + "tokens/trainable": 12728429 + }, + { + "epoch": 2.910828025477707, + "grad_norm": 0.1708984375, + "learning_rate": 2.2254156073691518e-05, + "loss": 0.00541570782661438, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00543, + "step": 914, + "tokens/total": 119685120, + "tokens/train_per_sec_per_gpu": 3389.26, + "tokens/trainable": 12742539 + }, + { + "epoch": 2.9140127388535033, + "grad_norm": 0.1640625, + "learning_rate": 2.219891545498365e-05, + "loss": 0.0042840586975216866, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00429, + "step": 915, + "tokens/total": 119816192, + "tokens/train_per_sec_per_gpu": 3318.34, + "tokens/trainable": 12756353 + }, + { + "epoch": 2.917197452229299, + "grad_norm": 0.17578125, + "learning_rate": 2.2143688682822e-05, + "loss": 0.005752744618803263, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00577, + "step": 916, + "tokens/total": 119947264, + "tokens/train_per_sec_per_gpu": 3560.06, + "tokens/trainable": 12771211 + }, + { + "epoch": 2.9203821656050954, + "grad_norm": 0.1728515625, + "learning_rate": 2.2088476030208012e-05, + "loss": 0.003762285690754652, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00377, + "step": 917, + "tokens/total": 120078336, + "tokens/train_per_sec_per_gpu": 2930.2, + "tokens/trainable": 12783504 + }, + { + "epoch": 2.9235668789808917, + "grad_norm": 0.11767578125, + "learning_rate": 2.2033277770073297e-05, + "loss": 0.0025295563973486423, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00253, + "step": 918, + "tokens/total": 120209408, + "tokens/train_per_sec_per_gpu": 3098.34, + "tokens/trainable": 12796457 + }, + { + "epoch": 2.926751592356688, + "grad_norm": 0.1337890625, + "learning_rate": 2.1978094175278323e-05, + "loss": 0.004149306565523148, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00416, + "step": 919, + "tokens/total": 120340480, + "tokens/train_per_sec_per_gpu": 3238.11, + "tokens/trainable": 12810005 + }, + { + "epoch": 2.9299363057324843, + "grad_norm": 0.1826171875, + "learning_rate": 2.192292551861108e-05, + "loss": 0.006155917886644602, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00617, + "step": 920, + "tokens/total": 120471552, + "tokens/train_per_sec_per_gpu": 3351.96, + "tokens/trainable": 12824046 + }, + { + "epoch": 2.93312101910828, + "grad_norm": 0.140625, + "learning_rate": 2.1867772072785708e-05, + "loss": 0.005103899631649256, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00512, + "step": 921, + "tokens/total": 120602624, + "tokens/train_per_sec_per_gpu": 3263.43, + "tokens/trainable": 12837714 + }, + { + "epoch": 2.9363057324840764, + "grad_norm": 0.171875, + "learning_rate": 2.181263411044114e-05, + "loss": 0.004437371157109737, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00445, + "step": 922, + "tokens/total": 120733696, + "tokens/train_per_sec_per_gpu": 3276.05, + "tokens/trainable": 12851431 + }, + { + "epoch": 2.9394904458598727, + "grad_norm": 0.1689453125, + "learning_rate": 2.1757511904139793e-05, + "loss": 0.005264171864837408, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00528, + "step": 923, + "tokens/total": 120864768, + "tokens/train_per_sec_per_gpu": 3525.14, + "tokens/trainable": 12866186 + }, + { + "epoch": 2.9426751592356686, + "grad_norm": 0.16796875, + "learning_rate": 2.1702405726366193e-05, + "loss": 0.0048398361541330814, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00485, + "step": 924, + "tokens/total": 120995840, + "tokens/train_per_sec_per_gpu": 3474.68, + "tokens/trainable": 12880741 + }, + { + "epoch": 2.945859872611465, + "grad_norm": 0.1513671875, + "learning_rate": 2.1647315849525606e-05, + "loss": 0.0037978398613631725, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00381, + "step": 925, + "tokens/total": 121126912, + "tokens/train_per_sec_per_gpu": 3132.29, + "tokens/trainable": 12893946 + }, + { + "epoch": 2.949044585987261, + "grad_norm": 0.1650390625, + "learning_rate": 2.1592242545942755e-05, + "loss": 0.005401961971074343, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00542, + "step": 926, + "tokens/total": 121257984, + "tokens/train_per_sec_per_gpu": 3184.76, + "tokens/trainable": 12907278 + }, + { + "epoch": 2.9522292993630574, + "grad_norm": 0.1474609375, + "learning_rate": 2.1537186087860423e-05, + "loss": 0.005091848783195019, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0051, + "step": 927, + "tokens/total": 121389056, + "tokens/train_per_sec_per_gpu": 3525.92, + "tokens/trainable": 12921953 + }, + { + "epoch": 2.9554140127388537, + "grad_norm": 0.162109375, + "learning_rate": 2.14821467474381e-05, + "loss": 0.005307201761752367, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00532, + "step": 928, + "tokens/total": 121520128, + "tokens/train_per_sec_per_gpu": 3465.19, + "tokens/trainable": 12936423 + }, + { + "epoch": 2.9585987261146496, + "grad_norm": 0.12109375, + "learning_rate": 2.1427124796750696e-05, + "loss": 0.002976613584905863, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00298, + "step": 929, + "tokens/total": 121651200, + "tokens/train_per_sec_per_gpu": 3415.41, + "tokens/trainable": 12950697 + }, + { + "epoch": 2.961783439490446, + "grad_norm": 0.2021484375, + "learning_rate": 2.1372120507787134e-05, + "loss": 0.004961484577506781, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00497, + "step": 930, + "tokens/total": 121782272, + "tokens/train_per_sec_per_gpu": 3237.03, + "tokens/trainable": 12964260 + }, + { + "epoch": 2.964968152866242, + "grad_norm": 0.193359375, + "learning_rate": 2.131713415244902e-05, + "loss": 0.0067651160061359406, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00679, + "step": 931, + "tokens/total": 121913344, + "tokens/train_per_sec_per_gpu": 3323.81, + "tokens/trainable": 12978164 + }, + { + "epoch": 2.968152866242038, + "grad_norm": 0.166015625, + "learning_rate": 2.1262166002549344e-05, + "loss": 0.005593163892626762, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00561, + "step": 932, + "tokens/total": 122044416, + "tokens/train_per_sec_per_gpu": 3178.79, + "tokens/trainable": 12991495 + }, + { + "epoch": 2.9713375796178343, + "grad_norm": 0.177734375, + "learning_rate": 2.1207216329811082e-05, + "loss": 0.0055503519251942635, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00557, + "step": 933, + "tokens/total": 122175488, + "tokens/train_per_sec_per_gpu": 2983.66, + "tokens/trainable": 13003996 + }, + { + "epoch": 2.9745222929936306, + "grad_norm": 0.162109375, + "learning_rate": 2.115228540586586e-05, + "loss": 0.004628556780517101, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00464, + "step": 934, + "tokens/total": 122306560, + "tokens/train_per_sec_per_gpu": 3348.38, + "tokens/trainable": 13017998 + }, + { + "epoch": 2.977707006369427, + "grad_norm": 0.146484375, + "learning_rate": 2.109737350225264e-05, + "loss": 0.0036150780506432056, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00362, + "step": 935, + "tokens/total": 122437632, + "tokens/train_per_sec_per_gpu": 3386.9, + "tokens/trainable": 13032100 + }, + { + "epoch": 2.980891719745223, + "grad_norm": 0.15234375, + "learning_rate": 2.1042480890416368e-05, + "loss": 0.004233770538121462, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00424, + "step": 936, + "tokens/total": 122568704, + "tokens/train_per_sec_per_gpu": 3171.53, + "tokens/trainable": 13045341 + }, + { + "epoch": 2.984076433121019, + "grad_norm": 0.1728515625, + "learning_rate": 2.0987607841706595e-05, + "loss": 0.004372127819806337, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00438, + "step": 937, + "tokens/total": 122699776, + "tokens/train_per_sec_per_gpu": 3077.26, + "tokens/trainable": 13058291 + }, + { + "epoch": 2.9872611464968153, + "grad_norm": 0.154296875, + "learning_rate": 2.09327546273762e-05, + "loss": 0.005242812447249889, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00526, + "step": 938, + "tokens/total": 122830848, + "tokens/train_per_sec_per_gpu": 3362.79, + "tokens/trainable": 13072317 + }, + { + "epoch": 2.9904458598726116, + "grad_norm": 0.150390625, + "learning_rate": 2.087792151858e-05, + "loss": 0.0044011822901666164, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00441, + "step": 939, + "tokens/total": 122961920, + "tokens/train_per_sec_per_gpu": 3314.79, + "tokens/trainable": 13086158 + }, + { + "epoch": 2.9936305732484074, + "grad_norm": 0.1650390625, + "learning_rate": 2.0823108786373414e-05, + "loss": 0.004296471830457449, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00431, + "step": 940, + "tokens/total": 123092992, + "tokens/train_per_sec_per_gpu": 3532.4, + "tokens/trainable": 13100899 + }, + { + "epoch": 2.9968152866242037, + "grad_norm": 0.134765625, + "learning_rate": 2.0768316701711153e-05, + "loss": 0.0038203117437660694, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00383, + "step": 941, + "tokens/total": 123224064, + "tokens/train_per_sec_per_gpu": 3339.11, + "tokens/trainable": 13115218 + }, + { + "epoch": 3.0, + "grad_norm": 0.2158203125, + "learning_rate": 2.0713545535445857e-05, + "loss": 0.005111368373036385, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 39.25, + "memory/max_allocated (GiB)": 39.25, + "ppl": 1.00512, + "step": 942, + "tokens/total": 123297792, + "tokens/train_per_sec_per_gpu": 3851.53, + "tokens/trainable": 13124028 + }, + { + "epoch": 3.0, + "eval_loss": 0.008717856369912624, + "eval_ppl": 1.00876, + "eval_runtime": 41.6707, + "eval_samples_per_second": 64.818, + "eval_steps_per_second": 4.056, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 942 + }, + { + "epoch": 3.0031847133757963, + "grad_norm": 0.111328125, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.0027498805429786444, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00275, + "step": 943, + "tokens/total": 123428864, + "tokens/train_per_sec_per_gpu": 3250.23, + "tokens/trainable": 13137492 + }, + { + "epoch": 3.0063694267515926, + "grad_norm": 0.08642578125, + "learning_rate": 2.0604067040998314e-05, + "loss": 0.002591141266748309, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00259, + "step": 944, + "tokens/total": 123559936, + "tokens/train_per_sec_per_gpu": 3658.64, + "tokens/trainable": 13152727 + }, + { + "epoch": 3.0095541401273884, + "grad_norm": 0.11328125, + "learning_rate": 2.054936025399897e-05, + "loss": 0.0033186483196914196, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00332, + "step": 945, + "tokens/total": 123691008, + "tokens/train_per_sec_per_gpu": 3830.57, + "tokens/trainable": 13168699 + }, + { + "epoch": 3.0127388535031847, + "grad_norm": 0.1318359375, + "learning_rate": 2.049467546775968e-05, + "loss": 0.0039662388153374195, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00397, + "step": 946, + "tokens/total": 123822080, + "tokens/train_per_sec_per_gpu": 3532.79, + "tokens/trainable": 13183492 + }, + { + "epoch": 3.015923566878981, + "grad_norm": 0.10986328125, + "learning_rate": 2.0440012952602706e-05, + "loss": 0.003088605822995305, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00309, + "step": 947, + "tokens/total": 123953152, + "tokens/train_per_sec_per_gpu": 3257.92, + "tokens/trainable": 13197146 + }, + { + "epoch": 3.0191082802547773, + "grad_norm": 0.12890625, + "learning_rate": 2.0385372978740167e-05, + "loss": 0.0031338452827185392, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00314, + "step": 948, + "tokens/total": 124084224, + "tokens/train_per_sec_per_gpu": 3231.23, + "tokens/trainable": 13210673 + }, + { + "epoch": 3.022292993630573, + "grad_norm": 0.123046875, + "learning_rate": 2.033075581627276e-05, + "loss": 0.0032858422491699457, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00329, + "step": 949, + "tokens/total": 124215296, + "tokens/train_per_sec_per_gpu": 3298.75, + "tokens/trainable": 13224347 + }, + { + "epoch": 3.0254777070063694, + "grad_norm": 0.10205078125, + "learning_rate": 2.0276161735188458e-05, + "loss": 0.0026432094164192677, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00265, + "step": 950, + "tokens/total": 124346368, + "tokens/train_per_sec_per_gpu": 3518.56, + "tokens/trainable": 13238926 + }, + { + "epoch": 3.0286624203821657, + "grad_norm": 0.1279296875, + "learning_rate": 2.0221591005361104e-05, + "loss": 0.0035607037134468555, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00357, + "step": 951, + "tokens/total": 124477440, + "tokens/train_per_sec_per_gpu": 3364.92, + "tokens/trainable": 13252966 + }, + { + "epoch": 3.031847133757962, + "grad_norm": 0.140625, + "learning_rate": 2.0167043896549097e-05, + "loss": 0.004281069617718458, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00429, + "step": 952, + "tokens/total": 124608512, + "tokens/train_per_sec_per_gpu": 3140.17, + "tokens/trainable": 13266012 + }, + { + "epoch": 3.035031847133758, + "grad_norm": 0.140625, + "learning_rate": 2.0112520678394107e-05, + "loss": 0.003244205377995968, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00325, + "step": 953, + "tokens/total": 124739584, + "tokens/train_per_sec_per_gpu": 3319.22, + "tokens/trainable": 13279830 + }, + { + "epoch": 3.038216560509554, + "grad_norm": 0.1357421875, + "learning_rate": 2.005802162041969e-05, + "loss": 0.0033878230024129152, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00339, + "step": 954, + "tokens/total": 124870656, + "tokens/train_per_sec_per_gpu": 3384.16, + "tokens/trainable": 13293926 + }, + { + "epoch": 3.0414012738853504, + "grad_norm": 0.134765625, + "learning_rate": 2.0003546992029953e-05, + "loss": 0.002641953295096755, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00265, + "step": 955, + "tokens/total": 125001728, + "tokens/train_per_sec_per_gpu": 2720.72, + "tokens/trainable": 13305413 + }, + { + "epoch": 3.0445859872611467, + "grad_norm": 0.138671875, + "learning_rate": 1.9949097062508267e-05, + "loss": 0.003417475149035454, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00342, + "step": 956, + "tokens/total": 125132800, + "tokens/train_per_sec_per_gpu": 3595.19, + "tokens/trainable": 13320381 + }, + { + "epoch": 3.0477707006369426, + "grad_norm": 0.1064453125, + "learning_rate": 1.9894672101015904e-05, + "loss": 0.002634722040966153, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00264, + "step": 957, + "tokens/total": 125263872, + "tokens/train_per_sec_per_gpu": 3285.4, + "tokens/trainable": 13334096 + }, + { + "epoch": 3.050955414012739, + "grad_norm": 0.1669921875, + "learning_rate": 1.9840272376590693e-05, + "loss": 0.0045495470985770226, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00456, + "step": 958, + "tokens/total": 125394944, + "tokens/train_per_sec_per_gpu": 3160.53, + "tokens/trainable": 13347392 + }, + { + "epoch": 3.054140127388535, + "grad_norm": 0.126953125, + "learning_rate": 1.9785898158145738e-05, + "loss": 0.0035640057176351547, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00357, + "step": 959, + "tokens/total": 125526016, + "tokens/train_per_sec_per_gpu": 3402.01, + "tokens/trainable": 13361641 + }, + { + "epoch": 3.0573248407643314, + "grad_norm": 0.12890625, + "learning_rate": 1.9731549714468045e-05, + "loss": 0.003452250501140952, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00346, + "step": 960, + "tokens/total": 125657088, + "tokens/train_per_sec_per_gpu": 3116.14, + "tokens/trainable": 13374682 + }, + { + "epoch": 3.0605095541401273, + "grad_norm": 0.1220703125, + "learning_rate": 1.9677227314217188e-05, + "loss": 0.0024322110693901777, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00244, + "step": 961, + "tokens/total": 125788160, + "tokens/train_per_sec_per_gpu": 2974.51, + "tokens/trainable": 13387164 + }, + { + "epoch": 3.0636942675159236, + "grad_norm": 0.12451171875, + "learning_rate": 1.962293122592405e-05, + "loss": 0.00328466366045177, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00329, + "step": 962, + "tokens/total": 125919232, + "tokens/train_per_sec_per_gpu": 3223.72, + "tokens/trainable": 13400626 + }, + { + "epoch": 3.06687898089172, + "grad_norm": 0.1171875, + "learning_rate": 1.9568661717989407e-05, + "loss": 0.0021802615374326706, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00218, + "step": 963, + "tokens/total": 126050304, + "tokens/train_per_sec_per_gpu": 3537.27, + "tokens/trainable": 13415382 + }, + { + "epoch": 3.070063694267516, + "grad_norm": 0.150390625, + "learning_rate": 1.951441905868264e-05, + "loss": 0.003219526493921876, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00322, + "step": 964, + "tokens/total": 126181376, + "tokens/train_per_sec_per_gpu": 3173.36, + "tokens/trainable": 13428745 + }, + { + "epoch": 3.073248407643312, + "grad_norm": 0.130859375, + "learning_rate": 1.9460203516140433e-05, + "loss": 0.0025150931905955076, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00252, + "step": 965, + "tokens/total": 126312448, + "tokens/train_per_sec_per_gpu": 3357.46, + "tokens/trainable": 13442783 + }, + { + "epoch": 3.0764331210191083, + "grad_norm": 0.13671875, + "learning_rate": 1.940601535836542e-05, + "loss": 0.002752315253019333, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00276, + "step": 966, + "tokens/total": 126443520, + "tokens/train_per_sec_per_gpu": 3434.62, + "tokens/trainable": 13457099 + }, + { + "epoch": 3.0796178343949046, + "grad_norm": 0.11865234375, + "learning_rate": 1.9351854853224837e-05, + "loss": 0.002302248729392886, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0023, + "step": 967, + "tokens/total": 126574592, + "tokens/train_per_sec_per_gpu": 3069.39, + "tokens/trainable": 13470035 + }, + { + "epoch": 3.082802547770701, + "grad_norm": 0.138671875, + "learning_rate": 1.9297722268449264e-05, + "loss": 0.00326096941716969, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00327, + "step": 968, + "tokens/total": 126705664, + "tokens/train_per_sec_per_gpu": 3547.61, + "tokens/trainable": 13484891 + }, + { + "epoch": 3.0859872611464967, + "grad_norm": 0.14453125, + "learning_rate": 1.9243617871631245e-05, + "loss": 0.0029772731941193342, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00298, + "step": 969, + "tokens/total": 126836736, + "tokens/train_per_sec_per_gpu": 3593.08, + "tokens/trainable": 13499838 + }, + { + "epoch": 3.089171974522293, + "grad_norm": 0.12890625, + "learning_rate": 1.9189541930223965e-05, + "loss": 0.0024753999896347523, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00248, + "step": 970, + "tokens/total": 126967808, + "tokens/train_per_sec_per_gpu": 3311.76, + "tokens/trainable": 13513723 + }, + { + "epoch": 3.0923566878980893, + "grad_norm": 0.134765625, + "learning_rate": 1.9135494711539975e-05, + "loss": 0.003328888211399317, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00333, + "step": 971, + "tokens/total": 127098880, + "tokens/train_per_sec_per_gpu": 3188.59, + "tokens/trainable": 13527089 + }, + { + "epoch": 3.0955414012738856, + "grad_norm": 0.10693359375, + "learning_rate": 1.9081476482749838e-05, + "loss": 0.0020992374047636986, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0021, + "step": 972, + "tokens/total": 127229952, + "tokens/train_per_sec_per_gpu": 3319.78, + "tokens/trainable": 13540974 + }, + { + "epoch": 3.0987261146496814, + "grad_norm": 0.1474609375, + "learning_rate": 1.902748751088078e-05, + "loss": 0.0023126029409468174, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00232, + "step": 973, + "tokens/total": 127361024, + "tokens/train_per_sec_per_gpu": 3187.82, + "tokens/trainable": 13554328 + }, + { + "epoch": 3.1019108280254777, + "grad_norm": 0.11572265625, + "learning_rate": 1.8973528062815452e-05, + "loss": 0.001823435421101749, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00183, + "step": 974, + "tokens/total": 127492096, + "tokens/train_per_sec_per_gpu": 2959.79, + "tokens/trainable": 13566755 + }, + { + "epoch": 3.105095541401274, + "grad_norm": 0.134765625, + "learning_rate": 1.8919598405290522e-05, + "loss": 0.002975163981318474, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00298, + "step": 975, + "tokens/total": 127623168, + "tokens/train_per_sec_per_gpu": 3605.57, + "tokens/trainable": 13581801 + }, + { + "epoch": 3.1082802547770703, + "grad_norm": 0.1455078125, + "learning_rate": 1.88656988048954e-05, + "loss": 0.0033629476092755795, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00337, + "step": 976, + "tokens/total": 127754240, + "tokens/train_per_sec_per_gpu": 3305.94, + "tokens/trainable": 13595673 + }, + { + "epoch": 3.111464968152866, + "grad_norm": 0.11474609375, + "learning_rate": 1.8811829528070935e-05, + "loss": 0.0019825787749141455, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00198, + "step": 977, + "tokens/total": 127885312, + "tokens/train_per_sec_per_gpu": 3321.78, + "tokens/trainable": 13609562 + }, + { + "epoch": 3.1146496815286624, + "grad_norm": 0.134765625, + "learning_rate": 1.8757990841108065e-05, + "loss": 0.00240930519066751, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 978, + "tokens/total": 128016384, + "tokens/train_per_sec_per_gpu": 3254.37, + "tokens/trainable": 13623198 + }, + { + "epoch": 3.1178343949044587, + "grad_norm": 0.146484375, + "learning_rate": 1.87041830101465e-05, + "loss": 0.0034942845813930035, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0035, + "step": 979, + "tokens/total": 128147456, + "tokens/train_per_sec_per_gpu": 3162.68, + "tokens/trainable": 13636512 + }, + { + "epoch": 3.121019108280255, + "grad_norm": 0.1494140625, + "learning_rate": 1.8650406301173447e-05, + "loss": 0.0034091034904122353, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00341, + "step": 980, + "tokens/total": 128278528, + "tokens/train_per_sec_per_gpu": 3456.01, + "tokens/trainable": 13650976 + }, + { + "epoch": 3.124203821656051, + "grad_norm": 0.15625, + "learning_rate": 1.8596660980022258e-05, + "loss": 0.0025934309232980013, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0026, + "step": 981, + "tokens/total": 128409600, + "tokens/train_per_sec_per_gpu": 3098.66, + "tokens/trainable": 13663952 + }, + { + "epoch": 3.127388535031847, + "grad_norm": 0.10595703125, + "learning_rate": 1.8542947312371108e-05, + "loss": 0.0022293792571872473, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 982, + "tokens/total": 128540672, + "tokens/train_per_sec_per_gpu": 3308.14, + "tokens/trainable": 13677809 + }, + { + "epoch": 3.1305732484076434, + "grad_norm": 0.220703125, + "learning_rate": 1.8489265563741725e-05, + "loss": 0.0036684228107333183, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00368, + "step": 983, + "tokens/total": 128671744, + "tokens/train_per_sec_per_gpu": 2671.82, + "tokens/trainable": 13689205 + }, + { + "epoch": 3.1337579617834397, + "grad_norm": 0.146484375, + "learning_rate": 1.8435615999498045e-05, + "loss": 0.003023945726454258, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00303, + "step": 984, + "tokens/total": 128802816, + "tokens/train_per_sec_per_gpu": 3353.12, + "tokens/trainable": 13703248 + }, + { + "epoch": 3.1369426751592355, + "grad_norm": 0.1357421875, + "learning_rate": 1.8381998884844914e-05, + "loss": 0.0030851985793560743, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00309, + "step": 985, + "tokens/total": 128933888, + "tokens/train_per_sec_per_gpu": 3504.98, + "tokens/trainable": 13717913 + }, + { + "epoch": 3.140127388535032, + "grad_norm": 0.1416015625, + "learning_rate": 1.8328414484826745e-05, + "loss": 0.002863124944269657, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00287, + "step": 986, + "tokens/total": 129064960, + "tokens/train_per_sec_per_gpu": 3309.73, + "tokens/trainable": 13731778 + }, + { + "epoch": 3.143312101910828, + "grad_norm": 0.138671875, + "learning_rate": 1.8274863064326253e-05, + "loss": 0.0033043615985661745, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00331, + "step": 987, + "tokens/total": 129196032, + "tokens/train_per_sec_per_gpu": 3489.76, + "tokens/trainable": 13746393 + }, + { + "epoch": 3.1464968152866244, + "grad_norm": 0.1669921875, + "learning_rate": 1.822134488806314e-05, + "loss": 0.003721470246091485, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00373, + "step": 988, + "tokens/total": 129327104, + "tokens/train_per_sec_per_gpu": 3260.94, + "tokens/trainable": 13760070 + }, + { + "epoch": 3.1496815286624202, + "grad_norm": 0.107421875, + "learning_rate": 1.8167860220592736e-05, + "loss": 0.002208119258284569, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00221, + "step": 989, + "tokens/total": 129458176, + "tokens/train_per_sec_per_gpu": 3466.56, + "tokens/trainable": 13774565 + }, + { + "epoch": 3.1528662420382165, + "grad_norm": 0.1591796875, + "learning_rate": 1.8114409326304754e-05, + "loss": 0.0030963195022195578, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 990, + "tokens/total": 129589248, + "tokens/train_per_sec_per_gpu": 3297.94, + "tokens/trainable": 13788405 + }, + { + "epoch": 3.156050955414013, + "grad_norm": 0.146484375, + "learning_rate": 1.806099246942196e-05, + "loss": 0.0031601302325725555, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00317, + "step": 991, + "tokens/total": 129720320, + "tokens/train_per_sec_per_gpu": 3314.16, + "tokens/trainable": 13802332 + }, + { + "epoch": 3.159235668789809, + "grad_norm": 0.1650390625, + "learning_rate": 1.800760991399884e-05, + "loss": 0.003068044548854232, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00307, + "step": 992, + "tokens/total": 129851392, + "tokens/train_per_sec_per_gpu": 3131.59, + "tokens/trainable": 13815450 + }, + { + "epoch": 3.162420382165605, + "grad_norm": 0.142578125, + "learning_rate": 1.7954261923920335e-05, + "loss": 0.003088792786002159, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00309, + "step": 993, + "tokens/total": 129982464, + "tokens/train_per_sec_per_gpu": 3446.24, + "tokens/trainable": 13829844 + }, + { + "epoch": 3.1656050955414012, + "grad_norm": 0.1259765625, + "learning_rate": 1.7900948762900527e-05, + "loss": 0.002409819047898054, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 994, + "tokens/total": 130113536, + "tokens/train_per_sec_per_gpu": 3166.51, + "tokens/trainable": 13843168 + }, + { + "epoch": 3.1687898089171975, + "grad_norm": 0.1669921875, + "learning_rate": 1.7847670694481307e-05, + "loss": 0.004092029761523008, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0041, + "step": 995, + "tokens/total": 130244608, + "tokens/train_per_sec_per_gpu": 3606.98, + "tokens/trainable": 13858161 + }, + { + "epoch": 3.171974522292994, + "grad_norm": 0.11865234375, + "learning_rate": 1.7794427982031104e-05, + "loss": 0.001977186882868409, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00198, + "step": 996, + "tokens/total": 130375680, + "tokens/train_per_sec_per_gpu": 3153.56, + "tokens/trainable": 13871441 + }, + { + "epoch": 3.1751592356687897, + "grad_norm": 0.1728515625, + "learning_rate": 1.7741220888743587e-05, + "loss": 0.0029397865291684866, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00294, + "step": 997, + "tokens/total": 130506752, + "tokens/train_per_sec_per_gpu": 3096.09, + "tokens/trainable": 13884512 + }, + { + "epoch": 3.178343949044586, + "grad_norm": 0.1416015625, + "learning_rate": 1.768804967763632e-05, + "loss": 0.0025828841608017683, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00259, + "step": 998, + "tokens/total": 130637824, + "tokens/train_per_sec_per_gpu": 3237.57, + "tokens/trainable": 13898147 + }, + { + "epoch": 3.1815286624203822, + "grad_norm": 0.1455078125, + "learning_rate": 1.763491461154951e-05, + "loss": 0.002550513716414571, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00255, + "step": 999, + "tokens/total": 130768896, + "tokens/train_per_sec_per_gpu": 3248.32, + "tokens/trainable": 13911818 + }, + { + "epoch": 3.1847133757961785, + "grad_norm": 0.1220703125, + "learning_rate": 1.7581815953144694e-05, + "loss": 0.0023207683116197586, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00232, + "step": 1000, + "tokens/total": 130899968, + "tokens/train_per_sec_per_gpu": 3189.52, + "tokens/trainable": 13925184 + }, + { + "epoch": 3.1878980891719744, + "grad_norm": 0.14453125, + "learning_rate": 1.7528753964903422e-05, + "loss": 0.0033754960168153048, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00338, + "step": 1001, + "tokens/total": 131031040, + "tokens/train_per_sec_per_gpu": 3425.19, + "tokens/trainable": 13939522 + }, + { + "epoch": 3.1910828025477707, + "grad_norm": 0.11767578125, + "learning_rate": 1.7475728909125967e-05, + "loss": 0.0025386540219187737, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00254, + "step": 1002, + "tokens/total": 131162112, + "tokens/train_per_sec_per_gpu": 3600.39, + "tokens/trainable": 13954592 + }, + { + "epoch": 3.194267515923567, + "grad_norm": 0.10888671875, + "learning_rate": 1.7422741047930075e-05, + "loss": 0.00221554609015584, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00222, + "step": 1003, + "tokens/total": 131293184, + "tokens/train_per_sec_per_gpu": 3073.34, + "tokens/trainable": 13967458 + }, + { + "epoch": 3.1974522292993632, + "grad_norm": 0.1572265625, + "learning_rate": 1.7369790643249573e-05, + "loss": 0.0035816675517708063, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00359, + "step": 1004, + "tokens/total": 131424256, + "tokens/train_per_sec_per_gpu": 3426.39, + "tokens/trainable": 13981803 + }, + { + "epoch": 3.200636942675159, + "grad_norm": 0.15234375, + "learning_rate": 1.731687795683316e-05, + "loss": 0.0033436615485697985, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00335, + "step": 1005, + "tokens/total": 131555328, + "tokens/train_per_sec_per_gpu": 3313.83, + "tokens/trainable": 13995695 + }, + { + "epoch": 3.2038216560509554, + "grad_norm": 0.1416015625, + "learning_rate": 1.7264003250243102e-05, + "loss": 0.002780565060675144, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00278, + "step": 1006, + "tokens/total": 131686400, + "tokens/train_per_sec_per_gpu": 3199.71, + "tokens/trainable": 14009116 + }, + { + "epoch": 3.2070063694267517, + "grad_norm": 0.1552734375, + "learning_rate": 1.7211166784853874e-05, + "loss": 0.003775578923523426, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00378, + "step": 1007, + "tokens/total": 131817472, + "tokens/train_per_sec_per_gpu": 3328.07, + "tokens/trainable": 14023153 + }, + { + "epoch": 3.210191082802548, + "grad_norm": 0.1201171875, + "learning_rate": 1.715836882185094e-05, + "loss": 0.0018264808459207416, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00183, + "step": 1008, + "tokens/total": 131948544, + "tokens/train_per_sec_per_gpu": 2972.58, + "tokens/trainable": 14035645 + }, + { + "epoch": 3.213375796178344, + "grad_norm": 0.1171875, + "learning_rate": 1.710560962222945e-05, + "loss": 0.0018301783129572868, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00183, + "step": 1009, + "tokens/total": 132079616, + "tokens/train_per_sec_per_gpu": 3211.08, + "tokens/trainable": 14049113 + }, + { + "epoch": 3.21656050955414, + "grad_norm": 0.11328125, + "learning_rate": 1.705288944679291e-05, + "loss": 0.002403366146609187, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 1010, + "tokens/total": 132210688, + "tokens/train_per_sec_per_gpu": 3364.87, + "tokens/trainable": 14063206 + }, + { + "epoch": 3.2197452229299364, + "grad_norm": 0.1640625, + "learning_rate": 1.7000208556151915e-05, + "loss": 0.00280455662868917, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00281, + "step": 1011, + "tokens/total": 132341760, + "tokens/train_per_sec_per_gpu": 3264.32, + "tokens/trainable": 14076868 + }, + { + "epoch": 3.2229299363057327, + "grad_norm": 0.1513671875, + "learning_rate": 1.6947567210722905e-05, + "loss": 0.0029342826455831528, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00294, + "step": 1012, + "tokens/total": 132472832, + "tokens/train_per_sec_per_gpu": 3305.91, + "tokens/trainable": 14090726 + }, + { + "epoch": 3.2261146496815285, + "grad_norm": 0.1875, + "learning_rate": 1.689496567072678e-05, + "loss": 0.0028477348387241364, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00285, + "step": 1013, + "tokens/total": 132603904, + "tokens/train_per_sec_per_gpu": 3194.5, + "tokens/trainable": 14104098 + }, + { + "epoch": 3.229299363057325, + "grad_norm": 0.15625, + "learning_rate": 1.6842404196187715e-05, + "loss": 0.002830425277352333, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00283, + "step": 1014, + "tokens/total": 132734976, + "tokens/train_per_sec_per_gpu": 3606.8, + "tokens/trainable": 14119202 + }, + { + "epoch": 3.232484076433121, + "grad_norm": 0.12451171875, + "learning_rate": 1.678988304693183e-05, + "loss": 0.002606867579743266, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00261, + "step": 1015, + "tokens/total": 132866048, + "tokens/train_per_sec_per_gpu": 3574.5, + "tokens/trainable": 14134144 + }, + { + "epoch": 3.2356687898089174, + "grad_norm": 0.1484375, + "learning_rate": 1.6737402482585863e-05, + "loss": 0.0034160753712058067, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00342, + "step": 1016, + "tokens/total": 132997120, + "tokens/train_per_sec_per_gpu": 3134.2, + "tokens/trainable": 14147367 + }, + { + "epoch": 3.238853503184713, + "grad_norm": 0.12060546875, + "learning_rate": 1.6684962762575966e-05, + "loss": 0.0016203324776142836, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00162, + "step": 1017, + "tokens/total": 133128192, + "tokens/train_per_sec_per_gpu": 3101.12, + "tokens/trainable": 14160359 + }, + { + "epoch": 3.2420382165605095, + "grad_norm": 0.1611328125, + "learning_rate": 1.663256414612639e-05, + "loss": 0.0028734614606946707, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00288, + "step": 1018, + "tokens/total": 133259264, + "tokens/train_per_sec_per_gpu": 2813.19, + "tokens/trainable": 14172273 + }, + { + "epoch": 3.245222929936306, + "grad_norm": 0.1630859375, + "learning_rate": 1.658020689225817e-05, + "loss": 0.0035582587588578463, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00356, + "step": 1019, + "tokens/total": 133390336, + "tokens/train_per_sec_per_gpu": 3006.37, + "tokens/trainable": 14184925 + }, + { + "epoch": 3.248407643312102, + "grad_norm": 0.16796875, + "learning_rate": 1.6527891259787895e-05, + "loss": 0.0026477861683815718, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00265, + "step": 1020, + "tokens/total": 133521408, + "tokens/train_per_sec_per_gpu": 3004.12, + "tokens/trainable": 14197554 + }, + { + "epoch": 3.251592356687898, + "grad_norm": 0.15234375, + "learning_rate": 1.6475617507326418e-05, + "loss": 0.0031140560749918222, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00312, + "step": 1021, + "tokens/total": 133652480, + "tokens/train_per_sec_per_gpu": 3175.24, + "tokens/trainable": 14210893 + }, + { + "epoch": 3.254777070063694, + "grad_norm": 0.1611328125, + "learning_rate": 1.6423385893277536e-05, + "loss": 0.003689323551952839, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0037, + "step": 1022, + "tokens/total": 133783552, + "tokens/train_per_sec_per_gpu": 3444.39, + "tokens/trainable": 14225297 + }, + { + "epoch": 3.2579617834394905, + "grad_norm": 0.13671875, + "learning_rate": 1.6371196675836763e-05, + "loss": 0.0028125548269599676, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00282, + "step": 1023, + "tokens/total": 133914624, + "tokens/train_per_sec_per_gpu": 3577.78, + "tokens/trainable": 14240285 + }, + { + "epoch": 3.261146496815287, + "grad_norm": 0.1513671875, + "learning_rate": 1.631905011299005e-05, + "loss": 0.003101219655945897, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00311, + "step": 1024, + "tokens/total": 134045696, + "tokens/train_per_sec_per_gpu": 3314.34, + "tokens/trainable": 14254160 + }, + { + "epoch": 3.2643312101910826, + "grad_norm": 0.1962890625, + "learning_rate": 1.6266946462512455e-05, + "loss": 0.002571912482380867, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00258, + "step": 1025, + "tokens/total": 134176768, + "tokens/train_per_sec_per_gpu": 3129.65, + "tokens/trainable": 14267272 + }, + { + "epoch": 3.267515923566879, + "grad_norm": 0.126953125, + "learning_rate": 1.6214885981966937e-05, + "loss": 0.002030417090281844, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00203, + "step": 1026, + "tokens/total": 134307840, + "tokens/train_per_sec_per_gpu": 3312.29, + "tokens/trainable": 14281152 + }, + { + "epoch": 3.270700636942675, + "grad_norm": 0.142578125, + "learning_rate": 1.6162868928703057e-05, + "loss": 0.0021212187130004168, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00212, + "step": 1027, + "tokens/total": 134438912, + "tokens/train_per_sec_per_gpu": 3278.18, + "tokens/trainable": 14294941 + }, + { + "epoch": 3.2738853503184715, + "grad_norm": 0.1337890625, + "learning_rate": 1.6110895559855684e-05, + "loss": 0.0034488090313971043, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00345, + "step": 1028, + "tokens/total": 134569984, + "tokens/train_per_sec_per_gpu": 3722.82, + "tokens/trainable": 14310525 + }, + { + "epoch": 3.2770700636942673, + "grad_norm": 0.138671875, + "learning_rate": 1.605896613234375e-05, + "loss": 0.002809841651469469, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00281, + "step": 1029, + "tokens/total": 134701056, + "tokens/train_per_sec_per_gpu": 3356.33, + "tokens/trainable": 14324590 + }, + { + "epoch": 3.2802547770700636, + "grad_norm": 0.1572265625, + "learning_rate": 1.6007080902868986e-05, + "loss": 0.003251892514526844, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00326, + "step": 1030, + "tokens/total": 134832128, + "tokens/train_per_sec_per_gpu": 3390.28, + "tokens/trainable": 14338793 + }, + { + "epoch": 3.28343949044586, + "grad_norm": 0.1591796875, + "learning_rate": 1.5955240127914618e-05, + "loss": 0.003499697893857956, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00351, + "step": 1031, + "tokens/total": 134963200, + "tokens/train_per_sec_per_gpu": 3280.67, + "tokens/trainable": 14352526 + }, + { + "epoch": 3.286624203821656, + "grad_norm": 0.126953125, + "learning_rate": 1.5903444063744126e-05, + "loss": 0.0027691691648215055, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00277, + "step": 1032, + "tokens/total": 135094272, + "tokens/train_per_sec_per_gpu": 3269.79, + "tokens/trainable": 14366213 + }, + { + "epoch": 3.289808917197452, + "grad_norm": 0.1640625, + "learning_rate": 1.5851692966399996e-05, + "loss": 0.004021272994577885, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00403, + "step": 1033, + "tokens/total": 135225344, + "tokens/train_per_sec_per_gpu": 3501.62, + "tokens/trainable": 14380810 + }, + { + "epoch": 3.2929936305732483, + "grad_norm": 0.1484375, + "learning_rate": 1.579998709170239e-05, + "loss": 0.003093718783929944, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1034, + "tokens/total": 135356416, + "tokens/train_per_sec_per_gpu": 3052.87, + "tokens/trainable": 14393602 + }, + { + "epoch": 3.2961783439490446, + "grad_norm": 0.1533203125, + "learning_rate": 1.5748326695247957e-05, + "loss": 0.003595340298488736, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0036, + "step": 1035, + "tokens/total": 135487488, + "tokens/train_per_sec_per_gpu": 3610.85, + "tokens/trainable": 14408657 + }, + { + "epoch": 3.299363057324841, + "grad_norm": 0.17578125, + "learning_rate": 1.569671203240852e-05, + "loss": 0.0037980927154421806, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00381, + "step": 1036, + "tokens/total": 135618560, + "tokens/train_per_sec_per_gpu": 3399.48, + "tokens/trainable": 14422876 + }, + { + "epoch": 3.3025477707006368, + "grad_norm": 0.1796875, + "learning_rate": 1.5645143358329815e-05, + "loss": 0.003825873602181673, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00383, + "step": 1037, + "tokens/total": 135749632, + "tokens/train_per_sec_per_gpu": 3345.54, + "tokens/trainable": 14436870 + }, + { + "epoch": 3.305732484076433, + "grad_norm": 0.12255859375, + "learning_rate": 1.559362092793027e-05, + "loss": 0.002097800839692354, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0021, + "step": 1038, + "tokens/total": 135880704, + "tokens/train_per_sec_per_gpu": 3530.34, + "tokens/trainable": 14451577 + }, + { + "epoch": 3.3089171974522293, + "grad_norm": 0.1572265625, + "learning_rate": 1.5542144995899698e-05, + "loss": 0.003578023286536336, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00358, + "step": 1039, + "tokens/total": 136011776, + "tokens/train_per_sec_per_gpu": 3208.09, + "tokens/trainable": 14465046 + }, + { + "epoch": 3.3121019108280256, + "grad_norm": 0.1376953125, + "learning_rate": 1.5490715816698077e-05, + "loss": 0.002384308958426118, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00239, + "step": 1040, + "tokens/total": 136142848, + "tokens/train_per_sec_per_gpu": 3313.93, + "tokens/trainable": 14478889 + }, + { + "epoch": 3.3152866242038215, + "grad_norm": 0.1396484375, + "learning_rate": 1.5439333644554227e-05, + "loss": 0.0023124567233026028, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00232, + "step": 1041, + "tokens/total": 136273920, + "tokens/train_per_sec_per_gpu": 3490.61, + "tokens/trainable": 14493436 + }, + { + "epoch": 3.3184713375796178, + "grad_norm": 0.1640625, + "learning_rate": 1.538799873346466e-05, + "loss": 0.004312054719775915, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00432, + "step": 1042, + "tokens/total": 136404992, + "tokens/train_per_sec_per_gpu": 3468.79, + "tokens/trainable": 14508009 + }, + { + "epoch": 3.321656050955414, + "grad_norm": 0.1611328125, + "learning_rate": 1.5336711337192227e-05, + "loss": 0.0034810621291399, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00349, + "step": 1043, + "tokens/total": 136536064, + "tokens/train_per_sec_per_gpu": 3681.02, + "tokens/trainable": 14523389 + }, + { + "epoch": 3.3248407643312103, + "grad_norm": 0.1201171875, + "learning_rate": 1.5285471709264897e-05, + "loss": 0.0020460544619709253, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1044, + "tokens/total": 136667136, + "tokens/train_per_sec_per_gpu": 3329.86, + "tokens/trainable": 14537340 + }, + { + "epoch": 3.328025477707006, + "grad_norm": 0.1455078125, + "learning_rate": 1.5234280102974525e-05, + "loss": 0.003296096809208393, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0033, + "step": 1045, + "tokens/total": 136798208, + "tokens/train_per_sec_per_gpu": 3446.83, + "tokens/trainable": 14551699 + }, + { + "epoch": 3.3312101910828025, + "grad_norm": 0.1328125, + "learning_rate": 1.5183136771375579e-05, + "loss": 0.0019932978320866823, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.002, + "step": 1046, + "tokens/total": 136929280, + "tokens/train_per_sec_per_gpu": 3210.34, + "tokens/trainable": 14565142 + }, + { + "epoch": 3.3343949044585988, + "grad_norm": 0.1376953125, + "learning_rate": 1.5132041967283866e-05, + "loss": 0.001847305684350431, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00185, + "step": 1047, + "tokens/total": 137060352, + "tokens/train_per_sec_per_gpu": 3505.73, + "tokens/trainable": 14579823 + }, + { + "epoch": 3.337579617834395, + "grad_norm": 0.1474609375, + "learning_rate": 1.5080995943275348e-05, + "loss": 0.00248389202170074, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00249, + "step": 1048, + "tokens/total": 137191424, + "tokens/train_per_sec_per_gpu": 3588.55, + "tokens/trainable": 14594782 + }, + { + "epoch": 3.340764331210191, + "grad_norm": 0.18359375, + "learning_rate": 1.5029998951684828e-05, + "loss": 0.00269156857393682, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0027, + "step": 1049, + "tokens/total": 137322496, + "tokens/train_per_sec_per_gpu": 3464.55, + "tokens/trainable": 14609308 + }, + { + "epoch": 3.343949044585987, + "grad_norm": 0.173828125, + "learning_rate": 1.4979051244604722e-05, + "loss": 0.003072477411478758, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00308, + "step": 1050, + "tokens/total": 137453568, + "tokens/train_per_sec_per_gpu": 3052.82, + "tokens/trainable": 14622170 + }, + { + "epoch": 3.3471337579617835, + "grad_norm": 0.1767578125, + "learning_rate": 1.4928153073883843e-05, + "loss": 0.003987753763794899, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.004, + "step": 1051, + "tokens/total": 137584640, + "tokens/train_per_sec_per_gpu": 3233.94, + "tokens/trainable": 14635795 + }, + { + "epoch": 3.3503184713375798, + "grad_norm": 0.130859375, + "learning_rate": 1.4877304691126123e-05, + "loss": 0.0029561547562479973, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00296, + "step": 1052, + "tokens/total": 137715712, + "tokens/train_per_sec_per_gpu": 3268.66, + "tokens/trainable": 14649498 + }, + { + "epoch": 3.3535031847133756, + "grad_norm": 0.150390625, + "learning_rate": 1.4826506347689353e-05, + "loss": 0.0022640160750597715, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00227, + "step": 1053, + "tokens/total": 137846784, + "tokens/train_per_sec_per_gpu": 3172.82, + "tokens/trainable": 14662788 + }, + { + "epoch": 3.356687898089172, + "grad_norm": 0.181640625, + "learning_rate": 1.4775758294684006e-05, + "loss": 0.0038375440053641796, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00384, + "step": 1054, + "tokens/total": 137977856, + "tokens/train_per_sec_per_gpu": 3000.65, + "tokens/trainable": 14675379 + }, + { + "epoch": 3.359872611464968, + "grad_norm": 0.1630859375, + "learning_rate": 1.4725060782971933e-05, + "loss": 0.0024567164946347475, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00246, + "step": 1055, + "tokens/total": 138108928, + "tokens/train_per_sec_per_gpu": 3533.93, + "tokens/trainable": 14690140 + }, + { + "epoch": 3.3630573248407645, + "grad_norm": 0.10205078125, + "learning_rate": 1.4674414063165137e-05, + "loss": 0.0013129838043823838, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00131, + "step": 1056, + "tokens/total": 138240000, + "tokens/train_per_sec_per_gpu": 3290.43, + "tokens/trainable": 14703961 + }, + { + "epoch": 3.3662420382165603, + "grad_norm": 0.1748046875, + "learning_rate": 1.4623818385624566e-05, + "loss": 0.003262344980612397, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00327, + "step": 1057, + "tokens/total": 138371072, + "tokens/train_per_sec_per_gpu": 3399.17, + "tokens/trainable": 14718152 + }, + { + "epoch": 3.3694267515923566, + "grad_norm": 0.1767578125, + "learning_rate": 1.457327400045884e-05, + "loss": 0.0037125989329069853, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00372, + "step": 1058, + "tokens/total": 138502144, + "tokens/train_per_sec_per_gpu": 3413.99, + "tokens/trainable": 14732369 + }, + { + "epoch": 3.372611464968153, + "grad_norm": 0.171875, + "learning_rate": 1.4522781157523008e-05, + "loss": 0.003059735056012869, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00306, + "step": 1059, + "tokens/total": 138633216, + "tokens/train_per_sec_per_gpu": 3170.31, + "tokens/trainable": 14745664 + }, + { + "epoch": 3.375796178343949, + "grad_norm": 0.16796875, + "learning_rate": 1.4472340106417375e-05, + "loss": 0.0033829023595899343, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00339, + "step": 1060, + "tokens/total": 138764288, + "tokens/train_per_sec_per_gpu": 3121.95, + "tokens/trainable": 14758786 + }, + { + "epoch": 3.3789808917197455, + "grad_norm": 0.1220703125, + "learning_rate": 1.4421951096486171e-05, + "loss": 0.0024168547242879868, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00242, + "step": 1061, + "tokens/total": 138895360, + "tokens/train_per_sec_per_gpu": 3410.32, + "tokens/trainable": 14773023 + }, + { + "epoch": 3.3821656050955413, + "grad_norm": 0.1728515625, + "learning_rate": 1.4371614376816416e-05, + "loss": 0.0038187310565263033, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00383, + "step": 1062, + "tokens/total": 139026432, + "tokens/train_per_sec_per_gpu": 3277.49, + "tokens/trainable": 14786713 + }, + { + "epoch": 3.3853503184713376, + "grad_norm": 0.130859375, + "learning_rate": 1.4321330196236638e-05, + "loss": 0.002092313254252076, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00209, + "step": 1063, + "tokens/total": 139157504, + "tokens/train_per_sec_per_gpu": 3363.98, + "tokens/trainable": 14800746 + }, + { + "epoch": 3.388535031847134, + "grad_norm": 0.16015625, + "learning_rate": 1.4271098803315624e-05, + "loss": 0.0034465331118553877, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00345, + "step": 1064, + "tokens/total": 139288576, + "tokens/train_per_sec_per_gpu": 3553.56, + "tokens/trainable": 14815617 + }, + { + "epoch": 3.3917197452229297, + "grad_norm": 0.1728515625, + "learning_rate": 1.4220920446361224e-05, + "loss": 0.003886766964569688, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00389, + "step": 1065, + "tokens/total": 139419648, + "tokens/train_per_sec_per_gpu": 3092.87, + "tokens/trainable": 14828591 + }, + { + "epoch": 3.394904458598726, + "grad_norm": 0.1376953125, + "learning_rate": 1.4170795373419148e-05, + "loss": 0.0024511385709047318, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00245, + "step": 1066, + "tokens/total": 139550720, + "tokens/train_per_sec_per_gpu": 3130.21, + "tokens/trainable": 14841695 + }, + { + "epoch": 3.3980891719745223, + "grad_norm": 0.1748046875, + "learning_rate": 1.4120723832271665e-05, + "loss": 0.0035048723220825195, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00351, + "step": 1067, + "tokens/total": 139681792, + "tokens/train_per_sec_per_gpu": 3774.26, + "tokens/trainable": 14857394 + }, + { + "epoch": 3.4012738853503186, + "grad_norm": 0.154296875, + "learning_rate": 1.4070706070436446e-05, + "loss": 0.0028158228378742933, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00282, + "step": 1068, + "tokens/total": 139812864, + "tokens/train_per_sec_per_gpu": 3417.94, + "tokens/trainable": 14871671 + }, + { + "epoch": 3.404458598726115, + "grad_norm": 0.1669921875, + "learning_rate": 1.4020742335165326e-05, + "loss": 0.003797327633947134, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0038, + "step": 1069, + "tokens/total": 139943936, + "tokens/train_per_sec_per_gpu": 3477.81, + "tokens/trainable": 14886204 + }, + { + "epoch": 3.4076433121019107, + "grad_norm": 0.11474609375, + "learning_rate": 1.3970832873443043e-05, + "loss": 0.0019341235747560859, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00194, + "step": 1070, + "tokens/total": 140075008, + "tokens/train_per_sec_per_gpu": 3491.7, + "tokens/trainable": 14900766 + }, + { + "epoch": 3.410828025477707, + "grad_norm": 0.1533203125, + "learning_rate": 1.392097793198605e-05, + "loss": 0.0030175955034792423, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00302, + "step": 1071, + "tokens/total": 140206080, + "tokens/train_per_sec_per_gpu": 3393.54, + "tokens/trainable": 14914981 + }, + { + "epoch": 3.4140127388535033, + "grad_norm": 0.12255859375, + "learning_rate": 1.3871177757241326e-05, + "loss": 0.001799887279048562, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0018, + "step": 1072, + "tokens/total": 140337152, + "tokens/train_per_sec_per_gpu": 3339.98, + "tokens/trainable": 14928954 + }, + { + "epoch": 3.417197452229299, + "grad_norm": 0.1396484375, + "learning_rate": 1.382143259538507e-05, + "loss": 0.001962024951353669, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00196, + "step": 1073, + "tokens/total": 140468224, + "tokens/train_per_sec_per_gpu": 3376.61, + "tokens/trainable": 14943033 + }, + { + "epoch": 3.4203821656050954, + "grad_norm": 0.16015625, + "learning_rate": 1.3771742692321574e-05, + "loss": 0.0027512316592037678, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00276, + "step": 1074, + "tokens/total": 140599296, + "tokens/train_per_sec_per_gpu": 3139.25, + "tokens/trainable": 14956205 + }, + { + "epoch": 3.4235668789808917, + "grad_norm": 0.15625, + "learning_rate": 1.3722108293681973e-05, + "loss": 0.0029566381126642227, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00296, + "step": 1075, + "tokens/total": 140730368, + "tokens/train_per_sec_per_gpu": 3445.88, + "tokens/trainable": 14970584 + }, + { + "epoch": 3.426751592356688, + "grad_norm": 0.1640625, + "learning_rate": 1.3672529644823004e-05, + "loss": 0.0029452519956976175, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00295, + "step": 1076, + "tokens/total": 140861440, + "tokens/train_per_sec_per_gpu": 3354.99, + "tokens/trainable": 14984596 + }, + { + "epoch": 3.4299363057324843, + "grad_norm": 0.10693359375, + "learning_rate": 1.362300699082582e-05, + "loss": 0.0017804743256419897, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00178, + "step": 1077, + "tokens/total": 140992512, + "tokens/train_per_sec_per_gpu": 3354.98, + "tokens/trainable": 14998636 + }, + { + "epoch": 3.43312101910828, + "grad_norm": 0.2001953125, + "learning_rate": 1.35735405764948e-05, + "loss": 0.003846959676593542, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00385, + "step": 1078, + "tokens/total": 141123584, + "tokens/train_per_sec_per_gpu": 3226.73, + "tokens/trainable": 15012165 + }, + { + "epoch": 3.4363057324840764, + "grad_norm": 0.166015625, + "learning_rate": 1.3524130646356283e-05, + "loss": 0.0025776573456823826, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00258, + "step": 1079, + "tokens/total": 141254656, + "tokens/train_per_sec_per_gpu": 3226.06, + "tokens/trainable": 15025665 + }, + { + "epoch": 3.4394904458598727, + "grad_norm": 0.1630859375, + "learning_rate": 1.3474777444657415e-05, + "loss": 0.0029838993214070797, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00299, + "step": 1080, + "tokens/total": 141385728, + "tokens/train_per_sec_per_gpu": 3689.69, + "tokens/trainable": 15041028 + }, + { + "epoch": 3.4426751592356686, + "grad_norm": 0.14453125, + "learning_rate": 1.3425481215364922e-05, + "loss": 0.0022048731334507465, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00221, + "step": 1081, + "tokens/total": 141516800, + "tokens/train_per_sec_per_gpu": 3238.87, + "tokens/trainable": 15054618 + }, + { + "epoch": 3.445859872611465, + "grad_norm": 0.185546875, + "learning_rate": 1.3376242202163868e-05, + "loss": 0.004590876400470734, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0046, + "step": 1082, + "tokens/total": 141647872, + "tokens/train_per_sec_per_gpu": 3402.12, + "tokens/trainable": 15068791 + }, + { + "epoch": 3.449044585987261, + "grad_norm": 0.15625, + "learning_rate": 1.3327060648456502e-05, + "loss": 0.0026096594519913197, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00261, + "step": 1083, + "tokens/total": 141778944, + "tokens/train_per_sec_per_gpu": 3599.47, + "tokens/trainable": 15083794 + }, + { + "epoch": 3.4522292993630574, + "grad_norm": 0.1162109375, + "learning_rate": 1.3277936797361043e-05, + "loss": 0.0020494635682553053, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1084, + "tokens/total": 141910016, + "tokens/train_per_sec_per_gpu": 3307.62, + "tokens/trainable": 15097640 + }, + { + "epoch": 3.4554140127388537, + "grad_norm": 0.1552734375, + "learning_rate": 1.3228870891710443e-05, + "loss": 0.003234599716961384, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00324, + "step": 1085, + "tokens/total": 142041088, + "tokens/train_per_sec_per_gpu": 3210.5, + "tokens/trainable": 15111127 + }, + { + "epoch": 3.4585987261146496, + "grad_norm": 0.14453125, + "learning_rate": 1.3179863174051238e-05, + "loss": 0.002322172513231635, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00232, + "step": 1086, + "tokens/total": 142172160, + "tokens/train_per_sec_per_gpu": 3065.87, + "tokens/trainable": 15123986 + }, + { + "epoch": 3.461783439490446, + "grad_norm": 0.16796875, + "learning_rate": 1.3130913886642333e-05, + "loss": 0.003022089833393693, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00303, + "step": 1087, + "tokens/total": 142303232, + "tokens/train_per_sec_per_gpu": 3611.73, + "tokens/trainable": 15139047 + }, + { + "epoch": 3.464968152866242, + "grad_norm": 0.1396484375, + "learning_rate": 1.3082023271453759e-05, + "loss": 0.0020968448370695114, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0021, + "step": 1088, + "tokens/total": 142434304, + "tokens/train_per_sec_per_gpu": 3221.7, + "tokens/trainable": 15152542 + }, + { + "epoch": 3.468152866242038, + "grad_norm": 0.171875, + "learning_rate": 1.3033191570165532e-05, + "loss": 0.00432826392352581, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00434, + "step": 1089, + "tokens/total": 142565376, + "tokens/train_per_sec_per_gpu": 3192.76, + "tokens/trainable": 15165913 + }, + { + "epoch": 3.4713375796178343, + "grad_norm": 0.1142578125, + "learning_rate": 1.298441902416646e-05, + "loss": 0.0018635153537616134, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00187, + "step": 1090, + "tokens/total": 142696448, + "tokens/train_per_sec_per_gpu": 3635.08, + "tokens/trainable": 15181017 + }, + { + "epoch": 3.4745222929936306, + "grad_norm": 0.1806640625, + "learning_rate": 1.2935705874552894e-05, + "loss": 0.0037171547301113605, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00372, + "step": 1091, + "tokens/total": 142827520, + "tokens/train_per_sec_per_gpu": 3549.32, + "tokens/trainable": 15195900 + }, + { + "epoch": 3.477707006369427, + "grad_norm": 0.154296875, + "learning_rate": 1.2887052362127594e-05, + "loss": 0.0025141574442386627, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00252, + "step": 1092, + "tokens/total": 142958592, + "tokens/train_per_sec_per_gpu": 3427.88, + "tokens/trainable": 15210182 + }, + { + "epoch": 3.480891719745223, + "grad_norm": 0.1630859375, + "learning_rate": 1.2838458727398531e-05, + "loss": 0.0030665546655654907, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00307, + "step": 1093, + "tokens/total": 143089664, + "tokens/train_per_sec_per_gpu": 4042.03, + "tokens/trainable": 15226897 + }, + { + "epoch": 3.484076433121019, + "grad_norm": 0.12890625, + "learning_rate": 1.2789925210577647e-05, + "loss": 0.0020227362401783466, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00202, + "step": 1094, + "tokens/total": 143220736, + "tokens/train_per_sec_per_gpu": 3736.82, + "tokens/trainable": 15242382 + }, + { + "epoch": 3.4872611464968153, + "grad_norm": 0.158203125, + "learning_rate": 1.274145205157972e-05, + "loss": 0.0027202137280255556, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00272, + "step": 1095, + "tokens/total": 143351808, + "tokens/train_per_sec_per_gpu": 3200.5, + "tokens/trainable": 15255782 + }, + { + "epoch": 3.4904458598726116, + "grad_norm": 0.1708984375, + "learning_rate": 1.269303949002118e-05, + "loss": 0.0031496393494307995, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00315, + "step": 1096, + "tokens/total": 143482880, + "tokens/train_per_sec_per_gpu": 3206.14, + "tokens/trainable": 15269719 + }, + { + "epoch": 3.4936305732484074, + "grad_norm": 0.1748046875, + "learning_rate": 1.2644687765218874e-05, + "loss": 0.0028139406349509954, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00282, + "step": 1097, + "tokens/total": 143613952, + "tokens/train_per_sec_per_gpu": 3399.76, + "tokens/trainable": 15283962 + }, + { + "epoch": 3.4968152866242037, + "grad_norm": 0.1767578125, + "learning_rate": 1.2596397116188946e-05, + "loss": 0.0032941231038421392, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0033, + "step": 1098, + "tokens/total": 143745024, + "tokens/train_per_sec_per_gpu": 3149.95, + "tokens/trainable": 15297099 + }, + { + "epoch": 3.5, + "grad_norm": 0.1689453125, + "learning_rate": 1.2548167781645616e-05, + "loss": 0.00317127862945199, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00318, + "step": 1099, + "tokens/total": 143876096, + "tokens/train_per_sec_per_gpu": 3666.83, + "tokens/trainable": 15312299 + }, + { + "epoch": 3.5, + "eval_loss": 0.010016990825533867, + "eval_ppl": 1.01007, + "eval_runtime": 43.0422, + "eval_samples_per_second": 62.752, + "eval_steps_per_second": 3.926, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 1099 + }, + { + "epoch": 3.5031847133757963, + "grad_norm": 0.162109375, + "learning_rate": 1.2500000000000006e-05, + "loss": 0.0022175521589815617, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00222, + "step": 1100, + "tokens/total": 144007168, + "tokens/train_per_sec_per_gpu": 3427.84, + "tokens/trainable": 15326710 + }, + { + "epoch": 3.5063694267515926, + "grad_norm": 0.1845703125, + "learning_rate": 1.245189400935895e-05, + "loss": 0.005054910201579332, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00507, + "step": 1101, + "tokens/total": 144138240, + "tokens/train_per_sec_per_gpu": 3351.14, + "tokens/trainable": 15340735 + }, + { + "epoch": 3.5095541401273884, + "grad_norm": 0.1630859375, + "learning_rate": 1.2403850047523866e-05, + "loss": 0.0027237918693572283, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00273, + "step": 1102, + "tokens/total": 144269312, + "tokens/train_per_sec_per_gpu": 3436.73, + "tokens/trainable": 15355132 + }, + { + "epoch": 3.5127388535031847, + "grad_norm": 0.1787109375, + "learning_rate": 1.2355868351989509e-05, + "loss": 0.0029630253557115793, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00297, + "step": 1103, + "tokens/total": 144400384, + "tokens/train_per_sec_per_gpu": 3214.66, + "tokens/trainable": 15368489 + }, + { + "epoch": 3.515923566878981, + "grad_norm": 0.142578125, + "learning_rate": 1.2307949159942862e-05, + "loss": 0.0033542895689606667, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00336, + "step": 1104, + "tokens/total": 144531456, + "tokens/train_per_sec_per_gpu": 3198.14, + "tokens/trainable": 15381840 + }, + { + "epoch": 3.519108280254777, + "grad_norm": 0.17578125, + "learning_rate": 1.2260092708261936e-05, + "loss": 0.0038351963739842176, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00384, + "step": 1105, + "tokens/total": 144662528, + "tokens/train_per_sec_per_gpu": 3503.01, + "tokens/trainable": 15396418 + }, + { + "epoch": 3.522292993630573, + "grad_norm": 0.154296875, + "learning_rate": 1.2212299233514582e-05, + "loss": 0.0025412808172404766, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00254, + "step": 1106, + "tokens/total": 144793600, + "tokens/train_per_sec_per_gpu": 3919.55, + "tokens/trainable": 15412594 + }, + { + "epoch": 3.5254777070063694, + "grad_norm": 0.1796875, + "learning_rate": 1.216456897195733e-05, + "loss": 0.0032449497375637293, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00325, + "step": 1107, + "tokens/total": 144924672, + "tokens/train_per_sec_per_gpu": 3382.55, + "tokens/trainable": 15426656 + }, + { + "epoch": 3.5286624203821657, + "grad_norm": 0.146484375, + "learning_rate": 1.211690215953427e-05, + "loss": 0.0023905187845230103, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00239, + "step": 1108, + "tokens/total": 145055744, + "tokens/train_per_sec_per_gpu": 3011.33, + "tokens/trainable": 15439226 + }, + { + "epoch": 3.531847133757962, + "grad_norm": 0.15625, + "learning_rate": 1.2069299031875795e-05, + "loss": 0.0024083037860691547, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 1109, + "tokens/total": 145186816, + "tokens/train_per_sec_per_gpu": 2939.76, + "tokens/trainable": 15451512 + }, + { + "epoch": 3.535031847133758, + "grad_norm": 0.1787109375, + "learning_rate": 1.2021759824297524e-05, + "loss": 0.004423599690198898, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00443, + "step": 1110, + "tokens/total": 145317888, + "tokens/train_per_sec_per_gpu": 3466.29, + "tokens/trainable": 15465910 + }, + { + "epoch": 3.538216560509554, + "grad_norm": 0.1455078125, + "learning_rate": 1.1974284771799096e-05, + "loss": 0.002882221946492791, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00289, + "step": 1111, + "tokens/total": 145448960, + "tokens/train_per_sec_per_gpu": 3506.98, + "tokens/trainable": 15480477 + }, + { + "epoch": 3.5414012738853504, + "grad_norm": 0.1826171875, + "learning_rate": 1.1926874109063e-05, + "loss": 0.003006345359608531, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00301, + "step": 1112, + "tokens/total": 145580032, + "tokens/train_per_sec_per_gpu": 3365.33, + "tokens/trainable": 15494478 + }, + { + "epoch": 3.5445859872611463, + "grad_norm": 0.154296875, + "learning_rate": 1.1879528070453423e-05, + "loss": 0.0027234896551817656, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00273, + "step": 1113, + "tokens/total": 145711104, + "tokens/train_per_sec_per_gpu": 3535.63, + "tokens/trainable": 15509199 + }, + { + "epoch": 3.5477707006369426, + "grad_norm": 0.177734375, + "learning_rate": 1.1832246890015125e-05, + "loss": 0.0036931924987584352, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0037, + "step": 1114, + "tokens/total": 145842176, + "tokens/train_per_sec_per_gpu": 3246.79, + "tokens/trainable": 15522710 + }, + { + "epoch": 3.550955414012739, + "grad_norm": 0.1474609375, + "learning_rate": 1.1785030801472221e-05, + "loss": 0.0028704549185931683, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00287, + "step": 1115, + "tokens/total": 145973248, + "tokens/train_per_sec_per_gpu": 3848.56, + "tokens/trainable": 15538730 + }, + { + "epoch": 3.554140127388535, + "grad_norm": 0.15625, + "learning_rate": 1.1737880038227082e-05, + "loss": 0.00254430272616446, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00255, + "step": 1116, + "tokens/total": 146104320, + "tokens/train_per_sec_per_gpu": 3397.78, + "tokens/trainable": 15552911 + }, + { + "epoch": 3.5573248407643314, + "grad_norm": 0.1630859375, + "learning_rate": 1.1690794833359159e-05, + "loss": 0.0025816336274147034, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00258, + "step": 1117, + "tokens/total": 146235392, + "tokens/train_per_sec_per_gpu": 2881.17, + "tokens/trainable": 15564987 + }, + { + "epoch": 3.5605095541401273, + "grad_norm": 0.19140625, + "learning_rate": 1.1643775419623812e-05, + "loss": 0.003014686517417431, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00302, + "step": 1118, + "tokens/total": 146366464, + "tokens/train_per_sec_per_gpu": 3324.99, + "tokens/trainable": 15578834 + }, + { + "epoch": 3.5636942675159236, + "grad_norm": 0.146484375, + "learning_rate": 1.1596822029451177e-05, + "loss": 0.0020668318029493093, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00207, + "step": 1119, + "tokens/total": 146497536, + "tokens/train_per_sec_per_gpu": 3575.91, + "tokens/trainable": 15593729 + }, + { + "epoch": 3.56687898089172, + "grad_norm": 0.142578125, + "learning_rate": 1.1549934894945045e-05, + "loss": 0.002621435560286045, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00262, + "step": 1120, + "tokens/total": 146628608, + "tokens/train_per_sec_per_gpu": 3223.75, + "tokens/trainable": 15607251 + }, + { + "epoch": 3.5700636942675157, + "grad_norm": 0.16796875, + "learning_rate": 1.1503114247881648e-05, + "loss": 0.002985800849273801, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00299, + "step": 1121, + "tokens/total": 146759680, + "tokens/train_per_sec_per_gpu": 3585.21, + "tokens/trainable": 15622149 + }, + { + "epoch": 3.573248407643312, + "grad_norm": 0.0966796875, + "learning_rate": 1.1456360319708578e-05, + "loss": 0.0013212183257564902, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00132, + "step": 1122, + "tokens/total": 146890752, + "tokens/train_per_sec_per_gpu": 3312.34, + "tokens/trainable": 15636033 + }, + { + "epoch": 3.5764331210191083, + "grad_norm": 0.17578125, + "learning_rate": 1.1409673341543625e-05, + "loss": 0.0023485145065933466, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00235, + "step": 1123, + "tokens/total": 147021824, + "tokens/train_per_sec_per_gpu": 3184.56, + "tokens/trainable": 15649372 + }, + { + "epoch": 3.5796178343949046, + "grad_norm": 0.1767578125, + "learning_rate": 1.1363053544173596e-05, + "loss": 0.002514764666557312, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00252, + "step": 1124, + "tokens/total": 147152896, + "tokens/train_per_sec_per_gpu": 3358.29, + "tokens/trainable": 15663368 + }, + { + "epoch": 3.582802547770701, + "grad_norm": 0.13671875, + "learning_rate": 1.1316501158053216e-05, + "loss": 0.002817730186507106, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00282, + "step": 1125, + "tokens/total": 147283968, + "tokens/train_per_sec_per_gpu": 3488.28, + "tokens/trainable": 15677861 + }, + { + "epoch": 3.5859872611464967, + "grad_norm": 0.150390625, + "learning_rate": 1.1270016413303997e-05, + "loss": 0.0023807904217392206, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00238, + "step": 1126, + "tokens/total": 147415040, + "tokens/train_per_sec_per_gpu": 3351.72, + "tokens/trainable": 15691892 + }, + { + "epoch": 3.589171974522293, + "grad_norm": 0.13671875, + "learning_rate": 1.1223599539713046e-05, + "loss": 0.0022236828226596117, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 1127, + "tokens/total": 147546112, + "tokens/train_per_sec_per_gpu": 3133.35, + "tokens/trainable": 15705012 + }, + { + "epoch": 3.5923566878980893, + "grad_norm": 0.169921875, + "learning_rate": 1.1177250766731992e-05, + "loss": 0.0034954429138451815, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0035, + "step": 1128, + "tokens/total": 147677184, + "tokens/train_per_sec_per_gpu": 3390.97, + "tokens/trainable": 15719238 + }, + { + "epoch": 3.595541401273885, + "grad_norm": 0.1474609375, + "learning_rate": 1.1130970323475825e-05, + "loss": 0.0024684793315827847, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00247, + "step": 1129, + "tokens/total": 147808256, + "tokens/train_per_sec_per_gpu": 3373.56, + "tokens/trainable": 15733335 + }, + { + "epoch": 3.5987261146496814, + "grad_norm": 0.177734375, + "learning_rate": 1.1084758438721743e-05, + "loss": 0.003184695728123188, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00319, + "step": 1130, + "tokens/total": 147939328, + "tokens/train_per_sec_per_gpu": 3255.08, + "tokens/trainable": 15746979 + }, + { + "epoch": 3.6019108280254777, + "grad_norm": 0.154296875, + "learning_rate": 1.103861534090804e-05, + "loss": 0.00223728409036994, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00224, + "step": 1131, + "tokens/total": 148070400, + "tokens/train_per_sec_per_gpu": 3094.33, + "tokens/trainable": 15759937 + }, + { + "epoch": 3.605095541401274, + "grad_norm": 0.244140625, + "learning_rate": 1.0992541258132998e-05, + "loss": 0.0025429693050682545, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00255, + "step": 1132, + "tokens/total": 148201472, + "tokens/train_per_sec_per_gpu": 3264.14, + "tokens/trainable": 15773601 + }, + { + "epoch": 3.6082802547770703, + "grad_norm": 0.2265625, + "learning_rate": 1.0946536418153716e-05, + "loss": 0.0037906889338046312, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0038, + "step": 1133, + "tokens/total": 148332544, + "tokens/train_per_sec_per_gpu": 2941.44, + "tokens/trainable": 15785963 + }, + { + "epoch": 3.611464968152866, + "grad_norm": 0.1767578125, + "learning_rate": 1.0900601048385017e-05, + "loss": 0.0023014359176158905, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0023, + "step": 1134, + "tokens/total": 148463616, + "tokens/train_per_sec_per_gpu": 2661.35, + "tokens/trainable": 15797186 + }, + { + "epoch": 3.6146496815286624, + "grad_norm": 0.1669921875, + "learning_rate": 1.0854735375898328e-05, + "loss": 0.004023172426968813, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00403, + "step": 1135, + "tokens/total": 148594688, + "tokens/train_per_sec_per_gpu": 3525.3, + "tokens/trainable": 15811891 + }, + { + "epoch": 3.6178343949044587, + "grad_norm": 0.14453125, + "learning_rate": 1.0808939627420514e-05, + "loss": 0.0020967398304492235, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0021, + "step": 1136, + "tokens/total": 148725760, + "tokens/train_per_sec_per_gpu": 3402.16, + "tokens/trainable": 15826103 + }, + { + "epoch": 3.6210191082802545, + "grad_norm": 0.1494140625, + "learning_rate": 1.076321402933279e-05, + "loss": 0.002463690470904112, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00247, + "step": 1137, + "tokens/total": 148856832, + "tokens/train_per_sec_per_gpu": 3459.93, + "tokens/trainable": 15840539 + }, + { + "epoch": 3.624203821656051, + "grad_norm": 0.201171875, + "learning_rate": 1.0717558807669631e-05, + "loss": 0.0030937506817281246, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1138, + "tokens/total": 148987904, + "tokens/train_per_sec_per_gpu": 3333.74, + "tokens/trainable": 15854495 + }, + { + "epoch": 3.627388535031847, + "grad_norm": 0.134765625, + "learning_rate": 1.0671974188117572e-05, + "loss": 0.002224976196885109, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 1139, + "tokens/total": 149118976, + "tokens/train_per_sec_per_gpu": 3179.62, + "tokens/trainable": 15867806 + }, + { + "epoch": 3.6305732484076434, + "grad_norm": 0.1767578125, + "learning_rate": 1.0626460396014182e-05, + "loss": 0.0029444252140820026, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00295, + "step": 1140, + "tokens/total": 149250048, + "tokens/train_per_sec_per_gpu": 3422.65, + "tokens/trainable": 15882044 + }, + { + "epoch": 3.6337579617834397, + "grad_norm": 0.185546875, + "learning_rate": 1.0581017656346904e-05, + "loss": 0.0034989488776773214, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00351, + "step": 1141, + "tokens/total": 149381120, + "tokens/train_per_sec_per_gpu": 3507.73, + "tokens/trainable": 15896741 + }, + { + "epoch": 3.6369426751592355, + "grad_norm": 0.1865234375, + "learning_rate": 1.053564619375193e-05, + "loss": 0.002628948539495468, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00263, + "step": 1142, + "tokens/total": 149512192, + "tokens/train_per_sec_per_gpu": 3219.49, + "tokens/trainable": 15910183 + }, + { + "epoch": 3.640127388535032, + "grad_norm": 0.263671875, + "learning_rate": 1.0490346232513113e-05, + "loss": 0.0031747568864375353, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00318, + "step": 1143, + "tokens/total": 149643264, + "tokens/train_per_sec_per_gpu": 3370.78, + "tokens/trainable": 15924212 + }, + { + "epoch": 3.643312101910828, + "grad_norm": 0.208984375, + "learning_rate": 1.0445117996560877e-05, + "loss": 0.003914204426109791, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00392, + "step": 1144, + "tokens/total": 149774336, + "tokens/train_per_sec_per_gpu": 3173.12, + "tokens/trainable": 15937505 + }, + { + "epoch": 3.646496815286624, + "grad_norm": 0.1494140625, + "learning_rate": 1.039996170947106e-05, + "loss": 0.002363776322454214, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00237, + "step": 1145, + "tokens/total": 149905408, + "tokens/train_per_sec_per_gpu": 3147.56, + "tokens/trainable": 15950698 + }, + { + "epoch": 3.6496815286624202, + "grad_norm": 0.16796875, + "learning_rate": 1.0354877594463852e-05, + "loss": 0.0031070299446582794, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00311, + "step": 1146, + "tokens/total": 150036480, + "tokens/train_per_sec_per_gpu": 3364.36, + "tokens/trainable": 15964717 + }, + { + "epoch": 3.6528662420382165, + "grad_norm": 0.1396484375, + "learning_rate": 1.0309865874402688e-05, + "loss": 0.001972392201423645, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00197, + "step": 1147, + "tokens/total": 150167552, + "tokens/train_per_sec_per_gpu": 3018.99, + "tokens/trainable": 15977365 + }, + { + "epoch": 3.656050955414013, + "grad_norm": 0.09423828125, + "learning_rate": 1.026492677179311e-05, + "loss": 0.0011499158572405577, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00115, + "step": 1148, + "tokens/total": 150298624, + "tokens/train_per_sec_per_gpu": 3220.38, + "tokens/trainable": 15990834 + }, + { + "epoch": 3.659235668789809, + "grad_norm": 0.1220703125, + "learning_rate": 1.022006050878169e-05, + "loss": 0.001693375059403479, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00169, + "step": 1149, + "tokens/total": 150429696, + "tokens/train_per_sec_per_gpu": 3186.29, + "tokens/trainable": 16004194 + }, + { + "epoch": 3.662420382165605, + "grad_norm": 0.1455078125, + "learning_rate": 1.0175267307154962e-05, + "loss": 0.0017610186478123069, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00176, + "step": 1150, + "tokens/total": 150560768, + "tokens/train_per_sec_per_gpu": 3312.55, + "tokens/trainable": 16018057 + }, + { + "epoch": 3.6656050955414012, + "grad_norm": 0.1826171875, + "learning_rate": 1.0130547388338268e-05, + "loss": 0.003534915391355753, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00354, + "step": 1151, + "tokens/total": 150691840, + "tokens/train_per_sec_per_gpu": 3383.89, + "tokens/trainable": 16032153 + }, + { + "epoch": 3.6687898089171975, + "grad_norm": 0.1630859375, + "learning_rate": 1.0085900973394708e-05, + "loss": 0.0027439731638878584, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00275, + "step": 1152, + "tokens/total": 150822912, + "tokens/train_per_sec_per_gpu": 3256.01, + "tokens/trainable": 16045798 + }, + { + "epoch": 3.6719745222929934, + "grad_norm": 0.1298828125, + "learning_rate": 1.004132828302404e-05, + "loss": 0.0019469019025564194, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00195, + "step": 1153, + "tokens/total": 150953984, + "tokens/train_per_sec_per_gpu": 3687.58, + "tokens/trainable": 16061140 + }, + { + "epoch": 3.6751592356687897, + "grad_norm": 0.15234375, + "learning_rate": 9.996829537561559e-06, + "loss": 0.0025109422858804464, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00251, + "step": 1154, + "tokens/total": 151085056, + "tokens/train_per_sec_per_gpu": 3535.46, + "tokens/trainable": 16075875 + }, + { + "epoch": 3.678343949044586, + "grad_norm": 0.134765625, + "learning_rate": 9.952404956977032e-06, + "loss": 0.0022808697540313005, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00228, + "step": 1155, + "tokens/total": 151216128, + "tokens/train_per_sec_per_gpu": 3050.84, + "tokens/trainable": 16088674 + }, + { + "epoch": 3.6815286624203822, + "grad_norm": 0.2041015625, + "learning_rate": 9.908054760873633e-06, + "loss": 0.003984857816249132, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00399, + "step": 1156, + "tokens/total": 151347200, + "tokens/train_per_sec_per_gpu": 3167.18, + "tokens/trainable": 16101976 + }, + { + "epoch": 3.6847133757961785, + "grad_norm": 0.16015625, + "learning_rate": 9.863779168486798e-06, + "loss": 0.002358327154070139, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00236, + "step": 1157, + "tokens/total": 151478272, + "tokens/train_per_sec_per_gpu": 3305.89, + "tokens/trainable": 16115788 + }, + { + "epoch": 3.6878980891719744, + "grad_norm": 0.1455078125, + "learning_rate": 9.819578398683202e-06, + "loss": 0.0030925837345421314, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1158, + "tokens/total": 151609344, + "tokens/train_per_sec_per_gpu": 3627.75, + "tokens/trainable": 16130893 + }, + { + "epoch": 3.6910828025477707, + "grad_norm": 0.142578125, + "learning_rate": 9.775452669959651e-06, + "loss": 0.00236108573153615, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00236, + "step": 1159, + "tokens/total": 151740416, + "tokens/train_per_sec_per_gpu": 3497.63, + "tokens/trainable": 16145461 + }, + { + "epoch": 3.694267515923567, + "grad_norm": 0.2119140625, + "learning_rate": 9.731402200441985e-06, + "loss": 0.0027799042873084545, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00278, + "step": 1160, + "tokens/total": 151871488, + "tokens/train_per_sec_per_gpu": 3288.32, + "tokens/trainable": 16159217 + }, + { + "epoch": 3.697452229299363, + "grad_norm": 0.197265625, + "learning_rate": 9.687427207884017e-06, + "loss": 0.004562960006296635, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00457, + "step": 1161, + "tokens/total": 152002560, + "tokens/train_per_sec_per_gpu": 3425.43, + "tokens/trainable": 16173551 + }, + { + "epoch": 3.700636942675159, + "grad_norm": 0.1748046875, + "learning_rate": 9.643527909666484e-06, + "loss": 0.003357633948326111, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00336, + "step": 1162, + "tokens/total": 152133632, + "tokens/train_per_sec_per_gpu": 3255.39, + "tokens/trainable": 16187139 + }, + { + "epoch": 3.7038216560509554, + "grad_norm": 0.1708984375, + "learning_rate": 9.599704522795899e-06, + "loss": 0.0035241839941591024, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00353, + "step": 1163, + "tokens/total": 152264704, + "tokens/train_per_sec_per_gpu": 3360.97, + "tokens/trainable": 16201246 + }, + { + "epoch": 3.7070063694267517, + "grad_norm": 0.12353515625, + "learning_rate": 9.55595726390357e-06, + "loss": 0.0019289179472252727, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00193, + "step": 1164, + "tokens/total": 152395776, + "tokens/train_per_sec_per_gpu": 3708.68, + "tokens/trainable": 16216642 + }, + { + "epoch": 3.710191082802548, + "grad_norm": 0.1279296875, + "learning_rate": 9.512286349244461e-06, + "loss": 0.0024172349367290735, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00242, + "step": 1165, + "tokens/total": 152526848, + "tokens/train_per_sec_per_gpu": 3152.27, + "tokens/trainable": 16229792 + }, + { + "epoch": 3.713375796178344, + "grad_norm": 0.1484375, + "learning_rate": 9.468691994696147e-06, + "loss": 0.0027571492828428745, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00276, + "step": 1166, + "tokens/total": 152657920, + "tokens/train_per_sec_per_gpu": 3544.62, + "tokens/trainable": 16244524 + }, + { + "epoch": 3.71656050955414, + "grad_norm": 0.171875, + "learning_rate": 9.42517441575773e-06, + "loss": 0.002144938800483942, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00215, + "step": 1167, + "tokens/total": 152788992, + "tokens/train_per_sec_per_gpu": 3187.0, + "tokens/trainable": 16257897 + }, + { + "epoch": 3.7197452229299364, + "grad_norm": 0.166015625, + "learning_rate": 9.381733827548825e-06, + "loss": 0.002875394420698285, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00288, + "step": 1168, + "tokens/total": 152920064, + "tokens/train_per_sec_per_gpu": 3371.36, + "tokens/trainable": 16271956 + }, + { + "epoch": 3.722929936305732, + "grad_norm": 0.146484375, + "learning_rate": 9.338370444808417e-06, + "loss": 0.0024918625131249428, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00249, + "step": 1169, + "tokens/total": 153051136, + "tokens/train_per_sec_per_gpu": 3125.56, + "tokens/trainable": 16285073 + }, + { + "epoch": 3.7261146496815285, + "grad_norm": 0.12158203125, + "learning_rate": 9.295084481893876e-06, + "loss": 0.0020116898231208324, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00201, + "step": 1170, + "tokens/total": 153182208, + "tokens/train_per_sec_per_gpu": 3620.37, + "tokens/trainable": 16300140 + }, + { + "epoch": 3.729299363057325, + "grad_norm": 0.1474609375, + "learning_rate": 9.251876152779863e-06, + "loss": 0.002456206362694502, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00246, + "step": 1171, + "tokens/total": 153313280, + "tokens/train_per_sec_per_gpu": 3421.83, + "tokens/trainable": 16314413 + }, + { + "epoch": 3.732484076433121, + "grad_norm": 0.1845703125, + "learning_rate": 9.20874567105725e-06, + "loss": 0.002665320411324501, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00267, + "step": 1172, + "tokens/total": 153444352, + "tokens/train_per_sec_per_gpu": 3586.11, + "tokens/trainable": 16329319 + }, + { + "epoch": 3.7356687898089174, + "grad_norm": 0.150390625, + "learning_rate": 9.165693249932098e-06, + "loss": 0.002760200994089246, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00276, + "step": 1173, + "tokens/total": 153575424, + "tokens/train_per_sec_per_gpu": 3499.69, + "tokens/trainable": 16343957 + }, + { + "epoch": 3.738853503184713, + "grad_norm": 0.1552734375, + "learning_rate": 9.122719102224603e-06, + "loss": 0.003271646797657013, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00328, + "step": 1174, + "tokens/total": 153706496, + "tokens/train_per_sec_per_gpu": 3357.83, + "tokens/trainable": 16358001 + }, + { + "epoch": 3.7420382165605095, + "grad_norm": 0.140625, + "learning_rate": 9.079823440368018e-06, + "loss": 0.0022282477002590895, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 1175, + "tokens/total": 153837568, + "tokens/train_per_sec_per_gpu": 3662.02, + "tokens/trainable": 16373253 + }, + { + "epoch": 3.745222929936306, + "grad_norm": 0.1611328125, + "learning_rate": 9.037006476407628e-06, + "loss": 0.003906633704900742, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00391, + "step": 1176, + "tokens/total": 153968640, + "tokens/train_per_sec_per_gpu": 3414.99, + "tokens/trainable": 16387539 + }, + { + "epoch": 3.7484076433121016, + "grad_norm": 0.2041015625, + "learning_rate": 8.994268421999702e-06, + "loss": 0.0046704974956810474, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00468, + "step": 1177, + "tokens/total": 154099712, + "tokens/train_per_sec_per_gpu": 3318.42, + "tokens/trainable": 16401436 + }, + { + "epoch": 3.7515923566878984, + "grad_norm": 0.154296875, + "learning_rate": 8.951609488410414e-06, + "loss": 0.0023519096430391073, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00235, + "step": 1178, + "tokens/total": 154230784, + "tokens/train_per_sec_per_gpu": 3442.84, + "tokens/trainable": 16415791 + }, + { + "epoch": 3.754777070063694, + "grad_norm": 0.09326171875, + "learning_rate": 8.909029886514828e-06, + "loss": 0.001595214824192226, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1179, + "tokens/total": 154361856, + "tokens/train_per_sec_per_gpu": 3627.66, + "tokens/trainable": 16430902 + }, + { + "epoch": 3.7579617834394905, + "grad_norm": 0.1455078125, + "learning_rate": 8.866529826795866e-06, + "loss": 0.002106869127601385, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00211, + "step": 1180, + "tokens/total": 154492928, + "tokens/train_per_sec_per_gpu": 3379.91, + "tokens/trainable": 16445017 + }, + { + "epoch": 3.761146496815287, + "grad_norm": 0.150390625, + "learning_rate": 8.824109519343227e-06, + "loss": 0.0035120132379233837, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00352, + "step": 1181, + "tokens/total": 154624000, + "tokens/train_per_sec_per_gpu": 3416.65, + "tokens/trainable": 16459298 + }, + { + "epoch": 3.7643312101910826, + "grad_norm": 0.11865234375, + "learning_rate": 8.781769173852392e-06, + "loss": 0.002475301967933774, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00248, + "step": 1182, + "tokens/total": 154755072, + "tokens/train_per_sec_per_gpu": 3225.84, + "tokens/trainable": 16472813 + }, + { + "epoch": 3.767515923566879, + "grad_norm": 0.134765625, + "learning_rate": 8.739508999623563e-06, + "loss": 0.0018928756471723318, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00189, + "step": 1183, + "tokens/total": 154886144, + "tokens/train_per_sec_per_gpu": 3393.68, + "tokens/trainable": 16487035 + }, + { + "epoch": 3.770700636942675, + "grad_norm": 0.1298828125, + "learning_rate": 8.697329205560625e-06, + "loss": 0.0019152449676766992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00192, + "step": 1184, + "tokens/total": 155017216, + "tokens/train_per_sec_per_gpu": 3331.91, + "tokens/trainable": 16500924 + }, + { + "epoch": 3.7738853503184715, + "grad_norm": 0.1328125, + "learning_rate": 8.655230000170117e-06, + "loss": 0.0024345512501895428, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00244, + "step": 1185, + "tokens/total": 155148288, + "tokens/train_per_sec_per_gpu": 3450.15, + "tokens/trainable": 16515278 + }, + { + "epoch": 3.777070063694268, + "grad_norm": 0.125, + "learning_rate": 8.61321159156023e-06, + "loss": 0.0017270749667659402, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00173, + "step": 1186, + "tokens/total": 155279360, + "tokens/train_per_sec_per_gpu": 2874.69, + "tokens/trainable": 16527348 + }, + { + "epoch": 3.7802547770700636, + "grad_norm": 0.1884765625, + "learning_rate": 8.571274187439724e-06, + "loss": 0.0030203748028725386, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00302, + "step": 1187, + "tokens/total": 155410432, + "tokens/train_per_sec_per_gpu": 3409.54, + "tokens/trainable": 16541593 + }, + { + "epoch": 3.78343949044586, + "grad_norm": 0.138671875, + "learning_rate": 8.529417995116947e-06, + "loss": 0.0022753621451556683, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00228, + "step": 1188, + "tokens/total": 155541504, + "tokens/train_per_sec_per_gpu": 3344.52, + "tokens/trainable": 16555605 + }, + { + "epoch": 3.786624203821656, + "grad_norm": 0.1494140625, + "learning_rate": 8.487643221498812e-06, + "loss": 0.0021583903580904007, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00216, + "step": 1189, + "tokens/total": 155672576, + "tokens/train_per_sec_per_gpu": 3003.62, + "tokens/trainable": 16568186 + }, + { + "epoch": 3.789808917197452, + "grad_norm": 0.12255859375, + "learning_rate": 8.445950073089721e-06, + "loss": 0.002155636204406619, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00216, + "step": 1190, + "tokens/total": 155803648, + "tokens/train_per_sec_per_gpu": 3463.82, + "tokens/trainable": 16582617 + }, + { + "epoch": 3.7929936305732483, + "grad_norm": 0.1787109375, + "learning_rate": 8.404338755990587e-06, + "loss": 0.003606649348512292, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00361, + "step": 1191, + "tokens/total": 155934720, + "tokens/train_per_sec_per_gpu": 3331.8, + "tokens/trainable": 16596564 + }, + { + "epoch": 3.7961783439490446, + "grad_norm": 0.1484375, + "learning_rate": 8.362809475897837e-06, + "loss": 0.0030233021825551987, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00303, + "step": 1192, + "tokens/total": 156065792, + "tokens/train_per_sec_per_gpu": 3466.06, + "tokens/trainable": 16611016 + }, + { + "epoch": 3.799363057324841, + "grad_norm": 0.1943359375, + "learning_rate": 8.32136243810233e-06, + "loss": 0.003034008899703622, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00304, + "step": 1193, + "tokens/total": 156196864, + "tokens/train_per_sec_per_gpu": 3277.31, + "tokens/trainable": 16624717 + }, + { + "epoch": 3.802547770700637, + "grad_norm": 0.126953125, + "learning_rate": 8.279997847488399e-06, + "loss": 0.0017860046355053782, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00179, + "step": 1194, + "tokens/total": 156327936, + "tokens/train_per_sec_per_gpu": 3192.19, + "tokens/trainable": 16638031 + }, + { + "epoch": 3.805732484076433, + "grad_norm": 0.2021484375, + "learning_rate": 8.238715908532824e-06, + "loss": 0.003182856598868966, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00319, + "step": 1195, + "tokens/total": 156459008, + "tokens/train_per_sec_per_gpu": 3312.32, + "tokens/trainable": 16651920 + }, + { + "epoch": 3.8089171974522293, + "grad_norm": 0.134765625, + "learning_rate": 8.197516825303792e-06, + "loss": 0.0023445822298526764, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00235, + "step": 1196, + "tokens/total": 156590080, + "tokens/train_per_sec_per_gpu": 3594.12, + "tokens/trainable": 16666821 + }, + { + "epoch": 3.8121019108280256, + "grad_norm": 0.1669921875, + "learning_rate": 8.156400801459912e-06, + "loss": 0.002362563507631421, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00237, + "step": 1197, + "tokens/total": 156721152, + "tokens/train_per_sec_per_gpu": 2878.74, + "tokens/trainable": 16679031 + }, + { + "epoch": 3.8152866242038215, + "grad_norm": 0.173828125, + "learning_rate": 8.115368040249242e-06, + "loss": 0.0029479744844138622, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00295, + "step": 1198, + "tokens/total": 156852224, + "tokens/train_per_sec_per_gpu": 3403.84, + "tokens/trainable": 16693210 + }, + { + "epoch": 3.8184713375796178, + "grad_norm": 0.1376953125, + "learning_rate": 8.074418744508202e-06, + "loss": 0.001919899950735271, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00192, + "step": 1199, + "tokens/total": 156983296, + "tokens/train_per_sec_per_gpu": 3656.6, + "tokens/trainable": 16708430 + }, + { + "epoch": 3.821656050955414, + "grad_norm": 0.1328125, + "learning_rate": 8.03355311666065e-06, + "loss": 0.0024780076928436756, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00248, + "step": 1200, + "tokens/total": 157114368, + "tokens/train_per_sec_per_gpu": 3218.15, + "tokens/trainable": 16721832 + }, + { + "epoch": 3.8248407643312103, + "grad_norm": 0.169921875, + "learning_rate": 7.992771358716852e-06, + "loss": 0.003482515923678875, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00349, + "step": 1201, + "tokens/total": 157245440, + "tokens/train_per_sec_per_gpu": 3264.75, + "tokens/trainable": 16735505 + }, + { + "epoch": 3.8280254777070066, + "grad_norm": 0.1494140625, + "learning_rate": 7.952073672272465e-06, + "loss": 0.002318483777344227, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00232, + "step": 1202, + "tokens/total": 157376512, + "tokens/train_per_sec_per_gpu": 3205.3, + "tokens/trainable": 16748926 + }, + { + "epoch": 3.8312101910828025, + "grad_norm": 0.1591796875, + "learning_rate": 7.91146025850755e-06, + "loss": 0.0027267371769994497, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00273, + "step": 1203, + "tokens/total": 157507584, + "tokens/train_per_sec_per_gpu": 3508.37, + "tokens/trainable": 16763595 + }, + { + "epoch": 3.8343949044585988, + "grad_norm": 0.142578125, + "learning_rate": 7.870931318185615e-06, + "loss": 0.0021403185091912746, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00214, + "step": 1204, + "tokens/total": 157638656, + "tokens/train_per_sec_per_gpu": 3252.09, + "tokens/trainable": 16777230 + }, + { + "epoch": 3.837579617834395, + "grad_norm": 0.1591796875, + "learning_rate": 7.830487051652562e-06, + "loss": 0.0029888248536735773, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00299, + "step": 1205, + "tokens/total": 157769728, + "tokens/train_per_sec_per_gpu": 3605.6, + "tokens/trainable": 16792264 + }, + { + "epoch": 3.840764331210191, + "grad_norm": 0.1103515625, + "learning_rate": 7.790127658835747e-06, + "loss": 0.0014124944573268294, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00141, + "step": 1206, + "tokens/total": 157900800, + "tokens/train_per_sec_per_gpu": 3439.82, + "tokens/trainable": 16806652 + }, + { + "epoch": 3.843949044585987, + "grad_norm": 0.1376953125, + "learning_rate": 7.749853339242972e-06, + "loss": 0.0024581162724643946, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00246, + "step": 1207, + "tokens/total": 158031872, + "tokens/train_per_sec_per_gpu": 3496.89, + "tokens/trainable": 16821214 + }, + { + "epoch": 3.8471337579617835, + "grad_norm": 0.1611328125, + "learning_rate": 7.70966429196148e-06, + "loss": 0.0028864797204732895, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00289, + "step": 1208, + "tokens/total": 158162944, + "tokens/train_per_sec_per_gpu": 3388.5, + "tokens/trainable": 16835398 + }, + { + "epoch": 3.8503184713375798, + "grad_norm": 0.154296875, + "learning_rate": 7.669560715656993e-06, + "loss": 0.0023927215952426195, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0024, + "step": 1209, + "tokens/total": 158294016, + "tokens/train_per_sec_per_gpu": 3419.53, + "tokens/trainable": 16849636 + }, + { + "epoch": 3.853503184713376, + "grad_norm": 0.1435546875, + "learning_rate": 7.629542808572746e-06, + "loss": 0.0018501668237149715, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00185, + "step": 1210, + "tokens/total": 158425088, + "tokens/train_per_sec_per_gpu": 3232.39, + "tokens/trainable": 16863144 + }, + { + "epoch": 3.856687898089172, + "grad_norm": 0.1689453125, + "learning_rate": 7.58961076852846e-06, + "loss": 0.0026476646307855844, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00265, + "step": 1211, + "tokens/total": 158556160, + "tokens/train_per_sec_per_gpu": 3319.46, + "tokens/trainable": 16877036 + }, + { + "epoch": 3.859872611464968, + "grad_norm": 0.162109375, + "learning_rate": 7.549764792919414e-06, + "loss": 0.0031769457273185253, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00318, + "step": 1212, + "tokens/total": 158687232, + "tokens/train_per_sec_per_gpu": 3138.51, + "tokens/trainable": 16890132 + }, + { + "epoch": 3.8630573248407645, + "grad_norm": 0.103515625, + "learning_rate": 7.510005078715443e-06, + "loss": 0.00180210976395756, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0018, + "step": 1213, + "tokens/total": 158818304, + "tokens/train_per_sec_per_gpu": 3359.16, + "tokens/trainable": 16904120 + }, + { + "epoch": 3.8662420382165603, + "grad_norm": 0.1572265625, + "learning_rate": 7.47033182245995e-06, + "loss": 0.003394015831872821, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0034, + "step": 1214, + "tokens/total": 158949376, + "tokens/train_per_sec_per_gpu": 3097.6, + "tokens/trainable": 16917088 + }, + { + "epoch": 3.8694267515923566, + "grad_norm": 0.1533203125, + "learning_rate": 7.430745220268962e-06, + "loss": 0.0019503788789734244, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00195, + "step": 1215, + "tokens/total": 159080448, + "tokens/train_per_sec_per_gpu": 3410.15, + "tokens/trainable": 16931308 + }, + { + "epoch": 3.872611464968153, + "grad_norm": 0.1552734375, + "learning_rate": 7.391245467830163e-06, + "loss": 0.002893456257879734, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0029, + "step": 1216, + "tokens/total": 159211520, + "tokens/train_per_sec_per_gpu": 3457.29, + "tokens/trainable": 16945696 + }, + { + "epoch": 3.875796178343949, + "grad_norm": 0.1640625, + "learning_rate": 7.351832760401892e-06, + "loss": 0.0023777689784765244, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00238, + "step": 1217, + "tokens/total": 159342592, + "tokens/train_per_sec_per_gpu": 2878.05, + "tokens/trainable": 16957864 + }, + { + "epoch": 3.8789808917197455, + "grad_norm": 0.1513671875, + "learning_rate": 7.312507292812215e-06, + "loss": 0.00224723806604743, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00225, + "step": 1218, + "tokens/total": 159473664, + "tokens/train_per_sec_per_gpu": 3022.25, + "tokens/trainable": 16970516 + }, + { + "epoch": 3.8821656050955413, + "grad_norm": 0.1064453125, + "learning_rate": 7.273269259457957e-06, + "loss": 0.0017601789440959692, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00176, + "step": 1219, + "tokens/total": 159604736, + "tokens/train_per_sec_per_gpu": 3153.66, + "tokens/trainable": 16983660 + }, + { + "epoch": 3.8853503184713376, + "grad_norm": 0.15625, + "learning_rate": 7.2341188543036985e-06, + "loss": 0.0024489860516041517, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00245, + "step": 1220, + "tokens/total": 159735808, + "tokens/train_per_sec_per_gpu": 3174.42, + "tokens/trainable": 16996950 + }, + { + "epoch": 3.888535031847134, + "grad_norm": 0.1884765625, + "learning_rate": 7.195056270880887e-06, + "loss": 0.0038972869515419006, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0039, + "step": 1221, + "tokens/total": 159866880, + "tokens/train_per_sec_per_gpu": 3474.13, + "tokens/trainable": 17011372 + }, + { + "epoch": 3.8917197452229297, + "grad_norm": 0.1787109375, + "learning_rate": 7.156081702286813e-06, + "loss": 0.0033518727868795395, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00336, + "step": 1222, + "tokens/total": 159997952, + "tokens/train_per_sec_per_gpu": 3190.66, + "tokens/trainable": 17024744 + }, + { + "epoch": 3.894904458598726, + "grad_norm": 0.1396484375, + "learning_rate": 7.11719534118368e-06, + "loss": 0.00255336775444448, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00256, + "step": 1223, + "tokens/total": 160129024, + "tokens/train_per_sec_per_gpu": 3465.79, + "tokens/trainable": 17039162 + }, + { + "epoch": 3.8980891719745223, + "grad_norm": 0.154296875, + "learning_rate": 7.078397379797711e-06, + "loss": 0.0020744046196341515, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00208, + "step": 1224, + "tokens/total": 160260096, + "tokens/train_per_sec_per_gpu": 3387.57, + "tokens/trainable": 17053370 + }, + { + "epoch": 3.9012738853503186, + "grad_norm": 0.1396484375, + "learning_rate": 7.039688009918083e-06, + "loss": 0.0021676502656191587, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00217, + "step": 1225, + "tokens/total": 160391168, + "tokens/train_per_sec_per_gpu": 3407.72, + "tokens/trainable": 17067644 + }, + { + "epoch": 3.904458598726115, + "grad_norm": 0.171875, + "learning_rate": 7.001067422896063e-06, + "loss": 0.002485244534909725, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00249, + "step": 1226, + "tokens/total": 160522240, + "tokens/train_per_sec_per_gpu": 3749.29, + "tokens/trainable": 17083264 + }, + { + "epoch": 3.9076433121019107, + "grad_norm": 0.173828125, + "learning_rate": 6.9625358096440496e-06, + "loss": 0.0030091169755905867, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00301, + "step": 1227, + "tokens/total": 160653312, + "tokens/train_per_sec_per_gpu": 3670.67, + "tokens/trainable": 17098510 + }, + { + "epoch": 3.910828025477707, + "grad_norm": 0.14453125, + "learning_rate": 6.924093360634601e-06, + "loss": 0.0025889542885124683, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00259, + "step": 1228, + "tokens/total": 160784384, + "tokens/train_per_sec_per_gpu": 3679.38, + "tokens/trainable": 17113820 + }, + { + "epoch": 3.9140127388535033, + "grad_norm": 0.1875, + "learning_rate": 6.885740265899526e-06, + "loss": 0.0027112660463899374, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00271, + "step": 1229, + "tokens/total": 160915456, + "tokens/train_per_sec_per_gpu": 3136.64, + "tokens/trainable": 17126964 + }, + { + "epoch": 3.917197452229299, + "grad_norm": 0.1591796875, + "learning_rate": 6.84747671502893e-06, + "loss": 0.002578144893050194, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00258, + "step": 1230, + "tokens/total": 161046528, + "tokens/train_per_sec_per_gpu": 3162.79, + "tokens/trainable": 17140190 + }, + { + "epoch": 3.9203821656050954, + "grad_norm": 0.2099609375, + "learning_rate": 6.809302897170266e-06, + "loss": 0.00427253358066082, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00428, + "step": 1231, + "tokens/total": 161177600, + "tokens/train_per_sec_per_gpu": 3541.36, + "tokens/trainable": 17154952 + }, + { + "epoch": 3.9235668789808917, + "grad_norm": 0.1552734375, + "learning_rate": 6.771219001027415e-06, + "loss": 0.002364278305321932, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00237, + "step": 1232, + "tokens/total": 161308672, + "tokens/train_per_sec_per_gpu": 3451.33, + "tokens/trainable": 17169330 + }, + { + "epoch": 3.926751592356688, + "grad_norm": 0.1357421875, + "learning_rate": 6.733225214859762e-06, + "loss": 0.0026184916496276855, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00262, + "step": 1233, + "tokens/total": 161439744, + "tokens/train_per_sec_per_gpu": 3611.52, + "tokens/trainable": 17184330 + }, + { + "epoch": 3.9299363057324843, + "grad_norm": 0.1396484375, + "learning_rate": 6.695321726481232e-06, + "loss": 0.0022467318922281265, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00225, + "step": 1234, + "tokens/total": 161570816, + "tokens/train_per_sec_per_gpu": 3270.16, + "tokens/trainable": 17198012 + }, + { + "epoch": 3.93312101910828, + "grad_norm": 0.1484375, + "learning_rate": 6.657508723259404e-06, + "loss": 0.0020928632002323866, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0021, + "step": 1235, + "tokens/total": 161701888, + "tokens/train_per_sec_per_gpu": 3467.93, + "tokens/trainable": 17212436 + }, + { + "epoch": 3.9363057324840764, + "grad_norm": 0.12060546875, + "learning_rate": 6.619786392114557e-06, + "loss": 0.0016596732893958688, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00166, + "step": 1236, + "tokens/total": 161832960, + "tokens/train_per_sec_per_gpu": 3175.9, + "tokens/trainable": 17225718 + }, + { + "epoch": 3.9394904458598727, + "grad_norm": 0.177734375, + "learning_rate": 6.582154919518746e-06, + "loss": 0.0028763054870069027, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00288, + "step": 1237, + "tokens/total": 161964032, + "tokens/train_per_sec_per_gpu": 3459.65, + "tokens/trainable": 17240132 + }, + { + "epoch": 3.9426751592356686, + "grad_norm": 0.1416015625, + "learning_rate": 6.544614491494885e-06, + "loss": 0.0023539350368082523, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00236, + "step": 1238, + "tokens/total": 162095104, + "tokens/train_per_sec_per_gpu": 3782.05, + "tokens/trainable": 17255722 + }, + { + "epoch": 3.945859872611465, + "grad_norm": 0.12353515625, + "learning_rate": 6.507165293615847e-06, + "loss": 0.001856530667282641, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00186, + "step": 1239, + "tokens/total": 162226176, + "tokens/train_per_sec_per_gpu": 3107.57, + "tokens/trainable": 17268686 + }, + { + "epoch": 3.949044585987261, + "grad_norm": 0.2041015625, + "learning_rate": 6.469807511003501e-06, + "loss": 0.0025471888948231936, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00255, + "step": 1240, + "tokens/total": 162357248, + "tokens/train_per_sec_per_gpu": 3185.35, + "tokens/trainable": 17282018 + }, + { + "epoch": 3.9522292993630574, + "grad_norm": 0.185546875, + "learning_rate": 6.432541328327848e-06, + "loss": 0.0031703345011919737, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00318, + "step": 1241, + "tokens/total": 162488320, + "tokens/train_per_sec_per_gpu": 3523.34, + "tokens/trainable": 17296706 + }, + { + "epoch": 3.9554140127388537, + "grad_norm": 0.1591796875, + "learning_rate": 6.395366929806084e-06, + "loss": 0.002728913212195039, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00273, + "step": 1242, + "tokens/total": 162619392, + "tokens/train_per_sec_per_gpu": 3361.25, + "tokens/trainable": 17310780 + }, + { + "epoch": 3.9585987261146496, + "grad_norm": 0.1298828125, + "learning_rate": 6.358284499201681e-06, + "loss": 0.00209011766128242, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00209, + "step": 1243, + "tokens/total": 162750464, + "tokens/train_per_sec_per_gpu": 3299.45, + "tokens/trainable": 17324532 + }, + { + "epoch": 3.961783439490446, + "grad_norm": 0.18359375, + "learning_rate": 6.3212942198234755e-06, + "loss": 0.003096578875556588, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1244, + "tokens/total": 162881536, + "tokens/train_per_sec_per_gpu": 3590.65, + "tokens/trainable": 17339484 + }, + { + "epoch": 3.964968152866242, + "grad_norm": 0.177734375, + "learning_rate": 6.284396274524809e-06, + "loss": 0.002964367624372244, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00297, + "step": 1245, + "tokens/total": 163012608, + "tokens/train_per_sec_per_gpu": 3356.72, + "tokens/trainable": 17353532 + }, + { + "epoch": 3.968152866242038, + "grad_norm": 0.1630859375, + "learning_rate": 6.247590845702553e-06, + "loss": 0.0029587389435619116, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00296, + "step": 1246, + "tokens/total": 163143680, + "tokens/train_per_sec_per_gpu": 3114.15, + "tokens/trainable": 17366524 + }, + { + "epoch": 3.9713375796178343, + "grad_norm": 0.1376953125, + "learning_rate": 6.210878115296267e-06, + "loss": 0.0023161745630204678, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00232, + "step": 1247, + "tokens/total": 163274752, + "tokens/train_per_sec_per_gpu": 3507.4, + "tokens/trainable": 17381160 + }, + { + "epoch": 3.9745222929936306, + "grad_norm": 0.15234375, + "learning_rate": 6.174258264787283e-06, + "loss": 0.002960086800158024, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00296, + "step": 1248, + "tokens/total": 163405824, + "tokens/train_per_sec_per_gpu": 3401.28, + "tokens/trainable": 17395386 + }, + { + "epoch": 3.977707006369427, + "grad_norm": 0.1435546875, + "learning_rate": 6.137731475197775e-06, + "loss": 0.0018720726948231459, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00187, + "step": 1249, + "tokens/total": 163536896, + "tokens/train_per_sec_per_gpu": 3215.48, + "tokens/trainable": 17408856 + }, + { + "epoch": 3.980891719745223, + "grad_norm": 0.142578125, + "learning_rate": 6.101297927089905e-06, + "loss": 0.0030803410336375237, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00309, + "step": 1250, + "tokens/total": 163667968, + "tokens/train_per_sec_per_gpu": 3254.38, + "tokens/trainable": 17422484 + }, + { + "epoch": 3.984076433121019, + "grad_norm": 0.1865234375, + "learning_rate": 6.064957800564924e-06, + "loss": 0.0036575605627149343, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00366, + "step": 1251, + "tokens/total": 163799040, + "tokens/train_per_sec_per_gpu": 3076.57, + "tokens/trainable": 17435390 + }, + { + "epoch": 3.9872611464968153, + "grad_norm": 0.16796875, + "learning_rate": 6.028711275262252e-06, + "loss": 0.002414201619103551, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00242, + "step": 1252, + "tokens/total": 163930112, + "tokens/train_per_sec_per_gpu": 3358.96, + "tokens/trainable": 17449432 + }, + { + "epoch": 3.9904458598726116, + "grad_norm": 0.1572265625, + "learning_rate": 5.992558530358638e-06, + "loss": 0.002453506924211979, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00246, + "step": 1253, + "tokens/total": 164061184, + "tokens/train_per_sec_per_gpu": 3465.43, + "tokens/trainable": 17463888 + }, + { + "epoch": 3.9936305732484074, + "grad_norm": 0.173828125, + "learning_rate": 5.95649974456724e-06, + "loss": 0.0030917448457330465, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1254, + "tokens/total": 164192256, + "tokens/train_per_sec_per_gpu": 3406.16, + "tokens/trainable": 17478118 + }, + { + "epoch": 3.9968152866242037, + "grad_norm": 0.1845703125, + "learning_rate": 5.920535096136737e-06, + "loss": 0.003019727533683181, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00302, + "step": 1255, + "tokens/total": 164323328, + "tokens/train_per_sec_per_gpu": 3013.86, + "tokens/trainable": 17491116 + }, + { + "epoch": 4.0, + "grad_norm": 0.2255859375, + "learning_rate": 5.884664762850467e-06, + "loss": 0.0035042453091591597, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 39.25, + "memory/max_allocated (GiB)": 39.25, + "ppl": 1.00351, + "step": 1256, + "tokens/total": 164397056, + "tokens/train_per_sec_per_gpu": 3307.41, + "tokens/trainable": 17498700 + }, + { + "epoch": 4.0, + "eval_loss": 0.010103554464876652, + "eval_ppl": 1.01015, + "eval_runtime": 43.1815, + "eval_samples_per_second": 62.55, + "eval_steps_per_second": 3.914, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 1256 + }, + { + "epoch": 4.003184713375796, + "grad_norm": 0.1142578125, + "learning_rate": 5.848888922025553e-06, + "loss": 0.0019946754910051823, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.002, + "step": 1257, + "tokens/total": 164528128, + "tokens/train_per_sec_per_gpu": 3365.43, + "tokens/trainable": 17512708 + }, + { + "epoch": 4.006369426751593, + "grad_norm": 0.125, + "learning_rate": 5.813207750511995e-06, + "loss": 0.002120796823874116, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00212, + "step": 1258, + "tokens/total": 164659200, + "tokens/train_per_sec_per_gpu": 3244.01, + "tokens/trainable": 17526132 + }, + { + "epoch": 4.009554140127388, + "grad_norm": 0.142578125, + "learning_rate": 5.777621424691834e-06, + "loss": 0.0018959781154990196, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0019, + "step": 1259, + "tokens/total": 164790272, + "tokens/train_per_sec_per_gpu": 2857.78, + "tokens/trainable": 17538062 + }, + { + "epoch": 4.012738853503185, + "grad_norm": 0.126953125, + "learning_rate": 5.742130120478265e-06, + "loss": 0.002416697796434164, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00242, + "step": 1260, + "tokens/total": 164921344, + "tokens/train_per_sec_per_gpu": 3561.58, + "tokens/trainable": 17552824 + }, + { + "epoch": 4.015923566878981, + "grad_norm": 0.109375, + "learning_rate": 5.706734013314746e-06, + "loss": 0.00218612770549953, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00219, + "step": 1261, + "tokens/total": 165052416, + "tokens/train_per_sec_per_gpu": 3426.83, + "tokens/trainable": 17567024 + }, + { + "epoch": 4.019108280254777, + "grad_norm": 0.10791015625, + "learning_rate": 5.671433278174151e-06, + "loss": 0.0017273772973567247, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00173, + "step": 1262, + "tokens/total": 165183488, + "tokens/train_per_sec_per_gpu": 3392.24, + "tokens/trainable": 17581128 + }, + { + "epoch": 4.022292993630574, + "grad_norm": 0.11328125, + "learning_rate": 5.636228089557926e-06, + "loss": 0.0017078241799026728, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00171, + "step": 1263, + "tokens/total": 165314560, + "tokens/train_per_sec_per_gpu": 3547.78, + "tokens/trainable": 17595852 + }, + { + "epoch": 4.025477707006369, + "grad_norm": 0.10400390625, + "learning_rate": 5.601118621495175e-06, + "loss": 0.0014550643973052502, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00146, + "step": 1264, + "tokens/total": 165445632, + "tokens/train_per_sec_per_gpu": 3380.55, + "tokens/trainable": 17609924 + }, + { + "epoch": 4.028662420382165, + "grad_norm": 0.158203125, + "learning_rate": 5.566105047541847e-06, + "loss": 0.0025803535245358944, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00258, + "step": 1265, + "tokens/total": 165576704, + "tokens/train_per_sec_per_gpu": 3404.37, + "tokens/trainable": 17624118 + }, + { + "epoch": 4.031847133757962, + "grad_norm": 0.12890625, + "learning_rate": 5.531187540779864e-06, + "loss": 0.0025620046071708202, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00257, + "step": 1266, + "tokens/total": 165707776, + "tokens/train_per_sec_per_gpu": 3185.3, + "tokens/trainable": 17637446 + }, + { + "epoch": 4.035031847133758, + "grad_norm": 0.12158203125, + "learning_rate": 5.4963662738162445e-06, + "loss": 0.0018056983826681972, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00181, + "step": 1267, + "tokens/total": 165838848, + "tokens/train_per_sec_per_gpu": 3696.5, + "tokens/trainable": 17652856 + }, + { + "epoch": 4.038216560509555, + "grad_norm": 0.1044921875, + "learning_rate": 5.461641418782268e-06, + "loss": 0.0014057126827538013, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00141, + "step": 1268, + "tokens/total": 165969920, + "tokens/train_per_sec_per_gpu": 3383.57, + "tokens/trainable": 17666958 + }, + { + "epoch": 4.04140127388535, + "grad_norm": 0.1240234375, + "learning_rate": 5.427013147332638e-06, + "loss": 0.0026509405579417944, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00265, + "step": 1269, + "tokens/total": 166100992, + "tokens/train_per_sec_per_gpu": 3482.02, + "tokens/trainable": 17681546 + }, + { + "epoch": 4.044585987261146, + "grad_norm": 0.1318359375, + "learning_rate": 5.392481630644597e-06, + "loss": 0.002696407027542591, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0027, + "step": 1270, + "tokens/total": 166232064, + "tokens/train_per_sec_per_gpu": 3270.12, + "tokens/trainable": 17695240 + }, + { + "epoch": 4.047770700636943, + "grad_norm": 0.11376953125, + "learning_rate": 5.358047039417122e-06, + "loss": 0.0018320954404771328, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00183, + "step": 1271, + "tokens/total": 166363136, + "tokens/train_per_sec_per_gpu": 3274.88, + "tokens/trainable": 17708940 + }, + { + "epoch": 4.050955414012739, + "grad_norm": 0.1201171875, + "learning_rate": 5.323709543870059e-06, + "loss": 0.0021537388674914837, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00216, + "step": 1272, + "tokens/total": 166494208, + "tokens/train_per_sec_per_gpu": 3453.25, + "tokens/trainable": 17723348 + }, + { + "epoch": 4.054140127388535, + "grad_norm": 0.130859375, + "learning_rate": 5.2894693137432645e-06, + "loss": 0.0018690548604354262, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00187, + "step": 1273, + "tokens/total": 166625280, + "tokens/train_per_sec_per_gpu": 3002.21, + "tokens/trainable": 17735952 + }, + { + "epoch": 4.057324840764331, + "grad_norm": 0.162109375, + "learning_rate": 5.255326518295792e-06, + "loss": 0.002879355102777481, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00288, + "step": 1274, + "tokens/total": 166756352, + "tokens/train_per_sec_per_gpu": 3683.9, + "tokens/trainable": 17751344 + }, + { + "epoch": 4.060509554140127, + "grad_norm": 0.1220703125, + "learning_rate": 5.221281326305066e-06, + "loss": 0.0022269003093242645, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 1275, + "tokens/total": 166887424, + "tokens/train_per_sec_per_gpu": 3651.91, + "tokens/trainable": 17766626 + }, + { + "epoch": 4.063694267515924, + "grad_norm": 0.11083984375, + "learning_rate": 5.187333906065999e-06, + "loss": 0.001456463593058288, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00146, + "step": 1276, + "tokens/total": 167018496, + "tokens/train_per_sec_per_gpu": 3273.46, + "tokens/trainable": 17780338 + }, + { + "epoch": 4.06687898089172, + "grad_norm": 0.07763671875, + "learning_rate": 5.15348442539022e-06, + "loss": 0.0010698458645492792, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00107, + "step": 1277, + "tokens/total": 167149568, + "tokens/train_per_sec_per_gpu": 3378.38, + "tokens/trainable": 17794556 + }, + { + "epoch": 4.070063694267516, + "grad_norm": 0.1396484375, + "learning_rate": 5.1197330516052025e-06, + "loss": 0.002229275880381465, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 1278, + "tokens/total": 167280640, + "tokens/train_per_sec_per_gpu": 3141.49, + "tokens/trainable": 17807720 + }, + { + "epoch": 4.073248407643312, + "grad_norm": 0.1513671875, + "learning_rate": 5.086079951553444e-06, + "loss": 0.0030983053147792816, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1279, + "tokens/total": 167411712, + "tokens/train_per_sec_per_gpu": 3466.5, + "tokens/trainable": 17822220 + }, + { + "epoch": 4.076433121019108, + "grad_norm": 0.1611328125, + "learning_rate": 5.052525291591651e-06, + "loss": 0.0031875702552497387, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00319, + "step": 1280, + "tokens/total": 167542784, + "tokens/train_per_sec_per_gpu": 3276.44, + "tokens/trainable": 17835898 + }, + { + "epoch": 4.079617834394904, + "grad_norm": 0.111328125, + "learning_rate": 5.019069237589921e-06, + "loss": 0.0019920531194657087, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00199, + "step": 1281, + "tokens/total": 167673856, + "tokens/train_per_sec_per_gpu": 3318.63, + "tokens/trainable": 17849768 + }, + { + "epoch": 4.082802547770701, + "grad_norm": 0.1328125, + "learning_rate": 4.985711954930902e-06, + "loss": 0.0015500528970733285, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00155, + "step": 1282, + "tokens/total": 167804928, + "tokens/train_per_sec_per_gpu": 3025.08, + "tokens/trainable": 17862448 + }, + { + "epoch": 4.085987261146497, + "grad_norm": 0.138671875, + "learning_rate": 4.952453608509e-06, + "loss": 0.0018041220027953386, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00181, + "step": 1283, + "tokens/total": 167936000, + "tokens/train_per_sec_per_gpu": 3421.79, + "tokens/trainable": 17876746 + }, + { + "epoch": 4.089171974522293, + "grad_norm": 0.109375, + "learning_rate": 4.919294362729551e-06, + "loss": 0.0015523162437602878, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00155, + "step": 1284, + "tokens/total": 168067072, + "tokens/train_per_sec_per_gpu": 3216.81, + "tokens/trainable": 17890232 + }, + { + "epoch": 4.092356687898089, + "grad_norm": 0.1259765625, + "learning_rate": 4.886234381507998e-06, + "loss": 0.0025541428476572037, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00256, + "step": 1285, + "tokens/total": 168198144, + "tokens/train_per_sec_per_gpu": 3474.88, + "tokens/trainable": 17904764 + }, + { + "epoch": 4.095541401273885, + "grad_norm": 0.181640625, + "learning_rate": 4.853273828269089e-06, + "loss": 0.0028677769005298615, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00287, + "step": 1286, + "tokens/total": 168329216, + "tokens/train_per_sec_per_gpu": 3678.61, + "tokens/trainable": 17920040 + }, + { + "epoch": 4.098726114649682, + "grad_norm": 0.1845703125, + "learning_rate": 4.820412865946092e-06, + "loss": 0.003095669439062476, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1287, + "tokens/total": 168460288, + "tokens/train_per_sec_per_gpu": 3226.04, + "tokens/trainable": 17933544 + }, + { + "epoch": 4.101910828025478, + "grad_norm": 0.1044921875, + "learning_rate": 4.787651656979949e-06, + "loss": 0.001217160257510841, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00122, + "step": 1288, + "tokens/total": 168591360, + "tokens/train_per_sec_per_gpu": 3346.97, + "tokens/trainable": 17947562 + }, + { + "epoch": 4.1050955414012735, + "grad_norm": 0.10791015625, + "learning_rate": 4.754990363318501e-06, + "loss": 0.0015003203880041838, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0015, + "step": 1289, + "tokens/total": 168722432, + "tokens/train_per_sec_per_gpu": 3462.58, + "tokens/trainable": 17962074 + }, + { + "epoch": 4.10828025477707, + "grad_norm": 0.10595703125, + "learning_rate": 4.722429146415691e-06, + "loss": 0.001935549546033144, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00194, + "step": 1290, + "tokens/total": 168853504, + "tokens/train_per_sec_per_gpu": 3245.6, + "tokens/trainable": 17975652 + }, + { + "epoch": 4.111464968152866, + "grad_norm": 0.1328125, + "learning_rate": 4.6899681672307346e-06, + "loss": 0.00210759905166924, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00211, + "step": 1291, + "tokens/total": 168984576, + "tokens/train_per_sec_per_gpu": 3417.61, + "tokens/trainable": 17989954 + }, + { + "epoch": 4.114649681528663, + "grad_norm": 0.111328125, + "learning_rate": 4.657607586227345e-06, + "loss": 0.0014702447224408388, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00147, + "step": 1292, + "tokens/total": 169115648, + "tokens/train_per_sec_per_gpu": 3709.61, + "tokens/trainable": 18005404 + }, + { + "epoch": 4.117834394904459, + "grad_norm": 0.12890625, + "learning_rate": 4.625347563372964e-06, + "loss": 0.0019532586447894573, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00196, + "step": 1293, + "tokens/total": 169246720, + "tokens/train_per_sec_per_gpu": 3349.32, + "tokens/trainable": 18019456 + }, + { + "epoch": 4.1210191082802545, + "grad_norm": 0.10205078125, + "learning_rate": 4.593188258137912e-06, + "loss": 0.0014989221235737205, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0015, + "step": 1294, + "tokens/total": 169377792, + "tokens/train_per_sec_per_gpu": 3412.09, + "tokens/trainable": 18033708 + }, + { + "epoch": 4.124203821656051, + "grad_norm": 0.1240234375, + "learning_rate": 4.5611298294946596e-06, + "loss": 0.0016446541994810104, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00165, + "step": 1295, + "tokens/total": 169508864, + "tokens/train_per_sec_per_gpu": 3299.98, + "tokens/trainable": 18047556 + }, + { + "epoch": 4.127388535031847, + "grad_norm": 0.12890625, + "learning_rate": 4.529172435917012e-06, + "loss": 0.001521661994047463, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00152, + "step": 1296, + "tokens/total": 169639936, + "tokens/train_per_sec_per_gpu": 3090.9, + "tokens/trainable": 18060560 + }, + { + "epoch": 4.130573248407643, + "grad_norm": 0.1357421875, + "learning_rate": 4.497316235379323e-06, + "loss": 0.002716638380661607, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00272, + "step": 1297, + "tokens/total": 169771008, + "tokens/train_per_sec_per_gpu": 3121.65, + "tokens/trainable": 18073696 + }, + { + "epoch": 4.13375796178344, + "grad_norm": 0.1484375, + "learning_rate": 4.465561385355712e-06, + "loss": 0.0017709597013890743, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00177, + "step": 1298, + "tokens/total": 169902080, + "tokens/train_per_sec_per_gpu": 3532.51, + "tokens/trainable": 18088448 + }, + { + "epoch": 4.1369426751592355, + "grad_norm": 0.11474609375, + "learning_rate": 4.433908042819323e-06, + "loss": 0.0015186622040346265, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00152, + "step": 1299, + "tokens/total": 170033152, + "tokens/train_per_sec_per_gpu": 3144.35, + "tokens/trainable": 18101652 + }, + { + "epoch": 4.140127388535032, + "grad_norm": 0.11767578125, + "learning_rate": 4.402356364241489e-06, + "loss": 0.001659161178395152, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00166, + "step": 1300, + "tokens/total": 170164224, + "tokens/train_per_sec_per_gpu": 3339.31, + "tokens/trainable": 18115704 + }, + { + "epoch": 4.143312101910828, + "grad_norm": 0.11474609375, + "learning_rate": 4.370906505591007e-06, + "loss": 0.0014578705886378884, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00146, + "step": 1301, + "tokens/total": 170295296, + "tokens/train_per_sec_per_gpu": 3235.03, + "tokens/trainable": 18129240 + }, + { + "epoch": 4.146496815286624, + "grad_norm": 0.13671875, + "learning_rate": 4.339558622333353e-06, + "loss": 0.0024085917975753546, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 1302, + "tokens/total": 170426368, + "tokens/train_per_sec_per_gpu": 3590.36, + "tokens/trainable": 18144174 + }, + { + "epoch": 4.149681528662421, + "grad_norm": 0.1669921875, + "learning_rate": 4.308312869429898e-06, + "loss": 0.0028695266228169203, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00287, + "step": 1303, + "tokens/total": 170557440, + "tokens/train_per_sec_per_gpu": 3280.59, + "tokens/trainable": 18157882 + }, + { + "epoch": 4.1528662420382165, + "grad_norm": 0.13671875, + "learning_rate": 4.27716940133715e-06, + "loss": 0.0024525150656700134, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00246, + "step": 1304, + "tokens/total": 170688512, + "tokens/train_per_sec_per_gpu": 3221.62, + "tokens/trainable": 18171456 + }, + { + "epoch": 4.156050955414012, + "grad_norm": 0.138671875, + "learning_rate": 4.246128372006017e-06, + "loss": 0.00208856794051826, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00209, + "step": 1305, + "tokens/total": 170819584, + "tokens/train_per_sec_per_gpu": 3030.29, + "tokens/trainable": 18184178 + }, + { + "epoch": 4.159235668789809, + "grad_norm": 0.119140625, + "learning_rate": 4.215189934881001e-06, + "loss": 0.0016645672731101513, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00167, + "step": 1306, + "tokens/total": 170950656, + "tokens/train_per_sec_per_gpu": 3509.84, + "tokens/trainable": 18198820 + }, + { + "epoch": 4.162420382165605, + "grad_norm": 0.080078125, + "learning_rate": 4.1843542428994685e-06, + "loss": 0.0010691338684409857, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00107, + "step": 1307, + "tokens/total": 171081728, + "tokens/train_per_sec_per_gpu": 2998.84, + "tokens/trainable": 18211426 + }, + { + "epoch": 4.165605095541402, + "grad_norm": 0.140625, + "learning_rate": 4.153621448490905e-06, + "loss": 0.0030363069381564856, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00304, + "step": 1308, + "tokens/total": 171212800, + "tokens/train_per_sec_per_gpu": 3613.64, + "tokens/trainable": 18226492 + }, + { + "epoch": 4.1687898089171975, + "grad_norm": 0.1650390625, + "learning_rate": 4.122991703576121e-06, + "loss": 0.0039181094616651535, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00393, + "step": 1309, + "tokens/total": 171343872, + "tokens/train_per_sec_per_gpu": 3326.43, + "tokens/trainable": 18240364 + }, + { + "epoch": 4.171974522292993, + "grad_norm": 0.1494140625, + "learning_rate": 4.092465159566525e-06, + "loss": 0.0018522969912737608, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00185, + "step": 1310, + "tokens/total": 171474944, + "tokens/train_per_sec_per_gpu": 3142.02, + "tokens/trainable": 18253528 + }, + { + "epoch": 4.17515923566879, + "grad_norm": 0.1279296875, + "learning_rate": 4.062041967363395e-06, + "loss": 0.0022721600253134966, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00227, + "step": 1311, + "tokens/total": 171606016, + "tokens/train_per_sec_per_gpu": 3346.0, + "tokens/trainable": 18267536 + }, + { + "epoch": 4.178343949044586, + "grad_norm": 0.1328125, + "learning_rate": 4.031722277357086e-06, + "loss": 0.0017200830625370145, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00172, + "step": 1312, + "tokens/total": 171737088, + "tokens/train_per_sec_per_gpu": 3265.04, + "tokens/trainable": 18281204 + }, + { + "epoch": 4.181528662420382, + "grad_norm": 0.119140625, + "learning_rate": 4.001506239426339e-06, + "loss": 0.0018201316706836224, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00182, + "step": 1313, + "tokens/total": 171868160, + "tokens/train_per_sec_per_gpu": 3661.03, + "tokens/trainable": 18296460 + }, + { + "epoch": 4.1847133757961785, + "grad_norm": 0.09326171875, + "learning_rate": 3.971394002937501e-06, + "loss": 0.0008904569549486041, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00089, + "step": 1314, + "tokens/total": 171999232, + "tokens/train_per_sec_per_gpu": 2682.19, + "tokens/trainable": 18307764 + }, + { + "epoch": 4.187898089171974, + "grad_norm": 0.115234375, + "learning_rate": 3.941385716743795e-06, + "loss": 0.0016649002209305763, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00167, + "step": 1315, + "tokens/total": 172130304, + "tokens/train_per_sec_per_gpu": 3367.15, + "tokens/trainable": 18321816 + }, + { + "epoch": 4.191082802547771, + "grad_norm": 0.11181640625, + "learning_rate": 3.911481529184588e-06, + "loss": 0.0019004822243005037, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0019, + "step": 1316, + "tokens/total": 172261376, + "tokens/train_per_sec_per_gpu": 3450.07, + "tokens/trainable": 18336222 + }, + { + "epoch": 4.194267515923567, + "grad_norm": 0.12353515625, + "learning_rate": 3.881681588084674e-06, + "loss": 0.0020820728968828917, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00208, + "step": 1317, + "tokens/total": 172392448, + "tokens/train_per_sec_per_gpu": 3549.92, + "tokens/trainable": 18351078 + }, + { + "epoch": 4.197452229299363, + "grad_norm": 0.16015625, + "learning_rate": 3.851986040753505e-06, + "loss": 0.002381009515374899, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00238, + "step": 1318, + "tokens/total": 172523520, + "tokens/train_per_sec_per_gpu": 3442.29, + "tokens/trainable": 18365480 + }, + { + "epoch": 4.2006369426751595, + "grad_norm": 0.08056640625, + "learning_rate": 3.822395033984502e-06, + "loss": 0.0012018627021461725, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0012, + "step": 1319, + "tokens/total": 172654592, + "tokens/train_per_sec_per_gpu": 3791.81, + "tokens/trainable": 18381260 + }, + { + "epoch": 4.203821656050955, + "grad_norm": 0.142578125, + "learning_rate": 3.792908714054316e-06, + "loss": 0.002608443144708872, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00261, + "step": 1320, + "tokens/total": 172785664, + "tokens/train_per_sec_per_gpu": 3220.83, + "tokens/trainable": 18394756 + }, + { + "epoch": 4.207006369426751, + "grad_norm": 0.12060546875, + "learning_rate": 3.7635272267220858e-06, + "loss": 0.0018472287338227034, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00185, + "step": 1321, + "tokens/total": 172916736, + "tokens/train_per_sec_per_gpu": 3254.63, + "tokens/trainable": 18408414 + }, + { + "epoch": 4.210191082802548, + "grad_norm": 0.1669921875, + "learning_rate": 3.734250717228735e-06, + "loss": 0.00441823760047555, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00443, + "step": 1322, + "tokens/total": 173047808, + "tokens/train_per_sec_per_gpu": 3065.51, + "tokens/trainable": 18421262 + }, + { + "epoch": 4.213375796178344, + "grad_norm": 0.10791015625, + "learning_rate": 3.7050793302962685e-06, + "loss": 0.0016929913545027375, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00169, + "step": 1323, + "tokens/total": 173178880, + "tokens/train_per_sec_per_gpu": 3282.3, + "tokens/trainable": 18434992 + }, + { + "epoch": 4.2165605095541405, + "grad_norm": 0.1455078125, + "learning_rate": 3.676013210127022e-06, + "loss": 0.0025385431945323944, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00254, + "step": 1324, + "tokens/total": 173309952, + "tokens/train_per_sec_per_gpu": 3103.93, + "tokens/trainable": 18447992 + }, + { + "epoch": 4.219745222929936, + "grad_norm": 0.1123046875, + "learning_rate": 3.647052500402981e-06, + "loss": 0.0015956538263708353, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1325, + "tokens/total": 173441024, + "tokens/train_per_sec_per_gpu": 3147.17, + "tokens/trainable": 18461184 + }, + { + "epoch": 4.222929936305732, + "grad_norm": 0.1259765625, + "learning_rate": 3.6181973442850597e-06, + "loss": 0.001635034685023129, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00164, + "step": 1326, + "tokens/total": 173572096, + "tokens/train_per_sec_per_gpu": 3715.4, + "tokens/trainable": 18476658 + }, + { + "epoch": 4.226114649681529, + "grad_norm": 0.1416015625, + "learning_rate": 3.589447884412378e-06, + "loss": 0.0025338195264339447, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00254, + "step": 1327, + "tokens/total": 173703168, + "tokens/train_per_sec_per_gpu": 3364.82, + "tokens/trainable": 18490728 + }, + { + "epoch": 4.229299363057325, + "grad_norm": 0.1025390625, + "learning_rate": 3.5608042629015707e-06, + "loss": 0.001267962739802897, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00127, + "step": 1328, + "tokens/total": 173834240, + "tokens/train_per_sec_per_gpu": 3268.7, + "tokens/trainable": 18504432 + }, + { + "epoch": 4.232484076433121, + "grad_norm": 0.1396484375, + "learning_rate": 3.532266621346103e-06, + "loss": 0.0019486568635329604, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00195, + "step": 1329, + "tokens/total": 173965312, + "tokens/train_per_sec_per_gpu": 3287.05, + "tokens/trainable": 18518208 + }, + { + "epoch": 4.235668789808917, + "grad_norm": 0.1748046875, + "learning_rate": 3.5038351008155226e-06, + "loss": 0.002935834927484393, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00294, + "step": 1330, + "tokens/total": 174096384, + "tokens/train_per_sec_per_gpu": 3210.37, + "tokens/trainable": 18531630 + }, + { + "epoch": 4.238853503184713, + "grad_norm": 0.11767578125, + "learning_rate": 3.4755098418548155e-06, + "loss": 0.0018774013733491302, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00188, + "step": 1331, + "tokens/total": 174227456, + "tokens/train_per_sec_per_gpu": 3384.93, + "tokens/trainable": 18545764 + }, + { + "epoch": 4.24203821656051, + "grad_norm": 0.11083984375, + "learning_rate": 3.4472909844836837e-06, + "loss": 0.001764771994203329, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00177, + "step": 1332, + "tokens/total": 174358528, + "tokens/train_per_sec_per_gpu": 3577.08, + "tokens/trainable": 18560662 + }, + { + "epoch": 4.245222929936306, + "grad_norm": 0.130859375, + "learning_rate": 3.4191786681958437e-06, + "loss": 0.0026986815501004457, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0027, + "step": 1333, + "tokens/total": 174489600, + "tokens/train_per_sec_per_gpu": 3282.73, + "tokens/trainable": 18574404 + }, + { + "epoch": 4.248407643312102, + "grad_norm": 0.138671875, + "learning_rate": 3.39117303195835e-06, + "loss": 0.0022144122049212456, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00222, + "step": 1334, + "tokens/total": 174620672, + "tokens/train_per_sec_per_gpu": 3194.16, + "tokens/trainable": 18587768 + }, + { + "epoch": 4.251592356687898, + "grad_norm": 0.1435546875, + "learning_rate": 3.3632742142109293e-06, + "loss": 0.00270890723913908, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00271, + "step": 1335, + "tokens/total": 174751744, + "tokens/train_per_sec_per_gpu": 3411.3, + "tokens/trainable": 18602002 + }, + { + "epoch": 4.254777070063694, + "grad_norm": 0.16796875, + "learning_rate": 3.3354823528652463e-06, + "loss": 0.0023235008120536804, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00233, + "step": 1336, + "tokens/total": 174882816, + "tokens/train_per_sec_per_gpu": 3230.27, + "tokens/trainable": 18615552 + }, + { + "epoch": 4.25796178343949, + "grad_norm": 0.1767578125, + "learning_rate": 3.3077975853042703e-06, + "loss": 0.002815892221406102, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00282, + "step": 1337, + "tokens/total": 175013888, + "tokens/train_per_sec_per_gpu": 3239.06, + "tokens/trainable": 18629086 + }, + { + "epoch": 4.261146496815287, + "grad_norm": 0.166015625, + "learning_rate": 3.280220048381574e-06, + "loss": 0.002695944393053651, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0027, + "step": 1338, + "tokens/total": 175144960, + "tokens/train_per_sec_per_gpu": 3597.1, + "tokens/trainable": 18644090 + }, + { + "epoch": 4.264331210191083, + "grad_norm": 0.1396484375, + "learning_rate": 3.252749878420647e-06, + "loss": 0.0021448852494359016, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00215, + "step": 1339, + "tokens/total": 175276032, + "tokens/train_per_sec_per_gpu": 3364.52, + "tokens/trainable": 18658172 + }, + { + "epoch": 4.267515923566879, + "grad_norm": 0.12060546875, + "learning_rate": 3.225387211214237e-06, + "loss": 0.001379702938720584, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00138, + "step": 1340, + "tokens/total": 175407104, + "tokens/train_per_sec_per_gpu": 3324.64, + "tokens/trainable": 18672098 + }, + { + "epoch": 4.270700636942675, + "grad_norm": 0.146484375, + "learning_rate": 3.1981321820236885e-06, + "loss": 0.002582112792879343, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00259, + "step": 1341, + "tokens/total": 175538176, + "tokens/train_per_sec_per_gpu": 3665.7, + "tokens/trainable": 18687300 + }, + { + "epoch": 4.273885350318471, + "grad_norm": 0.126953125, + "learning_rate": 3.1709849255782466e-06, + "loss": 0.0017478655790910125, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00175, + "step": 1342, + "tokens/total": 175669248, + "tokens/train_per_sec_per_gpu": 3348.03, + "tokens/trainable": 18701322 + }, + { + "epoch": 4.277070063694268, + "grad_norm": 0.12890625, + "learning_rate": 3.1439455760744112e-06, + "loss": 0.0016232930356636643, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00162, + "step": 1343, + "tokens/total": 175800320, + "tokens/train_per_sec_per_gpu": 3355.1, + "tokens/trainable": 18715342 + }, + { + "epoch": 4.280254777070064, + "grad_norm": 0.1005859375, + "learning_rate": 3.117014267175275e-06, + "loss": 0.0013508808333426714, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00135, + "step": 1344, + "tokens/total": 175931392, + "tokens/train_per_sec_per_gpu": 3356.71, + "tokens/trainable": 18729364 + }, + { + "epoch": 4.2834394904458595, + "grad_norm": 0.158203125, + "learning_rate": 3.0901911320098426e-06, + "loss": 0.002793082967400551, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0028, + "step": 1345, + "tokens/total": 176062464, + "tokens/train_per_sec_per_gpu": 3247.41, + "tokens/trainable": 18742944 + }, + { + "epoch": 4.286624203821656, + "grad_norm": 0.130859375, + "learning_rate": 3.0634763031723882e-06, + "loss": 0.0016741371946409345, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00168, + "step": 1346, + "tokens/total": 176193536, + "tokens/train_per_sec_per_gpu": 3132.92, + "tokens/trainable": 18756064 + }, + { + "epoch": 4.289808917197452, + "grad_norm": 0.0947265625, + "learning_rate": 3.036869912721807e-06, + "loss": 0.0012669205898419023, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00127, + "step": 1347, + "tokens/total": 176324608, + "tokens/train_per_sec_per_gpu": 3626.61, + "tokens/trainable": 18771148 + }, + { + "epoch": 4.292993630573249, + "grad_norm": 0.095703125, + "learning_rate": 3.010372092180941e-06, + "loss": 0.0014189573703333735, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00142, + "step": 1348, + "tokens/total": 176455680, + "tokens/train_per_sec_per_gpu": 3173.45, + "tokens/trainable": 18784398 + }, + { + "epoch": 4.296178343949045, + "grad_norm": 0.1279296875, + "learning_rate": 2.983982972535948e-06, + "loss": 0.0028286417946219444, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00283, + "step": 1349, + "tokens/total": 176586752, + "tokens/train_per_sec_per_gpu": 3281.6, + "tokens/trainable": 18798140 + }, + { + "epoch": 4.2993630573248405, + "grad_norm": 0.1552734375, + "learning_rate": 2.9577026842356527e-06, + "loss": 0.002894408069550991, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0029, + "step": 1350, + "tokens/total": 176717824, + "tokens/train_per_sec_per_gpu": 3578.34, + "tokens/trainable": 18813032 + }, + { + "epoch": 4.302547770700637, + "grad_norm": 0.154296875, + "learning_rate": 2.931531357190881e-06, + "loss": 0.0021069981157779694, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00211, + "step": 1351, + "tokens/total": 176848896, + "tokens/train_per_sec_per_gpu": 3439.97, + "tokens/trainable": 18827370 + }, + { + "epoch": 4.305732484076433, + "grad_norm": 0.11376953125, + "learning_rate": 2.905469120773835e-06, + "loss": 0.002290198812261224, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00229, + "step": 1352, + "tokens/total": 176979968, + "tokens/train_per_sec_per_gpu": 3402.53, + "tokens/trainable": 18841572 + }, + { + "epoch": 4.308917197452229, + "grad_norm": 0.16015625, + "learning_rate": 2.8795161038174675e-06, + "loss": 0.0023499338421970606, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00235, + "step": 1353, + "tokens/total": 177111040, + "tokens/train_per_sec_per_gpu": 3406.22, + "tokens/trainable": 18855776 + }, + { + "epoch": 4.312101910828026, + "grad_norm": 0.10205078125, + "learning_rate": 2.853672434614807e-06, + "loss": 0.0013938483316451311, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00139, + "step": 1354, + "tokens/total": 177242112, + "tokens/train_per_sec_per_gpu": 3788.18, + "tokens/trainable": 18871550 + }, + { + "epoch": 4.3152866242038215, + "grad_norm": 0.1328125, + "learning_rate": 2.8279382409183598e-06, + "loss": 0.0020433831959962845, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1355, + "tokens/total": 177373184, + "tokens/train_per_sec_per_gpu": 3264.03, + "tokens/trainable": 18885240 + }, + { + "epoch": 4.318471337579618, + "grad_norm": 0.10302734375, + "learning_rate": 2.802313649939467e-06, + "loss": 0.0011658279690891504, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00117, + "step": 1356, + "tokens/total": 177504256, + "tokens/train_per_sec_per_gpu": 3001.88, + "tokens/trainable": 18897798 + }, + { + "epoch": 4.321656050955414, + "grad_norm": 0.1318359375, + "learning_rate": 2.7767987883476622e-06, + "loss": 0.0021784165874123573, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00218, + "step": 1357, + "tokens/total": 177635328, + "tokens/train_per_sec_per_gpu": 3337.15, + "tokens/trainable": 18911724 + }, + { + "epoch": 4.32484076433121, + "grad_norm": 0.1396484375, + "learning_rate": 2.7513937822700508e-06, + "loss": 0.002125969622284174, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00213, + "step": 1358, + "tokens/total": 177766400, + "tokens/train_per_sec_per_gpu": 3060.55, + "tokens/trainable": 18924596 + }, + { + "epoch": 4.328025477707007, + "grad_norm": 0.142578125, + "learning_rate": 2.7260987572907153e-06, + "loss": 0.0018263484816998243, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00183, + "step": 1359, + "tokens/total": 177897472, + "tokens/train_per_sec_per_gpu": 3272.47, + "tokens/trainable": 18938296 + }, + { + "epoch": 4.3312101910828025, + "grad_norm": 0.11865234375, + "learning_rate": 2.700913838450042e-06, + "loss": 0.0014197917189449072, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00142, + "step": 1360, + "tokens/total": 178028544, + "tokens/train_per_sec_per_gpu": 3245.66, + "tokens/trainable": 18951818 + }, + { + "epoch": 4.334394904458598, + "grad_norm": 0.11376953125, + "learning_rate": 2.675839150244153e-06, + "loss": 0.0016245257575064898, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00163, + "step": 1361, + "tokens/total": 178159616, + "tokens/train_per_sec_per_gpu": 3313.42, + "tokens/trainable": 18965684 + }, + { + "epoch": 4.337579617834395, + "grad_norm": 0.1376953125, + "learning_rate": 2.650874816624266e-06, + "loss": 0.0019813040271401405, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00198, + "step": 1362, + "tokens/total": 178290688, + "tokens/train_per_sec_per_gpu": 3158.46, + "tokens/trainable": 18978904 + }, + { + "epoch": 4.340764331210191, + "grad_norm": 0.130859375, + "learning_rate": 2.6260209609960757e-06, + "loss": 0.0024794619530439377, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00248, + "step": 1363, + "tokens/total": 178421760, + "tokens/train_per_sec_per_gpu": 3507.13, + "tokens/trainable": 18993500 + }, + { + "epoch": 4.343949044585988, + "grad_norm": 0.1474609375, + "learning_rate": 2.6012777062191547e-06, + "loss": 0.002862154971808195, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00287, + "step": 1364, + "tokens/total": 178552832, + "tokens/train_per_sec_per_gpu": 3531.99, + "tokens/trainable": 19008250 + }, + { + "epoch": 4.3471337579617835, + "grad_norm": 0.10693359375, + "learning_rate": 2.5766451746063598e-06, + "loss": 0.0013462984934449196, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00135, + "step": 1365, + "tokens/total": 178683904, + "tokens/train_per_sec_per_gpu": 3460.1, + "tokens/trainable": 19022668 + }, + { + "epoch": 4.350318471337579, + "grad_norm": 0.142578125, + "learning_rate": 2.5521234879231887e-06, + "loss": 0.002731763059273362, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00274, + "step": 1366, + "tokens/total": 178814976, + "tokens/train_per_sec_per_gpu": 3570.61, + "tokens/trainable": 19037614 + }, + { + "epoch": 4.353503184713376, + "grad_norm": 0.1083984375, + "learning_rate": 2.527712767387222e-06, + "loss": 0.0014442024985328317, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00145, + "step": 1367, + "tokens/total": 178946048, + "tokens/train_per_sec_per_gpu": 3629.25, + "tokens/trainable": 19052728 + }, + { + "epoch": 4.356687898089172, + "grad_norm": 0.123046875, + "learning_rate": 2.5034131336674956e-06, + "loss": 0.0018038805574178696, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00181, + "step": 1368, + "tokens/total": 179077120, + "tokens/train_per_sec_per_gpu": 3323.04, + "tokens/trainable": 19066614 + }, + { + "epoch": 4.359872611464968, + "grad_norm": 0.177734375, + "learning_rate": 2.4792247068839064e-06, + "loss": 0.0023225173354148865, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00233, + "step": 1369, + "tokens/total": 179208192, + "tokens/train_per_sec_per_gpu": 3675.14, + "tokens/trainable": 19081856 + }, + { + "epoch": 4.3630573248407645, + "grad_norm": 0.158203125, + "learning_rate": 2.4551476066066307e-06, + "loss": 0.003056393703445792, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00306, + "step": 1370, + "tokens/total": 179339264, + "tokens/train_per_sec_per_gpu": 3829.68, + "tokens/trainable": 19097794 + }, + { + "epoch": 4.36624203821656, + "grad_norm": 0.1806640625, + "learning_rate": 2.4311819518555295e-06, + "loss": 0.0030934589449316263, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0031, + "step": 1371, + "tokens/total": 179470336, + "tokens/train_per_sec_per_gpu": 3060.11, + "tokens/trainable": 19110620 + }, + { + "epoch": 4.369426751592357, + "grad_norm": 0.14453125, + "learning_rate": 2.407327861099548e-06, + "loss": 0.0017585513414815068, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00176, + "step": 1372, + "tokens/total": 179601408, + "tokens/train_per_sec_per_gpu": 3435.78, + "tokens/trainable": 19125008 + }, + { + "epoch": 4.372611464968153, + "grad_norm": 0.09814453125, + "learning_rate": 2.383585452256146e-06, + "loss": 0.0014080167748034, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00141, + "step": 1373, + "tokens/total": 179732480, + "tokens/train_per_sec_per_gpu": 3677.42, + "tokens/trainable": 19140292 + }, + { + "epoch": 4.375796178343949, + "grad_norm": 0.12109375, + "learning_rate": 2.359954842690712e-06, + "loss": 0.0016012933338060975, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1374, + "tokens/total": 179863552, + "tokens/train_per_sec_per_gpu": 2983.63, + "tokens/trainable": 19152852 + }, + { + "epoch": 4.3789808917197455, + "grad_norm": 0.1455078125, + "learning_rate": 2.336436149215973e-06, + "loss": 0.00259294337593019, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0026, + "step": 1375, + "tokens/total": 179994624, + "tokens/train_per_sec_per_gpu": 3180.68, + "tokens/trainable": 19166198 + }, + { + "epoch": 4.382165605095541, + "grad_norm": 0.1171875, + "learning_rate": 2.3130294880914173e-06, + "loss": 0.0015589562244713306, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00156, + "step": 1376, + "tokens/total": 180125696, + "tokens/train_per_sec_per_gpu": 3318.63, + "tokens/trainable": 19180092 + }, + { + "epoch": 4.385350318471337, + "grad_norm": 0.146484375, + "learning_rate": 2.289734975022742e-06, + "loss": 0.0024165399372577667, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00242, + "step": 1377, + "tokens/total": 180256768, + "tokens/train_per_sec_per_gpu": 3487.98, + "tokens/trainable": 19194700 + }, + { + "epoch": 4.388535031847134, + "grad_norm": 0.10791015625, + "learning_rate": 2.266552725161247e-06, + "loss": 0.0012368856696411967, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00124, + "step": 1378, + "tokens/total": 180387840, + "tokens/train_per_sec_per_gpu": 3407.24, + "tokens/trainable": 19208968 + }, + { + "epoch": 4.39171974522293, + "grad_norm": 0.1318359375, + "learning_rate": 2.2434828531032988e-06, + "loss": 0.002780412556603551, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00278, + "step": 1379, + "tokens/total": 180518912, + "tokens/train_per_sec_per_gpu": 3096.75, + "tokens/trainable": 19221936 + }, + { + "epoch": 4.3949044585987265, + "grad_norm": 0.10400390625, + "learning_rate": 2.220525472889748e-06, + "loss": 0.0017908208537846804, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00179, + "step": 1380, + "tokens/total": 180649984, + "tokens/train_per_sec_per_gpu": 3574.84, + "tokens/trainable": 19236852 + }, + { + "epoch": 4.398089171974522, + "grad_norm": 0.150390625, + "learning_rate": 2.1976806980053556e-06, + "loss": 0.0019308646442368627, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00193, + "step": 1381, + "tokens/total": 180781056, + "tokens/train_per_sec_per_gpu": 3120.85, + "tokens/trainable": 19249910 + }, + { + "epoch": 4.401273885350318, + "grad_norm": 0.142578125, + "learning_rate": 2.1749486413782437e-06, + "loss": 0.001861095312051475, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00186, + "step": 1382, + "tokens/total": 180912128, + "tokens/train_per_sec_per_gpu": 3109.88, + "tokens/trainable": 19262932 + }, + { + "epoch": 4.404458598726115, + "grad_norm": 0.1279296875, + "learning_rate": 2.1523294153793532e-06, + "loss": 0.0020333300344645977, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00204, + "step": 1383, + "tokens/total": 181043200, + "tokens/train_per_sec_per_gpu": 3773.27, + "tokens/trainable": 19278666 + }, + { + "epoch": 4.407643312101911, + "grad_norm": 0.1357421875, + "learning_rate": 2.129823131821848e-06, + "loss": 0.0016740905120968819, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00168, + "step": 1384, + "tokens/total": 181174272, + "tokens/train_per_sec_per_gpu": 3304.58, + "tokens/trainable": 19292464 + }, + { + "epoch": 4.4108280254777075, + "grad_norm": 0.134765625, + "learning_rate": 2.107429901960603e-06, + "loss": 0.0017093883361667395, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00171, + "step": 1385, + "tokens/total": 181305344, + "tokens/train_per_sec_per_gpu": 3359.66, + "tokens/trainable": 19306468 + }, + { + "epoch": 4.414012738853503, + "grad_norm": 0.140625, + "learning_rate": 2.0851498364916345e-06, + "loss": 0.002314978279173374, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00232, + "step": 1386, + "tokens/total": 181436416, + "tokens/train_per_sec_per_gpu": 3591.52, + "tokens/trainable": 19321404 + }, + { + "epoch": 4.417197452229299, + "grad_norm": 0.10546875, + "learning_rate": 2.062983045551553e-06, + "loss": 0.0015969820087775588, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1387, + "tokens/total": 181567488, + "tokens/train_per_sec_per_gpu": 2748.49, + "tokens/trainable": 19334018 + }, + { + "epoch": 4.420382165605096, + "grad_norm": 0.12890625, + "learning_rate": 2.0409296387170125e-06, + "loss": 0.002041134750470519, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00204, + "step": 1388, + "tokens/total": 181698560, + "tokens/train_per_sec_per_gpu": 3103.87, + "tokens/trainable": 19347006 + }, + { + "epoch": 4.423566878980892, + "grad_norm": 0.1357421875, + "learning_rate": 2.0189897250041945e-06, + "loss": 0.002131557324901223, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00213, + "step": 1389, + "tokens/total": 181829632, + "tokens/train_per_sec_per_gpu": 3187.5, + "tokens/trainable": 19360352 + }, + { + "epoch": 4.426751592356688, + "grad_norm": 0.1474609375, + "learning_rate": 1.997163412868239e-06, + "loss": 0.002050690818578005, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1390, + "tokens/total": 181960704, + "tokens/train_per_sec_per_gpu": 3264.66, + "tokens/trainable": 19374028 + }, + { + "epoch": 4.429936305732484, + "grad_norm": 0.1376953125, + "learning_rate": 1.975450810202725e-06, + "loss": 0.002214430132880807, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00222, + "step": 1391, + "tokens/total": 182091776, + "tokens/train_per_sec_per_gpu": 3059.02, + "tokens/trainable": 19386836 + }, + { + "epoch": 4.43312101910828, + "grad_norm": 0.1455078125, + "learning_rate": 1.953852024339145e-06, + "loss": 0.0023007793352007866, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0023, + "step": 1392, + "tokens/total": 182222848, + "tokens/train_per_sec_per_gpu": 3096.57, + "tokens/trainable": 19399808 + }, + { + "epoch": 4.436305732484076, + "grad_norm": 0.12060546875, + "learning_rate": 1.9323671620463446e-06, + "loss": 0.002242110203951597, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00224, + "step": 1393, + "tokens/total": 182353920, + "tokens/train_per_sec_per_gpu": 3424.27, + "tokens/trainable": 19414156 + }, + { + "epoch": 4.439490445859873, + "grad_norm": 0.142578125, + "learning_rate": 1.9109963295300183e-06, + "loss": 0.002074864227324724, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00208, + "step": 1394, + "tokens/total": 182484992, + "tokens/train_per_sec_per_gpu": 2963.93, + "tokens/trainable": 19426592 + }, + { + "epoch": 4.442675159235669, + "grad_norm": 0.158203125, + "learning_rate": 1.8897396324321914e-06, + "loss": 0.0032230939250439405, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00323, + "step": 1395, + "tokens/total": 182616064, + "tokens/train_per_sec_per_gpu": 3361.36, + "tokens/trainable": 19440628 + }, + { + "epoch": 4.445859872611465, + "grad_norm": 0.1845703125, + "learning_rate": 1.8685971758306691e-06, + "loss": 0.0027499471325427294, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00275, + "step": 1396, + "tokens/total": 182747136, + "tokens/train_per_sec_per_gpu": 3511.89, + "tokens/trainable": 19455336 + }, + { + "epoch": 4.449044585987261, + "grad_norm": 0.130859375, + "learning_rate": 1.8475690642385468e-06, + "loss": 0.0020744299981743097, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00208, + "step": 1397, + "tokens/total": 182878208, + "tokens/train_per_sec_per_gpu": 3570.48, + "tokens/trainable": 19470248 + }, + { + "epoch": 4.452229299363057, + "grad_norm": 0.123046875, + "learning_rate": 1.8266554016036803e-06, + "loss": 0.0015029326314106584, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0015, + "step": 1398, + "tokens/total": 183009280, + "tokens/train_per_sec_per_gpu": 3170.84, + "tokens/trainable": 19483584 + }, + { + "epoch": 4.455414012738854, + "grad_norm": 0.1171875, + "learning_rate": 1.805856291308161e-06, + "loss": 0.0015301044331863523, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00153, + "step": 1399, + "tokens/total": 183140352, + "tokens/train_per_sec_per_gpu": 3104.15, + "tokens/trainable": 19496574 + }, + { + "epoch": 4.45859872611465, + "grad_norm": 0.1328125, + "learning_rate": 1.7851718361678205e-06, + "loss": 0.0024863574653863907, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00249, + "step": 1400, + "tokens/total": 183271424, + "tokens/train_per_sec_per_gpu": 3049.29, + "tokens/trainable": 19509346 + }, + { + "epoch": 4.461783439490446, + "grad_norm": 0.130859375, + "learning_rate": 1.7646021384317201e-06, + "loss": 0.0017364751547574997, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00174, + "step": 1401, + "tokens/total": 183402496, + "tokens/train_per_sec_per_gpu": 2926.42, + "tokens/trainable": 19521620 + }, + { + "epoch": 4.464968152866242, + "grad_norm": 0.1494140625, + "learning_rate": 1.7441472997816538e-06, + "loss": 0.0021231744904071093, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00213, + "step": 1402, + "tokens/total": 183533568, + "tokens/train_per_sec_per_gpu": 3355.02, + "tokens/trainable": 19535636 + }, + { + "epoch": 4.468152866242038, + "grad_norm": 0.12255859375, + "learning_rate": 1.7238074213316107e-06, + "loss": 0.0017344644293189049, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00174, + "step": 1403, + "tokens/total": 183664640, + "tokens/train_per_sec_per_gpu": 3171.71, + "tokens/trainable": 19548924 + }, + { + "epoch": 4.471337579617835, + "grad_norm": 0.119140625, + "learning_rate": 1.703582603627321e-06, + "loss": 0.0015079887816682458, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00151, + "step": 1404, + "tokens/total": 183795712, + "tokens/train_per_sec_per_gpu": 3552.52, + "tokens/trainable": 19563736 + }, + { + "epoch": 4.474522292993631, + "grad_norm": 0.1279296875, + "learning_rate": 1.6834729466457256e-06, + "loss": 0.0015849830815568566, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00159, + "step": 1405, + "tokens/total": 183926784, + "tokens/train_per_sec_per_gpu": 3232.9, + "tokens/trainable": 19577276 + }, + { + "epoch": 4.477707006369426, + "grad_norm": 0.1337890625, + "learning_rate": 1.6634785497944922e-06, + "loss": 0.002168088685721159, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00217, + "step": 1406, + "tokens/total": 184057856, + "tokens/train_per_sec_per_gpu": 3279.65, + "tokens/trainable": 19591000 + }, + { + "epoch": 4.480891719745223, + "grad_norm": 0.15234375, + "learning_rate": 1.6435995119115367e-06, + "loss": 0.0026027678977698088, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00261, + "step": 1407, + "tokens/total": 184188928, + "tokens/train_per_sec_per_gpu": 3467.93, + "tokens/trainable": 19605518 + }, + { + "epoch": 4.484076433121019, + "grad_norm": 0.130859375, + "learning_rate": 1.6238359312645168e-06, + "loss": 0.0017946372972801328, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0018, + "step": 1408, + "tokens/total": 184320000, + "tokens/train_per_sec_per_gpu": 3080.27, + "tokens/trainable": 19618436 + }, + { + "epoch": 4.487261146496815, + "grad_norm": 0.126953125, + "learning_rate": 1.6041879055503473e-06, + "loss": 0.002403007121756673, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 1409, + "tokens/total": 184451072, + "tokens/train_per_sec_per_gpu": 3474.59, + "tokens/trainable": 19632904 + }, + { + "epoch": 4.490445859872612, + "grad_norm": 0.12890625, + "learning_rate": 1.5846555318947353e-06, + "loss": 0.0019406620413064957, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00194, + "step": 1410, + "tokens/total": 184582144, + "tokens/train_per_sec_per_gpu": 3262.74, + "tokens/trainable": 19646574 + }, + { + "epoch": 4.493630573248407, + "grad_norm": 0.123046875, + "learning_rate": 1.5652389068516765e-06, + "loss": 0.0018433219520375133, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00185, + "step": 1411, + "tokens/total": 184713216, + "tokens/train_per_sec_per_gpu": 3327.14, + "tokens/trainable": 19660446 + }, + { + "epoch": 4.496815286624204, + "grad_norm": 0.1171875, + "learning_rate": 1.5459381264029904e-06, + "loss": 0.0018597168382257223, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00186, + "step": 1412, + "tokens/total": 184844288, + "tokens/train_per_sec_per_gpu": 3955.39, + "tokens/trainable": 19676842 + }, + { + "epoch": 4.5, + "grad_norm": 0.1328125, + "learning_rate": 1.5267532859578437e-06, + "loss": 0.0019280803389847279, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00193, + "step": 1413, + "tokens/total": 184975360, + "tokens/train_per_sec_per_gpu": 3216.74, + "tokens/trainable": 19690306 + }, + { + "epoch": 4.5, + "eval_loss": 0.010314718820154667, + "eval_ppl": 1.01037, + "eval_runtime": 41.6339, + "eval_samples_per_second": 64.875, + "eval_steps_per_second": 4.059, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 1413 + }, + { + "epoch": 4.503184713375796, + "grad_norm": 0.1279296875, + "learning_rate": 1.5076844803522922e-06, + "loss": 0.0020255008712410927, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00203, + "step": 1414, + "tokens/total": 185106432, + "tokens/train_per_sec_per_gpu": 3418.68, + "tokens/trainable": 19704608 + }, + { + "epoch": 4.506369426751593, + "grad_norm": 0.1220703125, + "learning_rate": 1.4887318038487752e-06, + "loss": 0.0020268706139177084, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00203, + "step": 1415, + "tokens/total": 185237504, + "tokens/train_per_sec_per_gpu": 3371.87, + "tokens/trainable": 19718710 + }, + { + "epoch": 4.509554140127388, + "grad_norm": 0.12890625, + "learning_rate": 1.4698953501356972e-06, + "loss": 0.00200156238861382, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.002, + "step": 1416, + "tokens/total": 185368576, + "tokens/train_per_sec_per_gpu": 3222.46, + "tokens/trainable": 19732218 + }, + { + "epoch": 4.512738853503185, + "grad_norm": 0.115234375, + "learning_rate": 1.4511752123269245e-06, + "loss": 0.0017808325355872512, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00178, + "step": 1417, + "tokens/total": 185499648, + "tokens/train_per_sec_per_gpu": 3317.65, + "tokens/trainable": 19746108 + }, + { + "epoch": 4.515923566878981, + "grad_norm": 0.12255859375, + "learning_rate": 1.432571482961345e-06, + "loss": 0.0017151820939034224, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00172, + "step": 1418, + "tokens/total": 185630720, + "tokens/train_per_sec_per_gpu": 3182.32, + "tokens/trainable": 19759432 + }, + { + "epoch": 4.519108280254777, + "grad_norm": 0.17578125, + "learning_rate": 1.4140842540024123e-06, + "loss": 0.002563396468758583, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00257, + "step": 1419, + "tokens/total": 185761792, + "tokens/train_per_sec_per_gpu": 3137.69, + "tokens/trainable": 19772572 + }, + { + "epoch": 4.522292993630574, + "grad_norm": 0.11376953125, + "learning_rate": 1.3957136168376822e-06, + "loss": 0.0014816210605204105, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00148, + "step": 1420, + "tokens/total": 185892864, + "tokens/train_per_sec_per_gpu": 3152.7, + "tokens/trainable": 19785788 + }, + { + "epoch": 4.525477707006369, + "grad_norm": 0.11767578125, + "learning_rate": 1.3774596622783604e-06, + "loss": 0.0015394608490169048, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00154, + "step": 1421, + "tokens/total": 186023936, + "tokens/train_per_sec_per_gpu": 3228.29, + "tokens/trainable": 19799284 + }, + { + "epoch": 4.528662420382165, + "grad_norm": 0.1455078125, + "learning_rate": 1.3593224805588722e-06, + "loss": 0.0022464555222541094, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00225, + "step": 1422, + "tokens/total": 186155008, + "tokens/train_per_sec_per_gpu": 3587.26, + "tokens/trainable": 19814314 + }, + { + "epoch": 4.531847133757962, + "grad_norm": 0.07177734375, + "learning_rate": 1.341302161336383e-06, + "loss": 0.0008602555026300251, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00086, + "step": 1423, + "tokens/total": 186286080, + "tokens/train_per_sec_per_gpu": 3203.52, + "tokens/trainable": 19827672 + }, + { + "epoch": 4.535031847133758, + "grad_norm": 0.11572265625, + "learning_rate": 1.3233987936903808e-06, + "loss": 0.0015553171979263425, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00156, + "step": 1424, + "tokens/total": 186417152, + "tokens/train_per_sec_per_gpu": 3742.26, + "tokens/trainable": 19843252 + }, + { + "epoch": 4.538216560509554, + "grad_norm": 0.1435546875, + "learning_rate": 1.3056124661222357e-06, + "loss": 0.0025198939256370068, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00252, + "step": 1425, + "tokens/total": 186548224, + "tokens/train_per_sec_per_gpu": 3418.51, + "tokens/trainable": 19857490 + }, + { + "epoch": 4.54140127388535, + "grad_norm": 0.1123046875, + "learning_rate": 1.2879432665547558e-06, + "loss": 0.002200118498876691, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0022, + "step": 1426, + "tokens/total": 186679296, + "tokens/train_per_sec_per_gpu": 3288.15, + "tokens/trainable": 19871216 + }, + { + "epoch": 4.544585987261146, + "grad_norm": 0.134765625, + "learning_rate": 1.27039128233174e-06, + "loss": 0.0021478794515132904, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00215, + "step": 1427, + "tokens/total": 186810368, + "tokens/train_per_sec_per_gpu": 3148.41, + "tokens/trainable": 19884396 + }, + { + "epoch": 4.547770700636943, + "grad_norm": 0.1572265625, + "learning_rate": 1.2529566002175753e-06, + "loss": 0.002553946105763316, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00256, + "step": 1428, + "tokens/total": 186941440, + "tokens/train_per_sec_per_gpu": 3403.73, + "tokens/trainable": 19898644 + }, + { + "epoch": 4.550955414012739, + "grad_norm": 0.12890625, + "learning_rate": 1.2356393063967798e-06, + "loss": 0.001968652941286564, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00197, + "step": 1429, + "tokens/total": 187072512, + "tokens/train_per_sec_per_gpu": 3468.59, + "tokens/trainable": 19913104 + }, + { + "epoch": 4.554140127388535, + "grad_norm": 0.12255859375, + "learning_rate": 1.2184394864735881e-06, + "loss": 0.00198244652710855, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00198, + "step": 1430, + "tokens/total": 187203584, + "tokens/train_per_sec_per_gpu": 3461.21, + "tokens/trainable": 19927596 + }, + { + "epoch": 4.557324840764331, + "grad_norm": 0.1494140625, + "learning_rate": 1.201357225471536e-06, + "loss": 0.0016815853305161, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00168, + "step": 1431, + "tokens/total": 187334656, + "tokens/train_per_sec_per_gpu": 3043.87, + "tokens/trainable": 19940348 + }, + { + "epoch": 4.560509554140127, + "grad_norm": 0.1318359375, + "learning_rate": 1.184392607833032e-06, + "loss": 0.0021309617441147566, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00213, + "step": 1432, + "tokens/total": 187465728, + "tokens/train_per_sec_per_gpu": 3664.84, + "tokens/trainable": 19955600 + }, + { + "epoch": 4.563694267515924, + "grad_norm": 0.1494140625, + "learning_rate": 1.1675457174189302e-06, + "loss": 0.00207577389664948, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00208, + "step": 1433, + "tokens/total": 187596800, + "tokens/train_per_sec_per_gpu": 3301.91, + "tokens/trainable": 19969374 + }, + { + "epoch": 4.56687898089172, + "grad_norm": 0.138671875, + "learning_rate": 1.1508166375081424e-06, + "loss": 0.0015523422043770552, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00155, + "step": 1434, + "tokens/total": 187727872, + "tokens/train_per_sec_per_gpu": 3163.05, + "tokens/trainable": 19982636 + }, + { + "epoch": 4.570063694267516, + "grad_norm": 0.134765625, + "learning_rate": 1.1342054507971933e-06, + "loss": 0.0017875637859106064, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00179, + "step": 1435, + "tokens/total": 187858944, + "tokens/train_per_sec_per_gpu": 3297.76, + "tokens/trainable": 19996446 + }, + { + "epoch": 4.573248407643312, + "grad_norm": 0.11962890625, + "learning_rate": 1.1177122393998374e-06, + "loss": 0.0017204630421474576, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00172, + "step": 1436, + "tokens/total": 187990016, + "tokens/train_per_sec_per_gpu": 3545.62, + "tokens/trainable": 20011220 + }, + { + "epoch": 4.576433121019108, + "grad_norm": 0.1171875, + "learning_rate": 1.101337084846643e-06, + "loss": 0.0016106198308989406, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00161, + "step": 1437, + "tokens/total": 188121088, + "tokens/train_per_sec_per_gpu": 3421.36, + "tokens/trainable": 20025440 + }, + { + "epoch": 4.579617834394904, + "grad_norm": 0.1103515625, + "learning_rate": 1.0850800680845929e-06, + "loss": 0.0017103978898376226, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00171, + "step": 1438, + "tokens/total": 188252160, + "tokens/train_per_sec_per_gpu": 3540.04, + "tokens/trainable": 20040228 + }, + { + "epoch": 4.582802547770701, + "grad_norm": 0.1025390625, + "learning_rate": 1.0689412694766753e-06, + "loss": 0.0013984747929498553, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0014, + "step": 1439, + "tokens/total": 188383232, + "tokens/train_per_sec_per_gpu": 3402.4, + "tokens/trainable": 20054474 + }, + { + "epoch": 4.585987261146497, + "grad_norm": 0.1025390625, + "learning_rate": 1.0529207688015018e-06, + "loss": 0.0012951147509738803, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0013, + "step": 1440, + "tokens/total": 188514304, + "tokens/train_per_sec_per_gpu": 3638.2, + "tokens/trainable": 20069706 + }, + { + "epoch": 4.5891719745222925, + "grad_norm": 0.1181640625, + "learning_rate": 1.0370186452528935e-06, + "loss": 0.0015985879581421614, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1441, + "tokens/total": 188645376, + "tokens/train_per_sec_per_gpu": 3136.96, + "tokens/trainable": 20082850 + }, + { + "epoch": 4.592356687898089, + "grad_norm": 0.11572265625, + "learning_rate": 1.021234977439503e-06, + "loss": 0.0018211111892014742, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00182, + "step": 1442, + "tokens/total": 188776448, + "tokens/train_per_sec_per_gpu": 3227.61, + "tokens/trainable": 20096360 + }, + { + "epoch": 4.595541401273885, + "grad_norm": 0.146484375, + "learning_rate": 1.0055698433844324e-06, + "loss": 0.002404790371656418, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 1443, + "tokens/total": 188907520, + "tokens/train_per_sec_per_gpu": 3174.22, + "tokens/trainable": 20109664 + }, + { + "epoch": 4.598726114649682, + "grad_norm": 0.13671875, + "learning_rate": 9.9002332052483e-07, + "loss": 0.0017899831291288137, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00179, + "step": 1444, + "tokens/total": 189038592, + "tokens/train_per_sec_per_gpu": 3046.84, + "tokens/trainable": 20122424 + }, + { + "epoch": 4.601910828025478, + "grad_norm": 0.1142578125, + "learning_rate": 9.745954857115102e-07, + "loss": 0.0016956630861386657, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0017, + "step": 1445, + "tokens/total": 189169664, + "tokens/train_per_sec_per_gpu": 3263.68, + "tokens/trainable": 20136088 + }, + { + "epoch": 4.6050955414012735, + "grad_norm": 0.1083984375, + "learning_rate": 9.592864152085963e-07, + "loss": 0.0015517222927883267, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00155, + "step": 1446, + "tokens/total": 189300736, + "tokens/train_per_sec_per_gpu": 3336.28, + "tokens/trainable": 20150052 + }, + { + "epoch": 4.60828025477707, + "grad_norm": 0.10986328125, + "learning_rate": 9.440961846931107e-07, + "loss": 0.0015380029799416661, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00154, + "step": 1447, + "tokens/total": 189431808, + "tokens/train_per_sec_per_gpu": 3506.95, + "tokens/trainable": 20164734 + }, + { + "epoch": 4.611464968152866, + "grad_norm": 0.119140625, + "learning_rate": 9.290248692546189e-07, + "loss": 0.0016031761188060045, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1448, + "tokens/total": 189562880, + "tokens/train_per_sec_per_gpu": 3064.35, + "tokens/trainable": 20177570 + }, + { + "epoch": 4.614649681528663, + "grad_norm": 0.12109375, + "learning_rate": 9.140725433948616e-07, + "loss": 0.002197918714955449, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0022, + "step": 1449, + "tokens/total": 189693952, + "tokens/train_per_sec_per_gpu": 3280.73, + "tokens/trainable": 20191308 + }, + { + "epoch": 4.617834394904459, + "grad_norm": 0.10595703125, + "learning_rate": 8.992392810273781e-07, + "loss": 0.0015633050352334976, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00156, + "step": 1450, + "tokens/total": 189825024, + "tokens/train_per_sec_per_gpu": 3466.66, + "tokens/trainable": 20205824 + }, + { + "epoch": 4.6210191082802545, + "grad_norm": 0.10205078125, + "learning_rate": 8.845251554771422e-07, + "loss": 0.0020091324113309383, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00201, + "step": 1451, + "tokens/total": 189956096, + "tokens/train_per_sec_per_gpu": 3548.93, + "tokens/trainable": 20220694 + }, + { + "epoch": 4.624203821656051, + "grad_norm": 0.10546875, + "learning_rate": 8.699302394802016e-07, + "loss": 0.0017181969014927745, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00172, + "step": 1452, + "tokens/total": 190087168, + "tokens/train_per_sec_per_gpu": 3437.21, + "tokens/trainable": 20235080 + }, + { + "epoch": 4.627388535031847, + "grad_norm": 0.11669921875, + "learning_rate": 8.554546051833201e-07, + "loss": 0.0018156894948333502, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00182, + "step": 1453, + "tokens/total": 190218240, + "tokens/train_per_sec_per_gpu": 3566.23, + "tokens/trainable": 20250016 + }, + { + "epoch": 4.630573248407643, + "grad_norm": 0.1298828125, + "learning_rate": 8.410983241436132e-07, + "loss": 0.002036329824477434, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00204, + "step": 1454, + "tokens/total": 190349312, + "tokens/train_per_sec_per_gpu": 3178.74, + "tokens/trainable": 20263338 + }, + { + "epoch": 4.63375796178344, + "grad_norm": 0.09521484375, + "learning_rate": 8.268614673282021e-07, + "loss": 0.0012238912750035524, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00122, + "step": 1455, + "tokens/total": 190480384, + "tokens/train_per_sec_per_gpu": 3217.83, + "tokens/trainable": 20276824 + }, + { + "epoch": 4.6369426751592355, + "grad_norm": 0.162109375, + "learning_rate": 8.127441051138662e-07, + "loss": 0.0029940090607851744, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.003, + "step": 1456, + "tokens/total": 190611456, + "tokens/train_per_sec_per_gpu": 3579.23, + "tokens/trainable": 20291792 + }, + { + "epoch": 4.640127388535031, + "grad_norm": 0.0986328125, + "learning_rate": 7.987463072866852e-07, + "loss": 0.001104258350096643, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0011, + "step": 1457, + "tokens/total": 190742528, + "tokens/train_per_sec_per_gpu": 3201.93, + "tokens/trainable": 20305198 + }, + { + "epoch": 4.643312101910828, + "grad_norm": 0.126953125, + "learning_rate": 7.848681430416948e-07, + "loss": 0.0020911027677357197, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00209, + "step": 1458, + "tokens/total": 190873600, + "tokens/train_per_sec_per_gpu": 3356.29, + "tokens/trainable": 20319196 + }, + { + "epoch": 4.646496815286624, + "grad_norm": 0.12451171875, + "learning_rate": 7.711096809825513e-07, + "loss": 0.0017163840821012855, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00172, + "step": 1459, + "tokens/total": 191004672, + "tokens/train_per_sec_per_gpu": 3241.44, + "tokens/trainable": 20332736 + }, + { + "epoch": 4.649681528662421, + "grad_norm": 0.1337890625, + "learning_rate": 7.574709891211951e-07, + "loss": 0.0014391193399205804, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00144, + "step": 1460, + "tokens/total": 191135744, + "tokens/train_per_sec_per_gpu": 2792.75, + "tokens/trainable": 20344452 + }, + { + "epoch": 4.6528662420382165, + "grad_norm": 0.12158203125, + "learning_rate": 7.439521348774959e-07, + "loss": 0.0014456507051363587, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00145, + "step": 1461, + "tokens/total": 191266816, + "tokens/train_per_sec_per_gpu": 3650.92, + "tokens/trainable": 20359712 + }, + { + "epoch": 4.656050955414012, + "grad_norm": 0.126953125, + "learning_rate": 7.305531850789444e-07, + "loss": 0.0015093558467924595, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00151, + "step": 1462, + "tokens/total": 191397888, + "tokens/train_per_sec_per_gpu": 3308.41, + "tokens/trainable": 20373528 + }, + { + "epoch": 4.659235668789809, + "grad_norm": 0.09716796875, + "learning_rate": 7.17274205960311e-07, + "loss": 0.0016126552363857627, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00161, + "step": 1463, + "tokens/total": 191528960, + "tokens/train_per_sec_per_gpu": 3625.83, + "tokens/trainable": 20388600 + }, + { + "epoch": 4.662420382165605, + "grad_norm": 0.11181640625, + "learning_rate": 7.041152631633075e-07, + "loss": 0.0025427560321986675, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00255, + "step": 1464, + "tokens/total": 191660032, + "tokens/train_per_sec_per_gpu": 3430.12, + "tokens/trainable": 20402944 + }, + { + "epoch": 4.665605095541402, + "grad_norm": 0.1435546875, + "learning_rate": 6.910764217362753e-07, + "loss": 0.002073355484753847, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00208, + "step": 1465, + "tokens/total": 191791104, + "tokens/train_per_sec_per_gpu": 3534.12, + "tokens/trainable": 20417666 + }, + { + "epoch": 4.6687898089171975, + "grad_norm": 0.15234375, + "learning_rate": 6.781577461338673e-07, + "loss": 0.0026118066161870956, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00262, + "step": 1466, + "tokens/total": 191922176, + "tokens/train_per_sec_per_gpu": 3348.83, + "tokens/trainable": 20431704 + }, + { + "epoch": 4.671974522292993, + "grad_norm": 0.1259765625, + "learning_rate": 6.653593002167168e-07, + "loss": 0.0018058358691632748, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00181, + "step": 1467, + "tokens/total": 192053248, + "tokens/train_per_sec_per_gpu": 3093.22, + "tokens/trainable": 20444646 + }, + { + "epoch": 4.67515923566879, + "grad_norm": 0.1201171875, + "learning_rate": 6.526811472511302e-07, + "loss": 0.0014479233650490642, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00145, + "step": 1468, + "tokens/total": 192184320, + "tokens/train_per_sec_per_gpu": 3514.53, + "tokens/trainable": 20459360 + }, + { + "epoch": 4.678343949044586, + "grad_norm": 0.140625, + "learning_rate": 6.40123349908775e-07, + "loss": 0.002245377516373992, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00225, + "step": 1469, + "tokens/total": 192315392, + "tokens/train_per_sec_per_gpu": 3310.51, + "tokens/trainable": 20473228 + }, + { + "epoch": 4.681528662420382, + "grad_norm": 0.1201171875, + "learning_rate": 6.276859702663618e-07, + "loss": 0.001856306567788124, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00186, + "step": 1470, + "tokens/total": 192446464, + "tokens/train_per_sec_per_gpu": 3067.13, + "tokens/trainable": 20486072 + }, + { + "epoch": 4.6847133757961785, + "grad_norm": 0.1474609375, + "learning_rate": 6.153690698053438e-07, + "loss": 0.0019508072873577476, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00195, + "step": 1471, + "tokens/total": 192577536, + "tokens/train_per_sec_per_gpu": 3417.64, + "tokens/trainable": 20500378 + }, + { + "epoch": 4.687898089171974, + "grad_norm": 0.1552734375, + "learning_rate": 6.031727094116175e-07, + "loss": 0.0022490478586405516, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00225, + "step": 1472, + "tokens/total": 192708608, + "tokens/train_per_sec_per_gpu": 3687.19, + "tokens/trainable": 20515722 + }, + { + "epoch": 4.69108280254777, + "grad_norm": 0.123046875, + "learning_rate": 5.910969493752055e-07, + "loss": 0.0018782130209729075, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00188, + "step": 1473, + "tokens/total": 192839680, + "tokens/train_per_sec_per_gpu": 3720.3, + "tokens/trainable": 20531248 + }, + { + "epoch": 4.694267515923567, + "grad_norm": 0.126953125, + "learning_rate": 5.791418493899803e-07, + "loss": 0.0018554049311205745, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00186, + "step": 1474, + "tokens/total": 192970752, + "tokens/train_per_sec_per_gpu": 3362.35, + "tokens/trainable": 20545276 + }, + { + "epoch": 4.697452229299363, + "grad_norm": 0.1708984375, + "learning_rate": 5.673074685533547e-07, + "loss": 0.00283794361166656, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00284, + "step": 1475, + "tokens/total": 193101824, + "tokens/train_per_sec_per_gpu": 3107.71, + "tokens/trainable": 20558290 + }, + { + "epoch": 4.7006369426751595, + "grad_norm": 0.115234375, + "learning_rate": 5.555938653659859e-07, + "loss": 0.0015586434165015817, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00156, + "step": 1476, + "tokens/total": 193232896, + "tokens/train_per_sec_per_gpu": 3495.85, + "tokens/trainable": 20572836 + }, + { + "epoch": 4.703821656050955, + "grad_norm": 0.1728515625, + "learning_rate": 5.440010977315003e-07, + "loss": 0.002725705737248063, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00273, + "step": 1477, + "tokens/total": 193363968, + "tokens/train_per_sec_per_gpu": 2926.02, + "tokens/trainable": 20585176 + }, + { + "epoch": 4.707006369426751, + "grad_norm": 0.1689453125, + "learning_rate": 5.32529222956199e-07, + "loss": 0.003224026644602418, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00323, + "step": 1478, + "tokens/total": 193495040, + "tokens/train_per_sec_per_gpu": 3124.75, + "tokens/trainable": 20598252 + }, + { + "epoch": 4.710191082802548, + "grad_norm": 0.1201171875, + "learning_rate": 5.211782977487728e-07, + "loss": 0.0022572882007807493, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00226, + "step": 1479, + "tokens/total": 193626112, + "tokens/train_per_sec_per_gpu": 3719.62, + "tokens/trainable": 20613736 + }, + { + "epoch": 4.713375796178344, + "grad_norm": 0.126953125, + "learning_rate": 5.099483782200321e-07, + "loss": 0.0020106916781514883, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00201, + "step": 1480, + "tokens/total": 193757184, + "tokens/train_per_sec_per_gpu": 3386.3, + "tokens/trainable": 20627916 + }, + { + "epoch": 4.7165605095541405, + "grad_norm": 0.150390625, + "learning_rate": 4.988395198826157e-07, + "loss": 0.002159472554922104, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00216, + "step": 1481, + "tokens/total": 193888256, + "tokens/train_per_sec_per_gpu": 3132.67, + "tokens/trainable": 20641036 + }, + { + "epoch": 4.719745222929936, + "grad_norm": 0.1376953125, + "learning_rate": 4.878517776507247e-07, + "loss": 0.0026867706328630447, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00269, + "step": 1482, + "tokens/total": 194019328, + "tokens/train_per_sec_per_gpu": 3359.56, + "tokens/trainable": 20655048 + }, + { + "epoch": 4.722929936305732, + "grad_norm": 0.10595703125, + "learning_rate": 4.7698520583985e-07, + "loss": 0.0017674706177785993, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00177, + "step": 1483, + "tokens/total": 194150400, + "tokens/train_per_sec_per_gpu": 3297.46, + "tokens/trainable": 20668772 + }, + { + "epoch": 4.726114649681529, + "grad_norm": 0.1103515625, + "learning_rate": 4.662398581665006e-07, + "loss": 0.0014837021008133888, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00148, + "step": 1484, + "tokens/total": 194281472, + "tokens/train_per_sec_per_gpu": 3494.12, + "tokens/trainable": 20683348 + }, + { + "epoch": 4.729299363057325, + "grad_norm": 0.123046875, + "learning_rate": 4.5561578774794276e-07, + "loss": 0.0021369662135839462, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00214, + "step": 1485, + "tokens/total": 194412544, + "tokens/train_per_sec_per_gpu": 3607.71, + "tokens/trainable": 20698316 + }, + { + "epoch": 4.732484076433121, + "grad_norm": 0.158203125, + "learning_rate": 4.45113047101936e-07, + "loss": 0.002360973972827196, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00236, + "step": 1486, + "tokens/total": 194543616, + "tokens/train_per_sec_per_gpu": 3584.68, + "tokens/trainable": 20713220 + }, + { + "epoch": 4.735668789808917, + "grad_norm": 0.1162109375, + "learning_rate": 4.3473168814647525e-07, + "loss": 0.0015863839071244001, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00159, + "step": 1487, + "tokens/total": 194674688, + "tokens/train_per_sec_per_gpu": 3400.0, + "tokens/trainable": 20727406 + }, + { + "epoch": 4.738853503184713, + "grad_norm": 0.1162109375, + "learning_rate": 4.24471762199527e-07, + "loss": 0.0016582348616793752, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00166, + "step": 1488, + "tokens/total": 194805760, + "tokens/train_per_sec_per_gpu": 3258.7, + "tokens/trainable": 20741076 + }, + { + "epoch": 4.742038216560509, + "grad_norm": 0.1328125, + "learning_rate": 4.143333199787769e-07, + "loss": 0.00176681496668607, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00177, + "step": 1489, + "tokens/total": 194936832, + "tokens/train_per_sec_per_gpu": 2999.18, + "tokens/trainable": 20753696 + }, + { + "epoch": 4.745222929936306, + "grad_norm": 0.1337890625, + "learning_rate": 4.0431641160139367e-07, + "loss": 0.002107662847265601, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00211, + "step": 1490, + "tokens/total": 195067904, + "tokens/train_per_sec_per_gpu": 3418.27, + "tokens/trainable": 20767910 + }, + { + "epoch": 4.748407643312102, + "grad_norm": 0.140625, + "learning_rate": 3.944210865837572e-07, + "loss": 0.0021030758507549763, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00211, + "step": 1491, + "tokens/total": 195198976, + "tokens/train_per_sec_per_gpu": 3144.49, + "tokens/trainable": 20781092 + }, + { + "epoch": 4.751592356687898, + "grad_norm": 0.1240234375, + "learning_rate": 3.846473938412365e-07, + "loss": 0.0020006736740469933, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.002, + "step": 1492, + "tokens/total": 195330048, + "tokens/train_per_sec_per_gpu": 3535.97, + "tokens/trainable": 20795832 + }, + { + "epoch": 4.754777070063694, + "grad_norm": 0.11572265625, + "learning_rate": 3.749953816879398e-07, + "loss": 0.001961378613486886, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00196, + "step": 1493, + "tokens/total": 195461120, + "tokens/train_per_sec_per_gpu": 3395.84, + "tokens/trainable": 20810046 + }, + { + "epoch": 4.757961783439491, + "grad_norm": 0.1328125, + "learning_rate": 3.654650978364649e-07, + "loss": 0.0024665065575391054, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00247, + "step": 1494, + "tokens/total": 195592192, + "tokens/train_per_sec_per_gpu": 3389.65, + "tokens/trainable": 20824160 + }, + { + "epoch": 4.761146496815287, + "grad_norm": 0.1318359375, + "learning_rate": 3.560565893976742e-07, + "loss": 0.0024471194483339787, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00245, + "step": 1495, + "tokens/total": 195723264, + "tokens/train_per_sec_per_gpu": 3146.42, + "tokens/trainable": 20837344 + }, + { + "epoch": 4.764331210191083, + "grad_norm": 0.201171875, + "learning_rate": 3.467699028804672e-07, + "loss": 0.003118871245533228, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00312, + "step": 1496, + "tokens/total": 195854336, + "tokens/train_per_sec_per_gpu": 3068.73, + "tokens/trainable": 20850148 + }, + { + "epoch": 4.767515923566879, + "grad_norm": 0.146484375, + "learning_rate": 3.376050841915335e-07, + "loss": 0.0028909991960972548, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0029, + "step": 1497, + "tokens/total": 195985408, + "tokens/train_per_sec_per_gpu": 3299.09, + "tokens/trainable": 20863890 + }, + { + "epoch": 4.770700636942675, + "grad_norm": 0.11181640625, + "learning_rate": 3.2856217863514727e-07, + "loss": 0.001599812414497137, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1498, + "tokens/total": 196116480, + "tokens/train_per_sec_per_gpu": 3543.45, + "tokens/trainable": 20878628 + }, + { + "epoch": 4.773885350318471, + "grad_norm": 0.158203125, + "learning_rate": 3.1964123091292595e-07, + "loss": 0.0027794367633759975, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00278, + "step": 1499, + "tokens/total": 196247552, + "tokens/train_per_sec_per_gpu": 3411.09, + "tokens/trainable": 20892812 + }, + { + "epoch": 4.777070063694268, + "grad_norm": 0.09814453125, + "learning_rate": 3.108422851236137e-07, + "loss": 0.0011374036548659205, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00114, + "step": 1500, + "tokens/total": 196378624, + "tokens/train_per_sec_per_gpu": 3329.43, + "tokens/trainable": 20906680 + }, + { + "epoch": 4.780254777070064, + "grad_norm": 0.11865234375, + "learning_rate": 3.0216538476286196e-07, + "loss": 0.0018032776424661279, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0018, + "step": 1501, + "tokens/total": 196509696, + "tokens/train_per_sec_per_gpu": 3433.1, + "tokens/trainable": 20920932 + }, + { + "epoch": 4.7834394904458595, + "grad_norm": 0.11669921875, + "learning_rate": 2.936105727230298e-07, + "loss": 0.0027445517480373383, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00275, + "step": 1502, + "tokens/total": 196640768, + "tokens/train_per_sec_per_gpu": 3841.28, + "tokens/trainable": 20936896 + }, + { + "epoch": 4.786624203821656, + "grad_norm": 0.10107421875, + "learning_rate": 2.851778912929426e-07, + "loss": 0.001024644705466926, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00103, + "step": 1503, + "tokens/total": 196771840, + "tokens/train_per_sec_per_gpu": 3599.56, + "tokens/trainable": 20951852 + }, + { + "epoch": 4.789808917197452, + "grad_norm": 0.09423828125, + "learning_rate": 2.768673821577167e-07, + "loss": 0.0011879701633006334, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00119, + "step": 1504, + "tokens/total": 196902912, + "tokens/train_per_sec_per_gpu": 3009.96, + "tokens/trainable": 20964468 + }, + { + "epoch": 4.792993630573249, + "grad_norm": 0.1591796875, + "learning_rate": 2.6867908639852944e-07, + "loss": 0.0033508751075714827, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00336, + "step": 1505, + "tokens/total": 197033984, + "tokens/train_per_sec_per_gpu": 3536.08, + "tokens/trainable": 20979168 + }, + { + "epoch": 4.796178343949045, + "grad_norm": 0.166015625, + "learning_rate": 2.6061304449241655e-07, + "loss": 0.0030738934874534607, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00308, + "step": 1506, + "tokens/total": 197165056, + "tokens/train_per_sec_per_gpu": 2876.89, + "tokens/trainable": 20991260 + }, + { + "epoch": 4.7993630573248405, + "grad_norm": 0.1533203125, + "learning_rate": 2.526692963120858e-07, + "loss": 0.002285804832354188, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00229, + "step": 1507, + "tokens/total": 197296128, + "tokens/train_per_sec_per_gpu": 3400.76, + "tokens/trainable": 21005408 + }, + { + "epoch": 4.802547770700637, + "grad_norm": 0.1298828125, + "learning_rate": 2.448478811257149e-07, + "loss": 0.002408439526334405, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00241, + "step": 1508, + "tokens/total": 197427200, + "tokens/train_per_sec_per_gpu": 3679.66, + "tokens/trainable": 21020666 + }, + { + "epoch": 4.805732484076433, + "grad_norm": 0.08837890625, + "learning_rate": 2.3714883759674566e-07, + "loss": 0.0013570735463872552, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00136, + "step": 1509, + "tokens/total": 197558272, + "tokens/train_per_sec_per_gpu": 3549.55, + "tokens/trainable": 21035448 + }, + { + "epoch": 4.80891719745223, + "grad_norm": 0.1142578125, + "learning_rate": 2.295722037837178e-07, + "loss": 0.0017048909794539213, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00171, + "step": 1510, + "tokens/total": 197689344, + "tokens/train_per_sec_per_gpu": 2969.31, + "tokens/trainable": 21047892 + }, + { + "epoch": 4.812101910828026, + "grad_norm": 0.09521484375, + "learning_rate": 2.2211801714004942e-07, + "loss": 0.0012713008327409625, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00127, + "step": 1511, + "tokens/total": 197820416, + "tokens/train_per_sec_per_gpu": 3589.68, + "tokens/trainable": 21062844 + }, + { + "epoch": 4.8152866242038215, + "grad_norm": 0.1259765625, + "learning_rate": 2.1478631451387898e-07, + "loss": 0.002427282277494669, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00243, + "step": 1512, + "tokens/total": 197951488, + "tokens/train_per_sec_per_gpu": 3365.34, + "tokens/trainable": 21076902 + }, + { + "epoch": 4.818471337579618, + "grad_norm": 0.1513671875, + "learning_rate": 2.0757713214786533e-07, + "loss": 0.0020946285221725702, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0021, + "step": 1513, + "tokens/total": 198082560, + "tokens/train_per_sec_per_gpu": 3593.33, + "tokens/trainable": 21091820 + }, + { + "epoch": 4.821656050955414, + "grad_norm": 0.09912109375, + "learning_rate": 2.0049050567902128e-07, + "loss": 0.0015513665275648236, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00155, + "step": 1514, + "tokens/total": 198213632, + "tokens/train_per_sec_per_gpu": 3683.66, + "tokens/trainable": 21107128 + }, + { + "epoch": 4.82484076433121, + "grad_norm": 0.130859375, + "learning_rate": 1.9352647013852477e-07, + "loss": 0.001911777420900762, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00191, + "step": 1515, + "tokens/total": 198344704, + "tokens/train_per_sec_per_gpu": 3288.64, + "tokens/trainable": 21120836 + }, + { + "epoch": 4.828025477707007, + "grad_norm": 0.1220703125, + "learning_rate": 1.8668505995155515e-07, + "loss": 0.0022345585748553276, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00224, + "step": 1516, + "tokens/total": 198475776, + "tokens/train_per_sec_per_gpu": 3563.74, + "tokens/trainable": 21135672 + }, + { + "epoch": 4.8312101910828025, + "grad_norm": 0.130859375, + "learning_rate": 1.7996630893712675e-07, + "loss": 0.0015912681119516492, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00159, + "step": 1517, + "tokens/total": 198606848, + "tokens/train_per_sec_per_gpu": 3421.44, + "tokens/trainable": 21149896 + }, + { + "epoch": 4.834394904458598, + "grad_norm": 0.1337890625, + "learning_rate": 1.7337025030790543e-07, + "loss": 0.0015856210375204682, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00159, + "step": 1518, + "tokens/total": 198737920, + "tokens/train_per_sec_per_gpu": 2977.95, + "tokens/trainable": 21162350 + }, + { + "epoch": 4.837579617834395, + "grad_norm": 0.1767578125, + "learning_rate": 1.6689691667005902e-07, + "loss": 0.0021609310060739517, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00216, + "step": 1519, + "tokens/total": 198868992, + "tokens/train_per_sec_per_gpu": 3149.82, + "tokens/trainable": 21175516 + }, + { + "epoch": 4.840764331210191, + "grad_norm": 0.11572265625, + "learning_rate": 1.6054634002309054e-07, + "loss": 0.0015277141937986016, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00153, + "step": 1520, + "tokens/total": 199000064, + "tokens/train_per_sec_per_gpu": 3155.1, + "tokens/trainable": 21188692 + }, + { + "epoch": 4.843949044585988, + "grad_norm": 0.162109375, + "learning_rate": 1.5431855175968014e-07, + "loss": 0.002204909920692444, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00221, + "step": 1521, + "tokens/total": 199131136, + "tokens/train_per_sec_per_gpu": 3326.25, + "tokens/trainable": 21202538 + }, + { + "epoch": 4.8471337579617835, + "grad_norm": 0.138671875, + "learning_rate": 1.4821358266553231e-07, + "loss": 0.002712359419092536, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00272, + "step": 1522, + "tokens/total": 199262208, + "tokens/train_per_sec_per_gpu": 3457.57, + "tokens/trainable": 21216952 + }, + { + "epoch": 4.850318471337579, + "grad_norm": 0.1337890625, + "learning_rate": 1.4223146291922062e-07, + "loss": 0.0019022361375391483, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0019, + "step": 1523, + "tokens/total": 199393280, + "tokens/train_per_sec_per_gpu": 3419.62, + "tokens/trainable": 21231184 + }, + { + "epoch": 4.853503184713376, + "grad_norm": 0.1357421875, + "learning_rate": 1.3637222209204327e-07, + "loss": 0.0018241211073473096, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00183, + "step": 1524, + "tokens/total": 199524352, + "tokens/train_per_sec_per_gpu": 3139.7, + "tokens/trainable": 21244294 + }, + { + "epoch": 4.856687898089172, + "grad_norm": 0.1044921875, + "learning_rate": 1.3063588914786207e-07, + "loss": 0.001210428192280233, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00121, + "step": 1525, + "tokens/total": 199655424, + "tokens/train_per_sec_per_gpu": 3122.56, + "tokens/trainable": 21257322 + }, + { + "epoch": 4.859872611464969, + "grad_norm": 0.11474609375, + "learning_rate": 1.250224924429888e-07, + "loss": 0.0014607764314860106, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00146, + "step": 1526, + "tokens/total": 199786496, + "tokens/train_per_sec_per_gpu": 3032.1, + "tokens/trainable": 21269998 + }, + { + "epoch": 4.8630573248407645, + "grad_norm": 0.1474609375, + "learning_rate": 1.1953205972601022e-07, + "loss": 0.002046809531748295, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1527, + "tokens/total": 199917568, + "tokens/train_per_sec_per_gpu": 3360.03, + "tokens/trainable": 21284056 + }, + { + "epoch": 4.86624203821656, + "grad_norm": 0.140625, + "learning_rate": 1.1416461813767709e-07, + "loss": 0.002186344237998128, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00219, + "step": 1528, + "tokens/total": 200048640, + "tokens/train_per_sec_per_gpu": 3252.14, + "tokens/trainable": 21297784 + }, + { + "epoch": 4.869426751592357, + "grad_norm": 0.134765625, + "learning_rate": 1.0892019421075706e-07, + "loss": 0.002091720700263977, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00209, + "step": 1529, + "tokens/total": 200179712, + "tokens/train_per_sec_per_gpu": 3436.08, + "tokens/trainable": 21312112 + }, + { + "epoch": 4.872611464968153, + "grad_norm": 0.1279296875, + "learning_rate": 1.0379881386990974e-07, + "loss": 0.001913387910462916, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00192, + "step": 1530, + "tokens/total": 200310784, + "tokens/train_per_sec_per_gpu": 3187.83, + "tokens/trainable": 21325434 + }, + { + "epoch": 4.875796178343949, + "grad_norm": 0.154296875, + "learning_rate": 9.880050243155359e-08, + "loss": 0.0024429503828287125, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00245, + "step": 1531, + "tokens/total": 200441856, + "tokens/train_per_sec_per_gpu": 3182.45, + "tokens/trainable": 21338740 + }, + { + "epoch": 4.8789808917197455, + "grad_norm": 0.130859375, + "learning_rate": 9.392528460374362e-08, + "loss": 0.0016927801771089435, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00169, + "step": 1532, + "tokens/total": 200572928, + "tokens/train_per_sec_per_gpu": 3158.89, + "tokens/trainable": 21351978 + }, + { + "epoch": 4.882165605095541, + "grad_norm": 0.1142578125, + "learning_rate": 8.917318448604661e-08, + "loss": 0.0016512478468939662, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00165, + "step": 1533, + "tokens/total": 200704000, + "tokens/train_per_sec_per_gpu": 3721.88, + "tokens/trainable": 21367448 + }, + { + "epoch": 4.885350318471337, + "grad_norm": 0.1474609375, + "learning_rate": 8.454422556942454e-08, + "loss": 0.0020441263914108276, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1534, + "tokens/total": 200835072, + "tokens/train_per_sec_per_gpu": 3451.49, + "tokens/trainable": 21381804 + }, + { + "epoch": 4.888535031847134, + "grad_norm": 0.12890625, + "learning_rate": 8.003843073612627e-08, + "loss": 0.0019288958283141255, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00193, + "step": 1535, + "tokens/total": 200966144, + "tokens/train_per_sec_per_gpu": 3544.84, + "tokens/trainable": 21396546 + }, + { + "epoch": 4.89171974522293, + "grad_norm": 0.1435546875, + "learning_rate": 7.565582225955158e-08, + "loss": 0.0020149427000433207, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00202, + "step": 1536, + "tokens/total": 201097216, + "tokens/train_per_sec_per_gpu": 3206.12, + "tokens/trainable": 21410012 + }, + { + "epoch": 4.8949044585987265, + "grad_norm": 0.1591796875, + "learning_rate": 7.139642180416517e-08, + "loss": 0.00250299577601254, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00251, + "step": 1537, + "tokens/total": 201228288, + "tokens/train_per_sec_per_gpu": 3708.05, + "tokens/trainable": 21425414 + }, + { + "epoch": 4.898089171974522, + "grad_norm": 0.158203125, + "learning_rate": 6.726025042537721e-08, + "loss": 0.002223816467449069, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 1538, + "tokens/total": 201359360, + "tokens/train_per_sec_per_gpu": 3310.1, + "tokens/trainable": 21439244 + }, + { + "epoch": 4.901273885350318, + "grad_norm": 0.1337890625, + "learning_rate": 6.324732856944349e-08, + "loss": 0.002602557884529233, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00261, + "step": 1539, + "tokens/total": 201490432, + "tokens/train_per_sec_per_gpu": 3309.88, + "tokens/trainable": 21453112 + }, + { + "epoch": 4.904458598726115, + "grad_norm": 0.1435546875, + "learning_rate": 5.935767607336273e-08, + "loss": 0.0018828021129593253, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00188, + "step": 1540, + "tokens/total": 201621504, + "tokens/train_per_sec_per_gpu": 3360.0, + "tokens/trainable": 21467172 + }, + { + "epoch": 4.907643312101911, + "grad_norm": 0.1103515625, + "learning_rate": 5.5591312164776646e-08, + "loss": 0.0018976753344759345, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0019, + "step": 1541, + "tokens/total": 201752576, + "tokens/train_per_sec_per_gpu": 3485.96, + "tokens/trainable": 21481770 + }, + { + "epoch": 4.9108280254777075, + "grad_norm": 0.126953125, + "learning_rate": 5.194825546187831e-08, + "loss": 0.0018805229337885976, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00188, + "step": 1542, + "tokens/total": 201883648, + "tokens/train_per_sec_per_gpu": 3369.24, + "tokens/trainable": 21495882 + }, + { + "epoch": 4.914012738853503, + "grad_norm": 0.1171875, + "learning_rate": 4.84285239733151e-08, + "loss": 0.0020450761076062918, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1543, + "tokens/total": 202014720, + "tokens/train_per_sec_per_gpu": 3212.2, + "tokens/trainable": 21509344 + }, + { + "epoch": 4.917197452229299, + "grad_norm": 0.1357421875, + "learning_rate": 4.503213509811088e-08, + "loss": 0.00226628128439188, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00227, + "step": 1544, + "tokens/total": 202145792, + "tokens/train_per_sec_per_gpu": 3154.8, + "tokens/trainable": 21522558 + }, + { + "epoch": 4.920382165605096, + "grad_norm": 0.125, + "learning_rate": 4.175910562556895e-08, + "loss": 0.0018778032390400767, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00188, + "step": 1545, + "tokens/total": 202276864, + "tokens/train_per_sec_per_gpu": 3493.16, + "tokens/trainable": 21537180 + }, + { + "epoch": 4.923566878980892, + "grad_norm": 0.134765625, + "learning_rate": 3.860945173518593e-08, + "loss": 0.0019706811290234327, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00197, + "step": 1546, + "tokens/total": 202407936, + "tokens/train_per_sec_per_gpu": 3480.51, + "tokens/trainable": 21551752 + }, + { + "epoch": 4.926751592356688, + "grad_norm": 0.1435546875, + "learning_rate": 3.5583188996587965e-08, + "loss": 0.001993852434679866, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.002, + "step": 1547, + "tokens/total": 202539008, + "tokens/train_per_sec_per_gpu": 3231.61, + "tokens/trainable": 21565284 + }, + { + "epoch": 4.929936305732484, + "grad_norm": 0.12890625, + "learning_rate": 3.26803323694419e-08, + "loss": 0.0025727523025125265, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00258, + "step": 1548, + "tokens/total": 202670080, + "tokens/train_per_sec_per_gpu": 3483.78, + "tokens/trainable": 21579872 + }, + { + "epoch": 4.93312101910828, + "grad_norm": 0.12158203125, + "learning_rate": 2.990089620337755e-08, + "loss": 0.00160361104644835, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1549, + "tokens/total": 202801152, + "tokens/train_per_sec_per_gpu": 3091.07, + "tokens/trainable": 21592838 + }, + { + "epoch": 4.936305732484076, + "grad_norm": 0.1337890625, + "learning_rate": 2.724489423792942e-08, + "loss": 0.002017256570979953, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00202, + "step": 1550, + "tokens/total": 202932224, + "tokens/train_per_sec_per_gpu": 3197.05, + "tokens/trainable": 21606214 + }, + { + "epoch": 4.939490445859873, + "grad_norm": 0.16015625, + "learning_rate": 2.4712339602461774e-08, + "loss": 0.0018039483111351728, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00181, + "step": 1551, + "tokens/total": 203063296, + "tokens/train_per_sec_per_gpu": 3119.44, + "tokens/trainable": 21619296 + }, + { + "epoch": 4.942675159235669, + "grad_norm": 0.1162109375, + "learning_rate": 2.2303244816099244e-08, + "loss": 0.001978665590286255, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00198, + "step": 1552, + "tokens/total": 203194368, + "tokens/train_per_sec_per_gpu": 2961.33, + "tokens/trainable": 21631746 + }, + { + "epoch": 4.945859872611465, + "grad_norm": 0.130859375, + "learning_rate": 2.0017621787671303e-08, + "loss": 0.0023562528658658266, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00236, + "step": 1553, + "tokens/total": 203325440, + "tokens/train_per_sec_per_gpu": 3105.93, + "tokens/trainable": 21644684 + }, + { + "epoch": 4.949044585987261, + "grad_norm": 0.16796875, + "learning_rate": 1.7855481815659546e-08, + "loss": 0.0023984115105122328, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0024, + "step": 1554, + "tokens/total": 203456512, + "tokens/train_per_sec_per_gpu": 3024.15, + "tokens/trainable": 21657424 + }, + { + "epoch": 4.952229299363057, + "grad_norm": 0.10205078125, + "learning_rate": 1.5816835588122748e-08, + "loss": 0.0020472980104386806, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00205, + "step": 1555, + "tokens/total": 203587584, + "tokens/train_per_sec_per_gpu": 3146.82, + "tokens/trainable": 21670582 + }, + { + "epoch": 4.955414012738854, + "grad_norm": 0.1513671875, + "learning_rate": 1.3901693182660768e-08, + "loss": 0.002163731260225177, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00217, + "step": 1556, + "tokens/total": 203718656, + "tokens/train_per_sec_per_gpu": 3104.79, + "tokens/trainable": 21683604 + }, + { + "epoch": 4.95859872611465, + "grad_norm": 0.119140625, + "learning_rate": 1.2110064066361836e-08, + "loss": 0.002627151319757104, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00263, + "step": 1557, + "tokens/total": 203849728, + "tokens/train_per_sec_per_gpu": 3130.19, + "tokens/trainable": 21696704 + }, + { + "epoch": 4.961783439490446, + "grad_norm": 0.10400390625, + "learning_rate": 1.0441957095752574e-08, + "loss": 0.0015952385729178786, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0016, + "step": 1558, + "tokens/total": 203980800, + "tokens/train_per_sec_per_gpu": 3351.63, + "tokens/trainable": 21710708 + }, + { + "epoch": 4.964968152866242, + "grad_norm": 0.1494140625, + "learning_rate": 8.897380516748044e-09, + "loss": 0.002128974301740527, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00213, + "step": 1559, + "tokens/total": 204111872, + "tokens/train_per_sec_per_gpu": 3197.67, + "tokens/trainable": 21724200 + }, + { + "epoch": 4.968152866242038, + "grad_norm": 0.107421875, + "learning_rate": 7.476341964626766e-09, + "loss": 0.0021477844566106796, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00215, + "step": 1560, + "tokens/total": 204242944, + "tokens/train_per_sec_per_gpu": 3209.08, + "tokens/trainable": 21737668 + }, + { + "epoch": 4.971337579617835, + "grad_norm": 0.1337890625, + "learning_rate": 6.178848463980758e-09, + "loss": 0.00202268292196095, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00202, + "step": 1561, + "tokens/total": 204374016, + "tokens/train_per_sec_per_gpu": 3300.82, + "tokens/trainable": 21751480 + }, + { + "epoch": 4.974522292993631, + "grad_norm": 0.09521484375, + "learning_rate": 5.004906428685008e-09, + "loss": 0.0012176063610240817, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00122, + "step": 1562, + "tokens/total": 204505088, + "tokens/train_per_sec_per_gpu": 3387.1, + "tokens/trainable": 21765732 + }, + { + "epoch": 4.977707006369426, + "grad_norm": 0.1533203125, + "learning_rate": 3.954521661861388e-09, + "loss": 0.0024965633638203144, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.0025, + "step": 1563, + "tokens/total": 204636160, + "tokens/train_per_sec_per_gpu": 3190.66, + "tokens/trainable": 21779126 + }, + { + "epoch": 4.980891719745223, + "grad_norm": 0.142578125, + "learning_rate": 3.027699355859226e-09, + "loss": 0.0016142029780894518, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00162, + "step": 1564, + "tokens/total": 204767232, + "tokens/train_per_sec_per_gpu": 3249.58, + "tokens/trainable": 21792812 + }, + { + "epoch": 4.984076433121019, + "grad_norm": 0.12451171875, + "learning_rate": 2.2244440922164487e-09, + "loss": 0.0019386119674891233, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00194, + "step": 1565, + "tokens/total": 204898304, + "tokens/train_per_sec_per_gpu": 3286.69, + "tokens/trainable": 21806628 + }, + { + "epoch": 4.987261146496815, + "grad_norm": 0.12158203125, + "learning_rate": 1.544759841654031e-09, + "loss": 0.001636000582948327, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00164, + "step": 1566, + "tokens/total": 205029376, + "tokens/train_per_sec_per_gpu": 3407.55, + "tokens/trainable": 21820912 + }, + { + "epoch": 4.990445859872612, + "grad_norm": 0.130859375, + "learning_rate": 9.886499640399116e-10, + "loss": 0.0022515307646244764, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00225, + "step": 1567, + "tokens/total": 205160448, + "tokens/train_per_sec_per_gpu": 3312.13, + "tokens/trainable": 21834766 + }, + { + "epoch": 4.993630573248407, + "grad_norm": 0.1162109375, + "learning_rate": 5.561172083806688e-10, + "loss": 0.0021433548536151648, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00215, + "step": 1568, + "tokens/total": 205291520, + "tokens/train_per_sec_per_gpu": 3715.64, + "tokens/trainable": 21850308 + }, + { + "epoch": 4.996815286624204, + "grad_norm": 0.11572265625, + "learning_rate": 2.4716371280764093e-10, + "loss": 0.002227420685812831, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 64.64, + "memory/max_allocated (GiB)": 64.64, + "ppl": 1.00223, + "step": 1569, + "tokens/total": 205422592, + "tokens/train_per_sec_per_gpu": 3672.72, + "tokens/trainable": 21865620 + }, + { + "epoch": 5.0, + "grad_norm": 0.2041015625, + "learning_rate": 6.179100456582543e-11, + "loss": 0.002160376403480768, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 39.25, + "memory/max_allocated (GiB)": 39.25, + "ppl": 1.00216, + "step": 1570, + "tokens/total": 205496320, + "tokens/train_per_sec_per_gpu": 3367.0, + "tokens/trainable": 21873388 + }, + { + "epoch": 5.0, + "eval_loss": 0.010312405414879322, + "eval_ppl": 1.01037, + "eval_runtime": 41.6326, + "eval_samples_per_second": 64.877, + "eval_steps_per_second": 4.059, + "memory/device_reserved (GiB)": 74.81, + "memory/max_active (GiB)": 54.61, + "memory/max_allocated (GiB)": 54.61, + "step": 1570 + } + ], + "logging_steps": 1, + "max_steps": 1570, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 314, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2775166334468096e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}