{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 157, "global_step": 1570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.523393154144287, "eval_ppl": 12.47084, "eval_runtime": 43.864, "eval_samples_per_second": 61.577, "eval_steps_per_second": 3.853, "memory/device_reserved (GiB)": 60.88, "memory/max_active (GiB)": 50.21, "memory/max_allocated (GiB)": 50.21, "step": 0 }, { "epoch": 0.0031847133757961785, "grad_norm": 26.125, "learning_rate": 0.0, "loss": 2.513824939727783, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 60.25, "memory/max_allocated (GiB)": 60.25, "ppl": 12.35209, "step": 1, "tokens/total": 131072, "tokens/train_per_sec_per_gpu": 2648.66, "tokens/trainable": 14388 }, { "epoch": 0.006369426751592357, "grad_norm": 26.5, "learning_rate": 3.1847133757961787e-07, "loss": 2.5059545040130615, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 12.25525, "step": 2, "tokens/total": 262144, "tokens/train_per_sec_per_gpu": 3269.04, "tokens/trainable": 27845 }, { "epoch": 0.009554140127388535, "grad_norm": 25.625, "learning_rate": 6.369426751592357e-07, "loss": 2.4954071044921875, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 12.12667, "step": 3, "tokens/total": 393216, "tokens/train_per_sec_per_gpu": 3166.68, "tokens/trainable": 40998 }, { "epoch": 0.012738853503184714, "grad_norm": 26.25, "learning_rate": 9.554140127388535e-07, "loss": 2.526397943496704, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 12.50837, "step": 4, "tokens/total": 524288, "tokens/train_per_sec_per_gpu": 3343.09, "tokens/trainable": 54878 }, { "epoch": 0.01592356687898089, "grad_norm": 26.0, "learning_rate": 1.2738853503184715e-06, "loss": 2.480510711669922, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 11.94736, "step": 5, "tokens/total": 655360, "tokens/train_per_sec_per_gpu": 3076.8, "tokens/trainable": 67675 }, { "epoch": 0.01910828025477707, "grad_norm": 26.25, "learning_rate": 1.5923566878980892e-06, "loss": 2.5267443656921387, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 12.5127, "step": 6, "tokens/total": 786432, "tokens/train_per_sec_per_gpu": 3657.85, "tokens/trainable": 82725 }, { "epoch": 0.022292993630573247, "grad_norm": 25.625, "learning_rate": 1.910828025477707e-06, "loss": 2.505220651626587, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 12.24626, "step": 7, "tokens/total": 917504, "tokens/train_per_sec_per_gpu": 3615.3, "tokens/trainable": 97609 }, { "epoch": 0.025477707006369428, "grad_norm": 26.25, "learning_rate": 2.229299363057325e-06, "loss": 2.47495174407959, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 11.88113, "step": 8, "tokens/total": 1048576, "tokens/train_per_sec_per_gpu": 3341.37, "tokens/trainable": 111360 }, { "epoch": 0.028662420382165606, "grad_norm": 25.375, "learning_rate": 2.547770700636943e-06, "loss": 2.464661121368408, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 11.7595, "step": 9, "tokens/total": 1179648, "tokens/train_per_sec_per_gpu": 3391.98, "tokens/trainable": 125377 }, { "epoch": 0.03184713375796178, "grad_norm": 25.125, "learning_rate": 2.8662420382165605e-06, "loss": 2.4051315784454346, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 11.07989, "step": 10, "tokens/total": 1310720, "tokens/train_per_sec_per_gpu": 3538.93, "tokens/trainable": 139941 }, { "epoch": 0.03503184713375796, "grad_norm": 24.25, "learning_rate": 3.1847133757961785e-06, "loss": 2.3649113178253174, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 10.64309, "step": 11, "tokens/total": 1441792, "tokens/train_per_sec_per_gpu": 3190.84, "tokens/trainable": 153243 }, { "epoch": 0.03821656050955414, "grad_norm": 23.75, "learning_rate": 3.5031847133757964e-06, "loss": 2.2840771675109863, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 9.81662, "step": 12, "tokens/total": 1572864, "tokens/train_per_sec_per_gpu": 3122.46, "tokens/trainable": 166236 }, { "epoch": 0.041401273885350316, "grad_norm": 23.5, "learning_rate": 3.821656050955414e-06, "loss": 2.2835350036621094, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 9.8113, "step": 13, "tokens/total": 1703936, "tokens/train_per_sec_per_gpu": 3508.01, "tokens/trainable": 180765 }, { "epoch": 0.044585987261146494, "grad_norm": 22.625, "learning_rate": 4.140127388535032e-06, "loss": 2.178839921951294, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 8.83605, "step": 14, "tokens/total": 1835008, "tokens/train_per_sec_per_gpu": 3391.4, "tokens/trainable": 194814 }, { "epoch": 0.04777070063694268, "grad_norm": 20.625, "learning_rate": 4.45859872611465e-06, "loss": 2.029291868209839, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 7.6087, "step": 15, "tokens/total": 1966080, "tokens/train_per_sec_per_gpu": 2894.11, "tokens/trainable": 206939 }, { "epoch": 0.050955414012738856, "grad_norm": 19.125, "learning_rate": 4.777070063694268e-06, "loss": 1.9433990716934204, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 6.98244, "step": 16, "tokens/total": 2097152, "tokens/train_per_sec_per_gpu": 3260.95, "tokens/trainable": 220459 }, { "epoch": 0.054140127388535034, "grad_norm": 17.0, "learning_rate": 5.095541401273886e-06, "loss": 1.825382113456726, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 6.20517, "step": 17, "tokens/total": 2228224, "tokens/train_per_sec_per_gpu": 3108.44, "tokens/trainable": 233450 }, { "epoch": 0.05732484076433121, "grad_norm": 15.8125, "learning_rate": 5.414012738853504e-06, "loss": 1.7230491638183594, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 5.60158, "step": 18, "tokens/total": 2359296, "tokens/train_per_sec_per_gpu": 3341.04, "tokens/trainable": 247328 }, { "epoch": 0.06050955414012739, "grad_norm": 14.8125, "learning_rate": 5.732484076433121e-06, "loss": 1.6547000408172607, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 5.23151, "step": 19, "tokens/total": 2490368, "tokens/train_per_sec_per_gpu": 3383.25, "tokens/trainable": 261435 }, { "epoch": 0.06369426751592357, "grad_norm": 13.5625, "learning_rate": 6.050955414012739e-06, "loss": 1.544914960861206, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 4.68757, "step": 20, "tokens/total": 2621440, "tokens/train_per_sec_per_gpu": 3349.84, "tokens/trainable": 275370 }, { "epoch": 0.06687898089171974, "grad_norm": 12.6875, "learning_rate": 6.369426751592357e-06, "loss": 1.4839664697647095, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 4.4104, "step": 21, "tokens/total": 2752512, "tokens/train_per_sec_per_gpu": 3158.43, "tokens/trainable": 288580 }, { "epoch": 0.07006369426751592, "grad_norm": 12.0625, "learning_rate": 6.687898089171975e-06, "loss": 1.3859291076660156, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 3.99854, "step": 22, "tokens/total": 2883584, "tokens/train_per_sec_per_gpu": 3623.28, "tokens/trainable": 303623 }, { "epoch": 0.0732484076433121, "grad_norm": 11.1875, "learning_rate": 7.006369426751593e-06, "loss": 1.2559714317321777, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 3.51125, "step": 23, "tokens/total": 3014656, "tokens/train_per_sec_per_gpu": 3333.96, "tokens/trainable": 317478 }, { "epoch": 0.07643312101910828, "grad_norm": 10.1875, "learning_rate": 7.32484076433121e-06, "loss": 1.1163444519042969, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 3.05367, "step": 24, "tokens/total": 3145728, "tokens/train_per_sec_per_gpu": 3273.07, "tokens/trainable": 331087 }, { "epoch": 0.07961783439490445, "grad_norm": 9.625, "learning_rate": 7.643312101910828e-06, "loss": 0.9755889177322388, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 2.65273, "step": 25, "tokens/total": 3276800, "tokens/train_per_sec_per_gpu": 3686.54, "tokens/trainable": 346421 }, { "epoch": 0.08280254777070063, "grad_norm": 8.5625, "learning_rate": 7.961783439490445e-06, "loss": 0.8369104266166687, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 2.30922, "step": 26, "tokens/total": 3407872, "tokens/train_per_sec_per_gpu": 3225.45, "tokens/trainable": 359891 }, { "epoch": 0.08598726114649681, "grad_norm": 7.65625, "learning_rate": 8.280254777070064e-06, "loss": 0.7086498737335205, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 2.03125, "step": 27, "tokens/total": 3538944, "tokens/train_per_sec_per_gpu": 3049.77, "tokens/trainable": 372710 }, { "epoch": 0.08917197452229299, "grad_norm": 7.03125, "learning_rate": 8.598726114649681e-06, "loss": 0.6029537320137024, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.82751, "step": 28, "tokens/total": 3670016, "tokens/train_per_sec_per_gpu": 3413.19, "tokens/trainable": 386972 }, { "epoch": 0.09235668789808917, "grad_norm": 6.59375, "learning_rate": 8.9171974522293e-06, "loss": 0.5023248195648193, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.65256, "step": 29, "tokens/total": 3801088, "tokens/train_per_sec_per_gpu": 2978.06, "tokens/trainable": 399448 }, { "epoch": 0.09554140127388536, "grad_norm": 5.96875, "learning_rate": 9.235668789808917e-06, "loss": 0.4153555631637573, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.51491, "step": 30, "tokens/total": 3932160, "tokens/train_per_sec_per_gpu": 3448.36, "tokens/trainable": 413796 }, { "epoch": 0.09872611464968153, "grad_norm": 5.3125, "learning_rate": 9.554140127388536e-06, "loss": 0.329733669757843, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.3906, "step": 31, "tokens/total": 4063232, "tokens/train_per_sec_per_gpu": 3050.66, "tokens/trainable": 426585 }, { "epoch": 0.10191082802547771, "grad_norm": 4.65625, "learning_rate": 9.872611464968155e-06, "loss": 0.2749524414539337, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.31647, "step": 32, "tokens/total": 4194304, "tokens/train_per_sec_per_gpu": 3412.69, "tokens/trainable": 440864 }, { "epoch": 0.10509554140127389, "grad_norm": 3.8125, "learning_rate": 1.0191082802547772e-05, "loss": 0.2164468914270401, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.24166, "step": 33, "tokens/total": 4325376, "tokens/train_per_sec_per_gpu": 3101.83, "tokens/trainable": 453864 }, { "epoch": 0.10828025477707007, "grad_norm": 3.125, "learning_rate": 1.0509554140127389e-05, "loss": 0.16533951461315155, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.17979, "step": 34, "tokens/total": 4456448, "tokens/train_per_sec_per_gpu": 2919.92, "tokens/trainable": 466189 }, { "epoch": 0.11146496815286625, "grad_norm": 2.3125, "learning_rate": 1.0828025477707008e-05, "loss": 0.13319599628448486, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.14247, "step": 35, "tokens/total": 4587520, "tokens/train_per_sec_per_gpu": 3395.27, "tokens/trainable": 480345 }, { "epoch": 0.11464968152866242, "grad_norm": 1.734375, "learning_rate": 1.1146496815286625e-05, "loss": 0.11769881844520569, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.12491, "step": 36, "tokens/total": 4718592, "tokens/train_per_sec_per_gpu": 3283.23, "tokens/trainable": 494113 }, { "epoch": 0.1178343949044586, "grad_norm": 1.2734375, "learning_rate": 1.1464968152866242e-05, "loss": 0.09715006500482559, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.10203, "step": 37, "tokens/total": 4849664, "tokens/train_per_sec_per_gpu": 3440.9, "tokens/trainable": 508490 }, { "epoch": 0.12101910828025478, "grad_norm": 1.3828125, "learning_rate": 1.178343949044586e-05, "loss": 0.08853279799222946, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.09257, "step": 38, "tokens/total": 4980736, "tokens/train_per_sec_per_gpu": 3324.91, "tokens/trainable": 522428 }, { "epoch": 0.12420382165605096, "grad_norm": 1.0625, "learning_rate": 1.2101910828025478e-05, "loss": 0.07282212376594543, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.07554, "step": 39, "tokens/total": 5111808, "tokens/train_per_sec_per_gpu": 3291.66, "tokens/trainable": 536220 }, { "epoch": 0.12738853503184713, "grad_norm": 0.921875, "learning_rate": 1.2420382165605097e-05, "loss": 0.07131636142730713, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.07392, "step": 40, "tokens/total": 5242880, "tokens/train_per_sec_per_gpu": 3067.47, "tokens/trainable": 549148 }, { "epoch": 0.1305732484076433, "grad_norm": 0.91015625, "learning_rate": 1.2738853503184714e-05, "loss": 0.07583475857973099, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.07878, "step": 41, "tokens/total": 5373952, "tokens/train_per_sec_per_gpu": 3078.11, "tokens/trainable": 562021 }, { "epoch": 0.1337579617834395, "grad_norm": 1.015625, "learning_rate": 1.3057324840764331e-05, "loss": 0.05423282831907272, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.05573, "step": 42, "tokens/total": 5505024, "tokens/train_per_sec_per_gpu": 3152.3, "tokens/trainable": 575214 }, { "epoch": 0.13694267515923567, "grad_norm": 1.0703125, "learning_rate": 1.337579617834395e-05, "loss": 0.05849003419280052, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.06023, "step": 43, "tokens/total": 5636096, "tokens/train_per_sec_per_gpu": 3026.82, "tokens/trainable": 587989 }, { "epoch": 0.14012738853503184, "grad_norm": 0.671875, "learning_rate": 1.3694267515923567e-05, "loss": 0.047232724726200104, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04837, "step": 44, "tokens/total": 5767168, "tokens/train_per_sec_per_gpu": 3186.14, "tokens/trainable": 601337 }, { "epoch": 0.14331210191082802, "grad_norm": 0.8203125, "learning_rate": 1.4012738853503186e-05, "loss": 0.0633855015039444, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.06544, "step": 45, "tokens/total": 5898240, "tokens/train_per_sec_per_gpu": 3243.91, "tokens/trainable": 614903 }, { "epoch": 0.1464968152866242, "grad_norm": 0.7734375, "learning_rate": 1.4331210191082803e-05, "loss": 0.057890165597200394, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0596, "step": 46, "tokens/total": 6029312, "tokens/train_per_sec_per_gpu": 3235.78, "tokens/trainable": 628512 }, { "epoch": 0.14968152866242038, "grad_norm": 0.62890625, "learning_rate": 1.464968152866242e-05, "loss": 0.057463180273771286, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.05915, "step": 47, "tokens/total": 6160384, "tokens/train_per_sec_per_gpu": 3663.91, "tokens/trainable": 643746 }, { "epoch": 0.15286624203821655, "grad_norm": 0.55859375, "learning_rate": 1.4968152866242039e-05, "loss": 0.047860756516456604, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04902, "step": 48, "tokens/total": 6291456, "tokens/train_per_sec_per_gpu": 3663.1, "tokens/trainable": 659004 }, { "epoch": 0.15605095541401273, "grad_norm": 0.69140625, "learning_rate": 1.5286624203821656e-05, "loss": 0.04775935783982277, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04892, "step": 49, "tokens/total": 6422528, "tokens/train_per_sec_per_gpu": 3484.38, "tokens/trainable": 673537 }, { "epoch": 0.1592356687898089, "grad_norm": 0.65234375, "learning_rate": 1.5605095541401275e-05, "loss": 0.041205767542123795, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04207, "step": 50, "tokens/total": 6553600, "tokens/train_per_sec_per_gpu": 3230.47, "tokens/trainable": 687060 }, { "epoch": 0.1624203821656051, "grad_norm": 0.5625, "learning_rate": 1.592356687898089e-05, "loss": 0.04386754706501961, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04484, "step": 51, "tokens/total": 6684672, "tokens/train_per_sec_per_gpu": 3268.41, "tokens/trainable": 700730 }, { "epoch": 0.16560509554140126, "grad_norm": 0.44140625, "learning_rate": 1.624203821656051e-05, "loss": 0.041807860136032104, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04269, "step": 52, "tokens/total": 6815744, "tokens/train_per_sec_per_gpu": 3368.11, "tokens/trainable": 714773 }, { "epoch": 0.16878980891719744, "grad_norm": 0.54296875, "learning_rate": 1.6560509554140128e-05, "loss": 0.04267745837569237, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0436, "step": 53, "tokens/total": 6946816, "tokens/train_per_sec_per_gpu": 3215.88, "tokens/trainable": 728248 }, { "epoch": 0.17197452229299362, "grad_norm": 0.54296875, "learning_rate": 1.6878980891719747e-05, "loss": 0.04988788813352585, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.05115, "step": 54, "tokens/total": 7077888, "tokens/train_per_sec_per_gpu": 3378.45, "tokens/trainable": 742393 }, { "epoch": 0.1751592356687898, "grad_norm": 0.60546875, "learning_rate": 1.7197452229299362e-05, "loss": 0.03681975603103638, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03751, "step": 55, "tokens/total": 7208960, "tokens/train_per_sec_per_gpu": 3317.61, "tokens/trainable": 756289 }, { "epoch": 0.17834394904458598, "grad_norm": 0.54296875, "learning_rate": 1.751592356687898e-05, "loss": 0.03921874612569809, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04, "step": 56, "tokens/total": 7340032, "tokens/train_per_sec_per_gpu": 3135.92, "tokens/trainable": 769413 }, { "epoch": 0.18152866242038215, "grad_norm": 0.498046875, "learning_rate": 1.78343949044586e-05, "loss": 0.03980698809027672, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04061, "step": 57, "tokens/total": 7471104, "tokens/train_per_sec_per_gpu": 3113.74, "tokens/trainable": 782484 }, { "epoch": 0.18471337579617833, "grad_norm": 0.62109375, "learning_rate": 1.8152866242038215e-05, "loss": 0.03426855802536011, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03486, "step": 58, "tokens/total": 7602176, "tokens/train_per_sec_per_gpu": 3252.75, "tokens/trainable": 796083 }, { "epoch": 0.18789808917197454, "grad_norm": 0.51953125, "learning_rate": 1.8471337579617834e-05, "loss": 0.03522620350122452, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03585, "step": 59, "tokens/total": 7733248, "tokens/train_per_sec_per_gpu": 3557.53, "tokens/trainable": 810976 }, { "epoch": 0.1910828025477707, "grad_norm": 0.609375, "learning_rate": 1.8789808917197453e-05, "loss": 0.03881306201219559, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03958, "step": 60, "tokens/total": 7864320, "tokens/train_per_sec_per_gpu": 3437.92, "tokens/trainable": 825388 }, { "epoch": 0.1942675159235669, "grad_norm": 0.7890625, "learning_rate": 1.910828025477707e-05, "loss": 0.04205251485109329, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04295, "step": 61, "tokens/total": 7995392, "tokens/train_per_sec_per_gpu": 2932.75, "tokens/trainable": 837817 }, { "epoch": 0.19745222929936307, "grad_norm": 0.58203125, "learning_rate": 1.942675159235669e-05, "loss": 0.03300648555159569, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03356, "step": 62, "tokens/total": 8126464, "tokens/train_per_sec_per_gpu": 3125.85, "tokens/trainable": 850991 }, { "epoch": 0.20063694267515925, "grad_norm": 0.87109375, "learning_rate": 1.974522292993631e-05, "loss": 0.03468535467982292, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03529, "step": 63, "tokens/total": 8257536, "tokens/train_per_sec_per_gpu": 3543.61, "tokens/trainable": 865759 }, { "epoch": 0.20382165605095542, "grad_norm": 0.6171875, "learning_rate": 2.0063694267515925e-05, "loss": 0.035250235348939896, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03588, "step": 64, "tokens/total": 8388608, "tokens/train_per_sec_per_gpu": 3393.88, "tokens/trainable": 879904 }, { "epoch": 0.2070063694267516, "grad_norm": 0.63671875, "learning_rate": 2.0382165605095544e-05, "loss": 0.03242558240890503, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03296, "step": 65, "tokens/total": 8519680, "tokens/train_per_sec_per_gpu": 2965.68, "tokens/trainable": 892375 }, { "epoch": 0.21019108280254778, "grad_norm": 0.765625, "learning_rate": 2.0700636942675162e-05, "loss": 0.04080452769994736, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.04165, "step": 66, "tokens/total": 8650752, "tokens/train_per_sec_per_gpu": 3513.42, "tokens/trainable": 907090 }, { "epoch": 0.21337579617834396, "grad_norm": 0.40625, "learning_rate": 2.1019108280254778e-05, "loss": 0.02815978415310383, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02856, "step": 67, "tokens/total": 8781824, "tokens/train_per_sec_per_gpu": 3257.68, "tokens/trainable": 920761 }, { "epoch": 0.21656050955414013, "grad_norm": 0.53125, "learning_rate": 2.1337579617834397e-05, "loss": 0.034378018230199814, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03498, "step": 68, "tokens/total": 8912896, "tokens/train_per_sec_per_gpu": 3612.11, "tokens/trainable": 935785 }, { "epoch": 0.2197452229299363, "grad_norm": 0.65234375, "learning_rate": 2.1656050955414015e-05, "loss": 0.03373882547020912, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03431, "step": 69, "tokens/total": 9043968, "tokens/train_per_sec_per_gpu": 3727.74, "tokens/trainable": 951259 }, { "epoch": 0.2229299363057325, "grad_norm": 0.458984375, "learning_rate": 2.197452229299363e-05, "loss": 0.03272494301199913, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03327, "step": 70, "tokens/total": 9175040, "tokens/train_per_sec_per_gpu": 3482.14, "tokens/trainable": 965829 }, { "epoch": 0.22611464968152867, "grad_norm": 0.55078125, "learning_rate": 2.229299363057325e-05, "loss": 0.02994038723409176, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03039, "step": 71, "tokens/total": 9306112, "tokens/train_per_sec_per_gpu": 3238.41, "tokens/trainable": 979395 }, { "epoch": 0.22929936305732485, "grad_norm": 0.75390625, "learning_rate": 2.261146496815287e-05, "loss": 0.033101145178079605, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03366, "step": 72, "tokens/total": 9437184, "tokens/train_per_sec_per_gpu": 3700.58, "tokens/trainable": 994803 }, { "epoch": 0.23248407643312102, "grad_norm": 0.396484375, "learning_rate": 2.2929936305732484e-05, "loss": 0.03042842261493206, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0309, "step": 73, "tokens/total": 9568256, "tokens/train_per_sec_per_gpu": 3386.78, "tokens/trainable": 1008996 }, { "epoch": 0.2356687898089172, "grad_norm": 0.53515625, "learning_rate": 2.3248407643312103e-05, "loss": 0.02688576839864254, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02725, "step": 74, "tokens/total": 9699328, "tokens/train_per_sec_per_gpu": 3353.01, "tokens/trainable": 1023021 }, { "epoch": 0.23885350318471338, "grad_norm": 0.51953125, "learning_rate": 2.356687898089172e-05, "loss": 0.028813578188419342, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02923, "step": 75, "tokens/total": 9830400, "tokens/train_per_sec_per_gpu": 3035.97, "tokens/trainable": 1035757 }, { "epoch": 0.24203821656050956, "grad_norm": 0.546875, "learning_rate": 2.388535031847134e-05, "loss": 0.035763900727033615, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03641, "step": 76, "tokens/total": 9961472, "tokens/train_per_sec_per_gpu": 2971.17, "tokens/trainable": 1048202 }, { "epoch": 0.24522292993630573, "grad_norm": 0.61328125, "learning_rate": 2.4203821656050956e-05, "loss": 0.026223331689834595, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02657, "step": 77, "tokens/total": 10092544, "tokens/train_per_sec_per_gpu": 3195.37, "tokens/trainable": 1061576 }, { "epoch": 0.2484076433121019, "grad_norm": 0.451171875, "learning_rate": 2.4522292993630575e-05, "loss": 0.037136998027563095, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.03784, "step": 78, "tokens/total": 10223616, "tokens/train_per_sec_per_gpu": 3185.64, "tokens/trainable": 1074924 }, { "epoch": 0.2515923566878981, "grad_norm": 0.44140625, "learning_rate": 2.4840764331210193e-05, "loss": 0.02757476083934307, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02796, "step": 79, "tokens/total": 10354688, "tokens/train_per_sec_per_gpu": 3141.94, "tokens/trainable": 1088089 }, { "epoch": 0.25477707006369427, "grad_norm": 0.60546875, "learning_rate": 2.515923566878981e-05, "loss": 0.026085954159498215, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02643, "step": 80, "tokens/total": 10485760, "tokens/train_per_sec_per_gpu": 3340.19, "tokens/trainable": 1102070 }, { "epoch": 0.25796178343949044, "grad_norm": 0.41015625, "learning_rate": 2.5477707006369428e-05, "loss": 0.027341356500983238, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02772, "step": 81, "tokens/total": 10616832, "tokens/train_per_sec_per_gpu": 3294.79, "tokens/trainable": 1115858 }, { "epoch": 0.2611464968152866, "grad_norm": 0.431640625, "learning_rate": 2.5796178343949047e-05, "loss": 0.028896335512399673, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02932, "step": 82, "tokens/total": 10747904, "tokens/train_per_sec_per_gpu": 3433.89, "tokens/trainable": 1130226 }, { "epoch": 0.2643312101910828, "grad_norm": 0.466796875, "learning_rate": 2.6114649681528662e-05, "loss": 0.026260778307914734, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02661, "step": 83, "tokens/total": 10878976, "tokens/train_per_sec_per_gpu": 3711.62, "tokens/trainable": 1145755 }, { "epoch": 0.267515923566879, "grad_norm": 0.53125, "learning_rate": 2.643312101910828e-05, "loss": 0.027284812182188034, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02766, "step": 84, "tokens/total": 11010048, "tokens/train_per_sec_per_gpu": 3309.83, "tokens/trainable": 1159641 }, { "epoch": 0.27070063694267515, "grad_norm": 0.376953125, "learning_rate": 2.67515923566879e-05, "loss": 0.02594919502735138, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02629, "step": 85, "tokens/total": 11141120, "tokens/train_per_sec_per_gpu": 3424.34, "tokens/trainable": 1173967 }, { "epoch": 0.27388535031847133, "grad_norm": 0.50390625, "learning_rate": 2.707006369426752e-05, "loss": 0.025507405400276184, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02584, "step": 86, "tokens/total": 11272192, "tokens/train_per_sec_per_gpu": 2757.24, "tokens/trainable": 1185544 }, { "epoch": 0.2770700636942675, "grad_norm": 0.4765625, "learning_rate": 2.7388535031847134e-05, "loss": 0.024133453145623207, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02443, "step": 87, "tokens/total": 11403264, "tokens/train_per_sec_per_gpu": 3215.45, "tokens/trainable": 1199051 }, { "epoch": 0.2802547770700637, "grad_norm": 0.45703125, "learning_rate": 2.7707006369426753e-05, "loss": 0.026854459196329117, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02722, "step": 88, "tokens/total": 11534336, "tokens/train_per_sec_per_gpu": 3550.27, "tokens/trainable": 1213857 }, { "epoch": 0.28343949044585987, "grad_norm": 0.45703125, "learning_rate": 2.802547770700637e-05, "loss": 0.02602829411625862, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02637, "step": 89, "tokens/total": 11665408, "tokens/train_per_sec_per_gpu": 3183.98, "tokens/trainable": 1227192 }, { "epoch": 0.28662420382165604, "grad_norm": 0.337890625, "learning_rate": 2.8343949044585987e-05, "loss": 0.020508471876382828, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02072, "step": 90, "tokens/total": 11796480, "tokens/train_per_sec_per_gpu": 3402.64, "tokens/trainable": 1241432 }, { "epoch": 0.2898089171974522, "grad_norm": 0.408203125, "learning_rate": 2.8662420382165606e-05, "loss": 0.017694037407636642, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01785, "step": 91, "tokens/total": 11927552, "tokens/train_per_sec_per_gpu": 3333.79, "tokens/trainable": 1255396 }, { "epoch": 0.2929936305732484, "grad_norm": 0.4140625, "learning_rate": 2.8980891719745225e-05, "loss": 0.027573810890316963, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02796, "step": 92, "tokens/total": 12058624, "tokens/train_per_sec_per_gpu": 2994.34, "tokens/trainable": 1268041 }, { "epoch": 0.2961783439490446, "grad_norm": 0.486328125, "learning_rate": 2.929936305732484e-05, "loss": 0.028143662959337234, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02854, "step": 93, "tokens/total": 12189696, "tokens/train_per_sec_per_gpu": 3516.43, "tokens/trainable": 1282765 }, { "epoch": 0.29936305732484075, "grad_norm": 0.4765625, "learning_rate": 2.961783439490446e-05, "loss": 0.026264818385243416, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02661, "step": 94, "tokens/total": 12320768, "tokens/train_per_sec_per_gpu": 3304.3, "tokens/trainable": 1296613 }, { "epoch": 0.30254777070063693, "grad_norm": 0.462890625, "learning_rate": 2.9936305732484078e-05, "loss": 0.026661768555641174, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02702, "step": 95, "tokens/total": 12451840, "tokens/train_per_sec_per_gpu": 3563.3, "tokens/trainable": 1311465 }, { "epoch": 0.3057324840764331, "grad_norm": 0.306640625, "learning_rate": 3.0254777070063693e-05, "loss": 0.017260678112506866, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01741, "step": 96, "tokens/total": 12582912, "tokens/train_per_sec_per_gpu": 3428.76, "tokens/trainable": 1325753 }, { "epoch": 0.3089171974522293, "grad_norm": 0.5703125, "learning_rate": 3.057324840764331e-05, "loss": 0.022419072687625885, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02267, "step": 97, "tokens/total": 12713984, "tokens/train_per_sec_per_gpu": 3443.07, "tokens/trainable": 1340109 }, { "epoch": 0.31210191082802546, "grad_norm": 0.50390625, "learning_rate": 3.089171974522293e-05, "loss": 0.023397397249937057, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02367, "step": 98, "tokens/total": 12845056, "tokens/train_per_sec_per_gpu": 3420.73, "tokens/trainable": 1354398 }, { "epoch": 0.31528662420382164, "grad_norm": 0.43359375, "learning_rate": 3.121019108280255e-05, "loss": 0.024743150919675827, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02505, "step": 99, "tokens/total": 12976128, "tokens/train_per_sec_per_gpu": 3420.87, "tokens/trainable": 1368740 }, { "epoch": 0.3184713375796178, "grad_norm": 0.3984375, "learning_rate": 3.1528662420382165e-05, "loss": 0.023541904985904694, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02382, "step": 100, "tokens/total": 13107200, "tokens/train_per_sec_per_gpu": 3192.28, "tokens/trainable": 1382180 }, { "epoch": 0.321656050955414, "grad_norm": 0.66015625, "learning_rate": 3.184713375796178e-05, "loss": 0.023172177374362946, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02344, "step": 101, "tokens/total": 13238272, "tokens/train_per_sec_per_gpu": 3177.98, "tokens/trainable": 1395593 }, { "epoch": 0.3248407643312102, "grad_norm": 0.48828125, "learning_rate": 3.21656050955414e-05, "loss": 0.025406980887055397, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02573, "step": 102, "tokens/total": 13369344, "tokens/train_per_sec_per_gpu": 3638.95, "tokens/trainable": 1410783 }, { "epoch": 0.32802547770700635, "grad_norm": 0.69921875, "learning_rate": 3.248407643312102e-05, "loss": 0.02435356006026268, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02465, "step": 103, "tokens/total": 13500416, "tokens/train_per_sec_per_gpu": 3263.52, "tokens/trainable": 1424464 }, { "epoch": 0.33121019108280253, "grad_norm": 0.404296875, "learning_rate": 3.2802547770700634e-05, "loss": 0.02753208577632904, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02791, "step": 104, "tokens/total": 13631488, "tokens/train_per_sec_per_gpu": 3426.67, "tokens/trainable": 1438808 }, { "epoch": 0.3343949044585987, "grad_norm": 0.404296875, "learning_rate": 3.3121019108280256e-05, "loss": 0.0209305789321661, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02115, "step": 105, "tokens/total": 13762560, "tokens/train_per_sec_per_gpu": 3761.41, "tokens/trainable": 1454488 }, { "epoch": 0.3375796178343949, "grad_norm": 0.5859375, "learning_rate": 3.343949044585987e-05, "loss": 0.023175280541181564, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02345, "step": 106, "tokens/total": 13893632, "tokens/train_per_sec_per_gpu": 3065.06, "tokens/trainable": 1467330 }, { "epoch": 0.34076433121019106, "grad_norm": 0.443359375, "learning_rate": 3.375796178343949e-05, "loss": 0.022064058110117912, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02231, "step": 107, "tokens/total": 14024704, "tokens/train_per_sec_per_gpu": 3306.24, "tokens/trainable": 1481142 }, { "epoch": 0.34394904458598724, "grad_norm": 0.490234375, "learning_rate": 3.407643312101911e-05, "loss": 0.0202829297631979, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02049, "step": 108, "tokens/total": 14155776, "tokens/train_per_sec_per_gpu": 3532.79, "tokens/trainable": 1495947 }, { "epoch": 0.3471337579617834, "grad_norm": 0.4453125, "learning_rate": 3.4394904458598724e-05, "loss": 0.01804858073592186, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01821, "step": 109, "tokens/total": 14286848, "tokens/train_per_sec_per_gpu": 3518.95, "tokens/trainable": 1510694 }, { "epoch": 0.3503184713375796, "grad_norm": 0.42578125, "learning_rate": 3.4713375796178346e-05, "loss": 0.0210330281406641, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02126, "step": 110, "tokens/total": 14417920, "tokens/train_per_sec_per_gpu": 2978.43, "tokens/trainable": 1523251 }, { "epoch": 0.3535031847133758, "grad_norm": 0.427734375, "learning_rate": 3.503184713375796e-05, "loss": 0.026296302676200867, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02665, "step": 111, "tokens/total": 14548992, "tokens/train_per_sec_per_gpu": 3167.03, "tokens/trainable": 1536545 }, { "epoch": 0.35668789808917195, "grad_norm": 0.5234375, "learning_rate": 3.535031847133758e-05, "loss": 0.020682599395513535, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0209, "step": 112, "tokens/total": 14680064, "tokens/train_per_sec_per_gpu": 3285.72, "tokens/trainable": 1550307 }, { "epoch": 0.35987261146496813, "grad_norm": 0.53125, "learning_rate": 3.56687898089172e-05, "loss": 0.018929051235318184, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01911, "step": 113, "tokens/total": 14811136, "tokens/train_per_sec_per_gpu": 3527.93, "tokens/trainable": 1565056 }, { "epoch": 0.3630573248407643, "grad_norm": 0.453125, "learning_rate": 3.5987261146496815e-05, "loss": 0.02578428015112877, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02612, "step": 114, "tokens/total": 14942208, "tokens/train_per_sec_per_gpu": 3195.27, "tokens/trainable": 1578471 }, { "epoch": 0.3662420382165605, "grad_norm": 0.54296875, "learning_rate": 3.630573248407643e-05, "loss": 0.02062690444290638, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02084, "step": 115, "tokens/total": 15073280, "tokens/train_per_sec_per_gpu": 3476.31, "tokens/trainable": 1593028 }, { "epoch": 0.36942675159235666, "grad_norm": 0.5546875, "learning_rate": 3.662420382165605e-05, "loss": 0.018274614587426186, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01844, "step": 116, "tokens/total": 15204352, "tokens/train_per_sec_per_gpu": 3437.34, "tokens/trainable": 1607412 }, { "epoch": 0.37261146496815284, "grad_norm": 0.3359375, "learning_rate": 3.694267515923567e-05, "loss": 0.02159012109041214, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02182, "step": 117, "tokens/total": 15335424, "tokens/train_per_sec_per_gpu": 3467.82, "tokens/trainable": 1621934 }, { "epoch": 0.37579617834394907, "grad_norm": 0.4609375, "learning_rate": 3.7261146496815283e-05, "loss": 0.0239134319126606, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0242, "step": 118, "tokens/total": 15466496, "tokens/train_per_sec_per_gpu": 3526.82, "tokens/trainable": 1636693 }, { "epoch": 0.37898089171974525, "grad_norm": 0.546875, "learning_rate": 3.7579617834394906e-05, "loss": 0.021818162873387337, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02206, "step": 119, "tokens/total": 15597568, "tokens/train_per_sec_per_gpu": 3233.2, "tokens/trainable": 1650256 }, { "epoch": 0.3821656050955414, "grad_norm": 0.3671875, "learning_rate": 3.789808917197453e-05, "loss": 0.023171117529273033, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02344, "step": 120, "tokens/total": 15728640, "tokens/train_per_sec_per_gpu": 3502.77, "tokens/trainable": 1664915 }, { "epoch": 0.3853503184713376, "grad_norm": 0.408203125, "learning_rate": 3.821656050955414e-05, "loss": 0.019905205816030502, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0201, "step": 121, "tokens/total": 15859712, "tokens/train_per_sec_per_gpu": 3495.55, "tokens/trainable": 1679527 }, { "epoch": 0.3885350318471338, "grad_norm": 0.4765625, "learning_rate": 3.8535031847133766e-05, "loss": 0.01511327363550663, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01523, "step": 122, "tokens/total": 15990784, "tokens/train_per_sec_per_gpu": 3507.46, "tokens/trainable": 1694159 }, { "epoch": 0.39171974522292996, "grad_norm": 0.44921875, "learning_rate": 3.885350318471338e-05, "loss": 0.02048143371939659, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02069, "step": 123, "tokens/total": 16121856, "tokens/train_per_sec_per_gpu": 3490.03, "tokens/trainable": 1708712 }, { "epoch": 0.39490445859872614, "grad_norm": 0.392578125, "learning_rate": 3.9171974522292996e-05, "loss": 0.02280033566057682, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02306, "step": 124, "tokens/total": 16252928, "tokens/train_per_sec_per_gpu": 3443.73, "tokens/trainable": 1723059 }, { "epoch": 0.3980891719745223, "grad_norm": 0.322265625, "learning_rate": 3.949044585987262e-05, "loss": 0.01703651435673237, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01718, "step": 125, "tokens/total": 16384000, "tokens/train_per_sec_per_gpu": 3396.46, "tokens/trainable": 1737268 }, { "epoch": 0.4012738853503185, "grad_norm": 0.37109375, "learning_rate": 3.9808917197452234e-05, "loss": 0.019548913463950157, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01974, "step": 126, "tokens/total": 16515072, "tokens/train_per_sec_per_gpu": 3220.09, "tokens/trainable": 1750779 }, { "epoch": 0.40445859872611467, "grad_norm": 0.4609375, "learning_rate": 4.012738853503185e-05, "loss": 0.021433480083942413, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02166, "step": 127, "tokens/total": 16646144, "tokens/train_per_sec_per_gpu": 3135.99, "tokens/trainable": 1763916 }, { "epoch": 0.40764331210191085, "grad_norm": 0.36328125, "learning_rate": 4.044585987261147e-05, "loss": 0.01608860120177269, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01622, "step": 128, "tokens/total": 16777216, "tokens/train_per_sec_per_gpu": 3294.85, "tokens/trainable": 1777688 }, { "epoch": 0.410828025477707, "grad_norm": 0.384765625, "learning_rate": 4.076433121019109e-05, "loss": 0.02616111747920513, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02651, "step": 129, "tokens/total": 16908288, "tokens/train_per_sec_per_gpu": 3542.44, "tokens/trainable": 1792526 }, { "epoch": 0.4140127388535032, "grad_norm": 0.359375, "learning_rate": 4.10828025477707e-05, "loss": 0.023339644074440002, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02361, "step": 130, "tokens/total": 17039360, "tokens/train_per_sec_per_gpu": 3579.44, "tokens/trainable": 1807456 }, { "epoch": 0.4171974522292994, "grad_norm": 0.396484375, "learning_rate": 4.1401273885350325e-05, "loss": 0.01703963428735733, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01719, "step": 131, "tokens/total": 17170432, "tokens/train_per_sec_per_gpu": 3374.03, "tokens/trainable": 1821617 }, { "epoch": 0.42038216560509556, "grad_norm": 0.322265625, "learning_rate": 4.171974522292994e-05, "loss": 0.018855011090636253, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01903, "step": 132, "tokens/total": 17301504, "tokens/train_per_sec_per_gpu": 3358.78, "tokens/trainable": 1835657 }, { "epoch": 0.42356687898089174, "grad_norm": 0.32421875, "learning_rate": 4.2038216560509556e-05, "loss": 0.018383294343948364, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01855, "step": 133, "tokens/total": 17432576, "tokens/train_per_sec_per_gpu": 3288.93, "tokens/trainable": 1849363 }, { "epoch": 0.4267515923566879, "grad_norm": 0.341796875, "learning_rate": 4.235668789808918e-05, "loss": 0.018167613074183464, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01833, "step": 134, "tokens/total": 17563648, "tokens/train_per_sec_per_gpu": 3327.75, "tokens/trainable": 1863304 }, { "epoch": 0.4299363057324841, "grad_norm": 0.263671875, "learning_rate": 4.267515923566879e-05, "loss": 0.016551347449421883, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01669, "step": 135, "tokens/total": 17694720, "tokens/train_per_sec_per_gpu": 3278.66, "tokens/trainable": 1877019 }, { "epoch": 0.43312101910828027, "grad_norm": 0.3359375, "learning_rate": 4.299363057324841e-05, "loss": 0.02233925275504589, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02259, "step": 136, "tokens/total": 17825792, "tokens/train_per_sec_per_gpu": 3065.32, "tokens/trainable": 1889991 }, { "epoch": 0.43630573248407645, "grad_norm": 0.35546875, "learning_rate": 4.331210191082803e-05, "loss": 0.01874961145222187, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01893, "step": 137, "tokens/total": 17956864, "tokens/train_per_sec_per_gpu": 3421.98, "tokens/trainable": 1904258 }, { "epoch": 0.4394904458598726, "grad_norm": 0.35546875, "learning_rate": 4.3630573248407646e-05, "loss": 0.016853082925081253, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.017, "step": 138, "tokens/total": 18087936, "tokens/train_per_sec_per_gpu": 3173.54, "tokens/trainable": 1917589 }, { "epoch": 0.4426751592356688, "grad_norm": 0.373046875, "learning_rate": 4.394904458598726e-05, "loss": 0.015192901715636253, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01531, "step": 139, "tokens/total": 18219008, "tokens/train_per_sec_per_gpu": 2954.41, "tokens/trainable": 1930014 }, { "epoch": 0.445859872611465, "grad_norm": 0.302734375, "learning_rate": 4.4267515923566884e-05, "loss": 0.01463925652205944, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01475, "step": 140, "tokens/total": 18350080, "tokens/train_per_sec_per_gpu": 3666.98, "tokens/trainable": 1945307 }, { "epoch": 0.44904458598726116, "grad_norm": 0.390625, "learning_rate": 4.45859872611465e-05, "loss": 0.020933344960212708, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.02115, "step": 141, "tokens/total": 18481152, "tokens/train_per_sec_per_gpu": 3580.73, "tokens/trainable": 1960244 }, { "epoch": 0.45222929936305734, "grad_norm": 0.345703125, "learning_rate": 4.4904458598726115e-05, "loss": 0.016706032678484917, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01685, "step": 142, "tokens/total": 18612224, "tokens/train_per_sec_per_gpu": 3692.46, "tokens/trainable": 1975680 }, { "epoch": 0.4554140127388535, "grad_norm": 0.271484375, "learning_rate": 4.522292993630574e-05, "loss": 0.0143811646848917, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01449, "step": 143, "tokens/total": 18743296, "tokens/train_per_sec_per_gpu": 3610.19, "tokens/trainable": 1990745 }, { "epoch": 0.4585987261146497, "grad_norm": 0.333984375, "learning_rate": 4.554140127388535e-05, "loss": 0.015790347009897232, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01592, "step": 144, "tokens/total": 18874368, "tokens/train_per_sec_per_gpu": 3290.56, "tokens/trainable": 2004531 }, { "epoch": 0.46178343949044587, "grad_norm": 0.251953125, "learning_rate": 4.585987261146497e-05, "loss": 0.013354619033634663, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01344, "step": 145, "tokens/total": 19005440, "tokens/train_per_sec_per_gpu": 3241.55, "tokens/trainable": 2018101 }, { "epoch": 0.46496815286624205, "grad_norm": 0.376953125, "learning_rate": 4.617834394904459e-05, "loss": 0.01745392382144928, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01761, "step": 146, "tokens/total": 19136512, "tokens/train_per_sec_per_gpu": 3409.89, "tokens/trainable": 2032310 }, { "epoch": 0.4681528662420382, "grad_norm": 0.38671875, "learning_rate": 4.6496815286624206e-05, "loss": 0.015100197866559029, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01521, "step": 147, "tokens/total": 19267584, "tokens/train_per_sec_per_gpu": 3269.82, "tokens/trainable": 2045999 }, { "epoch": 0.4713375796178344, "grad_norm": 0.310546875, "learning_rate": 4.681528662420383e-05, "loss": 0.01744706742465496, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0176, "step": 148, "tokens/total": 19398656, "tokens/train_per_sec_per_gpu": 3709.08, "tokens/trainable": 2061453 }, { "epoch": 0.4745222929936306, "grad_norm": 0.283203125, "learning_rate": 4.713375796178344e-05, "loss": 0.013093837536871433, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01318, "step": 149, "tokens/total": 19529728, "tokens/train_per_sec_per_gpu": 3292.43, "tokens/trainable": 2075180 }, { "epoch": 0.47770700636942676, "grad_norm": 0.275390625, "learning_rate": 4.745222929936306e-05, "loss": 0.01639549434185028, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01653, "step": 150, "tokens/total": 19660800, "tokens/train_per_sec_per_gpu": 3175.73, "tokens/trainable": 2088491 }, { "epoch": 0.48089171974522293, "grad_norm": 0.31640625, "learning_rate": 4.777070063694268e-05, "loss": 0.015184286050498486, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0153, "step": 151, "tokens/total": 19791872, "tokens/train_per_sec_per_gpu": 3611.48, "tokens/trainable": 2103581 }, { "epoch": 0.4840764331210191, "grad_norm": 0.318359375, "learning_rate": 4.8089171974522296e-05, "loss": 0.015232382342219353, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01535, "step": 152, "tokens/total": 19922944, "tokens/train_per_sec_per_gpu": 3138.84, "tokens/trainable": 2116743 }, { "epoch": 0.4872611464968153, "grad_norm": 0.4140625, "learning_rate": 4.840764331210191e-05, "loss": 0.018071118742227554, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01824, "step": 153, "tokens/total": 20054016, "tokens/train_per_sec_per_gpu": 2935.94, "tokens/trainable": 2129049 }, { "epoch": 0.49044585987261147, "grad_norm": 0.26953125, "learning_rate": 4.8726114649681534e-05, "loss": 0.015034169889986515, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01515, "step": 154, "tokens/total": 20185088, "tokens/train_per_sec_per_gpu": 3956.78, "tokens/trainable": 2145499 }, { "epoch": 0.49363057324840764, "grad_norm": 0.2734375, "learning_rate": 4.904458598726115e-05, "loss": 0.013894051313400269, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01399, "step": 155, "tokens/total": 20316160, "tokens/train_per_sec_per_gpu": 3559.16, "tokens/trainable": 2160294 }, { "epoch": 0.4968152866242038, "grad_norm": 0.29296875, "learning_rate": 4.9363057324840765e-05, "loss": 0.01629924215376377, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01643, "step": 156, "tokens/total": 20447232, "tokens/train_per_sec_per_gpu": 3108.59, "tokens/trainable": 2173313 }, { "epoch": 0.5, "grad_norm": 0.34375, "learning_rate": 4.968152866242039e-05, "loss": 0.014140879735350609, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01424, "step": 157, "tokens/total": 20578304, "tokens/train_per_sec_per_gpu": 3311.55, "tokens/trainable": 2187121 }, { "epoch": 0.5, "eval_loss": 0.0162150077521801, "eval_ppl": 1.01635, "eval_runtime": 42.1529, "eval_samples_per_second": 64.076, "eval_steps_per_second": 4.009, "memory/device_reserved (GiB)": 68.88, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 157 }, { "epoch": 0.5031847133757962, "grad_norm": 0.255859375, "learning_rate": 5e-05, "loss": 0.012421849183738232, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0125, "step": 158, "tokens/total": 20709376, "tokens/train_per_sec_per_gpu": 3796.22, "tokens/trainable": 2202882 }, { "epoch": 0.5063694267515924, "grad_norm": 0.298828125, "learning_rate": 4.999993820899543e-05, "loss": 0.014737301506102085, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01485, "step": 159, "tokens/total": 20840448, "tokens/train_per_sec_per_gpu": 2912.87, "tokens/trainable": 2215142 }, { "epoch": 0.5095541401273885, "grad_norm": 0.3828125, "learning_rate": 4.999975283628719e-05, "loss": 0.017280632629990578, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01743, "step": 160, "tokens/total": 20971520, "tokens/train_per_sec_per_gpu": 2864.73, "tokens/trainable": 2227241 }, { "epoch": 0.5127388535031847, "grad_norm": 0.30078125, "learning_rate": 4.999944388279162e-05, "loss": 0.014671262353658676, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01478, "step": 161, "tokens/total": 21102592, "tokens/train_per_sec_per_gpu": 3598.96, "tokens/trainable": 2242266 }, { "epoch": 0.5159235668789809, "grad_norm": 0.357421875, "learning_rate": 4.999901135003596e-05, "loss": 0.01328805461525917, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01338, "step": 162, "tokens/total": 21233664, "tokens/train_per_sec_per_gpu": 3491.44, "tokens/trainable": 2256820 }, { "epoch": 0.5191082802547771, "grad_norm": 0.294921875, "learning_rate": 4.9998455240158346e-05, "loss": 0.015039588324725628, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01515, "step": 163, "tokens/total": 21364736, "tokens/train_per_sec_per_gpu": 2929.21, "tokens/trainable": 2269119 }, { "epoch": 0.5222929936305732, "grad_norm": 0.3203125, "learning_rate": 4.999777555590779e-05, "loss": 0.014336930587887764, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01444, "step": 164, "tokens/total": 21495808, "tokens/train_per_sec_per_gpu": 3728.34, "tokens/trainable": 2284700 }, { "epoch": 0.5254777070063694, "grad_norm": 0.279296875, "learning_rate": 4.999697230064414e-05, "loss": 0.01668444462120533, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01682, "step": 165, "tokens/total": 21626880, "tokens/train_per_sec_per_gpu": 3542.38, "tokens/trainable": 2299523 }, { "epoch": 0.5286624203821656, "grad_norm": 0.275390625, "learning_rate": 4.999604547833814e-05, "loss": 0.01559534203261137, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01572, "step": 166, "tokens/total": 21757952, "tokens/train_per_sec_per_gpu": 3348.3, "tokens/trainable": 2313539 }, { "epoch": 0.5318471337579618, "grad_norm": 0.251953125, "learning_rate": 4.9994995093571314e-05, "loss": 0.01181457843631506, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01188, "step": 167, "tokens/total": 21889024, "tokens/train_per_sec_per_gpu": 3193.91, "tokens/trainable": 2326972 }, { "epoch": 0.535031847133758, "grad_norm": 0.326171875, "learning_rate": 4.9993821151536024e-05, "loss": 0.014408236369490623, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01451, "step": 168, "tokens/total": 22020096, "tokens/train_per_sec_per_gpu": 3172.43, "tokens/trainable": 2340305 }, { "epoch": 0.5382165605095541, "grad_norm": 0.259765625, "learning_rate": 4.9992523658035376e-05, "loss": 0.010526357218623161, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01058, "step": 169, "tokens/total": 22151168, "tokens/train_per_sec_per_gpu": 3477.3, "tokens/trainable": 2354865 }, { "epoch": 0.5414012738853503, "grad_norm": 0.2734375, "learning_rate": 4.9991102619483254e-05, "loss": 0.015866123139858246, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01599, "step": 170, "tokens/total": 22282240, "tokens/train_per_sec_per_gpu": 3352.2, "tokens/trainable": 2368942 }, { "epoch": 0.5445859872611465, "grad_norm": 0.34765625, "learning_rate": 4.998955804290425e-05, "loss": 0.015990689396858215, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01612, "step": 171, "tokens/total": 22413312, "tokens/train_per_sec_per_gpu": 3329.47, "tokens/trainable": 2382903 }, { "epoch": 0.5477707006369427, "grad_norm": 0.294921875, "learning_rate": 4.998788993593364e-05, "loss": 0.012892219237983227, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01298, "step": 172, "tokens/total": 22544384, "tokens/train_per_sec_per_gpu": 3491.15, "tokens/trainable": 2397472 }, { "epoch": 0.5509554140127388, "grad_norm": 0.326171875, "learning_rate": 4.998609830681734e-05, "loss": 0.016418559476733208, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01655, "step": 173, "tokens/total": 22675456, "tokens/train_per_sec_per_gpu": 3177.44, "tokens/trainable": 2410837 }, { "epoch": 0.554140127388535, "grad_norm": 0.275390625, "learning_rate": 4.998418316441188e-05, "loss": 0.0159194003790617, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01605, "step": 174, "tokens/total": 22806528, "tokens/train_per_sec_per_gpu": 3252.23, "tokens/trainable": 2424499 }, { "epoch": 0.5573248407643312, "grad_norm": 0.255859375, "learning_rate": 4.998214451818434e-05, "loss": 0.017272397875785828, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01742, "step": 175, "tokens/total": 22937600, "tokens/train_per_sec_per_gpu": 3335.6, "tokens/trainable": 2438525 }, { "epoch": 0.5605095541401274, "grad_norm": 0.3671875, "learning_rate": 4.997998237821233e-05, "loss": 0.018668157979846, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01884, "step": 176, "tokens/total": 23068672, "tokens/train_per_sec_per_gpu": 3037.8, "tokens/trainable": 2451344 }, { "epoch": 0.5636942675159236, "grad_norm": 0.275390625, "learning_rate": 4.99776967551839e-05, "loss": 0.013892064802348614, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01399, "step": 177, "tokens/total": 23199744, "tokens/train_per_sec_per_gpu": 3608.18, "tokens/trainable": 2466462 }, { "epoch": 0.5668789808917197, "grad_norm": 0.318359375, "learning_rate": 4.997528766039754e-05, "loss": 0.018128130584955215, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01829, "step": 178, "tokens/total": 23330816, "tokens/train_per_sec_per_gpu": 3418.17, "tokens/trainable": 2480794 }, { "epoch": 0.5700636942675159, "grad_norm": 0.279296875, "learning_rate": 4.997275510576207e-05, "loss": 0.015599234029650688, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01572, "step": 179, "tokens/total": 23461888, "tokens/train_per_sec_per_gpu": 3348.28, "tokens/trainable": 2494826 }, { "epoch": 0.5732484076433121, "grad_norm": 0.263671875, "learning_rate": 4.9970099103796625e-05, "loss": 0.01772911660373211, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01789, "step": 180, "tokens/total": 23592960, "tokens/train_per_sec_per_gpu": 3350.41, "tokens/trainable": 2508825 }, { "epoch": 0.5764331210191083, "grad_norm": 0.3046875, "learning_rate": 4.9967319667630567e-05, "loss": 0.017531519755721092, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01769, "step": 181, "tokens/total": 23724032, "tokens/train_per_sec_per_gpu": 3416.59, "tokens/trainable": 2523152 }, { "epoch": 0.5796178343949044, "grad_norm": 0.25390625, "learning_rate": 4.9964416811003414e-05, "loss": 0.01645725592970848, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01659, "step": 182, "tokens/total": 23855104, "tokens/train_per_sec_per_gpu": 3286.33, "tokens/trainable": 2536956 }, { "epoch": 0.5828025477707006, "grad_norm": 0.298828125, "learning_rate": 4.996139054826482e-05, "loss": 0.017507638782262802, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01766, "step": 183, "tokens/total": 23986176, "tokens/train_per_sec_per_gpu": 3802.12, "tokens/trainable": 2552813 }, { "epoch": 0.5859872611464968, "grad_norm": 0.2333984375, "learning_rate": 4.9958240894374433e-05, "loss": 0.015289016999304295, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01541, "step": 184, "tokens/total": 24117248, "tokens/train_per_sec_per_gpu": 3166.13, "tokens/trainable": 2566093 }, { "epoch": 0.589171974522293, "grad_norm": 0.2275390625, "learning_rate": 4.995496786490189e-05, "loss": 0.01385944988578558, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01396, "step": 185, "tokens/total": 24248320, "tokens/train_per_sec_per_gpu": 3395.23, "tokens/trainable": 2580324 }, { "epoch": 0.5923566878980892, "grad_norm": 0.28515625, "learning_rate": 4.995157147602669e-05, "loss": 0.01804269105195999, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01821, "step": 186, "tokens/total": 24379392, "tokens/train_per_sec_per_gpu": 3278.99, "tokens/trainable": 2594113 }, { "epoch": 0.5955414012738853, "grad_norm": 0.3359375, "learning_rate": 4.994805174453813e-05, "loss": 0.01675378903746605, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01689, "step": 187, "tokens/total": 24510464, "tokens/train_per_sec_per_gpu": 3247.25, "tokens/trainable": 2607778 }, { "epoch": 0.5987261146496815, "grad_norm": 0.2578125, "learning_rate": 4.994440868783522e-05, "loss": 0.014898994006216526, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01501, "step": 188, "tokens/total": 24641536, "tokens/train_per_sec_per_gpu": 3439.86, "tokens/trainable": 2622165 }, { "epoch": 0.6019108280254777, "grad_norm": 0.236328125, "learning_rate": 4.994064232392664e-05, "loss": 0.012711770832538605, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01279, "step": 189, "tokens/total": 24772608, "tokens/train_per_sec_per_gpu": 3250.18, "tokens/trainable": 2635842 }, { "epoch": 0.6050955414012739, "grad_norm": 0.201171875, "learning_rate": 4.993675267143056e-05, "loss": 0.0118938647210598, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01196, "step": 190, "tokens/total": 24903680, "tokens/train_per_sec_per_gpu": 3684.73, "tokens/trainable": 2651259 }, { "epoch": 0.60828025477707, "grad_norm": 0.265625, "learning_rate": 4.993273974957463e-05, "loss": 0.011486702598631382, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01155, "step": 191, "tokens/total": 25034752, "tokens/train_per_sec_per_gpu": 3177.1, "tokens/trainable": 2664566 }, { "epoch": 0.6114649681528662, "grad_norm": 0.23046875, "learning_rate": 4.992860357819584e-05, "loss": 0.012811151333153248, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01289, "step": 192, "tokens/total": 25165824, "tokens/train_per_sec_per_gpu": 3414.45, "tokens/trainable": 2678871 }, { "epoch": 0.6146496815286624, "grad_norm": 0.30078125, "learning_rate": 4.992434417774045e-05, "loss": 0.011826693080365658, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0119, "step": 193, "tokens/total": 25296896, "tokens/train_per_sec_per_gpu": 3298.7, "tokens/trainable": 2692737 }, { "epoch": 0.6178343949044586, "grad_norm": 0.2353515625, "learning_rate": 4.991996156926387e-05, "loss": 0.01326029933989048, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01335, "step": 194, "tokens/total": 25427968, "tokens/train_per_sec_per_gpu": 3122.96, "tokens/trainable": 2705928 }, { "epoch": 0.6210191082802548, "grad_norm": 0.2890625, "learning_rate": 4.991545577443057e-05, "loss": 0.012153583578765392, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01223, "step": 195, "tokens/total": 25559040, "tokens/train_per_sec_per_gpu": 3089.07, "tokens/trainable": 2718915 }, { "epoch": 0.6242038216560509, "grad_norm": 0.296875, "learning_rate": 4.991082681551396e-05, "loss": 0.014371933415532112, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01448, "step": 196, "tokens/total": 25690112, "tokens/train_per_sec_per_gpu": 3197.51, "tokens/trainable": 2732376 }, { "epoch": 0.6273885350318471, "grad_norm": 0.26953125, "learning_rate": 4.990607471539626e-05, "loss": 0.012046409770846367, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01212, "step": 197, "tokens/total": 25821184, "tokens/train_per_sec_per_gpu": 3374.92, "tokens/trainable": 2746546 }, { "epoch": 0.6305732484076433, "grad_norm": 0.2392578125, "learning_rate": 4.990119949756845e-05, "loss": 0.009664296172559261, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00971, "step": 198, "tokens/total": 25952256, "tokens/train_per_sec_per_gpu": 3569.8, "tokens/trainable": 2761477 }, { "epoch": 0.6337579617834395, "grad_norm": 0.279296875, "learning_rate": 4.989620118613009e-05, "loss": 0.00950827170163393, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00955, "step": 199, "tokens/total": 26083328, "tokens/train_per_sec_per_gpu": 3265.27, "tokens/trainable": 2775167 }, { "epoch": 0.6369426751592356, "grad_norm": 0.310546875, "learning_rate": 4.989107980578924e-05, "loss": 0.01698843576014042, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01713, "step": 200, "tokens/total": 26214400, "tokens/train_per_sec_per_gpu": 3262.25, "tokens/trainable": 2788865 }, { "epoch": 0.6401273885350318, "grad_norm": 0.248046875, "learning_rate": 4.9885835381862326e-05, "loss": 0.009720825590193272, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00977, "step": 201, "tokens/total": 26345472, "tokens/train_per_sec_per_gpu": 3459.38, "tokens/trainable": 2803380 }, { "epoch": 0.643312101910828, "grad_norm": 0.30859375, "learning_rate": 4.988046794027399e-05, "loss": 0.01347583532333374, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01357, "step": 202, "tokens/total": 26476544, "tokens/train_per_sec_per_gpu": 3450.44, "tokens/trainable": 2817829 }, { "epoch": 0.6464968152866242, "grad_norm": 0.2890625, "learning_rate": 4.987497750755702e-05, "loss": 0.014860209077596664, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01497, "step": 203, "tokens/total": 26607616, "tokens/train_per_sec_per_gpu": 3450.98, "tokens/trainable": 2832277 }, { "epoch": 0.6496815286624203, "grad_norm": 0.31640625, "learning_rate": 4.986936411085214e-05, "loss": 0.016120830550789833, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01625, "step": 204, "tokens/total": 26738688, "tokens/train_per_sec_per_gpu": 3184.35, "tokens/trainable": 2845614 }, { "epoch": 0.6528662420382165, "grad_norm": 0.2578125, "learning_rate": 4.986362777790796e-05, "loss": 0.01890011504292488, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01908, "step": 205, "tokens/total": 26869760, "tokens/train_per_sec_per_gpu": 3386.54, "tokens/trainable": 2859717 }, { "epoch": 0.6560509554140127, "grad_norm": 0.333984375, "learning_rate": 4.9857768537080784e-05, "loss": 0.014317265711724758, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01442, "step": 206, "tokens/total": 27000832, "tokens/train_per_sec_per_gpu": 3426.69, "tokens/trainable": 2874068 }, { "epoch": 0.6592356687898089, "grad_norm": 0.31640625, "learning_rate": 4.9851786417334466e-05, "loss": 0.013661851175129414, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01376, "step": 207, "tokens/total": 27131904, "tokens/train_per_sec_per_gpu": 3324.85, "tokens/trainable": 2887963 }, { "epoch": 0.6624203821656051, "grad_norm": 0.251953125, "learning_rate": 4.984568144824032e-05, "loss": 0.01245003379881382, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01253, "step": 208, "tokens/total": 27262976, "tokens/train_per_sec_per_gpu": 3335.64, "tokens/trainable": 2901885 }, { "epoch": 0.6656050955414012, "grad_norm": 0.265625, "learning_rate": 4.983945365997691e-05, "loss": 0.010308452881872654, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01036, "step": 209, "tokens/total": 27394048, "tokens/train_per_sec_per_gpu": 2771.97, "tokens/trainable": 2913512 }, { "epoch": 0.6687898089171974, "grad_norm": 0.234375, "learning_rate": 4.9833103083329947e-05, "loss": 0.013119550421833992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01321, "step": 210, "tokens/total": 27525120, "tokens/train_per_sec_per_gpu": 3729.48, "tokens/trainable": 2929046 }, { "epoch": 0.6719745222929936, "grad_norm": 0.259765625, "learning_rate": 4.98266297496921e-05, "loss": 0.01352207362651825, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01361, "step": 211, "tokens/total": 27656192, "tokens/train_per_sec_per_gpu": 3277.56, "tokens/trainable": 2942780 }, { "epoch": 0.6751592356687898, "grad_norm": 0.34765625, "learning_rate": 4.982003369106287e-05, "loss": 0.017431171610951424, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01758, "step": 212, "tokens/total": 27787264, "tokens/train_per_sec_per_gpu": 3344.98, "tokens/trainable": 2956783 }, { "epoch": 0.678343949044586, "grad_norm": 0.255859375, "learning_rate": 4.981331494004845e-05, "loss": 0.01397764589637518, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01408, "step": 213, "tokens/total": 27918336, "tokens/train_per_sec_per_gpu": 3185.6, "tokens/trainable": 2970117 }, { "epoch": 0.6815286624203821, "grad_norm": 0.30859375, "learning_rate": 4.980647352986148e-05, "loss": 0.014616122469305992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01472, "step": 214, "tokens/total": 28049408, "tokens/train_per_sec_per_gpu": 3594.29, "tokens/trainable": 2985083 }, { "epoch": 0.6847133757961783, "grad_norm": 0.34375, "learning_rate": 4.979950949432098e-05, "loss": 0.012630216777324677, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01271, "step": 215, "tokens/total": 28180480, "tokens/train_per_sec_per_gpu": 3114.53, "tokens/trainable": 2998164 }, { "epoch": 0.6878980891719745, "grad_norm": 0.369140625, "learning_rate": 4.979242286785214e-05, "loss": 0.01619878038764, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01633, "step": 216, "tokens/total": 28311552, "tokens/train_per_sec_per_gpu": 3343.4, "tokens/trainable": 3012168 }, { "epoch": 0.6910828025477707, "grad_norm": 0.1923828125, "learning_rate": 4.978521368548612e-05, "loss": 0.00897720456123352, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00902, "step": 217, "tokens/total": 28442624, "tokens/train_per_sec_per_gpu": 3292.3, "tokens/trainable": 3025888 }, { "epoch": 0.6942675159235668, "grad_norm": 0.232421875, "learning_rate": 4.977788198285995e-05, "loss": 0.010021158494055271, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01007, "step": 218, "tokens/total": 28573696, "tokens/train_per_sec_per_gpu": 3319.6, "tokens/trainable": 3039763 }, { "epoch": 0.697452229299363, "grad_norm": 0.23828125, "learning_rate": 4.9770427796216284e-05, "loss": 0.01425202563405037, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01435, "step": 219, "tokens/total": 28704768, "tokens/train_per_sec_per_gpu": 2847.77, "tokens/trainable": 3051731 }, { "epoch": 0.7006369426751592, "grad_norm": 0.322265625, "learning_rate": 4.976285116240326e-05, "loss": 0.014778842218220234, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01489, "step": 220, "tokens/total": 28835840, "tokens/train_per_sec_per_gpu": 3280.14, "tokens/trainable": 3065475 }, { "epoch": 0.7038216560509554, "grad_norm": 0.2275390625, "learning_rate": 4.9755152118874294e-05, "loss": 0.011257003992795944, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01132, "step": 221, "tokens/total": 28966912, "tokens/train_per_sec_per_gpu": 3367.48, "tokens/trainable": 3079510 }, { "epoch": 0.7070063694267515, "grad_norm": 0.2021484375, "learning_rate": 4.9747330703687914e-05, "loss": 0.013675577938556671, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01377, "step": 222, "tokens/total": 29097984, "tokens/train_per_sec_per_gpu": 3844.8, "tokens/trainable": 3095524 }, { "epoch": 0.7101910828025477, "grad_norm": 0.2294921875, "learning_rate": 4.9739386955507587e-05, "loss": 0.01433156430721283, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01443, "step": 223, "tokens/total": 29229056, "tokens/train_per_sec_per_gpu": 3346.69, "tokens/trainable": 3109543 }, { "epoch": 0.7133757961783439, "grad_norm": 0.2177734375, "learning_rate": 4.9731320913601474e-05, "loss": 0.010345865972340107, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0104, "step": 224, "tokens/total": 29360128, "tokens/train_per_sec_per_gpu": 3025.76, "tokens/trainable": 3122229 }, { "epoch": 0.7165605095541401, "grad_norm": 0.2109375, "learning_rate": 4.9723132617842284e-05, "loss": 0.014529074542224407, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01464, "step": 225, "tokens/total": 29491200, "tokens/train_per_sec_per_gpu": 3346.66, "tokens/trainable": 3136235 }, { "epoch": 0.7197452229299363, "grad_norm": 0.263671875, "learning_rate": 4.971482210870706e-05, "loss": 0.017442386597394943, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0176, "step": 226, "tokens/total": 29622272, "tokens/train_per_sec_per_gpu": 3192.22, "tokens/trainable": 3149606 }, { "epoch": 0.7229299363057324, "grad_norm": 0.1875, "learning_rate": 4.970638942727698e-05, "loss": 0.00844226311892271, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00848, "step": 227, "tokens/total": 29753344, "tokens/train_per_sec_per_gpu": 3247.88, "tokens/trainable": 3163147 }, { "epoch": 0.7261146496815286, "grad_norm": 0.1748046875, "learning_rate": 4.969783461523714e-05, "loss": 0.010366439819335938, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01042, "step": 228, "tokens/total": 29884416, "tokens/train_per_sec_per_gpu": 3545.1, "tokens/trainable": 3177891 }, { "epoch": 0.7292993630573248, "grad_norm": 0.259765625, "learning_rate": 4.968915771487639e-05, "loss": 0.011432585306465626, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0115, "step": 229, "tokens/total": 30015488, "tokens/train_per_sec_per_gpu": 3336.0, "tokens/trainable": 3191819 }, { "epoch": 0.732484076433121, "grad_norm": 0.2412109375, "learning_rate": 4.9680358769087076e-05, "loss": 0.012058578431606293, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01213, "step": 230, "tokens/total": 30146560, "tokens/train_per_sec_per_gpu": 3245.98, "tokens/trainable": 3205431 }, { "epoch": 0.7356687898089171, "grad_norm": 0.216796875, "learning_rate": 4.9671437821364855e-05, "loss": 0.013203555718064308, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01329, "step": 231, "tokens/total": 30277632, "tokens/train_per_sec_per_gpu": 2895.23, "tokens/trainable": 3217538 }, { "epoch": 0.7388535031847133, "grad_norm": 0.2109375, "learning_rate": 4.966239491580847e-05, "loss": 0.011110116727650166, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01117, "step": 232, "tokens/total": 30408704, "tokens/train_per_sec_per_gpu": 3255.67, "tokens/trainable": 3231099 }, { "epoch": 0.7420382165605095, "grad_norm": 0.19921875, "learning_rate": 4.965323009711954e-05, "loss": 0.01235074270516634, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01243, "step": 233, "tokens/total": 30539776, "tokens/train_per_sec_per_gpu": 3738.25, "tokens/trainable": 3246613 }, { "epoch": 0.7452229299363057, "grad_norm": 0.2119140625, "learning_rate": 4.964394341060233e-05, "loss": 0.014128293842077255, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01423, "step": 234, "tokens/total": 30670848, "tokens/train_per_sec_per_gpu": 3075.78, "tokens/trainable": 3259483 }, { "epoch": 0.7484076433121019, "grad_norm": 0.2041015625, "learning_rate": 4.9634534902163544e-05, "loss": 0.011594554409384727, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01166, "step": 235, "tokens/total": 30801920, "tokens/train_per_sec_per_gpu": 3397.95, "tokens/trainable": 3273641 }, { "epoch": 0.7515923566878981, "grad_norm": 0.34375, "learning_rate": 4.962500461831207e-05, "loss": 0.015983082354068756, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01611, "step": 236, "tokens/total": 30932992, "tokens/train_per_sec_per_gpu": 3322.87, "tokens/trainable": 3287575 }, { "epoch": 0.7547770700636943, "grad_norm": 0.2333984375, "learning_rate": 4.961535260615876e-05, "loss": 0.01292226929217577, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01301, "step": 237, "tokens/total": 31064064, "tokens/train_per_sec_per_gpu": 3320.22, "tokens/trainable": 3301421 }, { "epoch": 0.7579617834394905, "grad_norm": 0.2197265625, "learning_rate": 4.9605578913416245e-05, "loss": 0.014275891706347466, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01438, "step": 238, "tokens/total": 31195136, "tokens/train_per_sec_per_gpu": 3614.8, "tokens/trainable": 3316404 }, { "epoch": 0.7611464968152867, "grad_norm": 0.267578125, "learning_rate": 4.959568358839861e-05, "loss": 0.01322453934699297, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01331, "step": 239, "tokens/total": 31326208, "tokens/train_per_sec_per_gpu": 3704.99, "tokens/trainable": 3331869 }, { "epoch": 0.7643312101910829, "grad_norm": 0.240234375, "learning_rate": 4.958566668002123e-05, "loss": 0.01428250689059496, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01438, "step": 240, "tokens/total": 31457280, "tokens/train_per_sec_per_gpu": 3217.37, "tokens/trainable": 3345254 }, { "epoch": 0.767515923566879, "grad_norm": 0.248046875, "learning_rate": 4.957552823780047e-05, "loss": 0.011499980464577675, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01157, "step": 241, "tokens/total": 31588352, "tokens/train_per_sec_per_gpu": 3332.37, "tokens/trainable": 3359111 }, { "epoch": 0.7707006369426752, "grad_norm": 0.25, "learning_rate": 4.956526831185353e-05, "loss": 0.014339377172291279, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01444, "step": 242, "tokens/total": 31719424, "tokens/train_per_sec_per_gpu": 3461.36, "tokens/trainable": 3373551 }, { "epoch": 0.7738853503184714, "grad_norm": 0.1787109375, "learning_rate": 4.955488695289806e-05, "loss": 0.009887355379760265, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00994, "step": 243, "tokens/total": 31850496, "tokens/train_per_sec_per_gpu": 3502.72, "tokens/trainable": 3388151 }, { "epoch": 0.7770700636942676, "grad_norm": 0.236328125, "learning_rate": 4.954438421225206e-05, "loss": 0.013017972931265831, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0131, "step": 244, "tokens/total": 31981568, "tokens/train_per_sec_per_gpu": 3313.42, "tokens/trainable": 3401935 }, { "epoch": 0.7802547770700637, "grad_norm": 0.22265625, "learning_rate": 4.9533760141833506e-05, "loss": 0.012434033676981926, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01251, "step": 245, "tokens/total": 32112640, "tokens/train_per_sec_per_gpu": 3363.79, "tokens/trainable": 3415979 }, { "epoch": 0.7834394904458599, "grad_norm": 0.19140625, "learning_rate": 4.952301479416015e-05, "loss": 0.011714441701769829, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01178, "step": 246, "tokens/total": 32243712, "tokens/train_per_sec_per_gpu": 3236.03, "tokens/trainable": 3429486 }, { "epoch": 0.7866242038216561, "grad_norm": 0.2294921875, "learning_rate": 4.9512148222349274e-05, "loss": 0.01364858727902174, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01374, "step": 247, "tokens/total": 32374784, "tokens/train_per_sec_per_gpu": 3117.68, "tokens/trainable": 3442584 }, { "epoch": 0.7898089171974523, "grad_norm": 0.185546875, "learning_rate": 4.950116048011739e-05, "loss": 0.00907064788043499, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00911, "step": 248, "tokens/total": 32505856, "tokens/train_per_sec_per_gpu": 3310.3, "tokens/trainable": 3456412 }, { "epoch": 0.7929936305732485, "grad_norm": 0.185546875, "learning_rate": 4.949005162177997e-05, "loss": 0.011760072782635689, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01183, "step": 249, "tokens/total": 32636928, "tokens/train_per_sec_per_gpu": 3404.86, "tokens/trainable": 3470647 }, { "epoch": 0.7961783439490446, "grad_norm": 0.2294921875, "learning_rate": 4.9478821702251234e-05, "loss": 0.014284678734838963, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01439, "step": 250, "tokens/total": 32768000, "tokens/train_per_sec_per_gpu": 3377.89, "tokens/trainable": 3484748 }, { "epoch": 0.7993630573248408, "grad_norm": 0.18359375, "learning_rate": 4.9467470777043806e-05, "loss": 0.011529207229614258, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0116, "step": 251, "tokens/total": 32899072, "tokens/train_per_sec_per_gpu": 3574.68, "tokens/trainable": 3499669 }, { "epoch": 0.802547770700637, "grad_norm": 0.2314453125, "learning_rate": 4.9455998902268504e-05, "loss": 0.01309981569647789, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01319, "step": 252, "tokens/total": 33030144, "tokens/train_per_sec_per_gpu": 3255.28, "tokens/trainable": 3513312 }, { "epoch": 0.8057324840764332, "grad_norm": 0.1748046875, "learning_rate": 4.944440613463402e-05, "loss": 0.007244420703500509, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00727, "step": 253, "tokens/total": 33161216, "tokens/train_per_sec_per_gpu": 3061.67, "tokens/trainable": 3526131 }, { "epoch": 0.8089171974522293, "grad_norm": 0.1982421875, "learning_rate": 4.943269253144664e-05, "loss": 0.012152907438576221, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01223, "step": 254, "tokens/total": 33292288, "tokens/train_per_sec_per_gpu": 3129.1, "tokens/trainable": 3539258 }, { "epoch": 0.8121019108280255, "grad_norm": 0.1923828125, "learning_rate": 4.9420858150610025e-05, "loss": 0.009945802390575409, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01, "step": 255, "tokens/total": 33423360, "tokens/train_per_sec_per_gpu": 3101.37, "tokens/trainable": 3552212 }, { "epoch": 0.8152866242038217, "grad_norm": 0.1826171875, "learning_rate": 4.9408903050624796e-05, "loss": 0.00950522068887949, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00955, "step": 256, "tokens/total": 33554432, "tokens/train_per_sec_per_gpu": 3437.25, "tokens/trainable": 3566622 }, { "epoch": 0.8184713375796179, "grad_norm": 0.265625, "learning_rate": 4.939682729058839e-05, "loss": 0.012676852755248547, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01276, "step": 257, "tokens/total": 33685504, "tokens/train_per_sec_per_gpu": 3405.34, "tokens/trainable": 3580857 }, { "epoch": 0.821656050955414, "grad_norm": 0.2392578125, "learning_rate": 4.938463093019466e-05, "loss": 0.012163055129349232, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01224, "step": 258, "tokens/total": 33816576, "tokens/train_per_sec_per_gpu": 3175.85, "tokens/trainable": 3594180 }, { "epoch": 0.8248407643312102, "grad_norm": 0.220703125, "learning_rate": 4.937231402973365e-05, "loss": 0.011768801137804985, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01184, "step": 259, "tokens/total": 33947648, "tokens/train_per_sec_per_gpu": 3036.11, "tokens/trainable": 3606954 }, { "epoch": 0.8280254777070064, "grad_norm": 0.2333984375, "learning_rate": 4.935987665009123e-05, "loss": 0.01067468523979187, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01073, "step": 260, "tokens/total": 34078720, "tokens/train_per_sec_per_gpu": 3332.71, "tokens/trainable": 3620834 }, { "epoch": 0.8312101910828026, "grad_norm": 0.208984375, "learning_rate": 4.934731885274887e-05, "loss": 0.008789247833192348, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00883, "step": 261, "tokens/total": 34209792, "tokens/train_per_sec_per_gpu": 3139.67, "tokens/trainable": 3633998 }, { "epoch": 0.8343949044585988, "grad_norm": 0.2119140625, "learning_rate": 4.9334640699783286e-05, "loss": 0.011909011751413345, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01198, "step": 262, "tokens/total": 34340864, "tokens/train_per_sec_per_gpu": 3340.74, "tokens/trainable": 3647974 }, { "epoch": 0.8375796178343949, "grad_norm": 0.265625, "learning_rate": 4.9321842253866136e-05, "loss": 0.013996127992868423, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01409, "step": 263, "tokens/total": 34471936, "tokens/train_per_sec_per_gpu": 3762.99, "tokens/trainable": 3663593 }, { "epoch": 0.8407643312101911, "grad_norm": 0.228515625, "learning_rate": 4.930892357826373e-05, "loss": 0.014773533679544926, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01488, "step": 264, "tokens/total": 34603008, "tokens/train_per_sec_per_gpu": 3474.74, "tokens/trainable": 3678065 }, { "epoch": 0.8439490445859873, "grad_norm": 0.2138671875, "learning_rate": 4.92958847368367e-05, "loss": 0.01498363260179758, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0151, "step": 265, "tokens/total": 34734080, "tokens/train_per_sec_per_gpu": 3050.93, "tokens/trainable": 3690846 }, { "epoch": 0.8471337579617835, "grad_norm": 0.1884765625, "learning_rate": 4.928272579403969e-05, "loss": 0.009248088113963604, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00929, "step": 266, "tokens/total": 34865152, "tokens/train_per_sec_per_gpu": 3185.95, "tokens/trainable": 3704117 }, { "epoch": 0.8503184713375797, "grad_norm": 0.2138671875, "learning_rate": 4.926944681492106e-05, "loss": 0.012684832327067852, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01277, "step": 267, "tokens/total": 34996224, "tokens/train_per_sec_per_gpu": 3411.13, "tokens/trainable": 3718339 }, { "epoch": 0.8535031847133758, "grad_norm": 0.2099609375, "learning_rate": 4.925604786512251e-05, "loss": 0.0118259247392416, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0119, "step": 268, "tokens/total": 35127296, "tokens/train_per_sec_per_gpu": 3032.33, "tokens/trainable": 3731032 }, { "epoch": 0.856687898089172, "grad_norm": 0.1953125, "learning_rate": 4.924252901087881e-05, "loss": 0.009350091218948364, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00939, "step": 269, "tokens/total": 35258368, "tokens/train_per_sec_per_gpu": 3595.2, "tokens/trainable": 3746006 }, { "epoch": 0.8598726114649682, "grad_norm": 0.2275390625, "learning_rate": 4.922889031901745e-05, "loss": 0.01463128998875618, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01474, "step": 270, "tokens/total": 35389440, "tokens/train_per_sec_per_gpu": 3514.64, "tokens/trainable": 3760731 }, { "epoch": 0.8630573248407644, "grad_norm": 0.185546875, "learning_rate": 4.921513185695831e-05, "loss": 0.009343666024506092, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00939, "step": 271, "tokens/total": 35520512, "tokens/train_per_sec_per_gpu": 3137.39, "tokens/trainable": 3773865 }, { "epoch": 0.8662420382165605, "grad_norm": 0.1806640625, "learning_rate": 4.920125369271332e-05, "loss": 0.011359314434230328, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01142, "step": 272, "tokens/total": 35651584, "tokens/train_per_sec_per_gpu": 3710.71, "tokens/trainable": 3789305 }, { "epoch": 0.8694267515923567, "grad_norm": 0.173828125, "learning_rate": 4.9187255894886134e-05, "loss": 0.011224365793168545, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01129, "step": 273, "tokens/total": 35782656, "tokens/train_per_sec_per_gpu": 3673.45, "tokens/trainable": 3804528 }, { "epoch": 0.8726114649681529, "grad_norm": 0.2353515625, "learning_rate": 4.9173138532671796e-05, "loss": 0.012716785073280334, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0128, "step": 274, "tokens/total": 35913728, "tokens/train_per_sec_per_gpu": 3495.34, "tokens/trainable": 3819131 }, { "epoch": 0.8757961783439491, "grad_norm": 0.193359375, "learning_rate": 4.9158901675856395e-05, "loss": 0.008782695978879929, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00882, "step": 275, "tokens/total": 36044800, "tokens/train_per_sec_per_gpu": 3305.01, "tokens/trainable": 3832973 }, { "epoch": 0.8789808917197452, "grad_norm": 0.169921875, "learning_rate": 4.9144545394816687e-05, "loss": 0.008706534281373024, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00874, "step": 276, "tokens/total": 36175872, "tokens/train_per_sec_per_gpu": 3043.21, "tokens/trainable": 3845728 }, { "epoch": 0.8821656050955414, "grad_norm": 0.27734375, "learning_rate": 4.91300697605198e-05, "loss": 0.01517584826797247, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01529, "step": 277, "tokens/total": 36306944, "tokens/train_per_sec_per_gpu": 3664.41, "tokens/trainable": 3860973 }, { "epoch": 0.8853503184713376, "grad_norm": 0.2099609375, "learning_rate": 4.911547484452286e-05, "loss": 0.009684903547167778, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00973, "step": 278, "tokens/total": 36438016, "tokens/train_per_sec_per_gpu": 3416.95, "tokens/trainable": 3875221 }, { "epoch": 0.8885350318471338, "grad_norm": 0.201171875, "learning_rate": 4.9100760718972624e-05, "loss": 0.011975611560046673, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01205, "step": 279, "tokens/total": 36569088, "tokens/train_per_sec_per_gpu": 3231.7, "tokens/trainable": 3888737 }, { "epoch": 0.89171974522293, "grad_norm": 0.171875, "learning_rate": 4.908592745660514e-05, "loss": 0.009973946958780289, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01002, "step": 280, "tokens/total": 36700160, "tokens/train_per_sec_per_gpu": 3510.18, "tokens/trainable": 3903383 }, { "epoch": 0.8949044585987261, "grad_norm": 0.189453125, "learning_rate": 4.9070975130745387e-05, "loss": 0.009210948832333088, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00925, "step": 281, "tokens/total": 36831232, "tokens/train_per_sec_per_gpu": 3276.53, "tokens/trainable": 3917095 }, { "epoch": 0.8980891719745223, "grad_norm": 0.216796875, "learning_rate": 4.905590381530689e-05, "loss": 0.010272481478750706, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01033, "step": 282, "tokens/total": 36962304, "tokens/train_per_sec_per_gpu": 3515.84, "tokens/trainable": 3931741 }, { "epoch": 0.9012738853503185, "grad_norm": 0.203125, "learning_rate": 4.9040713584791406e-05, "loss": 0.009833472780883312, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00988, "step": 283, "tokens/total": 37093376, "tokens/train_per_sec_per_gpu": 2930.03, "tokens/trainable": 3944068 }, { "epoch": 0.9044585987261147, "grad_norm": 0.173828125, "learning_rate": 4.902540451428849e-05, "loss": 0.008189358748495579, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00822, "step": 284, "tokens/total": 37224448, "tokens/train_per_sec_per_gpu": 3765.41, "tokens/trainable": 3959725 }, { "epoch": 0.9076433121019108, "grad_norm": 0.2216796875, "learning_rate": 4.900997667947518e-05, "loss": 0.013849266804754734, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01395, "step": 285, "tokens/total": 37355520, "tokens/train_per_sec_per_gpu": 3186.67, "tokens/trainable": 3973038 }, { "epoch": 0.910828025477707, "grad_norm": 0.2373046875, "learning_rate": 4.899443015661557e-05, "loss": 0.008526762947440147, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00856, "step": 286, "tokens/total": 37486592, "tokens/train_per_sec_per_gpu": 3056.98, "tokens/trainable": 3985851 }, { "epoch": 0.9140127388535032, "grad_norm": 0.1650390625, "learning_rate": 4.89787650225605e-05, "loss": 0.008836560882627964, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00888, "step": 287, "tokens/total": 37617664, "tokens/train_per_sec_per_gpu": 3316.33, "tokens/trainable": 3999725 }, { "epoch": 0.9171974522292994, "grad_norm": 0.263671875, "learning_rate": 4.896298135474711e-05, "loss": 0.01038228627294302, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01044, "step": 288, "tokens/total": 37748736, "tokens/train_per_sec_per_gpu": 3125.36, "tokens/trainable": 4012867 }, { "epoch": 0.9203821656050956, "grad_norm": 0.21875, "learning_rate": 4.8947079231198504e-05, "loss": 0.012707007117569447, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01279, "step": 289, "tokens/total": 37879808, "tokens/train_per_sec_per_gpu": 3307.2, "tokens/trainable": 4026670 }, { "epoch": 0.9235668789808917, "grad_norm": 0.2060546875, "learning_rate": 4.893105873052333e-05, "loss": 0.010869958437979221, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01093, "step": 290, "tokens/total": 38010880, "tokens/train_per_sec_per_gpu": 3449.15, "tokens/trainable": 4041053 }, { "epoch": 0.9267515923566879, "grad_norm": 0.2216796875, "learning_rate": 4.8914919931915407e-05, "loss": 0.010028751567006111, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01008, "step": 291, "tokens/total": 38141952, "tokens/train_per_sec_per_gpu": 3442.24, "tokens/trainable": 4055450 }, { "epoch": 0.9299363057324841, "grad_norm": 0.220703125, "learning_rate": 4.889866291515336e-05, "loss": 0.012203947640955448, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01228, "step": 292, "tokens/total": 38273024, "tokens/train_per_sec_per_gpu": 2829.0, "tokens/trainable": 4067366 }, { "epoch": 0.9331210191082803, "grad_norm": 0.1884765625, "learning_rate": 4.888228776060016e-05, "loss": 0.010833281092345715, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01089, "step": 293, "tokens/total": 38404096, "tokens/train_per_sec_per_gpu": 3495.99, "tokens/trainable": 4081929 }, { "epoch": 0.9363057324840764, "grad_norm": 0.181640625, "learning_rate": 4.886579454920281e-05, "loss": 0.012121611274778843, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0122, "step": 294, "tokens/total": 38535168, "tokens/train_per_sec_per_gpu": 3777.6, "tokens/trainable": 4097707 }, { "epoch": 0.9394904458598726, "grad_norm": 0.1826171875, "learning_rate": 4.884918336249186e-05, "loss": 0.009699760004878044, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00975, "step": 295, "tokens/total": 38666240, "tokens/train_per_sec_per_gpu": 3588.34, "tokens/trainable": 4112623 }, { "epoch": 0.9426751592356688, "grad_norm": 0.2138671875, "learning_rate": 4.883245428258107e-05, "loss": 0.011465213261544704, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01153, "step": 296, "tokens/total": 38797312, "tokens/train_per_sec_per_gpu": 3411.03, "tokens/trainable": 4126849 }, { "epoch": 0.945859872611465, "grad_norm": 0.1904296875, "learning_rate": 4.881560739216697e-05, "loss": 0.009318836033344269, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00936, "step": 297, "tokens/total": 38928384, "tokens/train_per_sec_per_gpu": 3338.53, "tokens/trainable": 4140757 }, { "epoch": 0.9490445859872612, "grad_norm": 0.2216796875, "learning_rate": 4.879864277452847e-05, "loss": 0.012642276473343372, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01272, "step": 298, "tokens/total": 39059456, "tokens/train_per_sec_per_gpu": 3555.91, "tokens/trainable": 4155522 }, { "epoch": 0.9522292993630573, "grad_norm": 0.20703125, "learning_rate": 4.8781560513526414e-05, "loss": 0.013654773123562336, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01375, "step": 299, "tokens/total": 39190528, "tokens/train_per_sec_per_gpu": 3459.38, "tokens/trainable": 4169921 }, { "epoch": 0.9554140127388535, "grad_norm": 0.1787109375, "learning_rate": 4.876436069360323e-05, "loss": 0.006959032732993364, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00698, "step": 300, "tokens/total": 39321600, "tokens/train_per_sec_per_gpu": 3298.43, "tokens/trainable": 4183671 }, { "epoch": 0.9585987261146497, "grad_norm": 0.2109375, "learning_rate": 4.8747043399782424e-05, "loss": 0.01015427801758051, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01021, "step": 301, "tokens/total": 39452672, "tokens/train_per_sec_per_gpu": 3056.79, "tokens/trainable": 4196527 }, { "epoch": 0.9617834394904459, "grad_norm": 0.189453125, "learning_rate": 4.8729608717668265e-05, "loss": 0.015600456856191158, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01572, "step": 302, "tokens/total": 39583744, "tokens/train_per_sec_per_gpu": 3500.83, "tokens/trainable": 4211124 }, { "epoch": 0.964968152866242, "grad_norm": 0.275390625, "learning_rate": 4.871205673344525e-05, "loss": 0.014728494919836521, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01484, "step": 303, "tokens/total": 39714816, "tokens/train_per_sec_per_gpu": 3241.93, "tokens/trainable": 4224632 }, { "epoch": 0.9681528662420382, "grad_norm": 0.185546875, "learning_rate": 4.869438753387777e-05, "loss": 0.008857826702296734, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0089, "step": 304, "tokens/total": 39845888, "tokens/train_per_sec_per_gpu": 3447.73, "tokens/trainable": 4239052 }, { "epoch": 0.9713375796178344, "grad_norm": 0.1572265625, "learning_rate": 4.867660120630962e-05, "loss": 0.006837591528892517, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00686, "step": 305, "tokens/total": 39976960, "tokens/train_per_sec_per_gpu": 3652.81, "tokens/trainable": 4254227 }, { "epoch": 0.9745222929936306, "grad_norm": 0.21484375, "learning_rate": 4.8658697838663625e-05, "loss": 0.01278127171099186, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01286, "step": 306, "tokens/total": 40108032, "tokens/train_per_sec_per_gpu": 3363.52, "tokens/trainable": 4268312 }, { "epoch": 0.9777070063694268, "grad_norm": 0.19140625, "learning_rate": 4.864067751944113e-05, "loss": 0.010625463910400867, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01068, "step": 307, "tokens/total": 40239104, "tokens/train_per_sec_per_gpu": 3301.6, "tokens/trainable": 4282394 }, { "epoch": 0.9808917197452229, "grad_norm": 0.19140625, "learning_rate": 4.862254033772164e-05, "loss": 0.010408475063741207, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01046, "step": 308, "tokens/total": 40370176, "tokens/train_per_sec_per_gpu": 3139.89, "tokens/trainable": 4295549 }, { "epoch": 0.9840764331210191, "grad_norm": 0.15625, "learning_rate": 4.8604286383162326e-05, "loss": 0.00865277647972107, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00869, "step": 309, "tokens/total": 40501248, "tokens/train_per_sec_per_gpu": 3451.88, "tokens/trainable": 4309931 }, { "epoch": 0.9872611464968153, "grad_norm": 0.173828125, "learning_rate": 4.858591574599759e-05, "loss": 0.010455441661179066, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01051, "step": 310, "tokens/total": 40632320, "tokens/train_per_sec_per_gpu": 3652.99, "tokens/trainable": 4325145 }, { "epoch": 0.9904458598726115, "grad_norm": 0.1806640625, "learning_rate": 4.856742851703866e-05, "loss": 0.009725190699100494, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00977, "step": 311, "tokens/total": 40763392, "tokens/train_per_sec_per_gpu": 3095.9, "tokens/trainable": 4338115 }, { "epoch": 0.9936305732484076, "grad_norm": 0.189453125, "learning_rate": 4.854882478767308e-05, "loss": 0.0067247929982841015, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00675, "step": 312, "tokens/total": 40894464, "tokens/train_per_sec_per_gpu": 3608.14, "tokens/trainable": 4353094 }, { "epoch": 0.9968152866242038, "grad_norm": 0.177734375, "learning_rate": 4.8530104649864306e-05, "loss": 0.008235358633100986, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00827, "step": 313, "tokens/total": 41025536, "tokens/train_per_sec_per_gpu": 3438.93, "tokens/trainable": 4367439 }, { "epoch": 1.0, "grad_norm": 0.31640625, "learning_rate": 4.8511268196151224e-05, "loss": 0.013931503519415855, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 39.25, "memory/max_allocated (GiB)": 39.25, "ppl": 1.01403, "step": 314, "tokens/total": 41099264, "tokens/train_per_sec_per_gpu": 2079.74, "tokens/trainable": 4374676 }, { "epoch": 1.0, "eval_loss": 0.010794572532176971, "eval_ppl": 1.01085, "eval_runtime": 42.176, "eval_samples_per_second": 64.041, "eval_steps_per_second": 4.007, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 314 }, { "epoch": 1.0031847133757963, "grad_norm": 0.19921875, "learning_rate": 4.849231551964771e-05, "loss": 0.01005562860518694, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01011, "step": 315, "tokens/total": 41230336, "tokens/train_per_sec_per_gpu": 3300.4, "tokens/trainable": 4388312 }, { "epoch": 1.0063694267515924, "grad_norm": 0.1962890625, "learning_rate": 4.8473246714042155e-05, "loss": 0.009829830378293991, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00988, "step": 316, "tokens/total": 41361408, "tokens/train_per_sec_per_gpu": 2786.13, "tokens/trainable": 4400052 }, { "epoch": 1.0095541401273886, "grad_norm": 0.2119140625, "learning_rate": 4.845406187359701e-05, "loss": 0.009766732342541218, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00981, "step": 317, "tokens/total": 41492480, "tokens/train_per_sec_per_gpu": 3444.53, "tokens/trainable": 4414268 }, { "epoch": 1.0127388535031847, "grad_norm": 0.17578125, "learning_rate": 4.843476109314833e-05, "loss": 0.009223168715834618, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00927, "step": 318, "tokens/total": 41623552, "tokens/train_per_sec_per_gpu": 3515.7, "tokens/trainable": 4428804 }, { "epoch": 1.015923566878981, "grad_norm": 0.1611328125, "learning_rate": 4.841534446810527e-05, "loss": 0.008030703291296959, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00806, "step": 319, "tokens/total": 41754624, "tokens/train_per_sec_per_gpu": 3297.15, "tokens/trainable": 4442458 }, { "epoch": 1.019108280254777, "grad_norm": 0.1669921875, "learning_rate": 4.839581209444966e-05, "loss": 0.008971852250397205, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00901, "step": 320, "tokens/total": 41885696, "tokens/train_per_sec_per_gpu": 3348.3, "tokens/trainable": 4456319 }, { "epoch": 1.0222929936305734, "grad_norm": 0.189453125, "learning_rate": 4.8376164068735485e-05, "loss": 0.011034002527594566, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0111, "step": 321, "tokens/total": 42016768, "tokens/train_per_sec_per_gpu": 3467.0, "tokens/trainable": 4470692 }, { "epoch": 1.0254777070063694, "grad_norm": 0.21484375, "learning_rate": 4.835640048808847e-05, "loss": 0.008709516376256943, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00875, "step": 322, "tokens/total": 42147840, "tokens/train_per_sec_per_gpu": 3335.02, "tokens/trainable": 4484563 }, { "epoch": 1.0286624203821657, "grad_norm": 0.166015625, "learning_rate": 4.833652145020551e-05, "loss": 0.006180301308631897, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0062, "step": 323, "tokens/total": 42278912, "tokens/train_per_sec_per_gpu": 3293.93, "tokens/trainable": 4498340 }, { "epoch": 1.0318471337579618, "grad_norm": 0.15234375, "learning_rate": 4.831652705335428e-05, "loss": 0.007071372587233782, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0071, "step": 324, "tokens/total": 42409984, "tokens/train_per_sec_per_gpu": 3496.34, "tokens/trainable": 4512959 }, { "epoch": 1.035031847133758, "grad_norm": 0.2216796875, "learning_rate": 4.829641739637269e-05, "loss": 0.010390223003923893, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01044, "step": 325, "tokens/total": 42541056, "tokens/train_per_sec_per_gpu": 3109.54, "tokens/trainable": 4525947 }, { "epoch": 1.0382165605095541, "grad_norm": 0.19140625, "learning_rate": 4.827619257866839e-05, "loss": 0.010280653834342957, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01033, "step": 326, "tokens/total": 42672128, "tokens/train_per_sec_per_gpu": 3494.82, "tokens/trainable": 4540559 }, { "epoch": 1.0414012738853504, "grad_norm": 0.291015625, "learning_rate": 4.825585270021835e-05, "loss": 0.009634558111429214, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00968, "step": 327, "tokens/total": 42803200, "tokens/train_per_sec_per_gpu": 3081.6, "tokens/trainable": 4553474 }, { "epoch": 1.0445859872611465, "grad_norm": 0.21875, "learning_rate": 4.823539786156828e-05, "loss": 0.012012935243546963, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01209, "step": 328, "tokens/total": 42934272, "tokens/train_per_sec_per_gpu": 3405.54, "tokens/trainable": 4567721 }, { "epoch": 1.0477707006369428, "grad_norm": 0.1552734375, "learning_rate": 4.821482816383218e-05, "loss": 0.005780364852398634, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0058, "step": 329, "tokens/total": 43065344, "tokens/train_per_sec_per_gpu": 3703.56, "tokens/trainable": 4583144 }, { "epoch": 1.0509554140127388, "grad_norm": 0.1787109375, "learning_rate": 4.8194143708691844e-05, "loss": 0.010735648684203625, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01079, "step": 330, "tokens/total": 43196416, "tokens/train_per_sec_per_gpu": 3454.77, "tokens/trainable": 4597528 }, { "epoch": 1.0541401273885351, "grad_norm": 0.2119140625, "learning_rate": 4.817334459839633e-05, "loss": 0.009996584616601467, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01005, "step": 331, "tokens/total": 43327488, "tokens/train_per_sec_per_gpu": 3088.63, "tokens/trainable": 4610506 }, { "epoch": 1.0573248407643312, "grad_norm": 0.1513671875, "learning_rate": 4.8152430935761456e-05, "loss": 0.007421544287353754, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00745, "step": 332, "tokens/total": 43458560, "tokens/train_per_sec_per_gpu": 3395.75, "tokens/trainable": 4624715 }, { "epoch": 1.0605095541401275, "grad_norm": 0.12255859375, "learning_rate": 4.8131402824169336e-05, "loss": 0.004339924082159996, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00435, "step": 333, "tokens/total": 43589632, "tokens/train_per_sec_per_gpu": 2923.1, "tokens/trainable": 4636991 }, { "epoch": 1.0636942675159236, "grad_norm": 0.2109375, "learning_rate": 4.8110260367567816e-05, "loss": 0.007030356675386429, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00706, "step": 334, "tokens/total": 43720704, "tokens/train_per_sec_per_gpu": 3278.5, "tokens/trainable": 4650745 }, { "epoch": 1.0668789808917198, "grad_norm": 0.2373046875, "learning_rate": 4.808900367046999e-05, "loss": 0.00917564332485199, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00922, "step": 335, "tokens/total": 43851776, "tokens/train_per_sec_per_gpu": 3402.45, "tokens/trainable": 4664997 }, { "epoch": 1.070063694267516, "grad_norm": 0.158203125, "learning_rate": 4.806763283795366e-05, "loss": 0.0065734670497477055, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0066, "step": 336, "tokens/total": 43982848, "tokens/train_per_sec_per_gpu": 2932.03, "tokens/trainable": 4677280 }, { "epoch": 1.0732484076433122, "grad_norm": 0.154296875, "learning_rate": 4.804614797566086e-05, "loss": 0.00853950995951891, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00858, "step": 337, "tokens/total": 44113920, "tokens/train_per_sec_per_gpu": 3499.45, "tokens/trainable": 4691898 }, { "epoch": 1.0764331210191083, "grad_norm": 0.271484375, "learning_rate": 4.8024549189797276e-05, "loss": 0.012293344363570213, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01237, "step": 338, "tokens/total": 44244992, "tokens/train_per_sec_per_gpu": 3312.19, "tokens/trainable": 4705870 }, { "epoch": 1.0796178343949046, "grad_norm": 0.1728515625, "learning_rate": 4.800283658713177e-05, "loss": 0.010073346085846424, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01012, "step": 339, "tokens/total": 44376064, "tokens/train_per_sec_per_gpu": 3473.54, "tokens/trainable": 4720409 }, { "epoch": 1.0828025477707006, "grad_norm": 0.1962890625, "learning_rate": 4.798101027499581e-05, "loss": 0.010279987938702106, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01033, "step": 340, "tokens/total": 44507136, "tokens/train_per_sec_per_gpu": 3370.76, "tokens/trainable": 4734524 }, { "epoch": 1.085987261146497, "grad_norm": 0.2041015625, "learning_rate": 4.795907036128299e-05, "loss": 0.009196259081363678, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00924, "step": 341, "tokens/total": 44638208, "tokens/train_per_sec_per_gpu": 3347.17, "tokens/trainable": 4748535 }, { "epoch": 1.089171974522293, "grad_norm": 0.2080078125, "learning_rate": 4.793701695444846e-05, "loss": 0.009703228250145912, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00975, "step": 342, "tokens/total": 44769280, "tokens/train_per_sec_per_gpu": 3220.71, "tokens/trainable": 4762018 }, { "epoch": 1.0923566878980893, "grad_norm": 0.18359375, "learning_rate": 4.791485016350837e-05, "loss": 0.010180710814893246, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01023, "step": 343, "tokens/total": 44900352, "tokens/train_per_sec_per_gpu": 3726.69, "tokens/trainable": 4777568 }, { "epoch": 1.0955414012738853, "grad_norm": 0.1826171875, "learning_rate": 4.78925700980394e-05, "loss": 0.007739739958196878, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00777, "step": 344, "tokens/total": 45031424, "tokens/train_per_sec_per_gpu": 3151.58, "tokens/trainable": 4790766 }, { "epoch": 1.0987261146496816, "grad_norm": 0.265625, "learning_rate": 4.787017686817816e-05, "loss": 0.013002859428524971, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01309, "step": 345, "tokens/total": 45162496, "tokens/train_per_sec_per_gpu": 3615.54, "tokens/trainable": 4805850 }, { "epoch": 1.1019108280254777, "grad_norm": 0.1669921875, "learning_rate": 4.7847670584620653e-05, "loss": 0.008513463661074638, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00855, "step": 346, "tokens/total": 45293568, "tokens/train_per_sec_per_gpu": 3554.41, "tokens/trainable": 4820707 }, { "epoch": 1.105095541401274, "grad_norm": 0.2041015625, "learning_rate": 4.782505135862176e-05, "loss": 0.012663084082305431, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01274, "step": 347, "tokens/total": 45424640, "tokens/train_per_sec_per_gpu": 3406.8, "tokens/trainable": 4834965 }, { "epoch": 1.10828025477707, "grad_norm": 0.1650390625, "learning_rate": 4.780231930199465e-05, "loss": 0.006982079707086086, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00701, "step": 348, "tokens/total": 45555712, "tokens/train_per_sec_per_gpu": 3420.63, "tokens/trainable": 4849306 }, { "epoch": 1.1114649681528663, "grad_norm": 0.150390625, "learning_rate": 4.777947452711026e-05, "loss": 0.007746942341327667, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00778, "step": 349, "tokens/total": 45686784, "tokens/train_per_sec_per_gpu": 3182.73, "tokens/trainable": 4862654 }, { "epoch": 1.1146496815286624, "grad_norm": 0.2021484375, "learning_rate": 4.77565171468967e-05, "loss": 0.008427651599049568, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00846, "step": 350, "tokens/total": 45817856, "tokens/train_per_sec_per_gpu": 3011.28, "tokens/trainable": 4875396 }, { "epoch": 1.1178343949044587, "grad_norm": 0.150390625, "learning_rate": 4.773344727483876e-05, "loss": 0.007029036991298199, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00705, "step": 351, "tokens/total": 45948928, "tokens/train_per_sec_per_gpu": 2910.12, "tokens/trainable": 4887648 }, { "epoch": 1.1210191082802548, "grad_norm": 0.203125, "learning_rate": 4.771026502497726e-05, "loss": 0.009960726834833622, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01001, "step": 352, "tokens/total": 46080000, "tokens/train_per_sec_per_gpu": 3171.62, "tokens/trainable": 4900946 }, { "epoch": 1.124203821656051, "grad_norm": 0.2109375, "learning_rate": 4.7686970511908594e-05, "loss": 0.010911881923675537, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01097, "step": 353, "tokens/total": 46211072, "tokens/train_per_sec_per_gpu": 3471.86, "tokens/trainable": 4915383 }, { "epoch": 1.127388535031847, "grad_norm": 0.19921875, "learning_rate": 4.766356385078403e-05, "loss": 0.01082072127610445, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01088, "step": 354, "tokens/total": 46342144, "tokens/train_per_sec_per_gpu": 3528.47, "tokens/trainable": 4930118 }, { "epoch": 1.1305732484076434, "grad_norm": 0.189453125, "learning_rate": 4.7640045157309286e-05, "loss": 0.00796705111861229, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.008, "step": 355, "tokens/total": 46473216, "tokens/train_per_sec_per_gpu": 3675.06, "tokens/trainable": 4945407 }, { "epoch": 1.1337579617834395, "grad_norm": 0.1650390625, "learning_rate": 4.761641454774386e-05, "loss": 0.009853512980043888, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0099, "step": 356, "tokens/total": 46604288, "tokens/train_per_sec_per_gpu": 3426.1, "tokens/trainable": 4959713 }, { "epoch": 1.1369426751592357, "grad_norm": 0.1728515625, "learning_rate": 4.759267213890046e-05, "loss": 0.008251532912254333, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00829, "step": 357, "tokens/total": 46735360, "tokens/train_per_sec_per_gpu": 3370.12, "tokens/trainable": 4973803 }, { "epoch": 1.1401273885350318, "grad_norm": 0.171875, "learning_rate": 4.756881804814448e-05, "loss": 0.007583227939903736, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00761, "step": 358, "tokens/total": 46866432, "tokens/train_per_sec_per_gpu": 3085.13, "tokens/trainable": 4986783 }, { "epoch": 1.143312101910828, "grad_norm": 0.1171875, "learning_rate": 4.7544852393393375e-05, "loss": 0.005565401166677475, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00558, "step": 359, "tokens/total": 46997504, "tokens/train_per_sec_per_gpu": 3283.64, "tokens/trainable": 5000464 }, { "epoch": 1.1464968152866242, "grad_norm": 0.158203125, "learning_rate": 4.7520775293116096e-05, "loss": 0.007274336647242308, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0073, "step": 360, "tokens/total": 47128576, "tokens/train_per_sec_per_gpu": 3219.09, "tokens/trainable": 5013941 }, { "epoch": 1.1496815286624205, "grad_norm": 0.173828125, "learning_rate": 4.749658686633251e-05, "loss": 0.007295841351151466, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00732, "step": 361, "tokens/total": 47259648, "tokens/train_per_sec_per_gpu": 3222.39, "tokens/trainable": 5027460 }, { "epoch": 1.1528662420382165, "grad_norm": 0.126953125, "learning_rate": 4.747228723261278e-05, "loss": 0.004342417698353529, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00435, "step": 362, "tokens/total": 47390720, "tokens/train_per_sec_per_gpu": 3121.81, "tokens/trainable": 5040541 }, { "epoch": 1.1560509554140128, "grad_norm": 0.197265625, "learning_rate": 4.7447876512076815e-05, "loss": 0.00851562898606062, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00855, "step": 363, "tokens/total": 47521792, "tokens/train_per_sec_per_gpu": 3480.8, "tokens/trainable": 5055042 }, { "epoch": 1.1592356687898089, "grad_norm": 0.1923828125, "learning_rate": 4.7423354825393646e-05, "loss": 0.011735991574823856, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01181, "step": 364, "tokens/total": 47652864, "tokens/train_per_sec_per_gpu": 3454.35, "tokens/trainable": 5069432 }, { "epoch": 1.1624203821656052, "grad_norm": 0.203125, "learning_rate": 4.739872229378085e-05, "loss": 0.009628934785723686, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00968, "step": 365, "tokens/total": 47783936, "tokens/train_per_sec_per_gpu": 3056.5, "tokens/trainable": 5082238 }, { "epoch": 1.1656050955414012, "grad_norm": 0.181640625, "learning_rate": 4.737397903900393e-05, "loss": 0.008178248070180416, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00821, "step": 366, "tokens/total": 47915008, "tokens/train_per_sec_per_gpu": 3187.45, "tokens/trainable": 5095582 }, { "epoch": 1.1687898089171975, "grad_norm": 0.2109375, "learning_rate": 4.734912518337574e-05, "loss": 0.010145166888833046, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0102, "step": 367, "tokens/total": 48046080, "tokens/train_per_sec_per_gpu": 3535.81, "tokens/trainable": 5110321 }, { "epoch": 1.1719745222929936, "grad_norm": 0.158203125, "learning_rate": 4.732416084975585e-05, "loss": 0.008553897961974144, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00859, "step": 368, "tokens/total": 48177152, "tokens/train_per_sec_per_gpu": 3223.93, "tokens/trainable": 5123813 }, { "epoch": 1.1751592356687899, "grad_norm": 0.146484375, "learning_rate": 4.729908616154996e-05, "loss": 0.007267483975738287, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00729, "step": 369, "tokens/total": 48308224, "tokens/train_per_sec_per_gpu": 3596.57, "tokens/trainable": 5138875 }, { "epoch": 1.178343949044586, "grad_norm": 0.20703125, "learning_rate": 4.727390124270929e-05, "loss": 0.010045611299574375, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0101, "step": 370, "tokens/total": 48439296, "tokens/train_per_sec_per_gpu": 3361.05, "tokens/trainable": 5152957 }, { "epoch": 1.1815286624203822, "grad_norm": 0.166015625, "learning_rate": 4.724860621772995e-05, "loss": 0.006381361745297909, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0064, "step": 371, "tokens/total": 48570368, "tokens/train_per_sec_per_gpu": 3270.56, "tokens/trainable": 5166655 }, { "epoch": 1.1847133757961783, "grad_norm": 0.1259765625, "learning_rate": 4.7223201211652346e-05, "loss": 0.0061474088579416275, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00617, "step": 372, "tokens/total": 48701440, "tokens/train_per_sec_per_gpu": 3413.31, "tokens/trainable": 5180889 }, { "epoch": 1.1878980891719746, "grad_norm": 0.205078125, "learning_rate": 4.7197686350060535e-05, "loss": 0.013294153846800327, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01338, "step": 373, "tokens/total": 48832512, "tokens/train_per_sec_per_gpu": 3307.53, "tokens/trainable": 5194736 }, { "epoch": 1.1910828025477707, "grad_norm": 0.1669921875, "learning_rate": 4.717206175908164e-05, "loss": 0.009227165952324867, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00927, "step": 374, "tokens/total": 48963584, "tokens/train_per_sec_per_gpu": 3407.27, "tokens/trainable": 5208974 }, { "epoch": 1.194267515923567, "grad_norm": 0.2421875, "learning_rate": 4.7146327565385195e-05, "loss": 0.009992158971726894, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01004, "step": 375, "tokens/total": 49094656, "tokens/train_per_sec_per_gpu": 3078.36, "tokens/trainable": 5221898 }, { "epoch": 1.197452229299363, "grad_norm": 0.1630859375, "learning_rate": 4.712048389618254e-05, "loss": 0.0076246620155870914, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00765, "step": 376, "tokens/total": 49225728, "tokens/train_per_sec_per_gpu": 3454.62, "tokens/trainable": 5236300 }, { "epoch": 1.2006369426751593, "grad_norm": 0.2119140625, "learning_rate": 4.7094530879226166e-05, "loss": 0.010849738493561745, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01091, "step": 377, "tokens/total": 49356800, "tokens/train_per_sec_per_gpu": 3211.73, "tokens/trainable": 5249796 }, { "epoch": 1.2038216560509554, "grad_norm": 0.1669921875, "learning_rate": 4.706846864280913e-05, "loss": 0.00665281992405653, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00667, "step": 378, "tokens/total": 49487872, "tokens/train_per_sec_per_gpu": 3615.73, "tokens/trainable": 5264940 }, { "epoch": 1.2070063694267517, "grad_norm": 0.1689453125, "learning_rate": 4.704229731576435e-05, "loss": 0.009321301244199276, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00936, "step": 379, "tokens/total": 49618944, "tokens/train_per_sec_per_gpu": 3521.87, "tokens/trainable": 5279679 }, { "epoch": 1.2101910828025477, "grad_norm": 0.1728515625, "learning_rate": 4.701601702746405e-05, "loss": 0.009726524353027344, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00977, "step": 380, "tokens/total": 49750016, "tokens/train_per_sec_per_gpu": 3758.86, "tokens/trainable": 5295322 }, { "epoch": 1.213375796178344, "grad_norm": 0.138671875, "learning_rate": 4.698962790781906e-05, "loss": 0.00720211723819375, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00723, "step": 381, "tokens/total": 49881088, "tokens/train_per_sec_per_gpu": 3392.46, "tokens/trainable": 5309524 }, { "epoch": 1.21656050955414, "grad_norm": 0.1943359375, "learning_rate": 4.696313008727819e-05, "loss": 0.009434825740754604, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00948, "step": 382, "tokens/total": 50012160, "tokens/train_per_sec_per_gpu": 3237.9, "tokens/trainable": 5323073 }, { "epoch": 1.2197452229299364, "grad_norm": 0.203125, "learning_rate": 4.6936523696827615e-05, "loss": 0.013360480777919292, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01345, "step": 383, "tokens/total": 50143232, "tokens/train_per_sec_per_gpu": 3386.03, "tokens/trainable": 5337238 }, { "epoch": 1.2229299363057324, "grad_norm": 0.1748046875, "learning_rate": 4.690980886799016e-05, "loss": 0.009163031354546547, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00921, "step": 384, "tokens/total": 50274304, "tokens/train_per_sec_per_gpu": 3800.38, "tokens/trainable": 5353034 }, { "epoch": 1.2261146496815287, "grad_norm": 0.142578125, "learning_rate": 4.688298573282473e-05, "loss": 0.006065514404326677, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00608, "step": 385, "tokens/total": 50405376, "tokens/train_per_sec_per_gpu": 3325.28, "tokens/trainable": 5366994 }, { "epoch": 1.2292993630573248, "grad_norm": 0.1611328125, "learning_rate": 4.685605442392559e-05, "loss": 0.007522703614085913, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00755, "step": 386, "tokens/total": 50536448, "tokens/train_per_sec_per_gpu": 3244.1, "tokens/trainable": 5380585 }, { "epoch": 1.232484076433121, "grad_norm": 0.158203125, "learning_rate": 4.6829015074421754e-05, "loss": 0.008297629654407501, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00833, "step": 387, "tokens/total": 50667520, "tokens/train_per_sec_per_gpu": 3675.24, "tokens/trainable": 5395883 }, { "epoch": 1.2356687898089171, "grad_norm": 0.1923828125, "learning_rate": 4.680186781797632e-05, "loss": 0.008283684030175209, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00832, "step": 388, "tokens/total": 50798592, "tokens/train_per_sec_per_gpu": 3323.41, "tokens/trainable": 5409819 }, { "epoch": 1.2388535031847134, "grad_norm": 0.1669921875, "learning_rate": 4.677461278878577e-05, "loss": 0.009029434062540531, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00907, "step": 389, "tokens/total": 50929664, "tokens/train_per_sec_per_gpu": 2967.79, "tokens/trainable": 5422282 }, { "epoch": 1.2420382165605095, "grad_norm": 0.1298828125, "learning_rate": 4.674725012157936e-05, "loss": 0.0059669832699000835, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00598, "step": 390, "tokens/total": 51060736, "tokens/train_per_sec_per_gpu": 3230.78, "tokens/trainable": 5435820 }, { "epoch": 1.2452229299363058, "grad_norm": 0.14453125, "learning_rate": 4.671977995161843e-05, "loss": 0.005600204225629568, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00562, "step": 391, "tokens/total": 51191808, "tokens/train_per_sec_per_gpu": 3398.04, "tokens/trainable": 5450055 }, { "epoch": 1.2484076433121019, "grad_norm": 0.166015625, "learning_rate": 4.669220241469573e-05, "loss": 0.007735088467597961, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00777, "step": 392, "tokens/total": 51322880, "tokens/train_per_sec_per_gpu": 3315.69, "tokens/trainable": 5463943 }, { "epoch": 1.2515923566878981, "grad_norm": 0.2138671875, "learning_rate": 4.666451764713475e-05, "loss": 0.010222709737718105, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01028, "step": 393, "tokens/total": 51453952, "tokens/train_per_sec_per_gpu": 3438.45, "tokens/trainable": 5478266 }, { "epoch": 1.2547770700636942, "grad_norm": 0.154296875, "learning_rate": 4.663672578578908e-05, "loss": 0.007789981085807085, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00782, "step": 394, "tokens/total": 51585024, "tokens/train_per_sec_per_gpu": 3144.21, "tokens/trainable": 5491440 }, { "epoch": 1.2579617834394905, "grad_norm": 0.1982421875, "learning_rate": 4.660882696804165e-05, "loss": 0.01257528830319643, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01265, "step": 395, "tokens/total": 51716096, "tokens/train_per_sec_per_gpu": 3704.11, "tokens/trainable": 5506947 }, { "epoch": 1.2611464968152866, "grad_norm": 0.1669921875, "learning_rate": 4.658082133180416e-05, "loss": 0.007808534894138575, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00784, "step": 396, "tokens/total": 51847168, "tokens/train_per_sec_per_gpu": 3138.89, "tokens/trainable": 5520102 }, { "epoch": 1.2643312101910829, "grad_norm": 0.1787109375, "learning_rate": 4.655270901551632e-05, "loss": 0.008749695494771004, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00879, "step": 397, "tokens/total": 51978240, "tokens/train_per_sec_per_gpu": 3068.52, "tokens/trainable": 5532992 }, { "epoch": 1.267515923566879, "grad_norm": 0.193359375, "learning_rate": 4.652449015814518e-05, "loss": 0.010582723654806614, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01064, "step": 398, "tokens/total": 52109312, "tokens/train_per_sec_per_gpu": 3477.89, "tokens/trainable": 5547568 }, { "epoch": 1.2707006369426752, "grad_norm": 0.177734375, "learning_rate": 4.649616489918448e-05, "loss": 0.007580795791000128, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00761, "step": 399, "tokens/total": 52240384, "tokens/train_per_sec_per_gpu": 3136.71, "tokens/trainable": 5560738 }, { "epoch": 1.2738853503184713, "grad_norm": 0.177734375, "learning_rate": 4.646773337865391e-05, "loss": 0.00638965331017971, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00641, "step": 400, "tokens/total": 52371456, "tokens/train_per_sec_per_gpu": 3189.44, "tokens/trainable": 5574146 }, { "epoch": 1.2770700636942676, "grad_norm": 0.185546875, "learning_rate": 4.643919573709843e-05, "loss": 0.007701355963945389, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00773, "step": 401, "tokens/total": 52502528, "tokens/train_per_sec_per_gpu": 3217.16, "tokens/trainable": 5587632 }, { "epoch": 1.2802547770700636, "grad_norm": 0.1962890625, "learning_rate": 4.641055211558762e-05, "loss": 0.009735530242323875, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00978, "step": 402, "tokens/total": 52633600, "tokens/train_per_sec_per_gpu": 3104.94, "tokens/trainable": 5600617 }, { "epoch": 1.28343949044586, "grad_norm": 0.193359375, "learning_rate": 4.6381802655714946e-05, "loss": 0.009511996060609818, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00956, "step": 403, "tokens/total": 52764672, "tokens/train_per_sec_per_gpu": 3181.89, "tokens/trainable": 5613940 }, { "epoch": 1.286624203821656, "grad_norm": 0.1669921875, "learning_rate": 4.6352947499597024e-05, "loss": 0.008532877080142498, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00857, "step": 404, "tokens/total": 52895744, "tokens/train_per_sec_per_gpu": 3220.05, "tokens/trainable": 5627419 }, { "epoch": 1.2898089171974523, "grad_norm": 0.1787109375, "learning_rate": 4.632398678987298e-05, "loss": 0.007435362320393324, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00746, "step": 405, "tokens/total": 53026816, "tokens/train_per_sec_per_gpu": 3293.31, "tokens/trainable": 5641255 }, { "epoch": 1.2929936305732483, "grad_norm": 0.185546875, "learning_rate": 4.629492066970373e-05, "loss": 0.009640632197260857, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00969, "step": 406, "tokens/total": 53157888, "tokens/train_per_sec_per_gpu": 3502.03, "tokens/trainable": 5655889 }, { "epoch": 1.2961783439490446, "grad_norm": 0.1865234375, "learning_rate": 4.626574928277127e-05, "loss": 0.00989444274455309, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00994, "step": 407, "tokens/total": 53288960, "tokens/train_per_sec_per_gpu": 3544.59, "tokens/trainable": 5670642 }, { "epoch": 1.2993630573248407, "grad_norm": 0.23828125, "learning_rate": 4.623647277327792e-05, "loss": 0.009198141284286976, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00924, "step": 408, "tokens/total": 53420032, "tokens/train_per_sec_per_gpu": 3306.22, "tokens/trainable": 5684524 }, { "epoch": 1.302547770700637, "grad_norm": 0.216796875, "learning_rate": 4.6207091285945694e-05, "loss": 0.010384837165474892, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01044, "step": 409, "tokens/total": 53551104, "tokens/train_per_sec_per_gpu": 3444.91, "tokens/trainable": 5698889 }, { "epoch": 1.305732484076433, "grad_norm": 0.1640625, "learning_rate": 4.61776049660155e-05, "loss": 0.0068597206845879555, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00688, "step": 410, "tokens/total": 53682176, "tokens/train_per_sec_per_gpu": 3083.21, "tokens/trainable": 5711820 }, { "epoch": 1.3089171974522293, "grad_norm": 0.125, "learning_rate": 4.614801395924649e-05, "loss": 0.005090971477329731, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0051, "step": 411, "tokens/total": 53813248, "tokens/train_per_sec_per_gpu": 3042.48, "tokens/trainable": 5724613 }, { "epoch": 1.3121019108280254, "grad_norm": 0.142578125, "learning_rate": 4.611831841191533e-05, "loss": 0.005095964763313532, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00511, "step": 412, "tokens/total": 53944320, "tokens/train_per_sec_per_gpu": 3189.46, "tokens/trainable": 5737985 }, { "epoch": 1.3152866242038217, "grad_norm": 0.177734375, "learning_rate": 4.608851847081542e-05, "loss": 0.009599323384463787, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00965, "step": 413, "tokens/total": 54075392, "tokens/train_per_sec_per_gpu": 3425.1, "tokens/trainable": 5752257 }, { "epoch": 1.3184713375796178, "grad_norm": 0.1533203125, "learning_rate": 4.6058614283256205e-05, "loss": 0.007107466459274292, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00713, "step": 414, "tokens/total": 54206464, "tokens/train_per_sec_per_gpu": 3284.05, "tokens/trainable": 5766000 }, { "epoch": 1.321656050955414, "grad_norm": 0.2021484375, "learning_rate": 4.60286059970625e-05, "loss": 0.009428326040506363, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00947, "step": 415, "tokens/total": 54337536, "tokens/train_per_sec_per_gpu": 3375.64, "tokens/trainable": 5780141 }, { "epoch": 1.3248407643312101, "grad_norm": 0.1484375, "learning_rate": 4.599849376057366e-05, "loss": 0.006207283120602369, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00623, "step": 416, "tokens/total": 54468608, "tokens/train_per_sec_per_gpu": 3150.96, "tokens/trainable": 5793324 }, { "epoch": 1.3280254777070064, "grad_norm": 0.193359375, "learning_rate": 4.5968277722642915e-05, "loss": 0.011342452839016914, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01141, "step": 417, "tokens/total": 54599680, "tokens/train_per_sec_per_gpu": 3068.96, "tokens/trainable": 5806288 }, { "epoch": 1.3312101910828025, "grad_norm": 0.2197265625, "learning_rate": 4.593795803263661e-05, "loss": 0.0096285380423069, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00968, "step": 418, "tokens/total": 54730752, "tokens/train_per_sec_per_gpu": 3414.96, "tokens/trainable": 5820535 }, { "epoch": 1.3343949044585988, "grad_norm": 0.1787109375, "learning_rate": 4.590753484043348e-05, "loss": 0.008351242169737816, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00839, "step": 419, "tokens/total": 54861824, "tokens/train_per_sec_per_gpu": 3383.61, "tokens/trainable": 5834705 }, { "epoch": 1.3375796178343948, "grad_norm": 0.20703125, "learning_rate": 4.5877008296423886e-05, "loss": 0.010140678845345974, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01019, "step": 420, "tokens/total": 54992896, "tokens/train_per_sec_per_gpu": 3557.31, "tokens/trainable": 5849593 }, { "epoch": 1.3407643312101911, "grad_norm": 0.1005859375, "learning_rate": 4.5846378551509097e-05, "loss": 0.003956064116209745, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00396, "step": 421, "tokens/total": 55123968, "tokens/train_per_sec_per_gpu": 3127.02, "tokens/trainable": 5862715 }, { "epoch": 1.3439490445859872, "grad_norm": 0.19921875, "learning_rate": 4.581564575710053e-05, "loss": 0.011450878344476223, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01152, "step": 422, "tokens/total": 55255040, "tokens/train_per_sec_per_gpu": 3045.36, "tokens/trainable": 5875602 }, { "epoch": 1.3471337579617835, "grad_norm": 0.1689453125, "learning_rate": 4.5784810065119e-05, "loss": 0.008104214444756508, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00814, "step": 423, "tokens/total": 55386112, "tokens/train_per_sec_per_gpu": 3284.72, "tokens/trainable": 5889428 }, { "epoch": 1.3503184713375795, "grad_norm": 0.14453125, "learning_rate": 4.575387162799399e-05, "loss": 0.006891798693686724, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00692, "step": 424, "tokens/total": 55517184, "tokens/train_per_sec_per_gpu": 3722.92, "tokens/trainable": 5904973 }, { "epoch": 1.3535031847133758, "grad_norm": 0.1669921875, "learning_rate": 4.5722830598662854e-05, "loss": 0.009776144288480282, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00982, "step": 425, "tokens/total": 55648256, "tokens/train_per_sec_per_gpu": 3412.77, "tokens/trainable": 5919245 }, { "epoch": 1.356687898089172, "grad_norm": 0.166015625, "learning_rate": 4.56916871305701e-05, "loss": 0.007931388914585114, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00796, "step": 426, "tokens/total": 55779328, "tokens/train_per_sec_per_gpu": 3407.46, "tokens/trainable": 5933535 }, { "epoch": 1.3598726114649682, "grad_norm": 0.1904296875, "learning_rate": 4.5660441377666654e-05, "loss": 0.008083492517471313, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00812, "step": 427, "tokens/total": 55910400, "tokens/train_per_sec_per_gpu": 3599.03, "tokens/trainable": 5948481 }, { "epoch": 1.3630573248407643, "grad_norm": 0.1484375, "learning_rate": 4.562909349440899e-05, "loss": 0.006925994995981455, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00695, "step": 428, "tokens/total": 56041472, "tokens/train_per_sec_per_gpu": 3517.79, "tokens/trainable": 5963175 }, { "epoch": 1.3662420382165605, "grad_norm": 0.1484375, "learning_rate": 4.559764363575851e-05, "loss": 0.008385020308196545, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00842, "step": 429, "tokens/total": 56172544, "tokens/train_per_sec_per_gpu": 3404.08, "tokens/trainable": 5977423 }, { "epoch": 1.3694267515923566, "grad_norm": 0.1669921875, "learning_rate": 4.556609195718068e-05, "loss": 0.005221434403210878, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00524, "step": 430, "tokens/total": 56303616, "tokens/train_per_sec_per_gpu": 3212.05, "tokens/trainable": 5990835 }, { "epoch": 1.372611464968153, "grad_norm": 0.193359375, "learning_rate": 4.5534438614644294e-05, "loss": 0.009253652766346931, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0093, "step": 431, "tokens/total": 56434688, "tokens/train_per_sec_per_gpu": 3584.45, "tokens/trainable": 6005749 }, { "epoch": 1.3757961783439492, "grad_norm": 0.2021484375, "learning_rate": 4.550268376462068e-05, "loss": 0.009988540783524513, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01004, "step": 432, "tokens/total": 56565760, "tokens/train_per_sec_per_gpu": 3148.92, "tokens/trainable": 6018952 }, { "epoch": 1.3789808917197452, "grad_norm": 0.166015625, "learning_rate": 4.547082756408299e-05, "loss": 0.007521233521401882, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00755, "step": 433, "tokens/total": 56696832, "tokens/train_per_sec_per_gpu": 3309.63, "tokens/trainable": 6032837 }, { "epoch": 1.3821656050955413, "grad_norm": 0.1337890625, "learning_rate": 4.543887017050534e-05, "loss": 0.005825295113027096, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00584, "step": 434, "tokens/total": 56827904, "tokens/train_per_sec_per_gpu": 3375.36, "tokens/trainable": 6046929 }, { "epoch": 1.3853503184713376, "grad_norm": 0.2265625, "learning_rate": 4.540681174186209e-05, "loss": 0.011601070873439312, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01167, "step": 435, "tokens/total": 56958976, "tokens/train_per_sec_per_gpu": 3215.41, "tokens/trainable": 6060425 }, { "epoch": 1.388535031847134, "grad_norm": 0.1884765625, "learning_rate": 4.537465243662704e-05, "loss": 0.008219108916819096, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00825, "step": 436, "tokens/total": 57090048, "tokens/train_per_sec_per_gpu": 3133.32, "tokens/trainable": 6073533 }, { "epoch": 1.39171974522293, "grad_norm": 0.140625, "learning_rate": 4.534239241377266e-05, "loss": 0.007054620422422886, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00708, "step": 437, "tokens/total": 57221120, "tokens/train_per_sec_per_gpu": 3767.46, "tokens/trainable": 6089174 }, { "epoch": 1.394904458598726, "grad_norm": 0.1455078125, "learning_rate": 4.5310031832769275e-05, "loss": 0.007198185659945011, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00722, "step": 438, "tokens/total": 57352192, "tokens/train_per_sec_per_gpu": 3417.69, "tokens/trainable": 6103402 }, { "epoch": 1.3980891719745223, "grad_norm": 0.1474609375, "learning_rate": 4.527757085358431e-05, "loss": 0.007888494990766048, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00792, "step": 439, "tokens/total": 57483264, "tokens/train_per_sec_per_gpu": 3744.21, "tokens/trainable": 6119012 }, { "epoch": 1.4012738853503186, "grad_norm": 0.19140625, "learning_rate": 4.52450096366815e-05, "loss": 0.010496556758880615, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01055, "step": 440, "tokens/total": 57614336, "tokens/train_per_sec_per_gpu": 3474.71, "tokens/trainable": 6133429 }, { "epoch": 1.4044585987261147, "grad_norm": 0.1572265625, "learning_rate": 4.521234834302006e-05, "loss": 0.008718312717974186, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00876, "step": 441, "tokens/total": 57745408, "tokens/train_per_sec_per_gpu": 3481.7, "tokens/trainable": 6147945 }, { "epoch": 1.4076433121019107, "grad_norm": 0.2041015625, "learning_rate": 4.5179587134053916e-05, "loss": 0.01150327455252409, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01157, "step": 442, "tokens/total": 57876480, "tokens/train_per_sec_per_gpu": 3229.57, "tokens/trainable": 6161469 }, { "epoch": 1.410828025477707, "grad_norm": 0.216796875, "learning_rate": 4.514672617173091e-05, "loss": 0.011761811561882496, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01183, "step": 443, "tokens/total": 58007552, "tokens/train_per_sec_per_gpu": 3416.37, "tokens/trainable": 6175738 }, { "epoch": 1.4140127388535033, "grad_norm": 0.177734375, "learning_rate": 4.511376561849201e-05, "loss": 0.008984040468931198, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00902, "step": 444, "tokens/total": 58138624, "tokens/train_per_sec_per_gpu": 3352.83, "tokens/trainable": 6189737 }, { "epoch": 1.4171974522292994, "grad_norm": 0.1748046875, "learning_rate": 4.5080705637270446e-05, "loss": 0.006133932154625654, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00615, "step": 445, "tokens/total": 58269696, "tokens/train_per_sec_per_gpu": 3183.46, "tokens/trainable": 6203050 }, { "epoch": 1.4203821656050954, "grad_norm": 0.169921875, "learning_rate": 4.5047546391491e-05, "loss": 0.008717117831110954, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00876, "step": 446, "tokens/total": 58400768, "tokens/train_per_sec_per_gpu": 3819.2, "tokens/trainable": 6218900 }, { "epoch": 1.4235668789808917, "grad_norm": 0.14453125, "learning_rate": 4.50142880450691e-05, "loss": 0.006517563946545124, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00654, "step": 447, "tokens/total": 58531840, "tokens/train_per_sec_per_gpu": 3083.03, "tokens/trainable": 6231819 }, { "epoch": 1.426751592356688, "grad_norm": 0.1591796875, "learning_rate": 4.4980930762410084e-05, "loss": 0.010371977463364601, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01043, "step": 448, "tokens/total": 58662912, "tokens/train_per_sec_per_gpu": 3608.97, "tokens/trainable": 6246842 }, { "epoch": 1.429936305732484, "grad_norm": 0.1787109375, "learning_rate": 4.4947474708408353e-05, "loss": 0.00814439170062542, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00818, "step": 449, "tokens/total": 58793984, "tokens/train_per_sec_per_gpu": 3576.32, "tokens/trainable": 6261750 }, { "epoch": 1.4331210191082802, "grad_norm": 0.181640625, "learning_rate": 4.491392004844656e-05, "loss": 0.00930082332342863, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00934, "step": 450, "tokens/total": 58925056, "tokens/train_per_sec_per_gpu": 3149.76, "tokens/trainable": 6274962 }, { "epoch": 1.4363057324840764, "grad_norm": 0.1875, "learning_rate": 4.48802669483948e-05, "loss": 0.01012382097542286, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01018, "step": 451, "tokens/total": 59056128, "tokens/train_per_sec_per_gpu": 3375.6, "tokens/trainable": 6289094 }, { "epoch": 1.4394904458598727, "grad_norm": 0.13671875, "learning_rate": 4.484651557460978e-05, "loss": 0.007823411375284195, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00785, "step": 452, "tokens/total": 59187200, "tokens/train_per_sec_per_gpu": 3541.02, "tokens/trainable": 6303818 }, { "epoch": 1.4426751592356688, "grad_norm": 0.1767578125, "learning_rate": 4.4812666093934e-05, "loss": 0.010683316737413406, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01074, "step": 453, "tokens/total": 59318272, "tokens/train_per_sec_per_gpu": 3656.59, "tokens/trainable": 6319060 }, { "epoch": 1.4458598726114649, "grad_norm": 0.1650390625, "learning_rate": 4.477871867369494e-05, "loss": 0.01043397095054388, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01049, "step": 454, "tokens/total": 59449344, "tokens/train_per_sec_per_gpu": 3638.87, "tokens/trainable": 6334323 }, { "epoch": 1.4490445859872612, "grad_norm": 0.16015625, "learning_rate": 4.474467348170421e-05, "loss": 0.008449015207588673, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00848, "step": 455, "tokens/total": 59580416, "tokens/train_per_sec_per_gpu": 3481.97, "tokens/trainable": 6348832 }, { "epoch": 1.4522292993630574, "grad_norm": 0.1494140625, "learning_rate": 4.471053068625674e-05, "loss": 0.008155008777976036, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00819, "step": 456, "tokens/total": 59711488, "tokens/train_per_sec_per_gpu": 3541.25, "tokens/trainable": 6363586 }, { "epoch": 1.4554140127388535, "grad_norm": 0.15625, "learning_rate": 4.467629045612994e-05, "loss": 0.008736428804695606, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00877, "step": 457, "tokens/total": 59842560, "tokens/train_per_sec_per_gpu": 3481.13, "tokens/trainable": 6378173 }, { "epoch": 1.4585987261146496, "grad_norm": 0.185546875, "learning_rate": 4.4641952960582877e-05, "loss": 0.013414832763373852, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01351, "step": 458, "tokens/total": 59973632, "tokens/train_per_sec_per_gpu": 3571.41, "tokens/trainable": 6393061 }, { "epoch": 1.4617834394904459, "grad_norm": 0.208984375, "learning_rate": 4.4607518369355403e-05, "loss": 0.008803540840744972, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00884, "step": 459, "tokens/total": 60104704, "tokens/train_per_sec_per_gpu": 3270.87, "tokens/trainable": 6406746 }, { "epoch": 1.4649681528662422, "grad_norm": 0.1806640625, "learning_rate": 4.457298685266737e-05, "loss": 0.008787565864622593, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00883, "step": 460, "tokens/total": 60235776, "tokens/train_per_sec_per_gpu": 3181.98, "tokens/trainable": 6420083 }, { "epoch": 1.4681528662420382, "grad_norm": 0.1943359375, "learning_rate": 4.453835858121773e-05, "loss": 0.008562528528273106, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0086, "step": 461, "tokens/total": 60366848, "tokens/train_per_sec_per_gpu": 3258.95, "tokens/trainable": 6433715 }, { "epoch": 1.4713375796178343, "grad_norm": 0.162109375, "learning_rate": 4.450363372618376e-05, "loss": 0.0074198306538164616, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00745, "step": 462, "tokens/total": 60497920, "tokens/train_per_sec_per_gpu": 3544.34, "tokens/trainable": 6448466 }, { "epoch": 1.4745222929936306, "grad_norm": 0.1484375, "learning_rate": 4.4468812459220135e-05, "loss": 0.006448620930314064, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00647, "step": 463, "tokens/total": 60628992, "tokens/train_per_sec_per_gpu": 3102.02, "tokens/trainable": 6461464 }, { "epoch": 1.4777070063694269, "grad_norm": 0.1572265625, "learning_rate": 4.4433894952458156e-05, "loss": 0.008648392744362354, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00869, "step": 464, "tokens/total": 60760064, "tokens/train_per_sec_per_gpu": 3168.95, "tokens/trainable": 6475263 }, { "epoch": 1.480891719745223, "grad_norm": 0.15234375, "learning_rate": 4.439888137850483e-05, "loss": 0.008528076112270355, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00856, "step": 465, "tokens/total": 60891136, "tokens/train_per_sec_per_gpu": 3278.68, "tokens/trainable": 6488927 }, { "epoch": 1.484076433121019, "grad_norm": 0.1806640625, "learning_rate": 4.436377191044208e-05, "loss": 0.009064987301826477, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00911, "step": 466, "tokens/total": 61022208, "tokens/train_per_sec_per_gpu": 3401.94, "tokens/trainable": 6503171 }, { "epoch": 1.4872611464968153, "grad_norm": 0.1552734375, "learning_rate": 4.4328566721825846e-05, "loss": 0.009180644527077675, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00922, "step": 467, "tokens/total": 61153280, "tokens/train_per_sec_per_gpu": 3399.29, "tokens/trainable": 6517402 }, { "epoch": 1.4904458598726116, "grad_norm": 0.2158203125, "learning_rate": 4.4293265986685264e-05, "loss": 0.00970767717808485, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00975, "step": 468, "tokens/total": 61284352, "tokens/train_per_sec_per_gpu": 2969.25, "tokens/trainable": 6529847 }, { "epoch": 1.4936305732484076, "grad_norm": 0.154296875, "learning_rate": 4.425786987952174e-05, "loss": 0.009157263673841953, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0092, "step": 469, "tokens/total": 61415424, "tokens/train_per_sec_per_gpu": 3645.56, "tokens/trainable": 6545001 }, { "epoch": 1.4968152866242037, "grad_norm": 0.1396484375, "learning_rate": 4.4222378575308164e-05, "loss": 0.0058856685645878315, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0059, "step": 470, "tokens/total": 61546496, "tokens/train_per_sec_per_gpu": 3065.44, "tokens/trainable": 6557875 }, { "epoch": 1.5, "grad_norm": 0.1552734375, "learning_rate": 4.4186792249488005e-05, "loss": 0.006844916380941868, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00687, "step": 471, "tokens/total": 61677568, "tokens/train_per_sec_per_gpu": 3406.5, "tokens/trainable": 6572069 }, { "epoch": 1.5, "eval_loss": 0.009513070806860924, "eval_ppl": 1.00956, "eval_runtime": 41.9975, "eval_samples_per_second": 64.313, "eval_steps_per_second": 4.024, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 471 }, { "epoch": 1.5031847133757963, "grad_norm": 0.18359375, "learning_rate": 4.415111107797445e-05, "loss": 0.007119299378246069, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00714, "step": 472, "tokens/total": 61808640, "tokens/train_per_sec_per_gpu": 3291.47, "tokens/trainable": 6585775 }, { "epoch": 1.5063694267515924, "grad_norm": 0.1689453125, "learning_rate": 4.411533523714954e-05, "loss": 0.007842868566513062, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00787, "step": 473, "tokens/total": 61939712, "tokens/train_per_sec_per_gpu": 3180.93, "tokens/trainable": 6599115 }, { "epoch": 1.5095541401273884, "grad_norm": 0.181640625, "learning_rate": 4.4079464903863266e-05, "loss": 0.008342721499502659, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00838, "step": 474, "tokens/total": 62070784, "tokens/train_per_sec_per_gpu": 3367.39, "tokens/trainable": 6613147 }, { "epoch": 1.5127388535031847, "grad_norm": 0.171875, "learning_rate": 4.404350025543276e-05, "loss": 0.010307609103620052, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01036, "step": 475, "tokens/total": 62201856, "tokens/train_per_sec_per_gpu": 3430.72, "tokens/trainable": 6627509 }, { "epoch": 1.515923566878981, "grad_norm": 0.1787109375, "learning_rate": 4.400744146964136e-05, "loss": 0.008362861350178719, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0084, "step": 476, "tokens/total": 62332928, "tokens/train_per_sec_per_gpu": 3051.31, "tokens/trainable": 6640317 }, { "epoch": 1.519108280254777, "grad_norm": 0.232421875, "learning_rate": 4.3971288724737745e-05, "loss": 0.009740196168422699, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00979, "step": 477, "tokens/total": 62464000, "tokens/train_per_sec_per_gpu": 2966.55, "tokens/trainable": 6652748 }, { "epoch": 1.5222929936305731, "grad_norm": 0.1328125, "learning_rate": 4.393504219943509e-05, "loss": 0.004925255198031664, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00494, "step": 478, "tokens/total": 62595072, "tokens/train_per_sec_per_gpu": 3160.26, "tokens/trainable": 6666019 }, { "epoch": 1.5254777070063694, "grad_norm": 0.1748046875, "learning_rate": 4.3898702072910095e-05, "loss": 0.008841407485306263, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00888, "step": 479, "tokens/total": 62726144, "tokens/train_per_sec_per_gpu": 3085.05, "tokens/trainable": 6679004 }, { "epoch": 1.5286624203821657, "grad_norm": 0.15625, "learning_rate": 4.386226852480223e-05, "loss": 0.007529627997428179, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00756, "step": 480, "tokens/total": 62857216, "tokens/train_per_sec_per_gpu": 3263.46, "tokens/trainable": 6692672 }, { "epoch": 1.5318471337579618, "grad_norm": 0.1376953125, "learning_rate": 4.382574173521272e-05, "loss": 0.006781514268368483, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0068, "step": 481, "tokens/total": 62988288, "tokens/train_per_sec_per_gpu": 3583.46, "tokens/trainable": 6707600 }, { "epoch": 1.5350318471337578, "grad_norm": 0.16015625, "learning_rate": 4.378912188470373e-05, "loss": 0.0076340967789292336, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00766, "step": 482, "tokens/total": 63119360, "tokens/train_per_sec_per_gpu": 3108.97, "tokens/trainable": 6720612 }, { "epoch": 1.5382165605095541, "grad_norm": 0.212890625, "learning_rate": 4.375240915429745e-05, "loss": 0.009363564662635326, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00941, "step": 483, "tokens/total": 63250432, "tokens/train_per_sec_per_gpu": 3151.23, "tokens/trainable": 6733897 }, { "epoch": 1.5414012738853504, "grad_norm": 0.1474609375, "learning_rate": 4.3715603725475195e-05, "loss": 0.008497594855725765, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00853, "step": 484, "tokens/total": 63381504, "tokens/train_per_sec_per_gpu": 3630.74, "tokens/trainable": 6749020 }, { "epoch": 1.5445859872611465, "grad_norm": 0.1162109375, "learning_rate": 4.367870578017653e-05, "loss": 0.004754690453410149, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00477, "step": 485, "tokens/total": 63512576, "tokens/train_per_sec_per_gpu": 3401.99, "tokens/trainable": 6763237 }, { "epoch": 1.5477707006369426, "grad_norm": 0.1748046875, "learning_rate": 4.364171550079833e-05, "loss": 0.010673021897673607, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01073, "step": 486, "tokens/total": 63643648, "tokens/train_per_sec_per_gpu": 3289.49, "tokens/trainable": 6777048 }, { "epoch": 1.5509554140127388, "grad_norm": 0.1748046875, "learning_rate": 4.3604633070193915e-05, "loss": 0.009158292785286903, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0092, "step": 487, "tokens/total": 63774720, "tokens/train_per_sec_per_gpu": 3323.4, "tokens/trainable": 6790934 }, { "epoch": 1.5541401273885351, "grad_norm": 0.1298828125, "learning_rate": 4.3567458671672154e-05, "loss": 0.007650249172002077, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00768, "step": 488, "tokens/total": 63905792, "tokens/train_per_sec_per_gpu": 3747.82, "tokens/trainable": 6806546 }, { "epoch": 1.5573248407643312, "grad_norm": 0.142578125, "learning_rate": 4.35301924889965e-05, "loss": 0.006640854757279158, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00666, "step": 489, "tokens/total": 64036864, "tokens/train_per_sec_per_gpu": 3390.74, "tokens/trainable": 6820768 }, { "epoch": 1.5605095541401273, "grad_norm": 0.1611328125, "learning_rate": 4.3492834706384154e-05, "loss": 0.008299214765429497, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00833, "step": 490, "tokens/total": 64167936, "tokens/train_per_sec_per_gpu": 3305.77, "tokens/trainable": 6834601 }, { "epoch": 1.5636942675159236, "grad_norm": 0.1416015625, "learning_rate": 4.345538550850512e-05, "loss": 0.0071832421235740185, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00721, "step": 491, "tokens/total": 64299008, "tokens/train_per_sec_per_gpu": 3301.85, "tokens/trainable": 6848433 }, { "epoch": 1.5668789808917198, "grad_norm": 0.16796875, "learning_rate": 4.3417845080481255e-05, "loss": 0.008073330856859684, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00811, "step": 492, "tokens/total": 64430080, "tokens/train_per_sec_per_gpu": 3378.84, "tokens/trainable": 6862587 }, { "epoch": 1.570063694267516, "grad_norm": 0.1611328125, "learning_rate": 4.3380213607885443e-05, "loss": 0.009880865924060345, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00993, "step": 493, "tokens/total": 64561152, "tokens/train_per_sec_per_gpu": 3303.22, "tokens/trainable": 6876421 }, { "epoch": 1.573248407643312, "grad_norm": 0.1806640625, "learning_rate": 4.3342491276740595e-05, "loss": 0.008753279224038124, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00879, "step": 494, "tokens/total": 64692224, "tokens/train_per_sec_per_gpu": 3323.89, "tokens/trainable": 6890346 }, { "epoch": 1.5764331210191083, "grad_norm": 0.15234375, "learning_rate": 4.3304678273518776e-05, "loss": 0.009203528985381126, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00925, "step": 495, "tokens/total": 64823296, "tokens/train_per_sec_per_gpu": 3358.21, "tokens/trainable": 6904412 }, { "epoch": 1.5796178343949046, "grad_norm": 0.1689453125, "learning_rate": 4.326677478514024e-05, "loss": 0.00659502949565649, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00662, "step": 496, "tokens/total": 64954368, "tokens/train_per_sec_per_gpu": 3221.86, "tokens/trainable": 6917910 }, { "epoch": 1.5828025477707006, "grad_norm": 0.1591796875, "learning_rate": 4.322878099897259e-05, "loss": 0.009297506883740425, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00934, "step": 497, "tokens/total": 65085440, "tokens/train_per_sec_per_gpu": 3425.35, "tokens/trainable": 6932231 }, { "epoch": 1.5859872611464967, "grad_norm": 0.134765625, "learning_rate": 4.319069710282974e-05, "loss": 0.006143941078335047, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00616, "step": 498, "tokens/total": 65216512, "tokens/train_per_sec_per_gpu": 3594.69, "tokens/trainable": 6947330 }, { "epoch": 1.589171974522293, "grad_norm": 0.1689453125, "learning_rate": 4.315252328497107e-05, "loss": 0.006281242705881596, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0063, "step": 499, "tokens/total": 65347584, "tokens/train_per_sec_per_gpu": 3393.37, "tokens/trainable": 6961586 }, { "epoch": 1.5923566878980893, "grad_norm": 0.1572265625, "learning_rate": 4.311425973410047e-05, "loss": 0.007922859862446785, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00795, "step": 500, "tokens/total": 65478656, "tokens/train_per_sec_per_gpu": 3263.1, "tokens/trainable": 6975331 }, { "epoch": 1.5955414012738853, "grad_norm": 0.23046875, "learning_rate": 4.307590663936541e-05, "loss": 0.009491047821938992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00954, "step": 501, "tokens/total": 65609728, "tokens/train_per_sec_per_gpu": 3050.63, "tokens/trainable": 6988184 }, { "epoch": 1.5987261146496814, "grad_norm": 0.1591796875, "learning_rate": 4.3037464190355955e-05, "loss": 0.007340395823121071, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00737, "step": 502, "tokens/total": 65740800, "tokens/train_per_sec_per_gpu": 3185.7, "tokens/trainable": 7001560 }, { "epoch": 1.6019108280254777, "grad_norm": 0.13671875, "learning_rate": 4.299893257710394e-05, "loss": 0.006943684071302414, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00697, "step": 503, "tokens/total": 65871872, "tokens/train_per_sec_per_gpu": 3219.92, "tokens/trainable": 7015042 }, { "epoch": 1.605095541401274, "grad_norm": 0.185546875, "learning_rate": 4.2960311990081924e-05, "loss": 0.009585048072040081, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00963, "step": 504, "tokens/total": 66002944, "tokens/train_per_sec_per_gpu": 3349.25, "tokens/trainable": 7029069 }, { "epoch": 1.60828025477707, "grad_norm": 0.1748046875, "learning_rate": 4.292160262020229e-05, "loss": 0.007607592269778252, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00764, "step": 505, "tokens/total": 66134016, "tokens/train_per_sec_per_gpu": 3369.3, "tokens/trainable": 7043148 }, { "epoch": 1.611464968152866, "grad_norm": 0.16015625, "learning_rate": 4.288280465881632e-05, "loss": 0.009396728128194809, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00944, "step": 506, "tokens/total": 66265088, "tokens/train_per_sec_per_gpu": 3676.06, "tokens/trainable": 7058458 }, { "epoch": 1.6146496815286624, "grad_norm": 0.1474609375, "learning_rate": 4.2843918297713196e-05, "loss": 0.007050440181046724, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00708, "step": 507, "tokens/total": 66396160, "tokens/train_per_sec_per_gpu": 3145.55, "tokens/trainable": 7071711 }, { "epoch": 1.6178343949044587, "grad_norm": 0.126953125, "learning_rate": 4.2804943729119115e-05, "loss": 0.007194128353148699, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00722, "step": 508, "tokens/total": 66527232, "tokens/train_per_sec_per_gpu": 3462.21, "tokens/trainable": 7086201 }, { "epoch": 1.6210191082802548, "grad_norm": 0.17578125, "learning_rate": 4.2765881145696306e-05, "loss": 0.00787313375622034, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0079, "step": 509, "tokens/total": 66658304, "tokens/train_per_sec_per_gpu": 3054.79, "tokens/trainable": 7099037 }, { "epoch": 1.6242038216560508, "grad_norm": 0.1572265625, "learning_rate": 4.272673074054205e-05, "loss": 0.006892327219247818, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00692, "step": 510, "tokens/total": 66789376, "tokens/train_per_sec_per_gpu": 3439.83, "tokens/trainable": 7113420 }, { "epoch": 1.627388535031847, "grad_norm": 0.13671875, "learning_rate": 4.268749270718778e-05, "loss": 0.006877953186631203, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0069, "step": 511, "tokens/total": 66920448, "tokens/train_per_sec_per_gpu": 3209.06, "tokens/trainable": 7126871 }, { "epoch": 1.6305732484076434, "grad_norm": 0.1474609375, "learning_rate": 4.2648167239598115e-05, "loss": 0.00894979014992714, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00899, "step": 512, "tokens/total": 67051520, "tokens/train_per_sec_per_gpu": 3488.79, "tokens/trainable": 7141439 }, { "epoch": 1.6337579617834395, "grad_norm": 0.2021484375, "learning_rate": 4.260875453216985e-05, "loss": 0.011133270338177681, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0112, "step": 513, "tokens/total": 67182592, "tokens/train_per_sec_per_gpu": 3127.41, "tokens/trainable": 7154645 }, { "epoch": 1.6369426751592355, "grad_norm": 0.1826171875, "learning_rate": 4.256925477973105e-05, "loss": 0.00897931307554245, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00902, "step": 514, "tokens/total": 67313664, "tokens/train_per_sec_per_gpu": 3531.85, "tokens/trainable": 7169429 }, { "epoch": 1.6401273885350318, "grad_norm": 0.1689453125, "learning_rate": 4.2529668177540064e-05, "loss": 0.007193025201559067, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00722, "step": 515, "tokens/total": 67444736, "tokens/train_per_sec_per_gpu": 3300.04, "tokens/trainable": 7183294 }, { "epoch": 1.643312101910828, "grad_norm": 0.1953125, "learning_rate": 4.248999492128456e-05, "loss": 0.008410904556512833, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00845, "step": 516, "tokens/total": 67575808, "tokens/train_per_sec_per_gpu": 3053.28, "tokens/trainable": 7196109 }, { "epoch": 1.6464968152866242, "grad_norm": 0.169921875, "learning_rate": 4.2450235207080594e-05, "loss": 0.007929853163659573, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00796, "step": 517, "tokens/total": 67706880, "tokens/train_per_sec_per_gpu": 3368.48, "tokens/trainable": 7210198 }, { "epoch": 1.6496815286624202, "grad_norm": 0.166015625, "learning_rate": 4.241038923147154e-05, "loss": 0.011742248199880123, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01181, "step": 518, "tokens/total": 67837952, "tokens/train_per_sec_per_gpu": 3747.84, "tokens/trainable": 7225870 }, { "epoch": 1.6528662420382165, "grad_norm": 0.150390625, "learning_rate": 4.237045719142726e-05, "loss": 0.007296052295714617, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00732, "step": 519, "tokens/total": 67969024, "tokens/train_per_sec_per_gpu": 3095.92, "tokens/trainable": 7238841 }, { "epoch": 1.6560509554140128, "grad_norm": 0.15234375, "learning_rate": 4.2330439284343015e-05, "loss": 0.006907866336405277, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00693, "step": 520, "tokens/total": 68100096, "tokens/train_per_sec_per_gpu": 3589.32, "tokens/trainable": 7253801 }, { "epoch": 1.6592356687898089, "grad_norm": 0.15625, "learning_rate": 4.229033570803853e-05, "loss": 0.0074706668965518475, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0075, "step": 521, "tokens/total": 68231168, "tokens/train_per_sec_per_gpu": 3802.59, "tokens/trainable": 7269629 }, { "epoch": 1.662420382165605, "grad_norm": 0.1513671875, "learning_rate": 4.2250146660757036e-05, "loss": 0.009104968048632145, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00915, "step": 522, "tokens/total": 68362240, "tokens/train_per_sec_per_gpu": 3755.79, "tokens/trainable": 7285363 }, { "epoch": 1.6656050955414012, "grad_norm": 0.1484375, "learning_rate": 4.220987234116426e-05, "loss": 0.005891850218176842, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00591, "step": 523, "tokens/total": 68493312, "tokens/train_per_sec_per_gpu": 3446.53, "tokens/trainable": 7299790 }, { "epoch": 1.6687898089171975, "grad_norm": 0.162109375, "learning_rate": 4.216951294834744e-05, "loss": 0.006473960820585489, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00649, "step": 524, "tokens/total": 68624384, "tokens/train_per_sec_per_gpu": 3751.7, "tokens/trainable": 7315516 }, { "epoch": 1.6719745222929936, "grad_norm": 0.1337890625, "learning_rate": 4.2129068681814396e-05, "loss": 0.0052047837525606155, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00522, "step": 525, "tokens/total": 68755456, "tokens/train_per_sec_per_gpu": 3241.92, "tokens/trainable": 7329146 }, { "epoch": 1.6751592356687897, "grad_norm": 0.2490234375, "learning_rate": 4.208853974149246e-05, "loss": 0.01116788387298584, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01123, "step": 526, "tokens/total": 68886528, "tokens/train_per_sec_per_gpu": 3005.85, "tokens/trainable": 7341894 }, { "epoch": 1.678343949044586, "grad_norm": 0.2490234375, "learning_rate": 4.204792632772754e-05, "loss": 0.01081200409680605, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01087, "step": 527, "tokens/total": 69017600, "tokens/train_per_sec_per_gpu": 3072.53, "tokens/trainable": 7354819 }, { "epoch": 1.6815286624203822, "grad_norm": 0.181640625, "learning_rate": 4.200722864128315e-05, "loss": 0.007884484715759754, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00792, "step": 528, "tokens/total": 69148672, "tokens/train_per_sec_per_gpu": 3481.52, "tokens/trainable": 7369372 }, { "epoch": 1.6847133757961783, "grad_norm": 0.146484375, "learning_rate": 4.196644688333935e-05, "loss": 0.006211051717400551, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00623, "step": 529, "tokens/total": 69279744, "tokens/train_per_sec_per_gpu": 3436.61, "tokens/trainable": 7383760 }, { "epoch": 1.6878980891719744, "grad_norm": 0.1689453125, "learning_rate": 4.19255812554918e-05, "loss": 0.007918811403214931, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00795, "step": 530, "tokens/total": 69410816, "tokens/train_per_sec_per_gpu": 3498.93, "tokens/trainable": 7398384 }, { "epoch": 1.6910828025477707, "grad_norm": 0.2177734375, "learning_rate": 4.1884631959750766e-05, "loss": 0.007444203365594149, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00747, "step": 531, "tokens/total": 69541888, "tokens/train_per_sec_per_gpu": 3052.82, "tokens/trainable": 7411204 }, { "epoch": 1.694267515923567, "grad_norm": 0.2392578125, "learning_rate": 4.1843599198540095e-05, "loss": 0.006427375599741936, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00645, "step": 532, "tokens/total": 69672960, "tokens/train_per_sec_per_gpu": 2976.57, "tokens/trainable": 7423690 }, { "epoch": 1.697452229299363, "grad_norm": 0.169921875, "learning_rate": 4.1802483174696214e-05, "loss": 0.007701891474425793, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00773, "step": 533, "tokens/total": 69804032, "tokens/train_per_sec_per_gpu": 2933.51, "tokens/trainable": 7436112 }, { "epoch": 1.700636942675159, "grad_norm": 0.13671875, "learning_rate": 4.176128409146718e-05, "loss": 0.006673748139292002, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0067, "step": 534, "tokens/total": 69935104, "tokens/train_per_sec_per_gpu": 3182.67, "tokens/trainable": 7449477 }, { "epoch": 1.7038216560509554, "grad_norm": 0.1328125, "learning_rate": 4.172000215251161e-05, "loss": 0.008220399729907513, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00825, "step": 535, "tokens/total": 70066176, "tokens/train_per_sec_per_gpu": 3196.8, "tokens/trainable": 7462890 }, { "epoch": 1.7070063694267517, "grad_norm": 0.1982421875, "learning_rate": 4.167863756189767e-05, "loss": 0.008523629046976566, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00856, "step": 536, "tokens/total": 70197248, "tokens/train_per_sec_per_gpu": 3266.89, "tokens/trainable": 7476579 }, { "epoch": 1.7101910828025477, "grad_norm": 0.1630859375, "learning_rate": 4.163719052410217e-05, "loss": 0.008510093204677105, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00855, "step": 537, "tokens/total": 70328320, "tokens/train_per_sec_per_gpu": 3648.67, "tokens/trainable": 7491858 }, { "epoch": 1.7133757961783438, "grad_norm": 0.16796875, "learning_rate": 4.159566124400942e-05, "loss": 0.00962991826236248, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00968, "step": 538, "tokens/total": 70459392, "tokens/train_per_sec_per_gpu": 3612.85, "tokens/trainable": 7507000 }, { "epoch": 1.71656050955414, "grad_norm": 0.1611328125, "learning_rate": 4.1554049926910285e-05, "loss": 0.006633860524743795, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00666, "step": 539, "tokens/total": 70590464, "tokens/train_per_sec_per_gpu": 3414.03, "tokens/trainable": 7521257 }, { "epoch": 1.7197452229299364, "grad_norm": 0.1611328125, "learning_rate": 4.151235677850119e-05, "loss": 0.007898521609604359, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00793, "step": 540, "tokens/total": 70721536, "tokens/train_per_sec_per_gpu": 3441.64, "tokens/trainable": 7535621 }, { "epoch": 1.7229299363057324, "grad_norm": 0.154296875, "learning_rate": 4.147058200488305e-05, "loss": 0.009673213586211205, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00972, "step": 541, "tokens/total": 70852608, "tokens/train_per_sec_per_gpu": 3247.07, "tokens/trainable": 7549162 }, { "epoch": 1.7261146496815285, "grad_norm": 0.1416015625, "learning_rate": 4.142872581256028e-05, "loss": 0.007840042002499104, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00787, "step": 542, "tokens/total": 70983680, "tokens/train_per_sec_per_gpu": 3313.54, "tokens/trainable": 7563047 }, { "epoch": 1.7292993630573248, "grad_norm": 0.158203125, "learning_rate": 4.1386788408439784e-05, "loss": 0.005681775975972414, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0057, "step": 543, "tokens/total": 71114752, "tokens/train_per_sec_per_gpu": 3239.22, "tokens/trainable": 7576603 }, { "epoch": 1.732484076433121, "grad_norm": 0.140625, "learning_rate": 4.134476999982989e-05, "loss": 0.005047548562288284, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00506, "step": 544, "tokens/total": 71245824, "tokens/train_per_sec_per_gpu": 3265.49, "tokens/trainable": 7590285 }, { "epoch": 1.7356687898089171, "grad_norm": 0.1572265625, "learning_rate": 4.130267079443938e-05, "loss": 0.0074127367697656155, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00744, "step": 545, "tokens/total": 71376896, "tokens/train_per_sec_per_gpu": 3477.95, "tokens/trainable": 7604842 }, { "epoch": 1.7388535031847132, "grad_norm": 0.197265625, "learning_rate": 4.1260491000376446e-05, "loss": 0.007608677726238966, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00764, "step": 546, "tokens/total": 71507968, "tokens/train_per_sec_per_gpu": 3118.58, "tokens/trainable": 7617958 }, { "epoch": 1.7420382165605095, "grad_norm": 0.21484375, "learning_rate": 4.1218230826147615e-05, "loss": 0.01108642015606165, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01115, "step": 547, "tokens/total": 71639040, "tokens/train_per_sec_per_gpu": 3443.01, "tokens/trainable": 7632387 }, { "epoch": 1.7452229299363058, "grad_norm": 0.1552734375, "learning_rate": 4.117589048065677e-05, "loss": 0.006157029885798693, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00618, "step": 548, "tokens/total": 71770112, "tokens/train_per_sec_per_gpu": 3439.27, "tokens/trainable": 7646780 }, { "epoch": 1.7484076433121019, "grad_norm": 0.138671875, "learning_rate": 4.113347017320414e-05, "loss": 0.005342322401702404, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00536, "step": 549, "tokens/total": 71901184, "tokens/train_per_sec_per_gpu": 3001.79, "tokens/trainable": 7659368 }, { "epoch": 1.7515923566878981, "grad_norm": 0.09423828125, "learning_rate": 4.1090970113485184e-05, "loss": 0.0040708379819989204, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00408, "step": 550, "tokens/total": 72032256, "tokens/train_per_sec_per_gpu": 3546.66, "tokens/trainable": 7674195 }, { "epoch": 1.7547770700636942, "grad_norm": 0.19921875, "learning_rate": 4.1048390511589595e-05, "loss": 0.01067125890403986, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01073, "step": 551, "tokens/total": 72163328, "tokens/train_per_sec_per_gpu": 3415.06, "tokens/trainable": 7688507 }, { "epoch": 1.7579617834394905, "grad_norm": 0.171875, "learning_rate": 4.1005731578000305e-05, "loss": 0.008569694124162197, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00861, "step": 552, "tokens/total": 72294400, "tokens/train_per_sec_per_gpu": 3627.75, "tokens/trainable": 7703620 }, { "epoch": 1.7611464968152868, "grad_norm": 0.1767578125, "learning_rate": 4.0962993523592374e-05, "loss": 0.009042307734489441, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00908, "step": 553, "tokens/total": 72425472, "tokens/train_per_sec_per_gpu": 3269.54, "tokens/trainable": 7717318 }, { "epoch": 1.7643312101910829, "grad_norm": 0.1943359375, "learning_rate": 4.092017655963198e-05, "loss": 0.007899527437984943, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00793, "step": 554, "tokens/total": 72556544, "tokens/train_per_sec_per_gpu": 3212.13, "tokens/trainable": 7730777 }, { "epoch": 1.767515923566879, "grad_norm": 0.1787109375, "learning_rate": 4.0877280897775406e-05, "loss": 0.010296393185853958, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01035, "step": 555, "tokens/total": 72687616, "tokens/train_per_sec_per_gpu": 3337.93, "tokens/trainable": 7744761 }, { "epoch": 1.7707006369426752, "grad_norm": 0.146484375, "learning_rate": 4.083430675006791e-05, "loss": 0.009942286647856236, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00999, "step": 556, "tokens/total": 72818688, "tokens/train_per_sec_per_gpu": 3400.69, "tokens/trainable": 7759000 }, { "epoch": 1.7738853503184715, "grad_norm": 0.197265625, "learning_rate": 4.0791254328942756e-05, "loss": 0.00717775197699666, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0072, "step": 557, "tokens/total": 72949760, "tokens/train_per_sec_per_gpu": 3122.89, "tokens/trainable": 7772100 }, { "epoch": 1.7770700636942676, "grad_norm": 0.1435546875, "learning_rate": 4.074812384722014e-05, "loss": 0.008067919872701168, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0081, "step": 558, "tokens/total": 73080832, "tokens/train_per_sec_per_gpu": 3519.68, "tokens/trainable": 7786740 }, { "epoch": 1.7802547770700636, "grad_norm": 0.169921875, "learning_rate": 4.0704915518106125e-05, "loss": 0.007346912752836943, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00737, "step": 559, "tokens/total": 73211904, "tokens/train_per_sec_per_gpu": 3104.83, "tokens/trainable": 7799749 }, { "epoch": 1.78343949044586, "grad_norm": 0.1298828125, "learning_rate": 4.066162955519159e-05, "loss": 0.0073562380857765675, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00738, "step": 560, "tokens/total": 73342976, "tokens/train_per_sec_per_gpu": 3360.32, "tokens/trainable": 7813821 }, { "epoch": 1.7866242038216562, "grad_norm": 0.19921875, "learning_rate": 4.061826617245119e-05, "loss": 0.007865460589528084, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0079, "step": 561, "tokens/total": 73474048, "tokens/train_per_sec_per_gpu": 2859.48, "tokens/trainable": 7825883 }, { "epoch": 1.7898089171974523, "grad_norm": 0.189453125, "learning_rate": 4.0574825584242275e-05, "loss": 0.008709411136806011, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00875, "step": 562, "tokens/total": 73605120, "tokens/train_per_sec_per_gpu": 3470.03, "tokens/trainable": 7840437 }, { "epoch": 1.7929936305732483, "grad_norm": 0.1728515625, "learning_rate": 4.053130800530386e-05, "loss": 0.010312874801456928, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01037, "step": 563, "tokens/total": 73736192, "tokens/train_per_sec_per_gpu": 3416.83, "tokens/trainable": 7854748 }, { "epoch": 1.7961783439490446, "grad_norm": 0.1787109375, "learning_rate": 4.048771365075554e-05, "loss": 0.006712635047733784, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00674, "step": 564, "tokens/total": 73867264, "tokens/train_per_sec_per_gpu": 3008.08, "tokens/trainable": 7867356 }, { "epoch": 1.799363057324841, "grad_norm": 0.216796875, "learning_rate": 4.0444042736096435e-05, "loss": 0.012959079816937447, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01304, "step": 565, "tokens/total": 73998336, "tokens/train_per_sec_per_gpu": 3455.78, "tokens/trainable": 7881813 }, { "epoch": 1.802547770700637, "grad_norm": 0.142578125, "learning_rate": 4.0400295477204105e-05, "loss": 0.006475909147411585, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0065, "step": 566, "tokens/total": 74129408, "tokens/train_per_sec_per_gpu": 3215.58, "tokens/trainable": 7895287 }, { "epoch": 1.805732484076433, "grad_norm": 0.1748046875, "learning_rate": 4.035647209033353e-05, "loss": 0.009855620563030243, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0099, "step": 567, "tokens/total": 74260480, "tokens/train_per_sec_per_gpu": 3762.07, "tokens/trainable": 7910983 }, { "epoch": 1.8089171974522293, "grad_norm": 0.169921875, "learning_rate": 4.031257279211599e-05, "loss": 0.007472330704331398, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0075, "step": 568, "tokens/total": 74391552, "tokens/train_per_sec_per_gpu": 3163.77, "tokens/trainable": 7924299 }, { "epoch": 1.8121019108280256, "grad_norm": 0.16796875, "learning_rate": 4.026859779955802e-05, "loss": 0.008227458223700523, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00826, "step": 569, "tokens/total": 74522624, "tokens/train_per_sec_per_gpu": 3363.95, "tokens/trainable": 7938293 }, { "epoch": 1.8152866242038217, "grad_norm": 0.1591796875, "learning_rate": 4.022454733004035e-05, "loss": 0.0075818696059286594, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00761, "step": 570, "tokens/total": 74653696, "tokens/train_per_sec_per_gpu": 2956.76, "tokens/trainable": 7950764 }, { "epoch": 1.8184713375796178, "grad_norm": 0.1865234375, "learning_rate": 4.01804216013168e-05, "loss": 0.009609042666852474, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00966, "step": 571, "tokens/total": 74784768, "tokens/train_per_sec_per_gpu": 3564.99, "tokens/trainable": 7965654 }, { "epoch": 1.821656050955414, "grad_norm": 0.2041015625, "learning_rate": 4.013622083151321e-05, "loss": 0.011200753971934319, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01126, "step": 572, "tokens/total": 74915840, "tokens/train_per_sec_per_gpu": 3175.53, "tokens/trainable": 7978976 }, { "epoch": 1.8248407643312103, "grad_norm": 0.1943359375, "learning_rate": 4.009194523912638e-05, "loss": 0.011081540025770664, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01114, "step": 573, "tokens/total": 75046912, "tokens/train_per_sec_per_gpu": 3512.32, "tokens/trainable": 7993605 }, { "epoch": 1.8280254777070064, "grad_norm": 0.150390625, "learning_rate": 4.004759504302297e-05, "loss": 0.007977863773703575, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00801, "step": 574, "tokens/total": 75177984, "tokens/train_per_sec_per_gpu": 3555.25, "tokens/trainable": 8008422 }, { "epoch": 1.8312101910828025, "grad_norm": 0.1474609375, "learning_rate": 4.000317046243845e-05, "loss": 0.005992071703076363, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00601, "step": 575, "tokens/total": 75309056, "tokens/train_per_sec_per_gpu": 2628.73, "tokens/trainable": 8019575 }, { "epoch": 1.8343949044585988, "grad_norm": 0.134765625, "learning_rate": 3.9958671716975966e-05, "loss": 0.005763609427958727, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00578, "step": 576, "tokens/total": 75440128, "tokens/train_per_sec_per_gpu": 3099.03, "tokens/trainable": 8032607 }, { "epoch": 1.837579617834395, "grad_norm": 0.18359375, "learning_rate": 3.9914099026605286e-05, "loss": 0.00910909567028284, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00915, "step": 577, "tokens/total": 75571200, "tokens/train_per_sec_per_gpu": 3311.4, "tokens/trainable": 8046493 }, { "epoch": 1.8407643312101911, "grad_norm": 0.1630859375, "learning_rate": 3.986945261166174e-05, "loss": 0.0058432393707334995, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00586, "step": 578, "tokens/total": 75702272, "tokens/train_per_sec_per_gpu": 2863.58, "tokens/trainable": 8058562 }, { "epoch": 1.8439490445859872, "grad_norm": 0.1494140625, "learning_rate": 3.9824732692845045e-05, "loss": 0.006885102018713951, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00691, "step": 579, "tokens/total": 75833344, "tokens/train_per_sec_per_gpu": 3605.27, "tokens/trainable": 8073625 }, { "epoch": 1.8471337579617835, "grad_norm": 0.150390625, "learning_rate": 3.977993949121831e-05, "loss": 0.007448772434145212, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00748, "step": 580, "tokens/total": 75964416, "tokens/train_per_sec_per_gpu": 3178.81, "tokens/trainable": 8086952 }, { "epoch": 1.8503184713375798, "grad_norm": 0.1787109375, "learning_rate": 3.9735073228206896e-05, "loss": 0.01037865225225687, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01043, "step": 581, "tokens/total": 76095488, "tokens/train_per_sec_per_gpu": 3771.77, "tokens/trainable": 8102605 }, { "epoch": 1.8535031847133758, "grad_norm": 0.1728515625, "learning_rate": 3.9690134125597315e-05, "loss": 0.005139034241437912, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00515, "step": 582, "tokens/total": 76226560, "tokens/train_per_sec_per_gpu": 3043.71, "tokens/trainable": 8115368 }, { "epoch": 1.856687898089172, "grad_norm": 0.1376953125, "learning_rate": 3.9645122405536144e-05, "loss": 0.006013063248246908, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00603, "step": 583, "tokens/total": 76357632, "tokens/train_per_sec_per_gpu": 3251.38, "tokens/trainable": 8129041 }, { "epoch": 1.8598726114649682, "grad_norm": 0.1728515625, "learning_rate": 3.9600038290528944e-05, "loss": 0.00799723993986845, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00803, "step": 584, "tokens/total": 76488704, "tokens/train_per_sec_per_gpu": 3320.59, "tokens/trainable": 8142910 }, { "epoch": 1.8630573248407645, "grad_norm": 0.142578125, "learning_rate": 3.955488200343913e-05, "loss": 0.00701179401949048, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00704, "step": 585, "tokens/total": 76619776, "tokens/train_per_sec_per_gpu": 3561.59, "tokens/trainable": 8157807 }, { "epoch": 1.8662420382165605, "grad_norm": 0.1533203125, "learning_rate": 3.950965376748689e-05, "loss": 0.00536251999437809, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00538, "step": 586, "tokens/total": 76750848, "tokens/train_per_sec_per_gpu": 3253.01, "tokens/trainable": 8171483 }, { "epoch": 1.8694267515923566, "grad_norm": 0.1474609375, "learning_rate": 3.946435380624808e-05, "loss": 0.005477463360875845, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00549, "step": 587, "tokens/total": 76881920, "tokens/train_per_sec_per_gpu": 2986.43, "tokens/trainable": 8184075 }, { "epoch": 1.872611464968153, "grad_norm": 0.154296875, "learning_rate": 3.94189823436531e-05, "loss": 0.007740751840174198, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00777, "step": 588, "tokens/total": 77012992, "tokens/train_per_sec_per_gpu": 3302.5, "tokens/trainable": 8197922 }, { "epoch": 1.8757961783439492, "grad_norm": 0.1669921875, "learning_rate": 3.937353960398581e-05, "loss": 0.007541216444224119, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00757, "step": 589, "tokens/total": 77144064, "tokens/train_per_sec_per_gpu": 3178.73, "tokens/trainable": 8211342 }, { "epoch": 1.8789808917197452, "grad_norm": 0.1435546875, "learning_rate": 3.932802581188243e-05, "loss": 0.006363678723573685, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00638, "step": 590, "tokens/total": 77275136, "tokens/train_per_sec_per_gpu": 3260.49, "tokens/trainable": 8225038 }, { "epoch": 1.8821656050955413, "grad_norm": 0.2060546875, "learning_rate": 3.928244119233038e-05, "loss": 0.010623229667544365, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01068, "step": 591, "tokens/total": 77406208, "tokens/train_per_sec_per_gpu": 3108.47, "tokens/trainable": 8238178 }, { "epoch": 1.8853503184713376, "grad_norm": 0.205078125, "learning_rate": 3.9236785970667214e-05, "loss": 0.008010565303266048, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00804, "step": 592, "tokens/total": 77537280, "tokens/train_per_sec_per_gpu": 3537.95, "tokens/trainable": 8252970 }, { "epoch": 1.888535031847134, "grad_norm": 0.130859375, "learning_rate": 3.91910603725795e-05, "loss": 0.006147422362118959, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00617, "step": 593, "tokens/total": 77668352, "tokens/train_per_sec_per_gpu": 3429.45, "tokens/trainable": 8267246 }, { "epoch": 1.89171974522293, "grad_norm": 0.162109375, "learning_rate": 3.9145264624101676e-05, "loss": 0.007066651247441769, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00709, "step": 594, "tokens/total": 77799424, "tokens/train_per_sec_per_gpu": 3344.2, "tokens/trainable": 8281242 }, { "epoch": 1.894904458598726, "grad_norm": 0.169921875, "learning_rate": 3.909939895161498e-05, "loss": 0.007343544624745846, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00737, "step": 595, "tokens/total": 77930496, "tokens/train_per_sec_per_gpu": 3615.15, "tokens/trainable": 8296272 }, { "epoch": 1.8980891719745223, "grad_norm": 0.1669921875, "learning_rate": 3.905346358184629e-05, "loss": 0.006192365661263466, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00621, "step": 596, "tokens/total": 78061568, "tokens/train_per_sec_per_gpu": 3043.7, "tokens/trainable": 8309000 }, { "epoch": 1.9012738853503186, "grad_norm": 0.201171875, "learning_rate": 3.900745874186701e-05, "loss": 0.008313626050949097, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00835, "step": 597, "tokens/total": 78192640, "tokens/train_per_sec_per_gpu": 3567.72, "tokens/trainable": 8323863 }, { "epoch": 1.9044585987261147, "grad_norm": 0.1298828125, "learning_rate": 3.896138465909196e-05, "loss": 0.006214370485395193, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00623, "step": 598, "tokens/total": 78323712, "tokens/train_per_sec_per_gpu": 3365.54, "tokens/trainable": 8337954 }, { "epoch": 1.9076433121019107, "grad_norm": 0.1591796875, "learning_rate": 3.8915241561278266e-05, "loss": 0.007558876648545265, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00759, "step": 599, "tokens/total": 78454784, "tokens/train_per_sec_per_gpu": 3425.9, "tokens/trainable": 8352302 }, { "epoch": 1.910828025477707, "grad_norm": 0.166015625, "learning_rate": 3.8869029676524174e-05, "loss": 0.005686955992132425, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0057, "step": 600, "tokens/total": 78585856, "tokens/train_per_sec_per_gpu": 3357.73, "tokens/trainable": 8366310 }, { "epoch": 1.9140127388535033, "grad_norm": 0.1630859375, "learning_rate": 3.8822749233268006e-05, "loss": 0.0077353366650640965, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00777, "step": 601, "tokens/total": 78716928, "tokens/train_per_sec_per_gpu": 3165.47, "tokens/trainable": 8379573 }, { "epoch": 1.9171974522292994, "grad_norm": 0.1279296875, "learning_rate": 3.877640046028696e-05, "loss": 0.0062081338837742805, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00623, "step": 602, "tokens/total": 78848000, "tokens/train_per_sec_per_gpu": 3562.58, "tokens/trainable": 8394385 }, { "epoch": 1.9203821656050954, "grad_norm": 0.154296875, "learning_rate": 3.872998358669601e-05, "loss": 0.006809039041399956, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00683, "step": 603, "tokens/total": 78979072, "tokens/train_per_sec_per_gpu": 3598.12, "tokens/trainable": 8409414 }, { "epoch": 1.9235668789808917, "grad_norm": 0.1513671875, "learning_rate": 3.868349884194678e-05, "loss": 0.004747939296066761, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00476, "step": 604, "tokens/total": 79110144, "tokens/train_per_sec_per_gpu": 3006.34, "tokens/trainable": 8422068 }, { "epoch": 1.926751592356688, "grad_norm": 0.1806640625, "learning_rate": 3.863694645582642e-05, "loss": 0.007029777858406305, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00705, "step": 605, "tokens/total": 79241216, "tokens/train_per_sec_per_gpu": 3244.04, "tokens/trainable": 8435649 }, { "epoch": 1.929936305732484, "grad_norm": 0.1650390625, "learning_rate": 3.8590326658456376e-05, "loss": 0.006050920579582453, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00607, "step": 606, "tokens/total": 79372288, "tokens/train_per_sec_per_gpu": 3305.31, "tokens/trainable": 8449489 }, { "epoch": 1.9331210191082802, "grad_norm": 0.17578125, "learning_rate": 3.854363968029142e-05, "loss": 0.0075315129943192005, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00756, "step": 607, "tokens/total": 79503360, "tokens/train_per_sec_per_gpu": 3186.77, "tokens/trainable": 8462838 }, { "epoch": 1.9363057324840764, "grad_norm": 0.158203125, "learning_rate": 3.849688575211836e-05, "loss": 0.006646899972110987, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00667, "step": 608, "tokens/total": 79634432, "tokens/train_per_sec_per_gpu": 3410.5, "tokens/trainable": 8477093 }, { "epoch": 1.9394904458598727, "grad_norm": 0.189453125, "learning_rate": 3.8450065105054966e-05, "loss": 0.00727155851200223, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0073, "step": 609, "tokens/total": 79765504, "tokens/train_per_sec_per_gpu": 3308.9, "tokens/trainable": 8490972 }, { "epoch": 1.9426751592356688, "grad_norm": 0.1875, "learning_rate": 3.840317797054882e-05, "loss": 0.009210377931594849, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00925, "step": 610, "tokens/total": 79896576, "tokens/train_per_sec_per_gpu": 3480.61, "tokens/trainable": 8505449 }, { "epoch": 1.9458598726114649, "grad_norm": 0.1650390625, "learning_rate": 3.83562245803762e-05, "loss": 0.008728603832423687, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00877, "step": 611, "tokens/total": 80027648, "tokens/train_per_sec_per_gpu": 3432.92, "tokens/trainable": 8519873 }, { "epoch": 1.9490445859872612, "grad_norm": 0.2119140625, "learning_rate": 3.830920516664085e-05, "loss": 0.00592332798987627, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00594, "step": 612, "tokens/total": 80158720, "tokens/train_per_sec_per_gpu": 2869.74, "tokens/trainable": 8531911 }, { "epoch": 1.9522292993630574, "grad_norm": 0.162109375, "learning_rate": 3.826211996177291e-05, "loss": 0.00876440480351448, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0088, "step": 613, "tokens/total": 80289792, "tokens/train_per_sec_per_gpu": 3232.65, "tokens/trainable": 8545475 }, { "epoch": 1.9554140127388535, "grad_norm": 0.173828125, "learning_rate": 3.8214969198527787e-05, "loss": 0.010759076103568077, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.01082, "step": 614, "tokens/total": 80420864, "tokens/train_per_sec_per_gpu": 3370.92, "tokens/trainable": 8559559 }, { "epoch": 1.9585987261146496, "grad_norm": 0.158203125, "learning_rate": 3.8167753109984886e-05, "loss": 0.007340329699218273, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00737, "step": 615, "tokens/total": 80551936, "tokens/train_per_sec_per_gpu": 3167.2, "tokens/trainable": 8572860 }, { "epoch": 1.9617834394904459, "grad_norm": 0.1923828125, "learning_rate": 3.8120471929546576e-05, "loss": 0.009786421433091164, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00983, "step": 616, "tokens/total": 80683008, "tokens/train_per_sec_per_gpu": 3398.38, "tokens/trainable": 8587062 }, { "epoch": 1.9649681528662422, "grad_norm": 0.1591796875, "learning_rate": 3.807312589093701e-05, "loss": 0.007565875072032213, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00759, "step": 617, "tokens/total": 80814080, "tokens/train_per_sec_per_gpu": 3589.95, "tokens/trainable": 8602027 }, { "epoch": 1.9681528662420382, "grad_norm": 0.158203125, "learning_rate": 3.802571522820091e-05, "loss": 0.005681060254573822, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0057, "step": 618, "tokens/total": 80945152, "tokens/train_per_sec_per_gpu": 3097.22, "tokens/trainable": 8615057 }, { "epoch": 1.9713375796178343, "grad_norm": 0.140625, "learning_rate": 3.7978240175702475e-05, "loss": 0.007764302659779787, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00779, "step": 619, "tokens/total": 81076224, "tokens/train_per_sec_per_gpu": 3585.75, "tokens/trainable": 8630164 }, { "epoch": 1.9745222929936306, "grad_norm": 0.1787109375, "learning_rate": 3.7930700968124214e-05, "loss": 0.007851460948586464, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00788, "step": 620, "tokens/total": 81207296, "tokens/train_per_sec_per_gpu": 3407.67, "tokens/trainable": 8644417 }, { "epoch": 1.9777070063694269, "grad_norm": 0.158203125, "learning_rate": 3.788309784046574e-05, "loss": 0.007941392250359058, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00797, "step": 621, "tokens/total": 81338368, "tokens/train_per_sec_per_gpu": 3178.19, "tokens/trainable": 8657791 }, { "epoch": 1.980891719745223, "grad_norm": 0.19921875, "learning_rate": 3.7835431028042664e-05, "loss": 0.008540787734091282, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00858, "step": 622, "tokens/total": 81469440, "tokens/train_per_sec_per_gpu": 3471.65, "tokens/trainable": 8672330 }, { "epoch": 1.984076433121019, "grad_norm": 0.1611328125, "learning_rate": 3.778770076648543e-05, "loss": 0.008716538548469543, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00875, "step": 623, "tokens/total": 81600512, "tokens/train_per_sec_per_gpu": 3219.98, "tokens/trainable": 8685828 }, { "epoch": 1.9872611464968153, "grad_norm": 0.158203125, "learning_rate": 3.773990729173807e-05, "loss": 0.007769486866891384, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0078, "step": 624, "tokens/total": 81731584, "tokens/train_per_sec_per_gpu": 3473.48, "tokens/trainable": 8700372 }, { "epoch": 1.9904458598726116, "grad_norm": 0.1484375, "learning_rate": 3.769205084005714e-05, "loss": 0.008443665690720081, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00848, "step": 625, "tokens/total": 81862656, "tokens/train_per_sec_per_gpu": 3400.54, "tokens/trainable": 8714943 }, { "epoch": 1.9936305732484076, "grad_norm": 0.201171875, "learning_rate": 3.7644131648010494e-05, "loss": 0.009850014001131058, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0099, "step": 626, "tokens/total": 81993728, "tokens/train_per_sec_per_gpu": 3193.43, "tokens/trainable": 8728328 }, { "epoch": 1.9968152866242037, "grad_norm": 0.1416015625, "learning_rate": 3.759614995247615e-05, "loss": 0.0070216236636042595, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00705, "step": 627, "tokens/total": 82124800, "tokens/train_per_sec_per_gpu": 3280.91, "tokens/trainable": 8742135 }, { "epoch": 2.0, "grad_norm": 0.2431640625, "learning_rate": 3.7548105990641055e-05, "loss": 0.008461863733828068, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 39.25, "memory/max_allocated (GiB)": 39.25, "ppl": 1.0085, "step": 628, "tokens/total": 82198528, "tokens/train_per_sec_per_gpu": 3127.15, "tokens/trainable": 8749352 }, { "epoch": 2.0, "eval_loss": 0.00880392361432314, "eval_ppl": 1.00884, "eval_runtime": 41.5789, "eval_samples_per_second": 64.961, "eval_steps_per_second": 4.065, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 628 }, { "epoch": 2.0031847133757963, "grad_norm": 0.1103515625, "learning_rate": 3.7500000000000003e-05, "loss": 0.00489779282361269, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00491, "step": 629, "tokens/total": 82329600, "tokens/train_per_sec_per_gpu": 3227.88, "tokens/trainable": 8762717 }, { "epoch": 2.0063694267515926, "grad_norm": 0.140625, "learning_rate": 3.745183221835439e-05, "loss": 0.0062369704246521, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00626, "step": 630, "tokens/total": 82460672, "tokens/train_per_sec_per_gpu": 3063.9, "tokens/trainable": 8775458 }, { "epoch": 2.0095541401273884, "grad_norm": 0.1015625, "learning_rate": 3.740360288381105e-05, "loss": 0.004345161374658346, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00435, "step": 631, "tokens/total": 82591744, "tokens/train_per_sec_per_gpu": 3775.58, "tokens/trainable": 8791109 }, { "epoch": 2.0127388535031847, "grad_norm": 0.09326171875, "learning_rate": 3.735531223478113e-05, "loss": 0.003698494518175721, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00371, "step": 632, "tokens/total": 82722816, "tokens/train_per_sec_per_gpu": 3064.96, "tokens/trainable": 8803927 }, { "epoch": 2.015923566878981, "grad_norm": 0.1416015625, "learning_rate": 3.730696050997883e-05, "loss": 0.006370588671416044, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00639, "step": 633, "tokens/total": 82853888, "tokens/train_per_sec_per_gpu": 3757.38, "tokens/trainable": 8819539 }, { "epoch": 2.0191082802547773, "grad_norm": 0.12255859375, "learning_rate": 3.725854794842028e-05, "loss": 0.004867184441536665, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00488, "step": 634, "tokens/total": 82984960, "tokens/train_per_sec_per_gpu": 3367.17, "tokens/trainable": 8833596 }, { "epoch": 2.022292993630573, "grad_norm": 0.11669921875, "learning_rate": 3.721007478942236e-05, "loss": 0.005630412604659796, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00565, "step": 635, "tokens/total": 83116032, "tokens/train_per_sec_per_gpu": 3394.97, "tokens/trainable": 8847823 }, { "epoch": 2.0254777070063694, "grad_norm": 0.158203125, "learning_rate": 3.716154127260147e-05, "loss": 0.0077174571342766285, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00775, "step": 636, "tokens/total": 83247104, "tokens/train_per_sec_per_gpu": 3334.91, "tokens/trainable": 8861774 }, { "epoch": 2.0286624203821657, "grad_norm": 0.11376953125, "learning_rate": 3.7112947637872395e-05, "loss": 0.0045103938318789005, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00452, "step": 637, "tokens/total": 83378176, "tokens/train_per_sec_per_gpu": 3076.34, "tokens/trainable": 8874743 }, { "epoch": 2.031847133757962, "grad_norm": 0.126953125, "learning_rate": 3.706429412544711e-05, "loss": 0.005497838370501995, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00551, "step": 638, "tokens/total": 83509248, "tokens/train_per_sec_per_gpu": 3103.99, "tokens/trainable": 8887731 }, { "epoch": 2.035031847133758, "grad_norm": 0.1279296875, "learning_rate": 3.701558097583355e-05, "loss": 0.004869392607361078, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00488, "step": 639, "tokens/total": 83640320, "tokens/train_per_sec_per_gpu": 3107.44, "tokens/trainable": 8900746 }, { "epoch": 2.038216560509554, "grad_norm": 0.13671875, "learning_rate": 3.696680842983447e-05, "loss": 0.006643592845648527, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00667, "step": 640, "tokens/total": 83771392, "tokens/train_per_sec_per_gpu": 3319.13, "tokens/trainable": 8914631 }, { "epoch": 2.0414012738853504, "grad_norm": 0.1376953125, "learning_rate": 3.691797672854625e-05, "loss": 0.005362721625715494, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00538, "step": 641, "tokens/total": 83902464, "tokens/train_per_sec_per_gpu": 3013.92, "tokens/trainable": 8927265 }, { "epoch": 2.0445859872611467, "grad_norm": 0.1357421875, "learning_rate": 3.686908611335768e-05, "loss": 0.005462102126330137, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00548, "step": 642, "tokens/total": 84033536, "tokens/train_per_sec_per_gpu": 3114.98, "tokens/trainable": 8940288 }, { "epoch": 2.0477707006369426, "grad_norm": 0.125, "learning_rate": 3.682013682594876e-05, "loss": 0.0043016825802624226, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00431, "step": 643, "tokens/total": 84164608, "tokens/train_per_sec_per_gpu": 3431.73, "tokens/trainable": 8954653 }, { "epoch": 2.050955414012739, "grad_norm": 0.19140625, "learning_rate": 3.677112910828957e-05, "loss": 0.0066072107292711735, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00663, "step": 644, "tokens/total": 84295680, "tokens/train_per_sec_per_gpu": 2967.06, "tokens/trainable": 8967100 }, { "epoch": 2.054140127388535, "grad_norm": 0.1455078125, "learning_rate": 3.672206320263897e-05, "loss": 0.0054827104322612286, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0055, "step": 645, "tokens/total": 84426752, "tokens/train_per_sec_per_gpu": 3194.77, "tokens/trainable": 8980536 }, { "epoch": 2.0573248407643314, "grad_norm": 0.1513671875, "learning_rate": 3.66729393515435e-05, "loss": 0.005452790763229132, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00547, "step": 646, "tokens/total": 84557824, "tokens/train_per_sec_per_gpu": 3109.68, "tokens/trainable": 8993576 }, { "epoch": 2.0605095541401273, "grad_norm": 0.181640625, "learning_rate": 3.662375779783614e-05, "loss": 0.0072727687656879425, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0073, "step": 647, "tokens/total": 84688896, "tokens/train_per_sec_per_gpu": 3147.5, "tokens/trainable": 9006855 }, { "epoch": 2.0636942675159236, "grad_norm": 0.1064453125, "learning_rate": 3.657451878463508e-05, "loss": 0.003491069655865431, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0035, "step": 648, "tokens/total": 84819968, "tokens/train_per_sec_per_gpu": 3224.99, "tokens/trainable": 9020369 }, { "epoch": 2.06687898089172, "grad_norm": 0.1455078125, "learning_rate": 3.652522255534258e-05, "loss": 0.005467304494231939, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00548, "step": 649, "tokens/total": 84951040, "tokens/train_per_sec_per_gpu": 3640.83, "tokens/trainable": 9035605 }, { "epoch": 2.070063694267516, "grad_norm": 0.1337890625, "learning_rate": 3.647586935364372e-05, "loss": 0.004504749551415443, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00451, "step": 650, "tokens/total": 85082112, "tokens/train_per_sec_per_gpu": 3394.33, "tokens/trainable": 9049828 }, { "epoch": 2.073248407643312, "grad_norm": 0.1787109375, "learning_rate": 3.6426459423505214e-05, "loss": 0.007018570322543383, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00704, "step": 651, "tokens/total": 85213184, "tokens/train_per_sec_per_gpu": 2787.07, "tokens/trainable": 9061509 }, { "epoch": 2.0764331210191083, "grad_norm": 0.12890625, "learning_rate": 3.637699300917418e-05, "loss": 0.005671660415828228, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00569, "step": 652, "tokens/total": 85344256, "tokens/train_per_sec_per_gpu": 3667.49, "tokens/trainable": 9076828 }, { "epoch": 2.0796178343949046, "grad_norm": 0.1455078125, "learning_rate": 3.632747035517701e-05, "loss": 0.005398279055953026, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00541, "step": 653, "tokens/total": 85475328, "tokens/train_per_sec_per_gpu": 3551.7, "tokens/trainable": 9091646 }, { "epoch": 2.082802547770701, "grad_norm": 0.2099609375, "learning_rate": 3.6277891706318036e-05, "loss": 0.007613079622387886, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00764, "step": 654, "tokens/total": 85606400, "tokens/train_per_sec_per_gpu": 3571.21, "tokens/trainable": 9106545 }, { "epoch": 2.0859872611464967, "grad_norm": 0.1640625, "learning_rate": 3.622825730767842e-05, "loss": 0.0069300332106649876, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00695, "step": 655, "tokens/total": 85737472, "tokens/train_per_sec_per_gpu": 3786.4, "tokens/trainable": 9122295 }, { "epoch": 2.089171974522293, "grad_norm": 0.19140625, "learning_rate": 3.6178567404614936e-05, "loss": 0.006750217638909817, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00677, "step": 656, "tokens/total": 85868544, "tokens/train_per_sec_per_gpu": 3589.29, "tokens/trainable": 9137329 }, { "epoch": 2.0923566878980893, "grad_norm": 0.162109375, "learning_rate": 3.6128822242758686e-05, "loss": 0.0060827480629086494, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0061, "step": 657, "tokens/total": 85999616, "tokens/train_per_sec_per_gpu": 3096.84, "tokens/trainable": 9150353 }, { "epoch": 2.0955414012738856, "grad_norm": 0.1337890625, "learning_rate": 3.6079022068013945e-05, "loss": 0.006425363477319479, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00645, "step": 658, "tokens/total": 86130688, "tokens/train_per_sec_per_gpu": 3687.66, "tokens/trainable": 9165791 }, { "epoch": 2.0987261146496814, "grad_norm": 0.13671875, "learning_rate": 3.602916712655697e-05, "loss": 0.004524726886302233, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00453, "step": 659, "tokens/total": 86261760, "tokens/train_per_sec_per_gpu": 3224.45, "tokens/trainable": 9179333 }, { "epoch": 2.1019108280254777, "grad_norm": 0.1806640625, "learning_rate": 3.597925766483468e-05, "loss": 0.008739529177546501, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00878, "step": 660, "tokens/total": 86392832, "tokens/train_per_sec_per_gpu": 3380.82, "tokens/trainable": 9193503 }, { "epoch": 2.105095541401274, "grad_norm": 0.125, "learning_rate": 3.592929392956355e-05, "loss": 0.003972796723246574, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00398, "step": 661, "tokens/total": 86523904, "tokens/train_per_sec_per_gpu": 3337.39, "tokens/trainable": 9207523 }, { "epoch": 2.1082802547770703, "grad_norm": 0.1435546875, "learning_rate": 3.587927616772834e-05, "loss": 0.00485801137983799, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00487, "step": 662, "tokens/total": 86654976, "tokens/train_per_sec_per_gpu": 3418.39, "tokens/trainable": 9221801 }, { "epoch": 2.111464968152866, "grad_norm": 0.14453125, "learning_rate": 3.5829204626580856e-05, "loss": 0.005488412454724312, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0055, "step": 663, "tokens/total": 86786048, "tokens/train_per_sec_per_gpu": 3308.78, "tokens/trainable": 9235658 }, { "epoch": 2.1146496815286624, "grad_norm": 0.1708984375, "learning_rate": 3.577907955363877e-05, "loss": 0.007495546247810125, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00752, "step": 664, "tokens/total": 86917120, "tokens/train_per_sec_per_gpu": 3508.8, "tokens/trainable": 9250377 }, { "epoch": 2.1178343949044587, "grad_norm": 0.185546875, "learning_rate": 3.572890119668439e-05, "loss": 0.007228251546621323, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00725, "step": 665, "tokens/total": 87048192, "tokens/train_per_sec_per_gpu": 3480.87, "tokens/trainable": 9264939 }, { "epoch": 2.121019108280255, "grad_norm": 0.1396484375, "learning_rate": 3.567866980376337e-05, "loss": 0.005014233291149139, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00503, "step": 666, "tokens/total": 87179264, "tokens/train_per_sec_per_gpu": 3039.74, "tokens/trainable": 9277680 }, { "epoch": 2.124203821656051, "grad_norm": 0.16015625, "learning_rate": 3.562838562318358e-05, "loss": 0.004775107838213444, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00479, "step": 667, "tokens/total": 87310336, "tokens/train_per_sec_per_gpu": 3171.11, "tokens/trainable": 9291025 }, { "epoch": 2.127388535031847, "grad_norm": 0.146484375, "learning_rate": 3.557804890351383e-05, "loss": 0.006139194592833519, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00616, "step": 668, "tokens/total": 87441408, "tokens/train_per_sec_per_gpu": 3193.18, "tokens/trainable": 9304405 }, { "epoch": 2.1305732484076434, "grad_norm": 0.1279296875, "learning_rate": 3.5527659893582635e-05, "loss": 0.004298456013202667, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00431, "step": 669, "tokens/total": 87572480, "tokens/train_per_sec_per_gpu": 3227.18, "tokens/trainable": 9317913 }, { "epoch": 2.1337579617834397, "grad_norm": 0.162109375, "learning_rate": 3.547721884247699e-05, "loss": 0.005037225782871246, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00505, "step": 670, "tokens/total": 87703552, "tokens/train_per_sec_per_gpu": 3102.83, "tokens/trainable": 9331033 }, { "epoch": 2.1369426751592355, "grad_norm": 0.1640625, "learning_rate": 3.5426725999541174e-05, "loss": 0.005763325374573469, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00578, "step": 671, "tokens/total": 87834624, "tokens/train_per_sec_per_gpu": 3489.71, "tokens/trainable": 9345625 }, { "epoch": 2.140127388535032, "grad_norm": 0.1591796875, "learning_rate": 3.5376181614375436e-05, "loss": 0.005982933100312948, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.006, "step": 672, "tokens/total": 87965696, "tokens/train_per_sec_per_gpu": 3141.99, "tokens/trainable": 9358787 }, { "epoch": 2.143312101910828, "grad_norm": 0.140625, "learning_rate": 3.532558593683486e-05, "loss": 0.005526629742234945, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00554, "step": 673, "tokens/total": 88096768, "tokens/train_per_sec_per_gpu": 3602.99, "tokens/trainable": 9373882 }, { "epoch": 2.1464968152866244, "grad_norm": 0.11767578125, "learning_rate": 3.527493921702807e-05, "loss": 0.0037272945046424866, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00373, "step": 674, "tokens/total": 88227840, "tokens/train_per_sec_per_gpu": 3350.15, "tokens/trainable": 9387904 }, { "epoch": 2.1496815286624202, "grad_norm": 0.158203125, "learning_rate": 3.5224241705316e-05, "loss": 0.006022762041538954, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00604, "step": 675, "tokens/total": 88358912, "tokens/train_per_sec_per_gpu": 3348.71, "tokens/trainable": 9401921 }, { "epoch": 2.1528662420382165, "grad_norm": 0.138671875, "learning_rate": 3.517349365231065e-05, "loss": 0.005744612775743008, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00576, "step": 676, "tokens/total": 88489984, "tokens/train_per_sec_per_gpu": 3430.41, "tokens/trainable": 9416291 }, { "epoch": 2.156050955414013, "grad_norm": 0.1484375, "learning_rate": 3.5122695308873886e-05, "loss": 0.005131675861775875, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00514, "step": 677, "tokens/total": 88621056, "tokens/train_per_sec_per_gpu": 3279.37, "tokens/trainable": 9430037 }, { "epoch": 2.159235668789809, "grad_norm": 0.1806640625, "learning_rate": 3.5071846926116156e-05, "loss": 0.007699973881244659, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00773, "step": 678, "tokens/total": 88752128, "tokens/train_per_sec_per_gpu": 3222.97, "tokens/trainable": 9443541 }, { "epoch": 2.162420382165605, "grad_norm": 0.142578125, "learning_rate": 3.502094875539528e-05, "loss": 0.004470378626137972, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00448, "step": 679, "tokens/total": 88883200, "tokens/train_per_sec_per_gpu": 3648.92, "tokens/trainable": 9458725 }, { "epoch": 2.1656050955414012, "grad_norm": 0.1982421875, "learning_rate": 3.497000104831518e-05, "loss": 0.00871230848133564, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00875, "step": 680, "tokens/total": 89014272, "tokens/train_per_sec_per_gpu": 3137.55, "tokens/trainable": 9471880 }, { "epoch": 2.1687898089171975, "grad_norm": 0.130859375, "learning_rate": 3.491900405672466e-05, "loss": 0.0037058612797409296, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00371, "step": 681, "tokens/total": 89145344, "tokens/train_per_sec_per_gpu": 3191.78, "tokens/trainable": 9485245 }, { "epoch": 2.171974522292994, "grad_norm": 0.1318359375, "learning_rate": 3.486795803271614e-05, "loss": 0.004613788798451424, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00462, "step": 682, "tokens/total": 89276416, "tokens/train_per_sec_per_gpu": 3499.78, "tokens/trainable": 9499844 }, { "epoch": 2.1751592356687897, "grad_norm": 0.12109375, "learning_rate": 3.481686322862443e-05, "loss": 0.003956732805818319, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00396, "step": 683, "tokens/total": 89407488, "tokens/train_per_sec_per_gpu": 3088.9, "tokens/trainable": 9512840 }, { "epoch": 2.178343949044586, "grad_norm": 0.1611328125, "learning_rate": 3.476571989702548e-05, "loss": 0.006073053926229477, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00609, "step": 684, "tokens/total": 89538560, "tokens/train_per_sec_per_gpu": 3425.79, "tokens/trainable": 9527160 }, { "epoch": 2.1815286624203822, "grad_norm": 0.2412109375, "learning_rate": 3.4714528290735105e-05, "loss": 0.005430576391518116, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00545, "step": 685, "tokens/total": 89669632, "tokens/train_per_sec_per_gpu": 3295.01, "tokens/trainable": 9540964 }, { "epoch": 2.1847133757961785, "grad_norm": 0.10205078125, "learning_rate": 3.466328866280778e-05, "loss": 0.003143883775919676, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00315, "step": 686, "tokens/total": 89800704, "tokens/train_per_sec_per_gpu": 3469.02, "tokens/trainable": 9555485 }, { "epoch": 2.1878980891719744, "grad_norm": 0.1328125, "learning_rate": 3.4612001266535345e-05, "loss": 0.005530213471502066, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00555, "step": 687, "tokens/total": 89931776, "tokens/train_per_sec_per_gpu": 3563.1, "tokens/trainable": 9570400 }, { "epoch": 2.1910828025477707, "grad_norm": 0.1357421875, "learning_rate": 3.456066635544577e-05, "loss": 0.004905232228338718, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00492, "step": 688, "tokens/total": 90062848, "tokens/train_per_sec_per_gpu": 3391.03, "tokens/trainable": 9584608 }, { "epoch": 2.194267515923567, "grad_norm": 0.146484375, "learning_rate": 3.450928418330193e-05, "loss": 0.006313517689704895, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00633, "step": 689, "tokens/total": 90193920, "tokens/train_per_sec_per_gpu": 3288.42, "tokens/trainable": 9598448 }, { "epoch": 2.1974522292993632, "grad_norm": 0.1201171875, "learning_rate": 3.44578550041003e-05, "loss": 0.0040069082751870155, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00401, "step": 690, "tokens/total": 90324992, "tokens/train_per_sec_per_gpu": 3679.5, "tokens/trainable": 9613777 }, { "epoch": 2.200636942675159, "grad_norm": 0.1396484375, "learning_rate": 3.440637907206973e-05, "loss": 0.0068097589537501335, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00683, "step": 691, "tokens/total": 90456064, "tokens/train_per_sec_per_gpu": 3546.91, "tokens/trainable": 9628632 }, { "epoch": 2.2038216560509554, "grad_norm": 0.1357421875, "learning_rate": 3.435485664167019e-05, "loss": 0.004060130566358566, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00407, "step": 692, "tokens/total": 90587136, "tokens/train_per_sec_per_gpu": 3218.86, "tokens/trainable": 9642131 }, { "epoch": 2.2070063694267517, "grad_norm": 0.1923828125, "learning_rate": 3.4303287967591484e-05, "loss": 0.008195128291845322, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00823, "step": 693, "tokens/total": 90718208, "tokens/train_per_sec_per_gpu": 3409.39, "tokens/trainable": 9656400 }, { "epoch": 2.210191082802548, "grad_norm": 0.1796875, "learning_rate": 3.425167330475205e-05, "loss": 0.0061119189485907555, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00613, "step": 694, "tokens/total": 90849280, "tokens/train_per_sec_per_gpu": 3495.48, "tokens/trainable": 9670962 }, { "epoch": 2.213375796178344, "grad_norm": 0.13671875, "learning_rate": 3.420001290829761e-05, "loss": 0.004308244213461876, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00432, "step": 695, "tokens/total": 90980352, "tokens/train_per_sec_per_gpu": 3283.37, "tokens/trainable": 9684728 }, { "epoch": 2.21656050955414, "grad_norm": 0.1494140625, "learning_rate": 3.4148307033600014e-05, "loss": 0.006343189161270857, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00636, "step": 696, "tokens/total": 91111424, "tokens/train_per_sec_per_gpu": 3576.41, "tokens/trainable": 9699704 }, { "epoch": 2.2197452229299364, "grad_norm": 0.1533203125, "learning_rate": 3.409655593625587e-05, "loss": 0.006463784724473953, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00648, "step": 697, "tokens/total": 91242496, "tokens/train_per_sec_per_gpu": 3253.16, "tokens/trainable": 9713318 }, { "epoch": 2.2229299363057327, "grad_norm": 0.12451171875, "learning_rate": 3.404475987208539e-05, "loss": 0.0030284496024250984, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00303, "step": 698, "tokens/total": 91373568, "tokens/train_per_sec_per_gpu": 3353.96, "tokens/trainable": 9727366 }, { "epoch": 2.2261146496815285, "grad_norm": 0.1357421875, "learning_rate": 3.399291909713101e-05, "loss": 0.004884797614067793, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0049, "step": 699, "tokens/total": 91504640, "tokens/train_per_sec_per_gpu": 3792.66, "tokens/trainable": 9743134 }, { "epoch": 2.229299363057325, "grad_norm": 0.1875, "learning_rate": 3.394103386765625e-05, "loss": 0.005894185043871403, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00591, "step": 700, "tokens/total": 91635712, "tokens/train_per_sec_per_gpu": 2956.26, "tokens/trainable": 9755576 }, { "epoch": 2.232484076433121, "grad_norm": 0.1533203125, "learning_rate": 3.388910444014432e-05, "loss": 0.0050967601127922535, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00511, "step": 701, "tokens/total": 91766784, "tokens/train_per_sec_per_gpu": 3181.89, "tokens/trainable": 9768924 }, { "epoch": 2.2356687898089174, "grad_norm": 0.1357421875, "learning_rate": 3.3837131071296945e-05, "loss": 0.004923132713884115, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00494, "step": 702, "tokens/total": 91897856, "tokens/train_per_sec_per_gpu": 3211.71, "tokens/trainable": 9782384 }, { "epoch": 2.238853503184713, "grad_norm": 0.1416015625, "learning_rate": 3.378511401803307e-05, "loss": 0.005397360771894455, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00541, "step": 703, "tokens/total": 92028928, "tokens/train_per_sec_per_gpu": 3335.08, "tokens/trainable": 9796325 }, { "epoch": 2.2420382165605095, "grad_norm": 0.1376953125, "learning_rate": 3.373305353748755e-05, "loss": 0.004327027127146721, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00434, "step": 704, "tokens/total": 92160000, "tokens/train_per_sec_per_gpu": 3315.25, "tokens/trainable": 9810212 }, { "epoch": 2.245222929936306, "grad_norm": 0.15625, "learning_rate": 3.368094988700996e-05, "loss": 0.007469909265637398, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0075, "step": 705, "tokens/total": 92291072, "tokens/train_per_sec_per_gpu": 3489.88, "tokens/trainable": 9824826 }, { "epoch": 2.248407643312102, "grad_norm": 0.1513671875, "learning_rate": 3.3628803324163236e-05, "loss": 0.005583882797509432, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0056, "step": 706, "tokens/total": 92422144, "tokens/train_per_sec_per_gpu": 3391.08, "tokens/trainable": 9839023 }, { "epoch": 2.251592356687898, "grad_norm": 0.1298828125, "learning_rate": 3.357661410672247e-05, "loss": 0.004044718574732542, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00405, "step": 707, "tokens/total": 92553216, "tokens/train_per_sec_per_gpu": 3547.4, "tokens/trainable": 9853877 }, { "epoch": 2.254777070063694, "grad_norm": 0.1552734375, "learning_rate": 3.352438249267359e-05, "loss": 0.005919166840612888, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00594, "step": 708, "tokens/total": 92684288, "tokens/train_per_sec_per_gpu": 3416.57, "tokens/trainable": 9868162 }, { "epoch": 2.2579617834394905, "grad_norm": 0.150390625, "learning_rate": 3.347210874021211e-05, "loss": 0.005268896464258432, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00528, "step": 709, "tokens/total": 92815360, "tokens/train_per_sec_per_gpu": 3315.44, "tokens/trainable": 9882010 }, { "epoch": 2.261146496815287, "grad_norm": 0.1630859375, "learning_rate": 3.3419793107741834e-05, "loss": 0.0063535538502037525, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00637, "step": 710, "tokens/total": 92946432, "tokens/train_per_sec_per_gpu": 3215.77, "tokens/trainable": 9895483 }, { "epoch": 2.2643312101910826, "grad_norm": 0.11865234375, "learning_rate": 3.336743585387362e-05, "loss": 0.0036360113881528378, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00364, "step": 711, "tokens/total": 93077504, "tokens/train_per_sec_per_gpu": 3574.66, "tokens/trainable": 9910386 }, { "epoch": 2.267515923566879, "grad_norm": 0.1552734375, "learning_rate": 3.3315037237424036e-05, "loss": 0.0054854946210980415, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0055, "step": 712, "tokens/total": 93208576, "tokens/train_per_sec_per_gpu": 3491.98, "tokens/trainable": 9924935 }, { "epoch": 2.270700636942675, "grad_norm": 0.1884765625, "learning_rate": 3.326259751741414e-05, "loss": 0.0039428528398275375, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00395, "step": 713, "tokens/total": 93339648, "tokens/train_per_sec_per_gpu": 3219.92, "tokens/trainable": 9938434 }, { "epoch": 2.2738853503184715, "grad_norm": 0.169921875, "learning_rate": 3.321011695306818e-05, "loss": 0.007426953874528408, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00745, "step": 714, "tokens/total": 93470720, "tokens/train_per_sec_per_gpu": 3424.44, "tokens/trainable": 9952758 }, { "epoch": 2.2770700636942673, "grad_norm": 0.1787109375, "learning_rate": 3.315759580381228e-05, "loss": 0.006136072333902121, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00615, "step": 715, "tokens/total": 93601792, "tokens/train_per_sec_per_gpu": 3058.02, "tokens/trainable": 9965569 }, { "epoch": 2.2802547770700636, "grad_norm": 0.140625, "learning_rate": 3.310503432927322e-05, "loss": 0.004970425274223089, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00498, "step": 716, "tokens/total": 93732864, "tokens/train_per_sec_per_gpu": 3320.74, "tokens/trainable": 9979461 }, { "epoch": 2.28343949044586, "grad_norm": 0.201171875, "learning_rate": 3.305243278927711e-05, "loss": 0.006117875222116709, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00614, "step": 717, "tokens/total": 93863936, "tokens/train_per_sec_per_gpu": 3419.81, "tokens/trainable": 9993781 }, { "epoch": 2.286624203821656, "grad_norm": 0.1474609375, "learning_rate": 3.299979144384808e-05, "loss": 0.005094599910080433, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00511, "step": 718, "tokens/total": 93995008, "tokens/train_per_sec_per_gpu": 3621.22, "tokens/trainable": 10008873 }, { "epoch": 2.289808917197452, "grad_norm": 0.150390625, "learning_rate": 3.29471105532071e-05, "loss": 0.005003094207495451, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00502, "step": 719, "tokens/total": 94126080, "tokens/train_per_sec_per_gpu": 3296.22, "tokens/trainable": 10022678 }, { "epoch": 2.2929936305732483, "grad_norm": 0.16015625, "learning_rate": 3.2894390377770556e-05, "loss": 0.005475780460983515, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00549, "step": 720, "tokens/total": 94257152, "tokens/train_per_sec_per_gpu": 3130.91, "tokens/trainable": 10035849 }, { "epoch": 2.2961783439490446, "grad_norm": 0.1796875, "learning_rate": 3.284163117814906e-05, "loss": 0.005412337835878134, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00543, "step": 721, "tokens/total": 94388224, "tokens/train_per_sec_per_gpu": 3388.52, "tokens/trainable": 10050035 }, { "epoch": 2.299363057324841, "grad_norm": 0.15625, "learning_rate": 3.278883321514613e-05, "loss": 0.005983334966003895, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.006, "step": 722, "tokens/total": 94519296, "tokens/train_per_sec_per_gpu": 3388.95, "tokens/trainable": 10064219 }, { "epoch": 2.3025477707006368, "grad_norm": 0.1865234375, "learning_rate": 3.27359967497569e-05, "loss": 0.006622286047786474, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00664, "step": 723, "tokens/total": 94650368, "tokens/train_per_sec_per_gpu": 3043.49, "tokens/trainable": 10077049 }, { "epoch": 2.305732484076433, "grad_norm": 0.15234375, "learning_rate": 3.268312204316684e-05, "loss": 0.005963774397969246, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00598, "step": 724, "tokens/total": 94781440, "tokens/train_per_sec_per_gpu": 3577.18, "tokens/trainable": 10091953 }, { "epoch": 2.3089171974522293, "grad_norm": 0.15625, "learning_rate": 3.263020935675043e-05, "loss": 0.003999189008027315, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00401, "step": 725, "tokens/total": 94912512, "tokens/train_per_sec_per_gpu": 3227.08, "tokens/trainable": 10105451 }, { "epoch": 2.3121019108280256, "grad_norm": 0.1337890625, "learning_rate": 3.2577258952069934e-05, "loss": 0.0032455208711326122, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00325, "step": 726, "tokens/total": 95043584, "tokens/train_per_sec_per_gpu": 3151.03, "tokens/trainable": 10118668 }, { "epoch": 2.3152866242038215, "grad_norm": 0.1611328125, "learning_rate": 3.252427109087403e-05, "loss": 0.004745165351778269, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00476, "step": 727, "tokens/total": 95174656, "tokens/train_per_sec_per_gpu": 3383.32, "tokens/trainable": 10132813 }, { "epoch": 2.3184713375796178, "grad_norm": 0.1630859375, "learning_rate": 3.247124603509659e-05, "loss": 0.004897519946098328, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00491, "step": 728, "tokens/total": 95305728, "tokens/train_per_sec_per_gpu": 3389.91, "tokens/trainable": 10147011 }, { "epoch": 2.321656050955414, "grad_norm": 0.1396484375, "learning_rate": 3.241818404685531e-05, "loss": 0.0032559458632022142, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00326, "step": 729, "tokens/total": 95436800, "tokens/train_per_sec_per_gpu": 3355.7, "tokens/trainable": 10161054 }, { "epoch": 2.3248407643312103, "grad_norm": 0.2001953125, "learning_rate": 3.236508538845049e-05, "loss": 0.007957718335092068, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00799, "step": 730, "tokens/total": 95567872, "tokens/train_per_sec_per_gpu": 3374.92, "tokens/trainable": 10175261 }, { "epoch": 2.328025477707006, "grad_norm": 0.1552734375, "learning_rate": 3.2311950322363685e-05, "loss": 0.004248796030879021, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00426, "step": 731, "tokens/total": 95698944, "tokens/train_per_sec_per_gpu": 2920.63, "tokens/trainable": 10187549 }, { "epoch": 2.3312101910828025, "grad_norm": 0.1689453125, "learning_rate": 3.225877911125642e-05, "loss": 0.0069992574863135815, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00702, "step": 732, "tokens/total": 95830016, "tokens/train_per_sec_per_gpu": 3329.17, "tokens/trainable": 10201476 }, { "epoch": 2.3343949044585988, "grad_norm": 0.142578125, "learning_rate": 3.2205572017968895e-05, "loss": 0.0038090457674115896, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00382, "step": 733, "tokens/total": 95961088, "tokens/train_per_sec_per_gpu": 3853.91, "tokens/trainable": 10217517 }, { "epoch": 2.337579617834395, "grad_norm": 0.2041015625, "learning_rate": 3.21523293055187e-05, "loss": 0.005244470667093992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00526, "step": 734, "tokens/total": 96092160, "tokens/train_per_sec_per_gpu": 3372.34, "tokens/trainable": 10231610 }, { "epoch": 2.340764331210191, "grad_norm": 0.224609375, "learning_rate": 3.2099051237099475e-05, "loss": 0.007509202696382999, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00754, "step": 735, "tokens/total": 96223232, "tokens/train_per_sec_per_gpu": 3226.63, "tokens/trainable": 10245132 }, { "epoch": 2.343949044585987, "grad_norm": 0.1396484375, "learning_rate": 3.204573807607967e-05, "loss": 0.004419627133756876, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00443, "step": 736, "tokens/total": 96354304, "tokens/train_per_sec_per_gpu": 3439.56, "tokens/trainable": 10259460 }, { "epoch": 2.3471337579617835, "grad_norm": 0.125, "learning_rate": 3.199239008600117e-05, "loss": 0.0039891735650599, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.004, "step": 737, "tokens/total": 96485376, "tokens/train_per_sec_per_gpu": 3495.62, "tokens/trainable": 10274090 }, { "epoch": 2.3503184713375798, "grad_norm": 0.1572265625, "learning_rate": 3.193900753057805e-05, "loss": 0.004535307642072439, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00455, "step": 738, "tokens/total": 96616448, "tokens/train_per_sec_per_gpu": 3314.96, "tokens/trainable": 10287987 }, { "epoch": 2.3535031847133756, "grad_norm": 0.1552734375, "learning_rate": 3.188559067369525e-05, "loss": 0.004258223343640566, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00427, "step": 739, "tokens/total": 96747520, "tokens/train_per_sec_per_gpu": 3314.93, "tokens/trainable": 10301858 }, { "epoch": 2.356687898089172, "grad_norm": 0.1611328125, "learning_rate": 3.183213977940726e-05, "loss": 0.00545046990737319, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00547, "step": 740, "tokens/total": 96878592, "tokens/train_per_sec_per_gpu": 3349.75, "tokens/trainable": 10315882 }, { "epoch": 2.359872611464968, "grad_norm": 0.255859375, "learning_rate": 3.1778655111936866e-05, "loss": 0.005119058303534985, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00513, "step": 741, "tokens/total": 97009664, "tokens/train_per_sec_per_gpu": 3190.1, "tokens/trainable": 10329249 }, { "epoch": 2.3630573248407645, "grad_norm": 0.150390625, "learning_rate": 3.172513693567375e-05, "loss": 0.004317954182624817, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00433, "step": 742, "tokens/total": 97140736, "tokens/train_per_sec_per_gpu": 3570.82, "tokens/trainable": 10344194 }, { "epoch": 2.3662420382165603, "grad_norm": 0.1494140625, "learning_rate": 3.167158551517326e-05, "loss": 0.004607304465025663, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00462, "step": 743, "tokens/total": 97271808, "tokens/train_per_sec_per_gpu": 2783.4, "tokens/trainable": 10355961 }, { "epoch": 2.3694267515923566, "grad_norm": 0.185546875, "learning_rate": 3.1618001115155095e-05, "loss": 0.00533033162355423, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00534, "step": 744, "tokens/total": 97402880, "tokens/train_per_sec_per_gpu": 3466.49, "tokens/trainable": 10370470 }, { "epoch": 2.372611464968153, "grad_norm": 0.154296875, "learning_rate": 3.1564384000501954e-05, "loss": 0.003959702793508768, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00397, "step": 745, "tokens/total": 97533952, "tokens/train_per_sec_per_gpu": 3617.49, "tokens/trainable": 10385521 }, { "epoch": 2.375796178343949, "grad_norm": 0.166015625, "learning_rate": 3.151073443625828e-05, "loss": 0.006154323928058147, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00617, "step": 746, "tokens/total": 97665024, "tokens/train_per_sec_per_gpu": 3457.18, "tokens/trainable": 10400021 }, { "epoch": 2.3789808917197455, "grad_norm": 0.1748046875, "learning_rate": 3.1457052687628905e-05, "loss": 0.0052504888735711575, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00526, "step": 747, "tokens/total": 97796096, "tokens/train_per_sec_per_gpu": 2728.56, "tokens/trainable": 10411548 }, { "epoch": 2.3821656050955413, "grad_norm": 0.1435546875, "learning_rate": 3.140333901997776e-05, "loss": 0.004432502668350935, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00444, "step": 748, "tokens/total": 97927168, "tokens/train_per_sec_per_gpu": 3271.0, "tokens/trainable": 10425216 }, { "epoch": 2.3853503184713376, "grad_norm": 0.1865234375, "learning_rate": 3.1349593698826566e-05, "loss": 0.006921032909303904, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00695, "step": 749, "tokens/total": 98058240, "tokens/train_per_sec_per_gpu": 3261.31, "tokens/trainable": 10438873 }, { "epoch": 2.388535031847134, "grad_norm": 0.216796875, "learning_rate": 3.1295816989853514e-05, "loss": 0.004738848190754652, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00475, "step": 750, "tokens/total": 98189312, "tokens/train_per_sec_per_gpu": 3321.82, "tokens/trainable": 10452755 }, { "epoch": 2.3917197452229297, "grad_norm": 0.1708984375, "learning_rate": 3.124200915889195e-05, "loss": 0.0069868722930550575, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00701, "step": 751, "tokens/total": 98320384, "tokens/train_per_sec_per_gpu": 3436.55, "tokens/trainable": 10467133 }, { "epoch": 2.394904458598726, "grad_norm": 0.1142578125, "learning_rate": 3.118817047192907e-05, "loss": 0.0037817361298948526, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00379, "step": 752, "tokens/total": 98451456, "tokens/train_per_sec_per_gpu": 3295.93, "tokens/trainable": 10480957 }, { "epoch": 2.3980891719745223, "grad_norm": 0.2109375, "learning_rate": 3.11343011951046e-05, "loss": 0.006747876293957233, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00677, "step": 753, "tokens/total": 98582528, "tokens/train_per_sec_per_gpu": 3181.77, "tokens/trainable": 10494276 }, { "epoch": 2.4012738853503186, "grad_norm": 0.1650390625, "learning_rate": 3.108040159470949e-05, "loss": 0.005729879718273878, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00575, "step": 754, "tokens/total": 98713600, "tokens/train_per_sec_per_gpu": 3542.0, "tokens/trainable": 10509041 }, { "epoch": 2.404458598726115, "grad_norm": 0.193359375, "learning_rate": 3.1026471937184554e-05, "loss": 0.005885195918381214, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0059, "step": 755, "tokens/total": 98844672, "tokens/train_per_sec_per_gpu": 3159.44, "tokens/trainable": 10522288 }, { "epoch": 2.4076433121019107, "grad_norm": 0.1630859375, "learning_rate": 3.097251248911922e-05, "loss": 0.005482829641550779, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0055, "step": 756, "tokens/total": 98975744, "tokens/train_per_sec_per_gpu": 3443.41, "tokens/trainable": 10536684 }, { "epoch": 2.410828025477707, "grad_norm": 0.150390625, "learning_rate": 3.091852351725018e-05, "loss": 0.003930831328034401, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00394, "step": 757, "tokens/total": 99106816, "tokens/train_per_sec_per_gpu": 3398.83, "tokens/trainable": 10550908 }, { "epoch": 2.4140127388535033, "grad_norm": 0.1708984375, "learning_rate": 3.0864505288460034e-05, "loss": 0.006072892341762781, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00609, "step": 758, "tokens/total": 99237888, "tokens/train_per_sec_per_gpu": 3411.03, "tokens/trainable": 10565215 }, { "epoch": 2.417197452229299, "grad_norm": 0.166015625, "learning_rate": 3.0810458069776044e-05, "loss": 0.0038501895032823086, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00386, "step": 759, "tokens/total": 99368960, "tokens/train_per_sec_per_gpu": 3448.42, "tokens/trainable": 10579654 }, { "epoch": 2.4203821656050954, "grad_norm": 0.1796875, "learning_rate": 3.0756382128368765e-05, "loss": 0.006182640325278044, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0062, "step": 760, "tokens/total": 99500032, "tokens/train_per_sec_per_gpu": 3253.44, "tokens/trainable": 10593291 }, { "epoch": 2.4235668789808917, "grad_norm": 0.1767578125, "learning_rate": 3.070227773155074e-05, "loss": 0.0059751239605247974, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00599, "step": 761, "tokens/total": 99631104, "tokens/train_per_sec_per_gpu": 3587.67, "tokens/trainable": 10608279 }, { "epoch": 2.426751592356688, "grad_norm": 0.1513671875, "learning_rate": 3.064814514677517e-05, "loss": 0.005476124584674835, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00549, "step": 762, "tokens/total": 99762176, "tokens/train_per_sec_per_gpu": 3337.14, "tokens/trainable": 10622276 }, { "epoch": 2.4299363057324843, "grad_norm": 0.2216796875, "learning_rate": 3.0593984641634595e-05, "loss": 0.007891716435551643, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00792, "step": 763, "tokens/total": 99893248, "tokens/train_per_sec_per_gpu": 2999.6, "tokens/trainable": 10634845 }, { "epoch": 2.43312101910828, "grad_norm": 0.130859375, "learning_rate": 3.053979648385957e-05, "loss": 0.004688839428126812, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0047, "step": 764, "tokens/total": 100024320, "tokens/train_per_sec_per_gpu": 3484.41, "tokens/trainable": 10649410 }, { "epoch": 2.4363057324840764, "grad_norm": 0.150390625, "learning_rate": 3.048558094131737e-05, "loss": 0.004935243632644415, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00495, "step": 765, "tokens/total": 100155392, "tokens/train_per_sec_per_gpu": 3011.34, "tokens/trainable": 10662117 }, { "epoch": 2.4394904458598727, "grad_norm": 0.1689453125, "learning_rate": 3.0431338282010606e-05, "loss": 0.004069786984473467, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00408, "step": 766, "tokens/total": 100286464, "tokens/train_per_sec_per_gpu": 3252.38, "tokens/trainable": 10675770 }, { "epoch": 2.4426751592356686, "grad_norm": 0.16796875, "learning_rate": 3.0377068774075957e-05, "loss": 0.005909848026931286, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00593, "step": 767, "tokens/total": 100417536, "tokens/train_per_sec_per_gpu": 3084.65, "tokens/trainable": 10688759 }, { "epoch": 2.445859872611465, "grad_norm": 0.1689453125, "learning_rate": 3.0322772685782815e-05, "loss": 0.005527772940695286, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00554, "step": 768, "tokens/total": 100548608, "tokens/train_per_sec_per_gpu": 3143.83, "tokens/trainable": 10701925 }, { "epoch": 2.449044585987261, "grad_norm": 0.1728515625, "learning_rate": 3.0268450285531967e-05, "loss": 0.005178853869438171, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00519, "step": 769, "tokens/total": 100679680, "tokens/train_per_sec_per_gpu": 3510.23, "tokens/trainable": 10716553 }, { "epoch": 2.4522292993630574, "grad_norm": 0.115234375, "learning_rate": 3.021410184185427e-05, "loss": 0.0034743379801511765, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00348, "step": 770, "tokens/total": 100810752, "tokens/train_per_sec_per_gpu": 3316.12, "tokens/trainable": 10730411 }, { "epoch": 2.4554140127388537, "grad_norm": 0.1650390625, "learning_rate": 3.0159727623409313e-05, "loss": 0.0041341050527989864, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00414, "step": 771, "tokens/total": 100941824, "tokens/train_per_sec_per_gpu": 3027.88, "tokens/trainable": 10743149 }, { "epoch": 2.4585987261146496, "grad_norm": 0.1513671875, "learning_rate": 3.0105327898984102e-05, "loss": 0.004606778733432293, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00462, "step": 772, "tokens/total": 101072896, "tokens/train_per_sec_per_gpu": 3423.19, "tokens/trainable": 10757437 }, { "epoch": 2.461783439490446, "grad_norm": 0.193359375, "learning_rate": 3.005090293749174e-05, "loss": 0.006537875160574913, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00656, "step": 773, "tokens/total": 101203968, "tokens/train_per_sec_per_gpu": 3675.05, "tokens/trainable": 10772736 }, { "epoch": 2.464968152866242, "grad_norm": 0.181640625, "learning_rate": 2.9996453007970056e-05, "loss": 0.006382662802934647, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0064, "step": 774, "tokens/total": 101335040, "tokens/train_per_sec_per_gpu": 3821.84, "tokens/trainable": 10788651 }, { "epoch": 2.468152866242038, "grad_norm": 0.16015625, "learning_rate": 2.994197837958032e-05, "loss": 0.005575335118919611, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00559, "step": 775, "tokens/total": 101466112, "tokens/train_per_sec_per_gpu": 3358.0, "tokens/trainable": 10802732 }, { "epoch": 2.4713375796178343, "grad_norm": 0.16015625, "learning_rate": 2.9887479321605895e-05, "loss": 0.005272061098366976, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00529, "step": 776, "tokens/total": 101597184, "tokens/train_per_sec_per_gpu": 3377.54, "tokens/trainable": 10816888 }, { "epoch": 2.4745222929936306, "grad_norm": 0.1376953125, "learning_rate": 2.9832956103450905e-05, "loss": 0.0034832200035452843, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00349, "step": 777, "tokens/total": 101728256, "tokens/train_per_sec_per_gpu": 3313.99, "tokens/trainable": 10830748 }, { "epoch": 2.477707006369427, "grad_norm": 0.166015625, "learning_rate": 2.9778408994638906e-05, "loss": 0.005426026880741119, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00544, "step": 778, "tokens/total": 101859328, "tokens/train_per_sec_per_gpu": 3283.57, "tokens/trainable": 10844538 }, { "epoch": 2.480891719745223, "grad_norm": 0.1748046875, "learning_rate": 2.9723838264811545e-05, "loss": 0.00458392733708024, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00459, "step": 779, "tokens/total": 101990400, "tokens/train_per_sec_per_gpu": 3438.29, "tokens/trainable": 10858901 }, { "epoch": 2.484076433121019, "grad_norm": 0.1875, "learning_rate": 2.966924418372724e-05, "loss": 0.006339904386550188, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00636, "step": 780, "tokens/total": 102121472, "tokens/train_per_sec_per_gpu": 3321.39, "tokens/trainable": 10873475 }, { "epoch": 2.4872611464968153, "grad_norm": 0.1826171875, "learning_rate": 2.9614627021259846e-05, "loss": 0.006326707080006599, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00635, "step": 781, "tokens/total": 102252544, "tokens/train_per_sec_per_gpu": 3305.16, "tokens/trainable": 10887308 }, { "epoch": 2.4904458598726116, "grad_norm": 0.1884765625, "learning_rate": 2.9559987047397303e-05, "loss": 0.006832771003246307, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00686, "step": 782, "tokens/total": 102383616, "tokens/train_per_sec_per_gpu": 2932.75, "tokens/trainable": 10899670 }, { "epoch": 2.4936305732484074, "grad_norm": 0.1357421875, "learning_rate": 2.950532453224032e-05, "loss": 0.003962225280702114, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00397, "step": 783, "tokens/total": 102514688, "tokens/train_per_sec_per_gpu": 3082.59, "tokens/trainable": 10912648 }, { "epoch": 2.4968152866242037, "grad_norm": 0.1533203125, "learning_rate": 2.945063974600104e-05, "loss": 0.005036994814872742, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00505, "step": 784, "tokens/total": 102645760, "tokens/train_per_sec_per_gpu": 3643.04, "tokens/trainable": 10927870 }, { "epoch": 2.5, "grad_norm": 0.173828125, "learning_rate": 2.9395932959001692e-05, "loss": 0.0055970605462789536, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00561, "step": 785, "tokens/total": 102776832, "tokens/train_per_sec_per_gpu": 3510.01, "tokens/trainable": 10942573 }, { "epoch": 2.5, "eval_loss": 0.00919391866773367, "eval_ppl": 1.00924, "eval_runtime": 41.9998, "eval_samples_per_second": 64.31, "eval_steps_per_second": 4.024, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 785 }, { "epoch": 2.5031847133757963, "grad_norm": 0.1689453125, "learning_rate": 2.9341204441673266e-05, "loss": 0.004385429434478283, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0044, "step": 786, "tokens/total": 102907904, "tokens/train_per_sec_per_gpu": 3007.8, "tokens/trainable": 10955164 }, { "epoch": 2.5063694267515926, "grad_norm": 0.2080078125, "learning_rate": 2.9286454464554152e-05, "loss": 0.006849427707493305, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00687, "step": 787, "tokens/total": 103038976, "tokens/train_per_sec_per_gpu": 3371.47, "tokens/trainable": 10969219 }, { "epoch": 2.5095541401273884, "grad_norm": 0.150390625, "learning_rate": 2.9231683298288853e-05, "loss": 0.005230756010860205, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00524, "step": 788, "tokens/total": 103170048, "tokens/train_per_sec_per_gpu": 3590.97, "tokens/trainable": 10984159 }, { "epoch": 2.5127388535031847, "grad_norm": 0.1533203125, "learning_rate": 2.9176891213626595e-05, "loss": 0.00515084620565176, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00516, "step": 789, "tokens/total": 103301120, "tokens/train_per_sec_per_gpu": 3471.15, "tokens/trainable": 10998703 }, { "epoch": 2.515923566878981, "grad_norm": 0.1513671875, "learning_rate": 2.9122078481420012e-05, "loss": 0.005567297339439392, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00558, "step": 790, "tokens/total": 103432192, "tokens/train_per_sec_per_gpu": 3566.36, "tokens/trainable": 11013580 }, { "epoch": 2.519108280254777, "grad_norm": 0.1650390625, "learning_rate": 2.906724537262381e-05, "loss": 0.005145716480910778, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00516, "step": 791, "tokens/total": 103563264, "tokens/train_per_sec_per_gpu": 3399.88, "tokens/trainable": 11027822 }, { "epoch": 2.522292993630573, "grad_norm": 0.1162109375, "learning_rate": 2.901239215829341e-05, "loss": 0.0032891561277210712, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00329, "step": 792, "tokens/total": 103694336, "tokens/train_per_sec_per_gpu": 3050.99, "tokens/trainable": 11040610 }, { "epoch": 2.5254777070063694, "grad_norm": 0.166015625, "learning_rate": 2.895751910958364e-05, "loss": 0.005250695627182722, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00526, "step": 793, "tokens/total": 103825408, "tokens/train_per_sec_per_gpu": 3579.93, "tokens/trainable": 11055502 }, { "epoch": 2.5286624203821657, "grad_norm": 0.1865234375, "learning_rate": 2.8902626497747366e-05, "loss": 0.005496595986187458, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00551, "step": 794, "tokens/total": 103956480, "tokens/train_per_sec_per_gpu": 3718.84, "tokens/trainable": 11070940 }, { "epoch": 2.531847133757962, "grad_norm": 0.1630859375, "learning_rate": 2.8847714594134144e-05, "loss": 0.006310721859335899, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00633, "step": 795, "tokens/total": 104087552, "tokens/train_per_sec_per_gpu": 3917.97, "tokens/trainable": 11087217 }, { "epoch": 2.535031847133758, "grad_norm": 0.1767578125, "learning_rate": 2.8792783670188927e-05, "loss": 0.005432881880551577, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00545, "step": 796, "tokens/total": 104218624, "tokens/train_per_sec_per_gpu": 3115.17, "tokens/trainable": 11100308 }, { "epoch": 2.538216560509554, "grad_norm": 0.1748046875, "learning_rate": 2.873783399745066e-05, "loss": 0.005197789054363966, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00521, "step": 797, "tokens/total": 104349696, "tokens/train_per_sec_per_gpu": 3361.6, "tokens/trainable": 11114376 }, { "epoch": 2.5414012738853504, "grad_norm": 0.1767578125, "learning_rate": 2.868286584755099e-05, "loss": 0.005434297490864992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00545, "step": 798, "tokens/total": 104480768, "tokens/train_per_sec_per_gpu": 3144.12, "tokens/trainable": 11127547 }, { "epoch": 2.5445859872611463, "grad_norm": 0.1005859375, "learning_rate": 2.862787949221288e-05, "loss": 0.0028167557902634144, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00282, "step": 799, "tokens/total": 104611840, "tokens/train_per_sec_per_gpu": 3311.57, "tokens/trainable": 11141342 }, { "epoch": 2.5477707006369426, "grad_norm": 0.1376953125, "learning_rate": 2.857287520324931e-05, "loss": 0.0033000826369971037, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00331, "step": 800, "tokens/total": 104742912, "tokens/train_per_sec_per_gpu": 3334.18, "tokens/trainable": 11155303 }, { "epoch": 2.550955414012739, "grad_norm": 0.1279296875, "learning_rate": 2.8517853252561906e-05, "loss": 0.004212173167616129, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00422, "step": 801, "tokens/total": 104873984, "tokens/train_per_sec_per_gpu": 3504.45, "tokens/trainable": 11169930 }, { "epoch": 2.554140127388535, "grad_norm": 0.166015625, "learning_rate": 2.8462813912139586e-05, "loss": 0.005329788196831942, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00534, "step": 802, "tokens/total": 105005056, "tokens/train_per_sec_per_gpu": 3283.1, "tokens/trainable": 11183661 }, { "epoch": 2.5573248407643314, "grad_norm": 0.134765625, "learning_rate": 2.8407757454057248e-05, "loss": 0.0038679102435708046, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00388, "step": 803, "tokens/total": 105136128, "tokens/train_per_sec_per_gpu": 3540.39, "tokens/trainable": 11198409 }, { "epoch": 2.5605095541401273, "grad_norm": 0.1787109375, "learning_rate": 2.83526841504744e-05, "loss": 0.004407938569784164, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00442, "step": 804, "tokens/total": 105267200, "tokens/train_per_sec_per_gpu": 3017.63, "tokens/trainable": 11211070 }, { "epoch": 2.5636942675159236, "grad_norm": 0.16015625, "learning_rate": 2.8297594273633816e-05, "loss": 0.004717926029115915, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00473, "step": 805, "tokens/total": 105398272, "tokens/train_per_sec_per_gpu": 3319.15, "tokens/trainable": 11224899 }, { "epoch": 2.56687898089172, "grad_norm": 0.1875, "learning_rate": 2.824248809586021e-05, "loss": 0.005949638783931732, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00597, "step": 806, "tokens/total": 105529344, "tokens/train_per_sec_per_gpu": 3475.48, "tokens/trainable": 11239401 }, { "epoch": 2.5700636942675157, "grad_norm": 0.1787109375, "learning_rate": 2.8187365889558858e-05, "loss": 0.004526551812887192, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00454, "step": 807, "tokens/total": 105660416, "tokens/train_per_sec_per_gpu": 3481.42, "tokens/trainable": 11253859 }, { "epoch": 2.573248407643312, "grad_norm": 0.1552734375, "learning_rate": 2.81322279272143e-05, "loss": 0.005305514670908451, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00532, "step": 808, "tokens/total": 105791488, "tokens/train_per_sec_per_gpu": 3355.57, "tokens/trainable": 11267936 }, { "epoch": 2.5764331210191083, "grad_norm": 0.189453125, "learning_rate": 2.8077074481388927e-05, "loss": 0.003922187723219395, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00393, "step": 809, "tokens/total": 105922560, "tokens/train_per_sec_per_gpu": 3053.79, "tokens/trainable": 11280737 }, { "epoch": 2.5796178343949046, "grad_norm": 0.1416015625, "learning_rate": 2.802190582472168e-05, "loss": 0.004988102242350578, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.005, "step": 810, "tokens/total": 106053632, "tokens/train_per_sec_per_gpu": 3417.76, "tokens/trainable": 11295020 }, { "epoch": 2.582802547770701, "grad_norm": 0.1494140625, "learning_rate": 2.7966722229926712e-05, "loss": 0.002851355355232954, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00286, "step": 811, "tokens/total": 106184704, "tokens/train_per_sec_per_gpu": 3055.95, "tokens/trainable": 11307828 }, { "epoch": 2.5859872611464967, "grad_norm": 0.16015625, "learning_rate": 2.7911523969791997e-05, "loss": 0.00479587959125638, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00481, "step": 812, "tokens/total": 106315776, "tokens/train_per_sec_per_gpu": 3332.89, "tokens/trainable": 11321718 }, { "epoch": 2.589171974522293, "grad_norm": 0.1533203125, "learning_rate": 2.7856311317178002e-05, "loss": 0.00479497155174613, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00481, "step": 813, "tokens/total": 106446848, "tokens/train_per_sec_per_gpu": 3224.61, "tokens/trainable": 11335234 }, { "epoch": 2.5923566878980893, "grad_norm": 0.1767578125, "learning_rate": 2.7801084545016364e-05, "loss": 0.005322256591171026, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00534, "step": 814, "tokens/total": 106577920, "tokens/train_per_sec_per_gpu": 3067.32, "tokens/trainable": 11348079 }, { "epoch": 2.595541401273885, "grad_norm": 0.162109375, "learning_rate": 2.774584392630849e-05, "loss": 0.004532738588750362, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00454, "step": 815, "tokens/total": 106708992, "tokens/train_per_sec_per_gpu": 3239.47, "tokens/trainable": 11361632 }, { "epoch": 2.5987261146496814, "grad_norm": 0.1748046875, "learning_rate": 2.769058973412424e-05, "loss": 0.005558821838349104, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00557, "step": 816, "tokens/total": 106840064, "tokens/train_per_sec_per_gpu": 3432.78, "tokens/trainable": 11376015 }, { "epoch": 2.6019108280254777, "grad_norm": 0.23046875, "learning_rate": 2.7635322241600603e-05, "loss": 0.008326980285346508, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00836, "step": 817, "tokens/total": 106971136, "tokens/train_per_sec_per_gpu": 3064.13, "tokens/trainable": 11388898 }, { "epoch": 2.605095541401274, "grad_norm": 0.1796875, "learning_rate": 2.7580041721940264e-05, "loss": 0.005567263346165419, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00558, "step": 818, "tokens/total": 107102208, "tokens/train_per_sec_per_gpu": 3339.15, "tokens/trainable": 11402886 }, { "epoch": 2.6082802547770703, "grad_norm": 0.10693359375, "learning_rate": 2.7524748448410337e-05, "loss": 0.0028434821870177984, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00285, "step": 819, "tokens/total": 107233280, "tokens/train_per_sec_per_gpu": 3304.29, "tokens/trainable": 11416722 }, { "epoch": 2.611464968152866, "grad_norm": 0.1748046875, "learning_rate": 2.7469442694340984e-05, "loss": 0.0058287507854402065, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00585, "step": 820, "tokens/total": 107364352, "tokens/train_per_sec_per_gpu": 3313.02, "tokens/trainable": 11430598 }, { "epoch": 2.6146496815286624, "grad_norm": 0.15625, "learning_rate": 2.7414124733124046e-05, "loss": 0.004522873554378748, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00453, "step": 821, "tokens/total": 107495424, "tokens/train_per_sec_per_gpu": 3603.28, "tokens/trainable": 11445614 }, { "epoch": 2.6178343949044587, "grad_norm": 0.1298828125, "learning_rate": 2.735879483821171e-05, "loss": 0.004399726167321205, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00441, "step": 822, "tokens/total": 107626496, "tokens/train_per_sec_per_gpu": 3592.02, "tokens/trainable": 11460570 }, { "epoch": 2.6210191082802545, "grad_norm": 0.13671875, "learning_rate": 2.7303453283115177e-05, "loss": 0.004378693178296089, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00439, "step": 823, "tokens/total": 107757568, "tokens/train_per_sec_per_gpu": 3491.55, "tokens/trainable": 11475144 }, { "epoch": 2.624203821656051, "grad_norm": 0.1650390625, "learning_rate": 2.7248100341403247e-05, "loss": 0.0058170161210000515, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00583, "step": 824, "tokens/total": 107888640, "tokens/train_per_sec_per_gpu": 3439.8, "tokens/trainable": 11489495 }, { "epoch": 2.627388535031847, "grad_norm": 0.13671875, "learning_rate": 2.7192736286701042e-05, "loss": 0.0035439918283373117, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00355, "step": 825, "tokens/total": 108019712, "tokens/train_per_sec_per_gpu": 3559.22, "tokens/trainable": 11504322 }, { "epoch": 2.6305732484076434, "grad_norm": 0.1435546875, "learning_rate": 2.7137361392688613e-05, "loss": 0.004517707973718643, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00453, "step": 826, "tokens/total": 108150784, "tokens/train_per_sec_per_gpu": 3300.49, "tokens/trainable": 11518136 }, { "epoch": 2.6337579617834397, "grad_norm": 0.13671875, "learning_rate": 2.7081975933099573e-05, "loss": 0.005291810724884272, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00531, "step": 827, "tokens/total": 108281856, "tokens/train_per_sec_per_gpu": 3485.86, "tokens/trainable": 11532728 }, { "epoch": 2.6369426751592355, "grad_norm": 0.123046875, "learning_rate": 2.7026580181719774e-05, "loss": 0.0031160882208496332, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00312, "step": 828, "tokens/total": 108412928, "tokens/train_per_sec_per_gpu": 3088.88, "tokens/trainable": 11545669 }, { "epoch": 2.640127388535032, "grad_norm": 0.1748046875, "learning_rate": 2.697117441238597e-05, "loss": 0.004703770391643047, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00471, "step": 829, "tokens/total": 108544000, "tokens/train_per_sec_per_gpu": 3390.81, "tokens/trainable": 11559794 }, { "epoch": 2.643312101910828, "grad_norm": 0.1875, "learning_rate": 2.6915758898984384e-05, "loss": 0.006808799225836992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00683, "step": 830, "tokens/total": 108675072, "tokens/train_per_sec_per_gpu": 3340.4, "tokens/trainable": 11573796 }, { "epoch": 2.646496815286624, "grad_norm": 0.212890625, "learning_rate": 2.686033391544945e-05, "loss": 0.005405929870903492, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00542, "step": 831, "tokens/total": 108806144, "tokens/train_per_sec_per_gpu": 3397.97, "tokens/trainable": 11588025 }, { "epoch": 2.6496815286624202, "grad_norm": 0.1611328125, "learning_rate": 2.6804899735762405e-05, "loss": 0.006530239712446928, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00655, "step": 832, "tokens/total": 108937216, "tokens/train_per_sec_per_gpu": 3240.0, "tokens/trainable": 11601588 }, { "epoch": 2.6528662420382165, "grad_norm": 0.142578125, "learning_rate": 2.6749456633949932e-05, "loss": 0.0037627576384693384, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00377, "step": 833, "tokens/total": 109068288, "tokens/train_per_sec_per_gpu": 3152.61, "tokens/trainable": 11614787 }, { "epoch": 2.656050955414013, "grad_norm": 0.158203125, "learning_rate": 2.6694004884082825e-05, "loss": 0.0034914424177259207, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0035, "step": 834, "tokens/total": 109199360, "tokens/train_per_sec_per_gpu": 3518.76, "tokens/trainable": 11629463 }, { "epoch": 2.659235668789809, "grad_norm": 0.1318359375, "learning_rate": 2.663854476027465e-05, "loss": 0.004583639558404684, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00459, "step": 835, "tokens/total": 109330432, "tokens/train_per_sec_per_gpu": 3674.53, "tokens/trainable": 11644760 }, { "epoch": 2.662420382165605, "grad_norm": 0.21875, "learning_rate": 2.6583076536680323e-05, "loss": 0.007365885656327009, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00739, "step": 836, "tokens/total": 109461504, "tokens/train_per_sec_per_gpu": 3133.81, "tokens/trainable": 11657934 }, { "epoch": 2.6656050955414012, "grad_norm": 0.1396484375, "learning_rate": 2.652760048749483e-05, "loss": 0.004122959915548563, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00413, "step": 837, "tokens/total": 109592576, "tokens/train_per_sec_per_gpu": 3445.42, "tokens/trainable": 11672312 }, { "epoch": 2.6687898089171975, "grad_norm": 0.150390625, "learning_rate": 2.647211688695186e-05, "loss": 0.005676808767020702, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00569, "step": 838, "tokens/total": 109723648, "tokens/train_per_sec_per_gpu": 3626.52, "tokens/trainable": 11687353 }, { "epoch": 2.6719745222929934, "grad_norm": 0.2138671875, "learning_rate": 2.6416626009322375e-05, "loss": 0.005739385262131691, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00576, "step": 839, "tokens/total": 109854720, "tokens/train_per_sec_per_gpu": 3288.19, "tokens/trainable": 11701118 }, { "epoch": 2.6751592356687897, "grad_norm": 0.1689453125, "learning_rate": 2.6361128128913347e-05, "loss": 0.00492321141064167, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00494, "step": 840, "tokens/total": 109985792, "tokens/train_per_sec_per_gpu": 3438.3, "tokens/trainable": 11715518 }, { "epoch": 2.678343949044586, "grad_norm": 0.1611328125, "learning_rate": 2.6305623520066382e-05, "loss": 0.0048889112658798695, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0049, "step": 841, "tokens/total": 110116864, "tokens/train_per_sec_per_gpu": 3362.44, "tokens/trainable": 11729606 }, { "epoch": 2.6815286624203822, "grad_norm": 0.189453125, "learning_rate": 2.6250112457156296e-05, "loss": 0.005592016503214836, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00561, "step": 842, "tokens/total": 110247936, "tokens/train_per_sec_per_gpu": 2882.24, "tokens/trainable": 11741702 }, { "epoch": 2.6847133757961785, "grad_norm": 0.1787109375, "learning_rate": 2.619459521458984e-05, "loss": 0.0058587053790688515, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00588, "step": 843, "tokens/total": 110379008, "tokens/train_per_sec_per_gpu": 3463.6, "tokens/trainable": 11756164 }, { "epoch": 2.6878980891719744, "grad_norm": 0.1796875, "learning_rate": 2.6139072066804332e-05, "loss": 0.004927500616759062, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00494, "step": 844, "tokens/total": 110510080, "tokens/train_per_sec_per_gpu": 3446.83, "tokens/trainable": 11770544 }, { "epoch": 2.6910828025477707, "grad_norm": 0.2109375, "learning_rate": 2.6083543288266233e-05, "loss": 0.007675695698708296, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00771, "step": 845, "tokens/total": 110641152, "tokens/train_per_sec_per_gpu": 3146.49, "tokens/trainable": 11783721 }, { "epoch": 2.694267515923567, "grad_norm": 0.2470703125, "learning_rate": 2.602800915346986e-05, "loss": 0.004762662574648857, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00477, "step": 846, "tokens/total": 110772224, "tokens/train_per_sec_per_gpu": 3111.12, "tokens/trainable": 11796753 }, { "epoch": 2.697452229299363, "grad_norm": 0.1865234375, "learning_rate": 2.5972469936936046e-05, "loss": 0.006559155881404877, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00658, "step": 847, "tokens/total": 110903296, "tokens/train_per_sec_per_gpu": 3445.43, "tokens/trainable": 11811161 }, { "epoch": 2.700636942675159, "grad_norm": 0.185546875, "learning_rate": 2.5916925913210677e-05, "loss": 0.005181832704693079, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0052, "step": 848, "tokens/total": 111034368, "tokens/train_per_sec_per_gpu": 3072.78, "tokens/trainable": 11824046 }, { "epoch": 2.7038216560509554, "grad_norm": 0.1640625, "learning_rate": 2.5861377356863437e-05, "loss": 0.005784741137176752, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0058, "step": 849, "tokens/total": 111165440, "tokens/train_per_sec_per_gpu": 3386.17, "tokens/trainable": 11838220 }, { "epoch": 2.7070063694267517, "grad_norm": 0.181640625, "learning_rate": 2.5805824542486434e-05, "loss": 0.006970499642193317, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00699, "step": 850, "tokens/total": 111296512, "tokens/train_per_sec_per_gpu": 3653.3, "tokens/trainable": 11853456 }, { "epoch": 2.710191082802548, "grad_norm": 0.1806640625, "learning_rate": 2.5750267744692786e-05, "loss": 0.005797088146209717, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00581, "step": 851, "tokens/total": 111427584, "tokens/train_per_sec_per_gpu": 3852.62, "tokens/trainable": 11869442 }, { "epoch": 2.713375796178344, "grad_norm": 0.138671875, "learning_rate": 2.5694707238115323e-05, "loss": 0.003937084693461657, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00394, "step": 852, "tokens/total": 111558656, "tokens/train_per_sec_per_gpu": 3164.69, "tokens/trainable": 11882674 }, { "epoch": 2.71656050955414, "grad_norm": 0.177734375, "learning_rate": 2.5639143297405222e-05, "loss": 0.004891657270491123, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0049, "step": 853, "tokens/total": 111689728, "tokens/train_per_sec_per_gpu": 3223.75, "tokens/trainable": 11896189 }, { "epoch": 2.7197452229299364, "grad_norm": 0.14453125, "learning_rate": 2.5583576197230603e-05, "loss": 0.003982385154813528, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00399, "step": 854, "tokens/total": 111820800, "tokens/train_per_sec_per_gpu": 3223.56, "tokens/trainable": 11909688 }, { "epoch": 2.722929936305732, "grad_norm": 0.1484375, "learning_rate": 2.5528006212275218e-05, "loss": 0.0039648148231208324, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00397, "step": 855, "tokens/total": 111951872, "tokens/train_per_sec_per_gpu": 3301.63, "tokens/trainable": 11923437 }, { "epoch": 2.7261146496815285, "grad_norm": 0.193359375, "learning_rate": 2.5472433617237107e-05, "loss": 0.006385331507772207, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00641, "step": 856, "tokens/total": 112082944, "tokens/train_per_sec_per_gpu": 3430.81, "tokens/trainable": 11937805 }, { "epoch": 2.729299363057325, "grad_norm": 0.162109375, "learning_rate": 2.541685868682716e-05, "loss": 0.005221599247306585, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00524, "step": 857, "tokens/total": 112214016, "tokens/train_per_sec_per_gpu": 3535.81, "tokens/trainable": 11952610 }, { "epoch": 2.732484076433121, "grad_norm": 0.1650390625, "learning_rate": 2.5361281695767854e-05, "loss": 0.004517777357250452, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00453, "step": 858, "tokens/total": 112345088, "tokens/train_per_sec_per_gpu": 3097.23, "tokens/trainable": 11965620 }, { "epoch": 2.7356687898089174, "grad_norm": 0.1328125, "learning_rate": 2.530570291879184e-05, "loss": 0.003382981289178133, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00339, "step": 859, "tokens/total": 112476160, "tokens/train_per_sec_per_gpu": 3500.78, "tokens/trainable": 11980302 }, { "epoch": 2.738853503184713, "grad_norm": 0.162109375, "learning_rate": 2.5250122630640587e-05, "loss": 0.005662713665515184, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00568, "step": 860, "tokens/total": 112607232, "tokens/train_per_sec_per_gpu": 3446.65, "tokens/trainable": 11994679 }, { "epoch": 2.7420382165605095, "grad_norm": 0.15625, "learning_rate": 2.519454110606304e-05, "loss": 0.004983518272638321, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.005, "step": 861, "tokens/total": 112738304, "tokens/train_per_sec_per_gpu": 3673.71, "tokens/trainable": 12009986 }, { "epoch": 2.745222929936306, "grad_norm": 0.146484375, "learning_rate": 2.5138958619814275e-05, "loss": 0.004905369598418474, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00492, "step": 862, "tokens/total": 112869376, "tokens/train_per_sec_per_gpu": 3063.86, "tokens/trainable": 12022816 }, { "epoch": 2.7484076433121016, "grad_norm": 0.16796875, "learning_rate": 2.5083375446654083e-05, "loss": 0.006565258372575045, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00659, "step": 863, "tokens/total": 113000448, "tokens/train_per_sec_per_gpu": 3636.75, "tokens/trainable": 12037957 }, { "epoch": 2.7515923566878984, "grad_norm": 0.142578125, "learning_rate": 2.502779186134568e-05, "loss": 0.004305466078221798, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00431, "step": 864, "tokens/total": 113131520, "tokens/train_per_sec_per_gpu": 3370.27, "tokens/trainable": 12052017 }, { "epoch": 2.754777070063694, "grad_norm": 0.130859375, "learning_rate": 2.497220813865432e-05, "loss": 0.0037764415610581636, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00378, "step": 865, "tokens/total": 113262592, "tokens/train_per_sec_per_gpu": 3212.97, "tokens/trainable": 12065431 }, { "epoch": 2.7579617834394905, "grad_norm": 0.1689453125, "learning_rate": 2.491662455334592e-05, "loss": 0.005136569030582905, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00515, "step": 866, "tokens/total": 113393664, "tokens/train_per_sec_per_gpu": 3273.22, "tokens/trainable": 12079122 }, { "epoch": 2.761146496815287, "grad_norm": 0.1513671875, "learning_rate": 2.4861041380185738e-05, "loss": 0.003261574311181903, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00327, "step": 867, "tokens/total": 113524736, "tokens/train_per_sec_per_gpu": 3109.77, "tokens/trainable": 12092147 }, { "epoch": 2.7643312101910826, "grad_norm": 0.2021484375, "learning_rate": 2.4805458893936963e-05, "loss": 0.0064933402463793755, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00651, "step": 868, "tokens/total": 113655808, "tokens/train_per_sec_per_gpu": 3269.96, "tokens/trainable": 12105826 }, { "epoch": 2.767515923566879, "grad_norm": 0.140625, "learning_rate": 2.474987736935942e-05, "loss": 0.004877043422311544, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00489, "step": 869, "tokens/total": 113786880, "tokens/train_per_sec_per_gpu": 3459.54, "tokens/trainable": 12120248 }, { "epoch": 2.770700636942675, "grad_norm": 0.15625, "learning_rate": 2.469429708120817e-05, "loss": 0.004386639688163996, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0044, "step": 870, "tokens/total": 113917952, "tokens/train_per_sec_per_gpu": 3176.3, "tokens/trainable": 12133552 }, { "epoch": 2.7738853503184715, "grad_norm": 0.166015625, "learning_rate": 2.463871830423215e-05, "loss": 0.00508409459143877, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0051, "step": 871, "tokens/total": 114049024, "tokens/train_per_sec_per_gpu": 3403.46, "tokens/trainable": 12147799 }, { "epoch": 2.777070063694268, "grad_norm": 0.17578125, "learning_rate": 2.4583141313172842e-05, "loss": 0.003352643456310034, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00336, "step": 872, "tokens/total": 114180096, "tokens/train_per_sec_per_gpu": 3192.03, "tokens/trainable": 12161167 }, { "epoch": 2.7802547770700636, "grad_norm": 0.1962890625, "learning_rate": 2.4527566382762902e-05, "loss": 0.005316773895174265, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00533, "step": 873, "tokens/total": 114311168, "tokens/train_per_sec_per_gpu": 3192.46, "tokens/trainable": 12174546 }, { "epoch": 2.78343949044586, "grad_norm": 0.1298828125, "learning_rate": 2.4471993787724777e-05, "loss": 0.00329143856652081, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0033, "step": 874, "tokens/total": 114442240, "tokens/train_per_sec_per_gpu": 3208.11, "tokens/trainable": 12187974 }, { "epoch": 2.786624203821656, "grad_norm": 0.162109375, "learning_rate": 2.4416423802769403e-05, "loss": 0.0036203130148351192, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00363, "step": 875, "tokens/total": 114573312, "tokens/train_per_sec_per_gpu": 2915.58, "tokens/trainable": 12200203 }, { "epoch": 2.789808917197452, "grad_norm": 0.1259765625, "learning_rate": 2.436085670259479e-05, "loss": 0.003102727932855487, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00311, "step": 876, "tokens/total": 114704384, "tokens/train_per_sec_per_gpu": 3023.1, "tokens/trainable": 12212847 }, { "epoch": 2.7929936305732483, "grad_norm": 0.2080078125, "learning_rate": 2.4305292761884676e-05, "loss": 0.005169394891709089, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00518, "step": 877, "tokens/total": 114835456, "tokens/train_per_sec_per_gpu": 3140.34, "tokens/trainable": 12226003 }, { "epoch": 2.7961783439490446, "grad_norm": 0.1728515625, "learning_rate": 2.4249732255307216e-05, "loss": 0.004676941316574812, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00469, "step": 878, "tokens/total": 114966528, "tokens/train_per_sec_per_gpu": 2960.89, "tokens/trainable": 12238405 }, { "epoch": 2.799363057324841, "grad_norm": 0.2001953125, "learning_rate": 2.4194175457513575e-05, "loss": 0.005923910532146692, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00594, "step": 879, "tokens/total": 115097600, "tokens/train_per_sec_per_gpu": 3330.91, "tokens/trainable": 12252333 }, { "epoch": 2.802547770700637, "grad_norm": 0.16015625, "learning_rate": 2.4138622643136562e-05, "loss": 0.004777503665536642, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00479, "step": 880, "tokens/total": 115228672, "tokens/train_per_sec_per_gpu": 3471.55, "tokens/trainable": 12266874 }, { "epoch": 2.805732484076433, "grad_norm": 0.15234375, "learning_rate": 2.4083074086789332e-05, "loss": 0.004388585686683655, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0044, "step": 881, "tokens/total": 115359744, "tokens/train_per_sec_per_gpu": 3109.72, "tokens/trainable": 12279904 }, { "epoch": 2.8089171974522293, "grad_norm": 0.2041015625, "learning_rate": 2.4027530063063966e-05, "loss": 0.00651566544547677, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00654, "step": 882, "tokens/total": 115490816, "tokens/train_per_sec_per_gpu": 3355.68, "tokens/trainable": 12293954 }, { "epoch": 2.8121019108280256, "grad_norm": 0.14453125, "learning_rate": 2.3971990846530134e-05, "loss": 0.0046853781677782536, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0047, "step": 883, "tokens/total": 115621888, "tokens/train_per_sec_per_gpu": 3459.29, "tokens/trainable": 12308453 }, { "epoch": 2.8152866242038215, "grad_norm": 0.1748046875, "learning_rate": 2.3916456711733776e-05, "loss": 0.004514369182288647, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00452, "step": 884, "tokens/total": 115752960, "tokens/train_per_sec_per_gpu": 3622.31, "tokens/trainable": 12323539 }, { "epoch": 2.8184713375796178, "grad_norm": 0.130859375, "learning_rate": 2.386092793319568e-05, "loss": 0.004971818067133427, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00498, "step": 885, "tokens/total": 115884032, "tokens/train_per_sec_per_gpu": 3500.64, "tokens/trainable": 12338133 }, { "epoch": 2.821656050955414, "grad_norm": 0.150390625, "learning_rate": 2.3805404785410157e-05, "loss": 0.004273276310414076, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00428, "step": 886, "tokens/total": 116015104, "tokens/train_per_sec_per_gpu": 3731.5, "tokens/trainable": 12353671 }, { "epoch": 2.8248407643312103, "grad_norm": 0.130859375, "learning_rate": 2.374988754284371e-05, "loss": 0.0031330641359090805, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00314, "step": 887, "tokens/total": 116146176, "tokens/train_per_sec_per_gpu": 3214.83, "tokens/trainable": 12367172 }, { "epoch": 2.8280254777070066, "grad_norm": 0.1708984375, "learning_rate": 2.369437647993363e-05, "loss": 0.007122378330677748, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00715, "step": 888, "tokens/total": 116277248, "tokens/train_per_sec_per_gpu": 3830.69, "tokens/trainable": 12383092 }, { "epoch": 2.8312101910828025, "grad_norm": 0.1435546875, "learning_rate": 2.3638871871086652e-05, "loss": 0.003396370681002736, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0034, "step": 889, "tokens/total": 116408320, "tokens/train_per_sec_per_gpu": 3247.86, "tokens/trainable": 12396628 }, { "epoch": 2.8343949044585988, "grad_norm": 0.1748046875, "learning_rate": 2.358337399067763e-05, "loss": 0.00505115557461977, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00506, "step": 890, "tokens/total": 116539392, "tokens/train_per_sec_per_gpu": 3332.28, "tokens/trainable": 12410665 }, { "epoch": 2.837579617834395, "grad_norm": 0.1259765625, "learning_rate": 2.3527883113048154e-05, "loss": 0.0035984639544039965, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0036, "step": 891, "tokens/total": 116670464, "tokens/train_per_sec_per_gpu": 3242.03, "tokens/trainable": 12424229 }, { "epoch": 2.840764331210191, "grad_norm": 0.2109375, "learning_rate": 2.3472399512505165e-05, "loss": 0.007709989324212074, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00774, "step": 892, "tokens/total": 116801536, "tokens/train_per_sec_per_gpu": 3025.59, "tokens/trainable": 12436996 }, { "epoch": 2.843949044585987, "grad_norm": 0.1669921875, "learning_rate": 2.3416923463319686e-05, "loss": 0.00600704038515687, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00603, "step": 893, "tokens/total": 116932608, "tokens/train_per_sec_per_gpu": 3495.42, "tokens/trainable": 12451599 }, { "epoch": 2.8471337579617835, "grad_norm": 0.1474609375, "learning_rate": 2.3361455239725364e-05, "loss": 0.0037581382784992456, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00377, "step": 894, "tokens/total": 117063680, "tokens/train_per_sec_per_gpu": 3183.5, "tokens/trainable": 12464960 }, { "epoch": 2.8503184713375798, "grad_norm": 0.177734375, "learning_rate": 2.3305995115917177e-05, "loss": 0.004449051804840565, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00446, "step": 895, "tokens/total": 117194752, "tokens/train_per_sec_per_gpu": 3342.72, "tokens/trainable": 12478964 }, { "epoch": 2.853503184713376, "grad_norm": 0.12890625, "learning_rate": 2.3250543366050074e-05, "loss": 0.004355857148766518, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00437, "step": 896, "tokens/total": 117325824, "tokens/train_per_sec_per_gpu": 3500.36, "tokens/trainable": 12493585 }, { "epoch": 2.856687898089172, "grad_norm": 0.138671875, "learning_rate": 2.3195100264237607e-05, "loss": 0.004324641078710556, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00433, "step": 897, "tokens/total": 117456896, "tokens/train_per_sec_per_gpu": 3278.5, "tokens/trainable": 12507318 }, { "epoch": 2.859872611464968, "grad_norm": 0.2392578125, "learning_rate": 2.3139666084550553e-05, "loss": 0.005408423021435738, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00542, "step": 898, "tokens/total": 117587968, "tokens/train_per_sec_per_gpu": 3103.8, "tokens/trainable": 12520317 }, { "epoch": 2.8630573248407645, "grad_norm": 0.1796875, "learning_rate": 2.308424110101562e-05, "loss": 0.005885708145797253, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0059, "step": 899, "tokens/total": 117719040, "tokens/train_per_sec_per_gpu": 3937.35, "tokens/trainable": 12536633 }, { "epoch": 2.8662420382165603, "grad_norm": 0.1904296875, "learning_rate": 2.3028825587614044e-05, "loss": 0.0059039052575826645, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00592, "step": 900, "tokens/total": 117850112, "tokens/train_per_sec_per_gpu": 3269.26, "tokens/trainable": 12550322 }, { "epoch": 2.8694267515923566, "grad_norm": 0.1630859375, "learning_rate": 2.2973419818280225e-05, "loss": 0.004266998264938593, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00428, "step": 901, "tokens/total": 117981184, "tokens/train_per_sec_per_gpu": 2909.52, "tokens/trainable": 12562584 }, { "epoch": 2.872611464968153, "grad_norm": 0.19140625, "learning_rate": 2.2918024066900433e-05, "loss": 0.005715237930417061, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00573, "step": 902, "tokens/total": 118112256, "tokens/train_per_sec_per_gpu": 3359.95, "tokens/trainable": 12576629 }, { "epoch": 2.875796178343949, "grad_norm": 0.12158203125, "learning_rate": 2.28626386073114e-05, "loss": 0.0025465991348028183, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00255, "step": 903, "tokens/total": 118243328, "tokens/train_per_sec_per_gpu": 3119.57, "tokens/trainable": 12589699 }, { "epoch": 2.8789808917197455, "grad_norm": 0.1328125, "learning_rate": 2.2807263713298957e-05, "loss": 0.003974359482526779, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00398, "step": 904, "tokens/total": 118374400, "tokens/train_per_sec_per_gpu": 3276.02, "tokens/trainable": 12603410 }, { "epoch": 2.8821656050955413, "grad_norm": 0.1328125, "learning_rate": 2.2751899658596755e-05, "loss": 0.004021751694381237, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00403, "step": 905, "tokens/total": 118505472, "tokens/train_per_sec_per_gpu": 3674.77, "tokens/trainable": 12618803 }, { "epoch": 2.8853503184713376, "grad_norm": 0.1435546875, "learning_rate": 2.2696546716884835e-05, "loss": 0.003338857088238001, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00334, "step": 906, "tokens/total": 118636544, "tokens/train_per_sec_per_gpu": 2909.02, "tokens/trainable": 12631025 }, { "epoch": 2.888535031847134, "grad_norm": 0.12451171875, "learning_rate": 2.2641205161788287e-05, "loss": 0.0033922025468200445, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0034, "step": 907, "tokens/total": 118767616, "tokens/train_per_sec_per_gpu": 3269.87, "tokens/trainable": 12644738 }, { "epoch": 2.8917197452229297, "grad_norm": 0.1484375, "learning_rate": 2.2585875266875956e-05, "loss": 0.005157338920980692, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00517, "step": 908, "tokens/total": 118898688, "tokens/train_per_sec_per_gpu": 3300.06, "tokens/trainable": 12658494 }, { "epoch": 2.894904458598726, "grad_norm": 0.1943359375, "learning_rate": 2.253055730565902e-05, "loss": 0.006811058614403009, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00683, "step": 909, "tokens/total": 119029760, "tokens/train_per_sec_per_gpu": 3546.15, "tokens/trainable": 12673250 }, { "epoch": 2.8980891719745223, "grad_norm": 0.14453125, "learning_rate": 2.2475251551589662e-05, "loss": 0.003177374368533492, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00318, "step": 910, "tokens/total": 119160832, "tokens/train_per_sec_per_gpu": 3016.5, "tokens/trainable": 12685906 }, { "epoch": 2.9012738853503186, "grad_norm": 0.1669921875, "learning_rate": 2.241995827805974e-05, "loss": 0.005059496965259314, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00507, "step": 911, "tokens/total": 119291904, "tokens/train_per_sec_per_gpu": 3674.19, "tokens/trainable": 12701221 }, { "epoch": 2.904458598726115, "grad_norm": 0.126953125, "learning_rate": 2.2364677758399406e-05, "loss": 0.0032712582033127546, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00328, "step": 912, "tokens/total": 119422976, "tokens/train_per_sec_per_gpu": 3246.71, "tokens/trainable": 12714734 }, { "epoch": 2.9076433121019107, "grad_norm": 0.212890625, "learning_rate": 2.230941026587576e-05, "loss": 0.007138803135603666, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00716, "step": 913, "tokens/total": 119554048, "tokens/train_per_sec_per_gpu": 3265.13, "tokens/trainable": 12728429 }, { "epoch": 2.910828025477707, "grad_norm": 0.1708984375, "learning_rate": 2.2254156073691518e-05, "loss": 0.00541570782661438, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00543, "step": 914, "tokens/total": 119685120, "tokens/train_per_sec_per_gpu": 3389.26, "tokens/trainable": 12742539 }, { "epoch": 2.9140127388535033, "grad_norm": 0.1640625, "learning_rate": 2.219891545498365e-05, "loss": 0.0042840586975216866, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00429, "step": 915, "tokens/total": 119816192, "tokens/train_per_sec_per_gpu": 3318.34, "tokens/trainable": 12756353 }, { "epoch": 2.917197452229299, "grad_norm": 0.17578125, "learning_rate": 2.2143688682822e-05, "loss": 0.005752744618803263, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00577, "step": 916, "tokens/total": 119947264, "tokens/train_per_sec_per_gpu": 3560.06, "tokens/trainable": 12771211 }, { "epoch": 2.9203821656050954, "grad_norm": 0.1728515625, "learning_rate": 2.2088476030208012e-05, "loss": 0.003762285690754652, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00377, "step": 917, "tokens/total": 120078336, "tokens/train_per_sec_per_gpu": 2930.2, "tokens/trainable": 12783504 }, { "epoch": 2.9235668789808917, "grad_norm": 0.11767578125, "learning_rate": 2.2033277770073297e-05, "loss": 0.0025295563973486423, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00253, "step": 918, "tokens/total": 120209408, "tokens/train_per_sec_per_gpu": 3098.34, "tokens/trainable": 12796457 }, { "epoch": 2.926751592356688, "grad_norm": 0.1337890625, "learning_rate": 2.1978094175278323e-05, "loss": 0.004149306565523148, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00416, "step": 919, "tokens/total": 120340480, "tokens/train_per_sec_per_gpu": 3238.11, "tokens/trainable": 12810005 }, { "epoch": 2.9299363057324843, "grad_norm": 0.1826171875, "learning_rate": 2.192292551861108e-05, "loss": 0.006155917886644602, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00617, "step": 920, "tokens/total": 120471552, "tokens/train_per_sec_per_gpu": 3351.96, "tokens/trainable": 12824046 }, { "epoch": 2.93312101910828, "grad_norm": 0.140625, "learning_rate": 2.1867772072785708e-05, "loss": 0.005103899631649256, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00512, "step": 921, "tokens/total": 120602624, "tokens/train_per_sec_per_gpu": 3263.43, "tokens/trainable": 12837714 }, { "epoch": 2.9363057324840764, "grad_norm": 0.171875, "learning_rate": 2.181263411044114e-05, "loss": 0.004437371157109737, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00445, "step": 922, "tokens/total": 120733696, "tokens/train_per_sec_per_gpu": 3276.05, "tokens/trainable": 12851431 }, { "epoch": 2.9394904458598727, "grad_norm": 0.1689453125, "learning_rate": 2.1757511904139793e-05, "loss": 0.005264171864837408, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00528, "step": 923, "tokens/total": 120864768, "tokens/train_per_sec_per_gpu": 3525.14, "tokens/trainable": 12866186 }, { "epoch": 2.9426751592356686, "grad_norm": 0.16796875, "learning_rate": 2.1702405726366193e-05, "loss": 0.0048398361541330814, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00485, "step": 924, "tokens/total": 120995840, "tokens/train_per_sec_per_gpu": 3474.68, "tokens/trainable": 12880741 }, { "epoch": 2.945859872611465, "grad_norm": 0.1513671875, "learning_rate": 2.1647315849525606e-05, "loss": 0.0037978398613631725, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00381, "step": 925, "tokens/total": 121126912, "tokens/train_per_sec_per_gpu": 3132.29, "tokens/trainable": 12893946 }, { "epoch": 2.949044585987261, "grad_norm": 0.1650390625, "learning_rate": 2.1592242545942755e-05, "loss": 0.005401961971074343, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00542, "step": 926, "tokens/total": 121257984, "tokens/train_per_sec_per_gpu": 3184.76, "tokens/trainable": 12907278 }, { "epoch": 2.9522292993630574, "grad_norm": 0.1474609375, "learning_rate": 2.1537186087860423e-05, "loss": 0.005091848783195019, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0051, "step": 927, "tokens/total": 121389056, "tokens/train_per_sec_per_gpu": 3525.92, "tokens/trainable": 12921953 }, { "epoch": 2.9554140127388537, "grad_norm": 0.162109375, "learning_rate": 2.14821467474381e-05, "loss": 0.005307201761752367, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00532, "step": 928, "tokens/total": 121520128, "tokens/train_per_sec_per_gpu": 3465.19, "tokens/trainable": 12936423 }, { "epoch": 2.9585987261146496, "grad_norm": 0.12109375, "learning_rate": 2.1427124796750696e-05, "loss": 0.002976613584905863, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00298, "step": 929, "tokens/total": 121651200, "tokens/train_per_sec_per_gpu": 3415.41, "tokens/trainable": 12950697 }, { "epoch": 2.961783439490446, "grad_norm": 0.2021484375, "learning_rate": 2.1372120507787134e-05, "loss": 0.004961484577506781, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00497, "step": 930, "tokens/total": 121782272, "tokens/train_per_sec_per_gpu": 3237.03, "tokens/trainable": 12964260 }, { "epoch": 2.964968152866242, "grad_norm": 0.193359375, "learning_rate": 2.131713415244902e-05, "loss": 0.0067651160061359406, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00679, "step": 931, "tokens/total": 121913344, "tokens/train_per_sec_per_gpu": 3323.81, "tokens/trainable": 12978164 }, { "epoch": 2.968152866242038, "grad_norm": 0.166015625, "learning_rate": 2.1262166002549344e-05, "loss": 0.005593163892626762, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00561, "step": 932, "tokens/total": 122044416, "tokens/train_per_sec_per_gpu": 3178.79, "tokens/trainable": 12991495 }, { "epoch": 2.9713375796178343, "grad_norm": 0.177734375, "learning_rate": 2.1207216329811082e-05, "loss": 0.0055503519251942635, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00557, "step": 933, "tokens/total": 122175488, "tokens/train_per_sec_per_gpu": 2983.66, "tokens/trainable": 13003996 }, { "epoch": 2.9745222929936306, "grad_norm": 0.162109375, "learning_rate": 2.115228540586586e-05, "loss": 0.004628556780517101, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00464, "step": 934, "tokens/total": 122306560, "tokens/train_per_sec_per_gpu": 3348.38, "tokens/trainable": 13017998 }, { "epoch": 2.977707006369427, "grad_norm": 0.146484375, "learning_rate": 2.109737350225264e-05, "loss": 0.0036150780506432056, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00362, "step": 935, "tokens/total": 122437632, "tokens/train_per_sec_per_gpu": 3386.9, "tokens/trainable": 13032100 }, { "epoch": 2.980891719745223, "grad_norm": 0.15234375, "learning_rate": 2.1042480890416368e-05, "loss": 0.004233770538121462, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00424, "step": 936, "tokens/total": 122568704, "tokens/train_per_sec_per_gpu": 3171.53, "tokens/trainable": 13045341 }, { "epoch": 2.984076433121019, "grad_norm": 0.1728515625, "learning_rate": 2.0987607841706595e-05, "loss": 0.004372127819806337, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00438, "step": 937, "tokens/total": 122699776, "tokens/train_per_sec_per_gpu": 3077.26, "tokens/trainable": 13058291 }, { "epoch": 2.9872611464968153, "grad_norm": 0.154296875, "learning_rate": 2.09327546273762e-05, "loss": 0.005242812447249889, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00526, "step": 938, "tokens/total": 122830848, "tokens/train_per_sec_per_gpu": 3362.79, "tokens/trainable": 13072317 }, { "epoch": 2.9904458598726116, "grad_norm": 0.150390625, "learning_rate": 2.087792151858e-05, "loss": 0.0044011822901666164, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00441, "step": 939, "tokens/total": 122961920, "tokens/train_per_sec_per_gpu": 3314.79, "tokens/trainable": 13086158 }, { "epoch": 2.9936305732484074, "grad_norm": 0.1650390625, "learning_rate": 2.0823108786373414e-05, "loss": 0.004296471830457449, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00431, "step": 940, "tokens/total": 123092992, "tokens/train_per_sec_per_gpu": 3532.4, "tokens/trainable": 13100899 }, { "epoch": 2.9968152866242037, "grad_norm": 0.134765625, "learning_rate": 2.0768316701711153e-05, "loss": 0.0038203117437660694, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00383, "step": 941, "tokens/total": 123224064, "tokens/train_per_sec_per_gpu": 3339.11, "tokens/trainable": 13115218 }, { "epoch": 3.0, "grad_norm": 0.2158203125, "learning_rate": 2.0713545535445857e-05, "loss": 0.005111368373036385, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 39.25, "memory/max_allocated (GiB)": 39.25, "ppl": 1.00512, "step": 942, "tokens/total": 123297792, "tokens/train_per_sec_per_gpu": 3851.53, "tokens/trainable": 13124028 }, { "epoch": 3.0, "eval_loss": 0.008717856369912624, "eval_ppl": 1.00876, "eval_runtime": 41.6707, "eval_samples_per_second": 64.818, "eval_steps_per_second": 4.056, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 942 }, { "epoch": 3.0031847133757963, "grad_norm": 0.111328125, "learning_rate": 2.0658795558326743e-05, "loss": 0.0027498805429786444, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00275, "step": 943, "tokens/total": 123428864, "tokens/train_per_sec_per_gpu": 3250.23, "tokens/trainable": 13137492 }, { "epoch": 3.0063694267515926, "grad_norm": 0.08642578125, "learning_rate": 2.0604067040998314e-05, "loss": 0.002591141266748309, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00259, "step": 944, "tokens/total": 123559936, "tokens/train_per_sec_per_gpu": 3658.64, "tokens/trainable": 13152727 }, { "epoch": 3.0095541401273884, "grad_norm": 0.11328125, "learning_rate": 2.054936025399897e-05, "loss": 0.0033186483196914196, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00332, "step": 945, "tokens/total": 123691008, "tokens/train_per_sec_per_gpu": 3830.57, "tokens/trainable": 13168699 }, { "epoch": 3.0127388535031847, "grad_norm": 0.1318359375, "learning_rate": 2.049467546775968e-05, "loss": 0.0039662388153374195, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00397, "step": 946, "tokens/total": 123822080, "tokens/train_per_sec_per_gpu": 3532.79, "tokens/trainable": 13183492 }, { "epoch": 3.015923566878981, "grad_norm": 0.10986328125, "learning_rate": 2.0440012952602706e-05, "loss": 0.003088605822995305, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00309, "step": 947, "tokens/total": 123953152, "tokens/train_per_sec_per_gpu": 3257.92, "tokens/trainable": 13197146 }, { "epoch": 3.0191082802547773, "grad_norm": 0.12890625, "learning_rate": 2.0385372978740167e-05, "loss": 0.0031338452827185392, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00314, "step": 948, "tokens/total": 124084224, "tokens/train_per_sec_per_gpu": 3231.23, "tokens/trainable": 13210673 }, { "epoch": 3.022292993630573, "grad_norm": 0.123046875, "learning_rate": 2.033075581627276e-05, "loss": 0.0032858422491699457, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00329, "step": 949, "tokens/total": 124215296, "tokens/train_per_sec_per_gpu": 3298.75, "tokens/trainable": 13224347 }, { "epoch": 3.0254777070063694, "grad_norm": 0.10205078125, "learning_rate": 2.0276161735188458e-05, "loss": 0.0026432094164192677, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00265, "step": 950, "tokens/total": 124346368, "tokens/train_per_sec_per_gpu": 3518.56, "tokens/trainable": 13238926 }, { "epoch": 3.0286624203821657, "grad_norm": 0.1279296875, "learning_rate": 2.0221591005361104e-05, "loss": 0.0035607037134468555, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00357, "step": 951, "tokens/total": 124477440, "tokens/train_per_sec_per_gpu": 3364.92, "tokens/trainable": 13252966 }, { "epoch": 3.031847133757962, "grad_norm": 0.140625, "learning_rate": 2.0167043896549097e-05, "loss": 0.004281069617718458, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00429, "step": 952, "tokens/total": 124608512, "tokens/train_per_sec_per_gpu": 3140.17, "tokens/trainable": 13266012 }, { "epoch": 3.035031847133758, "grad_norm": 0.140625, "learning_rate": 2.0112520678394107e-05, "loss": 0.003244205377995968, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00325, "step": 953, "tokens/total": 124739584, "tokens/train_per_sec_per_gpu": 3319.22, "tokens/trainable": 13279830 }, { "epoch": 3.038216560509554, "grad_norm": 0.1357421875, "learning_rate": 2.005802162041969e-05, "loss": 0.0033878230024129152, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00339, "step": 954, "tokens/total": 124870656, "tokens/train_per_sec_per_gpu": 3384.16, "tokens/trainable": 13293926 }, { "epoch": 3.0414012738853504, "grad_norm": 0.134765625, "learning_rate": 2.0003546992029953e-05, "loss": 0.002641953295096755, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00265, "step": 955, "tokens/total": 125001728, "tokens/train_per_sec_per_gpu": 2720.72, "tokens/trainable": 13305413 }, { "epoch": 3.0445859872611467, "grad_norm": 0.138671875, "learning_rate": 1.9949097062508267e-05, "loss": 0.003417475149035454, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00342, "step": 956, "tokens/total": 125132800, "tokens/train_per_sec_per_gpu": 3595.19, "tokens/trainable": 13320381 }, { "epoch": 3.0477707006369426, "grad_norm": 0.1064453125, "learning_rate": 1.9894672101015904e-05, "loss": 0.002634722040966153, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00264, "step": 957, "tokens/total": 125263872, "tokens/train_per_sec_per_gpu": 3285.4, "tokens/trainable": 13334096 }, { "epoch": 3.050955414012739, "grad_norm": 0.1669921875, "learning_rate": 1.9840272376590693e-05, "loss": 0.0045495470985770226, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00456, "step": 958, "tokens/total": 125394944, "tokens/train_per_sec_per_gpu": 3160.53, "tokens/trainable": 13347392 }, { "epoch": 3.054140127388535, "grad_norm": 0.126953125, "learning_rate": 1.9785898158145738e-05, "loss": 0.0035640057176351547, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00357, "step": 959, "tokens/total": 125526016, "tokens/train_per_sec_per_gpu": 3402.01, "tokens/trainable": 13361641 }, { "epoch": 3.0573248407643314, "grad_norm": 0.12890625, "learning_rate": 1.9731549714468045e-05, "loss": 0.003452250501140952, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00346, "step": 960, "tokens/total": 125657088, "tokens/train_per_sec_per_gpu": 3116.14, "tokens/trainable": 13374682 }, { "epoch": 3.0605095541401273, "grad_norm": 0.1220703125, "learning_rate": 1.9677227314217188e-05, "loss": 0.0024322110693901777, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00244, "step": 961, "tokens/total": 125788160, "tokens/train_per_sec_per_gpu": 2974.51, "tokens/trainable": 13387164 }, { "epoch": 3.0636942675159236, "grad_norm": 0.12451171875, "learning_rate": 1.962293122592405e-05, "loss": 0.00328466366045177, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00329, "step": 962, "tokens/total": 125919232, "tokens/train_per_sec_per_gpu": 3223.72, "tokens/trainable": 13400626 }, { "epoch": 3.06687898089172, "grad_norm": 0.1171875, "learning_rate": 1.9568661717989407e-05, "loss": 0.0021802615374326706, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00218, "step": 963, "tokens/total": 126050304, "tokens/train_per_sec_per_gpu": 3537.27, "tokens/trainable": 13415382 }, { "epoch": 3.070063694267516, "grad_norm": 0.150390625, "learning_rate": 1.951441905868264e-05, "loss": 0.003219526493921876, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00322, "step": 964, "tokens/total": 126181376, "tokens/train_per_sec_per_gpu": 3173.36, "tokens/trainable": 13428745 }, { "epoch": 3.073248407643312, "grad_norm": 0.130859375, "learning_rate": 1.9460203516140433e-05, "loss": 0.0025150931905955076, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00252, "step": 965, "tokens/total": 126312448, "tokens/train_per_sec_per_gpu": 3357.46, "tokens/trainable": 13442783 }, { "epoch": 3.0764331210191083, "grad_norm": 0.13671875, "learning_rate": 1.940601535836542e-05, "loss": 0.002752315253019333, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00276, "step": 966, "tokens/total": 126443520, "tokens/train_per_sec_per_gpu": 3434.62, "tokens/trainable": 13457099 }, { "epoch": 3.0796178343949046, "grad_norm": 0.11865234375, "learning_rate": 1.9351854853224837e-05, "loss": 0.002302248729392886, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0023, "step": 967, "tokens/total": 126574592, "tokens/train_per_sec_per_gpu": 3069.39, "tokens/trainable": 13470035 }, { "epoch": 3.082802547770701, "grad_norm": 0.138671875, "learning_rate": 1.9297722268449264e-05, "loss": 0.00326096941716969, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00327, "step": 968, "tokens/total": 126705664, "tokens/train_per_sec_per_gpu": 3547.61, "tokens/trainable": 13484891 }, { "epoch": 3.0859872611464967, "grad_norm": 0.14453125, "learning_rate": 1.9243617871631245e-05, "loss": 0.0029772731941193342, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00298, "step": 969, "tokens/total": 126836736, "tokens/train_per_sec_per_gpu": 3593.08, "tokens/trainable": 13499838 }, { "epoch": 3.089171974522293, "grad_norm": 0.12890625, "learning_rate": 1.9189541930223965e-05, "loss": 0.0024753999896347523, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00248, "step": 970, "tokens/total": 126967808, "tokens/train_per_sec_per_gpu": 3311.76, "tokens/trainable": 13513723 }, { "epoch": 3.0923566878980893, "grad_norm": 0.134765625, "learning_rate": 1.9135494711539975e-05, "loss": 0.003328888211399317, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00333, "step": 971, "tokens/total": 127098880, "tokens/train_per_sec_per_gpu": 3188.59, "tokens/trainable": 13527089 }, { "epoch": 3.0955414012738856, "grad_norm": 0.10693359375, "learning_rate": 1.9081476482749838e-05, "loss": 0.0020992374047636986, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0021, "step": 972, "tokens/total": 127229952, "tokens/train_per_sec_per_gpu": 3319.78, "tokens/trainable": 13540974 }, { "epoch": 3.0987261146496814, "grad_norm": 0.1474609375, "learning_rate": 1.902748751088078e-05, "loss": 0.0023126029409468174, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00232, "step": 973, "tokens/total": 127361024, "tokens/train_per_sec_per_gpu": 3187.82, "tokens/trainable": 13554328 }, { "epoch": 3.1019108280254777, "grad_norm": 0.11572265625, "learning_rate": 1.8973528062815452e-05, "loss": 0.001823435421101749, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00183, "step": 974, "tokens/total": 127492096, "tokens/train_per_sec_per_gpu": 2959.79, "tokens/trainable": 13566755 }, { "epoch": 3.105095541401274, "grad_norm": 0.134765625, "learning_rate": 1.8919598405290522e-05, "loss": 0.002975163981318474, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00298, "step": 975, "tokens/total": 127623168, "tokens/train_per_sec_per_gpu": 3605.57, "tokens/trainable": 13581801 }, { "epoch": 3.1082802547770703, "grad_norm": 0.1455078125, "learning_rate": 1.88656988048954e-05, "loss": 0.0033629476092755795, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00337, "step": 976, "tokens/total": 127754240, "tokens/train_per_sec_per_gpu": 3305.94, "tokens/trainable": 13595673 }, { "epoch": 3.111464968152866, "grad_norm": 0.11474609375, "learning_rate": 1.8811829528070935e-05, "loss": 0.0019825787749141455, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00198, "step": 977, "tokens/total": 127885312, "tokens/train_per_sec_per_gpu": 3321.78, "tokens/trainable": 13609562 }, { "epoch": 3.1146496815286624, "grad_norm": 0.134765625, "learning_rate": 1.8757990841108065e-05, "loss": 0.00240930519066751, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 978, "tokens/total": 128016384, "tokens/train_per_sec_per_gpu": 3254.37, "tokens/trainable": 13623198 }, { "epoch": 3.1178343949044587, "grad_norm": 0.146484375, "learning_rate": 1.87041830101465e-05, "loss": 0.0034942845813930035, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0035, "step": 979, "tokens/total": 128147456, "tokens/train_per_sec_per_gpu": 3162.68, "tokens/trainable": 13636512 }, { "epoch": 3.121019108280255, "grad_norm": 0.1494140625, "learning_rate": 1.8650406301173447e-05, "loss": 0.0034091034904122353, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00341, "step": 980, "tokens/total": 128278528, "tokens/train_per_sec_per_gpu": 3456.01, "tokens/trainable": 13650976 }, { "epoch": 3.124203821656051, "grad_norm": 0.15625, "learning_rate": 1.8596660980022258e-05, "loss": 0.0025934309232980013, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0026, "step": 981, "tokens/total": 128409600, "tokens/train_per_sec_per_gpu": 3098.66, "tokens/trainable": 13663952 }, { "epoch": 3.127388535031847, "grad_norm": 0.10595703125, "learning_rate": 1.8542947312371108e-05, "loss": 0.0022293792571872473, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 982, "tokens/total": 128540672, "tokens/train_per_sec_per_gpu": 3308.14, "tokens/trainable": 13677809 }, { "epoch": 3.1305732484076434, "grad_norm": 0.220703125, "learning_rate": 1.8489265563741725e-05, "loss": 0.0036684228107333183, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00368, "step": 983, "tokens/total": 128671744, "tokens/train_per_sec_per_gpu": 2671.82, "tokens/trainable": 13689205 }, { "epoch": 3.1337579617834397, "grad_norm": 0.146484375, "learning_rate": 1.8435615999498045e-05, "loss": 0.003023945726454258, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00303, "step": 984, "tokens/total": 128802816, "tokens/train_per_sec_per_gpu": 3353.12, "tokens/trainable": 13703248 }, { "epoch": 3.1369426751592355, "grad_norm": 0.1357421875, "learning_rate": 1.8381998884844914e-05, "loss": 0.0030851985793560743, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00309, "step": 985, "tokens/total": 128933888, "tokens/train_per_sec_per_gpu": 3504.98, "tokens/trainable": 13717913 }, { "epoch": 3.140127388535032, "grad_norm": 0.1416015625, "learning_rate": 1.8328414484826745e-05, "loss": 0.002863124944269657, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00287, "step": 986, "tokens/total": 129064960, "tokens/train_per_sec_per_gpu": 3309.73, "tokens/trainable": 13731778 }, { "epoch": 3.143312101910828, "grad_norm": 0.138671875, "learning_rate": 1.8274863064326253e-05, "loss": 0.0033043615985661745, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00331, "step": 987, "tokens/total": 129196032, "tokens/train_per_sec_per_gpu": 3489.76, "tokens/trainable": 13746393 }, { "epoch": 3.1464968152866244, "grad_norm": 0.1669921875, "learning_rate": 1.822134488806314e-05, "loss": 0.003721470246091485, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00373, "step": 988, "tokens/total": 129327104, "tokens/train_per_sec_per_gpu": 3260.94, "tokens/trainable": 13760070 }, { "epoch": 3.1496815286624202, "grad_norm": 0.107421875, "learning_rate": 1.8167860220592736e-05, "loss": 0.002208119258284569, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00221, "step": 989, "tokens/total": 129458176, "tokens/train_per_sec_per_gpu": 3466.56, "tokens/trainable": 13774565 }, { "epoch": 3.1528662420382165, "grad_norm": 0.1591796875, "learning_rate": 1.8114409326304754e-05, "loss": 0.0030963195022195578, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 990, "tokens/total": 129589248, "tokens/train_per_sec_per_gpu": 3297.94, "tokens/trainable": 13788405 }, { "epoch": 3.156050955414013, "grad_norm": 0.146484375, "learning_rate": 1.806099246942196e-05, "loss": 0.0031601302325725555, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00317, "step": 991, "tokens/total": 129720320, "tokens/train_per_sec_per_gpu": 3314.16, "tokens/trainable": 13802332 }, { "epoch": 3.159235668789809, "grad_norm": 0.1650390625, "learning_rate": 1.800760991399884e-05, "loss": 0.003068044548854232, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00307, "step": 992, "tokens/total": 129851392, "tokens/train_per_sec_per_gpu": 3131.59, "tokens/trainable": 13815450 }, { "epoch": 3.162420382165605, "grad_norm": 0.142578125, "learning_rate": 1.7954261923920335e-05, "loss": 0.003088792786002159, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00309, "step": 993, "tokens/total": 129982464, "tokens/train_per_sec_per_gpu": 3446.24, "tokens/trainable": 13829844 }, { "epoch": 3.1656050955414012, "grad_norm": 0.1259765625, "learning_rate": 1.7900948762900527e-05, "loss": 0.002409819047898054, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 994, "tokens/total": 130113536, "tokens/train_per_sec_per_gpu": 3166.51, "tokens/trainable": 13843168 }, { "epoch": 3.1687898089171975, "grad_norm": 0.1669921875, "learning_rate": 1.7847670694481307e-05, "loss": 0.004092029761523008, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0041, "step": 995, "tokens/total": 130244608, "tokens/train_per_sec_per_gpu": 3606.98, "tokens/trainable": 13858161 }, { "epoch": 3.171974522292994, "grad_norm": 0.11865234375, "learning_rate": 1.7794427982031104e-05, "loss": 0.001977186882868409, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00198, "step": 996, "tokens/total": 130375680, "tokens/train_per_sec_per_gpu": 3153.56, "tokens/trainable": 13871441 }, { "epoch": 3.1751592356687897, "grad_norm": 0.1728515625, "learning_rate": 1.7741220888743587e-05, "loss": 0.0029397865291684866, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00294, "step": 997, "tokens/total": 130506752, "tokens/train_per_sec_per_gpu": 3096.09, "tokens/trainable": 13884512 }, { "epoch": 3.178343949044586, "grad_norm": 0.1416015625, "learning_rate": 1.768804967763632e-05, "loss": 0.0025828841608017683, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00259, "step": 998, "tokens/total": 130637824, "tokens/train_per_sec_per_gpu": 3237.57, "tokens/trainable": 13898147 }, { "epoch": 3.1815286624203822, "grad_norm": 0.1455078125, "learning_rate": 1.763491461154951e-05, "loss": 0.002550513716414571, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00255, "step": 999, "tokens/total": 130768896, "tokens/train_per_sec_per_gpu": 3248.32, "tokens/trainable": 13911818 }, { "epoch": 3.1847133757961785, "grad_norm": 0.1220703125, "learning_rate": 1.7581815953144694e-05, "loss": 0.0023207683116197586, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00232, "step": 1000, "tokens/total": 130899968, "tokens/train_per_sec_per_gpu": 3189.52, "tokens/trainable": 13925184 }, { "epoch": 3.1878980891719744, "grad_norm": 0.14453125, "learning_rate": 1.7528753964903422e-05, "loss": 0.0033754960168153048, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00338, "step": 1001, "tokens/total": 131031040, "tokens/train_per_sec_per_gpu": 3425.19, "tokens/trainable": 13939522 }, { "epoch": 3.1910828025477707, "grad_norm": 0.11767578125, "learning_rate": 1.7475728909125967e-05, "loss": 0.0025386540219187737, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00254, "step": 1002, "tokens/total": 131162112, "tokens/train_per_sec_per_gpu": 3600.39, "tokens/trainable": 13954592 }, { "epoch": 3.194267515923567, "grad_norm": 0.10888671875, "learning_rate": 1.7422741047930075e-05, "loss": 0.00221554609015584, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00222, "step": 1003, "tokens/total": 131293184, "tokens/train_per_sec_per_gpu": 3073.34, "tokens/trainable": 13967458 }, { "epoch": 3.1974522292993632, "grad_norm": 0.1572265625, "learning_rate": 1.7369790643249573e-05, "loss": 0.0035816675517708063, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00359, "step": 1004, "tokens/total": 131424256, "tokens/train_per_sec_per_gpu": 3426.39, "tokens/trainable": 13981803 }, { "epoch": 3.200636942675159, "grad_norm": 0.15234375, "learning_rate": 1.731687795683316e-05, "loss": 0.0033436615485697985, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00335, "step": 1005, "tokens/total": 131555328, "tokens/train_per_sec_per_gpu": 3313.83, "tokens/trainable": 13995695 }, { "epoch": 3.2038216560509554, "grad_norm": 0.1416015625, "learning_rate": 1.7264003250243102e-05, "loss": 0.002780565060675144, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00278, "step": 1006, "tokens/total": 131686400, "tokens/train_per_sec_per_gpu": 3199.71, "tokens/trainable": 14009116 }, { "epoch": 3.2070063694267517, "grad_norm": 0.1552734375, "learning_rate": 1.7211166784853874e-05, "loss": 0.003775578923523426, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00378, "step": 1007, "tokens/total": 131817472, "tokens/train_per_sec_per_gpu": 3328.07, "tokens/trainable": 14023153 }, { "epoch": 3.210191082802548, "grad_norm": 0.1201171875, "learning_rate": 1.715836882185094e-05, "loss": 0.0018264808459207416, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00183, "step": 1008, "tokens/total": 131948544, "tokens/train_per_sec_per_gpu": 2972.58, "tokens/trainable": 14035645 }, { "epoch": 3.213375796178344, "grad_norm": 0.1171875, "learning_rate": 1.710560962222945e-05, "loss": 0.0018301783129572868, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00183, "step": 1009, "tokens/total": 132079616, "tokens/train_per_sec_per_gpu": 3211.08, "tokens/trainable": 14049113 }, { "epoch": 3.21656050955414, "grad_norm": 0.11328125, "learning_rate": 1.705288944679291e-05, "loss": 0.002403366146609187, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 1010, "tokens/total": 132210688, "tokens/train_per_sec_per_gpu": 3364.87, "tokens/trainable": 14063206 }, { "epoch": 3.2197452229299364, "grad_norm": 0.1640625, "learning_rate": 1.7000208556151915e-05, "loss": 0.00280455662868917, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00281, "step": 1011, "tokens/total": 132341760, "tokens/train_per_sec_per_gpu": 3264.32, "tokens/trainable": 14076868 }, { "epoch": 3.2229299363057327, "grad_norm": 0.1513671875, "learning_rate": 1.6947567210722905e-05, "loss": 0.0029342826455831528, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00294, "step": 1012, "tokens/total": 132472832, "tokens/train_per_sec_per_gpu": 3305.91, "tokens/trainable": 14090726 }, { "epoch": 3.2261146496815285, "grad_norm": 0.1875, "learning_rate": 1.689496567072678e-05, "loss": 0.0028477348387241364, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00285, "step": 1013, "tokens/total": 132603904, "tokens/train_per_sec_per_gpu": 3194.5, "tokens/trainable": 14104098 }, { "epoch": 3.229299363057325, "grad_norm": 0.15625, "learning_rate": 1.6842404196187715e-05, "loss": 0.002830425277352333, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00283, "step": 1014, "tokens/total": 132734976, "tokens/train_per_sec_per_gpu": 3606.8, "tokens/trainable": 14119202 }, { "epoch": 3.232484076433121, "grad_norm": 0.12451171875, "learning_rate": 1.678988304693183e-05, "loss": 0.002606867579743266, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00261, "step": 1015, "tokens/total": 132866048, "tokens/train_per_sec_per_gpu": 3574.5, "tokens/trainable": 14134144 }, { "epoch": 3.2356687898089174, "grad_norm": 0.1484375, "learning_rate": 1.6737402482585863e-05, "loss": 0.0034160753712058067, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00342, "step": 1016, "tokens/total": 132997120, "tokens/train_per_sec_per_gpu": 3134.2, "tokens/trainable": 14147367 }, { "epoch": 3.238853503184713, "grad_norm": 0.12060546875, "learning_rate": 1.6684962762575966e-05, "loss": 0.0016203324776142836, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00162, "step": 1017, "tokens/total": 133128192, "tokens/train_per_sec_per_gpu": 3101.12, "tokens/trainable": 14160359 }, { "epoch": 3.2420382165605095, "grad_norm": 0.1611328125, "learning_rate": 1.663256414612639e-05, "loss": 0.0028734614606946707, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00288, "step": 1018, "tokens/total": 133259264, "tokens/train_per_sec_per_gpu": 2813.19, "tokens/trainable": 14172273 }, { "epoch": 3.245222929936306, "grad_norm": 0.1630859375, "learning_rate": 1.658020689225817e-05, "loss": 0.0035582587588578463, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00356, "step": 1019, "tokens/total": 133390336, "tokens/train_per_sec_per_gpu": 3006.37, "tokens/trainable": 14184925 }, { "epoch": 3.248407643312102, "grad_norm": 0.16796875, "learning_rate": 1.6527891259787895e-05, "loss": 0.0026477861683815718, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00265, "step": 1020, "tokens/total": 133521408, "tokens/train_per_sec_per_gpu": 3004.12, "tokens/trainable": 14197554 }, { "epoch": 3.251592356687898, "grad_norm": 0.15234375, "learning_rate": 1.6475617507326418e-05, "loss": 0.0031140560749918222, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00312, "step": 1021, "tokens/total": 133652480, "tokens/train_per_sec_per_gpu": 3175.24, "tokens/trainable": 14210893 }, { "epoch": 3.254777070063694, "grad_norm": 0.1611328125, "learning_rate": 1.6423385893277536e-05, "loss": 0.003689323551952839, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0037, "step": 1022, "tokens/total": 133783552, "tokens/train_per_sec_per_gpu": 3444.39, "tokens/trainable": 14225297 }, { "epoch": 3.2579617834394905, "grad_norm": 0.13671875, "learning_rate": 1.6371196675836763e-05, "loss": 0.0028125548269599676, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00282, "step": 1023, "tokens/total": 133914624, "tokens/train_per_sec_per_gpu": 3577.78, "tokens/trainable": 14240285 }, { "epoch": 3.261146496815287, "grad_norm": 0.1513671875, "learning_rate": 1.631905011299005e-05, "loss": 0.003101219655945897, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00311, "step": 1024, "tokens/total": 134045696, "tokens/train_per_sec_per_gpu": 3314.34, "tokens/trainable": 14254160 }, { "epoch": 3.2643312101910826, "grad_norm": 0.1962890625, "learning_rate": 1.6266946462512455e-05, "loss": 0.002571912482380867, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00258, "step": 1025, "tokens/total": 134176768, "tokens/train_per_sec_per_gpu": 3129.65, "tokens/trainable": 14267272 }, { "epoch": 3.267515923566879, "grad_norm": 0.126953125, "learning_rate": 1.6214885981966937e-05, "loss": 0.002030417090281844, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00203, "step": 1026, "tokens/total": 134307840, "tokens/train_per_sec_per_gpu": 3312.29, "tokens/trainable": 14281152 }, { "epoch": 3.270700636942675, "grad_norm": 0.142578125, "learning_rate": 1.6162868928703057e-05, "loss": 0.0021212187130004168, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00212, "step": 1027, "tokens/total": 134438912, "tokens/train_per_sec_per_gpu": 3278.18, "tokens/trainable": 14294941 }, { "epoch": 3.2738853503184715, "grad_norm": 0.1337890625, "learning_rate": 1.6110895559855684e-05, "loss": 0.0034488090313971043, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00345, "step": 1028, "tokens/total": 134569984, "tokens/train_per_sec_per_gpu": 3722.82, "tokens/trainable": 14310525 }, { "epoch": 3.2770700636942673, "grad_norm": 0.138671875, "learning_rate": 1.605896613234375e-05, "loss": 0.002809841651469469, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00281, "step": 1029, "tokens/total": 134701056, "tokens/train_per_sec_per_gpu": 3356.33, "tokens/trainable": 14324590 }, { "epoch": 3.2802547770700636, "grad_norm": 0.1572265625, "learning_rate": 1.6007080902868986e-05, "loss": 0.003251892514526844, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00326, "step": 1030, "tokens/total": 134832128, "tokens/train_per_sec_per_gpu": 3390.28, "tokens/trainable": 14338793 }, { "epoch": 3.28343949044586, "grad_norm": 0.1591796875, "learning_rate": 1.5955240127914618e-05, "loss": 0.003499697893857956, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00351, "step": 1031, "tokens/total": 134963200, "tokens/train_per_sec_per_gpu": 3280.67, "tokens/trainable": 14352526 }, { "epoch": 3.286624203821656, "grad_norm": 0.126953125, "learning_rate": 1.5903444063744126e-05, "loss": 0.0027691691648215055, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00277, "step": 1032, "tokens/total": 135094272, "tokens/train_per_sec_per_gpu": 3269.79, "tokens/trainable": 14366213 }, { "epoch": 3.289808917197452, "grad_norm": 0.1640625, "learning_rate": 1.5851692966399996e-05, "loss": 0.004021272994577885, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00403, "step": 1033, "tokens/total": 135225344, "tokens/train_per_sec_per_gpu": 3501.62, "tokens/trainable": 14380810 }, { "epoch": 3.2929936305732483, "grad_norm": 0.1484375, "learning_rate": 1.579998709170239e-05, "loss": 0.003093718783929944, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1034, "tokens/total": 135356416, "tokens/train_per_sec_per_gpu": 3052.87, "tokens/trainable": 14393602 }, { "epoch": 3.2961783439490446, "grad_norm": 0.1533203125, "learning_rate": 1.5748326695247957e-05, "loss": 0.003595340298488736, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0036, "step": 1035, "tokens/total": 135487488, "tokens/train_per_sec_per_gpu": 3610.85, "tokens/trainable": 14408657 }, { "epoch": 3.299363057324841, "grad_norm": 0.17578125, "learning_rate": 1.569671203240852e-05, "loss": 0.0037980927154421806, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00381, "step": 1036, "tokens/total": 135618560, "tokens/train_per_sec_per_gpu": 3399.48, "tokens/trainable": 14422876 }, { "epoch": 3.3025477707006368, "grad_norm": 0.1796875, "learning_rate": 1.5645143358329815e-05, "loss": 0.003825873602181673, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00383, "step": 1037, "tokens/total": 135749632, "tokens/train_per_sec_per_gpu": 3345.54, "tokens/trainable": 14436870 }, { "epoch": 3.305732484076433, "grad_norm": 0.12255859375, "learning_rate": 1.559362092793027e-05, "loss": 0.002097800839692354, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0021, "step": 1038, "tokens/total": 135880704, "tokens/train_per_sec_per_gpu": 3530.34, "tokens/trainable": 14451577 }, { "epoch": 3.3089171974522293, "grad_norm": 0.1572265625, "learning_rate": 1.5542144995899698e-05, "loss": 0.003578023286536336, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00358, "step": 1039, "tokens/total": 136011776, "tokens/train_per_sec_per_gpu": 3208.09, "tokens/trainable": 14465046 }, { "epoch": 3.3121019108280256, "grad_norm": 0.1376953125, "learning_rate": 1.5490715816698077e-05, "loss": 0.002384308958426118, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00239, "step": 1040, "tokens/total": 136142848, "tokens/train_per_sec_per_gpu": 3313.93, "tokens/trainable": 14478889 }, { "epoch": 3.3152866242038215, "grad_norm": 0.1396484375, "learning_rate": 1.5439333644554227e-05, "loss": 0.0023124567233026028, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00232, "step": 1041, "tokens/total": 136273920, "tokens/train_per_sec_per_gpu": 3490.61, "tokens/trainable": 14493436 }, { "epoch": 3.3184713375796178, "grad_norm": 0.1640625, "learning_rate": 1.538799873346466e-05, "loss": 0.004312054719775915, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00432, "step": 1042, "tokens/total": 136404992, "tokens/train_per_sec_per_gpu": 3468.79, "tokens/trainable": 14508009 }, { "epoch": 3.321656050955414, "grad_norm": 0.1611328125, "learning_rate": 1.5336711337192227e-05, "loss": 0.0034810621291399, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00349, "step": 1043, "tokens/total": 136536064, "tokens/train_per_sec_per_gpu": 3681.02, "tokens/trainable": 14523389 }, { "epoch": 3.3248407643312103, "grad_norm": 0.1201171875, "learning_rate": 1.5285471709264897e-05, "loss": 0.0020460544619709253, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1044, "tokens/total": 136667136, "tokens/train_per_sec_per_gpu": 3329.86, "tokens/trainable": 14537340 }, { "epoch": 3.328025477707006, "grad_norm": 0.1455078125, "learning_rate": 1.5234280102974525e-05, "loss": 0.003296096809208393, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0033, "step": 1045, "tokens/total": 136798208, "tokens/train_per_sec_per_gpu": 3446.83, "tokens/trainable": 14551699 }, { "epoch": 3.3312101910828025, "grad_norm": 0.1328125, "learning_rate": 1.5183136771375579e-05, "loss": 0.0019932978320866823, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.002, "step": 1046, "tokens/total": 136929280, "tokens/train_per_sec_per_gpu": 3210.34, "tokens/trainable": 14565142 }, { "epoch": 3.3343949044585988, "grad_norm": 0.1376953125, "learning_rate": 1.5132041967283866e-05, "loss": 0.001847305684350431, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00185, "step": 1047, "tokens/total": 137060352, "tokens/train_per_sec_per_gpu": 3505.73, "tokens/trainable": 14579823 }, { "epoch": 3.337579617834395, "grad_norm": 0.1474609375, "learning_rate": 1.5080995943275348e-05, "loss": 0.00248389202170074, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00249, "step": 1048, "tokens/total": 137191424, "tokens/train_per_sec_per_gpu": 3588.55, "tokens/trainable": 14594782 }, { "epoch": 3.340764331210191, "grad_norm": 0.18359375, "learning_rate": 1.5029998951684828e-05, "loss": 0.00269156857393682, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0027, "step": 1049, "tokens/total": 137322496, "tokens/train_per_sec_per_gpu": 3464.55, "tokens/trainable": 14609308 }, { "epoch": 3.343949044585987, "grad_norm": 0.173828125, "learning_rate": 1.4979051244604722e-05, "loss": 0.003072477411478758, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00308, "step": 1050, "tokens/total": 137453568, "tokens/train_per_sec_per_gpu": 3052.82, "tokens/trainable": 14622170 }, { "epoch": 3.3471337579617835, "grad_norm": 0.1767578125, "learning_rate": 1.4928153073883843e-05, "loss": 0.003987753763794899, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.004, "step": 1051, "tokens/total": 137584640, "tokens/train_per_sec_per_gpu": 3233.94, "tokens/trainable": 14635795 }, { "epoch": 3.3503184713375798, "grad_norm": 0.130859375, "learning_rate": 1.4877304691126123e-05, "loss": 0.0029561547562479973, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00296, "step": 1052, "tokens/total": 137715712, "tokens/train_per_sec_per_gpu": 3268.66, "tokens/trainable": 14649498 }, { "epoch": 3.3535031847133756, "grad_norm": 0.150390625, "learning_rate": 1.4826506347689353e-05, "loss": 0.0022640160750597715, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00227, "step": 1053, "tokens/total": 137846784, "tokens/train_per_sec_per_gpu": 3172.82, "tokens/trainable": 14662788 }, { "epoch": 3.356687898089172, "grad_norm": 0.181640625, "learning_rate": 1.4775758294684006e-05, "loss": 0.0038375440053641796, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00384, "step": 1054, "tokens/total": 137977856, "tokens/train_per_sec_per_gpu": 3000.65, "tokens/trainable": 14675379 }, { "epoch": 3.359872611464968, "grad_norm": 0.1630859375, "learning_rate": 1.4725060782971933e-05, "loss": 0.0024567164946347475, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00246, "step": 1055, "tokens/total": 138108928, "tokens/train_per_sec_per_gpu": 3533.93, "tokens/trainable": 14690140 }, { "epoch": 3.3630573248407645, "grad_norm": 0.10205078125, "learning_rate": 1.4674414063165137e-05, "loss": 0.0013129838043823838, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00131, "step": 1056, "tokens/total": 138240000, "tokens/train_per_sec_per_gpu": 3290.43, "tokens/trainable": 14703961 }, { "epoch": 3.3662420382165603, "grad_norm": 0.1748046875, "learning_rate": 1.4623818385624566e-05, "loss": 0.003262344980612397, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00327, "step": 1057, "tokens/total": 138371072, "tokens/train_per_sec_per_gpu": 3399.17, "tokens/trainable": 14718152 }, { "epoch": 3.3694267515923566, "grad_norm": 0.1767578125, "learning_rate": 1.457327400045884e-05, "loss": 0.0037125989329069853, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00372, "step": 1058, "tokens/total": 138502144, "tokens/train_per_sec_per_gpu": 3413.99, "tokens/trainable": 14732369 }, { "epoch": 3.372611464968153, "grad_norm": 0.171875, "learning_rate": 1.4522781157523008e-05, "loss": 0.003059735056012869, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00306, "step": 1059, "tokens/total": 138633216, "tokens/train_per_sec_per_gpu": 3170.31, "tokens/trainable": 14745664 }, { "epoch": 3.375796178343949, "grad_norm": 0.16796875, "learning_rate": 1.4472340106417375e-05, "loss": 0.0033829023595899343, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00339, "step": 1060, "tokens/total": 138764288, "tokens/train_per_sec_per_gpu": 3121.95, "tokens/trainable": 14758786 }, { "epoch": 3.3789808917197455, "grad_norm": 0.1220703125, "learning_rate": 1.4421951096486171e-05, "loss": 0.0024168547242879868, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00242, "step": 1061, "tokens/total": 138895360, "tokens/train_per_sec_per_gpu": 3410.32, "tokens/trainable": 14773023 }, { "epoch": 3.3821656050955413, "grad_norm": 0.1728515625, "learning_rate": 1.4371614376816416e-05, "loss": 0.0038187310565263033, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00383, "step": 1062, "tokens/total": 139026432, "tokens/train_per_sec_per_gpu": 3277.49, "tokens/trainable": 14786713 }, { "epoch": 3.3853503184713376, "grad_norm": 0.130859375, "learning_rate": 1.4321330196236638e-05, "loss": 0.002092313254252076, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00209, "step": 1063, "tokens/total": 139157504, "tokens/train_per_sec_per_gpu": 3363.98, "tokens/trainable": 14800746 }, { "epoch": 3.388535031847134, "grad_norm": 0.16015625, "learning_rate": 1.4271098803315624e-05, "loss": 0.0034465331118553877, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00345, "step": 1064, "tokens/total": 139288576, "tokens/train_per_sec_per_gpu": 3553.56, "tokens/trainable": 14815617 }, { "epoch": 3.3917197452229297, "grad_norm": 0.1728515625, "learning_rate": 1.4220920446361224e-05, "loss": 0.003886766964569688, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00389, "step": 1065, "tokens/total": 139419648, "tokens/train_per_sec_per_gpu": 3092.87, "tokens/trainable": 14828591 }, { "epoch": 3.394904458598726, "grad_norm": 0.1376953125, "learning_rate": 1.4170795373419148e-05, "loss": 0.0024511385709047318, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00245, "step": 1066, "tokens/total": 139550720, "tokens/train_per_sec_per_gpu": 3130.21, "tokens/trainable": 14841695 }, { "epoch": 3.3980891719745223, "grad_norm": 0.1748046875, "learning_rate": 1.4120723832271665e-05, "loss": 0.0035048723220825195, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00351, "step": 1067, "tokens/total": 139681792, "tokens/train_per_sec_per_gpu": 3774.26, "tokens/trainable": 14857394 }, { "epoch": 3.4012738853503186, "grad_norm": 0.154296875, "learning_rate": 1.4070706070436446e-05, "loss": 0.0028158228378742933, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00282, "step": 1068, "tokens/total": 139812864, "tokens/train_per_sec_per_gpu": 3417.94, "tokens/trainable": 14871671 }, { "epoch": 3.404458598726115, "grad_norm": 0.1669921875, "learning_rate": 1.4020742335165326e-05, "loss": 0.003797327633947134, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0038, "step": 1069, "tokens/total": 139943936, "tokens/train_per_sec_per_gpu": 3477.81, "tokens/trainable": 14886204 }, { "epoch": 3.4076433121019107, "grad_norm": 0.11474609375, "learning_rate": 1.3970832873443043e-05, "loss": 0.0019341235747560859, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00194, "step": 1070, "tokens/total": 140075008, "tokens/train_per_sec_per_gpu": 3491.7, "tokens/trainable": 14900766 }, { "epoch": 3.410828025477707, "grad_norm": 0.1533203125, "learning_rate": 1.392097793198605e-05, "loss": 0.0030175955034792423, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00302, "step": 1071, "tokens/total": 140206080, "tokens/train_per_sec_per_gpu": 3393.54, "tokens/trainable": 14914981 }, { "epoch": 3.4140127388535033, "grad_norm": 0.12255859375, "learning_rate": 1.3871177757241326e-05, "loss": 0.001799887279048562, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0018, "step": 1072, "tokens/total": 140337152, "tokens/train_per_sec_per_gpu": 3339.98, "tokens/trainable": 14928954 }, { "epoch": 3.417197452229299, "grad_norm": 0.1396484375, "learning_rate": 1.382143259538507e-05, "loss": 0.001962024951353669, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00196, "step": 1073, "tokens/total": 140468224, "tokens/train_per_sec_per_gpu": 3376.61, "tokens/trainable": 14943033 }, { "epoch": 3.4203821656050954, "grad_norm": 0.16015625, "learning_rate": 1.3771742692321574e-05, "loss": 0.0027512316592037678, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00276, "step": 1074, "tokens/total": 140599296, "tokens/train_per_sec_per_gpu": 3139.25, "tokens/trainable": 14956205 }, { "epoch": 3.4235668789808917, "grad_norm": 0.15625, "learning_rate": 1.3722108293681973e-05, "loss": 0.0029566381126642227, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00296, "step": 1075, "tokens/total": 140730368, "tokens/train_per_sec_per_gpu": 3445.88, "tokens/trainable": 14970584 }, { "epoch": 3.426751592356688, "grad_norm": 0.1640625, "learning_rate": 1.3672529644823004e-05, "loss": 0.0029452519956976175, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00295, "step": 1076, "tokens/total": 140861440, "tokens/train_per_sec_per_gpu": 3354.99, "tokens/trainable": 14984596 }, { "epoch": 3.4299363057324843, "grad_norm": 0.10693359375, "learning_rate": 1.362300699082582e-05, "loss": 0.0017804743256419897, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00178, "step": 1077, "tokens/total": 140992512, "tokens/train_per_sec_per_gpu": 3354.98, "tokens/trainable": 14998636 }, { "epoch": 3.43312101910828, "grad_norm": 0.2001953125, "learning_rate": 1.35735405764948e-05, "loss": 0.003846959676593542, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00385, "step": 1078, "tokens/total": 141123584, "tokens/train_per_sec_per_gpu": 3226.73, "tokens/trainable": 15012165 }, { "epoch": 3.4363057324840764, "grad_norm": 0.166015625, "learning_rate": 1.3524130646356283e-05, "loss": 0.0025776573456823826, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00258, "step": 1079, "tokens/total": 141254656, "tokens/train_per_sec_per_gpu": 3226.06, "tokens/trainable": 15025665 }, { "epoch": 3.4394904458598727, "grad_norm": 0.1630859375, "learning_rate": 1.3474777444657415e-05, "loss": 0.0029838993214070797, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00299, "step": 1080, "tokens/total": 141385728, "tokens/train_per_sec_per_gpu": 3689.69, "tokens/trainable": 15041028 }, { "epoch": 3.4426751592356686, "grad_norm": 0.14453125, "learning_rate": 1.3425481215364922e-05, "loss": 0.0022048731334507465, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00221, "step": 1081, "tokens/total": 141516800, "tokens/train_per_sec_per_gpu": 3238.87, "tokens/trainable": 15054618 }, { "epoch": 3.445859872611465, "grad_norm": 0.185546875, "learning_rate": 1.3376242202163868e-05, "loss": 0.004590876400470734, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0046, "step": 1082, "tokens/total": 141647872, "tokens/train_per_sec_per_gpu": 3402.12, "tokens/trainable": 15068791 }, { "epoch": 3.449044585987261, "grad_norm": 0.15625, "learning_rate": 1.3327060648456502e-05, "loss": 0.0026096594519913197, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00261, "step": 1083, "tokens/total": 141778944, "tokens/train_per_sec_per_gpu": 3599.47, "tokens/trainable": 15083794 }, { "epoch": 3.4522292993630574, "grad_norm": 0.1162109375, "learning_rate": 1.3277936797361043e-05, "loss": 0.0020494635682553053, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1084, "tokens/total": 141910016, "tokens/train_per_sec_per_gpu": 3307.62, "tokens/trainable": 15097640 }, { "epoch": 3.4554140127388537, "grad_norm": 0.1552734375, "learning_rate": 1.3228870891710443e-05, "loss": 0.003234599716961384, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00324, "step": 1085, "tokens/total": 142041088, "tokens/train_per_sec_per_gpu": 3210.5, "tokens/trainable": 15111127 }, { "epoch": 3.4585987261146496, "grad_norm": 0.14453125, "learning_rate": 1.3179863174051238e-05, "loss": 0.002322172513231635, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00232, "step": 1086, "tokens/total": 142172160, "tokens/train_per_sec_per_gpu": 3065.87, "tokens/trainable": 15123986 }, { "epoch": 3.461783439490446, "grad_norm": 0.16796875, "learning_rate": 1.3130913886642333e-05, "loss": 0.003022089833393693, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00303, "step": 1087, "tokens/total": 142303232, "tokens/train_per_sec_per_gpu": 3611.73, "tokens/trainable": 15139047 }, { "epoch": 3.464968152866242, "grad_norm": 0.1396484375, "learning_rate": 1.3082023271453759e-05, "loss": 0.0020968448370695114, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0021, "step": 1088, "tokens/total": 142434304, "tokens/train_per_sec_per_gpu": 3221.7, "tokens/trainable": 15152542 }, { "epoch": 3.468152866242038, "grad_norm": 0.171875, "learning_rate": 1.3033191570165532e-05, "loss": 0.00432826392352581, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00434, "step": 1089, "tokens/total": 142565376, "tokens/train_per_sec_per_gpu": 3192.76, "tokens/trainable": 15165913 }, { "epoch": 3.4713375796178343, "grad_norm": 0.1142578125, "learning_rate": 1.298441902416646e-05, "loss": 0.0018635153537616134, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00187, "step": 1090, "tokens/total": 142696448, "tokens/train_per_sec_per_gpu": 3635.08, "tokens/trainable": 15181017 }, { "epoch": 3.4745222929936306, "grad_norm": 0.1806640625, "learning_rate": 1.2935705874552894e-05, "loss": 0.0037171547301113605, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00372, "step": 1091, "tokens/total": 142827520, "tokens/train_per_sec_per_gpu": 3549.32, "tokens/trainable": 15195900 }, { "epoch": 3.477707006369427, "grad_norm": 0.154296875, "learning_rate": 1.2887052362127594e-05, "loss": 0.0025141574442386627, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00252, "step": 1092, "tokens/total": 142958592, "tokens/train_per_sec_per_gpu": 3427.88, "tokens/trainable": 15210182 }, { "epoch": 3.480891719745223, "grad_norm": 0.1630859375, "learning_rate": 1.2838458727398531e-05, "loss": 0.0030665546655654907, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00307, "step": 1093, "tokens/total": 143089664, "tokens/train_per_sec_per_gpu": 4042.03, "tokens/trainable": 15226897 }, { "epoch": 3.484076433121019, "grad_norm": 0.12890625, "learning_rate": 1.2789925210577647e-05, "loss": 0.0020227362401783466, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00202, "step": 1094, "tokens/total": 143220736, "tokens/train_per_sec_per_gpu": 3736.82, "tokens/trainable": 15242382 }, { "epoch": 3.4872611464968153, "grad_norm": 0.158203125, "learning_rate": 1.274145205157972e-05, "loss": 0.0027202137280255556, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00272, "step": 1095, "tokens/total": 143351808, "tokens/train_per_sec_per_gpu": 3200.5, "tokens/trainable": 15255782 }, { "epoch": 3.4904458598726116, "grad_norm": 0.1708984375, "learning_rate": 1.269303949002118e-05, "loss": 0.0031496393494307995, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00315, "step": 1096, "tokens/total": 143482880, "tokens/train_per_sec_per_gpu": 3206.14, "tokens/trainable": 15269719 }, { "epoch": 3.4936305732484074, "grad_norm": 0.1748046875, "learning_rate": 1.2644687765218874e-05, "loss": 0.0028139406349509954, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00282, "step": 1097, "tokens/total": 143613952, "tokens/train_per_sec_per_gpu": 3399.76, "tokens/trainable": 15283962 }, { "epoch": 3.4968152866242037, "grad_norm": 0.1767578125, "learning_rate": 1.2596397116188946e-05, "loss": 0.0032941231038421392, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0033, "step": 1098, "tokens/total": 143745024, "tokens/train_per_sec_per_gpu": 3149.95, "tokens/trainable": 15297099 }, { "epoch": 3.5, "grad_norm": 0.1689453125, "learning_rate": 1.2548167781645616e-05, "loss": 0.00317127862945199, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00318, "step": 1099, "tokens/total": 143876096, "tokens/train_per_sec_per_gpu": 3666.83, "tokens/trainable": 15312299 }, { "epoch": 3.5, "eval_loss": 0.010016990825533867, "eval_ppl": 1.01007, "eval_runtime": 43.0422, "eval_samples_per_second": 62.752, "eval_steps_per_second": 3.926, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 1099 }, { "epoch": 3.5031847133757963, "grad_norm": 0.162109375, "learning_rate": 1.2500000000000006e-05, "loss": 0.0022175521589815617, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00222, "step": 1100, "tokens/total": 144007168, "tokens/train_per_sec_per_gpu": 3427.84, "tokens/trainable": 15326710 }, { "epoch": 3.5063694267515926, "grad_norm": 0.1845703125, "learning_rate": 1.245189400935895e-05, "loss": 0.005054910201579332, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00507, "step": 1101, "tokens/total": 144138240, "tokens/train_per_sec_per_gpu": 3351.14, "tokens/trainable": 15340735 }, { "epoch": 3.5095541401273884, "grad_norm": 0.1630859375, "learning_rate": 1.2403850047523866e-05, "loss": 0.0027237918693572283, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00273, "step": 1102, "tokens/total": 144269312, "tokens/train_per_sec_per_gpu": 3436.73, "tokens/trainable": 15355132 }, { "epoch": 3.5127388535031847, "grad_norm": 0.1787109375, "learning_rate": 1.2355868351989509e-05, "loss": 0.0029630253557115793, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00297, "step": 1103, "tokens/total": 144400384, "tokens/train_per_sec_per_gpu": 3214.66, "tokens/trainable": 15368489 }, { "epoch": 3.515923566878981, "grad_norm": 0.142578125, "learning_rate": 1.2307949159942862e-05, "loss": 0.0033542895689606667, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00336, "step": 1104, "tokens/total": 144531456, "tokens/train_per_sec_per_gpu": 3198.14, "tokens/trainable": 15381840 }, { "epoch": 3.519108280254777, "grad_norm": 0.17578125, "learning_rate": 1.2260092708261936e-05, "loss": 0.0038351963739842176, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00384, "step": 1105, "tokens/total": 144662528, "tokens/train_per_sec_per_gpu": 3503.01, "tokens/trainable": 15396418 }, { "epoch": 3.522292993630573, "grad_norm": 0.154296875, "learning_rate": 1.2212299233514582e-05, "loss": 0.0025412808172404766, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00254, "step": 1106, "tokens/total": 144793600, "tokens/train_per_sec_per_gpu": 3919.55, "tokens/trainable": 15412594 }, { "epoch": 3.5254777070063694, "grad_norm": 0.1796875, "learning_rate": 1.216456897195733e-05, "loss": 0.0032449497375637293, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00325, "step": 1107, "tokens/total": 144924672, "tokens/train_per_sec_per_gpu": 3382.55, "tokens/trainable": 15426656 }, { "epoch": 3.5286624203821657, "grad_norm": 0.146484375, "learning_rate": 1.211690215953427e-05, "loss": 0.0023905187845230103, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00239, "step": 1108, "tokens/total": 145055744, "tokens/train_per_sec_per_gpu": 3011.33, "tokens/trainable": 15439226 }, { "epoch": 3.531847133757962, "grad_norm": 0.15625, "learning_rate": 1.2069299031875795e-05, "loss": 0.0024083037860691547, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 1109, "tokens/total": 145186816, "tokens/train_per_sec_per_gpu": 2939.76, "tokens/trainable": 15451512 }, { "epoch": 3.535031847133758, "grad_norm": 0.1787109375, "learning_rate": 1.2021759824297524e-05, "loss": 0.004423599690198898, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00443, "step": 1110, "tokens/total": 145317888, "tokens/train_per_sec_per_gpu": 3466.29, "tokens/trainable": 15465910 }, { "epoch": 3.538216560509554, "grad_norm": 0.1455078125, "learning_rate": 1.1974284771799096e-05, "loss": 0.002882221946492791, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00289, "step": 1111, "tokens/total": 145448960, "tokens/train_per_sec_per_gpu": 3506.98, "tokens/trainable": 15480477 }, { "epoch": 3.5414012738853504, "grad_norm": 0.1826171875, "learning_rate": 1.1926874109063e-05, "loss": 0.003006345359608531, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00301, "step": 1112, "tokens/total": 145580032, "tokens/train_per_sec_per_gpu": 3365.33, "tokens/trainable": 15494478 }, { "epoch": 3.5445859872611463, "grad_norm": 0.154296875, "learning_rate": 1.1879528070453423e-05, "loss": 0.0027234896551817656, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00273, "step": 1113, "tokens/total": 145711104, "tokens/train_per_sec_per_gpu": 3535.63, "tokens/trainable": 15509199 }, { "epoch": 3.5477707006369426, "grad_norm": 0.177734375, "learning_rate": 1.1832246890015125e-05, "loss": 0.0036931924987584352, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0037, "step": 1114, "tokens/total": 145842176, "tokens/train_per_sec_per_gpu": 3246.79, "tokens/trainable": 15522710 }, { "epoch": 3.550955414012739, "grad_norm": 0.1474609375, "learning_rate": 1.1785030801472221e-05, "loss": 0.0028704549185931683, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00287, "step": 1115, "tokens/total": 145973248, "tokens/train_per_sec_per_gpu": 3848.56, "tokens/trainable": 15538730 }, { "epoch": 3.554140127388535, "grad_norm": 0.15625, "learning_rate": 1.1737880038227082e-05, "loss": 0.00254430272616446, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00255, "step": 1116, "tokens/total": 146104320, "tokens/train_per_sec_per_gpu": 3397.78, "tokens/trainable": 15552911 }, { "epoch": 3.5573248407643314, "grad_norm": 0.1630859375, "learning_rate": 1.1690794833359159e-05, "loss": 0.0025816336274147034, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00258, "step": 1117, "tokens/total": 146235392, "tokens/train_per_sec_per_gpu": 2881.17, "tokens/trainable": 15564987 }, { "epoch": 3.5605095541401273, "grad_norm": 0.19140625, "learning_rate": 1.1643775419623812e-05, "loss": 0.003014686517417431, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00302, "step": 1118, "tokens/total": 146366464, "tokens/train_per_sec_per_gpu": 3324.99, "tokens/trainable": 15578834 }, { "epoch": 3.5636942675159236, "grad_norm": 0.146484375, "learning_rate": 1.1596822029451177e-05, "loss": 0.0020668318029493093, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00207, "step": 1119, "tokens/total": 146497536, "tokens/train_per_sec_per_gpu": 3575.91, "tokens/trainable": 15593729 }, { "epoch": 3.56687898089172, "grad_norm": 0.142578125, "learning_rate": 1.1549934894945045e-05, "loss": 0.002621435560286045, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00262, "step": 1120, "tokens/total": 146628608, "tokens/train_per_sec_per_gpu": 3223.75, "tokens/trainable": 15607251 }, { "epoch": 3.5700636942675157, "grad_norm": 0.16796875, "learning_rate": 1.1503114247881648e-05, "loss": 0.002985800849273801, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00299, "step": 1121, "tokens/total": 146759680, "tokens/train_per_sec_per_gpu": 3585.21, "tokens/trainable": 15622149 }, { "epoch": 3.573248407643312, "grad_norm": 0.0966796875, "learning_rate": 1.1456360319708578e-05, "loss": 0.0013212183257564902, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00132, "step": 1122, "tokens/total": 146890752, "tokens/train_per_sec_per_gpu": 3312.34, "tokens/trainable": 15636033 }, { "epoch": 3.5764331210191083, "grad_norm": 0.17578125, "learning_rate": 1.1409673341543625e-05, "loss": 0.0023485145065933466, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00235, "step": 1123, "tokens/total": 147021824, "tokens/train_per_sec_per_gpu": 3184.56, "tokens/trainable": 15649372 }, { "epoch": 3.5796178343949046, "grad_norm": 0.1767578125, "learning_rate": 1.1363053544173596e-05, "loss": 0.002514764666557312, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00252, "step": 1124, "tokens/total": 147152896, "tokens/train_per_sec_per_gpu": 3358.29, "tokens/trainable": 15663368 }, { "epoch": 3.582802547770701, "grad_norm": 0.13671875, "learning_rate": 1.1316501158053216e-05, "loss": 0.002817730186507106, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00282, "step": 1125, "tokens/total": 147283968, "tokens/train_per_sec_per_gpu": 3488.28, "tokens/trainable": 15677861 }, { "epoch": 3.5859872611464967, "grad_norm": 0.150390625, "learning_rate": 1.1270016413303997e-05, "loss": 0.0023807904217392206, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00238, "step": 1126, "tokens/total": 147415040, "tokens/train_per_sec_per_gpu": 3351.72, "tokens/trainable": 15691892 }, { "epoch": 3.589171974522293, "grad_norm": 0.13671875, "learning_rate": 1.1223599539713046e-05, "loss": 0.0022236828226596117, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 1127, "tokens/total": 147546112, "tokens/train_per_sec_per_gpu": 3133.35, "tokens/trainable": 15705012 }, { "epoch": 3.5923566878980893, "grad_norm": 0.169921875, "learning_rate": 1.1177250766731992e-05, "loss": 0.0034954429138451815, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0035, "step": 1128, "tokens/total": 147677184, "tokens/train_per_sec_per_gpu": 3390.97, "tokens/trainable": 15719238 }, { "epoch": 3.595541401273885, "grad_norm": 0.1474609375, "learning_rate": 1.1130970323475825e-05, "loss": 0.0024684793315827847, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00247, "step": 1129, "tokens/total": 147808256, "tokens/train_per_sec_per_gpu": 3373.56, "tokens/trainable": 15733335 }, { "epoch": 3.5987261146496814, "grad_norm": 0.177734375, "learning_rate": 1.1084758438721743e-05, "loss": 0.003184695728123188, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00319, "step": 1130, "tokens/total": 147939328, "tokens/train_per_sec_per_gpu": 3255.08, "tokens/trainable": 15746979 }, { "epoch": 3.6019108280254777, "grad_norm": 0.154296875, "learning_rate": 1.103861534090804e-05, "loss": 0.00223728409036994, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00224, "step": 1131, "tokens/total": 148070400, "tokens/train_per_sec_per_gpu": 3094.33, "tokens/trainable": 15759937 }, { "epoch": 3.605095541401274, "grad_norm": 0.244140625, "learning_rate": 1.0992541258132998e-05, "loss": 0.0025429693050682545, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00255, "step": 1132, "tokens/total": 148201472, "tokens/train_per_sec_per_gpu": 3264.14, "tokens/trainable": 15773601 }, { "epoch": 3.6082802547770703, "grad_norm": 0.2265625, "learning_rate": 1.0946536418153716e-05, "loss": 0.0037906889338046312, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0038, "step": 1133, "tokens/total": 148332544, "tokens/train_per_sec_per_gpu": 2941.44, "tokens/trainable": 15785963 }, { "epoch": 3.611464968152866, "grad_norm": 0.1767578125, "learning_rate": 1.0900601048385017e-05, "loss": 0.0023014359176158905, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0023, "step": 1134, "tokens/total": 148463616, "tokens/train_per_sec_per_gpu": 2661.35, "tokens/trainable": 15797186 }, { "epoch": 3.6146496815286624, "grad_norm": 0.1669921875, "learning_rate": 1.0854735375898328e-05, "loss": 0.004023172426968813, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00403, "step": 1135, "tokens/total": 148594688, "tokens/train_per_sec_per_gpu": 3525.3, "tokens/trainable": 15811891 }, { "epoch": 3.6178343949044587, "grad_norm": 0.14453125, "learning_rate": 1.0808939627420514e-05, "loss": 0.0020967398304492235, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0021, "step": 1136, "tokens/total": 148725760, "tokens/train_per_sec_per_gpu": 3402.16, "tokens/trainable": 15826103 }, { "epoch": 3.6210191082802545, "grad_norm": 0.1494140625, "learning_rate": 1.076321402933279e-05, "loss": 0.002463690470904112, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00247, "step": 1137, "tokens/total": 148856832, "tokens/train_per_sec_per_gpu": 3459.93, "tokens/trainable": 15840539 }, { "epoch": 3.624203821656051, "grad_norm": 0.201171875, "learning_rate": 1.0717558807669631e-05, "loss": 0.0030937506817281246, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1138, "tokens/total": 148987904, "tokens/train_per_sec_per_gpu": 3333.74, "tokens/trainable": 15854495 }, { "epoch": 3.627388535031847, "grad_norm": 0.134765625, "learning_rate": 1.0671974188117572e-05, "loss": 0.002224976196885109, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 1139, "tokens/total": 149118976, "tokens/train_per_sec_per_gpu": 3179.62, "tokens/trainable": 15867806 }, { "epoch": 3.6305732484076434, "grad_norm": 0.1767578125, "learning_rate": 1.0626460396014182e-05, "loss": 0.0029444252140820026, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00295, "step": 1140, "tokens/total": 149250048, "tokens/train_per_sec_per_gpu": 3422.65, "tokens/trainable": 15882044 }, { "epoch": 3.6337579617834397, "grad_norm": 0.185546875, "learning_rate": 1.0581017656346904e-05, "loss": 0.0034989488776773214, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00351, "step": 1141, "tokens/total": 149381120, "tokens/train_per_sec_per_gpu": 3507.73, "tokens/trainable": 15896741 }, { "epoch": 3.6369426751592355, "grad_norm": 0.1865234375, "learning_rate": 1.053564619375193e-05, "loss": 0.002628948539495468, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00263, "step": 1142, "tokens/total": 149512192, "tokens/train_per_sec_per_gpu": 3219.49, "tokens/trainable": 15910183 }, { "epoch": 3.640127388535032, "grad_norm": 0.263671875, "learning_rate": 1.0490346232513113e-05, "loss": 0.0031747568864375353, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00318, "step": 1143, "tokens/total": 149643264, "tokens/train_per_sec_per_gpu": 3370.78, "tokens/trainable": 15924212 }, { "epoch": 3.643312101910828, "grad_norm": 0.208984375, "learning_rate": 1.0445117996560877e-05, "loss": 0.003914204426109791, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00392, "step": 1144, "tokens/total": 149774336, "tokens/train_per_sec_per_gpu": 3173.12, "tokens/trainable": 15937505 }, { "epoch": 3.646496815286624, "grad_norm": 0.1494140625, "learning_rate": 1.039996170947106e-05, "loss": 0.002363776322454214, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00237, "step": 1145, "tokens/total": 149905408, "tokens/train_per_sec_per_gpu": 3147.56, "tokens/trainable": 15950698 }, { "epoch": 3.6496815286624202, "grad_norm": 0.16796875, "learning_rate": 1.0354877594463852e-05, "loss": 0.0031070299446582794, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00311, "step": 1146, "tokens/total": 150036480, "tokens/train_per_sec_per_gpu": 3364.36, "tokens/trainable": 15964717 }, { "epoch": 3.6528662420382165, "grad_norm": 0.1396484375, "learning_rate": 1.0309865874402688e-05, "loss": 0.001972392201423645, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00197, "step": 1147, "tokens/total": 150167552, "tokens/train_per_sec_per_gpu": 3018.99, "tokens/trainable": 15977365 }, { "epoch": 3.656050955414013, "grad_norm": 0.09423828125, "learning_rate": 1.026492677179311e-05, "loss": 0.0011499158572405577, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00115, "step": 1148, "tokens/total": 150298624, "tokens/train_per_sec_per_gpu": 3220.38, "tokens/trainable": 15990834 }, { "epoch": 3.659235668789809, "grad_norm": 0.1220703125, "learning_rate": 1.022006050878169e-05, "loss": 0.001693375059403479, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00169, "step": 1149, "tokens/total": 150429696, "tokens/train_per_sec_per_gpu": 3186.29, "tokens/trainable": 16004194 }, { "epoch": 3.662420382165605, "grad_norm": 0.1455078125, "learning_rate": 1.0175267307154962e-05, "loss": 0.0017610186478123069, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00176, "step": 1150, "tokens/total": 150560768, "tokens/train_per_sec_per_gpu": 3312.55, "tokens/trainable": 16018057 }, { "epoch": 3.6656050955414012, "grad_norm": 0.1826171875, "learning_rate": 1.0130547388338268e-05, "loss": 0.003534915391355753, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00354, "step": 1151, "tokens/total": 150691840, "tokens/train_per_sec_per_gpu": 3383.89, "tokens/trainable": 16032153 }, { "epoch": 3.6687898089171975, "grad_norm": 0.1630859375, "learning_rate": 1.0085900973394708e-05, "loss": 0.0027439731638878584, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00275, "step": 1152, "tokens/total": 150822912, "tokens/train_per_sec_per_gpu": 3256.01, "tokens/trainable": 16045798 }, { "epoch": 3.6719745222929934, "grad_norm": 0.1298828125, "learning_rate": 1.004132828302404e-05, "loss": 0.0019469019025564194, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00195, "step": 1153, "tokens/total": 150953984, "tokens/train_per_sec_per_gpu": 3687.58, "tokens/trainable": 16061140 }, { "epoch": 3.6751592356687897, "grad_norm": 0.15234375, "learning_rate": 9.996829537561559e-06, "loss": 0.0025109422858804464, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00251, "step": 1154, "tokens/total": 151085056, "tokens/train_per_sec_per_gpu": 3535.46, "tokens/trainable": 16075875 }, { "epoch": 3.678343949044586, "grad_norm": 0.134765625, "learning_rate": 9.952404956977032e-06, "loss": 0.0022808697540313005, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00228, "step": 1155, "tokens/total": 151216128, "tokens/train_per_sec_per_gpu": 3050.84, "tokens/trainable": 16088674 }, { "epoch": 3.6815286624203822, "grad_norm": 0.2041015625, "learning_rate": 9.908054760873633e-06, "loss": 0.003984857816249132, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00399, "step": 1156, "tokens/total": 151347200, "tokens/train_per_sec_per_gpu": 3167.18, "tokens/trainable": 16101976 }, { "epoch": 3.6847133757961785, "grad_norm": 0.16015625, "learning_rate": 9.863779168486798e-06, "loss": 0.002358327154070139, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00236, "step": 1157, "tokens/total": 151478272, "tokens/train_per_sec_per_gpu": 3305.89, "tokens/trainable": 16115788 }, { "epoch": 3.6878980891719744, "grad_norm": 0.1455078125, "learning_rate": 9.819578398683202e-06, "loss": 0.0030925837345421314, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1158, "tokens/total": 151609344, "tokens/train_per_sec_per_gpu": 3627.75, "tokens/trainable": 16130893 }, { "epoch": 3.6910828025477707, "grad_norm": 0.142578125, "learning_rate": 9.775452669959651e-06, "loss": 0.00236108573153615, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00236, "step": 1159, "tokens/total": 151740416, "tokens/train_per_sec_per_gpu": 3497.63, "tokens/trainable": 16145461 }, { "epoch": 3.694267515923567, "grad_norm": 0.2119140625, "learning_rate": 9.731402200441985e-06, "loss": 0.0027799042873084545, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00278, "step": 1160, "tokens/total": 151871488, "tokens/train_per_sec_per_gpu": 3288.32, "tokens/trainable": 16159217 }, { "epoch": 3.697452229299363, "grad_norm": 0.197265625, "learning_rate": 9.687427207884017e-06, "loss": 0.004562960006296635, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00457, "step": 1161, "tokens/total": 152002560, "tokens/train_per_sec_per_gpu": 3425.43, "tokens/trainable": 16173551 }, { "epoch": 3.700636942675159, "grad_norm": 0.1748046875, "learning_rate": 9.643527909666484e-06, "loss": 0.003357633948326111, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00336, "step": 1162, "tokens/total": 152133632, "tokens/train_per_sec_per_gpu": 3255.39, "tokens/trainable": 16187139 }, { "epoch": 3.7038216560509554, "grad_norm": 0.1708984375, "learning_rate": 9.599704522795899e-06, "loss": 0.0035241839941591024, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00353, "step": 1163, "tokens/total": 152264704, "tokens/train_per_sec_per_gpu": 3360.97, "tokens/trainable": 16201246 }, { "epoch": 3.7070063694267517, "grad_norm": 0.12353515625, "learning_rate": 9.55595726390357e-06, "loss": 0.0019289179472252727, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00193, "step": 1164, "tokens/total": 152395776, "tokens/train_per_sec_per_gpu": 3708.68, "tokens/trainable": 16216642 }, { "epoch": 3.710191082802548, "grad_norm": 0.1279296875, "learning_rate": 9.512286349244461e-06, "loss": 0.0024172349367290735, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00242, "step": 1165, "tokens/total": 152526848, "tokens/train_per_sec_per_gpu": 3152.27, "tokens/trainable": 16229792 }, { "epoch": 3.713375796178344, "grad_norm": 0.1484375, "learning_rate": 9.468691994696147e-06, "loss": 0.0027571492828428745, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00276, "step": 1166, "tokens/total": 152657920, "tokens/train_per_sec_per_gpu": 3544.62, "tokens/trainable": 16244524 }, { "epoch": 3.71656050955414, "grad_norm": 0.171875, "learning_rate": 9.42517441575773e-06, "loss": 0.002144938800483942, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00215, "step": 1167, "tokens/total": 152788992, "tokens/train_per_sec_per_gpu": 3187.0, "tokens/trainable": 16257897 }, { "epoch": 3.7197452229299364, "grad_norm": 0.166015625, "learning_rate": 9.381733827548825e-06, "loss": 0.002875394420698285, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00288, "step": 1168, "tokens/total": 152920064, "tokens/train_per_sec_per_gpu": 3371.36, "tokens/trainable": 16271956 }, { "epoch": 3.722929936305732, "grad_norm": 0.146484375, "learning_rate": 9.338370444808417e-06, "loss": 0.0024918625131249428, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00249, "step": 1169, "tokens/total": 153051136, "tokens/train_per_sec_per_gpu": 3125.56, "tokens/trainable": 16285073 }, { "epoch": 3.7261146496815285, "grad_norm": 0.12158203125, "learning_rate": 9.295084481893876e-06, "loss": 0.0020116898231208324, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00201, "step": 1170, "tokens/total": 153182208, "tokens/train_per_sec_per_gpu": 3620.37, "tokens/trainable": 16300140 }, { "epoch": 3.729299363057325, "grad_norm": 0.1474609375, "learning_rate": 9.251876152779863e-06, "loss": 0.002456206362694502, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00246, "step": 1171, "tokens/total": 153313280, "tokens/train_per_sec_per_gpu": 3421.83, "tokens/trainable": 16314413 }, { "epoch": 3.732484076433121, "grad_norm": 0.1845703125, "learning_rate": 9.20874567105725e-06, "loss": 0.002665320411324501, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00267, "step": 1172, "tokens/total": 153444352, "tokens/train_per_sec_per_gpu": 3586.11, "tokens/trainable": 16329319 }, { "epoch": 3.7356687898089174, "grad_norm": 0.150390625, "learning_rate": 9.165693249932098e-06, "loss": 0.002760200994089246, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00276, "step": 1173, "tokens/total": 153575424, "tokens/train_per_sec_per_gpu": 3499.69, "tokens/trainable": 16343957 }, { "epoch": 3.738853503184713, "grad_norm": 0.1552734375, "learning_rate": 9.122719102224603e-06, "loss": 0.003271646797657013, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00328, "step": 1174, "tokens/total": 153706496, "tokens/train_per_sec_per_gpu": 3357.83, "tokens/trainable": 16358001 }, { "epoch": 3.7420382165605095, "grad_norm": 0.140625, "learning_rate": 9.079823440368018e-06, "loss": 0.0022282477002590895, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 1175, "tokens/total": 153837568, "tokens/train_per_sec_per_gpu": 3662.02, "tokens/trainable": 16373253 }, { "epoch": 3.745222929936306, "grad_norm": 0.1611328125, "learning_rate": 9.037006476407628e-06, "loss": 0.003906633704900742, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00391, "step": 1176, "tokens/total": 153968640, "tokens/train_per_sec_per_gpu": 3414.99, "tokens/trainable": 16387539 }, { "epoch": 3.7484076433121016, "grad_norm": 0.2041015625, "learning_rate": 8.994268421999702e-06, "loss": 0.0046704974956810474, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00468, "step": 1177, "tokens/total": 154099712, "tokens/train_per_sec_per_gpu": 3318.42, "tokens/trainable": 16401436 }, { "epoch": 3.7515923566878984, "grad_norm": 0.154296875, "learning_rate": 8.951609488410414e-06, "loss": 0.0023519096430391073, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00235, "step": 1178, "tokens/total": 154230784, "tokens/train_per_sec_per_gpu": 3442.84, "tokens/trainable": 16415791 }, { "epoch": 3.754777070063694, "grad_norm": 0.09326171875, "learning_rate": 8.909029886514828e-06, "loss": 0.001595214824192226, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1179, "tokens/total": 154361856, "tokens/train_per_sec_per_gpu": 3627.66, "tokens/trainable": 16430902 }, { "epoch": 3.7579617834394905, "grad_norm": 0.1455078125, "learning_rate": 8.866529826795866e-06, "loss": 0.002106869127601385, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00211, "step": 1180, "tokens/total": 154492928, "tokens/train_per_sec_per_gpu": 3379.91, "tokens/trainable": 16445017 }, { "epoch": 3.761146496815287, "grad_norm": 0.150390625, "learning_rate": 8.824109519343227e-06, "loss": 0.0035120132379233837, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00352, "step": 1181, "tokens/total": 154624000, "tokens/train_per_sec_per_gpu": 3416.65, "tokens/trainable": 16459298 }, { "epoch": 3.7643312101910826, "grad_norm": 0.11865234375, "learning_rate": 8.781769173852392e-06, "loss": 0.002475301967933774, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00248, "step": 1182, "tokens/total": 154755072, "tokens/train_per_sec_per_gpu": 3225.84, "tokens/trainable": 16472813 }, { "epoch": 3.767515923566879, "grad_norm": 0.134765625, "learning_rate": 8.739508999623563e-06, "loss": 0.0018928756471723318, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00189, "step": 1183, "tokens/total": 154886144, "tokens/train_per_sec_per_gpu": 3393.68, "tokens/trainable": 16487035 }, { "epoch": 3.770700636942675, "grad_norm": 0.1298828125, "learning_rate": 8.697329205560625e-06, "loss": 0.0019152449676766992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00192, "step": 1184, "tokens/total": 155017216, "tokens/train_per_sec_per_gpu": 3331.91, "tokens/trainable": 16500924 }, { "epoch": 3.7738853503184715, "grad_norm": 0.1328125, "learning_rate": 8.655230000170117e-06, "loss": 0.0024345512501895428, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00244, "step": 1185, "tokens/total": 155148288, "tokens/train_per_sec_per_gpu": 3450.15, "tokens/trainable": 16515278 }, { "epoch": 3.777070063694268, "grad_norm": 0.125, "learning_rate": 8.61321159156023e-06, "loss": 0.0017270749667659402, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00173, "step": 1186, "tokens/total": 155279360, "tokens/train_per_sec_per_gpu": 2874.69, "tokens/trainable": 16527348 }, { "epoch": 3.7802547770700636, "grad_norm": 0.1884765625, "learning_rate": 8.571274187439724e-06, "loss": 0.0030203748028725386, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00302, "step": 1187, "tokens/total": 155410432, "tokens/train_per_sec_per_gpu": 3409.54, "tokens/trainable": 16541593 }, { "epoch": 3.78343949044586, "grad_norm": 0.138671875, "learning_rate": 8.529417995116947e-06, "loss": 0.0022753621451556683, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00228, "step": 1188, "tokens/total": 155541504, "tokens/train_per_sec_per_gpu": 3344.52, "tokens/trainable": 16555605 }, { "epoch": 3.786624203821656, "grad_norm": 0.1494140625, "learning_rate": 8.487643221498812e-06, "loss": 0.0021583903580904007, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00216, "step": 1189, "tokens/total": 155672576, "tokens/train_per_sec_per_gpu": 3003.62, "tokens/trainable": 16568186 }, { "epoch": 3.789808917197452, "grad_norm": 0.12255859375, "learning_rate": 8.445950073089721e-06, "loss": 0.002155636204406619, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00216, "step": 1190, "tokens/total": 155803648, "tokens/train_per_sec_per_gpu": 3463.82, "tokens/trainable": 16582617 }, { "epoch": 3.7929936305732483, "grad_norm": 0.1787109375, "learning_rate": 8.404338755990587e-06, "loss": 0.003606649348512292, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00361, "step": 1191, "tokens/total": 155934720, "tokens/train_per_sec_per_gpu": 3331.8, "tokens/trainable": 16596564 }, { "epoch": 3.7961783439490446, "grad_norm": 0.1484375, "learning_rate": 8.362809475897837e-06, "loss": 0.0030233021825551987, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00303, "step": 1192, "tokens/total": 156065792, "tokens/train_per_sec_per_gpu": 3466.06, "tokens/trainable": 16611016 }, { "epoch": 3.799363057324841, "grad_norm": 0.1943359375, "learning_rate": 8.32136243810233e-06, "loss": 0.003034008899703622, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00304, "step": 1193, "tokens/total": 156196864, "tokens/train_per_sec_per_gpu": 3277.31, "tokens/trainable": 16624717 }, { "epoch": 3.802547770700637, "grad_norm": 0.126953125, "learning_rate": 8.279997847488399e-06, "loss": 0.0017860046355053782, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00179, "step": 1194, "tokens/total": 156327936, "tokens/train_per_sec_per_gpu": 3192.19, "tokens/trainable": 16638031 }, { "epoch": 3.805732484076433, "grad_norm": 0.2021484375, "learning_rate": 8.238715908532824e-06, "loss": 0.003182856598868966, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00319, "step": 1195, "tokens/total": 156459008, "tokens/train_per_sec_per_gpu": 3312.32, "tokens/trainable": 16651920 }, { "epoch": 3.8089171974522293, "grad_norm": 0.134765625, "learning_rate": 8.197516825303792e-06, "loss": 0.0023445822298526764, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00235, "step": 1196, "tokens/total": 156590080, "tokens/train_per_sec_per_gpu": 3594.12, "tokens/trainable": 16666821 }, { "epoch": 3.8121019108280256, "grad_norm": 0.1669921875, "learning_rate": 8.156400801459912e-06, "loss": 0.002362563507631421, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00237, "step": 1197, "tokens/total": 156721152, "tokens/train_per_sec_per_gpu": 2878.74, "tokens/trainable": 16679031 }, { "epoch": 3.8152866242038215, "grad_norm": 0.173828125, "learning_rate": 8.115368040249242e-06, "loss": 0.0029479744844138622, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00295, "step": 1198, "tokens/total": 156852224, "tokens/train_per_sec_per_gpu": 3403.84, "tokens/trainable": 16693210 }, { "epoch": 3.8184713375796178, "grad_norm": 0.1376953125, "learning_rate": 8.074418744508202e-06, "loss": 0.001919899950735271, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00192, "step": 1199, "tokens/total": 156983296, "tokens/train_per_sec_per_gpu": 3656.6, "tokens/trainable": 16708430 }, { "epoch": 3.821656050955414, "grad_norm": 0.1328125, "learning_rate": 8.03355311666065e-06, "loss": 0.0024780076928436756, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00248, "step": 1200, "tokens/total": 157114368, "tokens/train_per_sec_per_gpu": 3218.15, "tokens/trainable": 16721832 }, { "epoch": 3.8248407643312103, "grad_norm": 0.169921875, "learning_rate": 7.992771358716852e-06, "loss": 0.003482515923678875, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00349, "step": 1201, "tokens/total": 157245440, "tokens/train_per_sec_per_gpu": 3264.75, "tokens/trainable": 16735505 }, { "epoch": 3.8280254777070066, "grad_norm": 0.1494140625, "learning_rate": 7.952073672272465e-06, "loss": 0.002318483777344227, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00232, "step": 1202, "tokens/total": 157376512, "tokens/train_per_sec_per_gpu": 3205.3, "tokens/trainable": 16748926 }, { "epoch": 3.8312101910828025, "grad_norm": 0.1591796875, "learning_rate": 7.91146025850755e-06, "loss": 0.0027267371769994497, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00273, "step": 1203, "tokens/total": 157507584, "tokens/train_per_sec_per_gpu": 3508.37, "tokens/trainable": 16763595 }, { "epoch": 3.8343949044585988, "grad_norm": 0.142578125, "learning_rate": 7.870931318185615e-06, "loss": 0.0021403185091912746, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00214, "step": 1204, "tokens/total": 157638656, "tokens/train_per_sec_per_gpu": 3252.09, "tokens/trainable": 16777230 }, { "epoch": 3.837579617834395, "grad_norm": 0.1591796875, "learning_rate": 7.830487051652562e-06, "loss": 0.0029888248536735773, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00299, "step": 1205, "tokens/total": 157769728, "tokens/train_per_sec_per_gpu": 3605.6, "tokens/trainable": 16792264 }, { "epoch": 3.840764331210191, "grad_norm": 0.1103515625, "learning_rate": 7.790127658835747e-06, "loss": 0.0014124944573268294, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00141, "step": 1206, "tokens/total": 157900800, "tokens/train_per_sec_per_gpu": 3439.82, "tokens/trainable": 16806652 }, { "epoch": 3.843949044585987, "grad_norm": 0.1376953125, "learning_rate": 7.749853339242972e-06, "loss": 0.0024581162724643946, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00246, "step": 1207, "tokens/total": 158031872, "tokens/train_per_sec_per_gpu": 3496.89, "tokens/trainable": 16821214 }, { "epoch": 3.8471337579617835, "grad_norm": 0.1611328125, "learning_rate": 7.70966429196148e-06, "loss": 0.0028864797204732895, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00289, "step": 1208, "tokens/total": 158162944, "tokens/train_per_sec_per_gpu": 3388.5, "tokens/trainable": 16835398 }, { "epoch": 3.8503184713375798, "grad_norm": 0.154296875, "learning_rate": 7.669560715656993e-06, "loss": 0.0023927215952426195, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0024, "step": 1209, "tokens/total": 158294016, "tokens/train_per_sec_per_gpu": 3419.53, "tokens/trainable": 16849636 }, { "epoch": 3.853503184713376, "grad_norm": 0.1435546875, "learning_rate": 7.629542808572746e-06, "loss": 0.0018501668237149715, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00185, "step": 1210, "tokens/total": 158425088, "tokens/train_per_sec_per_gpu": 3232.39, "tokens/trainable": 16863144 }, { "epoch": 3.856687898089172, "grad_norm": 0.1689453125, "learning_rate": 7.58961076852846e-06, "loss": 0.0026476646307855844, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00265, "step": 1211, "tokens/total": 158556160, "tokens/train_per_sec_per_gpu": 3319.46, "tokens/trainable": 16877036 }, { "epoch": 3.859872611464968, "grad_norm": 0.162109375, "learning_rate": 7.549764792919414e-06, "loss": 0.0031769457273185253, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00318, "step": 1212, "tokens/total": 158687232, "tokens/train_per_sec_per_gpu": 3138.51, "tokens/trainable": 16890132 }, { "epoch": 3.8630573248407645, "grad_norm": 0.103515625, "learning_rate": 7.510005078715443e-06, "loss": 0.00180210976395756, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0018, "step": 1213, "tokens/total": 158818304, "tokens/train_per_sec_per_gpu": 3359.16, "tokens/trainable": 16904120 }, { "epoch": 3.8662420382165603, "grad_norm": 0.1572265625, "learning_rate": 7.47033182245995e-06, "loss": 0.003394015831872821, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0034, "step": 1214, "tokens/total": 158949376, "tokens/train_per_sec_per_gpu": 3097.6, "tokens/trainable": 16917088 }, { "epoch": 3.8694267515923566, "grad_norm": 0.1533203125, "learning_rate": 7.430745220268962e-06, "loss": 0.0019503788789734244, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00195, "step": 1215, "tokens/total": 159080448, "tokens/train_per_sec_per_gpu": 3410.15, "tokens/trainable": 16931308 }, { "epoch": 3.872611464968153, "grad_norm": 0.1552734375, "learning_rate": 7.391245467830163e-06, "loss": 0.002893456257879734, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0029, "step": 1216, "tokens/total": 159211520, "tokens/train_per_sec_per_gpu": 3457.29, "tokens/trainable": 16945696 }, { "epoch": 3.875796178343949, "grad_norm": 0.1640625, "learning_rate": 7.351832760401892e-06, "loss": 0.0023777689784765244, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00238, "step": 1217, "tokens/total": 159342592, "tokens/train_per_sec_per_gpu": 2878.05, "tokens/trainable": 16957864 }, { "epoch": 3.8789808917197455, "grad_norm": 0.1513671875, "learning_rate": 7.312507292812215e-06, "loss": 0.00224723806604743, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00225, "step": 1218, "tokens/total": 159473664, "tokens/train_per_sec_per_gpu": 3022.25, "tokens/trainable": 16970516 }, { "epoch": 3.8821656050955413, "grad_norm": 0.1064453125, "learning_rate": 7.273269259457957e-06, "loss": 0.0017601789440959692, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00176, "step": 1219, "tokens/total": 159604736, "tokens/train_per_sec_per_gpu": 3153.66, "tokens/trainable": 16983660 }, { "epoch": 3.8853503184713376, "grad_norm": 0.15625, "learning_rate": 7.2341188543036985e-06, "loss": 0.0024489860516041517, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00245, "step": 1220, "tokens/total": 159735808, "tokens/train_per_sec_per_gpu": 3174.42, "tokens/trainable": 16996950 }, { "epoch": 3.888535031847134, "grad_norm": 0.1884765625, "learning_rate": 7.195056270880887e-06, "loss": 0.0038972869515419006, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0039, "step": 1221, "tokens/total": 159866880, "tokens/train_per_sec_per_gpu": 3474.13, "tokens/trainable": 17011372 }, { "epoch": 3.8917197452229297, "grad_norm": 0.1787109375, "learning_rate": 7.156081702286813e-06, "loss": 0.0033518727868795395, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00336, "step": 1222, "tokens/total": 159997952, "tokens/train_per_sec_per_gpu": 3190.66, "tokens/trainable": 17024744 }, { "epoch": 3.894904458598726, "grad_norm": 0.1396484375, "learning_rate": 7.11719534118368e-06, "loss": 0.00255336775444448, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00256, "step": 1223, "tokens/total": 160129024, "tokens/train_per_sec_per_gpu": 3465.79, "tokens/trainable": 17039162 }, { "epoch": 3.8980891719745223, "grad_norm": 0.154296875, "learning_rate": 7.078397379797711e-06, "loss": 0.0020744046196341515, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00208, "step": 1224, "tokens/total": 160260096, "tokens/train_per_sec_per_gpu": 3387.57, "tokens/trainable": 17053370 }, { "epoch": 3.9012738853503186, "grad_norm": 0.1396484375, "learning_rate": 7.039688009918083e-06, "loss": 0.0021676502656191587, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00217, "step": 1225, "tokens/total": 160391168, "tokens/train_per_sec_per_gpu": 3407.72, "tokens/trainable": 17067644 }, { "epoch": 3.904458598726115, "grad_norm": 0.171875, "learning_rate": 7.001067422896063e-06, "loss": 0.002485244534909725, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00249, "step": 1226, "tokens/total": 160522240, "tokens/train_per_sec_per_gpu": 3749.29, "tokens/trainable": 17083264 }, { "epoch": 3.9076433121019107, "grad_norm": 0.173828125, "learning_rate": 6.9625358096440496e-06, "loss": 0.0030091169755905867, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00301, "step": 1227, "tokens/total": 160653312, "tokens/train_per_sec_per_gpu": 3670.67, "tokens/trainable": 17098510 }, { "epoch": 3.910828025477707, "grad_norm": 0.14453125, "learning_rate": 6.924093360634601e-06, "loss": 0.0025889542885124683, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00259, "step": 1228, "tokens/total": 160784384, "tokens/train_per_sec_per_gpu": 3679.38, "tokens/trainable": 17113820 }, { "epoch": 3.9140127388535033, "grad_norm": 0.1875, "learning_rate": 6.885740265899526e-06, "loss": 0.0027112660463899374, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00271, "step": 1229, "tokens/total": 160915456, "tokens/train_per_sec_per_gpu": 3136.64, "tokens/trainable": 17126964 }, { "epoch": 3.917197452229299, "grad_norm": 0.1591796875, "learning_rate": 6.84747671502893e-06, "loss": 0.002578144893050194, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00258, "step": 1230, "tokens/total": 161046528, "tokens/train_per_sec_per_gpu": 3162.79, "tokens/trainable": 17140190 }, { "epoch": 3.9203821656050954, "grad_norm": 0.2099609375, "learning_rate": 6.809302897170266e-06, "loss": 0.00427253358066082, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00428, "step": 1231, "tokens/total": 161177600, "tokens/train_per_sec_per_gpu": 3541.36, "tokens/trainable": 17154952 }, { "epoch": 3.9235668789808917, "grad_norm": 0.1552734375, "learning_rate": 6.771219001027415e-06, "loss": 0.002364278305321932, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00237, "step": 1232, "tokens/total": 161308672, "tokens/train_per_sec_per_gpu": 3451.33, "tokens/trainable": 17169330 }, { "epoch": 3.926751592356688, "grad_norm": 0.1357421875, "learning_rate": 6.733225214859762e-06, "loss": 0.0026184916496276855, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00262, "step": 1233, "tokens/total": 161439744, "tokens/train_per_sec_per_gpu": 3611.52, "tokens/trainable": 17184330 }, { "epoch": 3.9299363057324843, "grad_norm": 0.1396484375, "learning_rate": 6.695321726481232e-06, "loss": 0.0022467318922281265, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00225, "step": 1234, "tokens/total": 161570816, "tokens/train_per_sec_per_gpu": 3270.16, "tokens/trainable": 17198012 }, { "epoch": 3.93312101910828, "grad_norm": 0.1484375, "learning_rate": 6.657508723259404e-06, "loss": 0.0020928632002323866, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0021, "step": 1235, "tokens/total": 161701888, "tokens/train_per_sec_per_gpu": 3467.93, "tokens/trainable": 17212436 }, { "epoch": 3.9363057324840764, "grad_norm": 0.12060546875, "learning_rate": 6.619786392114557e-06, "loss": 0.0016596732893958688, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00166, "step": 1236, "tokens/total": 161832960, "tokens/train_per_sec_per_gpu": 3175.9, "tokens/trainable": 17225718 }, { "epoch": 3.9394904458598727, "grad_norm": 0.177734375, "learning_rate": 6.582154919518746e-06, "loss": 0.0028763054870069027, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00288, "step": 1237, "tokens/total": 161964032, "tokens/train_per_sec_per_gpu": 3459.65, "tokens/trainable": 17240132 }, { "epoch": 3.9426751592356686, "grad_norm": 0.1416015625, "learning_rate": 6.544614491494885e-06, "loss": 0.0023539350368082523, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00236, "step": 1238, "tokens/total": 162095104, "tokens/train_per_sec_per_gpu": 3782.05, "tokens/trainable": 17255722 }, { "epoch": 3.945859872611465, "grad_norm": 0.12353515625, "learning_rate": 6.507165293615847e-06, "loss": 0.001856530667282641, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00186, "step": 1239, "tokens/total": 162226176, "tokens/train_per_sec_per_gpu": 3107.57, "tokens/trainable": 17268686 }, { "epoch": 3.949044585987261, "grad_norm": 0.2041015625, "learning_rate": 6.469807511003501e-06, "loss": 0.0025471888948231936, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00255, "step": 1240, "tokens/total": 162357248, "tokens/train_per_sec_per_gpu": 3185.35, "tokens/trainable": 17282018 }, { "epoch": 3.9522292993630574, "grad_norm": 0.185546875, "learning_rate": 6.432541328327848e-06, "loss": 0.0031703345011919737, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00318, "step": 1241, "tokens/total": 162488320, "tokens/train_per_sec_per_gpu": 3523.34, "tokens/trainable": 17296706 }, { "epoch": 3.9554140127388537, "grad_norm": 0.1591796875, "learning_rate": 6.395366929806084e-06, "loss": 0.002728913212195039, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00273, "step": 1242, "tokens/total": 162619392, "tokens/train_per_sec_per_gpu": 3361.25, "tokens/trainable": 17310780 }, { "epoch": 3.9585987261146496, "grad_norm": 0.1298828125, "learning_rate": 6.358284499201681e-06, "loss": 0.00209011766128242, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00209, "step": 1243, "tokens/total": 162750464, "tokens/train_per_sec_per_gpu": 3299.45, "tokens/trainable": 17324532 }, { "epoch": 3.961783439490446, "grad_norm": 0.18359375, "learning_rate": 6.3212942198234755e-06, "loss": 0.003096578875556588, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1244, "tokens/total": 162881536, "tokens/train_per_sec_per_gpu": 3590.65, "tokens/trainable": 17339484 }, { "epoch": 3.964968152866242, "grad_norm": 0.177734375, "learning_rate": 6.284396274524809e-06, "loss": 0.002964367624372244, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00297, "step": 1245, "tokens/total": 163012608, "tokens/train_per_sec_per_gpu": 3356.72, "tokens/trainable": 17353532 }, { "epoch": 3.968152866242038, "grad_norm": 0.1630859375, "learning_rate": 6.247590845702553e-06, "loss": 0.0029587389435619116, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00296, "step": 1246, "tokens/total": 163143680, "tokens/train_per_sec_per_gpu": 3114.15, "tokens/trainable": 17366524 }, { "epoch": 3.9713375796178343, "grad_norm": 0.1376953125, "learning_rate": 6.210878115296267e-06, "loss": 0.0023161745630204678, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00232, "step": 1247, "tokens/total": 163274752, "tokens/train_per_sec_per_gpu": 3507.4, "tokens/trainable": 17381160 }, { "epoch": 3.9745222929936306, "grad_norm": 0.15234375, "learning_rate": 6.174258264787283e-06, "loss": 0.002960086800158024, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00296, "step": 1248, "tokens/total": 163405824, "tokens/train_per_sec_per_gpu": 3401.28, "tokens/trainable": 17395386 }, { "epoch": 3.977707006369427, "grad_norm": 0.1435546875, "learning_rate": 6.137731475197775e-06, "loss": 0.0018720726948231459, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00187, "step": 1249, "tokens/total": 163536896, "tokens/train_per_sec_per_gpu": 3215.48, "tokens/trainable": 17408856 }, { "epoch": 3.980891719745223, "grad_norm": 0.142578125, "learning_rate": 6.101297927089905e-06, "loss": 0.0030803410336375237, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00309, "step": 1250, "tokens/total": 163667968, "tokens/train_per_sec_per_gpu": 3254.38, "tokens/trainable": 17422484 }, { "epoch": 3.984076433121019, "grad_norm": 0.1865234375, "learning_rate": 6.064957800564924e-06, "loss": 0.0036575605627149343, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00366, "step": 1251, "tokens/total": 163799040, "tokens/train_per_sec_per_gpu": 3076.57, "tokens/trainable": 17435390 }, { "epoch": 3.9872611464968153, "grad_norm": 0.16796875, "learning_rate": 6.028711275262252e-06, "loss": 0.002414201619103551, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00242, "step": 1252, "tokens/total": 163930112, "tokens/train_per_sec_per_gpu": 3358.96, "tokens/trainable": 17449432 }, { "epoch": 3.9904458598726116, "grad_norm": 0.1572265625, "learning_rate": 5.992558530358638e-06, "loss": 0.002453506924211979, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00246, "step": 1253, "tokens/total": 164061184, "tokens/train_per_sec_per_gpu": 3465.43, "tokens/trainable": 17463888 }, { "epoch": 3.9936305732484074, "grad_norm": 0.173828125, "learning_rate": 5.95649974456724e-06, "loss": 0.0030917448457330465, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1254, "tokens/total": 164192256, "tokens/train_per_sec_per_gpu": 3406.16, "tokens/trainable": 17478118 }, { "epoch": 3.9968152866242037, "grad_norm": 0.1845703125, "learning_rate": 5.920535096136737e-06, "loss": 0.003019727533683181, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00302, "step": 1255, "tokens/total": 164323328, "tokens/train_per_sec_per_gpu": 3013.86, "tokens/trainable": 17491116 }, { "epoch": 4.0, "grad_norm": 0.2255859375, "learning_rate": 5.884664762850467e-06, "loss": 0.0035042453091591597, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 39.25, "memory/max_allocated (GiB)": 39.25, "ppl": 1.00351, "step": 1256, "tokens/total": 164397056, "tokens/train_per_sec_per_gpu": 3307.41, "tokens/trainable": 17498700 }, { "epoch": 4.0, "eval_loss": 0.010103554464876652, "eval_ppl": 1.01015, "eval_runtime": 43.1815, "eval_samples_per_second": 62.55, "eval_steps_per_second": 3.914, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 1256 }, { "epoch": 4.003184713375796, "grad_norm": 0.1142578125, "learning_rate": 5.848888922025553e-06, "loss": 0.0019946754910051823, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.002, "step": 1257, "tokens/total": 164528128, "tokens/train_per_sec_per_gpu": 3365.43, "tokens/trainable": 17512708 }, { "epoch": 4.006369426751593, "grad_norm": 0.125, "learning_rate": 5.813207750511995e-06, "loss": 0.002120796823874116, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00212, "step": 1258, "tokens/total": 164659200, "tokens/train_per_sec_per_gpu": 3244.01, "tokens/trainable": 17526132 }, { "epoch": 4.009554140127388, "grad_norm": 0.142578125, "learning_rate": 5.777621424691834e-06, "loss": 0.0018959781154990196, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0019, "step": 1259, "tokens/total": 164790272, "tokens/train_per_sec_per_gpu": 2857.78, "tokens/trainable": 17538062 }, { "epoch": 4.012738853503185, "grad_norm": 0.126953125, "learning_rate": 5.742130120478265e-06, "loss": 0.002416697796434164, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00242, "step": 1260, "tokens/total": 164921344, "tokens/train_per_sec_per_gpu": 3561.58, "tokens/trainable": 17552824 }, { "epoch": 4.015923566878981, "grad_norm": 0.109375, "learning_rate": 5.706734013314746e-06, "loss": 0.00218612770549953, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00219, "step": 1261, "tokens/total": 165052416, "tokens/train_per_sec_per_gpu": 3426.83, "tokens/trainable": 17567024 }, { "epoch": 4.019108280254777, "grad_norm": 0.10791015625, "learning_rate": 5.671433278174151e-06, "loss": 0.0017273772973567247, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00173, "step": 1262, "tokens/total": 165183488, "tokens/train_per_sec_per_gpu": 3392.24, "tokens/trainable": 17581128 }, { "epoch": 4.022292993630574, "grad_norm": 0.11328125, "learning_rate": 5.636228089557926e-06, "loss": 0.0017078241799026728, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00171, "step": 1263, "tokens/total": 165314560, "tokens/train_per_sec_per_gpu": 3547.78, "tokens/trainable": 17595852 }, { "epoch": 4.025477707006369, "grad_norm": 0.10400390625, "learning_rate": 5.601118621495175e-06, "loss": 0.0014550643973052502, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00146, "step": 1264, "tokens/total": 165445632, "tokens/train_per_sec_per_gpu": 3380.55, "tokens/trainable": 17609924 }, { "epoch": 4.028662420382165, "grad_norm": 0.158203125, "learning_rate": 5.566105047541847e-06, "loss": 0.0025803535245358944, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00258, "step": 1265, "tokens/total": 165576704, "tokens/train_per_sec_per_gpu": 3404.37, "tokens/trainable": 17624118 }, { "epoch": 4.031847133757962, "grad_norm": 0.12890625, "learning_rate": 5.531187540779864e-06, "loss": 0.0025620046071708202, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00257, "step": 1266, "tokens/total": 165707776, "tokens/train_per_sec_per_gpu": 3185.3, "tokens/trainable": 17637446 }, { "epoch": 4.035031847133758, "grad_norm": 0.12158203125, "learning_rate": 5.4963662738162445e-06, "loss": 0.0018056983826681972, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00181, "step": 1267, "tokens/total": 165838848, "tokens/train_per_sec_per_gpu": 3696.5, "tokens/trainable": 17652856 }, { "epoch": 4.038216560509555, "grad_norm": 0.1044921875, "learning_rate": 5.461641418782268e-06, "loss": 0.0014057126827538013, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00141, "step": 1268, "tokens/total": 165969920, "tokens/train_per_sec_per_gpu": 3383.57, "tokens/trainable": 17666958 }, { "epoch": 4.04140127388535, "grad_norm": 0.1240234375, "learning_rate": 5.427013147332638e-06, "loss": 0.0026509405579417944, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00265, "step": 1269, "tokens/total": 166100992, "tokens/train_per_sec_per_gpu": 3482.02, "tokens/trainable": 17681546 }, { "epoch": 4.044585987261146, "grad_norm": 0.1318359375, "learning_rate": 5.392481630644597e-06, "loss": 0.002696407027542591, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0027, "step": 1270, "tokens/total": 166232064, "tokens/train_per_sec_per_gpu": 3270.12, "tokens/trainable": 17695240 }, { "epoch": 4.047770700636943, "grad_norm": 0.11376953125, "learning_rate": 5.358047039417122e-06, "loss": 0.0018320954404771328, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00183, "step": 1271, "tokens/total": 166363136, "tokens/train_per_sec_per_gpu": 3274.88, "tokens/trainable": 17708940 }, { "epoch": 4.050955414012739, "grad_norm": 0.1201171875, "learning_rate": 5.323709543870059e-06, "loss": 0.0021537388674914837, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00216, "step": 1272, "tokens/total": 166494208, "tokens/train_per_sec_per_gpu": 3453.25, "tokens/trainable": 17723348 }, { "epoch": 4.054140127388535, "grad_norm": 0.130859375, "learning_rate": 5.2894693137432645e-06, "loss": 0.0018690548604354262, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00187, "step": 1273, "tokens/total": 166625280, "tokens/train_per_sec_per_gpu": 3002.21, "tokens/trainable": 17735952 }, { "epoch": 4.057324840764331, "grad_norm": 0.162109375, "learning_rate": 5.255326518295792e-06, "loss": 0.002879355102777481, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00288, "step": 1274, "tokens/total": 166756352, "tokens/train_per_sec_per_gpu": 3683.9, "tokens/trainable": 17751344 }, { "epoch": 4.060509554140127, "grad_norm": 0.1220703125, "learning_rate": 5.221281326305066e-06, "loss": 0.0022269003093242645, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 1275, "tokens/total": 166887424, "tokens/train_per_sec_per_gpu": 3651.91, "tokens/trainable": 17766626 }, { "epoch": 4.063694267515924, "grad_norm": 0.11083984375, "learning_rate": 5.187333906065999e-06, "loss": 0.001456463593058288, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00146, "step": 1276, "tokens/total": 167018496, "tokens/train_per_sec_per_gpu": 3273.46, "tokens/trainable": 17780338 }, { "epoch": 4.06687898089172, "grad_norm": 0.07763671875, "learning_rate": 5.15348442539022e-06, "loss": 0.0010698458645492792, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00107, "step": 1277, "tokens/total": 167149568, "tokens/train_per_sec_per_gpu": 3378.38, "tokens/trainable": 17794556 }, { "epoch": 4.070063694267516, "grad_norm": 0.1396484375, "learning_rate": 5.1197330516052025e-06, "loss": 0.002229275880381465, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 1278, "tokens/total": 167280640, "tokens/train_per_sec_per_gpu": 3141.49, "tokens/trainable": 17807720 }, { "epoch": 4.073248407643312, "grad_norm": 0.1513671875, "learning_rate": 5.086079951553444e-06, "loss": 0.0030983053147792816, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1279, "tokens/total": 167411712, "tokens/train_per_sec_per_gpu": 3466.5, "tokens/trainable": 17822220 }, { "epoch": 4.076433121019108, "grad_norm": 0.1611328125, "learning_rate": 5.052525291591651e-06, "loss": 0.0031875702552497387, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00319, "step": 1280, "tokens/total": 167542784, "tokens/train_per_sec_per_gpu": 3276.44, "tokens/trainable": 17835898 }, { "epoch": 4.079617834394904, "grad_norm": 0.111328125, "learning_rate": 5.019069237589921e-06, "loss": 0.0019920531194657087, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00199, "step": 1281, "tokens/total": 167673856, "tokens/train_per_sec_per_gpu": 3318.63, "tokens/trainable": 17849768 }, { "epoch": 4.082802547770701, "grad_norm": 0.1328125, "learning_rate": 4.985711954930902e-06, "loss": 0.0015500528970733285, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00155, "step": 1282, "tokens/total": 167804928, "tokens/train_per_sec_per_gpu": 3025.08, "tokens/trainable": 17862448 }, { "epoch": 4.085987261146497, "grad_norm": 0.138671875, "learning_rate": 4.952453608509e-06, "loss": 0.0018041220027953386, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00181, "step": 1283, "tokens/total": 167936000, "tokens/train_per_sec_per_gpu": 3421.79, "tokens/trainable": 17876746 }, { "epoch": 4.089171974522293, "grad_norm": 0.109375, "learning_rate": 4.919294362729551e-06, "loss": 0.0015523162437602878, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00155, "step": 1284, "tokens/total": 168067072, "tokens/train_per_sec_per_gpu": 3216.81, "tokens/trainable": 17890232 }, { "epoch": 4.092356687898089, "grad_norm": 0.1259765625, "learning_rate": 4.886234381507998e-06, "loss": 0.0025541428476572037, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00256, "step": 1285, "tokens/total": 168198144, "tokens/train_per_sec_per_gpu": 3474.88, "tokens/trainable": 17904764 }, { "epoch": 4.095541401273885, "grad_norm": 0.181640625, "learning_rate": 4.853273828269089e-06, "loss": 0.0028677769005298615, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00287, "step": 1286, "tokens/total": 168329216, "tokens/train_per_sec_per_gpu": 3678.61, "tokens/trainable": 17920040 }, { "epoch": 4.098726114649682, "grad_norm": 0.1845703125, "learning_rate": 4.820412865946092e-06, "loss": 0.003095669439062476, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1287, "tokens/total": 168460288, "tokens/train_per_sec_per_gpu": 3226.04, "tokens/trainable": 17933544 }, { "epoch": 4.101910828025478, "grad_norm": 0.1044921875, "learning_rate": 4.787651656979949e-06, "loss": 0.001217160257510841, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00122, "step": 1288, "tokens/total": 168591360, "tokens/train_per_sec_per_gpu": 3346.97, "tokens/trainable": 17947562 }, { "epoch": 4.1050955414012735, "grad_norm": 0.10791015625, "learning_rate": 4.754990363318501e-06, "loss": 0.0015003203880041838, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0015, "step": 1289, "tokens/total": 168722432, "tokens/train_per_sec_per_gpu": 3462.58, "tokens/trainable": 17962074 }, { "epoch": 4.10828025477707, "grad_norm": 0.10595703125, "learning_rate": 4.722429146415691e-06, "loss": 0.001935549546033144, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00194, "step": 1290, "tokens/total": 168853504, "tokens/train_per_sec_per_gpu": 3245.6, "tokens/trainable": 17975652 }, { "epoch": 4.111464968152866, "grad_norm": 0.1328125, "learning_rate": 4.6899681672307346e-06, "loss": 0.00210759905166924, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00211, "step": 1291, "tokens/total": 168984576, "tokens/train_per_sec_per_gpu": 3417.61, "tokens/trainable": 17989954 }, { "epoch": 4.114649681528663, "grad_norm": 0.111328125, "learning_rate": 4.657607586227345e-06, "loss": 0.0014702447224408388, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00147, "step": 1292, "tokens/total": 169115648, "tokens/train_per_sec_per_gpu": 3709.61, "tokens/trainable": 18005404 }, { "epoch": 4.117834394904459, "grad_norm": 0.12890625, "learning_rate": 4.625347563372964e-06, "loss": 0.0019532586447894573, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00196, "step": 1293, "tokens/total": 169246720, "tokens/train_per_sec_per_gpu": 3349.32, "tokens/trainable": 18019456 }, { "epoch": 4.1210191082802545, "grad_norm": 0.10205078125, "learning_rate": 4.593188258137912e-06, "loss": 0.0014989221235737205, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0015, "step": 1294, "tokens/total": 169377792, "tokens/train_per_sec_per_gpu": 3412.09, "tokens/trainable": 18033708 }, { "epoch": 4.124203821656051, "grad_norm": 0.1240234375, "learning_rate": 4.5611298294946596e-06, "loss": 0.0016446541994810104, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00165, "step": 1295, "tokens/total": 169508864, "tokens/train_per_sec_per_gpu": 3299.98, "tokens/trainable": 18047556 }, { "epoch": 4.127388535031847, "grad_norm": 0.12890625, "learning_rate": 4.529172435917012e-06, "loss": 0.001521661994047463, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00152, "step": 1296, "tokens/total": 169639936, "tokens/train_per_sec_per_gpu": 3090.9, "tokens/trainable": 18060560 }, { "epoch": 4.130573248407643, "grad_norm": 0.1357421875, "learning_rate": 4.497316235379323e-06, "loss": 0.002716638380661607, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00272, "step": 1297, "tokens/total": 169771008, "tokens/train_per_sec_per_gpu": 3121.65, "tokens/trainable": 18073696 }, { "epoch": 4.13375796178344, "grad_norm": 0.1484375, "learning_rate": 4.465561385355712e-06, "loss": 0.0017709597013890743, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00177, "step": 1298, "tokens/total": 169902080, "tokens/train_per_sec_per_gpu": 3532.51, "tokens/trainable": 18088448 }, { "epoch": 4.1369426751592355, "grad_norm": 0.11474609375, "learning_rate": 4.433908042819323e-06, "loss": 0.0015186622040346265, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00152, "step": 1299, "tokens/total": 170033152, "tokens/train_per_sec_per_gpu": 3144.35, "tokens/trainable": 18101652 }, { "epoch": 4.140127388535032, "grad_norm": 0.11767578125, "learning_rate": 4.402356364241489e-06, "loss": 0.001659161178395152, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00166, "step": 1300, "tokens/total": 170164224, "tokens/train_per_sec_per_gpu": 3339.31, "tokens/trainable": 18115704 }, { "epoch": 4.143312101910828, "grad_norm": 0.11474609375, "learning_rate": 4.370906505591007e-06, "loss": 0.0014578705886378884, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00146, "step": 1301, "tokens/total": 170295296, "tokens/train_per_sec_per_gpu": 3235.03, "tokens/trainable": 18129240 }, { "epoch": 4.146496815286624, "grad_norm": 0.13671875, "learning_rate": 4.339558622333353e-06, "loss": 0.0024085917975753546, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 1302, "tokens/total": 170426368, "tokens/train_per_sec_per_gpu": 3590.36, "tokens/trainable": 18144174 }, { "epoch": 4.149681528662421, "grad_norm": 0.1669921875, "learning_rate": 4.308312869429898e-06, "loss": 0.0028695266228169203, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00287, "step": 1303, "tokens/total": 170557440, "tokens/train_per_sec_per_gpu": 3280.59, "tokens/trainable": 18157882 }, { "epoch": 4.1528662420382165, "grad_norm": 0.13671875, "learning_rate": 4.27716940133715e-06, "loss": 0.0024525150656700134, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00246, "step": 1304, "tokens/total": 170688512, "tokens/train_per_sec_per_gpu": 3221.62, "tokens/trainable": 18171456 }, { "epoch": 4.156050955414012, "grad_norm": 0.138671875, "learning_rate": 4.246128372006017e-06, "loss": 0.00208856794051826, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00209, "step": 1305, "tokens/total": 170819584, "tokens/train_per_sec_per_gpu": 3030.29, "tokens/trainable": 18184178 }, { "epoch": 4.159235668789809, "grad_norm": 0.119140625, "learning_rate": 4.215189934881001e-06, "loss": 0.0016645672731101513, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00167, "step": 1306, "tokens/total": 170950656, "tokens/train_per_sec_per_gpu": 3509.84, "tokens/trainable": 18198820 }, { "epoch": 4.162420382165605, "grad_norm": 0.080078125, "learning_rate": 4.1843542428994685e-06, "loss": 0.0010691338684409857, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00107, "step": 1307, "tokens/total": 171081728, "tokens/train_per_sec_per_gpu": 2998.84, "tokens/trainable": 18211426 }, { "epoch": 4.165605095541402, "grad_norm": 0.140625, "learning_rate": 4.153621448490905e-06, "loss": 0.0030363069381564856, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00304, "step": 1308, "tokens/total": 171212800, "tokens/train_per_sec_per_gpu": 3613.64, "tokens/trainable": 18226492 }, { "epoch": 4.1687898089171975, "grad_norm": 0.1650390625, "learning_rate": 4.122991703576121e-06, "loss": 0.0039181094616651535, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00393, "step": 1309, "tokens/total": 171343872, "tokens/train_per_sec_per_gpu": 3326.43, "tokens/trainable": 18240364 }, { "epoch": 4.171974522292993, "grad_norm": 0.1494140625, "learning_rate": 4.092465159566525e-06, "loss": 0.0018522969912737608, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00185, "step": 1310, "tokens/total": 171474944, "tokens/train_per_sec_per_gpu": 3142.02, "tokens/trainable": 18253528 }, { "epoch": 4.17515923566879, "grad_norm": 0.1279296875, "learning_rate": 4.062041967363395e-06, "loss": 0.0022721600253134966, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00227, "step": 1311, "tokens/total": 171606016, "tokens/train_per_sec_per_gpu": 3346.0, "tokens/trainable": 18267536 }, { "epoch": 4.178343949044586, "grad_norm": 0.1328125, "learning_rate": 4.031722277357086e-06, "loss": 0.0017200830625370145, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00172, "step": 1312, "tokens/total": 171737088, "tokens/train_per_sec_per_gpu": 3265.04, "tokens/trainable": 18281204 }, { "epoch": 4.181528662420382, "grad_norm": 0.119140625, "learning_rate": 4.001506239426339e-06, "loss": 0.0018201316706836224, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00182, "step": 1313, "tokens/total": 171868160, "tokens/train_per_sec_per_gpu": 3661.03, "tokens/trainable": 18296460 }, { "epoch": 4.1847133757961785, "grad_norm": 0.09326171875, "learning_rate": 3.971394002937501e-06, "loss": 0.0008904569549486041, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00089, "step": 1314, "tokens/total": 171999232, "tokens/train_per_sec_per_gpu": 2682.19, "tokens/trainable": 18307764 }, { "epoch": 4.187898089171974, "grad_norm": 0.115234375, "learning_rate": 3.941385716743795e-06, "loss": 0.0016649002209305763, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00167, "step": 1315, "tokens/total": 172130304, "tokens/train_per_sec_per_gpu": 3367.15, "tokens/trainable": 18321816 }, { "epoch": 4.191082802547771, "grad_norm": 0.11181640625, "learning_rate": 3.911481529184588e-06, "loss": 0.0019004822243005037, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0019, "step": 1316, "tokens/total": 172261376, "tokens/train_per_sec_per_gpu": 3450.07, "tokens/trainable": 18336222 }, { "epoch": 4.194267515923567, "grad_norm": 0.12353515625, "learning_rate": 3.881681588084674e-06, "loss": 0.0020820728968828917, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00208, "step": 1317, "tokens/total": 172392448, "tokens/train_per_sec_per_gpu": 3549.92, "tokens/trainable": 18351078 }, { "epoch": 4.197452229299363, "grad_norm": 0.16015625, "learning_rate": 3.851986040753505e-06, "loss": 0.002381009515374899, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00238, "step": 1318, "tokens/total": 172523520, "tokens/train_per_sec_per_gpu": 3442.29, "tokens/trainable": 18365480 }, { "epoch": 4.2006369426751595, "grad_norm": 0.08056640625, "learning_rate": 3.822395033984502e-06, "loss": 0.0012018627021461725, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0012, "step": 1319, "tokens/total": 172654592, "tokens/train_per_sec_per_gpu": 3791.81, "tokens/trainable": 18381260 }, { "epoch": 4.203821656050955, "grad_norm": 0.142578125, "learning_rate": 3.792908714054316e-06, "loss": 0.002608443144708872, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00261, "step": 1320, "tokens/total": 172785664, "tokens/train_per_sec_per_gpu": 3220.83, "tokens/trainable": 18394756 }, { "epoch": 4.207006369426751, "grad_norm": 0.12060546875, "learning_rate": 3.7635272267220858e-06, "loss": 0.0018472287338227034, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00185, "step": 1321, "tokens/total": 172916736, "tokens/train_per_sec_per_gpu": 3254.63, "tokens/trainable": 18408414 }, { "epoch": 4.210191082802548, "grad_norm": 0.1669921875, "learning_rate": 3.734250717228735e-06, "loss": 0.00441823760047555, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00443, "step": 1322, "tokens/total": 173047808, "tokens/train_per_sec_per_gpu": 3065.51, "tokens/trainable": 18421262 }, { "epoch": 4.213375796178344, "grad_norm": 0.10791015625, "learning_rate": 3.7050793302962685e-06, "loss": 0.0016929913545027375, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00169, "step": 1323, "tokens/total": 173178880, "tokens/train_per_sec_per_gpu": 3282.3, "tokens/trainable": 18434992 }, { "epoch": 4.2165605095541405, "grad_norm": 0.1455078125, "learning_rate": 3.676013210127022e-06, "loss": 0.0025385431945323944, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00254, "step": 1324, "tokens/total": 173309952, "tokens/train_per_sec_per_gpu": 3103.93, "tokens/trainable": 18447992 }, { "epoch": 4.219745222929936, "grad_norm": 0.1123046875, "learning_rate": 3.647052500402981e-06, "loss": 0.0015956538263708353, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1325, "tokens/total": 173441024, "tokens/train_per_sec_per_gpu": 3147.17, "tokens/trainable": 18461184 }, { "epoch": 4.222929936305732, "grad_norm": 0.1259765625, "learning_rate": 3.6181973442850597e-06, "loss": 0.001635034685023129, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00164, "step": 1326, "tokens/total": 173572096, "tokens/train_per_sec_per_gpu": 3715.4, "tokens/trainable": 18476658 }, { "epoch": 4.226114649681529, "grad_norm": 0.1416015625, "learning_rate": 3.589447884412378e-06, "loss": 0.0025338195264339447, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00254, "step": 1327, "tokens/total": 173703168, "tokens/train_per_sec_per_gpu": 3364.82, "tokens/trainable": 18490728 }, { "epoch": 4.229299363057325, "grad_norm": 0.1025390625, "learning_rate": 3.5608042629015707e-06, "loss": 0.001267962739802897, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00127, "step": 1328, "tokens/total": 173834240, "tokens/train_per_sec_per_gpu": 3268.7, "tokens/trainable": 18504432 }, { "epoch": 4.232484076433121, "grad_norm": 0.1396484375, "learning_rate": 3.532266621346103e-06, "loss": 0.0019486568635329604, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00195, "step": 1329, "tokens/total": 173965312, "tokens/train_per_sec_per_gpu": 3287.05, "tokens/trainable": 18518208 }, { "epoch": 4.235668789808917, "grad_norm": 0.1748046875, "learning_rate": 3.5038351008155226e-06, "loss": 0.002935834927484393, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00294, "step": 1330, "tokens/total": 174096384, "tokens/train_per_sec_per_gpu": 3210.37, "tokens/trainable": 18531630 }, { "epoch": 4.238853503184713, "grad_norm": 0.11767578125, "learning_rate": 3.4755098418548155e-06, "loss": 0.0018774013733491302, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00188, "step": 1331, "tokens/total": 174227456, "tokens/train_per_sec_per_gpu": 3384.93, "tokens/trainable": 18545764 }, { "epoch": 4.24203821656051, "grad_norm": 0.11083984375, "learning_rate": 3.4472909844836837e-06, "loss": 0.001764771994203329, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00177, "step": 1332, "tokens/total": 174358528, "tokens/train_per_sec_per_gpu": 3577.08, "tokens/trainable": 18560662 }, { "epoch": 4.245222929936306, "grad_norm": 0.130859375, "learning_rate": 3.4191786681958437e-06, "loss": 0.0026986815501004457, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0027, "step": 1333, "tokens/total": 174489600, "tokens/train_per_sec_per_gpu": 3282.73, "tokens/trainable": 18574404 }, { "epoch": 4.248407643312102, "grad_norm": 0.138671875, "learning_rate": 3.39117303195835e-06, "loss": 0.0022144122049212456, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00222, "step": 1334, "tokens/total": 174620672, "tokens/train_per_sec_per_gpu": 3194.16, "tokens/trainable": 18587768 }, { "epoch": 4.251592356687898, "grad_norm": 0.1435546875, "learning_rate": 3.3632742142109293e-06, "loss": 0.00270890723913908, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00271, "step": 1335, "tokens/total": 174751744, "tokens/train_per_sec_per_gpu": 3411.3, "tokens/trainable": 18602002 }, { "epoch": 4.254777070063694, "grad_norm": 0.16796875, "learning_rate": 3.3354823528652463e-06, "loss": 0.0023235008120536804, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00233, "step": 1336, "tokens/total": 174882816, "tokens/train_per_sec_per_gpu": 3230.27, "tokens/trainable": 18615552 }, { "epoch": 4.25796178343949, "grad_norm": 0.1767578125, "learning_rate": 3.3077975853042703e-06, "loss": 0.002815892221406102, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00282, "step": 1337, "tokens/total": 175013888, "tokens/train_per_sec_per_gpu": 3239.06, "tokens/trainable": 18629086 }, { "epoch": 4.261146496815287, "grad_norm": 0.166015625, "learning_rate": 3.280220048381574e-06, "loss": 0.002695944393053651, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0027, "step": 1338, "tokens/total": 175144960, "tokens/train_per_sec_per_gpu": 3597.1, "tokens/trainable": 18644090 }, { "epoch": 4.264331210191083, "grad_norm": 0.1396484375, "learning_rate": 3.252749878420647e-06, "loss": 0.0021448852494359016, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00215, "step": 1339, "tokens/total": 175276032, "tokens/train_per_sec_per_gpu": 3364.52, "tokens/trainable": 18658172 }, { "epoch": 4.267515923566879, "grad_norm": 0.12060546875, "learning_rate": 3.225387211214237e-06, "loss": 0.001379702938720584, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00138, "step": 1340, "tokens/total": 175407104, "tokens/train_per_sec_per_gpu": 3324.64, "tokens/trainable": 18672098 }, { "epoch": 4.270700636942675, "grad_norm": 0.146484375, "learning_rate": 3.1981321820236885e-06, "loss": 0.002582112792879343, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00259, "step": 1341, "tokens/total": 175538176, "tokens/train_per_sec_per_gpu": 3665.7, "tokens/trainable": 18687300 }, { "epoch": 4.273885350318471, "grad_norm": 0.126953125, "learning_rate": 3.1709849255782466e-06, "loss": 0.0017478655790910125, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00175, "step": 1342, "tokens/total": 175669248, "tokens/train_per_sec_per_gpu": 3348.03, "tokens/trainable": 18701322 }, { "epoch": 4.277070063694268, "grad_norm": 0.12890625, "learning_rate": 3.1439455760744112e-06, "loss": 0.0016232930356636643, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00162, "step": 1343, "tokens/total": 175800320, "tokens/train_per_sec_per_gpu": 3355.1, "tokens/trainable": 18715342 }, { "epoch": 4.280254777070064, "grad_norm": 0.1005859375, "learning_rate": 3.117014267175275e-06, "loss": 0.0013508808333426714, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00135, "step": 1344, "tokens/total": 175931392, "tokens/train_per_sec_per_gpu": 3356.71, "tokens/trainable": 18729364 }, { "epoch": 4.2834394904458595, "grad_norm": 0.158203125, "learning_rate": 3.0901911320098426e-06, "loss": 0.002793082967400551, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0028, "step": 1345, "tokens/total": 176062464, "tokens/train_per_sec_per_gpu": 3247.41, "tokens/trainable": 18742944 }, { "epoch": 4.286624203821656, "grad_norm": 0.130859375, "learning_rate": 3.0634763031723882e-06, "loss": 0.0016741371946409345, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00168, "step": 1346, "tokens/total": 176193536, "tokens/train_per_sec_per_gpu": 3132.92, "tokens/trainable": 18756064 }, { "epoch": 4.289808917197452, "grad_norm": 0.0947265625, "learning_rate": 3.036869912721807e-06, "loss": 0.0012669205898419023, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00127, "step": 1347, "tokens/total": 176324608, "tokens/train_per_sec_per_gpu": 3626.61, "tokens/trainable": 18771148 }, { "epoch": 4.292993630573249, "grad_norm": 0.095703125, "learning_rate": 3.010372092180941e-06, "loss": 0.0014189573703333735, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00142, "step": 1348, "tokens/total": 176455680, "tokens/train_per_sec_per_gpu": 3173.45, "tokens/trainable": 18784398 }, { "epoch": 4.296178343949045, "grad_norm": 0.1279296875, "learning_rate": 2.983982972535948e-06, "loss": 0.0028286417946219444, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00283, "step": 1349, "tokens/total": 176586752, "tokens/train_per_sec_per_gpu": 3281.6, "tokens/trainable": 18798140 }, { "epoch": 4.2993630573248405, "grad_norm": 0.1552734375, "learning_rate": 2.9577026842356527e-06, "loss": 0.002894408069550991, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0029, "step": 1350, "tokens/total": 176717824, "tokens/train_per_sec_per_gpu": 3578.34, "tokens/trainable": 18813032 }, { "epoch": 4.302547770700637, "grad_norm": 0.154296875, "learning_rate": 2.931531357190881e-06, "loss": 0.0021069981157779694, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00211, "step": 1351, "tokens/total": 176848896, "tokens/train_per_sec_per_gpu": 3439.97, "tokens/trainable": 18827370 }, { "epoch": 4.305732484076433, "grad_norm": 0.11376953125, "learning_rate": 2.905469120773835e-06, "loss": 0.002290198812261224, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00229, "step": 1352, "tokens/total": 176979968, "tokens/train_per_sec_per_gpu": 3402.53, "tokens/trainable": 18841572 }, { "epoch": 4.308917197452229, "grad_norm": 0.16015625, "learning_rate": 2.8795161038174675e-06, "loss": 0.0023499338421970606, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00235, "step": 1353, "tokens/total": 177111040, "tokens/train_per_sec_per_gpu": 3406.22, "tokens/trainable": 18855776 }, { "epoch": 4.312101910828026, "grad_norm": 0.10205078125, "learning_rate": 2.853672434614807e-06, "loss": 0.0013938483316451311, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00139, "step": 1354, "tokens/total": 177242112, "tokens/train_per_sec_per_gpu": 3788.18, "tokens/trainable": 18871550 }, { "epoch": 4.3152866242038215, "grad_norm": 0.1328125, "learning_rate": 2.8279382409183598e-06, "loss": 0.0020433831959962845, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1355, "tokens/total": 177373184, "tokens/train_per_sec_per_gpu": 3264.03, "tokens/trainable": 18885240 }, { "epoch": 4.318471337579618, "grad_norm": 0.10302734375, "learning_rate": 2.802313649939467e-06, "loss": 0.0011658279690891504, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00117, "step": 1356, "tokens/total": 177504256, "tokens/train_per_sec_per_gpu": 3001.88, "tokens/trainable": 18897798 }, { "epoch": 4.321656050955414, "grad_norm": 0.1318359375, "learning_rate": 2.7767987883476622e-06, "loss": 0.0021784165874123573, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00218, "step": 1357, "tokens/total": 177635328, "tokens/train_per_sec_per_gpu": 3337.15, "tokens/trainable": 18911724 }, { "epoch": 4.32484076433121, "grad_norm": 0.1396484375, "learning_rate": 2.7513937822700508e-06, "loss": 0.002125969622284174, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00213, "step": 1358, "tokens/total": 177766400, "tokens/train_per_sec_per_gpu": 3060.55, "tokens/trainable": 18924596 }, { "epoch": 4.328025477707007, "grad_norm": 0.142578125, "learning_rate": 2.7260987572907153e-06, "loss": 0.0018263484816998243, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00183, "step": 1359, "tokens/total": 177897472, "tokens/train_per_sec_per_gpu": 3272.47, "tokens/trainable": 18938296 }, { "epoch": 4.3312101910828025, "grad_norm": 0.11865234375, "learning_rate": 2.700913838450042e-06, "loss": 0.0014197917189449072, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00142, "step": 1360, "tokens/total": 178028544, "tokens/train_per_sec_per_gpu": 3245.66, "tokens/trainable": 18951818 }, { "epoch": 4.334394904458598, "grad_norm": 0.11376953125, "learning_rate": 2.675839150244153e-06, "loss": 0.0016245257575064898, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00163, "step": 1361, "tokens/total": 178159616, "tokens/train_per_sec_per_gpu": 3313.42, "tokens/trainable": 18965684 }, { "epoch": 4.337579617834395, "grad_norm": 0.1376953125, "learning_rate": 2.650874816624266e-06, "loss": 0.0019813040271401405, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00198, "step": 1362, "tokens/total": 178290688, "tokens/train_per_sec_per_gpu": 3158.46, "tokens/trainable": 18978904 }, { "epoch": 4.340764331210191, "grad_norm": 0.130859375, "learning_rate": 2.6260209609960757e-06, "loss": 0.0024794619530439377, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00248, "step": 1363, "tokens/total": 178421760, "tokens/train_per_sec_per_gpu": 3507.13, "tokens/trainable": 18993500 }, { "epoch": 4.343949044585988, "grad_norm": 0.1474609375, "learning_rate": 2.6012777062191547e-06, "loss": 0.002862154971808195, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00287, "step": 1364, "tokens/total": 178552832, "tokens/train_per_sec_per_gpu": 3531.99, "tokens/trainable": 19008250 }, { "epoch": 4.3471337579617835, "grad_norm": 0.10693359375, "learning_rate": 2.5766451746063598e-06, "loss": 0.0013462984934449196, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00135, "step": 1365, "tokens/total": 178683904, "tokens/train_per_sec_per_gpu": 3460.1, "tokens/trainable": 19022668 }, { "epoch": 4.350318471337579, "grad_norm": 0.142578125, "learning_rate": 2.5521234879231887e-06, "loss": 0.002731763059273362, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00274, "step": 1366, "tokens/total": 178814976, "tokens/train_per_sec_per_gpu": 3570.61, "tokens/trainable": 19037614 }, { "epoch": 4.353503184713376, "grad_norm": 0.1083984375, "learning_rate": 2.527712767387222e-06, "loss": 0.0014442024985328317, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00145, "step": 1367, "tokens/total": 178946048, "tokens/train_per_sec_per_gpu": 3629.25, "tokens/trainable": 19052728 }, { "epoch": 4.356687898089172, "grad_norm": 0.123046875, "learning_rate": 2.5034131336674956e-06, "loss": 0.0018038805574178696, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00181, "step": 1368, "tokens/total": 179077120, "tokens/train_per_sec_per_gpu": 3323.04, "tokens/trainable": 19066614 }, { "epoch": 4.359872611464968, "grad_norm": 0.177734375, "learning_rate": 2.4792247068839064e-06, "loss": 0.0023225173354148865, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00233, "step": 1369, "tokens/total": 179208192, "tokens/train_per_sec_per_gpu": 3675.14, "tokens/trainable": 19081856 }, { "epoch": 4.3630573248407645, "grad_norm": 0.158203125, "learning_rate": 2.4551476066066307e-06, "loss": 0.003056393703445792, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00306, "step": 1370, "tokens/total": 179339264, "tokens/train_per_sec_per_gpu": 3829.68, "tokens/trainable": 19097794 }, { "epoch": 4.36624203821656, "grad_norm": 0.1806640625, "learning_rate": 2.4311819518555295e-06, "loss": 0.0030934589449316263, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0031, "step": 1371, "tokens/total": 179470336, "tokens/train_per_sec_per_gpu": 3060.11, "tokens/trainable": 19110620 }, { "epoch": 4.369426751592357, "grad_norm": 0.14453125, "learning_rate": 2.407327861099548e-06, "loss": 0.0017585513414815068, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00176, "step": 1372, "tokens/total": 179601408, "tokens/train_per_sec_per_gpu": 3435.78, "tokens/trainable": 19125008 }, { "epoch": 4.372611464968153, "grad_norm": 0.09814453125, "learning_rate": 2.383585452256146e-06, "loss": 0.0014080167748034, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00141, "step": 1373, "tokens/total": 179732480, "tokens/train_per_sec_per_gpu": 3677.42, "tokens/trainable": 19140292 }, { "epoch": 4.375796178343949, "grad_norm": 0.12109375, "learning_rate": 2.359954842690712e-06, "loss": 0.0016012933338060975, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1374, "tokens/total": 179863552, "tokens/train_per_sec_per_gpu": 2983.63, "tokens/trainable": 19152852 }, { "epoch": 4.3789808917197455, "grad_norm": 0.1455078125, "learning_rate": 2.336436149215973e-06, "loss": 0.00259294337593019, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0026, "step": 1375, "tokens/total": 179994624, "tokens/train_per_sec_per_gpu": 3180.68, "tokens/trainable": 19166198 }, { "epoch": 4.382165605095541, "grad_norm": 0.1171875, "learning_rate": 2.3130294880914173e-06, "loss": 0.0015589562244713306, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00156, "step": 1376, "tokens/total": 180125696, "tokens/train_per_sec_per_gpu": 3318.63, "tokens/trainable": 19180092 }, { "epoch": 4.385350318471337, "grad_norm": 0.146484375, "learning_rate": 2.289734975022742e-06, "loss": 0.0024165399372577667, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00242, "step": 1377, "tokens/total": 180256768, "tokens/train_per_sec_per_gpu": 3487.98, "tokens/trainable": 19194700 }, { "epoch": 4.388535031847134, "grad_norm": 0.10791015625, "learning_rate": 2.266552725161247e-06, "loss": 0.0012368856696411967, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00124, "step": 1378, "tokens/total": 180387840, "tokens/train_per_sec_per_gpu": 3407.24, "tokens/trainable": 19208968 }, { "epoch": 4.39171974522293, "grad_norm": 0.1318359375, "learning_rate": 2.2434828531032988e-06, "loss": 0.002780412556603551, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00278, "step": 1379, "tokens/total": 180518912, "tokens/train_per_sec_per_gpu": 3096.75, "tokens/trainable": 19221936 }, { "epoch": 4.3949044585987265, "grad_norm": 0.10400390625, "learning_rate": 2.220525472889748e-06, "loss": 0.0017908208537846804, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00179, "step": 1380, "tokens/total": 180649984, "tokens/train_per_sec_per_gpu": 3574.84, "tokens/trainable": 19236852 }, { "epoch": 4.398089171974522, "grad_norm": 0.150390625, "learning_rate": 2.1976806980053556e-06, "loss": 0.0019308646442368627, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00193, "step": 1381, "tokens/total": 180781056, "tokens/train_per_sec_per_gpu": 3120.85, "tokens/trainable": 19249910 }, { "epoch": 4.401273885350318, "grad_norm": 0.142578125, "learning_rate": 2.1749486413782437e-06, "loss": 0.001861095312051475, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00186, "step": 1382, "tokens/total": 180912128, "tokens/train_per_sec_per_gpu": 3109.88, "tokens/trainable": 19262932 }, { "epoch": 4.404458598726115, "grad_norm": 0.1279296875, "learning_rate": 2.1523294153793532e-06, "loss": 0.0020333300344645977, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00204, "step": 1383, "tokens/total": 181043200, "tokens/train_per_sec_per_gpu": 3773.27, "tokens/trainable": 19278666 }, { "epoch": 4.407643312101911, "grad_norm": 0.1357421875, "learning_rate": 2.129823131821848e-06, "loss": 0.0016740905120968819, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00168, "step": 1384, "tokens/total": 181174272, "tokens/train_per_sec_per_gpu": 3304.58, "tokens/trainable": 19292464 }, { "epoch": 4.4108280254777075, "grad_norm": 0.134765625, "learning_rate": 2.107429901960603e-06, "loss": 0.0017093883361667395, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00171, "step": 1385, "tokens/total": 181305344, "tokens/train_per_sec_per_gpu": 3359.66, "tokens/trainable": 19306468 }, { "epoch": 4.414012738853503, "grad_norm": 0.140625, "learning_rate": 2.0851498364916345e-06, "loss": 0.002314978279173374, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00232, "step": 1386, "tokens/total": 181436416, "tokens/train_per_sec_per_gpu": 3591.52, "tokens/trainable": 19321404 }, { "epoch": 4.417197452229299, "grad_norm": 0.10546875, "learning_rate": 2.062983045551553e-06, "loss": 0.0015969820087775588, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1387, "tokens/total": 181567488, "tokens/train_per_sec_per_gpu": 2748.49, "tokens/trainable": 19334018 }, { "epoch": 4.420382165605096, "grad_norm": 0.12890625, "learning_rate": 2.0409296387170125e-06, "loss": 0.002041134750470519, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00204, "step": 1388, "tokens/total": 181698560, "tokens/train_per_sec_per_gpu": 3103.87, "tokens/trainable": 19347006 }, { "epoch": 4.423566878980892, "grad_norm": 0.1357421875, "learning_rate": 2.0189897250041945e-06, "loss": 0.002131557324901223, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00213, "step": 1389, "tokens/total": 181829632, "tokens/train_per_sec_per_gpu": 3187.5, "tokens/trainable": 19360352 }, { "epoch": 4.426751592356688, "grad_norm": 0.1474609375, "learning_rate": 1.997163412868239e-06, "loss": 0.002050690818578005, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1390, "tokens/total": 181960704, "tokens/train_per_sec_per_gpu": 3264.66, "tokens/trainable": 19374028 }, { "epoch": 4.429936305732484, "grad_norm": 0.1376953125, "learning_rate": 1.975450810202725e-06, "loss": 0.002214430132880807, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00222, "step": 1391, "tokens/total": 182091776, "tokens/train_per_sec_per_gpu": 3059.02, "tokens/trainable": 19386836 }, { "epoch": 4.43312101910828, "grad_norm": 0.1455078125, "learning_rate": 1.953852024339145e-06, "loss": 0.0023007793352007866, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0023, "step": 1392, "tokens/total": 182222848, "tokens/train_per_sec_per_gpu": 3096.57, "tokens/trainable": 19399808 }, { "epoch": 4.436305732484076, "grad_norm": 0.12060546875, "learning_rate": 1.9323671620463446e-06, "loss": 0.002242110203951597, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00224, "step": 1393, "tokens/total": 182353920, "tokens/train_per_sec_per_gpu": 3424.27, "tokens/trainable": 19414156 }, { "epoch": 4.439490445859873, "grad_norm": 0.142578125, "learning_rate": 1.9109963295300183e-06, "loss": 0.002074864227324724, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00208, "step": 1394, "tokens/total": 182484992, "tokens/train_per_sec_per_gpu": 2963.93, "tokens/trainable": 19426592 }, { "epoch": 4.442675159235669, "grad_norm": 0.158203125, "learning_rate": 1.8897396324321914e-06, "loss": 0.0032230939250439405, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00323, "step": 1395, "tokens/total": 182616064, "tokens/train_per_sec_per_gpu": 3361.36, "tokens/trainable": 19440628 }, { "epoch": 4.445859872611465, "grad_norm": 0.1845703125, "learning_rate": 1.8685971758306691e-06, "loss": 0.0027499471325427294, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00275, "step": 1396, "tokens/total": 182747136, "tokens/train_per_sec_per_gpu": 3511.89, "tokens/trainable": 19455336 }, { "epoch": 4.449044585987261, "grad_norm": 0.130859375, "learning_rate": 1.8475690642385468e-06, "loss": 0.0020744299981743097, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00208, "step": 1397, "tokens/total": 182878208, "tokens/train_per_sec_per_gpu": 3570.48, "tokens/trainable": 19470248 }, { "epoch": 4.452229299363057, "grad_norm": 0.123046875, "learning_rate": 1.8266554016036803e-06, "loss": 0.0015029326314106584, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0015, "step": 1398, "tokens/total": 183009280, "tokens/train_per_sec_per_gpu": 3170.84, "tokens/trainable": 19483584 }, { "epoch": 4.455414012738854, "grad_norm": 0.1171875, "learning_rate": 1.805856291308161e-06, "loss": 0.0015301044331863523, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00153, "step": 1399, "tokens/total": 183140352, "tokens/train_per_sec_per_gpu": 3104.15, "tokens/trainable": 19496574 }, { "epoch": 4.45859872611465, "grad_norm": 0.1328125, "learning_rate": 1.7851718361678205e-06, "loss": 0.0024863574653863907, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00249, "step": 1400, "tokens/total": 183271424, "tokens/train_per_sec_per_gpu": 3049.29, "tokens/trainable": 19509346 }, { "epoch": 4.461783439490446, "grad_norm": 0.130859375, "learning_rate": 1.7646021384317201e-06, "loss": 0.0017364751547574997, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00174, "step": 1401, "tokens/total": 183402496, "tokens/train_per_sec_per_gpu": 2926.42, "tokens/trainable": 19521620 }, { "epoch": 4.464968152866242, "grad_norm": 0.1494140625, "learning_rate": 1.7441472997816538e-06, "loss": 0.0021231744904071093, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00213, "step": 1402, "tokens/total": 183533568, "tokens/train_per_sec_per_gpu": 3355.02, "tokens/trainable": 19535636 }, { "epoch": 4.468152866242038, "grad_norm": 0.12255859375, "learning_rate": 1.7238074213316107e-06, "loss": 0.0017344644293189049, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00174, "step": 1403, "tokens/total": 183664640, "tokens/train_per_sec_per_gpu": 3171.71, "tokens/trainable": 19548924 }, { "epoch": 4.471337579617835, "grad_norm": 0.119140625, "learning_rate": 1.703582603627321e-06, "loss": 0.0015079887816682458, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00151, "step": 1404, "tokens/total": 183795712, "tokens/train_per_sec_per_gpu": 3552.52, "tokens/trainable": 19563736 }, { "epoch": 4.474522292993631, "grad_norm": 0.1279296875, "learning_rate": 1.6834729466457256e-06, "loss": 0.0015849830815568566, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00159, "step": 1405, "tokens/total": 183926784, "tokens/train_per_sec_per_gpu": 3232.9, "tokens/trainable": 19577276 }, { "epoch": 4.477707006369426, "grad_norm": 0.1337890625, "learning_rate": 1.6634785497944922e-06, "loss": 0.002168088685721159, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00217, "step": 1406, "tokens/total": 184057856, "tokens/train_per_sec_per_gpu": 3279.65, "tokens/trainable": 19591000 }, { "epoch": 4.480891719745223, "grad_norm": 0.15234375, "learning_rate": 1.6435995119115367e-06, "loss": 0.0026027678977698088, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00261, "step": 1407, "tokens/total": 184188928, "tokens/train_per_sec_per_gpu": 3467.93, "tokens/trainable": 19605518 }, { "epoch": 4.484076433121019, "grad_norm": 0.130859375, "learning_rate": 1.6238359312645168e-06, "loss": 0.0017946372972801328, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0018, "step": 1408, "tokens/total": 184320000, "tokens/train_per_sec_per_gpu": 3080.27, "tokens/trainable": 19618436 }, { "epoch": 4.487261146496815, "grad_norm": 0.126953125, "learning_rate": 1.6041879055503473e-06, "loss": 0.002403007121756673, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 1409, "tokens/total": 184451072, "tokens/train_per_sec_per_gpu": 3474.59, "tokens/trainable": 19632904 }, { "epoch": 4.490445859872612, "grad_norm": 0.12890625, "learning_rate": 1.5846555318947353e-06, "loss": 0.0019406620413064957, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00194, "step": 1410, "tokens/total": 184582144, "tokens/train_per_sec_per_gpu": 3262.74, "tokens/trainable": 19646574 }, { "epoch": 4.493630573248407, "grad_norm": 0.123046875, "learning_rate": 1.5652389068516765e-06, "loss": 0.0018433219520375133, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00185, "step": 1411, "tokens/total": 184713216, "tokens/train_per_sec_per_gpu": 3327.14, "tokens/trainable": 19660446 }, { "epoch": 4.496815286624204, "grad_norm": 0.1171875, "learning_rate": 1.5459381264029904e-06, "loss": 0.0018597168382257223, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00186, "step": 1412, "tokens/total": 184844288, "tokens/train_per_sec_per_gpu": 3955.39, "tokens/trainable": 19676842 }, { "epoch": 4.5, "grad_norm": 0.1328125, "learning_rate": 1.5267532859578437e-06, "loss": 0.0019280803389847279, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00193, "step": 1413, "tokens/total": 184975360, "tokens/train_per_sec_per_gpu": 3216.74, "tokens/trainable": 19690306 }, { "epoch": 4.5, "eval_loss": 0.010314718820154667, "eval_ppl": 1.01037, "eval_runtime": 41.6339, "eval_samples_per_second": 64.875, "eval_steps_per_second": 4.059, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 1413 }, { "epoch": 4.503184713375796, "grad_norm": 0.1279296875, "learning_rate": 1.5076844803522922e-06, "loss": 0.0020255008712410927, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00203, "step": 1414, "tokens/total": 185106432, "tokens/train_per_sec_per_gpu": 3418.68, "tokens/trainable": 19704608 }, { "epoch": 4.506369426751593, "grad_norm": 0.1220703125, "learning_rate": 1.4887318038487752e-06, "loss": 0.0020268706139177084, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00203, "step": 1415, "tokens/total": 185237504, "tokens/train_per_sec_per_gpu": 3371.87, "tokens/trainable": 19718710 }, { "epoch": 4.509554140127388, "grad_norm": 0.12890625, "learning_rate": 1.4698953501356972e-06, "loss": 0.00200156238861382, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.002, "step": 1416, "tokens/total": 185368576, "tokens/train_per_sec_per_gpu": 3222.46, "tokens/trainable": 19732218 }, { "epoch": 4.512738853503185, "grad_norm": 0.115234375, "learning_rate": 1.4511752123269245e-06, "loss": 0.0017808325355872512, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00178, "step": 1417, "tokens/total": 185499648, "tokens/train_per_sec_per_gpu": 3317.65, "tokens/trainable": 19746108 }, { "epoch": 4.515923566878981, "grad_norm": 0.12255859375, "learning_rate": 1.432571482961345e-06, "loss": 0.0017151820939034224, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00172, "step": 1418, "tokens/total": 185630720, "tokens/train_per_sec_per_gpu": 3182.32, "tokens/trainable": 19759432 }, { "epoch": 4.519108280254777, "grad_norm": 0.17578125, "learning_rate": 1.4140842540024123e-06, "loss": 0.002563396468758583, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00257, "step": 1419, "tokens/total": 185761792, "tokens/train_per_sec_per_gpu": 3137.69, "tokens/trainable": 19772572 }, { "epoch": 4.522292993630574, "grad_norm": 0.11376953125, "learning_rate": 1.3957136168376822e-06, "loss": 0.0014816210605204105, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00148, "step": 1420, "tokens/total": 185892864, "tokens/train_per_sec_per_gpu": 3152.7, "tokens/trainable": 19785788 }, { "epoch": 4.525477707006369, "grad_norm": 0.11767578125, "learning_rate": 1.3774596622783604e-06, "loss": 0.0015394608490169048, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00154, "step": 1421, "tokens/total": 186023936, "tokens/train_per_sec_per_gpu": 3228.29, "tokens/trainable": 19799284 }, { "epoch": 4.528662420382165, "grad_norm": 0.1455078125, "learning_rate": 1.3593224805588722e-06, "loss": 0.0022464555222541094, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00225, "step": 1422, "tokens/total": 186155008, "tokens/train_per_sec_per_gpu": 3587.26, "tokens/trainable": 19814314 }, { "epoch": 4.531847133757962, "grad_norm": 0.07177734375, "learning_rate": 1.341302161336383e-06, "loss": 0.0008602555026300251, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00086, "step": 1423, "tokens/total": 186286080, "tokens/train_per_sec_per_gpu": 3203.52, "tokens/trainable": 19827672 }, { "epoch": 4.535031847133758, "grad_norm": 0.11572265625, "learning_rate": 1.3233987936903808e-06, "loss": 0.0015553171979263425, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00156, "step": 1424, "tokens/total": 186417152, "tokens/train_per_sec_per_gpu": 3742.26, "tokens/trainable": 19843252 }, { "epoch": 4.538216560509554, "grad_norm": 0.1435546875, "learning_rate": 1.3056124661222357e-06, "loss": 0.0025198939256370068, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00252, "step": 1425, "tokens/total": 186548224, "tokens/train_per_sec_per_gpu": 3418.51, "tokens/trainable": 19857490 }, { "epoch": 4.54140127388535, "grad_norm": 0.1123046875, "learning_rate": 1.2879432665547558e-06, "loss": 0.002200118498876691, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0022, "step": 1426, "tokens/total": 186679296, "tokens/train_per_sec_per_gpu": 3288.15, "tokens/trainable": 19871216 }, { "epoch": 4.544585987261146, "grad_norm": 0.134765625, "learning_rate": 1.27039128233174e-06, "loss": 0.0021478794515132904, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00215, "step": 1427, "tokens/total": 186810368, "tokens/train_per_sec_per_gpu": 3148.41, "tokens/trainable": 19884396 }, { "epoch": 4.547770700636943, "grad_norm": 0.1572265625, "learning_rate": 1.2529566002175753e-06, "loss": 0.002553946105763316, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00256, "step": 1428, "tokens/total": 186941440, "tokens/train_per_sec_per_gpu": 3403.73, "tokens/trainable": 19898644 }, { "epoch": 4.550955414012739, "grad_norm": 0.12890625, "learning_rate": 1.2356393063967798e-06, "loss": 0.001968652941286564, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00197, "step": 1429, "tokens/total": 187072512, "tokens/train_per_sec_per_gpu": 3468.59, "tokens/trainable": 19913104 }, { "epoch": 4.554140127388535, "grad_norm": 0.12255859375, "learning_rate": 1.2184394864735881e-06, "loss": 0.00198244652710855, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00198, "step": 1430, "tokens/total": 187203584, "tokens/train_per_sec_per_gpu": 3461.21, "tokens/trainable": 19927596 }, { "epoch": 4.557324840764331, "grad_norm": 0.1494140625, "learning_rate": 1.201357225471536e-06, "loss": 0.0016815853305161, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00168, "step": 1431, "tokens/total": 187334656, "tokens/train_per_sec_per_gpu": 3043.87, "tokens/trainable": 19940348 }, { "epoch": 4.560509554140127, "grad_norm": 0.1318359375, "learning_rate": 1.184392607833032e-06, "loss": 0.0021309617441147566, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00213, "step": 1432, "tokens/total": 187465728, "tokens/train_per_sec_per_gpu": 3664.84, "tokens/trainable": 19955600 }, { "epoch": 4.563694267515924, "grad_norm": 0.1494140625, "learning_rate": 1.1675457174189302e-06, "loss": 0.00207577389664948, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00208, "step": 1433, "tokens/total": 187596800, "tokens/train_per_sec_per_gpu": 3301.91, "tokens/trainable": 19969374 }, { "epoch": 4.56687898089172, "grad_norm": 0.138671875, "learning_rate": 1.1508166375081424e-06, "loss": 0.0015523422043770552, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00155, "step": 1434, "tokens/total": 187727872, "tokens/train_per_sec_per_gpu": 3163.05, "tokens/trainable": 19982636 }, { "epoch": 4.570063694267516, "grad_norm": 0.134765625, "learning_rate": 1.1342054507971933e-06, "loss": 0.0017875637859106064, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00179, "step": 1435, "tokens/total": 187858944, "tokens/train_per_sec_per_gpu": 3297.76, "tokens/trainable": 19996446 }, { "epoch": 4.573248407643312, "grad_norm": 0.11962890625, "learning_rate": 1.1177122393998374e-06, "loss": 0.0017204630421474576, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00172, "step": 1436, "tokens/total": 187990016, "tokens/train_per_sec_per_gpu": 3545.62, "tokens/trainable": 20011220 }, { "epoch": 4.576433121019108, "grad_norm": 0.1171875, "learning_rate": 1.101337084846643e-06, "loss": 0.0016106198308989406, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00161, "step": 1437, "tokens/total": 188121088, "tokens/train_per_sec_per_gpu": 3421.36, "tokens/trainable": 20025440 }, { "epoch": 4.579617834394904, "grad_norm": 0.1103515625, "learning_rate": 1.0850800680845929e-06, "loss": 0.0017103978898376226, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00171, "step": 1438, "tokens/total": 188252160, "tokens/train_per_sec_per_gpu": 3540.04, "tokens/trainable": 20040228 }, { "epoch": 4.582802547770701, "grad_norm": 0.1025390625, "learning_rate": 1.0689412694766753e-06, "loss": 0.0013984747929498553, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0014, "step": 1439, "tokens/total": 188383232, "tokens/train_per_sec_per_gpu": 3402.4, "tokens/trainable": 20054474 }, { "epoch": 4.585987261146497, "grad_norm": 0.1025390625, "learning_rate": 1.0529207688015018e-06, "loss": 0.0012951147509738803, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0013, "step": 1440, "tokens/total": 188514304, "tokens/train_per_sec_per_gpu": 3638.2, "tokens/trainable": 20069706 }, { "epoch": 4.5891719745222925, "grad_norm": 0.1181640625, "learning_rate": 1.0370186452528935e-06, "loss": 0.0015985879581421614, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1441, "tokens/total": 188645376, "tokens/train_per_sec_per_gpu": 3136.96, "tokens/trainable": 20082850 }, { "epoch": 4.592356687898089, "grad_norm": 0.11572265625, "learning_rate": 1.021234977439503e-06, "loss": 0.0018211111892014742, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00182, "step": 1442, "tokens/total": 188776448, "tokens/train_per_sec_per_gpu": 3227.61, "tokens/trainable": 20096360 }, { "epoch": 4.595541401273885, "grad_norm": 0.146484375, "learning_rate": 1.0055698433844324e-06, "loss": 0.002404790371656418, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 1443, "tokens/total": 188907520, "tokens/train_per_sec_per_gpu": 3174.22, "tokens/trainable": 20109664 }, { "epoch": 4.598726114649682, "grad_norm": 0.13671875, "learning_rate": 9.9002332052483e-07, "loss": 0.0017899831291288137, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00179, "step": 1444, "tokens/total": 189038592, "tokens/train_per_sec_per_gpu": 3046.84, "tokens/trainable": 20122424 }, { "epoch": 4.601910828025478, "grad_norm": 0.1142578125, "learning_rate": 9.745954857115102e-07, "loss": 0.0016956630861386657, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0017, "step": 1445, "tokens/total": 189169664, "tokens/train_per_sec_per_gpu": 3263.68, "tokens/trainable": 20136088 }, { "epoch": 4.6050955414012735, "grad_norm": 0.1083984375, "learning_rate": 9.592864152085963e-07, "loss": 0.0015517222927883267, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00155, "step": 1446, "tokens/total": 189300736, "tokens/train_per_sec_per_gpu": 3336.28, "tokens/trainable": 20150052 }, { "epoch": 4.60828025477707, "grad_norm": 0.10986328125, "learning_rate": 9.440961846931107e-07, "loss": 0.0015380029799416661, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00154, "step": 1447, "tokens/total": 189431808, "tokens/train_per_sec_per_gpu": 3506.95, "tokens/trainable": 20164734 }, { "epoch": 4.611464968152866, "grad_norm": 0.119140625, "learning_rate": 9.290248692546189e-07, "loss": 0.0016031761188060045, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1448, "tokens/total": 189562880, "tokens/train_per_sec_per_gpu": 3064.35, "tokens/trainable": 20177570 }, { "epoch": 4.614649681528663, "grad_norm": 0.12109375, "learning_rate": 9.140725433948616e-07, "loss": 0.002197918714955449, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0022, "step": 1449, "tokens/total": 189693952, "tokens/train_per_sec_per_gpu": 3280.73, "tokens/trainable": 20191308 }, { "epoch": 4.617834394904459, "grad_norm": 0.10595703125, "learning_rate": 8.992392810273781e-07, "loss": 0.0015633050352334976, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00156, "step": 1450, "tokens/total": 189825024, "tokens/train_per_sec_per_gpu": 3466.66, "tokens/trainable": 20205824 }, { "epoch": 4.6210191082802545, "grad_norm": 0.10205078125, "learning_rate": 8.845251554771422e-07, "loss": 0.0020091324113309383, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00201, "step": 1451, "tokens/total": 189956096, "tokens/train_per_sec_per_gpu": 3548.93, "tokens/trainable": 20220694 }, { "epoch": 4.624203821656051, "grad_norm": 0.10546875, "learning_rate": 8.699302394802016e-07, "loss": 0.0017181969014927745, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00172, "step": 1452, "tokens/total": 190087168, "tokens/train_per_sec_per_gpu": 3437.21, "tokens/trainable": 20235080 }, { "epoch": 4.627388535031847, "grad_norm": 0.11669921875, "learning_rate": 8.554546051833201e-07, "loss": 0.0018156894948333502, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00182, "step": 1453, "tokens/total": 190218240, "tokens/train_per_sec_per_gpu": 3566.23, "tokens/trainable": 20250016 }, { "epoch": 4.630573248407643, "grad_norm": 0.1298828125, "learning_rate": 8.410983241436132e-07, "loss": 0.002036329824477434, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00204, "step": 1454, "tokens/total": 190349312, "tokens/train_per_sec_per_gpu": 3178.74, "tokens/trainable": 20263338 }, { "epoch": 4.63375796178344, "grad_norm": 0.09521484375, "learning_rate": 8.268614673282021e-07, "loss": 0.0012238912750035524, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00122, "step": 1455, "tokens/total": 190480384, "tokens/train_per_sec_per_gpu": 3217.83, "tokens/trainable": 20276824 }, { "epoch": 4.6369426751592355, "grad_norm": 0.162109375, "learning_rate": 8.127441051138662e-07, "loss": 0.0029940090607851744, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.003, "step": 1456, "tokens/total": 190611456, "tokens/train_per_sec_per_gpu": 3579.23, "tokens/trainable": 20291792 }, { "epoch": 4.640127388535031, "grad_norm": 0.0986328125, "learning_rate": 7.987463072866852e-07, "loss": 0.001104258350096643, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0011, "step": 1457, "tokens/total": 190742528, "tokens/train_per_sec_per_gpu": 3201.93, "tokens/trainable": 20305198 }, { "epoch": 4.643312101910828, "grad_norm": 0.126953125, "learning_rate": 7.848681430416948e-07, "loss": 0.0020911027677357197, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00209, "step": 1458, "tokens/total": 190873600, "tokens/train_per_sec_per_gpu": 3356.29, "tokens/trainable": 20319196 }, { "epoch": 4.646496815286624, "grad_norm": 0.12451171875, "learning_rate": 7.711096809825513e-07, "loss": 0.0017163840821012855, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00172, "step": 1459, "tokens/total": 191004672, "tokens/train_per_sec_per_gpu": 3241.44, "tokens/trainable": 20332736 }, { "epoch": 4.649681528662421, "grad_norm": 0.1337890625, "learning_rate": 7.574709891211951e-07, "loss": 0.0014391193399205804, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00144, "step": 1460, "tokens/total": 191135744, "tokens/train_per_sec_per_gpu": 2792.75, "tokens/trainable": 20344452 }, { "epoch": 4.6528662420382165, "grad_norm": 0.12158203125, "learning_rate": 7.439521348774959e-07, "loss": 0.0014456507051363587, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00145, "step": 1461, "tokens/total": 191266816, "tokens/train_per_sec_per_gpu": 3650.92, "tokens/trainable": 20359712 }, { "epoch": 4.656050955414012, "grad_norm": 0.126953125, "learning_rate": 7.305531850789444e-07, "loss": 0.0015093558467924595, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00151, "step": 1462, "tokens/total": 191397888, "tokens/train_per_sec_per_gpu": 3308.41, "tokens/trainable": 20373528 }, { "epoch": 4.659235668789809, "grad_norm": 0.09716796875, "learning_rate": 7.17274205960311e-07, "loss": 0.0016126552363857627, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00161, "step": 1463, "tokens/total": 191528960, "tokens/train_per_sec_per_gpu": 3625.83, "tokens/trainable": 20388600 }, { "epoch": 4.662420382165605, "grad_norm": 0.11181640625, "learning_rate": 7.041152631633075e-07, "loss": 0.0025427560321986675, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00255, "step": 1464, "tokens/total": 191660032, "tokens/train_per_sec_per_gpu": 3430.12, "tokens/trainable": 20402944 }, { "epoch": 4.665605095541402, "grad_norm": 0.1435546875, "learning_rate": 6.910764217362753e-07, "loss": 0.002073355484753847, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00208, "step": 1465, "tokens/total": 191791104, "tokens/train_per_sec_per_gpu": 3534.12, "tokens/trainable": 20417666 }, { "epoch": 4.6687898089171975, "grad_norm": 0.15234375, "learning_rate": 6.781577461338673e-07, "loss": 0.0026118066161870956, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00262, "step": 1466, "tokens/total": 191922176, "tokens/train_per_sec_per_gpu": 3348.83, "tokens/trainable": 20431704 }, { "epoch": 4.671974522292993, "grad_norm": 0.1259765625, "learning_rate": 6.653593002167168e-07, "loss": 0.0018058358691632748, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00181, "step": 1467, "tokens/total": 192053248, "tokens/train_per_sec_per_gpu": 3093.22, "tokens/trainable": 20444646 }, { "epoch": 4.67515923566879, "grad_norm": 0.1201171875, "learning_rate": 6.526811472511302e-07, "loss": 0.0014479233650490642, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00145, "step": 1468, "tokens/total": 192184320, "tokens/train_per_sec_per_gpu": 3514.53, "tokens/trainable": 20459360 }, { "epoch": 4.678343949044586, "grad_norm": 0.140625, "learning_rate": 6.40123349908775e-07, "loss": 0.002245377516373992, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00225, "step": 1469, "tokens/total": 192315392, "tokens/train_per_sec_per_gpu": 3310.51, "tokens/trainable": 20473228 }, { "epoch": 4.681528662420382, "grad_norm": 0.1201171875, "learning_rate": 6.276859702663618e-07, "loss": 0.001856306567788124, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00186, "step": 1470, "tokens/total": 192446464, "tokens/train_per_sec_per_gpu": 3067.13, "tokens/trainable": 20486072 }, { "epoch": 4.6847133757961785, "grad_norm": 0.1474609375, "learning_rate": 6.153690698053438e-07, "loss": 0.0019508072873577476, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00195, "step": 1471, "tokens/total": 192577536, "tokens/train_per_sec_per_gpu": 3417.64, "tokens/trainable": 20500378 }, { "epoch": 4.687898089171974, "grad_norm": 0.1552734375, "learning_rate": 6.031727094116175e-07, "loss": 0.0022490478586405516, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00225, "step": 1472, "tokens/total": 192708608, "tokens/train_per_sec_per_gpu": 3687.19, "tokens/trainable": 20515722 }, { "epoch": 4.69108280254777, "grad_norm": 0.123046875, "learning_rate": 5.910969493752055e-07, "loss": 0.0018782130209729075, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00188, "step": 1473, "tokens/total": 192839680, "tokens/train_per_sec_per_gpu": 3720.3, "tokens/trainable": 20531248 }, { "epoch": 4.694267515923567, "grad_norm": 0.126953125, "learning_rate": 5.791418493899803e-07, "loss": 0.0018554049311205745, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00186, "step": 1474, "tokens/total": 192970752, "tokens/train_per_sec_per_gpu": 3362.35, "tokens/trainable": 20545276 }, { "epoch": 4.697452229299363, "grad_norm": 0.1708984375, "learning_rate": 5.673074685533547e-07, "loss": 0.00283794361166656, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00284, "step": 1475, "tokens/total": 193101824, "tokens/train_per_sec_per_gpu": 3107.71, "tokens/trainable": 20558290 }, { "epoch": 4.7006369426751595, "grad_norm": 0.115234375, "learning_rate": 5.555938653659859e-07, "loss": 0.0015586434165015817, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00156, "step": 1476, "tokens/total": 193232896, "tokens/train_per_sec_per_gpu": 3495.85, "tokens/trainable": 20572836 }, { "epoch": 4.703821656050955, "grad_norm": 0.1728515625, "learning_rate": 5.440010977315003e-07, "loss": 0.002725705737248063, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00273, "step": 1477, "tokens/total": 193363968, "tokens/train_per_sec_per_gpu": 2926.02, "tokens/trainable": 20585176 }, { "epoch": 4.707006369426751, "grad_norm": 0.1689453125, "learning_rate": 5.32529222956199e-07, "loss": 0.003224026644602418, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00323, "step": 1478, "tokens/total": 193495040, "tokens/train_per_sec_per_gpu": 3124.75, "tokens/trainable": 20598252 }, { "epoch": 4.710191082802548, "grad_norm": 0.1201171875, "learning_rate": 5.211782977487728e-07, "loss": 0.0022572882007807493, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00226, "step": 1479, "tokens/total": 193626112, "tokens/train_per_sec_per_gpu": 3719.62, "tokens/trainable": 20613736 }, { "epoch": 4.713375796178344, "grad_norm": 0.126953125, "learning_rate": 5.099483782200321e-07, "loss": 0.0020106916781514883, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00201, "step": 1480, "tokens/total": 193757184, "tokens/train_per_sec_per_gpu": 3386.3, "tokens/trainable": 20627916 }, { "epoch": 4.7165605095541405, "grad_norm": 0.150390625, "learning_rate": 4.988395198826157e-07, "loss": 0.002159472554922104, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00216, "step": 1481, "tokens/total": 193888256, "tokens/train_per_sec_per_gpu": 3132.67, "tokens/trainable": 20641036 }, { "epoch": 4.719745222929936, "grad_norm": 0.1376953125, "learning_rate": 4.878517776507247e-07, "loss": 0.0026867706328630447, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00269, "step": 1482, "tokens/total": 194019328, "tokens/train_per_sec_per_gpu": 3359.56, "tokens/trainable": 20655048 }, { "epoch": 4.722929936305732, "grad_norm": 0.10595703125, "learning_rate": 4.7698520583985e-07, "loss": 0.0017674706177785993, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00177, "step": 1483, "tokens/total": 194150400, "tokens/train_per_sec_per_gpu": 3297.46, "tokens/trainable": 20668772 }, { "epoch": 4.726114649681529, "grad_norm": 0.1103515625, "learning_rate": 4.662398581665006e-07, "loss": 0.0014837021008133888, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00148, "step": 1484, "tokens/total": 194281472, "tokens/train_per_sec_per_gpu": 3494.12, "tokens/trainable": 20683348 }, { "epoch": 4.729299363057325, "grad_norm": 0.123046875, "learning_rate": 4.5561578774794276e-07, "loss": 0.0021369662135839462, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00214, "step": 1485, "tokens/total": 194412544, "tokens/train_per_sec_per_gpu": 3607.71, "tokens/trainable": 20698316 }, { "epoch": 4.732484076433121, "grad_norm": 0.158203125, "learning_rate": 4.45113047101936e-07, "loss": 0.002360973972827196, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00236, "step": 1486, "tokens/total": 194543616, "tokens/train_per_sec_per_gpu": 3584.68, "tokens/trainable": 20713220 }, { "epoch": 4.735668789808917, "grad_norm": 0.1162109375, "learning_rate": 4.3473168814647525e-07, "loss": 0.0015863839071244001, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00159, "step": 1487, "tokens/total": 194674688, "tokens/train_per_sec_per_gpu": 3400.0, "tokens/trainable": 20727406 }, { "epoch": 4.738853503184713, "grad_norm": 0.1162109375, "learning_rate": 4.24471762199527e-07, "loss": 0.0016582348616793752, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00166, "step": 1488, "tokens/total": 194805760, "tokens/train_per_sec_per_gpu": 3258.7, "tokens/trainable": 20741076 }, { "epoch": 4.742038216560509, "grad_norm": 0.1328125, "learning_rate": 4.143333199787769e-07, "loss": 0.00176681496668607, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00177, "step": 1489, "tokens/total": 194936832, "tokens/train_per_sec_per_gpu": 2999.18, "tokens/trainable": 20753696 }, { "epoch": 4.745222929936306, "grad_norm": 0.1337890625, "learning_rate": 4.0431641160139367e-07, "loss": 0.002107662847265601, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00211, "step": 1490, "tokens/total": 195067904, "tokens/train_per_sec_per_gpu": 3418.27, "tokens/trainable": 20767910 }, { "epoch": 4.748407643312102, "grad_norm": 0.140625, "learning_rate": 3.944210865837572e-07, "loss": 0.0021030758507549763, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00211, "step": 1491, "tokens/total": 195198976, "tokens/train_per_sec_per_gpu": 3144.49, "tokens/trainable": 20781092 }, { "epoch": 4.751592356687898, "grad_norm": 0.1240234375, "learning_rate": 3.846473938412365e-07, "loss": 0.0020006736740469933, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.002, "step": 1492, "tokens/total": 195330048, "tokens/train_per_sec_per_gpu": 3535.97, "tokens/trainable": 20795832 }, { "epoch": 4.754777070063694, "grad_norm": 0.11572265625, "learning_rate": 3.749953816879398e-07, "loss": 0.001961378613486886, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00196, "step": 1493, "tokens/total": 195461120, "tokens/train_per_sec_per_gpu": 3395.84, "tokens/trainable": 20810046 }, { "epoch": 4.757961783439491, "grad_norm": 0.1328125, "learning_rate": 3.654650978364649e-07, "loss": 0.0024665065575391054, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00247, "step": 1494, "tokens/total": 195592192, "tokens/train_per_sec_per_gpu": 3389.65, "tokens/trainable": 20824160 }, { "epoch": 4.761146496815287, "grad_norm": 0.1318359375, "learning_rate": 3.560565893976742e-07, "loss": 0.0024471194483339787, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00245, "step": 1495, "tokens/total": 195723264, "tokens/train_per_sec_per_gpu": 3146.42, "tokens/trainable": 20837344 }, { "epoch": 4.764331210191083, "grad_norm": 0.201171875, "learning_rate": 3.467699028804672e-07, "loss": 0.003118871245533228, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00312, "step": 1496, "tokens/total": 195854336, "tokens/train_per_sec_per_gpu": 3068.73, "tokens/trainable": 20850148 }, { "epoch": 4.767515923566879, "grad_norm": 0.146484375, "learning_rate": 3.376050841915335e-07, "loss": 0.0028909991960972548, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0029, "step": 1497, "tokens/total": 195985408, "tokens/train_per_sec_per_gpu": 3299.09, "tokens/trainable": 20863890 }, { "epoch": 4.770700636942675, "grad_norm": 0.11181640625, "learning_rate": 3.2856217863514727e-07, "loss": 0.001599812414497137, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1498, "tokens/total": 196116480, "tokens/train_per_sec_per_gpu": 3543.45, "tokens/trainable": 20878628 }, { "epoch": 4.773885350318471, "grad_norm": 0.158203125, "learning_rate": 3.1964123091292595e-07, "loss": 0.0027794367633759975, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00278, "step": 1499, "tokens/total": 196247552, "tokens/train_per_sec_per_gpu": 3411.09, "tokens/trainable": 20892812 }, { "epoch": 4.777070063694268, "grad_norm": 0.09814453125, "learning_rate": 3.108422851236137e-07, "loss": 0.0011374036548659205, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00114, "step": 1500, "tokens/total": 196378624, "tokens/train_per_sec_per_gpu": 3329.43, "tokens/trainable": 20906680 }, { "epoch": 4.780254777070064, "grad_norm": 0.11865234375, "learning_rate": 3.0216538476286196e-07, "loss": 0.0018032776424661279, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0018, "step": 1501, "tokens/total": 196509696, "tokens/train_per_sec_per_gpu": 3433.1, "tokens/trainable": 20920932 }, { "epoch": 4.7834394904458595, "grad_norm": 0.11669921875, "learning_rate": 2.936105727230298e-07, "loss": 0.0027445517480373383, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00275, "step": 1502, "tokens/total": 196640768, "tokens/train_per_sec_per_gpu": 3841.28, "tokens/trainable": 20936896 }, { "epoch": 4.786624203821656, "grad_norm": 0.10107421875, "learning_rate": 2.851778912929426e-07, "loss": 0.001024644705466926, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00103, "step": 1503, "tokens/total": 196771840, "tokens/train_per_sec_per_gpu": 3599.56, "tokens/trainable": 20951852 }, { "epoch": 4.789808917197452, "grad_norm": 0.09423828125, "learning_rate": 2.768673821577167e-07, "loss": 0.0011879701633006334, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00119, "step": 1504, "tokens/total": 196902912, "tokens/train_per_sec_per_gpu": 3009.96, "tokens/trainable": 20964468 }, { "epoch": 4.792993630573249, "grad_norm": 0.1591796875, "learning_rate": 2.6867908639852944e-07, "loss": 0.0033508751075714827, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00336, "step": 1505, "tokens/total": 197033984, "tokens/train_per_sec_per_gpu": 3536.08, "tokens/trainable": 20979168 }, { "epoch": 4.796178343949045, "grad_norm": 0.166015625, "learning_rate": 2.6061304449241655e-07, "loss": 0.0030738934874534607, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00308, "step": 1506, "tokens/total": 197165056, "tokens/train_per_sec_per_gpu": 2876.89, "tokens/trainable": 20991260 }, { "epoch": 4.7993630573248405, "grad_norm": 0.1533203125, "learning_rate": 2.526692963120858e-07, "loss": 0.002285804832354188, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00229, "step": 1507, "tokens/total": 197296128, "tokens/train_per_sec_per_gpu": 3400.76, "tokens/trainable": 21005408 }, { "epoch": 4.802547770700637, "grad_norm": 0.1298828125, "learning_rate": 2.448478811257149e-07, "loss": 0.002408439526334405, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00241, "step": 1508, "tokens/total": 197427200, "tokens/train_per_sec_per_gpu": 3679.66, "tokens/trainable": 21020666 }, { "epoch": 4.805732484076433, "grad_norm": 0.08837890625, "learning_rate": 2.3714883759674566e-07, "loss": 0.0013570735463872552, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00136, "step": 1509, "tokens/total": 197558272, "tokens/train_per_sec_per_gpu": 3549.55, "tokens/trainable": 21035448 }, { "epoch": 4.80891719745223, "grad_norm": 0.1142578125, "learning_rate": 2.295722037837178e-07, "loss": 0.0017048909794539213, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00171, "step": 1510, "tokens/total": 197689344, "tokens/train_per_sec_per_gpu": 2969.31, "tokens/trainable": 21047892 }, { "epoch": 4.812101910828026, "grad_norm": 0.09521484375, "learning_rate": 2.2211801714004942e-07, "loss": 0.0012713008327409625, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00127, "step": 1511, "tokens/total": 197820416, "tokens/train_per_sec_per_gpu": 3589.68, "tokens/trainable": 21062844 }, { "epoch": 4.8152866242038215, "grad_norm": 0.1259765625, "learning_rate": 2.1478631451387898e-07, "loss": 0.002427282277494669, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00243, "step": 1512, "tokens/total": 197951488, "tokens/train_per_sec_per_gpu": 3365.34, "tokens/trainable": 21076902 }, { "epoch": 4.818471337579618, "grad_norm": 0.1513671875, "learning_rate": 2.0757713214786533e-07, "loss": 0.0020946285221725702, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0021, "step": 1513, "tokens/total": 198082560, "tokens/train_per_sec_per_gpu": 3593.33, "tokens/trainable": 21091820 }, { "epoch": 4.821656050955414, "grad_norm": 0.09912109375, "learning_rate": 2.0049050567902128e-07, "loss": 0.0015513665275648236, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00155, "step": 1514, "tokens/total": 198213632, "tokens/train_per_sec_per_gpu": 3683.66, "tokens/trainable": 21107128 }, { "epoch": 4.82484076433121, "grad_norm": 0.130859375, "learning_rate": 1.9352647013852477e-07, "loss": 0.001911777420900762, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00191, "step": 1515, "tokens/total": 198344704, "tokens/train_per_sec_per_gpu": 3288.64, "tokens/trainable": 21120836 }, { "epoch": 4.828025477707007, "grad_norm": 0.1220703125, "learning_rate": 1.8668505995155515e-07, "loss": 0.0022345585748553276, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00224, "step": 1516, "tokens/total": 198475776, "tokens/train_per_sec_per_gpu": 3563.74, "tokens/trainable": 21135672 }, { "epoch": 4.8312101910828025, "grad_norm": 0.130859375, "learning_rate": 1.7996630893712675e-07, "loss": 0.0015912681119516492, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00159, "step": 1517, "tokens/total": 198606848, "tokens/train_per_sec_per_gpu": 3421.44, "tokens/trainable": 21149896 }, { "epoch": 4.834394904458598, "grad_norm": 0.1337890625, "learning_rate": 1.7337025030790543e-07, "loss": 0.0015856210375204682, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00159, "step": 1518, "tokens/total": 198737920, "tokens/train_per_sec_per_gpu": 2977.95, "tokens/trainable": 21162350 }, { "epoch": 4.837579617834395, "grad_norm": 0.1767578125, "learning_rate": 1.6689691667005902e-07, "loss": 0.0021609310060739517, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00216, "step": 1519, "tokens/total": 198868992, "tokens/train_per_sec_per_gpu": 3149.82, "tokens/trainable": 21175516 }, { "epoch": 4.840764331210191, "grad_norm": 0.11572265625, "learning_rate": 1.6054634002309054e-07, "loss": 0.0015277141937986016, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00153, "step": 1520, "tokens/total": 199000064, "tokens/train_per_sec_per_gpu": 3155.1, "tokens/trainable": 21188692 }, { "epoch": 4.843949044585988, "grad_norm": 0.162109375, "learning_rate": 1.5431855175968014e-07, "loss": 0.002204909920692444, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00221, "step": 1521, "tokens/total": 199131136, "tokens/train_per_sec_per_gpu": 3326.25, "tokens/trainable": 21202538 }, { "epoch": 4.8471337579617835, "grad_norm": 0.138671875, "learning_rate": 1.4821358266553231e-07, "loss": 0.002712359419092536, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00272, "step": 1522, "tokens/total": 199262208, "tokens/train_per_sec_per_gpu": 3457.57, "tokens/trainable": 21216952 }, { "epoch": 4.850318471337579, "grad_norm": 0.1337890625, "learning_rate": 1.4223146291922062e-07, "loss": 0.0019022361375391483, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0019, "step": 1523, "tokens/total": 199393280, "tokens/train_per_sec_per_gpu": 3419.62, "tokens/trainable": 21231184 }, { "epoch": 4.853503184713376, "grad_norm": 0.1357421875, "learning_rate": 1.3637222209204327e-07, "loss": 0.0018241211073473096, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00183, "step": 1524, "tokens/total": 199524352, "tokens/train_per_sec_per_gpu": 3139.7, "tokens/trainable": 21244294 }, { "epoch": 4.856687898089172, "grad_norm": 0.1044921875, "learning_rate": 1.3063588914786207e-07, "loss": 0.001210428192280233, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00121, "step": 1525, "tokens/total": 199655424, "tokens/train_per_sec_per_gpu": 3122.56, "tokens/trainable": 21257322 }, { "epoch": 4.859872611464969, "grad_norm": 0.11474609375, "learning_rate": 1.250224924429888e-07, "loss": 0.0014607764314860106, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00146, "step": 1526, "tokens/total": 199786496, "tokens/train_per_sec_per_gpu": 3032.1, "tokens/trainable": 21269998 }, { "epoch": 4.8630573248407645, "grad_norm": 0.1474609375, "learning_rate": 1.1953205972601022e-07, "loss": 0.002046809531748295, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1527, "tokens/total": 199917568, "tokens/train_per_sec_per_gpu": 3360.03, "tokens/trainable": 21284056 }, { "epoch": 4.86624203821656, "grad_norm": 0.140625, "learning_rate": 1.1416461813767709e-07, "loss": 0.002186344237998128, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00219, "step": 1528, "tokens/total": 200048640, "tokens/train_per_sec_per_gpu": 3252.14, "tokens/trainable": 21297784 }, { "epoch": 4.869426751592357, "grad_norm": 0.134765625, "learning_rate": 1.0892019421075706e-07, "loss": 0.002091720700263977, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00209, "step": 1529, "tokens/total": 200179712, "tokens/train_per_sec_per_gpu": 3436.08, "tokens/trainable": 21312112 }, { "epoch": 4.872611464968153, "grad_norm": 0.1279296875, "learning_rate": 1.0379881386990974e-07, "loss": 0.001913387910462916, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00192, "step": 1530, "tokens/total": 200310784, "tokens/train_per_sec_per_gpu": 3187.83, "tokens/trainable": 21325434 }, { "epoch": 4.875796178343949, "grad_norm": 0.154296875, "learning_rate": 9.880050243155359e-08, "loss": 0.0024429503828287125, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00245, "step": 1531, "tokens/total": 200441856, "tokens/train_per_sec_per_gpu": 3182.45, "tokens/trainable": 21338740 }, { "epoch": 4.8789808917197455, "grad_norm": 0.130859375, "learning_rate": 9.392528460374362e-08, "loss": 0.0016927801771089435, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00169, "step": 1532, "tokens/total": 200572928, "tokens/train_per_sec_per_gpu": 3158.89, "tokens/trainable": 21351978 }, { "epoch": 4.882165605095541, "grad_norm": 0.1142578125, "learning_rate": 8.917318448604661e-08, "loss": 0.0016512478468939662, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00165, "step": 1533, "tokens/total": 200704000, "tokens/train_per_sec_per_gpu": 3721.88, "tokens/trainable": 21367448 }, { "epoch": 4.885350318471337, "grad_norm": 0.1474609375, "learning_rate": 8.454422556942454e-08, "loss": 0.0020441263914108276, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1534, "tokens/total": 200835072, "tokens/train_per_sec_per_gpu": 3451.49, "tokens/trainable": 21381804 }, { "epoch": 4.888535031847134, "grad_norm": 0.12890625, "learning_rate": 8.003843073612627e-08, "loss": 0.0019288958283141255, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00193, "step": 1535, "tokens/total": 200966144, "tokens/train_per_sec_per_gpu": 3544.84, "tokens/trainable": 21396546 }, { "epoch": 4.89171974522293, "grad_norm": 0.1435546875, "learning_rate": 7.565582225955158e-08, "loss": 0.0020149427000433207, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00202, "step": 1536, "tokens/total": 201097216, "tokens/train_per_sec_per_gpu": 3206.12, "tokens/trainable": 21410012 }, { "epoch": 4.8949044585987265, "grad_norm": 0.1591796875, "learning_rate": 7.139642180416517e-08, "loss": 0.00250299577601254, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00251, "step": 1537, "tokens/total": 201228288, "tokens/train_per_sec_per_gpu": 3708.05, "tokens/trainable": 21425414 }, { "epoch": 4.898089171974522, "grad_norm": 0.158203125, "learning_rate": 6.726025042537721e-08, "loss": 0.002223816467449069, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 1538, "tokens/total": 201359360, "tokens/train_per_sec_per_gpu": 3310.1, "tokens/trainable": 21439244 }, { "epoch": 4.901273885350318, "grad_norm": 0.1337890625, "learning_rate": 6.324732856944349e-08, "loss": 0.002602557884529233, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00261, "step": 1539, "tokens/total": 201490432, "tokens/train_per_sec_per_gpu": 3309.88, "tokens/trainable": 21453112 }, { "epoch": 4.904458598726115, "grad_norm": 0.1435546875, "learning_rate": 5.935767607336273e-08, "loss": 0.0018828021129593253, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00188, "step": 1540, "tokens/total": 201621504, "tokens/train_per_sec_per_gpu": 3360.0, "tokens/trainable": 21467172 }, { "epoch": 4.907643312101911, "grad_norm": 0.1103515625, "learning_rate": 5.5591312164776646e-08, "loss": 0.0018976753344759345, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0019, "step": 1541, "tokens/total": 201752576, "tokens/train_per_sec_per_gpu": 3485.96, "tokens/trainable": 21481770 }, { "epoch": 4.9108280254777075, "grad_norm": 0.126953125, "learning_rate": 5.194825546187831e-08, "loss": 0.0018805229337885976, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00188, "step": 1542, "tokens/total": 201883648, "tokens/train_per_sec_per_gpu": 3369.24, "tokens/trainable": 21495882 }, { "epoch": 4.914012738853503, "grad_norm": 0.1171875, "learning_rate": 4.84285239733151e-08, "loss": 0.0020450761076062918, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1543, "tokens/total": 202014720, "tokens/train_per_sec_per_gpu": 3212.2, "tokens/trainable": 21509344 }, { "epoch": 4.917197452229299, "grad_norm": 0.1357421875, "learning_rate": 4.503213509811088e-08, "loss": 0.00226628128439188, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00227, "step": 1544, "tokens/total": 202145792, "tokens/train_per_sec_per_gpu": 3154.8, "tokens/trainable": 21522558 }, { "epoch": 4.920382165605096, "grad_norm": 0.125, "learning_rate": 4.175910562556895e-08, "loss": 0.0018778032390400767, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00188, "step": 1545, "tokens/total": 202276864, "tokens/train_per_sec_per_gpu": 3493.16, "tokens/trainable": 21537180 }, { "epoch": 4.923566878980892, "grad_norm": 0.134765625, "learning_rate": 3.860945173518593e-08, "loss": 0.0019706811290234327, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00197, "step": 1546, "tokens/total": 202407936, "tokens/train_per_sec_per_gpu": 3480.51, "tokens/trainable": 21551752 }, { "epoch": 4.926751592356688, "grad_norm": 0.1435546875, "learning_rate": 3.5583188996587965e-08, "loss": 0.001993852434679866, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.002, "step": 1547, "tokens/total": 202539008, "tokens/train_per_sec_per_gpu": 3231.61, "tokens/trainable": 21565284 }, { "epoch": 4.929936305732484, "grad_norm": 0.12890625, "learning_rate": 3.26803323694419e-08, "loss": 0.0025727523025125265, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00258, "step": 1548, "tokens/total": 202670080, "tokens/train_per_sec_per_gpu": 3483.78, "tokens/trainable": 21579872 }, { "epoch": 4.93312101910828, "grad_norm": 0.12158203125, "learning_rate": 2.990089620337755e-08, "loss": 0.00160361104644835, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1549, "tokens/total": 202801152, "tokens/train_per_sec_per_gpu": 3091.07, "tokens/trainable": 21592838 }, { "epoch": 4.936305732484076, "grad_norm": 0.1337890625, "learning_rate": 2.724489423792942e-08, "loss": 0.002017256570979953, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00202, "step": 1550, "tokens/total": 202932224, "tokens/train_per_sec_per_gpu": 3197.05, "tokens/trainable": 21606214 }, { "epoch": 4.939490445859873, "grad_norm": 0.16015625, "learning_rate": 2.4712339602461774e-08, "loss": 0.0018039483111351728, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00181, "step": 1551, "tokens/total": 203063296, "tokens/train_per_sec_per_gpu": 3119.44, "tokens/trainable": 21619296 }, { "epoch": 4.942675159235669, "grad_norm": 0.1162109375, "learning_rate": 2.2303244816099244e-08, "loss": 0.001978665590286255, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00198, "step": 1552, "tokens/total": 203194368, "tokens/train_per_sec_per_gpu": 2961.33, "tokens/trainable": 21631746 }, { "epoch": 4.945859872611465, "grad_norm": 0.130859375, "learning_rate": 2.0017621787671303e-08, "loss": 0.0023562528658658266, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00236, "step": 1553, "tokens/total": 203325440, "tokens/train_per_sec_per_gpu": 3105.93, "tokens/trainable": 21644684 }, { "epoch": 4.949044585987261, "grad_norm": 0.16796875, "learning_rate": 1.7855481815659546e-08, "loss": 0.0023984115105122328, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0024, "step": 1554, "tokens/total": 203456512, "tokens/train_per_sec_per_gpu": 3024.15, "tokens/trainable": 21657424 }, { "epoch": 4.952229299363057, "grad_norm": 0.10205078125, "learning_rate": 1.5816835588122748e-08, "loss": 0.0020472980104386806, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00205, "step": 1555, "tokens/total": 203587584, "tokens/train_per_sec_per_gpu": 3146.82, "tokens/trainable": 21670582 }, { "epoch": 4.955414012738854, "grad_norm": 0.1513671875, "learning_rate": 1.3901693182660768e-08, "loss": 0.002163731260225177, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00217, "step": 1556, "tokens/total": 203718656, "tokens/train_per_sec_per_gpu": 3104.79, "tokens/trainable": 21683604 }, { "epoch": 4.95859872611465, "grad_norm": 0.119140625, "learning_rate": 1.2110064066361836e-08, "loss": 0.002627151319757104, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00263, "step": 1557, "tokens/total": 203849728, "tokens/train_per_sec_per_gpu": 3130.19, "tokens/trainable": 21696704 }, { "epoch": 4.961783439490446, "grad_norm": 0.10400390625, "learning_rate": 1.0441957095752574e-08, "loss": 0.0015952385729178786, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0016, "step": 1558, "tokens/total": 203980800, "tokens/train_per_sec_per_gpu": 3351.63, "tokens/trainable": 21710708 }, { "epoch": 4.964968152866242, "grad_norm": 0.1494140625, "learning_rate": 8.897380516748044e-09, "loss": 0.002128974301740527, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00213, "step": 1559, "tokens/total": 204111872, "tokens/train_per_sec_per_gpu": 3197.67, "tokens/trainable": 21724200 }, { "epoch": 4.968152866242038, "grad_norm": 0.107421875, "learning_rate": 7.476341964626766e-09, "loss": 0.0021477844566106796, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00215, "step": 1560, "tokens/total": 204242944, "tokens/train_per_sec_per_gpu": 3209.08, "tokens/trainable": 21737668 }, { "epoch": 4.971337579617835, "grad_norm": 0.1337890625, "learning_rate": 6.178848463980758e-09, "loss": 0.00202268292196095, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00202, "step": 1561, "tokens/total": 204374016, "tokens/train_per_sec_per_gpu": 3300.82, "tokens/trainable": 21751480 }, { "epoch": 4.974522292993631, "grad_norm": 0.09521484375, "learning_rate": 5.004906428685008e-09, "loss": 0.0012176063610240817, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00122, "step": 1562, "tokens/total": 204505088, "tokens/train_per_sec_per_gpu": 3387.1, "tokens/trainable": 21765732 }, { "epoch": 4.977707006369426, "grad_norm": 0.1533203125, "learning_rate": 3.954521661861388e-09, "loss": 0.0024965633638203144, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.0025, "step": 1563, "tokens/total": 204636160, "tokens/train_per_sec_per_gpu": 3190.66, "tokens/trainable": 21779126 }, { "epoch": 4.980891719745223, "grad_norm": 0.142578125, "learning_rate": 3.027699355859226e-09, "loss": 0.0016142029780894518, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00162, "step": 1564, "tokens/total": 204767232, "tokens/train_per_sec_per_gpu": 3249.58, "tokens/trainable": 21792812 }, { "epoch": 4.984076433121019, "grad_norm": 0.12451171875, "learning_rate": 2.2244440922164487e-09, "loss": 0.0019386119674891233, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00194, "step": 1565, "tokens/total": 204898304, "tokens/train_per_sec_per_gpu": 3286.69, "tokens/trainable": 21806628 }, { "epoch": 4.987261146496815, "grad_norm": 0.12158203125, "learning_rate": 1.544759841654031e-09, "loss": 0.001636000582948327, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00164, "step": 1566, "tokens/total": 205029376, "tokens/train_per_sec_per_gpu": 3407.55, "tokens/trainable": 21820912 }, { "epoch": 4.990445859872612, "grad_norm": 0.130859375, "learning_rate": 9.886499640399116e-10, "loss": 0.0022515307646244764, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00225, "step": 1567, "tokens/total": 205160448, "tokens/train_per_sec_per_gpu": 3312.13, "tokens/trainable": 21834766 }, { "epoch": 4.993630573248407, "grad_norm": 0.1162109375, "learning_rate": 5.561172083806688e-10, "loss": 0.0021433548536151648, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00215, "step": 1568, "tokens/total": 205291520, "tokens/train_per_sec_per_gpu": 3715.64, "tokens/trainable": 21850308 }, { "epoch": 4.996815286624204, "grad_norm": 0.11572265625, "learning_rate": 2.4716371280764093e-10, "loss": 0.002227420685812831, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 64.64, "memory/max_allocated (GiB)": 64.64, "ppl": 1.00223, "step": 1569, "tokens/total": 205422592, "tokens/train_per_sec_per_gpu": 3672.72, "tokens/trainable": 21865620 }, { "epoch": 5.0, "grad_norm": 0.2041015625, "learning_rate": 6.179100456582543e-11, "loss": 0.002160376403480768, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 39.25, "memory/max_allocated (GiB)": 39.25, "ppl": 1.00216, "step": 1570, "tokens/total": 205496320, "tokens/train_per_sec_per_gpu": 3367.0, "tokens/trainable": 21873388 }, { "epoch": 5.0, "eval_loss": 0.010312405414879322, "eval_ppl": 1.01037, "eval_runtime": 41.6326, "eval_samples_per_second": 64.877, "eval_steps_per_second": 4.059, "memory/device_reserved (GiB)": 74.81, "memory/max_active (GiB)": 54.61, "memory/max_allocated (GiB)": 54.61, "step": 1570 } ], "logging_steps": 1, "max_steps": 1570, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 314, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2775166334468096e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }